001 package org.maltparser.core.feature.spec.reader;
002
003 import java.io.BufferedReader;
004 import java.io.IOException;
005 import java.io.InputStreamReader;
006 import java.net.URL;
007 import java.util.ArrayList;
008 import java.util.EnumMap;
009 import java.util.regex.Pattern;
010
011 import org.maltparser.core.exception.MaltChainedException;
012 import org.maltparser.core.feature.FeatureException;
013 import org.maltparser.core.feature.spec.SpecificationModels;
014 /**
015 *
016 *
017 * @author Johan Hall
018 */
019 public class ParReader implements FeatureSpecReader {
020 public enum DataStructures {
021 STACK, INPUT, LEFTCONTEXT, RIGHTCONTEXT
022 };
023 public enum ColumnNames {
024 POS, DEP, LEX, LEMMA, CPOS, FEATS
025 };
026 private EnumMap<ColumnNames, String> columnNameMap;
027 private EnumMap<DataStructures, String> dataStructuresMap;
028 private boolean useSplitFeats = true;
029 private boolean covington = false;
030 private boolean pppath;
031 private boolean pplifted;
032 private boolean ppcoveredRoot;
033
034 public ParReader() throws MaltChainedException {
035 initializeColumnNameMap();
036 initializeDataStructuresMap();
037 setPppath(false);
038 setPplifted(false);
039 setPpcoveredRoot(false);
040 }
041
042 public void load(URL specModelURL, SpecificationModels featureSpecModels) throws MaltChainedException {
043 BufferedReader br = null;
044 Pattern tabPattern = Pattern.compile("\t");
045 if (specModelURL == null) {
046 throw new FeatureException("The feature specification file cannot be found. ");
047 }
048 try {
049 br = new BufferedReader(new InputStreamReader(specModelURL.openStream()));
050 } catch (IOException e) {
051 throw new FeatureException("Could not read the feature specification file '"+specModelURL.toString()+"'. ", e);
052 }
053
054 if (br != null) {
055 int specModelIndex = featureSpecModels.getNextIndex();
056 String fileLine;
057 String items[];
058 StringBuilder featureText = new StringBuilder();
059 String splitfeats = "";
060 ArrayList<String> fileLines = new ArrayList<String>();
061 ArrayList<String> orderFileLines = new ArrayList<String>();
062 while (true) {
063 try {
064 fileLine = br.readLine();
065 } catch (IOException e) {
066 throw new FeatureException("Could not read the feature specification file '"+specModelURL.toString()+"'. ", e);
067 }
068 if (fileLine == null) {
069 break;
070 }
071 if (fileLine.length() <= 1 && fileLine.trim().substring(0, 2).trim().equals("--")) {
072 continue;
073 }
074 fileLines.add(fileLine);
075 }
076 try {
077 br.close();
078 } catch (IOException e) {
079 throw new FeatureException("Could not close the feature specification file '"+specModelURL.toString()+"'. ", e);
080 }
081
082 for (int j = 0; j < fileLines.size(); j++) {
083 orderFileLines.add(fileLines.get(j));
084 }
085
086 boolean deprel = false;
087 for (int j=0; j < orderFileLines.size(); j++) {
088 deprel = false;
089 featureText.setLength(0);
090 splitfeats = "";
091 items = tabPattern.split(orderFileLines.get(j));
092 if (items.length < 2) {
093 throw new FeatureException("The feature specification file '"+specModelURL.toString()+"' must contain at least two columns.");
094 }
095 if (!(columnNameMap.containsKey(ColumnNames.valueOf(items[0].trim())) || columnNameMap.containsValue(items[0].trim()))) {
096 throw new FeatureException("Column one in the feature specification file '"+specModelURL.toString()+"' contains an unknown value '"+items[0].trim()+"'. ");
097 }
098 if (items[0].trim().equalsIgnoreCase("DEP") || items[0].trim().equalsIgnoreCase("DEPREL")) {
099 featureText.append("OutputColumn(DEPREL, ");
100 deprel = true;
101 } else {
102 if (columnNameMap.containsKey(ColumnNames.valueOf(items[0].trim()))) {
103 featureText.append("InputColumn("+columnNameMap.get(ColumnNames.valueOf(items[0].trim()))+", ");
104 } else if (columnNameMap.containsValue(items[0].trim())) {
105 featureText.append("InputColumn("+items[0].trim()+", ");
106 }
107 if (items[0].trim().equalsIgnoreCase("FEATS") && isUseSplitFeats()) {
108 splitfeats = "Split(";
109 }
110 }
111 if (!(items[1].trim().equalsIgnoreCase("STACK") || items[1].trim().equalsIgnoreCase("INPUT") || items[1].trim().equalsIgnoreCase("CONTEXT"))) {
112 throw new FeatureException("Column two in the feature specification file '"+specModelURL.toString()+"' should be either 'STACK', 'INPUT' or 'CONTEXT' (Covington), not '"+items[1].trim()+"'. ");
113 }
114 int offset = 0;
115 if (items.length >= 3) {
116 try {
117 offset = new Integer(Integer.parseInt(items[2]));
118 } catch (NumberFormatException e) {
119 throw new FeatureException("The feature specification file '"+specModelURL.toString()+"' contains a illegal integer value. ", e);
120 }
121 }
122 String functionArg = "";
123
124 if (items[1].trim().equalsIgnoreCase("CONTEXT")) {
125 if (offset >= 0) {
126 functionArg = dataStructuresMap.get(DataStructures.valueOf("LEFTCONTEXT"))+"["+offset+"]";
127 } else {
128 functionArg = dataStructuresMap.get(DataStructures.valueOf("RIGHTCONTEXT"))+"["+Math.abs(offset + 1)+"]";
129 }
130 } else if (dataStructuresMap.containsKey(DataStructures.valueOf(items[1].trim()))) {
131 if (covington == true) {
132 if (dataStructuresMap.get(DataStructures.valueOf(items[1].trim())).equalsIgnoreCase("Stack")) {
133 functionArg = "Left["+offset+"]";
134 } else {
135 functionArg = "Right["+offset+"]";
136 }
137 } else {
138 functionArg = dataStructuresMap.get(DataStructures.valueOf(items[1].trim()))+"["+offset+"]";
139 }
140 } else if (dataStructuresMap.containsValue(items[1].trim())) {
141 if (covington == true) {
142 if (items[1].trim().equalsIgnoreCase("Stack")) {
143 functionArg = "Left["+offset+"]";
144 } else {
145 functionArg = "Right["+offset+"]";
146 }
147 } else {
148 functionArg = items[1].trim()+"["+offset+"]";
149 }
150
151 } else {
152 throw new FeatureException("Column two in the feature specification file '"+specModelURL.toString()+"' should not contain the value '"+items[1].trim());
153 }
154
155 int linearOffset = 0;
156 int headOffset = 0;
157 int depOffset = 0;
158 int sibOffset = 0;
159 int suffixLength = 0;
160 if (items.length >= 4) { linearOffset = new Integer(Integer.parseInt(items[3])); }
161 if (items.length >= 5) { headOffset = new Integer(Integer.parseInt(items[4])); }
162 if (items.length >= 6) { depOffset = new Integer(Integer.parseInt(items[5])); }
163 if (items.length >= 7) { sibOffset = new Integer(Integer.parseInt(items[6])); }
164 if (items.length >= 8) { suffixLength = new Integer(Integer.parseInt(items[7])); }
165 if (linearOffset < 0) {
166 linearOffset = Math.abs(linearOffset);
167 for (int i = 0; i < linearOffset; i++) {
168 functionArg = "pred("+functionArg+")";
169 }
170 } else if (linearOffset > 0) {
171 for (int i = 0; i < linearOffset; i++) {
172 functionArg = "succ("+functionArg+")";
173 }
174 }
175 if (headOffset >= 0) {
176 for (int i = 0; i < headOffset; i++) {
177 functionArg = "head("+functionArg+")";
178 }
179 } else {
180 throw new FeatureException("The feature specification file '"+specModelURL.toString()+"' should not contain a negative head function value. ");
181 }
182 if (depOffset < 0) {
183 depOffset = Math.abs(depOffset);
184 for (int i = 0; i < depOffset; i++) {
185 functionArg = "ldep("+functionArg+")";
186 }
187 } else if (depOffset > 0) {
188 for (int i = 0; i < depOffset; i++) {
189 functionArg = "rdep("+functionArg+")";
190 }
191 }
192 if (sibOffset < 0) {
193 sibOffset = Math.abs(sibOffset);
194 for (int i = 0; i < sibOffset; i++) {
195 functionArg = "lsib("+functionArg+")";
196 }
197 } else if (sibOffset > 0) {
198 for (int i = 0; i < sibOffset; i++) {
199 functionArg = "rsib("+functionArg+")";
200 }
201 }
202
203 if (deprel == true && (pppath == true || pplifted == true || ppcoveredRoot == true)) {
204 featureSpecModels.add(specModelIndex, mergePseudoProjColumns(functionArg));
205 } else {
206 if (suffixLength != 0) {
207 featureSpecModels.add(specModelIndex, "Suffix("+featureText.toString()+functionArg+"),"+suffixLength+")");
208 } else if (splitfeats.equals("Split(")) {
209 featureSpecModels.add(specModelIndex, splitfeats+featureText.toString()+functionArg+"),\\|)");
210 } else {
211 featureSpecModels.add(specModelIndex, featureText.toString()+functionArg+")");
212 }
213 }
214
215 }
216 }
217 }
218
219 private String mergePseudoProjColumns(String functionArg) {
220 StringBuilder newFeatureText = new StringBuilder();
221 int c = 1;
222
223 if (pplifted == true) { c++; };
224 if (pppath == true) { c++; };
225 if (ppcoveredRoot == true) { c++; };
226
227 if (c == 1) { // no merge
228 newFeatureText.append("OutputColumn(DEPREL, ");
229 newFeatureText.append(functionArg);
230 newFeatureText.append(')');
231 return newFeatureText.toString();
232 }
233 if (c == 2) {
234 newFeatureText.append("Merge(");
235 newFeatureText.append("OutputColumn(DEPREL, ");
236 newFeatureText.append(functionArg);
237 newFeatureText.append("), ");
238 if (pplifted == true) {
239 newFeatureText.append("OutputTable(PPLIFTED, ");
240 newFeatureText.append(functionArg);
241 newFeatureText.append(")");
242 }
243 if (pppath == true) {
244 newFeatureText.append("OutputTable(PPPATH, ");
245 newFeatureText.append(functionArg);
246 newFeatureText.append(")");
247 }
248 if (ppcoveredRoot == true) {
249 newFeatureText.append("OutputTable(PPCOVERED, ");
250 newFeatureText.append(functionArg);
251 newFeatureText.append(")");
252 }
253 newFeatureText.append(")");
254 } else if (c == 3) { // use Merge3
255 int i = 0;
256 newFeatureText.append("Merge3(");
257 newFeatureText.append("OutputColumn(DEPREL, ");
258 newFeatureText.append(functionArg);
259 newFeatureText.append("), ");
260 i++;
261 if (pplifted == true) {
262 newFeatureText.append("OutputTable(PPLIFTED, ");
263 newFeatureText.append(functionArg);
264 i++;
265 if (i<3) {
266 newFeatureText.append("), ");
267 } else {
268 newFeatureText.append(")");
269 }
270 }
271 if (pppath == true) {
272 newFeatureText.append("OutputTable(PPPATH, ");
273 newFeatureText.append(functionArg);
274 i++;
275 if (i<3) {
276 newFeatureText.append("), ");
277 } else {
278 newFeatureText.append(")");
279 }
280 }
281 if (ppcoveredRoot == true) {
282 newFeatureText.append("OutputTable(PPCOVERED, ");
283 newFeatureText.append(functionArg);
284 i++;
285 if (i<3) {
286 newFeatureText.append("), ");
287 } else {
288 newFeatureText.append(")");
289 }
290 }
291 newFeatureText.append(")");
292 } else { // c == 4
293 newFeatureText.append("Merge(Merge(");
294 newFeatureText.append("OutputColumn(DEPREL, ");
295 newFeatureText.append(functionArg);
296 newFeatureText.append("), ");
297 newFeatureText.append("OutputTable(PPLIFTED, ");
298 newFeatureText.append(functionArg);
299 newFeatureText.append(")), Merge(");
300 newFeatureText.append("OutputTable(PPPATH, ");
301 newFeatureText.append(functionArg);
302 newFeatureText.append("), ");
303 newFeatureText.append("OutputTable(PPCOVERED, ");
304 newFeatureText.append(functionArg);
305 newFeatureText.append(")))");
306 }
307 return newFeatureText.toString();
308 }
309
310 public EnumMap<ColumnNames, String> getColumnNameMap() {
311 return columnNameMap;
312 }
313
314 public void initializeColumnNameMap() {
315 columnNameMap = new EnumMap<ColumnNames, String>(ColumnNames.class);
316 columnNameMap.put(ColumnNames.POS, "POSTAG");
317 columnNameMap.put(ColumnNames.CPOS, "CPOSTAG");
318 columnNameMap.put(ColumnNames.DEP, "DEPREL");
319 columnNameMap.put(ColumnNames.LEX, "FORM");
320 columnNameMap.put(ColumnNames.LEMMA, "LEMMA");
321 columnNameMap.put(ColumnNames.FEATS, "FEATS");
322 }
323
324 public void setColumnNameMap(EnumMap<ColumnNames, String> columnNameMap) {
325 this.columnNameMap = columnNameMap;
326 }
327
328 public EnumMap<DataStructures, String> getDataStructuresMap() {
329 return dataStructuresMap;
330 }
331
332 //TODO Fix covington
333 public void initializeDataStructuresMap() {
334 dataStructuresMap = new EnumMap<DataStructures, String>(DataStructures.class);
335 dataStructuresMap.put(DataStructures.STACK, "Stack");
336 dataStructuresMap.put(DataStructures.INPUT, "Input");
337 }
338
339 public void setDataStructuresMap(EnumMap<DataStructures, String> dataStructuresMap) {
340 this.dataStructuresMap = dataStructuresMap;
341 }
342
343 public boolean isUseSplitFeats() {
344 return useSplitFeats;
345 }
346
347 public void setUseSplitFeats(boolean useSplitFeats) {
348 this.useSplitFeats = useSplitFeats;
349 }
350
351 public boolean isCovington() {
352 return covington;
353 }
354
355 public void setCovington(boolean covington) {
356 this.covington = covington;
357 }
358
359 public boolean isPppath() {
360 return pppath;
361 }
362
363 public void setPppath(boolean pppath) {
364 this.pppath = pppath;
365 }
366
367 public boolean isPplifted() {
368 return pplifted;
369 }
370
371 public void setPplifted(boolean pplifted) {
372 this.pplifted = pplifted;
373 }
374
375 public boolean isPpcoveredRoot() {
376 return ppcoveredRoot;
377 }
378
379 public void setPpcoveredRoot(boolean ppcoveredRoot) {
380 this.ppcoveredRoot = ppcoveredRoot;
381 }
382
383 public String toString() {
384 StringBuilder sb = new StringBuilder();
385 sb.append("Mapping of column names:\n");
386 for (ColumnNames columnName : ColumnNames.values()) {
387 sb.append(columnName.toString()+"\t"+columnNameMap.get(columnName)+"\n");
388 }
389 sb.append("Mapping of data structures:\n");
390 for (DataStructures dataStruct : DataStructures.values()) {
391 sb.append(dataStruct.toString()+"\t"+dataStructuresMap.get(dataStruct)+"\n");
392 }
393 sb.append("Split FEATS column: "+useSplitFeats+"\n");
394 return sb.toString();
395 }
396 }