001 package org.maltparser.core.syntaxgraph.reader;
002
003 import java.io.BufferedReader;
004 import java.io.FileInputStream;
005 import java.io.FileNotFoundException;
006 import java.io.IOException;
007 import java.io.InputStream;
008 import java.io.InputStreamReader;
009 import java.io.UnsupportedEncodingException;
010 import java.net.URL;
011 import java.util.Iterator;
012
013 import org.maltparser.core.exception.MaltChainedException;
014 import org.maltparser.core.io.dataformat.ColumnDescription;
015 import org.maltparser.core.io.dataformat.DataFormatException;
016 import org.maltparser.core.io.dataformat.DataFormatInstance;
017 import org.maltparser.core.syntaxgraph.DependencyStructure;
018 import org.maltparser.core.syntaxgraph.Element;
019 import org.maltparser.core.syntaxgraph.TokenStructure;
020 import org.maltparser.core.syntaxgraph.edge.Edge;
021 /**
022 *
023 *
024 * @author Johan Hall
025 */
026 public class TabReader implements SyntaxGraphReader {
027 private BufferedReader reader;
028 private int sentenceCount;
029 private final StringBuilder input;
030 private DataFormatInstance dataFormatInstance;
031 private static final String IGNORE_COLUMN_SIGN = "_";
032 private static final char TAB = '\t';
033 private static final char NEWLINE = '\n';
034 private static final char CARRIAGE_RETURN = '\r';
035 private String fileName = null;
036 private URL url = null;
037 private String charsetName;
038 private int nIterations;
039 private int cIterations;
040 private boolean closeStream = true;
041
042 public TabReader() {
043 input = new StringBuilder();
044 nIterations = 1;
045 cIterations = 1;
046 }
047
048 private void reopen() throws MaltChainedException {
049 close();
050 if (fileName != null) {
051 open(fileName, charsetName);
052 } else if (url != null) {
053 open(url, charsetName);
054 } else {
055 throw new DataFormatException("The input stream cannot be reopen. ");
056 }
057 }
058
059 public void open(String fileName, String charsetName) throws MaltChainedException {
060 setFileName(fileName);
061 setCharsetName(charsetName);
062 try {
063 open(new FileInputStream(fileName), charsetName);
064 } catch (FileNotFoundException e) {
065 throw new DataFormatException("The input file '"+fileName+"' cannot be found. ", e);
066 }
067 }
068
069 public void open(URL url, String charsetName) throws MaltChainedException {
070 setUrl(url);
071 setCharsetName(charsetName);
072 if (url == null) {
073 throw new DataFormatException("The input file cannot be found. ");
074 }
075 try {
076 open(url.openStream(), charsetName);
077 } catch (IOException e) {
078 throw new DataFormatException("The URL '"+url.toString()+"' cannot be opened. ", e);
079 }
080 }
081
082 public void open(InputStream is, String charsetName) throws MaltChainedException {
083 try {
084 if (is == System.in) {
085 closeStream = false;
086 }
087 open(new InputStreamReader(is, charsetName));
088 } catch (UnsupportedEncodingException e) {
089 throw new DataFormatException("The character encoding set '"+charsetName+"' isn't supported. ", e);
090 }
091 }
092
093 private void open(InputStreamReader isr) throws MaltChainedException {
094 setReader(new BufferedReader(isr));
095 setSentenceCount(0);
096 }
097
098 public void readProlog() throws MaltChainedException {
099
100 }
101
102 public boolean readSentence(TokenStructure syntaxGraph) throws MaltChainedException {
103 if (syntaxGraph == null || dataFormatInstance == null) {
104 return false;
105 }
106
107 Element node = null;
108 Edge edge = null;
109 input.setLength(0);
110 int i = 0;
111 int terminalCounter = 0;
112 int nNewLines = 0;
113 syntaxGraph.clear();
114 Iterator<ColumnDescription> columns = dataFormatInstance.iterator();
115 while (true) {
116 int c;
117
118 try {
119 c = reader.read();
120 } catch (IOException e) {
121 close();
122 throw new DataFormatException("Error when reading from the input file. ", e);
123 }
124 if (c == TAB || c == NEWLINE || c == CARRIAGE_RETURN || c == -1) {
125 if (input.length() != 0) {
126 if (i == 0) {
127 terminalCounter++;
128 node = syntaxGraph.addTokenNode(terminalCounter);
129 }
130 ColumnDescription column = null;
131 if (columns.hasNext()) {
132 column = columns.next();
133 if (column.getCategory() == ColumnDescription.INPUT && node != null) {
134 syntaxGraph.addLabel(node, column.getName(), input.toString());
135 } else if (column.getCategory() == ColumnDescription.HEAD) {
136 if (syntaxGraph instanceof DependencyStructure) {
137 if (column.getType() != ColumnDescription.IGNORE && !input.toString().equals(IGNORE_COLUMN_SIGN)) { // bugfix
138 //if (!input.toString().equals(IGNORE_COLUMN_SIGN)) {
139 edge = ((DependencyStructure)syntaxGraph).addDependencyEdge(Integer.parseInt(input.toString()), terminalCounter);
140 }
141 }
142 else {
143 close();
144 throw new DataFormatException("The input graph is not a dependency graph and therefore it is not possible to add dependncy edges. ");
145 }
146 } else if (column.getCategory() == ColumnDescription.DEPENDENCY_EDGE_LABEL && edge != null) {
147 //if (column.getType() != ColumnDescription.IGNORE && !input.toString().equals(IGNORE_COLUMN_SIGN)) { // bugfix not working for everybody
148 syntaxGraph.addLabel(edge, column.getName(), input.toString());
149 //} // bugfix
150 }
151 }
152 input.setLength(0);
153 nNewLines = 0;
154 i++;
155 } else if (c == TAB) {
156 throw new MaltChainedException("The input file '"+fileName+"' contains a column where the value is an empty string. Please check your input file. ");
157 }
158 if (c == NEWLINE) {
159 nNewLines++;
160 i = 0;
161 columns = dataFormatInstance.iterator();
162 }
163 } else {
164 input.append((char)c);
165 }
166
167 if (nNewLines == 2 && c == NEWLINE) {
168 if (syntaxGraph.hasTokens()) {
169 sentenceCount++;
170 }
171 return true;
172 } else if (c == -1) {
173 if (syntaxGraph.hasTokens()) {
174 sentenceCount++;
175 }
176 if (cIterations < nIterations) {
177 cIterations++;
178 reopen();
179 return true;
180 }
181
182 return false;
183 }
184 }
185 }
186
187 public void readEpilog() throws MaltChainedException {
188
189 }
190
191 public BufferedReader getReader() {
192 return reader;
193 }
194
195 public void setReader(BufferedReader reader) throws MaltChainedException {
196 close();
197 this.reader = reader;
198 }
199
200 public DataFormatInstance getDataFormatInstance() {
201 return dataFormatInstance;
202 }
203
204 public void setDataFormatInstance(DataFormatInstance dataFormatInstance) {
205 this.dataFormatInstance = dataFormatInstance;
206 }
207
208 public int getSentenceCount() throws MaltChainedException {
209 return sentenceCount;
210 }
211
212 public void setSentenceCount(int sentenceCount) {
213 this.sentenceCount = sentenceCount;
214 }
215
216 public String getOptions() {
217 return null;
218 }
219
220 public void setOptions(String optionString) throws MaltChainedException {
221
222 }
223
224 public String getFileName() {
225 return fileName;
226 }
227
228 public void setFileName(String fileName) {
229 this.fileName = fileName;
230 }
231
232 public URL getUrl() {
233 return url;
234 }
235
236 public void setUrl(URL url) {
237 this.url = url;
238 }
239
240 public String getCharsetName() {
241 return charsetName;
242 }
243
244 public void setCharsetName(String charsetName) {
245 this.charsetName = charsetName;
246 }
247
248 public int getNIterations() {
249 return nIterations;
250 }
251
252 public void setNIterations(int iterations) {
253 nIterations = iterations;
254 }
255
256 public int getIterationCounter() {
257 return cIterations;
258 }
259
260 public void close() throws MaltChainedException {
261 try {
262 if (reader != null) {
263 if (closeStream) {
264 reader.close();
265 }
266 reader = null;
267 }
268 } catch (IOException e) {
269 throw new DataFormatException("Error when closing the input file. ", e);
270 }
271 }
272
273 public void clear() throws MaltChainedException {
274 close();
275 input.setLength(0);
276 dataFormatInstance = null;
277 sentenceCount = 0;
278 }
279 }