KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > jutils > csv > CSVReader


/*------------------------------------------------------------------------------
Name: CSVReader.java
Project: jutils.org
Comment: Reads CSV (Comma Separated Value) files
Version: $Id: CSVReader.java,v 1.1 2004/04/07 07:40:45 laurent Exp $
Author: Roedy Green roedy@mindprod.com, Heinrich Goetzger goetzger@gmx.net
------------------------------------------------------------------------------*/

8 package org.jutils.csv;
9
import java.io.BufferedReader;
import java.io.EOFException;
import java.io.FileReader;
import java.io.IOException;
import java.io.Reader;
import java.io.UncheckedIOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Vector;
16
/**
 * Reads CSV (Comma Separated Value) files.
 *
 * This format is mostly used by Microsoft Word and Excel.
 * Fields are separated by commas, and enclosed in
 * quotes if they contain commas or quotes.
 * Embedded quotes are doubled.
 * Embedded spaces do not normally require surrounding quotes.
 * The last field on the line is not followed by a comma.
 * Null fields are represented by two commas in a row.
 * We ignore leading and trailing spaces on fields, even inside quotes.
 * A '#' outside a quoted field starts a comment that runs to the end of
 * the line; inside a quoted field it is an ordinary character.
 *
 * @author copyright (c) 2002 Roedy Green Canadian Mind Products
 * Roedy posted this code on Newsgroups:comp.lang.java.programmer on 27th March 2002.
 *
 * Heinrich added some stuff like comment ability and linewise working.
 *
 */
public class CSVReader {

   /**
    * Enables the file based test drivers reachable from {@link #main}.
    */
   private static final boolean debugging = true;

   /**
    * category of end of line char.
    */
   private static final int EOL = 0;

   /**
    * category of ordinary character
    */
   private static final int ORDINARY = 1;

   /**
    * category of the quote mark "
    */
   private static final int QUOTE = 2;

   /**
    * category of the separator, e.g. comma, semicolon
    * or tab.
    */
   private static final int SEPARATOR = 3;

   /**
    * category of characters treated as white space.
    */
   private static final int WHITESPACE = 4;

   /**
    * category of the comment marker '#'. Ends the line everywhere
    * except inside a quoted field, where it is kept literally.
    */
   private static final int COMMENT = 5;

   /**
    * parser: We are in blanks before the field.
    */
   private static final int SEEKINGSTART = 0;

   /**
    * parser: We are in the middle of an ordinary field.
    */
   private static final int INPLAIN = 1;

   /**
    * parser: We are in middle of field surrounded in quotes.
    */
   private static final int INQUOTED = 2;

   /**
    * parser: We have just hit a quote, might be doubled
    * or might be last one.
    */
   private static final int AFTERENDQUOTE = 3;

   /**
    * parser: We are in blanks after the field looking for the separator
    */
   private static final int SKIPPINGTAIL = 4;

   /**
    * Reader source of the CSV fields to be read.
    * Set to null by {@link #close()}.
    */
   private BufferedReader r;

   /**
    * field separator character, usually ',' in North America,
    * ';' in Europe and sometimes '\t' for tab.
    */
   private final char separator;

   /**
    * The line we are parsing.
    * null means none read yet.
    * Line contains unprocessed chars. Processed ones are removed.
    */
   private String line = null;

   /**
    * How many lines we have read so far.
    * Used in error messages.
    */
   private int lineCount = 0;

   /**
    * Constructor
    *
    * @param r input Reader source of CSV Fields to read.
    * @param separator
    * field separator character, usually ',' in North America,
    * ';' in Europe and sometimes '\t' for tab.
    */
   public CSVReader(Reader r, char separator) {
      /* convert Reader to BufferedReader if necessary */
      if ( r instanceof BufferedReader ) {
         this.r = (BufferedReader) r;
      } else {
         this.r = new BufferedReader(r);
      }
      this.separator = separator;
   } // end of CSVReader

   /**
    * Constructor with default field separator ','.
    *
    * @param r input Reader source of CSV Fields to read.
    */
   public CSVReader(Reader r) {
      this(r, ',');
   } // end of CSVReader

   /**
    * categorise a character for the finite state machine.
    *
    * The fixed characters (blank, CR, 0xff, '#', newline, quote) are
    * tested before the separator, so a separator equal to one of them
    * is never reported as SEPARATOR.
    *
    * @param c the character to categorise
    * @return integer representing the character's category.
    */
   private int categorise(char c) {
      switch ( c ) {
         case ' ':
         case '\r':
         case 0xff:
            return WHITESPACE;
         case '#':
            return COMMENT;
         case '\n':
            return EOL; /* artificially applied to end of line */
         case '\"':
            return QUOTE;
         default:
            if ( c == separator ) {
               /* dynamically determined so can't use as case label */
               return SEPARATOR;
            }
            if ( '!' <= c && c <= '~' ) {
               /* do our tests in crafted order, hoping for an early return */
               return ORDINARY;
            }
            if ( c <= 0x20 || Character.isWhitespace(c) ) {
               return WHITESPACE;
            }
            return ORDINARY;
      } // end of switch
   } // end of categorise

   /**
    * Read the fields of the next non-empty line.
    *
    * Blank lines and lines holding only a comment are skipped.
    * A trailing empty field is not reported: "a," yields one field.
    *
    * @return the trimmed fields of the line, never an empty array;
    * null at end of file.
    *
    * @exception UncheckedIOException
    * if the line is malformed or the underlying Reader fails. The
    * rest of an offending line is discarded first, so a caller that
    * catches the exception can carry on with the following line.
    */
   public String[] getLine() {
      List<String> fields = new ArrayList<String>();

      try {
         /* keep going until a line actually delivers at least one field */
         while ( fields.isEmpty() ) {
            String token;
            while ( (token = get()) != null ) {
               fields.add(token);
            } // end of while
         } // end of while
      } catch ( EOFException e ) {
         return null;
      } catch ( IOException e ) {
         /* never leave the unparsable remainder behind, otherwise every
            later call would fail on the very same text again */
         line = null;
         throw new UncheckedIOException(e);
      }

      return fields.toArray(new String[fields.size()]);
   } // end of getLine

   /**
    * Read one field from the CSV file
    *
    * @return String value, even if the field is numeric. Surrounded
    * and embedded double quotes are stripped.
    * possibly "". null means end of line.
    *
    * @exception EOFException
    * at end of file after all the fields have
    * been read.
    *
    * @exception IOException
    * Some problem reading the file, possibly malformed data.
    * The rest of a malformed line is discarded.
    */
   private String get() throws EOFException, IOException {
      StringBuilder field = new StringBuilder(50);
      /* we implement the parser as a finite state automaton with five states. */
      readLine();

      int state = SEEKINGSTART; /* start seeking, even if partway through a line */
      /* don't need to maintain state between fields. */

      /* loop for each char in the line to find a field */
      /* guaranteed to leave early by hitting EOL */
      for ( int i = 0; i < line.length(); i++ ) {
         char c = line.charAt(i);
         int category = categorise(c);
         switch ( state ) {
            case SEEKINGSTART: {
               /* in blanks before field */
               switch ( category ) {
                  case WHITESPACE:
                     /* ignore */
                     break;
                  case QUOTE:
                     state = INQUOTED;
                     break;
                  case SEPARATOR:
                     /* end of empty field */
                     line = line.substring(i + 1);
                     return "";
                  case COMMENT:
                  case EOL:
                     /* end of line */
                     line = null;
                     return null;
                  case ORDINARY:
                     field.append(c);
                     state = INPLAIN;
                     break;
               }
               break;
            } // end of SEEKINGSTART

            case INPLAIN: {
               /* in middle of ordinary field */
               switch ( category ) {
                  case QUOTE:
                     throw malformed("Malformed CSV stream. Missing quote at start of field on line " + lineCount);
                  case SEPARATOR:
                     /* done */
                     line = line.substring(i + 1);
                     return field.toString().trim();
                  case COMMENT:
                  case EOL:
                     line = line.substring(i); /* push EOL back */
                     return field.toString().trim();
                  case WHITESPACE:
                     field.append(' ');
                     break;
                  case ORDINARY:
                     field.append(c);
                     break;
               }
               break;
            } // end of INPLAIN

            case INQUOTED: {
               /* in middle of field surrounded in quotes */
               switch ( category ) {
                  case QUOTE:
                     state = AFTERENDQUOTE;
                     break;
                  case EOL:
                     throw malformed("Malformed CSV stream. Missing quote after field on line "+lineCount);
                  case WHITESPACE:
                     field.append(' ');
                     break;
                  case COMMENT:
                     /* a '#' between quotes is data, not a comment */
                  case SEPARATOR:
                  case ORDINARY:
                     field.append(c);
                     break;
               }
               break;
            } // end of INQUOTED

            case AFTERENDQUOTE: {
               /* In situation like this "xxx" which may
                  turn out to be xxx""xxx" or "xxx",
                  We find out here. */
               switch ( category ) {
                  case QUOTE:
                     field.append(c);
                     state = INQUOTED;
                     break;
                  case SEPARATOR:
                     /* we are done.*/
                     line = line.substring(i + 1);
                     return field.toString().trim();
                  case COMMENT:
                  case EOL:
                     line = line.substring(i); /* push back eol */
                     return field.toString().trim();
                  case WHITESPACE:
                     /* ignore trailing spaces up to separator */
                     state = SKIPPINGTAIL;
                     break;
                  case ORDINARY:
                     throw malformed("Malformed CSV stream, missing separator after field on line " + lineCount);
               }
               break;
            } // end of AFTERENDQUOTE

            case SKIPPINGTAIL: {
               /* in spaces after field seeking separator */
               switch ( category ) {
                  case SEPARATOR:
                     /* we are done.*/
                     line = line.substring(i + 1);
                     return field.toString().trim();
                  case COMMENT:
                  case EOL:
                     line = line.substring(i); /* push back eol */
                     return field.toString().trim();
                  case WHITESPACE:
                     /* ignore trailing spaces up to separator */
                     break;
                  case QUOTE:
                  case ORDINARY:
                     throw malformed("Malformed CSV stream, missing separator after field on line " + lineCount);
               } // end of switch
               break;
            } // end of SKIPPINGTAIL

         } // end switch(state)
      } // end for

      throw new IOException("Program logic bug. Should not reach here. Processing line " + lineCount);
   } // end get

   /**
    * Build the exception for malformed data and drop the rest of the
    * current line, so that the next read starts on a fresh line instead
    * of failing on the same text again.
    *
    * @param message complete error message.
    * @return the exception for the caller to throw.
    */
   private IOException malformed(String message) {
      line = null;
      return new IOException(message);
   } // end of malformed

   /**
    * Make sure a line is available for parsing.
    * Does nothing if there already is one.
    *
    * @exception EOFException
    * if the Reader has no more lines.
    * @exception IOException
    * if the Reader fails or this CSVReader has been closed.
    */
   private void readLine() throws EOFException, IOException {
      if ( line != null ) {
         return;
      }
      if ( r == null ) {
         throw new IOException("CSVReader has been closed");
      }
      String next = r.readLine(); /* this strips platform specific line ending */
      if ( next == null ) {
         /* null means EOF, yet another inconsistent Java convention. */
         throw new EOFException();
      }
      line = next + '\n'; /* apply standard line end for parser to find */
      lineCount++;
   } // end of readLine

   /**
    * Skip over fields you don't want to process.
    *
    * @param fields How many field you want to bypass reading.
    * The newline counts as one field.
    * @exception EOFException
    * at end of file after all the fields have
    * been read.
    * @exception IOException
    * Some problem reading the file, possibly malformed data.
    */
   public void skip(int fields) throws EOFException, IOException {
      for ( int i = 0; i < fields; i++ ) {
         // throw results away
         get();
      }
   } // end of skip

   /**
    * Skip over remaining fields on this line you don't want to process.
    * If no line is currently being parsed, the next line is read and
    * skipped as a whole.
    *
    * @exception EOFException
    * at end of file after all the fields have
    * been read.
    * @exception IOException
    * Some problem reading the file, possibly malformed data.
    */
   public void skipToNextLine() throws EOFException, IOException {
      if ( line == null ) {
         readLine();
      }
      line = null;
   } // end of skipToNextLine

   /**
    * Close the Reader. Calling it again has no effect.
    *
    * @exception IOException
    * if closing the underlying Reader fails.
    */
   public void close() throws IOException {
      if ( r != null ) {
         r.close();
         r = null;
      }
   } // end of close

   /**
    * Close a test reader, reporting a failure the same way the
    * test drivers report every other I/O problem.
    *
    * @param csv reader to close, may be null.
    */
   private static void closeQuietly(CSVReader csv) {
      if ( csv == null ) {
         return;
      }
      try {
         csv.close();
      } catch ( IOException e ) {
         e.printStackTrace();
         System.out.println(e.getMessage());
      }
   } // end of closeQuietly

   /**
    * Test driver: prints every field on a line of its own; the end of
    * each input line shows up as "null".
    *
    * @param args [0]: The name of the file.
    */
   private static void testSingleTokens(String[] args) {
      if ( !debugging ) {
         return;
      }
      CSVReader csv = null;
      try {
         // read test file
         csv = new CSVReader(new FileReader(args[0]), ',');
         while ( true ) {
            System.out.println(csv.get());
         }
      } catch ( EOFException e ) {
         /* regular end of input */
      } catch ( IOException e ) {
         e.printStackTrace();
         System.out.println(e.getMessage());
      } finally {
         closeQuietly(csv);
      }
   } // end of testSingleTokens

   /**
    * Test driver: prints every parsed line with its fields re-joined
    * by a comma.
    *
    * @param args [0]: The name of the file.
    */
   private static void testLines(String[] args) {
      if ( !debugging ) {
         return;
      }
      final String DEL = ",";
      CSVReader csv = null;
      try {
         // read test file
         csv = new CSVReader(new FileReader(args[0]), ',');

         String[] loadLine;
         while ( (loadLine = csv.getLine()) != null ) {
            StringBuilder logBuffer = new StringBuilder();
            for ( int i = 0; i < loadLine.length; i++ ) {
               if ( i > 0 ) {
                  logBuffer.append(DEL);
               }
               logBuffer.append(loadLine[i]);
            }
            System.out.println(logBuffer.toString());
         } // end of while
      } catch ( IOException e ) {
         e.printStackTrace();
         System.out.println(e.getMessage());
      } catch ( UncheckedIOException e ) {
         /* malformed data or read failure reported by getLine() */
         e.printStackTrace();
         System.out.println(e.getMessage());
      } finally {
         closeQuietly(csv);
      }
   } // end of testLines

   /**
    * Test driver
    *
    * @param args [0]: The name of the file.
    */
   public static void main(String[] args) {
      if ( args.length < 1 ) {
         System.out.println("usage: java org.jutils.csv.CSVReader <csv file>");
         return;
      }
      //testSingleTokens(args);
      testLines(args);
   } // end main

} // end CSVReader
492

493 // end of file
494

495
Popular Tags