KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > com > Ostermiller > util > CSVParser


1 /*
2  * Read files in comma separated value format.
3  * Copyright (C) 2001-2004 Stephen Ostermiller
4  * http://ostermiller.org/contact.pl?regarding=Java+Utilities
5  *
6  * This program is free software; you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License as published by
8  * the Free Software Foundation; either version 2 of the License, or
9  * (at your option) any later version.
10  *
11  * This program is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14  * GNU General Public License for more details.
15  *
16  * See COPYING.TXT for details.
17  */

18
19 package com.Ostermiller.util;
20 import java.io.*;
21 import java.util.*;
22
23 /**
24  * Read files in comma separated value format.
25  * More information about this class is available from <a target="_top" HREF=
26  * "http://ostermiller.org/utils/CSVLexer.html">ostermiller.org</a>.
27  *
28  * CSV is a file format used as a portable representation of a database.
29  * Each line is one entry or record and the fields in a record are separated by commas.
30  * Commas may be preceded or followed by arbitrary space and/or tab characters which are
31  * ignored.
32  * <P>
33  * If field includes a comma or a new line, the whole field must be surrounded with double quotes.
34  * When the field is in quotes, any quote literals must be escaped by \" Backslash
35  * literals must be escaped by \\. Otherwise a backslash and the character following
36  * will be treated as the following character, IE. "\n" is equivalent to "n". Other escape
37  * sequences may be set using the setEscapes() method. Text that comes after quotes that have
38  * been closed but come before the next comma will be ignored.
39  * <P>
40  * Empty fields are returned as as String of length zero: "". The following line has three empty
41  * fields and three non-empty fields in it. There is an empty field on each end, and one in the
42  * middle. One token is returned as a space.<br>
43  * <pre>,second,," ",fifth,</pre>
44  * <P>
45  * Blank lines are always ignored. Other lines will be ignored if they start with a
46  * comment character as set by the setCommentStart() method.
47  * <P>
48  * An example of how CVSLexer might be used:
49  * <pre>
50  * CSVParser shredder = new CSVParser(System.in);
51  * shredder.setCommentStart("#;!");
52  * shredder.setEscapes("nrtf", "\n\r\t\f");
53  * String t;
54  * while ((t = shredder.nextValue()) != null){
55  * System.out.println("" + shredder.lastLineNumber() + " " + t);
56  * }
57  * </pre>
58  * <P>
59  * Some applications do not output CSV according to the generally accepted standards and this parse may
60  * not be able to handle it. One such application is the Microsoft Excel spreadsheet. A
61  * separate class must be use to read
62  * <a HREF="http://ostermiller.org/utils/ExcelCSV.html">Excel CSV</a>.
63  *
64  * @see com.Ostermiller.util.ExcelCSVParser
65  *
66  * @author Stephen Ostermiller http://ostermiller.org/contact.pl?regarding=Java+Utilities
67  * @since ostermillerutils 1.00.00
68  */

69 public class CSVParser implements CSVParse {
70
71     /**
72      * InputStream on which this parser is based.
73      *
74      * @since ostermillerutils 1.02.22
75      */

76     private InputStream inStream;
77
78     /**
79      * Reader on which this parser is based.
80      *
81      * @since ostermillerutils 1.02.22
82      */

83     private Reader inReader;
84
85     /**
86      * Does all the dirty work.
87      * Calls for new tokens are routed through
88      * this object.
89      *
90      * @since ostermillerutils 1.00.00
91      */

92     private CSVLexer lexer;
93
94     /**
95      * Token cache. Used for when we request a token
96      * from the lexer but can't return it because its
97      * on the next line.
98      *
99      * @since ostermillerutils 1.00.00
100      */

101     private String JavaDoc tokenCache;
102
103     /**
104      * Line cache. The line number that goes along with
105      * the tokenCache. Not valid if the tokenCache is
106      * null.
107      *
108      * @since ostermillerutils 1.00.00
109      */

110     private int lineCache;
111
112     /**
113      * The line number the last token came from, or -1 if
114      * no tokens have been returned.
115      *
116      * @since ostermillerutils 1.00.00
117      */

118     private int lastLine = -1;
119
120     /**
121      * Create a parser to parse comma separated values from
122      * an InputStream.
123      * <p>
124      * Byte to character conversion is done using the platform
125      * default locale.
126      *
127      * @param in stream that contains comma separated values.
128      *
129      * @since ostermillerutils 1.00.00
130      */

131     public CSVParser(InputStream in){
132         inStream = in;
133         lexer = new CSVLexer(in);
134     }
135
136     /**
137      * Create a parser to parse delimited values from
138      * an InputStream.
139      * <p>
140      * Byte to character conversion is done using the platform
141      * default locale.
142      *
143      * @param in stream that contains comma separated values.
144      * @param delimiter record separator
145      *
146      * @throws BadDelimiterException if the specified delimiter cannot be used
147      *
148      * @since ostermillerutils 1.02.24
149      */

150     public CSVParser(InputStream in, char delimiter) throws BadDelimiterException {
151         inStream = in;
152         lexer = new CSVLexer(in);
153         changeDelimiter(delimiter);
154     }
155
156     /**
157      * Create a parser to parse comma separated values from
158      * a Reader.
159      *
160      * @param in reader that contains comma separated values.
161      *
162      * @since ostermillerutils 1.00.00
163      */

164     public CSVParser(Reader in){
165         inReader = in;
166         lexer = new CSVLexer(in);
167     }
168
169     /**
170      * Create a parser to parse delimited values from
171      * a Reader.
172      *
173      * @param in reader that contains comma separated values.
174      * @param delimiter record separator
175      *
176      * @throws BadDelimiterException if the specified delimiter cannot be used
177      *
178      * @since ostermillerutils 1.02.24
179      */

180     public CSVParser(Reader in, char delimiter) throws BadDelimiterException {
181         inReader = in;
182         lexer = new CSVLexer(in);
183         changeDelimiter(delimiter);
184     }
185
186     /**
187      * Create a parser to parse delimited values from
188      * an InputStream.
189      * <p>
190      * Byte to character conversion is done using the platform
191      * default locale.
192      *
193      * @param in stream that contains comma separated values.
194      * @param escapes a list of characters that will represent escape sequences.
195      * @param replacements the list of replacement characters for those escape sequences.
196      * @param commentDelims list of characters a comment line may start with.
197      * @param delimiter record separator
198      *
199      * @throws BadDelimiterException if the specified delimiter cannot be used
200      *
201      * @since ostermillerutils 1.02.24
202      */

203     public CSVParser(InputStream in, char delimiter, String JavaDoc escapes, String JavaDoc replacements, String JavaDoc commentDelims) throws BadDelimiterException {
204         inStream = in;
205         lexer = new CSVLexer(in);
206         setEscapes(escapes, replacements);
207         setCommentStart(commentDelims);
208         changeDelimiter(delimiter);
209     }
210
211     /**
212      * Create a parser to parse comma separated values from
213      * an InputStream.
214      * <p>
215      * Byte to character conversion is done using the platform
216      * default locale.
217      *
218      * @param in stream that contains comma separated values.
219      * @param escapes a list of characters that will represent escape sequences.
220      * @param replacements the list of replacement characters for those escape sequences.
221      * @param commentDelims list of characters a comment line may start with.
222      *
223      * @since ostermillerutils 1.00.00
224      */

225     public CSVParser(InputStream in, String JavaDoc escapes, String JavaDoc replacements, String JavaDoc commentDelims){
226         inStream = in;
227         lexer = new CSVLexer(in);
228         setEscapes(escapes, replacements);
229         setCommentStart(commentDelims);
230     }
231
232     /**
233      * Create a parser to parse delimited values from
234      * a Reader.
235      *
236      * @param in reader that contains comma separated values.
237      * @param escapes a list of characters that will represent escape sequences.
238      * @param replacements the list of replacement characters for those escape sequences.
239      * @param commentDelims list of characters a comment line may start with.
240      * @param delimiter record separator
241      *
242      * @throws BadDelimiterException if the specified delimiter cannot be used
243      *
244      * @since ostermillerutils 1.02.24
245      */

246     public CSVParser(Reader in, char delimiter, String JavaDoc escapes, String JavaDoc replacements, String JavaDoc commentDelims) throws BadDelimiterException {
247         inReader = in;
248         lexer = new CSVLexer(in);
249         setEscapes(escapes, replacements);
250         setCommentStart(commentDelims);
251         changeDelimiter(delimiter);
252     }
253
254     /**
255      * Create a parser to parse comma separated values from
256      * a Reader.
257      *
258      * @param in reader that contains comma separated values.
259      * @param escapes a list of characters that will represent escape sequences.
260      * @param replacements the list of replacement characters for those escape sequences.
261      * @param commentDelims list of characters a comment line may start with.
262      *
263      * @since ostermillerutils 1.00.00
264      */

265     public CSVParser(Reader in, String JavaDoc escapes, String JavaDoc replacements, String JavaDoc commentDelims){
266         inReader = in;
267         lexer = new CSVLexer(in);
268         setEscapes(escapes, replacements);
269         setCommentStart(commentDelims);
270     }
271
272     /**
273      * Close any stream upon which this parser is based.
274      *
275      * @since ostermillerutils 1.02.22
276      * @throws IOException if an error occurs while closing the stream.
277      */

278     public void close() throws IOException {
279         if (inStream != null) inStream.close();
280         if (inReader != null) inReader.close();
281     }
282
283     /**
284      * get the next value.
285      *
286      * @return the next value or null if there are no more values.
287      * @throws IOException if an error occurs while reading.
288      *
289      * @since ostermillerutils 1.00.00
290      */

291     public String JavaDoc nextValue() throws IOException {
292         if (tokenCache == null){
293             tokenCache = lexer.getNextToken();
294             lineCache = lexer.getLineNumber();
295         }
296         lastLine = lineCache;
297         String JavaDoc result = tokenCache;
298         tokenCache = null;
299         return result;
300     }
301
302     /**
303      * Get the line number that the last token came from.
304      * <p>
305      * New line breaks that occur in the middle of a token are no
306      * counted in the line number count.
307      *
308      * @return line number or -1 if no tokens have been returned yet.
309      *
310      * @since ostermillerutils 1.00.00
311      */

312     public int lastLineNumber(){
313         return lastLine;
314     }
315
316     /**
317      * Get all the values from a line.
318      * <p>
319      * If the line has already been partially read, only the
320      * values that have not already been read will be included.
321      *
322      * @return all the values from the line or null if there are no more values.
323      * @throws IOException if an error occurs while reading.
324      *
325      * @since ostermillerutils 1.00.00
326      */

327     public String JavaDoc[] getLine() throws IOException{
328         int lineNumber = -1;
329         Vector<String JavaDoc> v = new Vector<String JavaDoc>();
330         if (tokenCache != null){
331             v.add(tokenCache);
332             lineNumber = lineCache;
333         }
334         while ((tokenCache = lexer.getNextToken()) != null
335                 && (lineNumber == -1 || lexer.getLineNumber() == lineNumber)){
336             v.add(tokenCache);
337             lineNumber = lexer.getLineNumber();
338         }
339         if (v.size() == 0){
340             return null;
341         }
342         lastLine = lineNumber;
343         lineCache = lexer.getLineNumber();
344         String JavaDoc[] result = new String JavaDoc[v.size()];
345         return ((String JavaDoc[])v.toArray(result));
346     }
347
348     /**
349      * Get all the values from the file.
350      * <p>
351      * If the file has already been partially read, only the
352      * values that have not already been read will be included.
353      * <p>
354      * Each line of the file that has at least one value will be
355      * represented. Comments and empty lines are ignored.
356      * <p>
357      * The resulting double array may be jagged.
358      *
359      * @return all the values from the file or null if there are no more values.
360      * @throws IOException if an error occurs while reading.
361      *
362      * @since ostermillerutils 1.00.00
363      */

364     public String JavaDoc[][] getAllValues() throws IOException {
365         Vector<String JavaDoc[]> v = new Vector<String JavaDoc[]>();
366         String JavaDoc[] line;
367         while((line = getLine()) != null){
368             v.add(line);
369         }
370         if (v.size() == 0){
371             return null;
372         }
373         String JavaDoc[][] result = new String JavaDoc[v.size()][];
374         return ((String JavaDoc[][])v.toArray(result));
375     }
376
377     /**
378      * Specify escape sequences and their replacements.
379      * Escape sequences set here are in addition to \\ and \".
380      * \\ and \" are always valid escape sequences. This method
381      * allows standard escape sequenced to be used. For example
382      * "\n" can be set to be a newline rather than an 'n'.
383      * A common way to call this method might be:<br>
384      * <code>setEscapes("nrtf", "\n\r\t\f");</code><br>
385      * which would set the escape sequences to be the Java escape
386      * sequences. Characters that follow a \ that are not escape
387      * sequences will still be interpreted as that character.<br>
388      * The two arguments to this method must be the same length. If
389      * they are not, the longer of the two will be truncated.
390      *
391      * @param escapes a list of characters that will represent escape sequences.
392      * @param replacements the list of replacement characters for those escape sequences.
393      *
394      * @since ostermillerutils 1.00.00
395      */

396     public void setEscapes(String JavaDoc escapes, String JavaDoc replacements){
397         lexer.setEscapes(escapes, replacements);
398     }
399
400     /**
401      * Change this parser so that it uses a new delimiter.
402      * <p>
403      * The initial character is a comma, the delimiter cannot be changed
404      * to a quote or other character that has special meaning in CSV.
405      *
406      * @param newDelim delimiter to which to switch.
407      * @throws BadDelimiterException if the character cannot be used as a delimiter.
408      *
409      * @since ostermillerutils 1.02.08
410      */

411     public void changeDelimiter(char newDelim) throws BadDelimiterException {
412         lexer.changeDelimiter(newDelim);
413     }
414
415     /**
416      * Change this parser so that it uses a new character for quoting.
417      * <p>
418      * The initial character is a double quote ("), the delimiter cannot be changed
419      * to a comma or other character that has special meaning in CSV.
420      *
421      * @param newQuote character to use for quoting.
422      * @throws BadQuoteException if the character cannot be used as a quote.
423      *
424      * @since ostermillerutils 1.02.16
425      */

426     public void changeQuote(char newQuote) throws BadQuoteException {
427         lexer.changeQuote(newQuote);
428     }
429
430     /**
431      * Set the characters that indicate a comment at the beginning of the line.
432      * For example if the string "#;!" were passed in, all of the following lines
433      * would be comments:<br>
434      * <pre> # Comment
435      * ; Another Comment
436      * ! Yet another comment</pre>
437      * By default there are no comments in CVS files. Commas and quotes may not be
438      * used to indicate comment lines.
439      *
440      * @param commentDelims list of characters a comment line may start with.
441      *
442      * @since ostermillerutils 1.00.00
443      */

444     public void setCommentStart(String JavaDoc commentDelims){
445         lexer.setCommentStart(commentDelims);
446     }
447
448     /**
449      * Get the number of the line from which the last value was retrieved.
450      *
451      * @return line number or -1 if no tokens have been returned.
452      *
453      * @since ostermillerutils 1.00.00
454      */

455     public int getLastLineNumber(){
456         return lastLine;
457     }
458
459     /**
460      * Parse the given file for comma separated values and print the results
461      * to System.out.
462      *
463      * @param args First argument is the file name. System.in used if no filename given.
464      *
465      * @since ostermillerutils 1.00.00
466      */

467     private static void main(String JavaDoc[] args){
468         InputStream in;
469         try {
470             if (args.length > 0){
471                 File f = new File(args[0]);
472                 if (f.exists()){
473                     if (f.canRead()){
474                         in = new FileInputStream(f);
475                     } else {
476                         throw new IOException("Could not open " + args[0]);
477                     }
478                 } else {
479                     throw new IOException("Could not find " + args[0]);
480                 }
481             } else {
482                 in = System.in;
483             }
484             CSVParser p = new CSVParser(in);
485             p.setCommentStart("#;!");
486             p.setEscapes("nrtf", "\n\r\t\f");
487             String JavaDoc[] t;
488             while ((t = p.getLine()) != null){
489                 for (int i=0; i<t.length; i++){
490                     System.out.print('"' + t[i] + '"');
491                     if (i<t.length-1){
492                         System.out.print(", ");
493                     }
494                 }
495                 System.out.println();
496             }
497         } catch (IOException e){
498             System.out.println(e.getMessage());
499         }
500     }
501
502     /**
503      * Parse the comma delimited data from a string.
504      * <p>
505      * Only escaped backslashes and quotes will be recognized as escape sequences.
506      * The data will be treated as having no comments.
507      *
508      * @param s string with comma delimited data to parse.
509      * @return parsed data.
510      *
511      * @since ostermillerutils 1.02.03
512      */

513     public static String JavaDoc[][] parse(String JavaDoc s){
514         try {
515             return (new CSVParser(new StringReader(s))).getAllValues();
516         } catch (IOException x){
517             return null;
518         }
519     }
520
521     /**
522      * Parse the delimited data from a string.
523      * <p>
524      * Only escaped backslashes and quotes will be recognized as escape sequences.
525      * The data will be treated as having no comments.
526      *
527      * @param s string with delimited data to parse.
528      * @param delimiter record separator
529      * @return parsed data.
530      * @throws BadDelimiterException if the character cannot be used as a delimiter.
531      *
532      * @since ostermillerutils 1.02.24
533      */

534     public static String JavaDoc[][] parse(String JavaDoc s, char delimiter) throws BadDelimiterException {
535         try {
536             return (new CSVParser(new StringReader(s), delimiter)).getAllValues();
537         } catch (IOException x){
538             return null;
539         }
540     }
541
542     /**
543      * Parse the comma delimited data from a string.
544      * Escaped backslashes and quotes will always recognized as escape sequences.
545      *
546      * @param s string with comma delimited data to parse.
547      * @param escapes a list of additional characters that will represent escape sequences.
548      * @param replacements the list of replacement characters for those escape sequences.
549      * @param commentDelims list of characters a comment line may start with.
550      * @return parsed data.
551      *
552      * @since ostermillerutils 1.02.03
553      */

554     public static String JavaDoc[][] parse(String JavaDoc s, String JavaDoc escapes, String JavaDoc replacements, String JavaDoc commentDelims){
555         try {
556             return (new CSVParser(new StringReader(s), escapes, replacements, commentDelims)).getAllValues();
557         } catch (IOException x){
558             return null;
559         }
560     }
561
562     /**
563      * Parse the delimited data from a string.
564      * Escaped backslashes and quotes will always recognized as escape sequences.
565      *
566      * @param s string with delimited data to parse.
567      * @param escapes a list of additional characters that will represent escape sequences.
568      * @param replacements the list of replacement characters for those escape sequences.
569      * @param commentDelims list of characters a comment line may start with.
570      * @param delimiter record separator
571      * @return parsed data.
572      * @throws BadDelimiterException if the character cannot be used as a delimiter.
573      *
574      * @since ostermillerutils 1.02.24
575      */

576     public static String JavaDoc[][] parse(String JavaDoc s, char delimiter, String JavaDoc escapes, String JavaDoc replacements, String JavaDoc commentDelims) throws BadDelimiterException{
577         try {
578             return (new CSVParser(new StringReader(s), delimiter, escapes, replacements, commentDelims)).getAllValues();
579         } catch (IOException x){
580             return null;
581         }
582     }
583
584     /**
585      * Parse the comma delimited data from a stream.
586      * <p>
587      * Only escaped backslashes and quotes will be recognized as escape sequences.
588      * The data will be treated as having no comments.
589      *
590      * @param in Reader with comma delimited data to parse.
591      * @param delimiter record separator
592      * @return parsed data.
593      * @throws BadDelimiterException if the character cannot be used as a delimiter.
594      * @throws IOException if an error occurs while reading.
595      *
596      * @since ostermillerutils 1.02.24
597      */

598     public static String JavaDoc[][] parse(Reader in, char delimiter) throws IOException, BadDelimiterException {
599         return (new CSVParser(in, delimiter)).getAllValues();
600     }
601
602     /**
603      * Parse the delimited data from a stream.
604      * <p>
605      * Only escaped backslashes and quotes will be recognized as escape sequences.
606      * The data will be treated as having no comments.
607      *
608      * @param in Reader with comma delimited data to parse.
609      * @return parsed data.
610      * @throws IOException if an error occurs while reading.
611      *
612      * @since ostermillerutils 1.02.03
613      */

614     public static String JavaDoc[][] parse(Reader in) throws IOException {
615         return (new CSVParser(in)).getAllValues();
616     }
617
618     /**
619      * Parse the delimited data from a stream.
620      * Escaped backslashes and quotes will always recognized as escape sequences.
621      *
622      * @param in Reader with delimited data to parse.
623      * @param delimiter record separator
624      * @param escapes a list of additional characters that will represent escape sequences.
625      * @param replacements the list of replacement characters for those escape sequences.
626      * @param commentDelims list of characters a comment line may start with.
627      * @return parsed data.
628      * @throws BadDelimiterException if the character cannot be used as a delimiter.
629      * @throws IOException if an error occurs while reading.
630      *
631      * @since ostermillerutils 1.02.24
632      */

633     public static String JavaDoc[][] parse(Reader in, char delimiter, String JavaDoc escapes, String JavaDoc replacements, String JavaDoc commentDelims) throws IOException, BadDelimiterException {
634         return (new CSVParser(in, delimiter, escapes, replacements, commentDelims)).getAllValues();
635     }
636
637     /**
638      * Parse the comma delimited data from a stream.
639      * Escaped backslashes and quotes will always recognized as escape sequences.
640      *
641      * @param in Reader with comma delimited data to parse.
642      * @param escapes a list of additional characters that will represent escape sequences.
643      * @param replacements the list of replacement characters for those escape sequences.
644      * @param commentDelims list of characters a comment line may start with.
645      * @return parsed data.
646      * @throws IOException if an error occurs while reading.
647      *
648      * @since ostermillerutils 1.02.03
649      */

650     public static String JavaDoc[][] parse(Reader in, String JavaDoc escapes, String JavaDoc replacements, String JavaDoc commentDelims) throws IOException {
651         return (new CSVParser(in, escapes, replacements, commentDelims)).getAllValues();
652     }
653 }
654
Popular Tags