KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > com > Ostermiller > util > ExcelCSVParser


1 /*
2  * Read files in Excel comma separated value format.
3  * Copyright (C) 2001-2004 Stephen Ostermiller
4  * http://ostermiller.org/contact.pl?regarding=Java+Utilities
5  *
6  * This program is free software; you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License as published by
8  * the Free Software Foundation; either version 2 of the License, or
9  * (at your option) any later version.
10  *
11  * This program is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14  * GNU General Public License for more details.
15  *
16  * See COPYING.TXT for details.
17  */

18
19 package com.Ostermiller.util;
20 import java.io.*;
21 import java.util.*;
22
23 /**
24  * Read files in comma separated value format as outputted by the Microsoft
25  * Excel Spreadsheet program.
26  * More information about this class is available from <a target="_top" HREF=
27  * "http://ostermiller.org/utils/ExcelCSV.html">ostermiller.org</a>.
28  * <P>
29  * Excel CSV is a file format used as a portable representation of a database.
30  * Each line is one entry or record and the fields in a record are separated by commas.
31  * <P>
32  * If field includes a comma or a new line, the whole field must be surrounded with double quotes.
33  * When the field is in quotes, any quote literals must be escaped by two quotes ("").
34  * Text that comes after quotes that have been closed but come before the next comma will be ignored.
35  * <P>
36  * Empty fields are returned as as String of length zero: "". The following line has three empty
37  * fields and three non-empty fields in it. There is an empty field on each end, and one in the
38  * middle. One token is returned as a space.<br>
39  * <pre>,second,, ,fifth,</pre>
40  * <P>
41  * Blank lines are always ignored. Other lines will be ignored if they start with a
42  * comment character as set by the setCommentStart() method.
43  * <P>
44  * An example of how CVSLexer might be used:
45  * <pre>
46  * ExcelCSVParser shredder = new ExcelCSVParser(System.in);
47  * String t;
48  * while ((t = shredder.nextValue()) != null){
49  * System.out.println("" + shredder.lastLineNumber() + " " + t);
50  * }
51  * </pre>
52  * <P>
53  * The CSV that Excel outputs differs from the
54  * <a HREF="http://ostermiller.org/utils/CSVLexer.html">standard</a>
55  * in several respects:
56  * <ul><li>Leading and trailing whitespace is significant.</li>
57  * <li>A backslash is not a special character and is not used to escape anything.</li>
58  * <li>Quotes inside quoted strings are escaped with a double quote rather than a backslash.</li>
59  * <li>Excel may convert data before putting it in CSV format:<ul>
60  * <li>Tabs are converted to a single space.</li>
61  * <li>New lines in the data are always represented as the UNIX new line. ("\n")</li>
62  * <li>Numbers that are greater than 12 digits may be represented in truncated
63  * scientific notation form.</li></ul>
64  * This parser does not attempt to fix these excel conversions, but users should be aware
65  * of them.</li></ul>
66  *
67  * @see com.Ostermiller.util.CSVParser
68  *
69  * @author Stephen Ostermiller http://ostermiller.org/contact.pl?regarding=Java+Utilities
70  * @since ostermillerutils 1.00.00
71  */

72 public class ExcelCSVParser implements CSVParse {
73
74     /**
75      * InputStream on which this parser is based.
76      *
77      * @since ostermillerutils 1.02.22
78      */

79     private InputStream inStream;
80
81     /**
82      * Reader on which this parser is based.
83      *
84      * @since ostermillerutils 1.02.22
85      */

86     private Reader inReader;
87
88     /**
89      * Does all the dirty work.
90      * Calls for new tokens are routed through
91      * this object.
92      *
93      * @since ostermillerutils 1.00.00
94      */

95     private ExcelCSVLexer lexer;
96
97     /**
98      * Token cache. Used for when we request a token
99      * from the lexer but can't return it because its
100      * on the next line.
101      *
102      * @since ostermillerutils 1.00.00
103      */

104     private String JavaDoc tokenCache;
105
106     /**
107      * Line cache. The line number that goes along with
108      * the tokenCache. Not valid if the tokenCache is
109      * null.
110      *
111      * @since ostermillerutils 1.00.00
112      */

113     private int lineCache;
114
115     /**
116      * The line number the last token came from, or -1 if
117      * no tokens have been returned.
118      *
119      * @since ostermillerutils 1.00.00
120      */

121     private int lastLine = -1;
122
123     /**
124      * Create a parser to parse delimited values from
125      * an InputStream.
126      *
127      * @param in stream that contains comma separated values.
128      * @param delimiter record separator
129      *
130      * @throws BadDelimiterException if the specified delimiter cannot be used
131      *
132      * @since ostermillerutils 1.02.24
133      */

134     public ExcelCSVParser(InputStream in, char delimiter) throws BadDelimiterException {
135         inStream = in;
136         lexer = new ExcelCSVLexer(in);
137         changeDelimiter(delimiter);
138     }
139
140     /**
141      * Create a parser to parse comma separated values from
142      * an InputStream.
143      *
144      * @param in stream that contains comma separated values.
145      *
146      * @since ostermillerutils 1.00.00
147      */

148     public ExcelCSVParser(InputStream in){
149         inStream = in;
150         lexer = new ExcelCSVLexer(in);
151     }
152
153     /**
154      * Create a parser to parse delimited values from
155      * a Reader.
156      *
157      * @param in reader that contains comma separated values.
158      * @param delimiter record separator
159      *
160      * @throws BadDelimiterException if the specified delimiter cannot be used
161      *
162      * @since ostermillerutils 1.02.24
163      */

164     public ExcelCSVParser(Reader in, char delimiter) throws BadDelimiterException {
165         inReader = in;
166         lexer = new ExcelCSVLexer(in);
167         changeDelimiter(delimiter);
168     }
169
170     /**
171      * Create a parser to parse comma separated values from
172      * a Reader.
173      *
174      * @param in reader that contains comma separated values.
175      *
176      * @since ostermillerutils 1.00.00
177      */

178     public ExcelCSVParser(Reader in){
179         inReader = in;
180         lexer = new ExcelCSVLexer(in);
181     }
182
183     /**
184      * Close any stream upon which this parser is based.
185      *
186      * @since ostermillerutils 1.02.22
187      * @throws IOException if an error occurs while closing the stream.
188      */

189     public void close() throws IOException {
190         if (inStream != null) inStream.close();
191         if (inReader != null) inReader.close();
192     }
193
194     /**
195      * get the next value.
196      *
197      * @return the next value or null if there are no more values.
198      * @throws IOException if an error occurs while reading.
199      *
200      * @since ostermillerutils 1.00.00
201      */

202     public String JavaDoc nextValue() throws IOException {
203         if (tokenCache == null){
204             tokenCache = lexer.getNextToken();
205             lineCache = lexer.getLineNumber();
206         }
207         lastLine = lineCache;
208         String JavaDoc result = tokenCache;
209         tokenCache = null;
210         return result;
211     }
212
213     /**
214      * Get the line number that the last token came from.
215      * <p>
216      * New line breaks that occur in the middle of a token are no
217      * counted in the line number count.
218      *
219      * @return line number or -1 if no tokens have been returned yet.
220      *
221      * @since ostermillerutils 1.00.00
222      */

223     public int lastLineNumber(){
224         return lastLine;
225     }
226
227     /**
228      * Get all the values from a line.
229      * <p>
230      * If the line has already been partially read, only the
231      * values that have not already been read will be included.
232      *
233      * @return all the values from the line or null if there are no more values.
234      * @throws IOException if an error occurs while reading.
235      *
236      * @since ostermillerutils 1.00.00
237      */

238     public String JavaDoc[] getLine() throws IOException{
239         int lineNumber = -1;
240         Vector<String JavaDoc> v = new Vector<String JavaDoc>();
241         if (tokenCache != null){
242             v.add(tokenCache);
243             lineNumber = lineCache;
244         }
245         while ((tokenCache = lexer.getNextToken()) != null
246                 && (lineNumber == -1 || lexer.getLineNumber() == lineNumber)){
247             v.add(tokenCache);
248             lineNumber = lexer.getLineNumber();
249         }
250         if (v.size() == 0){
251             return null;
252         }
253         lastLine = lineNumber;
254         lineCache = lexer.getLineNumber();
255         String JavaDoc[] result = new String JavaDoc[v.size()];
256         return ((String JavaDoc[])v.toArray(result));
257     }
258
259     /**
260      * Get all the values from the file.
261      * <p>
262      * If the file has already been partially read, only the
263      * values that have not already been read will be included.
264      * <p>
265      * Each line of the file that has at least one value will be
266      * represented. Comments and empty lines are ignored.
267      * <p>
268      * The resulting double array may be jagged.
269      *
270      * @return all the values from the file or null if there are no more values.
271      * @throws IOException if an error occurs while reading.
272      *
273      * @since ostermillerutils 1.00.00
274      */

275     public String JavaDoc[][] getAllValues() throws IOException {
276         Vector<String JavaDoc[]> v = new Vector<String JavaDoc[]>();
277         String JavaDoc[] line;
278         while((line = getLine()) != null){
279             v.add(line);
280         }
281         if (v.size() == 0){
282             return null;
283         }
284         String JavaDoc[][] result = new String JavaDoc[v.size()][];
285         return ((String JavaDoc[][])v.toArray(result));
286     }
287
288     /**
289      * Change this parser so that it uses a new delimiter.
290      * <p>
291      * The initial character is a comma, the delimiter cannot be changed
292      * to a quote or other character that has special meaning in CSV.
293      *
294      * @param newDelim delimiter to which to switch.
295      * @throws BadDelimiterException if the character cannot be used as a delimiter.
296      *
297      * @since ostermillerutils 1.02.08
298      */

299     public void changeDelimiter(char newDelim) throws BadDelimiterException {
300         lexer.changeDelimiter(newDelim);
301     }
302
303     /**
304      * Change this parser so that it uses a new character for quoting.
305      * <p>
306      * The initial character is a double quote ("), the delimiter cannot be changed
307      * to a comma or other character that has special meaning in CSV.
308      *
309      * @param newQuote character to use for quoting.
310      * @throws BadQuoteException if the character cannot be used as a quote.
311      *
312      * @since ostermillerutils 1.02.16
313      */

314     public void changeQuote(char newQuote) throws BadQuoteException {
315         lexer.changeQuote(newQuote);
316     }
317
318     /**
319      * Set the characters that indicate a comment at the beginning of the line.
320      * For example if the string "#;!" were passed in, all of the following lines
321      * would be comments:<br>
322      * <pre> # Comment
323      * ; Another Comment
324      * ! Yet another comment</pre>
325      * By default there are no comments in CVS files. Commas and quotes may not be
326      * used to indicate comment lines.
327      *
328      * @param commentDelims list of characters a comment line may start with.
329      *
330      * @since ostermillerutils 1.00.00
331      */

332     public void setCommentStart(String JavaDoc commentDelims){
333         lexer.setCommentStart(commentDelims);
334     }
335
336     /**
337      * Get the number of the line from which the last value was retrieved.
338      *
339      * @return line number or -1 if no tokens have been returned.
340      *
341      * @since ostermillerutils 1.00.00
342      */

343     public int getLastLineNumber(){
344         return lastLine;
345     }
346
347     /**
348      * Parse the given file for comma separated values and print the results
349      * to System.out.
350      *
351      * @param args First argument is the file name. System.in used if no filename given.
352      *
353      * @since ostermillerutils 1.00.00
354      */

355     private static void main(String JavaDoc[] args){
356         InputStream in;
357         try {
358             if (args.length > 0){
359                 File f = new File(args[0]);
360                 if (f.exists()){
361                     if (f.canRead()){
362                         in = new FileInputStream(f);
363                     } else {
364                         throw new IOException("Could not open " + args[0]);
365                     }
366                 } else {
367                     throw new IOException("Could not find " + args[0]);
368                 }
369             } else {
370                 in = System.in;
371             }
372             ExcelCSVParser p = new ExcelCSVParser(in);
373             String JavaDoc[] t;
374             while ((t = p.getLine()) != null){
375                 for (int i=0; i<t.length; i++){
376                     System.out.print('"' + t[i] + '"');
377                     if (i<t.length-1){
378                         System.out.print(", ");
379                     }
380                 }
381                 System.out.println();
382             }
383         } catch (IOException e){
384             System.out.println(e.getMessage());
385         }
386     }
387
388     /**
389      * Parse the comma delimited data from a string.
390      *
391      * @param s string with comma delimited data to parse.
392      * @return parsed data.
393      *
394      * @since ostermillerutils 1.02.03
395      */

396     public static String JavaDoc[][] parse(String JavaDoc s){
397         try {
398             return (new ExcelCSVParser(new StringReader(s))).getAllValues();
399         } catch (IOException x){
400             return null;
401         }
402     }
403
404     /**
405      * Parse the delimited data from a string.
406      *
407      * @param s string with delimited data to parse.
408      * @param delimiter record separator
409      * @return parsed data.
410      * @throws BadDelimiterException if the character cannot be used as a delimiter.
411      *
412      * @since ostermillerutils 1.02.24
413      */

414     public static String JavaDoc[][] parse(String JavaDoc s, char delimiter) throws BadDelimiterException {
415         try {
416             return (new ExcelCSVParser(new StringReader(s), delimiter)).getAllValues();
417         } catch (IOException x){
418             return null;
419         }
420     }
421
422     /**
423      * Parse the comma delimited data from a stream.
424      *
425      * @param in Reader with comma delimited data to parse.
426      * @return parsed data.
427      * @throws IOException if an error occurs while reading.
428      *
429      * @since ostermillerutils 1.02.03
430      */

431     public static String JavaDoc[][] parse(Reader in) throws IOException {
432         return (new ExcelCSVParser(in)).getAllValues();
433     }
434
435
436     /**
437      * Parse the delimited data from a stream.
438      *
439      * @param in Reader with delimited data to parse.
440      * @param delimiter record separator
441      * @return parsed data.
442      * @throws BadDelimiterException if the character cannot be used as a delimiter.
443      * @throws IOException if an error occurs while reading.
444      *
445      * @since ostermillerutils 1.02.24
446      */

447     public static String JavaDoc[][] parse(Reader in, char delimiter) throws IOException, BadDelimiterException {
448         return (new ExcelCSVParser(in, delimiter)).getAllValues();
449     }
450 }
451
Popular Tags