KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > apache > cocoon > generation > CSVGenerator


1 /* =============================================================================== *
2  * Copyright (C) 1999-2004, The Apache Software Foundation. All rights reserved. *
3  * *
4  * Licensed under the Apache License, Version 2.0 (the "License"). You may not use *
5  * this file except in compliance with the License. You may obtain a copy of the *
6  * License at <http://www.apache.org/licenses/LICENSE-2.0>. *
7  * *
8  * Unless required by applicable law or agreed to in writing, software distributed *
9  * under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR *
10  * CONDITIONS OF ANY KIND, either express or implied. See the License for the *
11  * specific language governing permissions and limitations under the License. *
12  * =============================================================================== */

13 package org.apache.cocoon.generation;
14
15 import java.io.BufferedReader JavaDoc;
16 import java.io.ByteArrayInputStream JavaDoc;
17 import java.io.CharArrayWriter JavaDoc;
18 import java.io.IOException JavaDoc;
19 import java.io.InputStream JavaDoc;
20 import java.io.InputStreamReader JavaDoc;
21 import java.io.Reader JavaDoc;
22 import java.io.Serializable JavaDoc;
23 import java.util.HashMap JavaDoc;
24 import java.util.Map JavaDoc;
25
26 import org.apache.avalon.framework.parameters.Parameters;
27 import org.apache.cocoon.ProcessingException;
28 import org.apache.cocoon.environment.SourceResolver;
29 import org.apache.excalibur.source.Source;
30 import org.xml.sax.Attributes JavaDoc;
31 import org.xml.sax.Locator JavaDoc;
32 import org.xml.sax.SAXException JavaDoc;
33 import org.xml.sax.helpers.AttributesImpl JavaDoc;
34
35 /**
36  * <p>A simple parser converting a Comma Separated Values (CSV) file into XML.</p>
37  *
38  * <p>This parser is controlled by the following sitemap parameters:</p>
39  *
40  * <ul>
41  * <li>
42  * <b>process-headers</b>: whether the first line in the CSV is considered
43  * to be the header defining column names; the resulting output will be
44  * different depending on whether this is <i>true</i> or <i>false</i> (default: <i>false</i>).
45  * </li>
46  * <li>
47  * <b>encoding</b>: the character encoding (UTF-8, ISO8859-1, ...) used to
48  * interpret the input CSV source file (default: <i>system default</i>).
49  * </li>
50  * <li>
51  * <b>separator</b>: the field-separator character in the CSV file (comma,
52  * tab, ...) (default: <i>,</i> <small>comma</small>).
53  * </li>
54  * <li>
55  * <b>escape</b>: the character used to escape fields, or part of them, in
56  * the CSV file (default: <i>"</i> <small>quote</small>).
57  * </li>
58  * <li>
59  * <b>buffer-size</b>: the size of the buffer used for reading the source
60  * CSV file (default: <i>4096 bytes</i>).
61  * </li>
62  * </ul>
63  *
64  * <p>The generated output will look something like the following:</p>
65  *
66  * <pre>
67  * &lt;?xml version="1.0" encoding="ISO-8859-1"?&gt;
68  * &lt;csv:document xmlns:csv="http://apache.org/cocoon/csv/1.0"&gt;
69  * &lt;csv:header&gt;
70  * &lt;csv:column number="1"&gt;Column A&lt;/csv:column&gt;
71  * &lt;csv:column number="2"&gt;Column B&lt;/csv:column&gt;
72  * &lt;csv:column number="3"&gt;Column C&lt;/csv:column&gt;
73  * &lt;/csv:header&gt;
74  * &lt;csv:record number="1"&gt;
75  * &lt;csv:field number="1" column="Column A"&gt;Field A1&lt;/csv:field&gt;
76  * &lt;csv:field number="2" column="Column B"&gt;Field B1&lt;/csv:field&gt;
77  * &lt;csv:field number="3" column="Column C"&gt;Field C1&lt;/csv:field&gt;
78  * &lt;/csv:record&gt;
79  * &lt;csv:record number="2"&gt;
80  * &lt;csv:field number="1" column="Column A"&gt;Field A2&lt;/csv:field&gt;
81  * &lt;csv:field number="2" column="Column B"&gt;Field B2&lt;/csv:field&gt;
82  * &lt;csv:field number="3" column="Column C"&gt;Field C2&lt;/csv:field&gt;
83  * &lt;/csv:record&gt;
84  * &lt;/csv:document&gt;
85  * </pre>
86  *
87  * <p>Note that this generator has been thoroughly tested with CSV files generated
88  * by <a HREF="http://office.microsoft.com/" target="_new">Microsoft Excel</a>.
89  * Unfortunately no official CSV specification has ever been published by
90  * any standard body, so the interpretation of the format might be slightly
91  * different in cases.</p>
92  *
93  * @author <a HREF="mailto:pier@apache.org">Pier Fumagalli</a>
94  * @author Copyright &copy; 2000-2004 <a HREF="http://www.apache.org/">The Apache
95  * Software Foundation</a>. All rights reserved.
96  */

97 public class CSVGenerator extends FileGenerator {
98
99     /** <p>The namespace URI of XML generated by this instance.</p> */
100     public static final String JavaDoc NAMESPACE_URI = "http://apache.org/cocoon/csv/1.0";
101     /** <p>The namespace prefix of XML generated by this instance.</p> */
102     public static final String JavaDoc NAMESPACE_PREFIX = "csv";
103
104     /** <p>The default encoding configured in the Java VM.</p> */
105     private static final String JavaDoc DEFAULT_ENCODING =
106         new InputStreamReader JavaDoc(new ByteArrayInputStream JavaDoc(new byte[0])).getEncoding();
107     /** <p>The default field separator character.</p> */
108     private static final String JavaDoc DEFAULT_SEPARATOR = ",";
109     /** <p>The default field separator character.</p> */
110     private static final String JavaDoc DEFAULT_ESCAPE = "\"";
111     /** <p>The default field separator character.</p> */
112     private static final int DEFAULT_BUFFER_SIZE = 4096;
113     /** <p>A string used for indenting.</p> */
114     private static final char INDENT_STRING[] = "\n ".toCharArray();
115
116     /** <p>The encoding used to read the CSV resource from a stream.</p> */
117     private String JavaDoc encoding = DEFAULT_ENCODING;
118     /** <p>The character used to separate fields.</p> */
119     private char separator = DEFAULT_SEPARATOR.charAt(0);
120     /** <p>The character used to initiate and terminate esacaped sequences.</p> */
121     private char escape = DEFAULT_ESCAPE.charAt(0);
122     /** <p>The size of the buffer used to read the input.</p> */
123     private int buffersize = DEFAULT_BUFFER_SIZE;
124     /** <p>The current field (column) number in the current record.</p> */
125     private int fieldnumber = 1;
126     /** <p>The current record (line) number in the current CSV.</p> */
127     private int recordnumber = 1;
128     /** <p>A flag indicating whether the &lt;record&gt; tag was opened.</p> */
129     private boolean openrecord = false;
130     /** <p>The character buffer for the current field.</p> */
131     private CharArrayWriter JavaDoc buffer = null;
132     /** <p>A map of all known columns or null if no headers are processed.</p> */
133     private Map JavaDoc columns = null;
134
135     /**
136      * <p>Create a new {@link CSVGenerator} instance.</p>
137      */

138     public CSVGenerator() {
139         super();
140     }
141
142     /**
143      * <p>Recycle this component.</p>.
144      */

145     public void recycle() {
146         super.recycle();
147         
148         this.encoding = DEFAULT_ENCODING;
149         this.separator = DEFAULT_SEPARATOR.charAt(0);
150         this.escape = DEFAULT_ESCAPE.charAt(0);
151         this.buffersize = DEFAULT_BUFFER_SIZE;
152         this.buffer = null;
153         this.columns = null;
154         this.recordnumber = 1;
155         this.fieldnumber = 1;
156         this.openrecord = false;
157     }
158
159     /**
160      * <p>Setup this {@link CSVGenerator} instance.</p>
161      */

162     public void setup(SourceResolver resolver, Map JavaDoc object_model, String JavaDoc source,
163                       Parameters parameters)
164     throws ProcessingException, SAXException JavaDoc, IOException JavaDoc {
165         super.setup(resolver, object_model, source, parameters);
166
167         boolean header = parameters.getParameterAsBoolean("process-header", false);
168
169         this.encoding = parameters.getParameter("encoding", DEFAULT_ENCODING);
170         this.separator = parameters.getParameter("separator", DEFAULT_SEPARATOR).charAt(0);
171         this.escape = parameters.getParameter("escape", DEFAULT_ESCAPE).charAt(0);
172         this.buffersize = parameters.getParameterAsInteger("buffer-size", DEFAULT_BUFFER_SIZE);
173         this.buffer = new CharArrayWriter JavaDoc();
174         this.columns = (header ? new HashMap JavaDoc() : null);
175         this.recordnumber = (header ? 0 : 1);
176         this.fieldnumber = 1;
177         this.openrecord = false;
178     }
179
180     /**
181      * <p>Generate the unique key.</p>
182      */

183     public Serializable JavaDoc getKey() {
184         String JavaDoc key = this.inputSource.getURI();
185         if (this.columns != null) return (key + "+headers");
186         return key;
187     }
188
189     /**
190      * <p>Generate XML data from a Comma Separated Value resource.</p>.
191      */

192     public void generate()
193     throws IOException JavaDoc, SAXException JavaDoc, ProcessingException {
194
195         /* Create a new Reader correctly decoding the source stream */
196         CSVReader csv = new CSVReader(this.inputSource, this.encoding, this.buffersize);
197
198         try {
199             /* Start the document */
200             this.contentHandler.setDocumentLocator(csv);
201             this.contentHandler.startDocument();
202             this.contentHandler.startPrefixMapping(NAMESPACE_PREFIX, NAMESPACE_URI);
203             this.indent(0);
204             this.startElement("document");
205
206             /* Allocate buffer and status for parsing */
207             boolean unescaped = true;
208             int prev = -1;
209             int curr = -1;
210
211             /* Parse the file reading characters one-by-one */
212             while ((curr = csv.read()) >= 0) {
213
214                 /* Process any occurrence of the escape character */
215                 if (curr == this.escape) {
216                     if ((unescaped) && (prev == this.escape)) {
217                         this.buffer.write(this.escape);
218                     }
219                     unescaped = ! unescaped;
220                     prev = curr;
221                     continue;
222                 }
223
224                 /* Process any occurrence of the field separator */
225                 if ((unescaped) && (curr == this.separator)) {
226                     this.dumpField();
227                     prev = curr;
228                     continue;
229                 }
230
231                 /* Process newline characters */
232                 if ((unescaped) && ((curr == '\r') || (curr == '\n'))) {
233                     this.dumpField();
234                     this.dumpRecord();
235
236                     /* Record numbering */
237                     if (((curr == '\n') && (prev != '\r')) || (curr == '\r')) {
238                         this.recordnumber ++;
239                     }
240                     
241                     /* Nothing else to do */
242                     prev = curr;
243                     continue;
244                 }
245
246                 /* Any other character simply gets added to the buffer */
247                 this.buffer.write(curr);
248                 prev = curr;
249             }
250
251             /* Terminate any hanging open record element (just in case) */
252             this.dumpField();
253             this.dumpRecord();
254
255             /* Terminate the document */
256             this.indent(0);
257             this.endElement("document");
258             this.contentHandler.endPrefixMapping(NAMESPACE_PREFIX);
259             this.contentHandler.endDocument();
260
261         } finally {
262             csv.close();
263         }
264     }
265
266     
267     private void dumpField()
268     throws SAXException JavaDoc {
269         if (this.buffer.size() < 1) {
270             this.fieldnumber ++;
271             return;
272         }
273
274         if (! this.openrecord) {
275             this.indent(4);
276
277             if (this.recordnumber > 0) {
278                 AttributesImpl JavaDoc attributes = new AttributesImpl JavaDoc();
279                 String JavaDoc value = Integer.toString(this.recordnumber);
280                 attributes.addAttribute("", "number", "number", "CDATA", value);
281                 this.startElement("record", attributes);
282             } else {
283                 this.startElement("header");
284             }
285             this.openrecord = true;
286         }
287
288         /* Enclode the field in the proper element */
289         String JavaDoc element = "field";
290         char array[] = this.buffer.toCharArray();
291         this.indent(8);
292
293         AttributesImpl JavaDoc attributes = new AttributesImpl JavaDoc();
294         String JavaDoc value = Integer.toString(this.fieldnumber);
295         attributes.addAttribute("", "number", "number", "CDATA", value);
296
297         if (this.recordnumber < 1) {
298             this.columns.put(new Integer JavaDoc(this.fieldnumber), new String JavaDoc(array));
299             element = "column";
300         } else if (this.columns != null) {
301             String JavaDoc header = (String JavaDoc) this.columns.get(new Integer JavaDoc(this.fieldnumber));
302             if (header != null) {
303                 attributes.addAttribute("", "column", "column", "CDATA", header);
304             }
305         }
306
307         this.startElement(element, attributes);
308         this.contentHandler.characters(array, 0, array.length);
309         this.endElement(element);
310         this.buffer.reset();
311
312         this.fieldnumber ++;
313     }
314
315     private void dumpRecord()
316     throws SAXException JavaDoc {
317         if (this.openrecord) {
318             this.indent(4);
319             if (this.recordnumber > 0) {
320                 this.endElement("record");
321             } else {
322                 this.endElement("header");
323             }
324             this.openrecord = false;
325         }
326         this.fieldnumber = 1;
327     }
328
329     private void indent(int level)
330     throws SAXException JavaDoc {
331         this.contentHandler.characters(INDENT_STRING, 0, level + 1);
332     }
333
334     private void startElement(String JavaDoc name)
335     throws SAXException JavaDoc {
336         this.startElement(name, new AttributesImpl JavaDoc());
337     }
338
339     private void startElement(String JavaDoc name, Attributes JavaDoc atts)
340     throws SAXException JavaDoc {
341         if (name == null) throw new NullPointerException JavaDoc("Null name");
342         if (atts == null) atts = new AttributesImpl JavaDoc();
343         String JavaDoc qual = NAMESPACE_PREFIX + ':' + name;
344         this.contentHandler.startElement(NAMESPACE_URI, name, qual, atts);
345     }
346
347     private void endElement(String JavaDoc name)
348     throws SAXException JavaDoc {
349         String JavaDoc qual = NAMESPACE_PREFIX + ':' + name;
350         this.contentHandler.endElement(NAMESPACE_URI, name, qual);
351     }
352
353     private static final class CSVReader extends Reader JavaDoc implements Locator JavaDoc {
354         
355         private String JavaDoc uri = null;
356         private Reader JavaDoc input = null;
357         private int column = 1;
358         private int line = 1;
359         private int last = -1;
360
361         private CSVReader(Source source, String JavaDoc encoding, int buffer)
362         throws IOException JavaDoc {
363             InputStream JavaDoc stream = source.getInputStream();
364             Reader JavaDoc reader = new InputStreamReader JavaDoc(stream, encoding);
365             this.input = new BufferedReader JavaDoc(reader, buffer);
366             this.uri = source.getURI();
367         }
368
369         public String JavaDoc getPublicId() {
370             return null;
371         }
372
373         public String JavaDoc getSystemId() {
374             return this.uri;
375         }
376
377         public int getLineNumber() {
378             return this.line;
379         }
380
381         public int getColumnNumber() {
382             return this.column;
383         }
384
385         public void close()
386         throws IOException JavaDoc {
387             this.input.close();
388         }
389         
390         public int read()
391         throws IOException JavaDoc {
392             int c = this.input.read();
393             if (c < 0) return c;
394
395             if (((c == '\n') && (this.last != '\r')) || (c == '\r')) {
396                 this.column = 1;
397                 this.line ++;
398             }
399
400             this.last = c;
401             return c;
402         }
403
404         public int read(char b[], int o, int l)
405         throws IOException JavaDoc {
406             if (b == null) throw new NullPointerException JavaDoc();
407             if ((o<0)||(o>b.length)||(l<0)||((o+l)>b.length)||((o+l)<0)) {
408                 throw new IndexOutOfBoundsException JavaDoc();
409             }
410             if (l == 0) return 0;
411
412             int c = read();
413             if (c == -1) return -1;
414             b[o] = (char)c;
415
416             int i = 1;
417             try {
418                 for (i = 1; i < l ; i++) {
419                     c = read();
420                     if (c == -1) break;
421                     if (b != null) b[o + i] = (char)c;
422                 }
423             } catch (IOException JavaDoc ee) {
424                 return i;
425             }
426             return i;
427         }
428     }
429 }
430
Popular Tags