KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > cyberneko > html > filters > Writer


1 /*
2  * (C) Copyright 2002-2005, Andy Clark. All rights reserved.
3  *
4  * This file is distributed under an Apache style license. Please
5  * refer to the LICENSE file for specific details.
6  */

7
8 package org.cyberneko.html.filters;
9
10 import java.io.OutputStream JavaDoc;
11 import java.io.OutputStreamWriter JavaDoc;
12 import java.io.PrintWriter JavaDoc;
13 import java.io.UnsupportedEncodingException JavaDoc;
14
15 import org.cyberneko.html.HTMLConfiguration;
16 import org.cyberneko.html.HTMLElements;
17 import org.cyberneko.html.HTMLEntities;
18 import org.cyberneko.html.filters.DefaultFilter;
19
20 import org.apache.xerces.xni.Augmentations;
21 import org.apache.xerces.xni.NamespaceContext;
22 import org.apache.xerces.xni.QName;
23 import org.apache.xerces.xni.XMLAttributes;
24 import org.apache.xerces.xni.XMLLocator;
25 import org.apache.xerces.xni.XMLResourceIdentifier;
26 import org.apache.xerces.xni.XMLString;
27 import org.apache.xerces.xni.XNIException;
28 import org.apache.xerces.xni.parser.XMLDocumentFilter;
29 import org.apache.xerces.xni.parser.XMLInputSource;
30 import org.apache.xerces.xni.parser.XMLParserConfiguration;
31
32 /**
33  * An HTML writer written as a filter. Besides serializing the HTML
34  * event stream, the writer also passes the document events to the next
35  * stage in the pipeline. This allows applications to insert writer
36  * filters between other custom filters for debugging purposes.
37  * <p>
38  * Since an HTML document may have specified its encoding using the
39  * &lt;META&gt; tag and http-equiv/content attributes, the writer will
40  * automatically change any character set specified in this tag to
41  * match the encoding of the output stream. Therefore, the character
42  * encoding name used to construct the writer should be an official
43  * <a HREF='http://www.iana.org/assignments/character-sets'>IANA</a>
44  * encoding name and not a Java encoding name.
45  * <p>
46  * <strong>Note:</strong>
47  * The modified character set in the &lt;META&gt; tag is <em>not</em>
48  * propagated to the next stage in the pipeline. The changed value is
49  * only output to the stream; the original value is sent to the next
50  * stage in the pipeline.
51  *
52  * @author Andy Clark
53  *
54  * @version $Id: Writer.java,v 1.7 2005/02/14 04:01:33 andyc Exp $
55  */

56 public class Writer
57     extends DefaultFilter {
58
59     //
60
// Constants
61
//
62

63     /** Notify character entity references. */
64     public static final String JavaDoc NOTIFY_CHAR_REFS = "http://apache.org/xml/features/scanner/notify-char-refs";
65
66     /** Notify built-in entity references. */
67     public static final String JavaDoc NOTIFY_HTML_BUILTIN_REFS = "http://cyberneko.org/html/features/scanner/notify-builtin-refs";
68
69     /** Augmentations feature identifier. */
70     protected static final String JavaDoc AUGMENTATIONS = "http://cyberneko.org/html/features/augmentations";
71
72     /** Filters property identifier. */
73     protected static final String JavaDoc FILTERS = "http://cyberneko.org/html/properties/filters";
74
75     //
76
// Data
77
//
78

79     /** The encoding. */
80     protected String JavaDoc fEncoding;
81
82     /**
83      * The print writer used for serializing the document with the
84      * appropriate character encoding.
85      */

86     protected PrintWriter JavaDoc fPrinter;
87
88     // state
89

90     /** Seen root element. */
91     protected boolean fSeenRootElement;
92
93     /** Seen http-equiv directive. */
94     protected boolean fSeenHttpEquiv;
95
96     /** Element depth. */
97     protected int fElementDepth;
98
99     /** Normalize character content. */
100     protected boolean fNormalize;
101
102     /** Print characters. */
103     protected boolean fPrintChars;
104
105     //
106
// Constructors
107
//
108

109     /** Constructs a writer filter that prints to standard out. */
110     public Writer() {
111         // Note: UTF-8 should *always* be a supported encoding. Although,
112
// I've heard of the old M$ JVM not supporting it! Amazing. -Ac
113
try {
114             fEncoding = "UTF-8";
115             fPrinter = new PrintWriter JavaDoc(new OutputStreamWriter JavaDoc(System.out, fEncoding));
116         }
117         catch (UnsupportedEncodingException JavaDoc e) {
118             throw new RuntimeException JavaDoc(e.getMessage());
119         }
120     } // <init>()
121

122     /**
123      * Constructs a writer filter using the specified output stream and
124      * encoding.
125      *
126      * @param outputStream The output stream to write to.
127      * @param encoding The encoding to be used for the output. The encoding name
128      * should be an official IANA encoding name.
129      */

130     public Writer(OutputStream JavaDoc outputStream, String JavaDoc encoding)
131         throws UnsupportedEncodingException JavaDoc {
132         this(new OutputStreamWriter JavaDoc(outputStream, encoding), encoding);
133     } // <init>(OutputStream,String)
134

135     /**
136      * Constructs a writer filter using the specified Java writer and
137      * encoding.
138      *
139      * @param writer The Java writer to write to.
140      * @param encoding The encoding to be used for the output. The encoding name
141      * should be an official IANA encoding name.
142      */

143     public Writer(java.io.Writer JavaDoc writer, String JavaDoc encoding) {
144         fEncoding = encoding;
145         if (writer instanceof PrintWriter JavaDoc) {
146             fPrinter = (PrintWriter JavaDoc)writer;
147         }
148         else {
149             fPrinter = new PrintWriter JavaDoc(writer);
150         }
151     } // <init>(java.io.Writer,String)
152

153     //
154
// XMLDocumentHandler methods
155
//
156

157     // since Xerces-J 2.2.0
158

159     /** Start document. */
160     public void startDocument(XMLLocator locator, String JavaDoc encoding,
161                               NamespaceContext nscontext, Augmentations augs)
162         throws XNIException {
163         fSeenRootElement = false;
164         fSeenHttpEquiv = false;
165         fElementDepth = 0;
166         fNormalize = true;
167         fPrintChars = true;
168         super.startDocument(locator, encoding, nscontext, augs);
169     } // startDocument(XMLLocator,String,NamespaceContext,Augmentations)
170

171     // old methods
172

173     /** Start document. */
174     public void startDocument(XMLLocator locator, String JavaDoc encoding, Augmentations augs)
175         throws XNIException {
176         startDocument(locator, encoding, null, augs);
177     } // startDocument(XMLLocator,String,Augmentations)
178

179     /** Comment. */
180     public void comment(XMLString text, Augmentations augs)
181         throws XNIException {
182         if (fSeenRootElement && fElementDepth <= 0) {
183             fPrinter.println();
184         }
185         fPrinter.print("<!--");
186         printCharacters(text, false);
187         fPrinter.print("-->");
188         if (!fSeenRootElement) {
189             fPrinter.println();
190         }
191         fPrinter.flush();
192     } // comment(XMLString,Augmentations)
193

194     /** Start element. */
195     public void startElement(QName element, XMLAttributes attributes, Augmentations augs)
196         throws XNIException {
197         fSeenRootElement = true;
198         fElementDepth++;
199         fNormalize = !HTMLElements.getElement(element.rawname).isSpecial();
200         printStartElement(element, attributes);
201         super.startElement(element, attributes, augs);
202     } // startElement(QName,XMLAttributes,Augmentations)
203

204     /** Empty element. */
205     public void emptyElement(QName element, XMLAttributes attributes, Augmentations augs)
206         throws XNIException {
207         fSeenRootElement = true;
208         printStartElement(element, attributes);
209         super.emptyElement(element, attributes, augs);
210     } // emptyElement(QName,XMLAttributes,Augmentations)
211

212     /** Characters. */
213     public void characters(XMLString text, Augmentations augs)
214         throws XNIException {
215         if (fPrintChars) {
216             printCharacters(text, fNormalize);
217         }
218         super.characters(text, augs);
219     } // characters(XMLString,Augmentations)
220

221     /** End element. */
222     public void endElement(QName element, Augmentations augs)
223         throws XNIException {
224         fElementDepth--;
225         fNormalize = true;
226         /***
227         // NOTE: Not sure if this is what should be done in the case where
228         // the encoding is not explitly declared within the HEAD. So
229         // I'm leaving it commented out for now. -Ac
230         if (element.rawname.equalsIgnoreCase("head") && !fSeenHttpEquiv) {
231             boolean capitalize = Character.isUpperCase(element.rawname.charAt(0));
232             String ename = capitalize ? "META" : "meta";
233             QName qname = new QName(null, ename, ename, null);
234             XMLAttributes attrs = new XMLAttributesImpl();
235             QName aname = new QName(null, "http-equiv", "http-equiv", null);
236             attrs.addAttribute(aname, "CDATA", "Content-Type");
237             aname.setValues(null, "content", "content", null);
238             attrs.addAttribute(aname, "CDATA", "text/html; charset="+fEncoding);
239             super.emptyElement(qname, attrs, null);
240         }
241         /***/

242         printEndElement(element);
243         super.endElement(element, augs);
244     } // endElement(QName,Augmentations)
245

246     /** Start general entity. */
247     public void startGeneralEntity(String JavaDoc name, XMLResourceIdentifier id, String JavaDoc encoding, Augmentations augs)
248         throws XNIException {
249         fPrintChars = false;
250         if (name.startsWith("#")) {
251             try {
252                 boolean hex = name.startsWith("#x");
253                 int offset = hex ? 2 : 1;
254                 int base = hex ? 16 : 10;
255                 int value = Integer.parseInt(name.substring(offset), base);
256                 String JavaDoc entity = HTMLEntities.get(value);
257                 if (entity != null) {
258                     name = entity;
259                 }
260             }
261             catch (NumberFormatException JavaDoc e) {
262                 // ignore
263
}
264         }
265         printEntity(name);
266         super.startGeneralEntity(name, id, encoding, augs);
267     } // startGeneralEntity(String,XMLResourceIdentifier,String,Augmentations)
268

269     /** End general entity. */
270     public void endGeneralEntity(String JavaDoc name, Augmentations augs)
271         throws XNIException {
272         fPrintChars = true;
273         super.endGeneralEntity(name, augs);
274     } // endGeneralEntity(String,Augmentations)
275

276     //
277
// Protected methods
278
//
279

280     /** Print attribute value. */
281     protected void printAttributeValue(String JavaDoc text) {
282         int length = text.length();
283         for (int j = 0; j < length; j++) {
284             char c = text.charAt(j);
285             if (c == '"') {
286                 fPrinter.print("&quot;");
287             }
288             else {
289                 fPrinter.print(c);
290             }
291         }
292         fPrinter.flush();
293     } // printAttributeValue(String)
294

295     /** Print characters. */
296     protected void printCharacters(XMLString text, boolean normalize) {
297         if (normalize) {
298             for (int i = 0; i < text.length; i++) {
299                 char c = text.ch[text.offset + i];
300                 if (c != '\n') {
301                     String JavaDoc entity = HTMLEntities.get(c);
302                     if (entity != null) {
303                         printEntity(entity);
304                     }
305                     else {
306                         fPrinter.print(c);
307                     }
308                 }
309                 else {
310                     fPrinter.println();
311                 }
312             }
313         }
314         else {
315             for (int i = 0; i < text.length; i++) {
316                 char c = text.ch[text.offset + i];
317                 fPrinter.print(c);
318             }
319         }
320         fPrinter.flush();
321     } // printCharacters(XMLString,boolean)
322

323     /** Print start element. */
324     protected void printStartElement(QName element, XMLAttributes attributes) {
325
326         // modify META[@http-equiv='content-type']/@content value
327
int contentIndex = -1;
328         String JavaDoc originalContent = null;
329         if (element.rawname.toLowerCase().equals("meta")) {
330             String JavaDoc httpEquiv = null;
331             int length = attributes.getLength();
332             for (int i = 0; i < length; i++) {
333                 String JavaDoc aname = attributes.getQName(i).toLowerCase();
334                 if (aname.equals("http-equiv")) {
335                     httpEquiv = attributes.getValue(i);
336                 }
337                 else if (aname.equals("content")) {
338                     contentIndex = i;
339                 }
340             }
341             if (httpEquiv != null && httpEquiv.toLowerCase().equals("content-type")) {
342                 fSeenHttpEquiv = true;
343                 String JavaDoc content = null;
344                 if (contentIndex != -1) {
345                     originalContent = attributes.getValue(contentIndex);
346                     content = originalContent.toLowerCase();
347                 }
348                 if (content != null) {
349                     int charsetIndex = content.indexOf("charset=");
350                     if (charsetIndex != -1) {
351                         content = content.substring(0, charsetIndex + 8);
352                     }
353                     else {
354                         content += ";charset=";
355                     }
356                     content += fEncoding;
357                     attributes.setValue(contentIndex, content);
358                 }
359             }
360         }
361
362         // print element
363
fPrinter.print('<');
364         fPrinter.print(element.rawname);
365         int attrCount = attributes != null ? attributes.getLength() : 0;
366         for (int i = 0; i < attrCount; i++) {
367             String JavaDoc aname = attributes.getQName(i);
368             String JavaDoc avalue = attributes.getValue(i);
369             fPrinter.print(' ');
370             fPrinter.print(aname);
371             fPrinter.print("=\"");
372             printAttributeValue(avalue);
373             fPrinter.print('"');
374         }
375         fPrinter.print('>');
376         fPrinter.flush();
377
378         // return original META[@http-equiv]/@content value
379
if (contentIndex != -1) {
380             attributes.setValue(contentIndex, originalContent);
381         }
382
383     } // printStartElement(QName,XMLAttributes)
384

385     /** Print end element. */
386     protected void printEndElement(QName element) {
387         fPrinter.print("</");
388         fPrinter.print(element.rawname);
389         fPrinter.print('>');
390         fPrinter.flush();
391     } // printEndElement(QName)
392

393     /** Print entity. */
394     protected void printEntity(String JavaDoc name) {
395         fPrinter.print('&');
396         fPrinter.print(name);
397         fPrinter.print(';');
398         fPrinter.flush();
399     } // printEntity(String)
400

401     //
402
// MAIN
403
//
404

405     /** Main. */
406     public static void main(String JavaDoc[] argv) throws Exception JavaDoc {
407         if (argv.length == 0) {
408             printUsage();
409             System.exit(1);
410         }
411         XMLParserConfiguration parser = new HTMLConfiguration();
412         parser.setFeature(NOTIFY_CHAR_REFS, true);
413         parser.setFeature(NOTIFY_HTML_BUILTIN_REFS, true);
414         String JavaDoc iencoding = null;
415         String JavaDoc oencoding = "Windows-1252";
416         boolean identity = false;
417         boolean purify = false;
418         for (int i = 0; i < argv.length; i++) {
419             String JavaDoc arg = argv[i];
420             if (arg.equals("-ie")) {
421                 iencoding = argv[++i];
422                 continue;
423             }
424             if (arg.equals("-e") || arg.equals("-oe")) {
425                 oencoding = argv[++i];
426                 continue;
427             }
428             if (arg.equals("-i")) {
429                 identity = true;
430                 continue;
431             }
432             if (arg.equals("-p")) {
433                 purify = true;
434                 continue;
435             }
436             if (arg.equals("-h")) {
437                 printUsage();
438                 System.exit(1);
439             }
440             java.util.Vector JavaDoc filtersVector = new java.util.Vector JavaDoc(2);
441             if (identity) {
442                 filtersVector.addElement(new Identity());
443             }
444             else if (purify) {
445                 filtersVector.addElement(new Purifier());
446             }
447             filtersVector.addElement(new Writer(System.out, oencoding));
448             XMLDocumentFilter[] filters =
449                 new XMLDocumentFilter[filtersVector.size()];
450             filtersVector.copyInto(filters);
451             parser.setProperty(FILTERS, filters);
452             XMLInputSource source = new XMLInputSource(null, arg, null);
453             source.setEncoding(iencoding);
454             parser.parse(source);
455         }
456     } // main(String[])
457

458     /** Print usage. */
459     private static void printUsage() {
460         System.err.println("usage: java "+Writer.class.getName()+" (options) file ...");
461         System.err.println();
462         System.err.println("options:");
463         System.err.println(" -ie name Specify IANA name of input encoding.");
464         System.err.println(" -oe name Specify IANA name of output encoding.");
465         System.err.println(" -i Perform identity transform.");
466         System.err.println(" -p Purify output to ensure XML well-formedness.");
467         System.err.println(" -h Display help screen.");
468         System.err.println();
469         System.err.println("notes:");
470         System.err.println(" The -i and -p options are mutually exclusive.");
471         System.err.println(" The -e option has been replaced with -oe.");
472     } // printUsage()
473

474 } // class Writer
475
Popular Tags