KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > cyberneko > html > filters > Purifier


1 /*
2  * (C) Copyright 2004-2005, Andy Clark. All rights reserved.
3  *
4  * This file is distributed under an Apache style license. Please
5  * refer to the LICENSE file for specific details.
6  */

7
8 package org.cyberneko.html.filters;
9
10 import org.cyberneko.html.HTMLAugmentations;
11 import org.cyberneko.html.HTMLEventInfo;
12
13 import java.lang.reflect.Method JavaDoc;
14 import java.lang.reflect.InvocationTargetException JavaDoc;
15
16 import org.apache.xerces.util.XMLChar;
17 import org.apache.xerces.util.XMLStringBuffer;
18 import org.apache.xerces.xni.Augmentations;
19 import org.apache.xerces.xni.NamespaceContext;
20 import org.apache.xerces.xni.QName;
21 import org.apache.xerces.xni.XMLAttributes;
22 import org.apache.xerces.xni.XMLLocator;
23 import org.apache.xerces.xni.XMLString;
24 import org.apache.xerces.xni.XNIException;
25 import org.apache.xerces.xni.parser.XMLComponentManager;
26 import org.apache.xerces.xni.parser.XMLConfigurationException;
27
28 /**
29  * This filter purifies the HTML input to ensure XML well-formedness.
30  * The purification process includes:
31  * <ul>
32  * <li>fixing illegal characters in the document, including
33  * <ul>
34  * <li>element and attribute names,
35  * <li>processing instruction target and data,
36  * <li>document text;
37  * </ul>
38  * <li>ensuring the string "--" does not appear in the content of
39  * a comment;
40  * <li>ensuring the string "]]>" does not appear in the content of
41  * a CDATA section;
42  * <li>ensuring that the XML declaration has required pseudo-attributes
43  * and that the values are correct;
44  * and
45  * <li>synthesized missing namespace bindings.
46  * </ul>
47  * <p>
48  * Illegal characters in XML names are converted to the character
49  * sequence "_u####_" where "####" is the value of the Unicode
50  * character represented in hexadecimal. Whereas illegal characters
51  * appearing in document content is converted to the character
52  * sequence "\\u####".
53  * <p>
54  * In comments, the character '-' is replaced by the character
55  * sequence "- " to prevent "--" from ever appearing in the comment
56  * content. For CDATA sections, the character ']' is replaced by
57  * the character sequence "] " to prevent "]]" from appearing.
58  * <p>
59  * The URI used for synthesized namespace bindings is
60  * "http://cyberneko.org/html/ns/synthesized/<i>number</i>" where
61  * <i>number</i> is generated to ensure uniqueness.
62  *
63  * @author Andy Clark
64  *
65  * @version $Id: Purifier.java,v 1.5 2005/02/14 03:56:54 andyc Exp $
66  */

67 public class Purifier
68     extends DefaultFilter {
69
70     //
71
// Constants
72
//
73

74     /** Synthesized namespace binding prefix. */
75     public static final String JavaDoc SYNTHESIZED_NAMESPACE_PREFX =
76         "http://cyberneko.org/html/ns/synthesized/";
77
78     /** Namespaces. */
79     protected static final String JavaDoc NAMESPACES = "http://xml.org/sax/features/namespaces";
80
81     /** Include infoset augmentations. */
82     protected static final String JavaDoc AUGMENTATIONS = "http://cyberneko.org/html/features/augmentations";
83
84     /** Recognized features. */
85     private static final String JavaDoc[] RECOGNIZED_FEATURES = {
86         NAMESPACES,
87         AUGMENTATIONS,
88     };
89
90     /** Recognized features defaults. */
91     private static final Boolean JavaDoc[] RECOGNIZED_FEATURES_DEFAULTS = {
92         null,
93         null,
94     };
95
96     // static vars
97

98     /** Synthesized event info item. */
99     protected static final HTMLEventInfo SYNTHESIZED_ITEM =
100         new HTMLEventInfo.SynthesizedItem();
101
102     //
103
// Data
104
//
105

106     // features
107

108     /** Namespaces. */
109     protected boolean fNamespaces;
110
111     /** Augmentations. */
112     protected boolean fAugmentations;
113
114     // state
115

116     /** True if the doctype declaration was seen. */
117     protected boolean fSeenDoctype;
118
119     /** True if root element was seen. */
120     protected boolean fSeenRootElement;
121
122     /** True if inside a CDATA section. */
123     protected boolean fInCDATASection;
124
125     // doctype declaration info
126

127     /** Public identifier of doctype declaration. */
128     protected String JavaDoc fPublicId;
129
130     /** System identifier of doctype declaration. */
131     protected String JavaDoc fSystemId;
132
133     // namespace info
134

135     /** Namespace information. */
136     protected NamespaceContext fNamespaceContext;
137
138     /** Synthesized namespace binding count. */
139     protected int fSynthesizedNamespaceCount;
140
141     // temp vars
142

143     /** Qualified name. */
144     private QName fQName = new QName();
145
146     /** Augmentations. */
147     private final HTMLAugmentations fInfosetAugs = new HTMLAugmentations();
148
149     /** String buffer. */
150     private final XMLStringBuffer fStringBuffer = new XMLStringBuffer();
151
152     //
153
// XMLComponent methods
154
//
155

156     public void reset(XMLComponentManager manager)
157         throws XMLConfigurationException {
158
159         // state
160
fInCDATASection = false;
161
162         // features
163
fNamespaces = manager.getFeature(NAMESPACES);
164         fAugmentations = manager.getFeature(AUGMENTATIONS);
165
166     } // reset(XMLComponentManager)
167

168     //
169
// XMLDocumentHandler methods
170
//
171

172     /** Start document. */
173     public void startDocument(XMLLocator locator, String JavaDoc encoding,
174                               Augmentations augs) throws XNIException {
175         fNamespaceContext = fNamespaces
176                           ? new NamespaceBinder.NamespaceSupport() : null;
177         fSynthesizedNamespaceCount = 0;
178         handleStartDocument();
179         super.startDocument(locator, encoding, augs);
180     } // startDocument(XMLLocator,String,Augmentations)
181

182     /** Start document. */
183     public void startDocument(XMLLocator locator, String JavaDoc encoding,
184                               NamespaceContext nscontext, Augmentations augs)
185         throws XNIException {
186         fNamespaceContext = nscontext;
187         fSynthesizedNamespaceCount = 0;
188         handleStartDocument();
189         super.startDocument(locator, encoding, nscontext, augs);
190     } // startDocument(XMLLocator,NamespaceContext,String,Augmentations)
191

192     /** XML declaration. */
193     public void xmlDecl(String JavaDoc version, String JavaDoc encoding, String JavaDoc standalone,
194                         Augmentations augs) throws XNIException {
195         if (version == null || !version.equals("1.0")) {
196             version = "1.0";
197         }
198         if (encoding != null && encoding.length() == 0) {
199             encoding = null;
200         }
201         if (standalone != null) {
202             if (!standalone.equalsIgnoreCase("true") &&
203                 !standalone.equalsIgnoreCase("false")) {
204                 standalone = null;
205             }
206             else {
207                 standalone = standalone.toLowerCase();
208             }
209         }
210         super.xmlDecl(version,encoding,standalone,augs);
211     } // xmlDecl(String,String,String,Augmentations)
212

213     /** Comment. */
214     public void comment(XMLString text, Augmentations augs)
215         throws XNIException {
216         StringBuffer JavaDoc str = new StringBuffer JavaDoc(purifyText(text).toString());
217         int length = str.length();
218         for (int i = length-1; i >= 0; i--) {
219             char c = str.charAt(i);
220             if (c == '-') {
221                 str.insert(i + 1, ' ');
222             }
223         }
224         fStringBuffer.length = 0;
225         fStringBuffer.append(str.toString());
226         text = fStringBuffer;
227         super.comment(text, augs);
228     } // comment(XMLString,Augmentations)
229

230     /** Processing instruction. */
231     public void processingInstruction(String JavaDoc target, XMLString data,
232                                       Augmentations augs)
233         throws XNIException {
234         target = purifyName(target, true);
235         data = purifyText(data);
236         super.processingInstruction(target, data, augs);
237     } // processingInstruction(String,XMLString,Augmentations)
238

239     /** Doctype declaration. */
240     public void doctypeDecl(String JavaDoc root, String JavaDoc pubid, String JavaDoc sysid,
241                             Augmentations augs) throws XNIException {
242         fSeenDoctype = true;
243         // NOTE: It doesn't matter what the root element name is because
244
// it must match the root element. -Ac
245
fPublicId = pubid;
246         fSystemId = sysid;
247         // NOTE: If the public identifier is specified, then a system
248
// identifier must also be specified. -Ac
249
if (fPublicId != null && fSystemId == null) {
250             fSystemId = "";
251         }
252         // NOTE: Can't save the augmentations because the object state
253
// is transient. -Ac
254
} // doctypeDecl(String,String,String,Augmentations)
255

256     /** Start element. */
257     public void startElement(QName element, XMLAttributes attrs,
258                              Augmentations augs) throws XNIException {
259         handleStartElement(element, attrs);
260         super.startElement(element, attrs, augs);
261     } // startElement(QName,XMLAttributes,Augmentations)
262

263     /** Empty element. */
264     public void emptyElement(QName element, XMLAttributes attrs,
265                              Augmentations augs) throws XNIException {
266         handleStartElement(element, attrs);
267         super.emptyElement(element, attrs, augs);
268     } // emptyElement(QName,XMLAttributes,Augmentations)
269

270     /** Start CDATA section. */
271     public void startCDATA(Augmentations augs) throws XNIException {
272         fInCDATASection = true;
273         super.startCDATA(augs);
274     } // startCDATA(Augmentations)
275

276     /** End CDATA section. */
277     public void endCDATA(Augmentations augs) throws XNIException {
278         fInCDATASection = false;
279         super.endCDATA(augs);
280     } // endCDATA(Augmentations)
281

282     /** Characters. */
283     public void characters(XMLString text, Augmentations augs)
284         throws XNIException {
285         text = purifyText(text);
286         if (fInCDATASection) {
287             StringBuffer JavaDoc str = new StringBuffer JavaDoc(text.toString());
288             int length = str.length();
289             for (int i = length-1; i >= 0; i--) {
290                 char c = str.charAt(i);
291                 if (c == ']') {
292                     str.insert(i + 1, ' ');
293                 }
294             }
295             fStringBuffer.length = 0;
296             fStringBuffer.append(str.toString());
297             text = fStringBuffer;
298         }
299         super.characters(text,augs);
300     } // characters(XMLString,Augmentations)
301

302     /** End element. */
303     public void endElement(QName element, Augmentations augs)
304         throws XNIException {
305         element = purifyQName(element);
306         if (fNamespaces) {
307             if (element.prefix != null && element.uri == null) {
308                 element.uri = fNamespaceContext.getURI(element.prefix);
309             }
310         }
311         super.endElement(element, augs);
312     } // endElement(QName,Augmentations)
313

314     //
315
// Protected methods
316
//
317

318     /** Handle start document. */
319     protected void handleStartDocument() {
320         fSeenDoctype = false;
321         fSeenRootElement = false;
322     } // handleStartDocument()
323

324     /** Handle start element. */
325     protected void handleStartElement(QName element, XMLAttributes attrs) {
326
327         // handle element and attributes
328
element = purifyQName(element);
329         int attrCount = attrs != null ? attrs.getLength() : 0;
330         for (int i = attrCount-1; i >= 0; i--) {
331             // purify attribute name
332
attrs.getName(i, fQName);
333             attrs.setName(i, purifyQName(fQName));
334
335             // synthesize namespace bindings
336
if (fNamespaces) {
337                 if (!fQName.rawname.equals("xmlns") &&
338                     !fQName.rawname.startsWith("xmlns:")) {
339                     // NOTE: Must get attribute name again because the
340
// purifyQName method does not guarantee that
341
// the same QName object is returned. -Ac
342
attrs.getName(i, fQName);
343                     if (fQName.prefix != null && fQName.uri == null) {
344                         synthesizeBinding(attrs, fQName.prefix);
345                     }
346                 }
347             }
348         }
349
350         // synthesize namespace bindings
351
if (fNamespaces) {
352             if (element.prefix != null && element.uri == null) {
353                 synthesizeBinding(attrs, element.prefix);
354             }
355         }
356
357         // synthesize doctype declaration
358
if (!fSeenRootElement && fSeenDoctype) {
359             Augmentations augs = synthesizedAugs();
360             super.doctypeDecl(element.rawname, fPublicId, fSystemId, augs);
361         }
362
363         // mark start element as seen
364
fSeenRootElement = true;
365
366     } // handleStartElement(QName,XMLAttributes)
367

368     /** Synthesize namespace binding. */
369     protected void synthesizeBinding(XMLAttributes attrs, String JavaDoc ns) {
370         String JavaDoc prefix = "xmlns";
371         String JavaDoc localpart = ns;
372         String JavaDoc qname = prefix+':'+localpart;
373         String JavaDoc uri = NamespaceBinder.NAMESPACES_URI;
374         String JavaDoc atype = "CDATA";
375         String JavaDoc avalue = SYNTHESIZED_NAMESPACE_PREFX+fSynthesizedNamespaceCount++;
376         
377         // add attribute
378
fQName.setValues(prefix, localpart, qname, uri);
379         attrs.addAttribute(fQName, atype, avalue);
380
381         // bind namespace
382
fNamespaceContext.declarePrefix(ns, avalue);
383
384     } // synthesizeBinding(XMLAttributes,String)
385

386     /** Returns an augmentations object with a synthesized item added. */
387     protected final Augmentations synthesizedAugs() {
388         HTMLAugmentations augs = null;
389         if (fAugmentations) {
390             augs = fInfosetAugs;
391             augs.removeAllItems();
392             augs.putItem(AUGMENTATIONS, SYNTHESIZED_ITEM);
393         }
394         return augs;
395     } // synthesizedAugs():Augmentations
396

397     //
398
// Protected methods
399
//
400

401     /** Purify qualified name. */
402     protected QName purifyQName(QName qname) {
403         qname.prefix = purifyName(qname.prefix, true);
404         qname.localpart = purifyName(qname.localpart, true);
405         qname.rawname = purifyName(qname.rawname, false);
406         return qname;
407     } // purifyQName(QName):QName
408

409     /** Purify name. */
410     protected String JavaDoc purifyName(String JavaDoc name, boolean localpart) {
411         if (name == null) {
412             return name;
413         }
414         StringBuffer JavaDoc str = new StringBuffer JavaDoc();
415         int length = name.length();
416         boolean seenColon = localpart;
417         for (int i = 0; i < length; i++) {
418             char c = name.charAt(i);
419             if (i == 0) {
420                 if (!XMLChar.isNameStart(c)) {
421                     str.append("_u"+toHexString(c,4)+"_");
422                 }
423                 else {
424                     str.append(c);
425                 }
426             }
427             else {
428                 if ((fNamespaces && c == ':' && seenColon) || !XMLChar.isName(c)) {
429                     str.append("_u"+toHexString(c,4)+"_");
430                 }
431                 else {
432                     str.append(c);
433                 }
434                 seenColon = seenColon || c == ':';
435             }
436         }
437         return str.toString();
438     } // purifyName(String):String
439

440     /** Purify content. */
441     protected XMLString purifyText(XMLString text) {
442         fStringBuffer.length = 0;
443         for (int i = 0; i < text.length; i++) {
444             char c = text.ch[text.offset+i];
445             if (XMLChar.isInvalid(c)) {
446                 fStringBuffer.append("\\u"+toHexString(c,4));
447             }
448             else {
449                 fStringBuffer.append(c);
450             }
451         }
452         return fStringBuffer;
453     } // purifyText(XMLString):XMLString
454

455     //
456
// Protected static methods
457
//
458

459     /** Returns a padded hexadecimal string for the given value. */
460     protected static String JavaDoc toHexString(int c, int padlen) {
461         StringBuffer JavaDoc str = new StringBuffer JavaDoc(padlen);
462         str.append(Integer.toHexString(c));
463         int len = padlen - str.length();
464         for (int i = 0; i < len; i++) {
465             str.insert(0, '0');
466         }
467         return str.toString().toUpperCase();
468     } // toHexString(int,int):String
469

470 } // class Purifier
471
Popular Tags