KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > nu > xom > samples > StreamingXHTMLPurifier


/* Copyright 2002-2004 Elliotte Rusty Harold
   
   This library is free software; you can redistribute it and/or modify
   it under the terms of version 2.1 of the GNU Lesser General Public
   License as published by the Free Software Foundation.
   
   This library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
   GNU Lesser General Public License for more details.
   
   You should have received a copy of the GNU Lesser General Public
   License along with this library; if not, write to the
   Free Software Foundation, Inc., 59 Temple Place, Suite 330,
   Boston, MA 02111-1307 USA
   
   You can contact Elliotte Rusty Harold by sending e-mail to
   elharo@metalab.unc.edu. Please include the word "XOM" in the
   subject line. The XOM home page is located at http://www.xom.nu/
*/


package nu.xom.samples;

import java.io.IOException;
import java.util.Stack;

import nu.xom.Attribute;
import nu.xom.Builder;
import nu.xom.DocType;
import nu.xom.Document;
import nu.xom.Element;
import nu.xom.NodeFactory;
import nu.xom.Nodes;
import nu.xom.ParsingException;
import nu.xom.Serializer;

/**
 * <p>
 * Demonstrates a custom <code>NodeFactory</code> that strips out
 * all non-XHTML elements. It’s easy enough to drop out any elements
 * that are not in the XHTML namespace. However, in the case of SVG,
 * MathML and most other applications you’ll want to remove the
 * content of these elements as well. I’ll assume that the namespace
 * for text is the same as the namespace of the parent element.
 * (This is not at all clear from the namespaces specification,
 * but it makes sense in many cases.) To track the nearest namespace
 * for non-elements, <code>startMakingElement()</code> pushes the
 * element’s namespace onto a stack and
 * <code>finishMakingElement()</code> pops it off. Peeking at the top
 * of the stack tells you what namespace the nearest element uses.
 * This is modeled after Example 8-9 in
 * <cite>Processing XML with Java</cite>.
 * </p>
 *
 * @author Elliotte Rusty Harold
 * @version 1.0
 *
 */
59 public class StreamingXHTMLPurifier extends NodeFactory {
60
61     private Stack JavaDoc namespaces = new Stack JavaDoc();
62     private Nodes empty = new Nodes();
63     public final static String JavaDoc XHTML_NAMESPACE
64       = "http://www.w3.org/1999/xhtml";
65
66     // We need text nodes only inside XHTML
67
public Nodes makeText(String JavaDoc data) {
68         if (inXHTML()) return super.makeText(data);
69         return empty;
70     }
71
72     public Nodes makeComment(String JavaDoc data) {
73         if (inXHTML()) return super.makeComment(data);
74         return empty;
75     }
76
77     
78     private boolean inXHTML() {
79         if (namespaces.isEmpty()) return true; // document prolog
80
String JavaDoc currentNamespace = (String JavaDoc) (namespaces.peek());
81         if (XHTML_NAMESPACE.equals(currentNamespace)) return true;
82         return false;
83     }
84
85     public Element startMakingElement(String JavaDoc name, String JavaDoc namespace) {
86         
87         namespaces.push(namespace);
88         if (XHTML_NAMESPACE.equals(namespace)) {
89             return super.startMakingElement(name, namespace);
90         }
91         return null;
92     }
93     
94     public Nodes finishMakingElement(Element element) {
95         namespaces.pop();
96         int namespaceCount = element.getNamespaceDeclarationCount();
97         for (int i = 0; i < namespaceCount; i++) {
98             String JavaDoc prefix = element.getNamespacePrefix(i);
99             element.removeNamespaceDeclaration(prefix);
100             if (element.getNamespaceDeclarationCount() < namespaceCount) {
101                 i--;
102                 namespaceCount--;
103             }
104         }
105         return new Nodes(element);
106     }
107
108     public Nodes makeDocType(String JavaDoc rootElementName,
109       String JavaDoc publicID, String JavaDoc systemID) {
110         return new Nodes(new DocType("html",
111           "PUBLIC \"-//W3C//DTD XHTML Basic 1.0//EN\"",
112           "http://www.w3.org/TR/xhtml-basic/xhtml-basic10.dtd"));
113     }
114
115     public Nodes makeProcessingInstruction(
116       String JavaDoc target, String JavaDoc data) {
117         if (inXHTML()) {
118             return super.makeProcessingInstruction(target, data);
119         }
120         return empty;
121     }
122
123     public Nodes makeAttribute(String JavaDoc name, String JavaDoc URI,
124       String JavaDoc value, Attribute.Type type) {
125         if ("".equals(URI)
126           || "http://www.w3.org/XML/1998/namespace".equals(URI)) {
127             return super.makeAttribute(name, URI, value, type);
128         }
129         return empty;
130     }
131
132     public static void main(String JavaDoc[] args) {
133   
134         if (args.length == 0) {
135             System.out.println(
136               "Usage: java nu.xom.samples.StreamingXHTMLPurifier URL"
137             );
138             return;
139         }
140       
141         StreamingXHTMLPurifier factory = new StreamingXHTMLPurifier();
142         Builder builder = new Builder(factory);
143      
144         try {
145             Document doc = builder.build(args[0]);
146             Serializer serializer = new Serializer(System.out);
147             serializer.write(doc);
148         }
149         // indicates a well-formedness error
150
catch (ParsingException ex) {
151             System.out.println(args[0] + " is not well-formed.");
152             System.out.println(ex.getMessage());
153             ex.printStackTrace();
154         }
155         catch (IOException JavaDoc ex) {
156             System.out.println(ex);
157         }
158   
159     }
160
161 }
162
Popular Tags