KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > net > matuschek > html > HtmlDocument


1 package net.matuschek.html;
2
3 /************************************************
4  Copyright (c) 2001/2002 by Daniel Matuschek
5  *************************************************/

6
7 import java.net.MalformedURLException JavaDoc;
8 import java.net.URL JavaDoc;
9 import java.util.Vector JavaDoc;
10 import java.util.StringTokenizer JavaDoc;
11 import java.io.*;
12
13 import org.w3c.dom.Document JavaDoc;
14 import org.w3c.dom.Element JavaDoc;
15 import org.w3c.dom.NodeList JavaDoc;
16 import org.w3c.tidy.Tidy;
17
18 import org.apache.log4j.Category;
19
20 import net.matuschek.util.AttribValuePair;
21
22 /**
23  * This class implements an HTML document
24  *
25  * It uses JTidy to parse the given HTML code to an internal DOM
26  * representation.
27  *
28  * @author Daniel Matuschek
29  * @version $Id $
30  */

31 public class HtmlDocument
32 {
33     
34     /** URL of this document */
35     private URL JavaDoc url = null;
36     
37     /** Content text as an array of bytes (this is how we get it from HTTP !) */
38     private byte[] content = null;
39     
40     /** the DOM representation of this HTML document */
41     private Document JavaDoc domDoc = null;
42     
43     /** Log4J category for logging purposes */
44     private Category log;
45     
46     /** encoding */
47     private String JavaDoc encoding;
48     
49     /** Base URL */
50     private URL JavaDoc baseURL=null;
51     
52     /** All links */
53     Vector JavaDoc<URL JavaDoc> links;
54     
55     
56     /**
57      * initializes HTML document without content
58      */

59     private HtmlDocument(URL JavaDoc url) {
60         log = Category.getInstance(getClass().getName());
61         this.url = url;
62     }
63     
64     
65     /**
66      * Initializes an HTML document with the given content.
67      *
68      * @param url the URL of this document. Needed for link extraction.
69      * @param content some HTML text as an array of bytes
70      */

71     public HtmlDocument(URL JavaDoc url, byte[] content) {
72         this(url);
73         this.content = content;
74         parse();
75     }
76     
77     /**
78      * Initializes an HTML document with the given content.
79      *
80      * @param url the URL of this document. Needed for link extraction.
81      * @param content some HTML text as an array of bytes
82      * @param newEncoding Is the encoding of the content.
83      */

84     public HtmlDocument(URL JavaDoc url, byte[] content, String JavaDoc newEncoding) {
85         this(url);
86         this.content = content;
87         encoding = newEncoding;
88         parse();
89     }
90     
91     
92     /**
93      * Initalizes an HTML document from a String. Convert string to
94      * bytes using default encoding
95      */

96     public HtmlDocument(URL JavaDoc url, String JavaDoc contentStr) {
97         this(url);
98         this.content = new byte[contentStr.length()+1];
99         for (int i=0; i<contentStr.length(); i++) {
100             this.content[i] = (byte)contentStr.charAt(i);
101         }
102         parse();
103     }
104     
105     
106     
107     /**
108      * Extracts all links to other documents from this HTML document.
109      *
110      * @return a Vector of URLs containing the included links
111      */

112     private void parse() {
113         if (domDoc == null) {
114             parseToDOM();
115         }
116         this.links = new Vector JavaDoc<URL JavaDoc>();
117         extractLinks(domDoc.getDocumentElement(),links);
118     }
119     
120     public Vector JavaDoc<URL JavaDoc> getLinks() {
121         return this.links;
122     }
123     
124     
125     /**
126      * Extracts all links to included images from this HTML document.
127      *
128      * @return a Vector of URLs containing the included links
129      */

130     public Vector JavaDoc getImageLinks() {
131         if (domDoc == null) {
132             parseToDOM();
133         }
134         Vector JavaDoc<URL JavaDoc> links = new Vector JavaDoc<URL JavaDoc>();
135         extractImageLinks(domDoc.getDocumentElement(),links);
136         
137         return links;
138     }
139     
140     
141     /**
142      * gets all Element nodes of a given type as a Vector
143      * @param type the type of elements to return. e.g. type="a"
144      * will return all <A> tags. type must be lowercase
145      * @return a Vector containing all element nodes of the given type
146      */

147     public Vector JavaDoc getElements(String JavaDoc type) {
148         if (domDoc == null) {
149             parseToDOM();
150         }
151         
152         Vector JavaDoc <Element JavaDoc>links = new Vector JavaDoc<Element JavaDoc>();
153         extractElements(domDoc.getDocumentElement(),type,links);
154         
155         return links;
156     }
157     
158     
159     /**
160      * Extract links from the given DOM subtree and put it into the given
161      * vector.
162      *
163      * @param element the top level DOM element of the DOM tree to parse
164      * @param links the vector that will store the links
165      */

166     protected void extractLinks(Element JavaDoc element, Vector JavaDoc <URL JavaDoc>links) {
167         
168         // this should not happen !
169
if (element==null) {
170             log.error("got a null element");
171             return;
172         }
173         
174         String JavaDoc name = element.getNodeName().toLowerCase();
175         
176         if (name.equals("a")) {
177             
178             // A HREF=
179
addLink(element.getAttribute("href"),links);
180             
181         } else if (name.equals("base")) {
182                 
183                 // BASE HREF=
184
try {
185                     this.baseURL = new URL JavaDoc(element.getAttribute("href"));
186                     log.info("baseUR="+baseURL);
187                 } catch (MalformedURLException JavaDoc e) { }
188                 
189         } else if (name.equals("frame")) {
190             
191             // FRAME SRC=
192
addLink(element.getAttribute("src"),links);
193             
194             // handle internal frame (iframes) as well
195
} else if (name.equals("iframe")) {
196             
197             // IFRAME SRC=
198
addLink(element.getAttribute("src"),links);
199             
200         } else if (name.equals("image")) {
201             
202             // IMAGEG SRC= (incorrect, but seems to work in some browsers :(
203
addLink(element.getAttribute("src"),links);
204             
205         } else if (name.equals("img")) {
206             
207             // IMG SRC=
208
addLink(element.getAttribute("src"),links);
209             
210         } else if (name.equals("area")) {
211             
212             // AREA HREF=
213
addLink(element.getAttribute("href"),links);
214             
215         } else if (name.equals("meta")) {
216             
217             // META HTTP-EQUIV=REFRESH
218
String JavaDoc equiv=element.getAttribute("http-equiv");
219             if ((equiv != null) && (equiv.equalsIgnoreCase("refresh"))) {
220                 String JavaDoc refreshcontent=element.getAttribute("content");
221                 if (refreshcontent == null) { refreshcontent=""; }
222                 
223                 StringTokenizer JavaDoc st=new StringTokenizer JavaDoc(refreshcontent,";");
224                 while (st.hasMoreTokens()) {
225                     String JavaDoc token=st.nextToken().trim();
226                     AttribValuePair av = new AttribValuePair(token);
227                     if (av.getAttrib().equals("url")) {
228                         addLink(av.getValue(),links);
229                     }
230                 }
231             }
232             
233         } else if (name.equals("body")) {
234             // BODY BACKGROUND=
235
String JavaDoc background = element.getAttribute("background");
236             if ( ! ( background == null) ||
237                     ( background.equals("") ) ) {
238                 addLink(background,links);
239             }
240             
241         } else {
242             log.info("Ignore tag name: "+name);
243         }
244         
245         
246         // recursive travel through all childs
247
NodeList JavaDoc childs = element.getChildNodes();
248         
249         for (int i=0; i<childs.getLength(); i++) {
250             if (childs.item(i) instanceof Element JavaDoc) {
251                 extractLinks((Element JavaDoc)childs.item(i),links);
252             }
253         }
254         
255     }
256     
257     
258     /**
259      * Extract links to includes images from the given DOM subtree and
260      * put them into the given vector.
261      *
262      * @param element the top level DOM element of the DOM tree to parse
263      * @param links the vector that will store the links
264      */

265     protected void extractImageLinks(Element JavaDoc element, Vector JavaDoc<URL JavaDoc> links) {
266         
267         // this should not happen !
268
if (element==null) {
269             log.error("got a null element");
270             return;
271         }
272         
273         String JavaDoc name = element.getNodeName();
274         
275         if (name.equals("img")) {
276             // IMG SRC=
277
addLink(element.getAttribute("src"),links);
278         }
279         
280         if (name.equals("image")) {
281             // IMAGE SRC=
282
addLink(element.getAttribute("src"),links);
283         }
284         
285         // recursive travel through all childs
286
NodeList JavaDoc childs = element.getChildNodes();
287         
288         for (int i=0; i<childs.getLength(); i++) {
289             if (childs.item(i) instanceof Element JavaDoc) {
290                 extractImageLinks((Element JavaDoc)childs.item(i),links);
291             }
292         }
293         
294     }
295     
296     
297     /**
298      * Extract elements from the given DOM subtree and put it into the given
299      * vector.
300      *
301      * @param element the top level DOM element of the DOM tree to parse
302      * @param type HTML tag to extract (e.g. "a", "form", "head" ...)
303      * @param elementList the vector that will store the elements
304      */

305     protected void extractElements(Element JavaDoc element,
306             String JavaDoc type,
307             Vector JavaDoc <Element JavaDoc>elementList) {
308         
309         // this should not happen !
310
if (element==null) {
311             log.error("got a null element");
312             return;
313         }
314         
315         String JavaDoc name = element.getNodeName();
316         
317         if (name.equals(type)) {
318             elementList.add(element);
319         }
320         
321         
322         // recursive travel through all childs
323
NodeList JavaDoc childs = element.getChildNodes();
324         
325         for (int i=0; i<childs.getLength(); i++) {
326             if (childs.item(i) instanceof Element JavaDoc) {
327                 extractElements((Element JavaDoc)childs.item(i),type,elementList);
328             }
329         }
330         
331     }
332     
333     
334     /**
335      * parses the document to a DOM tree using Tidy
336      */

337     private void parseToDOM() {
338         ByteArrayInputStream is = new ByteArrayInputStream(content);
339         
340         // set tidy parameters
341
Tidy tidy = new Tidy();
342         tidy.setUpperCaseTags(false);
343         tidy.setUpperCaseAttrs(false);
344         tidy.setErrout(new PrintWriter(System.err));
345         
346         domDoc = tidy.parseDOM(is,null);
347     }
348     
349     
350     /**
351      * adds a links to the given vector. ignores (but logs) possible errors
352      */

353     private void addLink(String JavaDoc newURL, Vector JavaDoc<URL JavaDoc> links) {
354         
355         // remove part after # from the URL
356
// thanks to Johannes Christen for bug fix.
357
if ((newURL == null) || (newURL.equals(""))) return;
358         int pos = newURL.indexOf("#");
359         if (pos >=0 ) {
360             newURL = newURL.substring(0,pos);
361         }
362         
363         if (encoding != null) {
364             try {
365                 newURL = new String JavaDoc(newURL.getBytes(), encoding);
366             } catch (UnsupportedEncodingException e) {
367             }
368         } else {
369             try {
370                 newURL = new String JavaDoc(newURL.getBytes(), "ISO-8859-1");
371             } catch (UnsupportedEncodingException e) {
372             }
373         }
374         
375         try {
376             URL JavaDoc u = null;
377             if (this.baseURL != null) {
378                 u = new URL JavaDoc(this.baseURL,newURL);
379             } else {
380                 u = new URL JavaDoc(url,newURL);
381             }
382             links.add(u);
383         } catch (Exception JavaDoc e) {
384             log.debug("error during link extraction: "+e.getMessage()+" "+newURL);
385         }
386     }
387
388
389     public URL JavaDoc getBaseURL() {
390         return baseURL;
391     }
392     
393     
394     
395 }
396
Popular Tags