HtmlDocument


1   package net.matuschek.html;
2   
3   /************************************************
4    Copyright (c) 2001/2002 by Daniel Matuschek
5    *************************************************/
6   
7   import java.net.MalformedURLException  ;
8   import java.net.URL  ;
9   import java.util.Vector  ;
10  import java.util.StringTokenizer  ;
11  import java.io.*;
12  
13  import org.w3c.dom.Document  ;
14  import org.w3c.dom.Element  ;
15  import org.w3c.dom.NodeList  ;
16  import org.w3c.tidy.Tidy;
17  
18  import org.apache.log4j.Category;
19  
20  import net.matuschek.util.AttribValuePair;
21  
22  /**
23   * This class implements an HTML document
24   *
25   * It uses JTidy to parse the given HTML code to an internal DOM
26   * representation.
27   * 
28   * @author Daniel Matuschek 
29   * @version $Id $
30   */
31  public class HtmlDocument
32  {
33      
34      /** URL of this document */
35      private URL   url = null;
36      
37      /** Content text as an array of bytes (this is how we get it from HTTP !) */
38      private  byte[] content = null;
39      
40      /** the DOM representation of this HTML document */
41      private Document   domDoc = null;
42      
43      /** Log4J category for logging purposes */
44      private Category log;
45      
46      /** encoding */
47      private String   encoding;
48      
49      /** Base URL */
50      private URL   baseURL=null;
51      
52      /** All links */
53      Vector  <URL  > links;
54      
55      
56      /**
57       * initializes HTML document without content
58       */
59      private HtmlDocument(URL   url) {
60          log = Category.getInstance(getClass().getName());
61          this.url = url;
62      }
63      
64      
65      /**
66       * Initializes an HTML document with the given content.
67       * 
68       * @param url the URL of this document. Needed for link extraction.
69       * @param content some HTML text as an array of bytes
70       */
71      public HtmlDocument(URL   url, byte[] content) {
72          this(url);
73          this.content = content;
74          parse();
75      }
76      
77      /**
78       * Initializes an HTML document with the given content.
79       * 
80       * @param url the URL of this document. Needed for link extraction.
81       * @param content some HTML text as an array of bytes
82       * @param newEncoding Is the encoding of the content.
83       */
84      public HtmlDocument(URL   url, byte[] content, String   newEncoding) {
85          this(url);
86          this.content = content;
87          encoding = newEncoding;
88          parse();
89      }
90      
91      
92      /**
93       * Initalizes an HTML document from a String. Convert string to
94       * bytes using default encoding
95       */
96      public HtmlDocument(URL   url, String   contentStr) {
97          this(url);
98          this.content = new byte[contentStr.length()+1];
99          for (int i=0; i<contentStr.length(); i++) {
100             this.content[i] = (byte)contentStr.charAt(i);
101         }
102         parse();
103     }
104     
105     
106     
107     /**
108      * Extracts all links to other documents from this HTML document.
109      *
110      * @return a Vector of URLs containing the included links
111      */
112     private void parse() {
113         if (domDoc == null) {
114             parseToDOM();
115         }
116         this.links = new Vector  <URL  >(); 
117         extractLinks(domDoc.getDocumentElement(),links);
118     }
119     
120     public Vector  <URL  > getLinks() {
121         return this.links;
122     }
123     
124     
125     /**
126      * Extracts all links to included images from this HTML document.
127      *
128      * @return a Vector of URLs containing the included links
129      */
130     public Vector   getImageLinks() {
131         if (domDoc == null) {
132             parseToDOM();
133         }
134         Vector  <URL  > links = new Vector  <URL  >();
135         extractImageLinks(domDoc.getDocumentElement(),links);
136         
137         return links;
138     }
139     
140     
141     /**
142      * gets all Element nodes of a given type as a Vector
143      * @param type the type of elements to return. e.g. type="a"
144      * will return all <A> tags. type must be lowercase
145      * @return a Vector containing all element nodes of the given type
146      */
147     public Vector   getElements(String   type) {
148         if (domDoc == null) {
149             parseToDOM();
150         }
151         
152         Vector   <Element  >links = new Vector  <Element  >();
153         extractElements(domDoc.getDocumentElement(),type,links);
154         
155         return links;
156     }
157     
158     
159     /**
160      * Extract links from the given DOM subtree and put it into the given
161      * vector.
162      *
163      * @param element the top level DOM element of the DOM tree to parse
164      * @param links the vector that will store the links
165      */
166     protected void extractLinks(Element   element, Vector   <URL  >links) {
167         
168         // this should not happen !
169         if (element==null) {
170             log.error("got a null element");
171             return;
172         }
173         
174         String   name = element.getNodeName().toLowerCase();
175         
176         if (name.equals("a")) {
177             
178             // A HREF= 
179             addLink(element.getAttribute("href"),links);
180             
181         } else if (name.equals("base")) {
182                 
183                 // BASE HREF= 
184                 try {
185                     this.baseURL = new URL  (element.getAttribute("href"));
186                     log.info("baseUR="+baseURL);
187                 } catch (MalformedURLException   e) { }
188                 
189         } else if (name.equals("frame")) {
190             
191             // FRAME SRC=
192             addLink(element.getAttribute("src"),links);
193             
194             // handle internal frame (iframes) as well
195         } else if (name.equals("iframe")) {
196             
197             // IFRAME SRC=
198             addLink(element.getAttribute("src"),links);
199             
200         } else if (name.equals("image")) {
201             
202             // IMAGEG SRC= (incorrect, but seems to work in some browsers :(
203             addLink(element.getAttribute("src"),links);
204             
205         } else if (name.equals("img")) {
206             
207             // IMG SRC=
208             addLink(element.getAttribute("src"),links);
209             
210         } else if (name.equals("area")) {
211             
212             // AREA HREF=
213             addLink(element.getAttribute("href"),links);
214             
215         } else if (name.equals("meta")) {
216             
217             // META HTTP-EQUIV=REFRESH
218             String   equiv=element.getAttribute("http-equiv");
219             if ((equiv != null) && (equiv.equalsIgnoreCase("refresh"))) { 
220                 String   refreshcontent=element.getAttribute("content"); 
221                 if (refreshcontent == null) { refreshcontent=""; } 
222                 
223                 StringTokenizer   st=new StringTokenizer  (refreshcontent,";"); 
224                 while (st.hasMoreTokens()) { 
225                     String   token=st.nextToken().trim();
226                     AttribValuePair av = new AttribValuePair(token);
227                     if (av.getAttrib().equals("url")) { 
228                         addLink(av.getValue(),links);
229                     } 
230                 } 
231             }
232             
233         } else if (name.equals("body")) {
234             // BODY BACKGROUND=
235             String   background = element.getAttribute("background");
236             if ( ! ( background == null) ||
237                     ( background.equals("") ) ) {
238                 addLink(background,links);
239             }
240             
241         } else {
242             log.info("Ignore tag name: "+name);
243         }
244         
245         
246         // recursive travel through all childs
247         NodeList   childs = element.getChildNodes();
248         
249         for (int i=0; i<childs.getLength(); i++) {
250             if (childs.item(i) instanceof Element  ) {
251                 extractLinks((Element  )childs.item(i),links);
252             }
253         }
254         
255     }
256     
257     
258     /**
259      * Extract links to includes images from the given DOM subtree and 
260      * put them into the given vector.
261      *
262      * @param element the top level DOM element of the DOM tree to parse
263      * @param links the vector that will store the links
264      */
265     protected void extractImageLinks(Element   element, Vector  <URL  > links) {
266         
267         // this should not happen !
268         if (element==null) {
269             log.error("got a null element");
270             return;
271         }
272         
273         String   name = element.getNodeName();
274         
275         if (name.equals("img")) {
276             // IMG SRC=
277             addLink(element.getAttribute("src"),links);
278         } 
279         
280         if (name.equals("image")) {
281             // IMAGE SRC=
282             addLink(element.getAttribute("src"),links);
283         } 
284         
285         // recursive travel through all childs
286         NodeList   childs = element.getChildNodes();
287         
288         for (int i=0; i<childs.getLength(); i++) {
289             if (childs.item(i) instanceof Element  ) {
290                 extractImageLinks((Element  )childs.item(i),links);
291             }
292         }
293         
294     }
295     
296     
297     /**
298      * Extract elements from the given DOM subtree and put it into the given
299      * vector.
300      *
301      * @param element the top level DOM element of the DOM tree to parse
302      * @param type HTML tag to extract (e.g. "a", "form", "head" ...)
303      * @param elementList the vector that will store the elements
304      */
305     protected void extractElements(Element   element, 
306             String   type, 
307             Vector   <Element  >elementList) {
308         
309         // this should not happen !
310         if (element==null) {
311             log.error("got a null element");
312             return;
313         }
314         
315         String   name = element.getNodeName();
316         
317         if (name.equals(type)) {
318             elementList.add(element);
319         }
320         
321         
322         // recursive travel through all childs
323         NodeList   childs = element.getChildNodes();
324         
325         for (int i=0; i<childs.getLength(); i++) {
326             if (childs.item(i) instanceof Element  ) {
327                 extractElements((Element  )childs.item(i),type,elementList);
328             }
329         }
330         
331     }
332     
333     
334     /**
335      * parses the document to a DOM tree using Tidy
336      */
337     private void parseToDOM() {
338         ByteArrayInputStream is = new ByteArrayInputStream(content);
339         
340         // set tidy parameters
341         Tidy tidy = new Tidy();
342         tidy.setUpperCaseTags(false);
343         tidy.setUpperCaseAttrs(false);
344         tidy.setErrout(new PrintWriter(System.err));
345         
346         domDoc = tidy.parseDOM(is,null);
347     }
348     
349     
350     /**
351      * adds a links to the given vector. ignores (but logs) possible errors
352      */
353     private void addLink(String   newURL, Vector  <URL  > links) {
354         
355         // remove part after # from the URL
356         // thanks to Johannes Christen for bug fix.
357         if ((newURL == null) || (newURL.equals(""))) return;
358         int pos = newURL.indexOf("#");
359         if (pos >=0 ) {
360             newURL = newURL.substring(0,pos);
361         }
362         
363         if (encoding != null) {
364             try {
365                 newURL = new String  (newURL.getBytes(), encoding);
366             } catch (UnsupportedEncodingException e) {
367             }
368         } else {
369             try {
370                 newURL = new String  (newURL.getBytes(), "ISO-8859-1");
371             } catch (UnsupportedEncodingException e) {
372             }
373         }
374         
375         try {
376             URL   u = null;
377             if (this.baseURL != null) {
378                 u = new URL  (this.baseURL,newURL);
379             } else {
380                 u = new URL  (url,newURL);
381             }
382             links.add(u);
383         } catch (Exception   e) {
384             log.debug("error during link extraction: "+e.getMessage()+" "+newURL);
385         }
386     }
387 
388 
389     public URL   getBaseURL() {
390         return baseURL;
391     }
392     
393     
394     
395 }
396
A to Z: JavaDoc & Examples Daily Java News & Articles Open Source Projects Open Source Codes Free Computer Books Remove Frame
Popular Tags