KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > bplatt > spider > SimpleHTMLParser


1 package bplatt.spider;
2
/**
 * SimpleHTMLParser object - simple parser for HTML
 * Copyright 2002, Robert L. Platt, All rights reserved
 * @author Robert L. Platt
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 */

22
23 import java.io.*;
24 import java.util.*;
25 import java.net.*;
26
27 public abstract class SimpleHTMLParser {
28     
29     /**
30      * Constructor for HTMLParser.
31      */

32     public SimpleHTMLParser() { }
33
34     /** Parse an HTML page from an input stream
35      * Handles three types of tokens - TAG, ENDTAG,
36      * and CONTENT. Throws out any comments.
37      */

38     public void parse(Reader r) throws IOException
39     {
40         char buf[] = new char[10];
41         
42         BufferedReader in = new BufferedReader(r);
43         
44         // Get rid of any initial (not-tag) chars.
45
while(true) {
46             read(in,buf,1);
47             if (buf[0] == '<') break;
48         }
49         
50         // Process page
51
int readahead;
52         while(true) {
53             // Process tag or comment
54
readahead = 3;
55             in.mark(readahead);
56             read(in,buf,readahead);
57             if (buf[0] == '!' && buf[1] == '-' && buf[2] == '-') handle_comment(in);
58             else if (buf[0] == '/') {
59                 in.reset();
60                 read(in,buf,1);
61                 handle_tag(SimpleHTMLToken.ENDTAG,in);
62             }
63             else {
64                 in.reset();
65                 handle_tag(SimpleHTMLToken.TAG,in);
66             }
67             
68             // determine if next char is start of new tag or content
69
readahead = 1;
70             in.mark(readahead);
71             try { read(in,buf,readahead); }
72             catch(SocketTimeoutException e) { throw(e); } // Re-throw exception
73
catch(EOFException e) { return; } // EOF is OK after tag
74
catch(IOException e) { throw(e); } // Re-throw exception
75
if (buf[0] != '<') {
76                 in.reset();
77                 if (handle_content(in) == false) return; // EOF is OK
78
}
79         }
80     }
81     
82     // Handle a tag
83
private void handle_tag(int type, BufferedReader in) throws IOException
84     {
85         char buf[] = new char[10];
86         StringBuffer JavaDoc guts = new StringBuffer JavaDoc();
87         while(true) {
88             read(in,buf,1);
89             if (buf[0] == '>') break;
90             guts.append(buf[0]);
91         }
92         SimpleHTMLToken token = new SimpleHTMLToken(type,guts.toString());
93         if (type == SimpleHTMLToken.TAG) processTag(token);
94         else processEndTag(token);
95     }
96     
97     // Throw away comment
98
private void handle_comment(BufferedReader in) throws IOException
99     {
100         char buf[] = new char[10];
101         while(true) {
102             read(in,buf,1);
103             if (buf[0] == '-') {
104                 int readahead = 2;
105                 in.mark(readahead);
106                 read(in,buf,readahead);
107                 if (buf[0] == '-' && buf[1] == '>') return;
108                 else in.reset();
109             }
110         }
111             
112     }
113     
114     // Handle tag content - return true if more content, false if EOF
115
private boolean handle_content(BufferedReader in) throws IOException
116     {
117         char buf[] = new char[10];
118         StringBuffer JavaDoc guts = new StringBuffer JavaDoc();
119         while(true) {
120             try { read(in,buf,1); }
121             catch(SocketTimeoutException e) { throw(e); } // Re-throw exception
122
catch(EOFException e) { return(false); } // EOF is OK after tag
123
catch(IOException e) { throw(e); } // Re-throw exception
124
if (buf[0] == '<') break;
125             else guts.append(buf[0]);
126         }
127         SimpleHTMLToken token = new SimpleHTMLToken(SimpleHTMLToken.CONTENT,guts.toString());
128         processContent(token);
129         return(true);
130     }
131     
132     /** processTag - process a tag */
133     public abstract void processTag(SimpleHTMLToken token) throws IOException;
134     
135     /** processEndTag - process an end tag */
136     public abstract void processEndTag(SimpleHTMLToken token) throws IOException;
137     
138     /** processContent - process content */
139     public abstract void processContent(SimpleHTMLToken token) throws IOException;
140     
141     /** Process a token and return the tag or null
142      * flag indicates whether tag is to be converted to lower case
143      */

144     public static String JavaDoc getTagType(SimpleHTMLToken token, boolean lowerCaseFlag) {
145         if (token.getType() == SimpleHTMLToken.CONTENT) return(null);
146         String JavaDoc content = token.getContent();
147         if (content == null || content.length() == 0) return(null);
148         StringTokenizer tt = new StringTokenizer(content);
149         String JavaDoc tag = null;
150         try { tag = tt.nextToken(); }
151         catch(NoSuchElementException e) { return(null); }
152         return((lowerCaseFlag?tag.toLowerCase():tag));
153     }
154     
155     // Read() - handle blocking / EOF
156
private void read(BufferedReader r,char[] buf,int nchars) throws IOException
157     {
158         int flag = 10;
159         int charsToRead = nchars;
160         
161         while (charsToRead != 0 && flag != 0) {
162             int charsRead = r.read(buf,0,nchars);
163             if (charsRead == -1) throw new EOFException("Premature EOF while parsing HTML");
164             charsToRead = charsToRead - charsRead;
165             flag--;
166             if (flag<=5) {
167                 // Wait a second
168
Thread JavaDoc mythread = Thread.currentThread();
169                 try { mythread.sleep(1000,0); }
170                 catch(InterruptedException JavaDoc e) { /* Ignore it */ }
171             }
172         }
173         if (flag == 0) throw new SocketTimeoutException("Input timed-out while parsing HTML");
174     }
175 }
176
Popular Tags