KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > com > quadcap > http > client > HtmlParser


1 package com.quadcap.http.client;
2
3 /* Copyright 1999 - 2003 Quadcap Software. All rights reserved.
4  *
5  * This software is distributed under the Quadcap Free Software License.
6  * This software may be used or modified for any purpose, personal or
7  * commercial. Open Source redistributions are permitted. Commercial
8  * redistribution of larger works derived from, or works which bundle
9  * this software requires a "Commercial Redistribution License"; see
10  * http://www.quadcap.com/purchase.
11  *
12  * Redistributions qualify as "Open Source" under one of the following terms:
13  *
14  * Redistributions are made at no charge beyond the reasonable cost of
15  * materials and delivery.
16  *
17  * Redistributions are accompanied by a copy of the Source Code or by an
18  * irrevocable offer to provide a copy of the Source Code for up to three
19  * years at the cost of materials and delivery. Such redistributions
20  * must allow further use, modification, and redistribution of the Source
21  * Code under substantially the same terms as this license.
22  *
23  * Redistributions of source code must retain the copyright notices as they
24  * appear in each source code file, these license terms, and the
25  * disclaimer/limitation of liability set forth as paragraph 6 below.
26  *
27  * Redistributions in binary form must reproduce this Copyright Notice,
28  * these license terms, and the disclaimer/limitation of liability set
29  * forth as paragraph 6 below, in the documentation and/or other materials
30  * provided with the distribution.
31  *
32  * The Software is provided on an "AS IS" basis. No warranty is
33  * provided that the Software is free of defects, or fit for a
34  * particular purpose.
35  *
36  * Limitation of Liability. Quadcap Software shall not be liable
37  * for any damages suffered by the Licensee or any third party resulting
38  * from use of the Software.
39  */

40
41 import java.io.CharArrayWriter JavaDoc;
42 import java.io.IOException JavaDoc;
43 import java.io.Reader JavaDoc;
44
45 import org.xml.sax.AttributeList JavaDoc;
46 import org.xml.sax.DocumentHandler JavaDoc;
47 import org.xml.sax.DTDHandler JavaDoc;
48 import org.xml.sax.EntityResolver JavaDoc;
49 import org.xml.sax.ErrorHandler JavaDoc;
50 import org.xml.sax.InputSource JavaDoc;
51 import org.xml.sax.Parser JavaDoc;
52 import org.xml.sax.SAXException JavaDoc;
53
54 import org.xml.sax.helpers.AttributeListImpl JavaDoc;
55
56 import com.quadcap.util.collections.ArrayQueue;
57
/**
 * A small, forgiving SAX 1 {@link Parser} for HTML.
 *
 * <p>The parser reads characters from the {@link InputSource}'s character
 * stream and drives a hand-written state machine that reports
 * {@code startElement}, {@code endElement} and {@code characters} events to
 * the registered {@link DocumentHandler}.  It performs no validation, no
 * entity decoding and no tag balancing.</p>
 *
 * <ul>
 *   <li>Attribute names are lower-cased; tag names are reported as written.</li>
 *   <li>Every attribute is reported with the type {@code "string"}.
 *       Attributes without a value are reported with an empty value.</li>
 *   <li>Comments and other {@code <!...>} constructs are passed through
 *       verbatim as character data; markup inside a comment does not
 *       produce element events.</li>
 *   <li>The DTD handler and entity resolver are stored but never consulted
 *       by this class.</li>
 * </ul>
 *
 * <p>Instances keep parsing state in fields and are not safe for concurrent
 * use.</p>
 *
 * @author Stan Bailes
 */
public class HtmlParser implements Parser {
    InputSource in;
    Reader r;
    DocumentHandler docHandler = null;
    DTDHandler dtdHandler = null;
    EntityResolver entityResolver = null;
    /** Scratch buffer for the token being collected (tag name, attribute name or value). */
    CharArrayWriter tag = new CharArrayWriter();
    /** Character data collected since the last event was reported. */
    CharArrayWriter data = new CharArrayWriter();
    AttributeListImpl attributes = new AttributeListImpl();
    String tagName = null;

    final static int TAG = 1;

    // State numbers.  The values of the pre-existing states are kept as they
    // were; 7, 18 and 19 are additions.
    private static final int S_TEXT = 0;              // plain character data
    private static final int S_TAG_OPEN = 1;          // seen '<'
    private static final int S_TAG_NAME = 5;          // collecting the tag name
    private static final int S_ATTR_NAME = 6;         // between / inside attribute names
    private static final int S_AFTER_ATTR_NAME = 7;   // name complete, '=' may still follow
    private static final int S_END_TAG = 8;           // seen "</"
    private static final int S_TAG_SLASH = 9;         // inside a tag, seen '/'
    private static final int S_ATTR_EQ = 10;          // seen name=
    private static final int S_VALUE_DQ = 12;         // seen name="
    private static final int S_VALUE_SQ = 121;        // seen name='
    private static final int S_VALUE_BARE = 13;       // inside an unquoted value
    private static final int S_VALUE_BARE_SLASH = 14; // unquoted value, seen '/'
    private static final int S_COMMENT = 15;          // inside <!-- ... -->
    private static final int S_COMMENT_DASH = 16;     // inside a comment, seen '-'
    private static final int S_COMMENT_DASH_DASH = 17; // inside a comment, seen "--"
    private static final int S_BANG = 18;             // seen "<!"
    private static final int S_BANG_DASH = 19;        // seen "<!-"

    public HtmlParser() {}

    /**
     * Parses the document available from the character stream of
     * {@code in} and reports it to the registered document handler.
     *
     * @param in the input; only its character stream is used
     * @throws SAXException if no document handler is set, if {@code in} has
     *         no character stream, or if the handler throws
     * @throws IOException if reading from the stream fails
     */
    public void parse(InputSource in) throws SAXException, IOException {
        this.in = in;
        this.r = in.getCharacterStream();
        tag.reset();
        data.reset();
        // A previous parse that failed mid-tag may have left attributes behind.
        attributes.clear();
        parse();
    }

    /**
     * Not implemented: this parser only reads from a character stream, so a
     * bare system identifier is ignored and no events are reported.
     *
     * @param s the system identifier (unused)
     */
    public void parse(String s) {}

    /** Registers the handler that receives the document events. */
    public void setDocumentHandler(DocumentHandler dh) {
        this.docHandler = dh;
    }

    /** Stores the DTD handler; this class never invokes it. */
    public void setDTDHandler(DTDHandler dh) {
        this.dtdHandler = dh;
    }

    /** Stores the entity resolver; this class never invokes it. */
    public void setEntityResolver(EntityResolver er) {
        this.entityResolver = er;
    }

    /** Returns the entity resolver previously stored, or {@code null}. */
    public EntityResolver getEntityResolver() {
        return entityResolver;
    }

    /** Ignored: this parser reports no errors through an error handler. */
    public void setErrorHandler(ErrorHandler er) {
    }

    /** Ignored: this parser produces no localized messages. */
    public void setLocale(java.util.Locale locale) {
    }

    /**
     * Runs the state machine over the current reader {@code r} until end of
     * input.  Pending character data is flushed and {@code endDocument()} is
     * reported when the input is exhausted, even if it ends inside a tag.
     *
     * @throws SAXException if no document handler or reader is available,
     *         or if the handler throws
     * @throws IOException if reading fails
     */
    public void parse() throws SAXException, IOException {
        if (docHandler == null) {
            throw new SAXException("HtmlParser: no DocumentHandler has been set");
        }
        if (r == null) {
            throw new SAXException("HtmlParser: input source has no character stream");
        }

        int state = S_TEXT;
        String attrName = null;
        docHandler.startDocument();

        for (int c = r.read(); c >= 0; c = r.read()) {
            switch (state) {
            case S_TEXT:
                state = text(c);
                break;

            case S_TAG_OPEN:
                if (c == '!') {
                    // Declarations and comments are passed through as text.
                    data.write('<');
                    data.write('!');
                    state = S_BANG;
                } else if (c == '/') {
                    state = S_END_TAG;
                } else {
                    tag.write(c);
                    state = S_TAG_NAME;
                }
                break;

            case S_BANG:
                if (c == '-') {
                    data.write(c);
                    state = S_BANG_DASH;
                } else {
                    // Not a comment (e.g. <!DOCTYPE ...>): ordinary text.
                    state = text(c);
                }
                break;

            case S_BANG_DASH:
                if (c == '-') {
                    data.write(c);
                    // The opening "--" may directly be followed by '>'
                    // ("<!-->"), so start in the "seen --" state.
                    state = S_COMMENT_DASH_DASH;
                } else {
                    state = text(c);
                }
                break;

            case S_COMMENT:
                data.write(c);
                if (c == '-') {
                    state = S_COMMENT_DASH;
                }
                break;

            case S_COMMENT_DASH:
                data.write(c);
                state = (c == '-') ? S_COMMENT_DASH_DASH : S_COMMENT;
                break;

            case S_COMMENT_DASH_DASH:
                data.write(c);
                if (c == '>') {
                    state = S_TEXT;
                } else if (c != '-') {
                    state = S_COMMENT;
                }
                break;

            case S_TAG_NAME:
                if (isSpace(c)) {
                    tagName = takeToken();
                    state = S_ATTR_NAME;
                } else if (c == '/') {
                    tagName = takeToken();
                    state = S_TAG_SLASH;
                } else if (c == '>') {
                    tagName = takeToken();
                    startElement();
                    state = S_TEXT;
                } else {
                    tag.write(c);
                }
                break;

            case S_ATTR_NAME:
                if (isSpace(c)) {
                    // A name is complete, but "name = value" is still possible.
                    if (tag.size() > 0) {
                        state = S_AFTER_ATTR_NAME;
                    }
                } else if (c == '/') {
                    addValuelessAttribute();
                    state = S_TAG_SLASH;
                } else if (c == '>') {
                    addValuelessAttribute();
                    startElement();
                    state = S_TEXT;
                } else if (c == '=') {
                    attrName = takeToken();
                    state = S_ATTR_EQ;
                } else {
                    tag.write(c);
                }
                break;

            case S_AFTER_ATTR_NAME:
                if (isSpace(c)) {
                    break;
                }
                if (c == '=') {
                    attrName = takeToken();
                    state = S_ATTR_EQ;
                } else if (c == '>') {
                    addValuelessAttribute();
                    startElement();
                    state = S_TEXT;
                } else if (c == '/') {
                    addValuelessAttribute();
                    state = S_TAG_SLASH;
                } else {
                    // Start of the next attribute: the previous one had no value.
                    addValuelessAttribute();
                    tag.write(c);
                    state = S_ATTR_NAME;
                }
                break;

            case S_END_TAG:
                if (c == '>') {
                    // trim() so that "</b >" reports "b".
                    tagName = takeToken().trim();
                    docHandler.endElement(tagName);
                    state = S_TEXT;
                } else {
                    tag.write(c);
                }
                break;

            case S_TAG_SLASH:
                if (c == '>') {
                    // Self-closing tag: report both events.
                    startElement();
                    docHandler.endElement(tagName);
                    state = S_TEXT;
                } else if (isSpace(c)) {
                    // Stray '/' followed by whitespace: drop it.
                    state = S_ATTR_NAME;
                } else {
                    tag.write('/');
                    tag.write(c);
                    state = S_ATTR_NAME;
                }
                break;

            case S_ATTR_EQ:
                if (c == '"') {
                    state = S_VALUE_DQ;
                } else if (c == '\'') {
                    state = S_VALUE_SQ;
                } else if (isSpace(c)) {
                    // Whitespace between '=' and the value is skipped.
                } else if (c == '>') {
                    // "name=>" : empty value, tag ends here.
                    addAttribute(attrName, "");
                    startElement();
                    state = S_TEXT;
                } else {
                    tag.write(c);
                    state = S_VALUE_BARE;
                }
                break;

            case S_VALUE_DQ:
                if (c == '"') {
                    addAttribute(attrName, takeToken());
                    state = S_ATTR_NAME;
                } else {
                    tag.write(c);
                }
                break;

            case S_VALUE_SQ:
                if (c == '\'') {
                    addAttribute(attrName, takeToken());
                    state = S_ATTR_NAME;
                } else {
                    tag.write(c);
                }
                break;

            case S_VALUE_BARE:
                if (isSpace(c)) {
                    addAttribute(attrName, takeToken());
                    state = S_ATTR_NAME;
                } else if (c == '/') {
                    // Could be part of the value or the start of "/>".
                    state = S_VALUE_BARE_SLASH;
                } else if (c == '>') {
                    addAttribute(attrName, takeToken());
                    startElement();
                    state = S_TEXT;
                } else {
                    tag.write(c);
                }
                break;

            case S_VALUE_BARE_SLASH:
                if (c == '>') {
                    // The trailing '/' is dropped from the value.  As before,
                    // only startElement is reported for this form.
                    addAttribute(attrName, takeToken());
                    startElement();
                    state = S_TEXT;
                } else if (isSpace(c)) {
                    // The '/' belonged to the value, which ends here.
                    tag.write('/');
                    addAttribute(attrName, takeToken());
                    state = S_ATTR_NAME;
                } else {
                    tag.write('/');
                    if (c != '/') {
                        tag.write(c);
                        state = S_VALUE_BARE;
                    }
                    // On another '/', stay here with that slash pending.
                }
                break;
            }
        }

        // End of input: report trailing text and close the document.
        flushText();
        docHandler.endDocument();
    }

    /** Handles one character of plain text and returns the next state. */
    private int text(int c) throws SAXException {
        if (c == '<') {
            flushText();
            return S_TAG_OPEN;
        }
        data.write(c);
        return S_TEXT;
    }

    /** Reports any buffered character data to the handler and empties the buffer. */
    private void flushText() throws SAXException {
        if (data.size() > 0) {
            docHandler.characters(data.toCharArray(), 0, data.size());
            data.reset();
        }
    }

    /** Returns the contents of the token buffer and empties it. */
    private String takeToken() {
        String token = tag.toString();
        tag.reset();
        return token;
    }

    /** Reports the start of {@code tagName} with the collected attributes, then clears them. */
    private void startElement() throws SAXException {
        docHandler.startElement(tagName, attributes);
        attributes.clear();
    }

    /** Records an attribute; the name is lower-cased independently of the default locale. */
    private void addAttribute(String name, String value) {
        attributes.addAttribute(name.toLowerCase(java.util.Locale.ENGLISH),
                                "string", value);
    }

    /** Records a pending attribute name that has no value, if there is one. */
    private void addValuelessAttribute() {
        if (tag.size() > 0) {
            addAttribute(takeToken(), "");
        }
    }

    /** Returns whether {@code c} is one of the whitespace characters recognised inside tags. */
    private static boolean isSpace(int c) {
        return c == ' ' || c == '\n' || c == '\r' || c == '\t';
    }

}
317
Popular Tags