1 package com.quadcap.http.client; 2 3 40 41 import java.io.CharArrayWriter ; 42 import java.io.IOException ; 43 import java.io.Reader ; 44 45 import org.xml.sax.AttributeList ; 46 import org.xml.sax.DocumentHandler ; 47 import org.xml.sax.DTDHandler ; 48 import org.xml.sax.EntityResolver ; 49 import org.xml.sax.ErrorHandler ; 50 import org.xml.sax.InputSource ; 51 import org.xml.sax.Parser ; 52 import org.xml.sax.SAXException ; 53 54 import org.xml.sax.helpers.AttributeListImpl ; 55 56 import com.quadcap.util.collections.ArrayQueue; 57 58 63 public class HtmlParser implements Parser { 64 InputSource in; 65 Reader r; 66 DocumentHandler docHandler = null; 67 DTDHandler dtdHandler = null; 68 EntityResolver entityResolver = null; 69 CharArrayWriter tag = new CharArrayWriter (); 70 CharArrayWriter data = new CharArrayWriter (); 71 AttributeListImpl attributes = new AttributeListImpl (); 72 String tagName = null; 73 74 final static int TAG = 1; 75 76 public HtmlParser() {} 77 78 public void parse(InputSource in) throws SAXException ,IOException { 79 this.in = in; 80 this.r = in.getCharacterStream(); 81 tag.reset(); 82 data.reset(); 83 parse(); 84 } 85 86 public void parse(String s) {} 87 88 public void setDocumentHandler(DocumentHandler dh) { 89 this.docHandler = dh; 90 } 91 92 public void setDTDHandler(DTDHandler dh) { 93 this.dtdHandler = dh; 94 } 95 96 public void setEntityResolver(EntityResolver er) { 97 this.entityResolver = er; 98 } 99 100 public EntityResolver getEntityResolver() { 101 return entityResolver; 102 } 103 104 public void setErrorHandler(ErrorHandler er) { 105 } 106 107 public void setLocale(java.util.Locale locale) { 108 } 109 110 public void parse() throws SAXException , IOException { 111 int state = 0; 112 int commentState = 0; 113 String attrName = null; 114 docHandler.startDocument(); 115 while (state >= 0) { 116 int c = r.read(); 117 if (c < 0) { 119 state = -1; 120 break; 121 } 122 switch (commentState) { 123 case 0: 124 break; 125 case 1: 126 if (c == '-') commentState = 2; 127 break; 128 case 2: 129 if (c == '-') commentState = 3; 130 else commentState = 1; 131 break; 132 case 3: 133 if (c == '>') commentState = 0; 134 else if (c != '-') commentState = 1; 135 } 136 137 switch (state) { 138 case 0: 139 if (c == '<') { 140 if (data.size() > 0) { 141 docHandler.characters(data.toCharArray(), 0, data.size()); 142 data.reset(); 143 } 144 state = 1; 145 } else { 146 data.write(c); 147 } 148 break; 149 case 1: switch (c) { 151 case '!': 152 data.write('<'); 153 data.write('!'); 154 commentState = 1; 155 state = 0; 156 break; 157 case '/': 158 state = 8; 159 break; 160 default: 161 tag.write(c); 162 state = 5; 163 break; 164 } 165 break; 166 case 5: switch (c) { 168 case ' ': 169 tagName = tag.toString(); 170 tag.reset(); 171 state = 6; 172 break; 173 case '/': 174 tagName = tag.toString(); 175 tag.reset(); 176 state = 9; 177 break; 178 case '>': 179 tagName = tag.toString(); 180 tag.reset(); 181 docHandler.startElement(tagName, attributes); 182 attributes.clear(); 183 state = 0; 184 break; 185 default: 186 tag.write(c); 187 } 188 break; 189 case 6: switch (c) { 191 case ' ': case '\n': case '\r': case '\t': 192 break; 193 case '/': 194 state = 9; 195 break; 196 case '>': 197 docHandler.startElement(tagName, attributes); 198 attributes.clear(); 199 state = 0; 200 break; 201 case '=': 202 attrName = tag.toString(); 203 tag.reset(); 204 state = 10; 205 break; 206 default: 207 tag.write(c); 208 } 209 break; 210 case 8: if (c == '>') { 212 tagName = tag.toString(); 213 tag.reset(); 214 docHandler.endElement(tagName); 215 state = 0; 216 } else { 217 tag.write(c); 218 } 219 break; 220 case 9: if (c == '>') { 222 docHandler.startElement(tagName, attributes); 223 attributes.clear(); 224 docHandler.endElement(tagName); 225 state = 0; 226 } else { 227 tag.write('/'); 228 tag.write(c); 229 state = 6; 230 } 231 break; 232 case 10: if (c == '"') { 234 state = 12; 235 } else if (c == '\'') { 236 state = 121; 237 } else { 238 tag.write(c); 239 state = 13; 240 } 241 break; 242 case 12: if (c == '"') { 244 attributes.addAttribute(attrName.toLowerCase(), "string", 245 tag.toString()); 246 tag.reset(); 247 state = 6; 248 } else { 249 tag.write(c); 250 } 251 break; 252 case 121: if (c == '\'') { 254 attributes.addAttribute(attrName.toLowerCase(), "string", 255 tag.toString()); 256 tag.reset(); 257 state = 6; 258 } else { 259 tag.write(c); 260 } 261 break; 262 case 13: switch (c) { 264 case ' ': 265 attributes.addAttribute(attrName.toLowerCase(), "string", 266 tag.toString()); 267 tag.reset(); 268 state = 6; 269 break; 270 case '/': 271 state = 14; 272 break; 273 case '>': 274 attributes.addAttribute(attrName.toLowerCase(), "string", 275 tag.toString()); 276 tag.reset(); 277 docHandler.startElement(tagName, attributes); 278 attributes.clear(); 279 state = 0; 280 break; 281 default: 282 tag.write(c); 283 } 284 break; 285 case 14: if (c == '>') { 287 attributes.addAttribute(attrName.toLowerCase(), "string", 288 tag.toString()); 289 tag.reset(); 290 docHandler.startElement(tagName, attributes); 291 attributes.clear(); 292 state = 0; 293 } else { 294 tag.write('/'); 295 if (c != '/') { 296 tag.write(c); 297 state = 13; 298 } 299 } 300 break; 301 case 15: 302 if (c == '-') state = 16; 303 break; 304 case 16: 305 if (c == '-') state = 17; 306 else state = 15; 307 break; 308 case 17: 309 if (c == '>') state = 0; 310 else if (c != '-') state = 15; 311 break; 312 } 313 } 314 } 315 316 } 317 | Popular Tags |