1 package bplatt.spider; 2 3 22 23 import java.io.*; 24 import java.util.*; 25 import java.net.*; 26 27 public abstract class SimpleHTMLParser { 28 29 32 public SimpleHTMLParser() { } 33 34 38 public void parse(Reader r) throws IOException 39 { 40 char buf[] = new char[10]; 41 42 BufferedReader in = new BufferedReader(r); 43 44 while(true) { 46 read(in,buf,1); 47 if (buf[0] == '<') break; 48 } 49 50 int readahead; 52 while(true) { 53 readahead = 3; 55 in.mark(readahead); 56 read(in,buf,readahead); 57 if (buf[0] == '!' && buf[1] == '-' && buf[2] == '-') handle_comment(in); 58 else if (buf[0] == '/') { 59 in.reset(); 60 read(in,buf,1); 61 handle_tag(SimpleHTMLToken.ENDTAG,in); 62 } 63 else { 64 in.reset(); 65 handle_tag(SimpleHTMLToken.TAG,in); 66 } 67 68 readahead = 1; 70 in.mark(readahead); 71 try { read(in,buf,readahead); } 72 catch(SocketTimeoutException e) { throw(e); } catch(EOFException e) { return; } catch(IOException e) { throw(e); } if (buf[0] != '<') { 76 in.reset(); 77 if (handle_content(in) == false) return; } 79 } 80 } 81 82 private void handle_tag(int type, BufferedReader in) throws IOException 84 { 85 char buf[] = new char[10]; 86 StringBuffer guts = new StringBuffer (); 87 while(true) { 88 read(in,buf,1); 89 if (buf[0] == '>') break; 90 guts.append(buf[0]); 91 } 92 SimpleHTMLToken token = new SimpleHTMLToken(type,guts.toString()); 93 if (type == SimpleHTMLToken.TAG) processTag(token); 94 else processEndTag(token); 95 } 96 97 private void handle_comment(BufferedReader in) throws IOException 99 { 100 char buf[] = new char[10]; 101 while(true) { 102 read(in,buf,1); 103 if (buf[0] == '-') { 104 int readahead = 2; 105 in.mark(readahead); 106 read(in,buf,readahead); 107 if (buf[0] == '-' && buf[1] == '>') return; 108 else in.reset(); 109 } 110 } 111 112 } 113 114 private boolean handle_content(BufferedReader in) throws IOException 116 { 117 char buf[] = new char[10]; 118 StringBuffer guts = new StringBuffer (); 119 while(true) { 120 try { read(in,buf,1); } 121 catch(SocketTimeoutException e) { throw(e); } catch(EOFException e) { return(false); } catch(IOException e) { throw(e); } if (buf[0] == '<') break; 125 else guts.append(buf[0]); 126 } 127 SimpleHTMLToken token = new SimpleHTMLToken(SimpleHTMLToken.CONTENT,guts.toString()); 128 processContent(token); 129 return(true); 130 } 131 132 133 public abstract void processTag(SimpleHTMLToken token) throws IOException; 134 135 136 public abstract void processEndTag(SimpleHTMLToken token) throws IOException; 137 138 139 public abstract void processContent(SimpleHTMLToken token) throws IOException; 140 141 144 public static String getTagType(SimpleHTMLToken token, boolean lowerCaseFlag) { 145 if (token.getType() == SimpleHTMLToken.CONTENT) return(null); 146 String content = token.getContent(); 147 if (content == null || content.length() == 0) return(null); 148 StringTokenizer tt = new StringTokenizer(content); 149 String tag = null; 150 try { tag = tt.nextToken(); } 151 catch(NoSuchElementException e) { return(null); } 152 return((lowerCaseFlag?tag.toLowerCase():tag)); 153 } 154 155 private void read(BufferedReader r,char[] buf,int nchars) throws IOException 157 { 158 int flag = 10; 159 int charsToRead = nchars; 160 161 while (charsToRead != 0 && flag != 0) { 162 int charsRead = r.read(buf,0,nchars); 163 if (charsRead == -1) throw new EOFException("Premature EOF while parsing HTML"); 164 charsToRead = charsToRead - charsRead; 165 flag--; 166 if (flag<=5) { 167 Thread mythread = Thread.currentThread(); 169 try { mythread.sleep(1000,0); } 170 catch(InterruptedException e) { } 171 } 172 } 173 if (flag == 0) throw new SocketTimeoutException("Input timed-out while parsing HTML"); 174 } 175 } 176 | Popular Tags |