package bplatt.spider;

import java.io.*;
import java.util.*;

/**
 * Collects the link targets, image sources and title text of an HTML page.
 *
 * <p>The parsing itself is done by the {@link SimpleHTMLParser} superclass; this
 * class only reacts to the tokens handed to its <code>process*</code> methods
 * (presumably callbacks invoked by the superclass while parsing -- confirm
 * against <code>SimpleHTMLParser</code>).
 *
 * <p>All three result lists hold <code>String</code> elements. They are kept as
 * raw <code>ArrayList</code>s so that the public getter signatures stay exactly
 * as existing callers expect them.
 */
public class WebPageXtractor extends SimpleHTMLParser {
    /** Characters treated as whitespace inside a tag. */
    private static final String WHITESPACE = "\t\r\f\n ";

    /** <code>href</code> values of "a" tags and <code>src</code> values of "frame" tags. */
    private ArrayList links;
    /** <code>src</code> values of "img" tags. */
    private ArrayList images;
    /** Non-blank text fragments seen between a "title" start tag and its end tag. */
    private ArrayList title;
    /** True while the most recent "title" start tag has not yet been closed. */
    private boolean inTitle;

    /** Creates an extractor with empty link, image and title lists. */
    public WebPageXtractor() {
        super();
        links = new ArrayList();
        images = new ArrayList();
        title = new ArrayList();
    }

    /**
     * Records a piece of page text when it occurs inside the title element.
     *
     * @param token token whose content is the text to inspect; a token with
     *              <code>null</code> or blank content is ignored
     */
    public void processContent(SimpleHTMLToken token) {
        String content = token.getContent();
        // Check for null BEFORE trimming; the original trimmed first, which made
        // its null check unreachable and threw NullPointerException instead.
        if (content == null) {
            return;
        }
        String text = content.trim();
        if (inTitle && text.length() != 0) {
            title.add(text);
        }
    }

    /**
     * Handles an end tag; a closing "title" tag ends title-text collection.
     *
     * @param token the end-tag token
     * @throws IOException if <code>SimpleHTMLParser.getTagType</code> cannot
     *                     determine the tag type (returns <code>null</code>)
     */
    public void processEndTag(SimpleHTMLToken token) throws IOException {
        // NOTE(review): the 'true' flag presumably requests a lower-cased tag
        // name, since the comparisons below are case-sensitive -- confirm.
        String tag = SimpleHTMLParser.getTagType(token, true);
        if (tag == null) {
            throw new IOException("HTML parsing error");
        }
        if (tag.equals("title")) {
            inTitle = false;
        }
    }

    /**
     * Handles a start tag: records the <code>href</code> of "a" tags and the
     * <code>src</code> of "frame" tags as links, the <code>src</code> of "img"
     * tags as images, and starts title-text collection on a "title" tag.
     * Tags lacking the relevant attribute (or with an empty value) add nothing.
     *
     * @param token the start-tag token
     * @throws IOException if <code>SimpleHTMLParser.getTagType</code> cannot
     *                     determine the tag type (returns <code>null</code>)
     */
    public void processTag(SimpleHTMLToken token) throws IOException {
        String tag = SimpleHTMLParser.getTagType(token, true);
        if (tag == null) {
            throw new IOException("HTML parsing error");
        }
        if (tag.equals("a")) {
            String link = extractHref(token.getContent());
            if (link != null) {
                links.add(link);
            }
        } else if (tag.equals("img")) {
            String image = extractSrc(token.getContent());
            if (image != null) {
                images.add(image);
            }
        } else if (tag.equals("frame")) {
            // A frame's source is another page, so it is recorded as a link.
            String link = extractSrc(token.getContent());
            if (link != null) {
                links.add(link);
            }
        } else if (tag.equals("title")) {
            inTitle = true;
        }
    }

    /**
     * Returns the value of the <code>href</code> attribute in the given tag
     * text, or <code>null</code> if there is none.
     */
    private String extractHref(String tag) {
        return extractAttribute(tag, "href");
    }

    /**
     * Returns the value of the <code>src</code> attribute in the given tag
     * text, or <code>null</code> if there is none.
     */
    private String extractSrc(String tag) {
        return extractAttribute(tag, "src");
    }

    /**
     * Scans tag text for the first <code>name=value</code> pair whose name
     * matches <code>name</code> (ignoring case) and returns its value.
     *
     * <p>A value may be enclosed in single or double quotes, in which case it
     * runs to the matching quote (or to the end of the text when the quote is
     * never closed) and may contain whitespace and '='. An unquoted value runs
     * to the next whitespace character. This replaces the earlier tokenizer
     * approach, which split values at every '=' or space -- truncating URLs
     * with query strings -- and could mistake the attribute name appearing
     * inside another attribute's value for the attribute itself.
     *
     * @param tag  the tag text to scan; may be <code>null</code>
     * @param name the attribute name to look for
     * @return the trimmed attribute value, or <code>null</code> when the
     *         attribute is absent, has no '=' part, or has an empty value
     */
    private static String extractAttribute(String tag, String name) {
        if (tag == null) {
            return null;
        }
        int len = tag.length();
        int i = 0;
        while (i < len) {
            // Skip whitespace and stray quotes/'=' so a name always starts on a
            // real character (this also guarantees the loop makes progress).
            while (i < len && isSeparator(tag.charAt(i))) {
                i++;
            }
            int nameStart = i;
            while (i < len && !isWhitespace(tag.charAt(i)) && tag.charAt(i) != '=') {
                i++;
            }
            String attr = tag.substring(nameStart, i);

            while (i < len && isWhitespace(tag.charAt(i))) {
                i++;
            }
            if (i >= len || tag.charAt(i) != '=') {
                // Bare word: the tag name itself or a value-less attribute.
                continue;
            }
            i++; // consume '='
            while (i < len && isWhitespace(tag.charAt(i))) {
                i++;
            }

            String value;
            if (i < len && (tag.charAt(i) == '"' || tag.charAt(i) == '\'')) {
                char quote = tag.charAt(i);
                i++;
                int end = tag.indexOf(quote, i);
                if (end < 0) {
                    end = len; // unterminated quote: take the rest of the text
                }
                value = tag.substring(i, end);
                i = Math.min(end + 1, len);
            } else {
                int valueStart = i;
                while (i < len && !isWhitespace(tag.charAt(i))) {
                    i++;
                }
                value = tag.substring(valueStart, i);
            }

            if (attr.equalsIgnoreCase(name)) {
                value = value.trim();
                return value.length() == 0 ? null : value;
            }
        }
        return null;
    }

    /** Returns true if <code>c</code> is one of the tag whitespace characters. */
    private static boolean isWhitespace(char c) {
        return WHITESPACE.indexOf(c) >= 0;
    }

    /** Returns true if <code>c</code> can never begin an attribute name. */
    private static boolean isSeparator(char c) {
        return isWhitespace(c) || c == '=' || c == '"' || c == '\'';
    }

    /**
     * Returns the image sources collected so far.
     *
     * @return the internal (live, mutable) list of <code>String</code> values
     */
    public ArrayList getImages() {
        return images;
    }

    /**
     * Returns the link targets collected so far.
     *
     * @return the internal (live, mutable) list of <code>String</code> values
     */
    public ArrayList getLinks() {
        return links;
    }

    /**
     * Returns the title text fragments collected so far.
     *
     * @return the internal (live, mutable) list of <code>String</code> values
     */
    public ArrayList getTitle() {
        return title;
    }
}