1 import au.id.jericho.lib.html.*; 2 import java.util.*; 3 import java.io.*; 4 import java.net.*; 5 6 public class ExtractText { 7 public static void main(String [] args) throws Exception { 8 String sourceUrlString="data/test.html"; 9 if (args.length==0) 10 System.err.println("Using default argument of \""+sourceUrlString+'"'); 11 else 12 sourceUrlString=args[0]; 13 if (sourceUrlString.indexOf(':')==-1) sourceUrlString="file:"+sourceUrlString; 14 Source source=new Source(new URL(sourceUrlString)); 15 source.setLogWriter(new OutputStreamWriter(System.err)); 17 source.fullSequentialParse(); 18 19 System.out.println("Document title:"); 20 String title=getTitle(source); 21 System.out.println(title==null ? "(none)" : title); 22 23 System.out.println("\nDocument description:"); 24 String description=getMetaValue(source,"description"); 25 System.out.println(description==null ? "(none)" : description); 26 27 System.out.println("\nDocument keywords:"); 28 String keywords=getMetaValue(source,"keywords"); 29 System.out.println(keywords==null ? "(none)" : keywords); 30 31 System.out.println("\nLinks to other documents:"); 32 List linkElements=source.findAllElements(HTMLElementName.A); 33 for (Iterator i=linkElements.iterator(); i.hasNext();) { 34 Element linkElement=(Element)i.next(); 35 String HREF=linkElement.getAttributeValue("href"); 36 if (href==null) continue; 37 String label=linkElement.getContent().extractText(); 39 System.out.println(href+" ("+label+")"); 40 } 41 42 System.out.println("\nAll text from BODY (exluding content inside SCRIPT and STYLE elements):"); 43 Element bodyElement=source.findNextElement(0,HTMLElementName.BODY); 44 Segment contentSegment=(bodyElement==null) ? source : bodyElement.getContent(); 45 System.out.println(contentSegment.extractText(true)); 46 } 47 48 private static String getTitle(Source source) { 49 Element titleElement=source.findNextElement(0,HTMLElementName.TITLE); 50 if (titleElement==null) return null; 51 return CharacterReference.decodeCollapseWhiteSpace(titleElement.getContent()); 53 } 54 55 private static String getMetaValue(Source source, String key) { 56 for (int pos=0; pos<source.length();) { 57 StartTag startTag=source.findNextStartTag(pos,"name",key,false); 58 if (startTag==null) return null; 59 if (startTag.getName()==HTMLElementName.META) 60 return startTag.getAttributeValue("content"); pos=startTag.getEnd(); 62 } 63 return null; 64 } 65 } 66 | Popular Tags |