KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > ExtractText


import au.id.jericho.lib.html.*;
import java.util.*;
import java.io.*;
import java.net.*;

6 public class ExtractText {
7     public static void main(String JavaDoc[] args) throws Exception JavaDoc {
8         String JavaDoc sourceUrlString="data/test.html";
9         if (args.length==0)
10           System.err.println("Using default argument of \""+sourceUrlString+'"');
11         else
12             sourceUrlString=args[0];
13         if (sourceUrlString.indexOf(':')==-1) sourceUrlString="file:"+sourceUrlString;
14         Source source=new Source(new URL(sourceUrlString));
15         source.setLogWriter(new OutputStreamWriter(System.err)); // send log messages to stderr
16

17         source.fullSequentialParse();
18
19         System.out.println("Document title:");
20         String JavaDoc title=getTitle(source);
21         System.out.println(title==null ? "(none)" : title);
22
23         System.out.println("\nDocument description:");
24         String JavaDoc description=getMetaValue(source,"description");
25         System.out.println(description==null ? "(none)" : description);
26
27         System.out.println("\nDocument keywords:");
28         String JavaDoc keywords=getMetaValue(source,"keywords");
29         System.out.println(keywords==null ? "(none)" : keywords);
30     
31         System.out.println("\nLinks to other documents:");
32         List linkElements=source.findAllElements(HTMLElementName.A);
33         for (Iterator i=linkElements.iterator(); i.hasNext();) {
34             Element linkElement=(Element)i.next();
35             String JavaDoc HREF=linkElement.getAttributeValue("href");
36             if (href==null) continue;
37             // A element can contain other tags so need to extract the text from it:
38
String JavaDoc label=linkElement.getContent().extractText();
39             System.out.println(href+" ("+label+")");
40         }
41
42         System.out.println("\nAll text from BODY (exluding content inside SCRIPT and STYLE elements):");
43         Element bodyElement=source.findNextElement(0,HTMLElementName.BODY);
44         Segment contentSegment=(bodyElement==null) ? source : bodyElement.getContent();
45         System.out.println(contentSegment.extractText(true));
46   }
47
48     private static String JavaDoc getTitle(Source source) {
49         Element titleElement=source.findNextElement(0,HTMLElementName.TITLE);
50         if (titleElement==null) return null;
51         // TITLE element never contains other tags so just decode it collapsing whitespace:
52
return CharacterReference.decodeCollapseWhiteSpace(titleElement.getContent());
53     }
54
55     private static String JavaDoc getMetaValue(Source source, String JavaDoc key) {
56         for (int pos=0; pos<source.length();) {
57             StartTag startTag=source.findNextStartTag(pos,"name",key,false);
58             if (startTag==null) return null;
59             if (startTag.getName()==HTMLElementName.META)
60                 return startTag.getAttributeValue("content"); // Attribute values are automatically decoded
61
pos=startTag.getEnd();
62         }
63         return null;
64     }
65 }
66
Popular Tags