ExtractText


1   import au.id.jericho.lib.html.*;
2   import java.util.*;
3   import java.io.*;
4   import java.net.*;
5   
6   public class ExtractText {
7       public static void main(String  [] args) throws Exception   {
8           String   sourceUrlString="data/test.html";
9           if (args.length==0)
10            System.err.println("Using default argument of \""+sourceUrlString+'"');
11          else
12              sourceUrlString=args[0];
13          if (sourceUrlString.indexOf(':')==-1) sourceUrlString="file:"+sourceUrlString;
14          Source source=new Source(new URL(sourceUrlString));
15          source.setLogWriter(new OutputStreamWriter(System.err)); // send log messages to stderr
16  
17          source.fullSequentialParse();
18  
19          System.out.println("Document title:");
20          String   title=getTitle(source);
21          System.out.println(title==null ? "(none)" : title);
22  
23          System.out.println("\nDocument description:");
24          String   description=getMetaValue(source,"description");
25          System.out.println(description==null ? "(none)" : description);
26  
27          System.out.println("\nDocument keywords:");
28          String   keywords=getMetaValue(source,"keywords");
29          System.out.println(keywords==null ? "(none)" : keywords);
30      
31          System.out.println("\nLinks to other documents:");
32          List linkElements=source.findAllElements(HTMLElementName.A);
33          for (Iterator i=linkElements.iterator(); i.hasNext();) {
34              Element linkElement=(Element)i.next();
35              String   HREF=linkElement.getAttributeValue("href");
36              if (href==null) continue;
37              // A element can contain other tags so need to extract the text from it:
38              String   label=linkElement.getContent().extractText();
39              System.out.println(href+" ("+label+")");
40          } 
41  
42          System.out.println("\nAll text from BODY (exluding content inside SCRIPT and STYLE elements):");
43          Element bodyElement=source.findNextElement(0,HTMLElementName.BODY);
44          Segment contentSegment=(bodyElement==null) ? source : bodyElement.getContent();
45          System.out.println(contentSegment.extractText(true));
46    }
47  
48      private static String   getTitle(Source source) {
49          Element titleElement=source.findNextElement(0,HTMLElementName.TITLE);
50          if (titleElement==null) return null;
51          // TITLE element never contains other tags so just decode it collapsing whitespace:
52          return CharacterReference.decodeCollapseWhiteSpace(titleElement.getContent());
53      }
54  
55      private static String   getMetaValue(Source source, String   key) {
56          for (int pos=0; pos<source.length();) {
57              StartTag startTag=source.findNextStartTag(pos,"name",key,false);
58              if (startTag==null) return null;
59              if (startTag.getName()==HTMLElementName.META)
60                  return startTag.getAttributeValue("content"); // Attribute values are automatically decoded
61              pos=startTag.getEnd();
62          }
63          return null;
64      }
65  }
66
A to Z: JavaDoc & Examples Daily Java News & Articles Open Source Projects Open Source Codes Free Computer Books Remove Frame
Popular Tags