KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > it > stefanochizzolini > clown > samples > TextExtractionSample


package it.stefanochizzolini.clown.samples;

import it.stefanochizzolini.clown.bytes.FileInputStream;
import it.stefanochizzolini.clown.documents.Document;
import it.stefanochizzolini.clown.documents.Page;
import it.stefanochizzolini.clown.documents.Pages;
import it.stefanochizzolini.clown.documents.contents.ContentStream;
import it.stefanochizzolini.clown.documents.contents.objects.Operation;
import it.stefanochizzolini.clown.documents.contents.tokens.Parser;
import it.stefanochizzolini.clown.files.File;
import it.stefanochizzolini.clown.objects.IPdfString;
import it.stefanochizzolini.clown.objects.PdfArray;
import it.stefanochizzolini.clown.objects.PdfDirectObject;
import it.stefanochizzolini.clown.tokens.FileFormatException;

import java.awt.geom.Point2D;
import java.io.IOException;
import java.io.RandomAccessFile;
18
19 /**
20   This sample is a rough stub that demonstrates a basic way to extract text from
21   a document.
22   <h3>Remarks</h3>
23   <p>This implementation is definitely simplistic: its purpose is NOT to provide
24   a real-life solution for PDF text mining; it lacks advanced features such as glyph
25   position detection, dehyphenation, page-breaks handling and so on. Its purpose is
26   to test the new content-stream parsing functionality.</p>
27   <p>So, read my lips: this-is-just-a-toy (for now!).</p>
28 */

29 public class TextExtractionSample
30   implements ISample
31 {
32   public void run(
33     PDFClownSampleLoader loader
34     )
35   {
36     /*
37       NOTE: This procedure is made up of this sequence of actions:
38       1. User choice.
39       2. Document editing (core).
40       3. Serialization.
41     */

42
43     // 1. User choice.
44
String JavaDoc filePath = loader.getPdfFileChoice("Please select a PDF file");
45
46     // 2. Document editing.
47
File file;
48     try
49     {
50       // Open the PDF file!
51
file = new File(
52         new FileInputStream(
53           new RandomAccessFile JavaDoc(filePath,"r")
54           )
55         );
56     }
57     catch(FileFormatException e)
58     {throw new RuntimeException JavaDoc(filePath + " file has a bad file format.",e);}
59     catch(Exception JavaDoc e)
60     {throw new RuntimeException JavaDoc(filePath + " file access error.",e);}
61
62     // Get the PDF document!
63
Document document = file.getDocument();
64     // Get the page collection!
65
Pages pages = document.getPages();
66 //TODO:IMPL see PDF:1.6:5.9
67
for(Page page : pages)
68     {
69       System.out.println("\nScanning page " + page.getIndex() + "...");
70       // Get page's contents!
71
ContentStream content = page.getContents().get(0);
72       Parser contentParser = content.getParser();
73       try
74       {
75         while(contentParser.moveNext())
76         {
77           boolean hit = false;
78           Operation operation = contentParser.parseOperation();
79           String JavaDoc operator = operation.getOperator();
80           if(operator.equals("TJ")
81             || operator.equals("Tj")
82             || operator.equals("'")
83             || operator.equals("''")
84             )
85           {
86             for(PdfDirectObject operand : operation.getOperands())
87             {
88               if(operand instanceof IPdfString)
89               {
90                 hit = true;
91                 System.out.print(((IPdfString)operand).getStringValue() + " ");
92               }
93               else if(operand instanceof PdfArray)
94               {
95                 for(PdfDirectObject item : ((PdfArray)operand))
96                 {
97                   if(item instanceof IPdfString)
98                   {
99                     hit = true;
100                     System.out.print(((IPdfString)item).getStringValue() + " ");
101                   }
102                 }
103               }
104             }
105             if(hit)
106             {System.out.println();}
107           }
108         }
109       }
110       catch(Exception JavaDoc e)
111       {/* Ignore! */}
112
113       break;
114     }
115   }
116 }
Popular Tags