package it.stefanochizzolini.clown.samples;

import it.stefanochizzolini.clown.bytes.FileInputStream;
import it.stefanochizzolini.clown.documents.Document;
import it.stefanochizzolini.clown.documents.Page;
import it.stefanochizzolini.clown.documents.Pages;
import it.stefanochizzolini.clown.documents.contents.ContentStream;
import it.stefanochizzolini.clown.documents.contents.objects.Operation;
import it.stefanochizzolini.clown.documents.contents.tokens.Parser;
import it.stefanochizzolini.clown.files.File;
import it.stefanochizzolini.clown.objects.IPdfString;
import it.stefanochizzolini.clown.objects.PdfArray;
import it.stefanochizzolini.clown.objects.PdfDirectObject;
import it.stefanochizzolini.clown.tokens.FileFormatException;

import java.awt.geom.Point2D;
import java.io.IOException;
import java.io.RandomAccessFile;

/**
  Sample that prints to standard output the strings used by the text-showing
  operations found in a PDF page's content stream.
  <p>Only the first page yielded by the document's page collection is scanned,
  and only the first content stream of that page is parsed.</p>
*/
public class TextExtractionSample
  implements ISample
{
  /**
    Asks the loader for a PDF file, opens it and prints the text strings of
    its first page.

    @param loader Sample loader used to let the user choose the PDF file.
    @throws RuntimeException If the chosen file cannot be accessed or has a
      bad file format (the original exception is preserved as the cause).
  */
  public void run(
    PDFClownSampleLoader loader
    )
  {
    String filePath = loader.getPdfFileChoice("Please select a PDF file");

    // Kept in a local so the underlying OS file handle can be released when done.
    RandomAccessFile rawFile = null;
    try
    {
      File file;
      try
      {
        rawFile = new RandomAccessFile(filePath,"r");
        file = new File(
          new FileInputStream(rawFile)
          );
      }
      catch(FileFormatException e)
      {throw new RuntimeException(filePath + " file has a bad file format.",e);}
      catch(Exception e)
      {throw new RuntimeException(filePath + " file access error.",e);}

      Document document = file.getDocument();
      Pages pages = document.getPages();
      for(Page page : pages)
      {
        System.out.println("\nScanning page " + page.getIndex() + "...");
        extractText(page);

        // NOTE(review): the sample stops after the first page; this looks
        // intentional (keeps the output short) -- confirm before removing.
        break;
      }
    }
    finally
    {
      // Assumes nothing reads from the PDF file after the scan above has completed.
      closeQuietly(rawFile);
    }
  }

  /**
    Parses the first content stream of the given page and prints, one line per
    operation, the strings carried by its text-showing operations.
    <p>Parsing is best-effort: a failure stops the scan of the page and is
    reported on standard error instead of being propagated.</p>

    @param page Page whose first content stream is scanned.
  */
  private void extractText(
    Page page
    )
  {
    ContentStream content = page.getContents().get(0);
    Parser contentParser = content.getParser();
    try
    {
      while(contentParser.moveNext())
      {
        Operation operation = contentParser.parseOperation();
        if(!isTextShowingOperator(operation.getOperator()))
          continue;

        // Tracks whether anything was printed, so the line is terminated only when needed.
        boolean hit = false;
        for(PdfDirectObject operand : operation.getOperands())
        {
          if(operand instanceof IPdfString)
          {
            hit = true;
            System.out.print(((IPdfString)operand).getStringValue() + " ");
          }
          else if(operand instanceof PdfArray)
          {
            // Array operands may mix strings with other objects: only strings are printed.
            for(PdfDirectObject item : ((PdfArray)operand))
            {
              if(item instanceof IPdfString)
              {
                hit = true;
                System.out.print(((IPdfString)item).getStringValue() + " ");
              }
            }
          }
        }
        if(hit)
        {System.out.println();}
      }
    }
    catch(Exception e)
    {
      // Previously swallowed silently; still non-fatal, but now visible to the user.
      System.err.println(
        "Text extraction stopped on page " + page.getIndex() + ": " + e
        );
    }
  }

  /**
    Tells whether the given content stream operator shows text.

    @param operator Operator token of a content stream operation.
    @return true for TJ, Tj, ' and " (plus the legacy '' literal this sample
      has always tested).
  */
  private static boolean isTextShowingOperator(
    String operator
    )
  {
    return operator.equals("TJ")
      || operator.equals("Tj")
      || operator.equals("'")
      // The PDF operator is the double-quote character, not two apostrophes.
      || operator.equals("\"")
      // Legacy check kept so behaviour on already-handled input is unchanged.
      || operator.equals("''");
  }

  /**
    Closes the given file, reporting (rather than propagating) any failure.

    @param stream File to close; may be null when opening failed.
  */
  private static void closeQuietly(
    RandomAccessFile stream
    )
  {
    if(stream == null)
      return;

    try
    {stream.close();}
    catch(IOException e)
    {System.err.println("Unable to close the PDF file: " + e);}
  }
}