package org.pdfbox.util;

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.pdfbox.pdmodel.PDDocument;
import org.pdfbox.pdmodel.PDPage;

/**
 * Text stripper that, instead of emitting the document text, emits an XML
 * highlight description listing where a set of search terms occur.
 *
 * For every page the extracted text is buffered, each search term is applied to
 * it as a case-insensitive regular expression, and one
 * <code>&lt;loc pg=... pos=... len=...&gt;</code> element is written per match.
 *
 * Instances keep per-run state in fields, so a single instance must not be used
 * for two overlapping calls to <code>generateXMLHighlight</code>.
 */
public class PDFHighlighter extends PDFTextStripper
{
    /** Charset used both to buffer the page text and to decode it again. */
    private static final String TEXT_ENCODING = "UTF-16";

    /**
     * Sequences of the letter 'a' followed by one to three digits; they are
     * replaced by a single '.' before searching.
     * NOTE(review): presumably these are glyph names leaking into the text for
     * special characters such as bullets - confirm; the pattern also matches
     * ordinary text like "a1".
     */
    private static final Pattern GLYPH_NAME_PATTERN = Pattern.compile( "a[0-9]{1,3}" );

    /** Destination of the XML highlight description. */
    private Writer highlighterOutput = null;

    /** Search terms compiled once per run, in the order supplied by the caller. */
    private Pattern[] searchPatterns;

    /** In-memory buffer receiving the text of the page currently being stripped. */
    private ByteArrayOutputStream textOS = null;

    /** Writer handed to the stripper; encodes into <code>textOS</code>. */
    private Writer textWriter = null;

    /**
     * Creates a highlighter whose extracted text contains no line, page or word
     * separators, with bead separation and duplicate-overlapping-text
     * suppression both switched off.
     *
     * @throws IOException If the superclass constructor fails.
     */
    public PDFHighlighter() throws IOException
    {
        super();
        // Explicit super. calls: use the parent implementations even if a
        // subclass overrides these setters.
        super.setLineSeparator( "" );
        super.setPageSeparator( "" );
        super.setWordSeparator( "" );
        super.setShouldSeparateByBeads( false );
        super.setSuppressDuplicateOverlappingText( false );
    }

    /**
     * Generates the XML highlight description for a single search term.
     *
     * @param pdDocument The document to search.
     * @param highlightWord The term to locate; interpreted as a regular expression.
     * @param xmlOutput The writer receiving the XML.
     *
     * @throws IOException If reading the document or writing the output fails.
     */
    public void generateXMLHighlight(PDDocument pdDocument, String highlightWord, Writer xmlOutput ) throws IOException
    {
        generateXMLHighlight( pdDocument, new String[] { highlightWord }, xmlOutput );
    }

    /**
     * Generates the XML highlight description for several search terms.
     *
     * The terms are compiled before anything is written, so a malformed
     * expression or a <code>null</code> term is reported without leaving a
     * partial document in <code>xmlOutput</code>.
     *
     * @param pdDocument The document to search.
     * @param sWords The terms to locate; each is interpreted as a
     *               case-insensitive regular expression.
     * @param xmlOutput The writer receiving the XML; it is flushed but not closed.
     *
     * @throws IOException If reading the document or writing the output fails.
     * @throws NullPointerException If <code>sWords</code> or one of its elements is null.
     * @throws java.util.regex.PatternSyntaxException If a term is not a valid regular expression.
     */
    public void generateXMLHighlight(PDDocument pdDocument, String[] sWords, Writer xmlOutput ) throws IOException
    {
        searchPatterns = compilePatterns( sWords );
        highlighterOutput = xmlOutput;

        highlighterOutput.write("<XML>\n<Body units=characters " +
                                " version=2>\n<Highlight>\n");

        // The stripper writes the page text here; endPage() drains the buffer
        // after every page and emits the matches found in it.
        textOS = new ByteArrayOutputStream();
        textWriter = new OutputStreamWriter( textOS, TEXT_ENCODING );
        writeText( pdDocument, textWriter );

        highlighterOutput.write("</Highlight>\n</Body>\n</XML>");
        highlighterOutput.flush();
    }

    /**
     * Searches the text buffered for the page that just ended and writes one
     * <code>loc</code> element per match.
     *
     * Positions and lengths are character offsets into the page text after the
     * glyph-name replacement has been applied. The page number written is
     * <code>getCurrentPageNo() - 1</code>.
     *
     * @param pdPage The page that has just been processed (not inspected here).
     *
     * @throws IOException If the buffered text cannot be read or the output cannot be written.
     */
    protected void endPage( PDPage pdPage ) throws IOException
    {
        textWriter.flush();

        String page = new String( textOS.toByteArray(), TEXT_ENCODING );
        // Start the next page with an empty buffer.
        textOS.reset();

        // Cheap pre-check so pages without any 'a' skip the regex pass.
        if( page.indexOf( "a" ) != -1 )
        {
            page = GLYPH_NAME_PATTERN.matcher( page ).replaceAll( "." );
        }

        int pageIndex = getCurrentPageNo() - 1;
        for( int i = 0; i < searchPatterns.length; i++ )
        {
            Matcher matcher = searchPatterns[i].matcher( page );
            while( matcher.find() )
            {
                int begin = matcher.start();
                int length = matcher.end() - begin;
                highlighterOutput.write(" <loc " +
                                        "pg=" + pageIndex
                                        + " pos=" + begin
                                        + " len=" + length
                                        + ">\n");
            }
        }
    }

    /**
     * Compiles every search term once as a case-insensitive pattern.
     *
     * @param words The raw search terms.
     * @return One compiled pattern per term, in the same order.
     */
    private static Pattern[] compilePatterns( String[] words )
    {
        if( words == null )
        {
            throw new NullPointerException( "sWords" );
        }
        Pattern[] compiled = new Pattern[ words.length ];
        for( int i = 0; i < words.length; i++ )
        {
            compiled[i] = Pattern.compile( words[i], Pattern.CASE_INSENSITIVE );
        }
        return compiled;
    }

    /**
     * Command line entry point: writes the highlight XML for a PDF to standard
     * output.
     *
     * @param args The PDF file followed by one or more search terms.
     *
     * @throws IOException If the document cannot be loaded or processed.
     */
    public static void main(String[] args) throws IOException
    {
        // Validate the command line before doing any real work.
        if( args.length < 2 )
        {
            usage();
        }

        String[] highlightStrings = new String[ args.length - 1 ];
        System.arraycopy( args, 1, highlightStrings, 0, highlightStrings.length );

        PDFHighlighter xmlExtractor = new PDFHighlighter();
        PDDocument doc = null;
        try
        {
            doc = PDDocument.load( args[0] );

            xmlExtractor.generateXMLHighlight(
                doc,
                highlightStrings,
                new OutputStreamWriter( System.out ) );
        }
        finally
        {
            if( doc != null )
            {
                doc.close();
            }
        }
    }

    /**
     * Prints the command line synopsis to standard error and terminates the
     * JVM with exit status 1.
     */
    private static void usage()
    {
        System.err.println( "usage: java " + PDFHighlighter.class.getName() + " <pdf file> word1 word2 word3 ..." );
        System.exit( 1 );
    }
}