1 31 package org.pdfbox; 32 33 import java.io.File ; 34 import java.io.FileOutputStream ; 35 import java.io.IOException ; 36 import java.io.OutputStreamWriter ; 37 import java.io.Writer ; 38 import java.net.MalformedURLException ; 39 import java.net.URL ; 40 41 import org.pdfbox.pdmodel.PDDocument; 42 import org.pdfbox.pdmodel.encryption.AccessPermission; 43 import org.pdfbox.pdmodel.encryption.StandardDecryptionMaterial; 44 import org.pdfbox.util.PDFText2HTML; 45 import org.pdfbox.util.PDFTextStripper; 46 47 54 public class ExtractText 55 { 56 59 public static final String DEFAULT_ENCODING = 60 null; 61 69 70 private static final String PASSWORD = "-password"; 71 private static final String ENCODING = "-encoding"; 72 private static final String CONSOLE = "-console"; 73 private static final String START_PAGE = "-startPage"; 74 private static final String END_PAGE = "-endPage"; 75 private static final String SORT = "-sort"; 76 private static final String HTML = "-html"; 78 81 private ExtractText() 82 { 83 } 85 86 93 public static void main( String [] args ) throws Exception 94 { 95 boolean toConsole = false; 96 boolean toHTML = false; 97 boolean sort = false; 98 String password = ""; 99 String encoding = DEFAULT_ENCODING; 100 String pdfFile = null; 101 String textFile = null; 102 int startPage = 1; 103 int endPage = Integer.MAX_VALUE; 104 for( int i=0; i<args.length; i++ ) 105 { 106 if( args[i].equals( PASSWORD ) ) 107 { 108 i++; 109 if( i >= args.length ) 110 { 111 usage(); 112 } 113 password = args[i]; 114 } 115 else if( args[i].equals( ENCODING ) ) 116 { 117 i++; 118 if( i >= args.length ) 119 { 120 usage(); 121 } 122 encoding = args[i]; 123 } 124 else if( args[i].equals( START_PAGE ) ) 125 { 126 i++; 127 if( i >= args.length ) 128 { 129 usage(); 130 } 131 startPage = Integer.parseInt( args[i] ); 132 } 133 else if( args[i].equals( HTML ) ) 134 { 135 toHTML = true; 136 } 137 else if( args[i].equals( SORT ) ) 138 { 139 sort = true; 140 } 141 else if( args[i].equals( END_PAGE ) ) 142 { 143 i++; 144 if( i >= args.length ) 145 { 146 usage(); 147 } 148 endPage = Integer.parseInt( args[i] ); 149 } 150 else if( args[i].equals( CONSOLE ) ) 151 { 152 toConsole = true; 153 } 154 else 155 { 156 if( pdfFile == null ) 157 { 158 pdfFile = args[i]; 159 } 160 else 161 { 162 textFile = args[i]; 163 } 164 } 165 } 166 167 if( pdfFile == null ) 168 { 169 usage(); 170 } 171 else 172 { 173 174 Writer output = null; 175 PDDocument document = null; 176 try 177 { 178 try 179 { 180 URL url = new URL ( pdfFile ); 183 document = PDDocument.load( url ); 184 String fileName = url.getFile(); 185 if( textFile == null && fileName.length() >4 ) 186 { 187 File outputFile = 188 new File ( fileName.substring( 0, fileName.length() -4 ) + ".txt" ); 189 textFile = outputFile.getName(); 190 } 191 } 192 catch( MalformedURLException e ) 193 { 194 document = PDDocument.load( pdfFile ); 195 if( textFile == null && pdfFile.length() >4 ) 196 { 197 textFile = pdfFile.substring( 0, pdfFile.length() -4 ) + ".txt"; 198 } 199 } 200 201 if( document.isEncrypted() ) 203 { 204 StandardDecryptionMaterial sdm = new StandardDecryptionMaterial( password ); 205 document.openProtection( sdm ); 206 AccessPermission ap = document.getCurrentAccessPermission(); 207 208 if( ! ap.canExtractContent() ) 209 { 210 throw new IOException ( "You do not have permission to extract text" ); 211 } 212 } 213 if( toConsole ) 214 { 215 output = new OutputStreamWriter ( System.out ); 216 } 217 else 218 { 219 if( encoding != null ) 220 { 221 output = new OutputStreamWriter ( 222 new FileOutputStream ( textFile ), encoding ); 223 } 224 else 225 { 226 output = new OutputStreamWriter ( 228 new FileOutputStream ( textFile ) ); 229 } 230 } 231 232 PDFTextStripper stripper = null; 233 if(toHTML) 234 { 235 stripper = new PDFText2HTML(); 236 } 237 else 238 { 239 stripper = new PDFTextStripper(); 240 } 241 stripper.setSortByPosition( sort ); 242 stripper.setStartPage( startPage ); 243 stripper.setEndPage( endPage ); 244 stripper.writeText( document, output ); 245 } 246 finally 247 { 248 if( output != null ) 249 { 250 output.close(); 251 } 252 if( document != null ) 253 { 254 document.close(); 255 } 256 } 257 } 258 } 259 260 263 private static void usage() 264 { 265 System.err.println( "Usage: java org.pdfbox.ExtractText [OPTIONS] <PDF file> [Text File]\n" + 266 " -password <password> Password to decrypt document\n" + 267 " -encoding <output encoding> (ISO-8859-1,UTF-16BE,UTF-16LE,...)\n" + 268 " -console Send text to console instead of file\n" + 269 " -html Output in HTML format instead of raw text\n" + 270 " -sort Sort the text before writing\n" + 271 " -startPage <number> The first page to start extraction(1 based)\n" + 272 " -endPage <number> The last page to extract(inclusive)\n" + 273 " <PDF file> The PDF document to use\n" + 274 " [Text File] The file to write the text to\n" 275 ); 276 System.exit( 1 ); 277 } 278 } | Popular Tags |