ExtractText


1   /**
2    * Copyright (c) 2003-2006, www.pdfbox.org
3    * All rights reserved.
4    *
5    * Redistribution and use in source and binary forms, with or without
6    * modification, are permitted provided that the following conditions are met:
7    *
8    * 1. Redistributions of source code must retain the above copyright notice,
9    *    this list of conditions and the following disclaimer.
10   * 2. Redistributions in binary form must reproduce the above copyright notice,
11   *    this list of conditions and the following disclaimer in the documentation
12   *    and/or other materials provided with the distribution.
13   * 3. Neither the name of pdfbox; nor the names of its
14   *    contributors may be used to endorse or promote products derived from this
15   *    software without specific prior written permission.
16   *
17   * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
18   * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19   * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
20   * DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY
21   * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
22   * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
23   * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
24   * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25   * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
26   * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27   *
28   * http://www.pdfbox.org
29   *
30   */
31  package org.pdfbox;
32  
33  import java.io.File  ;
34  import java.io.FileOutputStream  ;
35  import java.io.IOException  ;
36  import java.io.OutputStreamWriter  ;
37  import java.io.Writer  ;
38  import java.net.MalformedURLException  ;
39  import java.net.URL  ;
40  
41  import org.pdfbox.pdmodel.PDDocument;
42  import org.pdfbox.pdmodel.encryption.AccessPermission;
43  import org.pdfbox.pdmodel.encryption.StandardDecryptionMaterial;
44  import org.pdfbox.util.PDFText2HTML;
45  import org.pdfbox.util.PDFTextStripper;
46  
47  /**
48   * This is the main program that simply parses the pdf document and transforms it
49   * into text.
50   *
51   * @author <a HREF="mailto:ben@benlitchfield.com">Ben Litchfield</a>
52   * @version $Revision: 1.14 $
53   */
54  public class ExtractText
55  {
56      /**
57       * This is the default encoding of the text to be output.
58       */
59      public static final String   DEFAULT_ENCODING =
60          null;
61          //"ISO-8859-1";
62          //"ISO-8859-6"; //arabic
63          //"US-ASCII";
64          //"UTF-8";
65          //"UTF-16";
66          //"UTF-16BE";
67          //"UTF-16LE";
68  
69  
70      private static final String   PASSWORD = "-password";
71      private static final String   ENCODING = "-encoding";
72      private static final String   CONSOLE = "-console";
73      private static final String   START_PAGE = "-startPage";
74      private static final String   END_PAGE = "-endPage";
75      private static final String   SORT = "-sort";
76      private static final String   HTML = "-html";  // jjb - added simple HTML output
77  
78      /**
79       * private constructor.
80      */
81      private ExtractText()
82      {
83          //static class
84      }
85  
86      /**
87       * Infamous main method.
88       *
89       * @param args Command line arguments, should be one and a reference to a file.
90       *
91       * @throws Exception If there is an error parsing the document.
92       */
93      public static void main( String  [] args ) throws Exception  
94      {
95          boolean toConsole = false;
96          boolean toHTML = false;
97          boolean sort = false;
98          String   password = "";
99          String   encoding = DEFAULT_ENCODING;
100         String   pdfFile = null;
101         String   textFile = null;
102         int startPage = 1;
103         int endPage = Integer.MAX_VALUE;
104         for( int i=0; i<args.length; i++ )
105         {
106             if( args[i].equals( PASSWORD ) )
107             {
108                 i++;
109                 if( i >= args.length )
110                 {
111                     usage();
112                 }
113                 password = args[i];
114             }
115             else if( args[i].equals( ENCODING ) )
116             {
117                 i++;
118                 if( i >= args.length )
119                 {
120                     usage();
121                 }
122                 encoding = args[i];
123             }
124             else if( args[i].equals( START_PAGE ) )
125             {
126                 i++;
127                 if( i >= args.length )
128                 {
129                     usage();
130                 }
131                 startPage = Integer.parseInt( args[i] );
132             }
133             else if( args[i].equals( HTML ) )
134             {
135                 toHTML = true;
136             }
137             else if( args[i].equals( SORT ) )
138             {
139                 sort = true;
140             }
141             else if( args[i].equals( END_PAGE ) )
142             {
143                 i++;
144                 if( i >= args.length )
145                 {
146                     usage();
147                 }
148                 endPage = Integer.parseInt( args[i] );
149             }
150             else if( args[i].equals( CONSOLE ) )
151             {
152                 toConsole = true;
153             }
154             else
155             {
156                 if( pdfFile == null )
157                 {
158                     pdfFile = args[i];
159                 }
160                 else
161                 {
162                     textFile = args[i];
163                 }
164             }
165         }
166 
167         if( pdfFile == null )
168         {
169             usage();
170         }
171         else
172         {
173 
174             Writer   output = null;
175             PDDocument document = null;
176             try
177             {
178                 try
179                 {
180                     //basically try to load it from a url first and if the URL
181                     //is not recognized then try to load it from the file system.
182                     URL   url = new URL  ( pdfFile );
183                     document = PDDocument.load( url );
184                     String   fileName = url.getFile();
185                     if( textFile == null && fileName.length() >4 )
186                     {
187                         File   outputFile = 
188                             new File  ( fileName.substring( 0, fileName.length() -4 ) + ".txt" );
189                         textFile = outputFile.getName();
190                     }
191                 }
192                 catch( MalformedURLException   e )
193                 {
194                     document = PDDocument.load( pdfFile );
195                     if( textFile == null && pdfFile.length() >4 )
196                     {
197                         textFile = pdfFile.substring( 0, pdfFile.length() -4 ) + ".txt";
198                     }
199                 }
200     
201                 //document.print();
202                 if( document.isEncrypted() )
203                 {
204                     StandardDecryptionMaterial sdm = new StandardDecryptionMaterial( password );                    
205                     document.openProtection( sdm );
206                     AccessPermission ap = document.getCurrentAccessPermission();
207                     
208                     if( ! ap.canExtractContent() ) 
209                     {
210                         throw new IOException  ( "You do not have permission to extract text" );
211                     }
212                 }
213                 if( toConsole )
214                 {
215                     output = new OutputStreamWriter  ( System.out );
216                 }
217                 else
218                 {
219                     if( encoding != null )
220                     {
221                         output = new OutputStreamWriter  (
222                             new FileOutputStream  ( textFile ), encoding );
223                     }
224                     else
225                     {
226                         //use default encoding
227                         output = new OutputStreamWriter  (
228                             new FileOutputStream  ( textFile ) );
229                     }
230                 }
231     
232                 PDFTextStripper stripper = null;
233                 if(toHTML) 
234                 {
235                    stripper = new PDFText2HTML();
236                 } 
237                 else 
238                 {
239                    stripper = new PDFTextStripper();
240                 }
241                 stripper.setSortByPosition( sort );
242                 stripper.setStartPage( startPage );
243                 stripper.setEndPage( endPage );
244                 stripper.writeText( document, output );
245             }
246             finally
247             {
248                 if( output != null )
249                 {
250                     output.close();
251                 }
252                 if( document != null )
253                 {
254                     document.close();
255                 }
256             }
257         }
258     }
259 
260     /**
261      * This will print the usage requirements and exit.
262      */
263     private static void usage()
264     {
265         System.err.println( "Usage: java org.pdfbox.ExtractText [OPTIONS] <PDF file> [Text File]\n" +
266             "  -password  <password>        Password to decrypt document\n" +
267             "  -encoding  <output encoding> (ISO-8859-1,UTF-16BE,UTF-16LE,...)\n" +
268             "  -console                     Send text to console instead of file\n" +
269             "  -html                        Output in HTML format instead of raw text\n" +
270             "  -sort                        Sort the text before writing\n" +
271             "  -startPage <number>          The first page to start extraction(1 based)\n" +
272             "  -endPage <number>            The last page to extract(inclusive)\n" +
273             "  <PDF file>                   The PDF document to use\n" +
274             "  [Text File]                  The file to write the text to\n"
275             );
276         System.exit( 1 );
277     }
278 }
A to Z: JavaDoc & Examples Daily Java News & Articles Open Source Projects Open Source Codes Free Computer Books Remove Frame
Popular Tags