KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > pdfbox > ExtractText


/**
 * Copyright (c) 2003-2006, www.pdfbox.org
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 *    this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 * 3. Neither the name of pdfbox; nor the names of its
 *    contributors may be used to endorse or promote products derived from this
 *    software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
 * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 * http://www.pdfbox.org
 *
 */

31 package org.pdfbox;
32
33 import java.io.File JavaDoc;
34 import java.io.FileOutputStream JavaDoc;
35 import java.io.IOException JavaDoc;
36 import java.io.OutputStreamWriter JavaDoc;
37 import java.io.Writer JavaDoc;
38 import java.net.MalformedURLException JavaDoc;
39 import java.net.URL JavaDoc;
40
41 import org.pdfbox.pdmodel.PDDocument;
42 import org.pdfbox.pdmodel.encryption.AccessPermission;
43 import org.pdfbox.pdmodel.encryption.StandardDecryptionMaterial;
44 import org.pdfbox.util.PDFText2HTML;
45 import org.pdfbox.util.PDFTextStripper;
46
47 /**
48  * This is the main program that simply parses the pdf document and transforms it
49  * into text.
50  *
51  * @author <a HREF="mailto:ben@benlitchfield.com">Ben Litchfield</a>
52  * @version $Revision: 1.14 $
53  */

54 public class ExtractText
55 {
56     /**
57      * This is the default encoding of the text to be output.
58      */

59     public static final String JavaDoc DEFAULT_ENCODING =
60         null;
61         //"ISO-8859-1";
62
//"ISO-8859-6"; //arabic
63
//"US-ASCII";
64
//"UTF-8";
65
//"UTF-16";
66
//"UTF-16BE";
67
//"UTF-16LE";
68

69
70     private static final String JavaDoc PASSWORD = "-password";
71     private static final String JavaDoc ENCODING = "-encoding";
72     private static final String JavaDoc CONSOLE = "-console";
73     private static final String JavaDoc START_PAGE = "-startPage";
74     private static final String JavaDoc END_PAGE = "-endPage";
75     private static final String JavaDoc SORT = "-sort";
76     private static final String JavaDoc HTML = "-html"; // jjb - added simple HTML output
77

78     /**
79      * private constructor.
80     */

81     private ExtractText()
82     {
83         //static class
84
}
85
86     /**
87      * Infamous main method.
88      *
89      * @param args Command line arguments, should be one and a reference to a file.
90      *
91      * @throws Exception If there is an error parsing the document.
92      */

93     public static void main( String JavaDoc[] args ) throws Exception JavaDoc
94     {
95         boolean toConsole = false;
96         boolean toHTML = false;
97         boolean sort = false;
98         String JavaDoc password = "";
99         String JavaDoc encoding = DEFAULT_ENCODING;
100         String JavaDoc pdfFile = null;
101         String JavaDoc textFile = null;
102         int startPage = 1;
103         int endPage = Integer.MAX_VALUE;
104         for( int i=0; i<args.length; i++ )
105         {
106             if( args[i].equals( PASSWORD ) )
107             {
108                 i++;
109                 if( i >= args.length )
110                 {
111                     usage();
112                 }
113                 password = args[i];
114             }
115             else if( args[i].equals( ENCODING ) )
116             {
117                 i++;
118                 if( i >= args.length )
119                 {
120                     usage();
121                 }
122                 encoding = args[i];
123             }
124             else if( args[i].equals( START_PAGE ) )
125             {
126                 i++;
127                 if( i >= args.length )
128                 {
129                     usage();
130                 }
131                 startPage = Integer.parseInt( args[i] );
132             }
133             else if( args[i].equals( HTML ) )
134             {
135                 toHTML = true;
136             }
137             else if( args[i].equals( SORT ) )
138             {
139                 sort = true;
140             }
141             else if( args[i].equals( END_PAGE ) )
142             {
143                 i++;
144                 if( i >= args.length )
145                 {
146                     usage();
147                 }
148                 endPage = Integer.parseInt( args[i] );
149             }
150             else if( args[i].equals( CONSOLE ) )
151             {
152                 toConsole = true;
153             }
154             else
155             {
156                 if( pdfFile == null )
157                 {
158                     pdfFile = args[i];
159                 }
160                 else
161                 {
162                     textFile = args[i];
163                 }
164             }
165         }
166
167         if( pdfFile == null )
168         {
169             usage();
170         }
171         else
172         {
173
174             Writer JavaDoc output = null;
175             PDDocument document = null;
176             try
177             {
178                 try
179                 {
180                     //basically try to load it from a url first and if the URL
181
//is not recognized then try to load it from the file system.
182
URL JavaDoc url = new URL JavaDoc( pdfFile );
183                     document = PDDocument.load( url );
184                     String JavaDoc fileName = url.getFile();
185                     if( textFile == null && fileName.length() >4 )
186                     {
187                         File JavaDoc outputFile =
188                             new File JavaDoc( fileName.substring( 0, fileName.length() -4 ) + ".txt" );
189                         textFile = outputFile.getName();
190                     }
191                 }
192                 catch( MalformedURLException JavaDoc e )
193                 {
194                     document = PDDocument.load( pdfFile );
195                     if( textFile == null && pdfFile.length() >4 )
196                     {
197                         textFile = pdfFile.substring( 0, pdfFile.length() -4 ) + ".txt";
198                     }
199                 }
200     
201                 //document.print();
202
if( document.isEncrypted() )
203                 {
204                     StandardDecryptionMaterial sdm = new StandardDecryptionMaterial( password );
205                     document.openProtection( sdm );
206                     AccessPermission ap = document.getCurrentAccessPermission();
207                     
208                     if( ! ap.canExtractContent() )
209                     {
210                         throw new IOException JavaDoc( "You do not have permission to extract text" );
211                     }
212                 }
213                 if( toConsole )
214                 {
215                     output = new OutputStreamWriter JavaDoc( System.out );
216                 }
217                 else
218                 {
219                     if( encoding != null )
220                     {
221                         output = new OutputStreamWriter JavaDoc(
222                             new FileOutputStream JavaDoc( textFile ), encoding );
223                     }
224                     else
225                     {
226                         //use default encoding
227
output = new OutputStreamWriter JavaDoc(
228                             new FileOutputStream JavaDoc( textFile ) );
229                     }
230                 }
231     
232                 PDFTextStripper stripper = null;
233                 if(toHTML)
234                 {
235                    stripper = new PDFText2HTML();
236                 }
237                 else
238                 {
239                    stripper = new PDFTextStripper();
240                 }
241                 stripper.setSortByPosition( sort );
242                 stripper.setStartPage( startPage );
243                 stripper.setEndPage( endPage );
244                 stripper.writeText( document, output );
245             }
246             finally
247             {
248                 if( output != null )
249                 {
250                     output.close();
251                 }
252                 if( document != null )
253                 {
254                     document.close();
255                 }
256             }
257         }
258     }
259
260     /**
261      * This will print the usage requirements and exit.
262      */

263     private static void usage()
264     {
265         System.err.println( "Usage: java org.pdfbox.ExtractText [OPTIONS] <PDF file> [Text File]\n" +
266             " -password <password> Password to decrypt document\n" +
267             " -encoding <output encoding> (ISO-8859-1,UTF-16BE,UTF-16LE,...)\n" +
268             " -console Send text to console instead of file\n" +
269             " -html Output in HTML format instead of raw text\n" +
270             " -sort Sort the text before writing\n" +
271             " -startPage <number> The first page to start extraction(1 based)\n" +
272             " -endPage <number> The last page to extract(inclusive)\n" +
273             " <PDF file> The PDF document to use\n" +
274             " [Text File] The file to write the text to\n"
275             );
276         System.exit( 1 );
277     }
278 }
Popular Tags