KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > test > pdfbox > util > TestTextStripper


/**
 * Copyright (c) 2003-2005, www.pdfbox.org
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 *    this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 * 3. Neither the name of pdfbox; nor the names of its
 *    contributors may be used to endorse or promote products derived from this
 *    software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
 * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 * http://www.pdfbox.org
 */

package test.pdfbox.util;

import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FilenameFilter;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.LineNumberReader;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.Writer;

import junit.framework.Test;
import junit.framework.TestCase;
import junit.framework.TestSuite;

import org.pdfbox.pdmodel.PDDocument;

import org.pdfbox.util.PDFTextStripper;

51 /**
52  * Test suite for PDFTextStripper.
53  *
54  * FILE SET VALIDATION
55  *
56  * This test suite is designed to test PDFTextStripper using a set of PDF
57  * files and known good output for each. The default mode of testAll()
58  * is to process each *.pdf file in "test/input". An output file is
59  * created in "test/output" with the same name as the PDF file, plus an
60  * additional ".txt" suffix.
61  *
62  * The output file is then tested against a known good result file from
63  * the input directory (again, with the same name as the tested PDF file,
64  * but with the additional ".txt" suffix).
65  *
66  * So for the file "test/input/hello.pdf", an output file will be generated
67  * named "test/output/hello.pdf.txt". Then that file will be compared to
68  * the known good file "test/input/hello.pdf.txt", if it exists.
69  *
70  * Any errors are logged, and at the end of processing all *.pdf files, if
71  * there were any errors, the test fails. The logging is at INFO, as the
72  * general goal is overall validation, and on failure, the indication of
73  * which file or files failed.
74  *
75  * When processing new PDF files, you may use testAll() to generate output,
76  * verify the output manually, then move the output file to the test input
77  * directory to use as the basis for future validations.
78  *
79  * SINGLE FILE VALIDATION
80  *
81  * To further research individual failures, the test.pdfbox.util.TextStripper.file
82  * system property may be set with the name of a single file in the "test/input"
83  * directory. In this mode, testAll() will evaluate only that file, and will
84  * do so with DEBUG level logging. You can set this property from ant by
85  * defining "file", as in:
86  *
87  * ant testextract -Dfile=hello.pdf
88  *
89  * @author Robert Dickinson (bob@brutesquadlabs.com)
90  * @author <a HREF="mailto:ben@benlitchfield.com">Ben Litchfield</a>
91  * @version $Revision: 1.17 $
92  */

93 public class TestTextStripper extends TestCase
94 {
95     private boolean bFail = false;
96     private PDFTextStripper stripper = null;
97
98     /**
99      * Test class constructor.
100      *
101      * @param name The name of the test class.
102      *
103      * @throws IOException If there is an error creating the test.
104      */

105     public TestTextStripper( String JavaDoc name ) throws IOException JavaDoc
106     {
107         super( name );
108         stripper = new PDFTextStripper();
109         stripper.setLineSeparator("\n");
110     }
111
112     /**
113      * Test suite setup.
114      */

115     public void setUp()
116     {
117         // If you want to test a single file using DEBUG logging, from an IDE,
118
// you can do something like this:
119
//
120
// System.setProperty("test.pdfbox.util.TextStripper.file", "FVS318Ref.pdf");
121
}
122
123     /**
124      * Determine whether two strings are equal, where two null strings are
125      * considered equal.
126      *
127      * @param expected Excpected string
128      * @param actual Actual String
129      * @return <code>true</code> is the strings are both null,
130      * or if their contents are the same, otherwise <code>false</code>.
131      */

132     private boolean stringsEqual(String JavaDoc expected, String JavaDoc actual)
133     {
134         boolean equals = true;
135         if( (expected == null) && (actual == null) )
136         {
137             return true;
138         }
139         else if( expected != null && actual != null )
140         {
141             expected = expected.trim();
142             actual = actual.trim();
143             char[] expectedArray = expected.toCharArray();
144             char[] actualArray = actual.toCharArray();
145             int expectedIndex = 0;
146             int actualIndex = 0;
147             while( expectedIndex<expectedArray.length && actualIndex<actualArray.length )
148             {
149                 if( expectedArray[expectedIndex] != actualArray[actualIndex] )
150                 {
151                     equals = false;
152                     System.err.println("Lines differ at index"
153                      + " expected:" + expectedIndex + "-" + (int)expectedArray[expectedIndex]
154                      + " actual:" + actualIndex + "-" + (int)actualArray[actualIndex] );
155                     break;
156                 }
157                 expectedIndex = skipWhitespace( expectedArray, expectedIndex );
158                 actualIndex = skipWhitespace( actualArray, actualIndex );
159                 expectedIndex++;
160                 actualIndex++;
161             }
162             if( equals )
163             {
164                 if( expectedIndex != expectedArray.length )
165                 {
166                     equals = false;
167                     System.err.println("Expected line is longer at:" + expectedIndex );
168                 }
169                 if( actualIndex != actualArray.length )
170                 {
171                     equals = false;
172                     System.err.println("Actual line is longer at:" + actualIndex );
173                 }
174             }
175         }
176         else if( ( expected == null && actual != null && actual.trim().equals( "" ) ) ||
177             ( actual == null && expected != null && expected.trim().equals( "" ) ) )
178         {
179             //basically there are some cases where pdfbox will put an extra line
180
//at the end of the file, who cares, this is not enough to report
181
// a failure
182
equals = true;
183         }
184         else
185         {
186             equals = false;
187         }
188         return equals;
189     }
190
191     /**
192      * If the current index is whitespace then skip any subsequent whitespace.
193      */

194     private int skipWhitespace( char[] array, int index )
195     {
196         //if we are at a space character then skip all space
197
//characters, but when all done rollback 1 because stringsEqual
198
//will roll forward 1
199
if( array[index] == ' ' || array[index] > 256 )
200         {
201             while( index < array.length && (array[index] == ' ' || array[index] > 256))
202             {
203                 index++;
204             }
205             index--;
206         }
207         return index;
208     }
209
210     /**
211      * Validate text extraction on a single file.
212      *
213      * @param file The file to validate
214      * @param bLogResult Whether to log the extracted text
215      * @throws Exception when there is an exception
216      */

217     public void doTestFile(File JavaDoc file, boolean bLogResult)
218         throws Exception JavaDoc
219     {
220         System.out.println("Preparing to parse " + file.getName());
221         
222         OutputStream JavaDoc os = null;
223         Writer JavaDoc writer = null;
224         PDDocument document = null;
225         try
226         {
227             document = PDDocument.load(file);
228
229             File JavaDoc outFile = new File JavaDoc(file.getParentFile().getParentFile(), "output/" + file.getName() + ".txt");
230             os = new FileOutputStream JavaDoc(outFile);
231             os.write( 0xFF );
232             os.write( 0xFE );
233             writer = new OutputStreamWriter JavaDoc(os,"UTF-16LE");
234
235             stripper.writeText(document, writer);
236
237
238
239             if (bLogResult)
240             {
241                 System.out.println("Text for " + file.getName() + ":\r\n" + stripper.getText(document));
242             }
243
244             File JavaDoc expectedFile = new File JavaDoc(file.getParentFile().getParentFile(), "input/" + file.getName() + ".txt");
245             File JavaDoc actualFile = new File JavaDoc(file.getParentFile().getParentFile(), "output/" + file.getName() + ".txt");
246
247             if (!expectedFile.exists())
248             {
249                 this.bFail = true;
250                 System.err.println(
251                     "FAILURE: Input verification file: " + expectedFile.getAbsolutePath() +
252                     " did not exist");
253                 return;
254             }
255
256             LineNumberReader JavaDoc expectedReader =
257                 new LineNumberReader JavaDoc(new InputStreamReader JavaDoc(new FileInputStream JavaDoc(expectedFile),"UTF-16"));
258             LineNumberReader JavaDoc actualReader =
259                 new LineNumberReader JavaDoc(new InputStreamReader JavaDoc(new FileInputStream JavaDoc(actualFile), "UTF-16"));
260
261             while (true)
262             {
263                 String JavaDoc expectedLine = expectedReader.readLine();
264                 while( expectedLine != null && expectedLine.trim().length() == 0 )
265                 {
266                     expectedLine = expectedReader.readLine();
267                 }
268                 String JavaDoc actualLine = actualReader.readLine();
269                 while( actualLine != null && actualLine.trim().length() == 0 )
270                 {
271                     actualLine = actualReader.readLine();
272                 }
273                 if (!stringsEqual(expectedLine, actualLine))
274                 {
275                     this.bFail = true;
276                     System.err.println("FAILURE: Line mismatch for file " + file.getName() +
277                               " at expected line: " + expectedReader.getLineNumber() +
278                               " at actual line: " + actualReader.getLineNumber() +
279                               "\r\n expected line was: \"" + expectedLine + "\"" +
280                               "\r\n actual line was: \"" + actualLine + "\"");
281                     //lets report all lines, even though this might produce some verbose logging
282
//break;
283
}
284
285                 if( expectedLine == null || actualLine==null)
286                 {
287                     break;
288                 }
289             }
290         }
291         finally
292         {
293             if( writer != null )
294             {
295                 writer.close();
296             }
297             if( os != null )
298             {
299                 os.close();
300             }
301             if( document != null )
302             {
303                 document.close();
304             }
305         }
306     }
307
308     /**
309      * Test to validate text extraction of file set.
310      *
311      * @throws Exception when there is an exception
312      */

313     public void testExtract()
314         throws Exception JavaDoc
315     {
316         String JavaDoc filename = System.getProperty("test.pdfbox.util.TextStripper.file");
317         File JavaDoc testDir = new File JavaDoc("test/input");
318
319         if ((filename == null) || (filename.length() == 0))
320         {
321             File JavaDoc[] testFiles = testDir.listFiles(new FilenameFilter JavaDoc()
322             {
323                 public boolean accept(File JavaDoc dir, String JavaDoc name)
324                 {
325                     return (name.endsWith(".pdf"));
326                 }
327             });
328
329             for (int n = 0; n < testFiles.length; n++)
330             {
331                 doTestFile(testFiles[n], false);
332             }
333         }
334         else
335         {
336             doTestFile(new File JavaDoc(testDir, filename), true);
337         }
338
339         if (this.bFail)
340         {
341             fail("One or more failures, see test log for details");
342         }
343     }
344
345     /**
346      * Set the tests in the suite for this test class.
347      *
348      * @return the Suite.
349      */

350     public static Test suite()
351     {
352         return new TestSuite( TestTextStripper.class );
353     }
354     
355     /**
356      * Command line execution.
357      *
358      * @param args Command line arguments.
359      */

360     public static void main( String JavaDoc[] args )
361     {
362         String JavaDoc[] arg = {TestTextStripper.class.getName() };
363         junit.textui.TestRunner.main( arg );
364     }
365 }
Popular Tags