1 30 package test.pdfbox.util; 31 32 import java.io.File ; 33 import java.io.FileInputStream ; 34 import java.io.FilenameFilter ; 35 import java.io.FileOutputStream ; 36 import java.io.IOException ; 37 import java.io.InputStreamReader ; 38 import java.io.LineNumberReader ; 39 import java.io.OutputStream ; 40 import java.io.OutputStreamWriter ; 41 import java.io.Writer ; 42 43 import junit.framework.Test; 44 import junit.framework.TestCase; 45 import junit.framework.TestSuite; 46 47 import org.pdfbox.pdmodel.PDDocument; 48 49 import org.pdfbox.util.PDFTextStripper; 50 51 93 public class TestTextStripper extends TestCase 94 { 95 private boolean bFail = false; 96 private PDFTextStripper stripper = null; 97 98 105 public TestTextStripper( String name ) throws IOException 106 { 107 super( name ); 108 stripper = new PDFTextStripper(); 109 stripper.setLineSeparator("\n"); 110 } 111 112 115 public void setUp() 116 { 117 } 122 123 132 private boolean stringsEqual(String expected, String actual) 133 { 134 boolean equals = true; 135 if( (expected == null) && (actual == null) ) 136 { 137 return true; 138 } 139 else if( expected != null && actual != null ) 140 { 141 expected = expected.trim(); 142 actual = actual.trim(); 143 char[] expectedArray = expected.toCharArray(); 144 char[] actualArray = actual.toCharArray(); 145 int expectedIndex = 0; 146 int actualIndex = 0; 147 while( expectedIndex<expectedArray.length && actualIndex<actualArray.length ) 148 { 149 if( expectedArray[expectedIndex] != actualArray[actualIndex] ) 150 { 151 equals = false; 152 System.err.println("Lines differ at index" 153 + " expected:" + expectedIndex + "-" + (int)expectedArray[expectedIndex] 154 + " actual:" + actualIndex + "-" + (int)actualArray[actualIndex] ); 155 break; 156 } 157 expectedIndex = skipWhitespace( expectedArray, expectedIndex ); 158 actualIndex = skipWhitespace( actualArray, actualIndex ); 159 expectedIndex++; 160 actualIndex++; 161 } 162 if( equals ) 163 { 164 if( expectedIndex != expectedArray.length ) 165 { 166 equals = false; 167 System.err.println("Expected line is longer at:" + expectedIndex ); 168 } 169 if( actualIndex != actualArray.length ) 170 { 171 equals = false; 172 System.err.println("Actual line is longer at:" + actualIndex ); 173 } 174 } 175 } 176 else if( ( expected == null && actual != null && actual.trim().equals( "" ) ) || 177 ( actual == null && expected != null && expected.trim().equals( "" ) ) ) 178 { 179 equals = true; 183 } 184 else 185 { 186 equals = false; 187 } 188 return equals; 189 } 190 191 194 private int skipWhitespace( char[] array, int index ) 195 { 196 if( array[index] == ' ' || array[index] > 256 ) 200 { 201 while( index < array.length && (array[index] == ' ' || array[index] > 256)) 202 { 203 index++; 204 } 205 index--; 206 } 207 return index; 208 } 209 210 217 public void doTestFile(File file, boolean bLogResult) 218 throws Exception 219 { 220 System.out.println("Preparing to parse " + file.getName()); 221 222 OutputStream os = null; 223 Writer writer = null; 224 PDDocument document = null; 225 try 226 { 227 document = PDDocument.load(file); 228 229 File outFile = new File (file.getParentFile().getParentFile(), "output/" + file.getName() + ".txt"); 230 os = new FileOutputStream (outFile); 231 os.write( 0xFF ); 232 os.write( 0xFE ); 233 writer = new OutputStreamWriter (os,"UTF-16LE"); 234 235 stripper.writeText(document, writer); 236 237 238 239 if (bLogResult) 240 { 241 System.out.println("Text for " + file.getName() + ":\r\n" + stripper.getText(document)); 242 } 243 244 File expectedFile = new File (file.getParentFile().getParentFile(), "input/" + file.getName() + ".txt"); 245 File actualFile = new File (file.getParentFile().getParentFile(), "output/" + file.getName() + ".txt"); 246 247 if (!expectedFile.exists()) 248 { 249 this.bFail = true; 250 System.err.println( 251 "FAILURE: Input verification file: " + expectedFile.getAbsolutePath() + 252 " did not exist"); 253 return; 254 } 255 256 LineNumberReader expectedReader = 257 new LineNumberReader (new InputStreamReader (new FileInputStream (expectedFile),"UTF-16")); 258 LineNumberReader actualReader = 259 new LineNumberReader (new InputStreamReader (new FileInputStream (actualFile), "UTF-16")); 260 261 while (true) 262 { 263 String expectedLine = expectedReader.readLine(); 264 while( expectedLine != null && expectedLine.trim().length() == 0 ) 265 { 266 expectedLine = expectedReader.readLine(); 267 } 268 String actualLine = actualReader.readLine(); 269 while( actualLine != null && actualLine.trim().length() == 0 ) 270 { 271 actualLine = actualReader.readLine(); 272 } 273 if (!stringsEqual(expectedLine, actualLine)) 274 { 275 this.bFail = true; 276 System.err.println("FAILURE: Line mismatch for file " + file.getName() + 277 " at expected line: " + expectedReader.getLineNumber() + 278 " at actual line: " + actualReader.getLineNumber() + 279 "\r\n expected line was: \"" + expectedLine + "\"" + 280 "\r\n actual line was: \"" + actualLine + "\""); 281 } 284 285 if( expectedLine == null || actualLine==null) 286 { 287 break; 288 } 289 } 290 } 291 finally 292 { 293 if( writer != null ) 294 { 295 writer.close(); 296 } 297 if( os != null ) 298 { 299 os.close(); 300 } 301 if( document != null ) 302 { 303 document.close(); 304 } 305 } 306 } 307 308 313 public void testExtract() 314 throws Exception 315 { 316 String filename = System.getProperty("test.pdfbox.util.TextStripper.file"); 317 File testDir = new File ("test/input"); 318 319 if ((filename == null) || (filename.length() == 0)) 320 { 321 File [] testFiles = testDir.listFiles(new FilenameFilter () 322 { 323 public boolean accept(File dir, String name) 324 { 325 return (name.endsWith(".pdf")); 326 } 327 }); 328 329 for (int n = 0; n < testFiles.length; n++) 330 { 331 doTestFile(testFiles[n], false); 332 } 333 } 334 else 335 { 336 doTestFile(new File (testDir, filename), true); 337 } 338 339 if (this.bFail) 340 { 341 fail("One or more failures, see test log for details"); 342 } 343 } 344 345 350 public static Test suite() 351 { 352 return new TestSuite( TestTextStripper.class ); 353 } 354 355 360 public static void main( String [] args ) 361 { 362 String [] arg = {TestTextStripper.class.getName() }; 363 junit.textui.TestRunner.main( arg ); 364 } 365 } | Popular Tags |