1 31 package org.pdfbox.util; 32 33 import java.io.IOException ; 34 import java.io.StringWriter ; 35 import java.io.Writer ; 36 37 import java.util.ArrayList ; 38 import java.util.Collections ; 39 import java.util.HashMap ; 40 import java.util.Iterator ; 41 import java.util.List ; 42 import java.util.Map ; 43 import java.util.Properties ; 44 import java.util.Vector ; 45 46 import org.pdfbox.cos.COSDocument; 47 import org.pdfbox.cos.COSStream; 48 49 import org.pdfbox.pdmodel.PDDocument; 50 import org.pdfbox.pdmodel.PDPage; 51 52 import org.pdfbox.pdmodel.common.PDRectangle; 53 import org.pdfbox.pdmodel.common.PDStream; 54 55 import org.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItem; 56 import org.pdfbox.pdmodel.interactive.pagenavigation.PDThreadBead; 57 58 import org.pdfbox.exceptions.CryptographyException; 59 import org.pdfbox.exceptions.InvalidPasswordException; 60 61 62 69 public class PDFTextStripper extends PDFStreamEngine 70 { 71 private int currentPageNo = 0; 72 private int startPage = 1; 73 private int endPage = Integer.MAX_VALUE; 74 private PDOutlineItem startBookmark = null; 75 private int startBookmarkPageNumber = -1; 76 private PDOutlineItem endBookmark = null; 77 private int endBookmarkPageNumber = -1; 78 private PDDocument document; 79 private boolean suppressDuplicateOverlappingText = true; 80 private boolean shouldSeparateByBeads = true; 81 private boolean sortByPosition = false; 82 83 private List pageArticles = null; 84 99 protected Vector charactersByArticle = new Vector (); 100 101 private Map characterListMapping = new HashMap (); 102 103 private String lineSeparator = System.getProperty("line.separator"); 104 private String pageSeparator = System.getProperty("line.separator"); 105 private String wordSeparator = " "; 106 107 110 protected Writer output; 111 112 117 public PDFTextStripper() throws IOException 118 { 119 super( ResourceLoader.loadProperties( "Resources/PDFTextStripper.properties" ) ); 120 } 121 122 131 public PDFTextStripper( Properties props ) throws IOException 132 { 133 super( props ); 134 } 135 136 146 public String getText( PDDocument doc ) throws IOException 147 { 148 StringWriter outputStream = new StringWriter (); 149 writeText( doc, outputStream ); 150 return outputStream.toString(); 151 } 152 153 160 public String getText( COSDocument doc ) throws IOException 161 { 162 return getText( new PDDocument( doc ) ); 163 } 164 165 172 public void writeText( COSDocument doc, Writer outputStream ) throws IOException 173 { 174 writeText( new PDDocument( doc ), outputStream ); 175 } 176 177 185 public void writeText( PDDocument doc, Writer outputStream ) throws IOException 186 { 187 resetEngine(); 188 189 currentPageNo = 0; 190 document = doc; 191 output = outputStream; 192 startDocument(document); 193 194 if( document.isEncrypted() ) 195 { 196 try 203 { 204 document.decrypt(""); 205 } 206 catch (CryptographyException e) 207 { 208 throw new IOException ("Error decrypting document, details: " + e.getMessage()); 209 } 210 catch (InvalidPasswordException e) 211 { 212 throw new IOException ("Error: document is encrypted"); 213 } 214 } 215 216 processPages( document.getDocumentCatalog().getAllPages() ); 217 endDocument(document); 218 } 219 220 227 protected void processPages( List pages ) throws IOException 228 { 229 if( startBookmark != null ) 230 { 231 startBookmarkPageNumber = getPageNumber( startBookmark, pages ); 232 } 233 234 if( endBookmark != null ) 235 { 236 endBookmarkPageNumber = getPageNumber( endBookmark, pages ); 237 } 238 239 if( startBookmarkPageNumber == -1 && startBookmark != null && 240 endBookmarkPageNumber == -1 && endBookmark != null && 241 startBookmark.getCOSObject() == endBookmark.getCOSObject() ) 242 { 243 startBookmarkPageNumber = 0; 247 endBookmarkPageNumber = 0; 248 } 249 250 251 Iterator pageIter = pages.iterator(); 252 while( pageIter.hasNext() ) 253 { 254 PDPage nextPage = (PDPage)pageIter.next(); 255 PDStream contentStream = nextPage.getContents(); 256 if( contentStream != null ) 257 { 258 COSStream contents = contentStream.getStream(); 259 processPage( nextPage, contents ); 260 } 261 } 262 } 263 264 private int getPageNumber( PDOutlineItem bookmark, List allPages ) throws IOException 265 { 266 int pageNumber = -1; 267 PDPage page = bookmark.findDestinationPage( document ); 268 if( page != null ) 269 { 270 pageNumber = allPages.indexOf( page )+1; } 272 return pageNumber; 273 } 274 275 282 protected void startDocument(PDDocument pdf) throws IOException 283 { 284 } 286 287 294 protected void endDocument(PDDocument pdf ) throws IOException 295 { 296 } 298 299 307 protected void processPage( PDPage page, COSStream content ) throws IOException 308 { 309 currentPageNo++; 310 if( currentPageNo >= startPage && currentPageNo <= endPage && 311 (startBookmarkPageNumber == -1 || currentPageNo >= startBookmarkPageNumber ) && 312 (endBookmarkPageNumber == -1 || currentPageNo <= endBookmarkPageNumber )) 313 { 314 startPage( page ); 315 pageArticles = page.getThreadBeads(); 316 int numberOfArticleSections = 1 + pageArticles.size() * 2; 317 if( !shouldSeparateByBeads ) 318 { 319 numberOfArticleSections = 1; 320 } 321 int originalSize = charactersByArticle.size(); 322 charactersByArticle.setSize( numberOfArticleSections ); 323 for( int i=0; i<numberOfArticleSections; i++ ) 324 { 325 if( numberOfArticleSections < originalSize ) 326 { 327 ((List )charactersByArticle.get( i )).clear(); 328 } 329 else 330 { 331 charactersByArticle.set( i, new ArrayList () ); 332 } 333 } 334 335 characterListMapping.clear(); 336 processStream( page, page.findResources(), content ); 337 flushText(); 338 endPage( page ); 339 } 340 341 } 342 343 349 protected void startParagraph() throws IOException 350 { 351 } 353 354 360 protected void endParagraph() throws IOException 361 { 362 } 364 365 373 protected void startPage( PDPage page ) throws IOException 374 { 375 } 377 378 386 protected void endPage( PDPage page ) throws IOException 387 { 388 } 390 391 396 protected void flushText() throws IOException 397 { 398 float currentY = -1; 399 float lastBaselineFontSize = -1; 400 float endOfLastTextX = -1; 401 float startOfNextWordX = -1; 402 float lastWordSpacing = -1; 403 TextPosition lastProcessedCharacter = null; 404 405 for( int i=0; i<charactersByArticle.size(); i++) 406 { 407 startParagraph(); 408 List textList = (List )charactersByArticle.get( i ); 409 if( sortByPosition ) 410 { 411 TextPositionComparator comparator = new TextPositionComparator( getCurrentPage() ); 412 Collections.sort( textList, comparator ); 413 } 414 Iterator textIter = textList.iterator(); 415 while( textIter.hasNext() ) 416 { 417 TextPosition position = (TextPosition)textIter.next(); 418 String characterValue = position.getCharacter(); 419 420 float wordSpacing = 0; 422 423 if( wordSpacing == 0 ) 424 { 425 wordSpacing = position.getWidthOfSpace(); 427 if( wordSpacing == 0 ) 430 { 431 wordSpacing = position.getWidth(); 432 } 433 } 434 435 436 if( lastWordSpacing <= 0 ) 441 { 442 startOfNextWordX = endOfLastTextX + (wordSpacing* 0.50f); 443 } 444 else 445 { 446 startOfNextWordX = endOfLastTextX + (((wordSpacing+lastWordSpacing)/2f)* 0.50f); 447 } 448 449 lastWordSpacing = wordSpacing; 450 451 471 472 int verticalScaling = 1; 479 if( lastBaselineFontSize < 0 || position.getFontSize() < 0 ) 480 { 481 verticalScaling = -1; 482 } 483 if (currentY != -1 && 484 ((position.getY() < (currentY - (lastBaselineFontSize * 0.9f * verticalScaling))) || 485 (position.getY() > (currentY + (position.getFontSize() * 0.9f * verticalScaling))))) 486 { 487 output.write(getLineSeparator()); 488 endOfLastTextX = -1; 489 startOfNextWordX = -1; 490 currentY = -1; 491 lastBaselineFontSize = -1; 492 } 493 494 if (startOfNextWordX != -1 && startOfNextWordX < position.getX() && 495 lastProcessedCharacter != null && 496 lastProcessedCharacter.getCharacter() != null && 498 !lastProcessedCharacter.getCharacter().endsWith( " " ) ) 499 { 500 output.write(getWordSeparator()); 501 } 502 503 if (currentY == -1) 504 { 505 currentY = position.getY(); 506 } 507 508 if (currentY == position.getY()) 509 { 510 lastBaselineFontSize = position.getFontSize(); 511 } 512 513 endOfLastTextX = position.getX() + position.getWidth(); 517 518 519 if (characterValue != null) 520 { 521 writeCharacters( position ); 522 } 523 else 524 { 525 } 527 lastProcessedCharacter = position; 528 } 529 endParagraph(); 530 } 531 532 533 output.write(getPageSeparator()); 537 538 output.flush(); 539 } 540 541 547 protected void writeCharacters( TextPosition text ) throws IOException 548 { 549 output.write( text.getCharacter() ); 550 } 551 552 559 private boolean within( float first, float second, float variance ) 560 { 561 return second > first - variance && second < first + variance; 562 } 563 564 570 protected void showCharacter( TextPosition text ) 571 { 572 boolean showCharacter = true; 573 if( suppressDuplicateOverlappingText ) 574 { 575 showCharacter = false; 576 String textCharacter = text.getCharacter(); 577 float textX = text.getX(); 578 float textY = text.getY(); 579 List sameTextCharacters = (List )characterListMapping.get( textCharacter ); 580 if( sameTextCharacters == null ) 581 { 582 sameTextCharacters = new ArrayList (); 583 characterListMapping.put( textCharacter, sameTextCharacters ); 584 } 585 586 boolean suppressCharacter = false; 598 float tolerance = (text.getWidth()/textCharacter.length())/3.0f; 599 for( int i=0; i<sameTextCharacters.size() && textCharacter != null; i++ ) 600 { 601 TextPosition character = (TextPosition)sameTextCharacters.get( i ); 602 String charCharacter = character.getCharacter(); 603 float charX = character.getX(); 604 float charY = character.getY(); 605 607 if( charCharacter != null && 608 within( charX, textX, tolerance ) && 610 within( charY, 611 textY, 612 tolerance ) ) 613 { 614 suppressCharacter = true; 615 } 616 } 617 if( !suppressCharacter ) 618 { 619 sameTextCharacters.add( text ); 620 showCharacter = true; 621 } 622 } 623 624 if( showCharacter ) 625 { 626 int foundArticleDivisionIndex = -1; 629 int notFoundButFirstLeftAndAboveArticleDivisionIndex = -1; 630 int notFoundButFirstLeftArticleDivisionIndex = -1; 631 int notFoundButFirstAboveArticleDivisionIndex = -1; 632 float x = text.getX(); 633 float y = text.getY(); 634 if( shouldSeparateByBeads ) 635 { 636 for( int i=0; i<pageArticles.size() && foundArticleDivisionIndex == -1; i++ ) 637 { 638 PDThreadBead bead = (PDThreadBead)pageArticles.get( i ); 639 if( bead != null ) 640 { 641 PDRectangle rect = bead.getRectangle(); 642 if( rect.contains( x, y ) ) 643 { 644 foundArticleDivisionIndex = i*2+1; 645 } 646 else if( (x < rect.getLowerLeftX() || 647 y < rect.getUpperRightY()) && 648 notFoundButFirstLeftAndAboveArticleDivisionIndex == -1) 649 { 650 notFoundButFirstLeftAndAboveArticleDivisionIndex = i*2; 651 } 652 else if( x < rect.getLowerLeftX() && 653 notFoundButFirstLeftArticleDivisionIndex == -1) 654 { 655 notFoundButFirstLeftArticleDivisionIndex = i*2; 656 } 657 else if( y < rect.getUpperRightY() && 658 notFoundButFirstAboveArticleDivisionIndex == -1) 659 { 660 notFoundButFirstAboveArticleDivisionIndex = i*2; 661 } 662 } 663 else 664 { 665 foundArticleDivisionIndex = 0; 666 } 667 } 668 } 669 else 670 { 671 foundArticleDivisionIndex = 0; 672 } 673 int articleDivisionIndex = -1; 674 if( foundArticleDivisionIndex != -1 ) 675 { 676 articleDivisionIndex = foundArticleDivisionIndex; 677 } 678 else if( notFoundButFirstLeftAndAboveArticleDivisionIndex != -1 ) 679 { 680 articleDivisionIndex = notFoundButFirstLeftAndAboveArticleDivisionIndex; 681 } 682 else if( notFoundButFirstLeftArticleDivisionIndex != -1 ) 683 { 684 articleDivisionIndex = notFoundButFirstLeftArticleDivisionIndex; 685 } 686 else if( notFoundButFirstAboveArticleDivisionIndex != -1 ) 687 { 688 articleDivisionIndex = notFoundButFirstAboveArticleDivisionIndex; 689 } 690 else 691 { 692 articleDivisionIndex = charactersByArticle.size()-1; 693 } 694 List textList = (List ) charactersByArticle.get( articleDivisionIndex ); 695 textList.add( text ); 696 } 697 } 698 699 707 public int getStartPage() 708 { 709 return startPage; 710 } 711 712 717 public void setStartPage(int startPageValue) 718 { 719 startPage = startPageValue; 720 } 721 722 730 public int getEndPage() 731 { 732 return endPage; 733 } 734 735 740 public void setEndPage(int endPageValue) 741 { 742 endPage = endPageValue; 743 } 744 745 752 public void setLineSeparator(String separator) 753 { 754 lineSeparator = separator; 755 } 756 757 762 public String getLineSeparator() 763 { 764 return lineSeparator; 765 } 766 767 774 public void setPageSeparator(String separator) 775 { 776 pageSeparator = separator; 777 } 778 779 784 public String getWordSeparator() 785 { 786 return wordSeparator; 787 } 788 789 798 public void setWordSeparator(String separator) 799 { 800 wordSeparator = separator; 801 } 802 803 808 public String getPageSeparator() 809 { 810 return pageSeparator; 811 } 812 815 public boolean shouldSuppressDuplicateOverlappingText() 816 { 817 return suppressDuplicateOverlappingText; 818 } 819 820 825 protected int getCurrentPageNo() 826 { 827 return currentPageNo; 828 } 829 830 835 protected Writer getOutput() 836 { 837 return output; 838 } 839 840 847 protected List getCharactersByArticle() 848 { 849 return charactersByArticle; 850 } 851 852 860 public void setSuppressDuplicateOverlappingText( 861 boolean suppressDuplicateOverlappingTextValue) 862 { 863 this.suppressDuplicateOverlappingText = suppressDuplicateOverlappingTextValue; 864 } 865 866 871 public boolean shouldSeparateByBeads() 872 { 873 return shouldSeparateByBeads; 874 } 875 876 881 public void setShouldSeparateByBeads(boolean aShouldSeparateByBeads) 882 { 883 this.shouldSeparateByBeads = aShouldSeparateByBeads; 884 } 885 886 891 public PDOutlineItem getEndBookmark() 892 { 893 return endBookmark; 894 } 895 896 901 public void setEndBookmark(PDOutlineItem aEndBookmark) 902 { 903 endBookmark = aEndBookmark; 904 } 905 906 911 public PDOutlineItem getStartBookmark() 912 { 913 return startBookmark; 914 } 915 916 921 public void setStartBookmark(PDOutlineItem aStartBookmark) 922 { 923 startBookmark = aStartBookmark; 924 } 925 926 932 public boolean shouldSortByPosition() 933 { 934 return sortByPosition; 935 } 936 937 950 public void setSortByPosition(boolean newSortByPosition) 951 { 952 sortByPosition = newSortByPosition; 953 } 954 } | Popular Tags |