PDFTextStripper


1   /**
2    * Copyright (c) 2003-2005, www.pdfbox.org
3    * All rights reserved.
4    *
5    * Redistribution and use in source and binary forms, with or without
6    * modification, are permitted provided that the following conditions are met:
7    *
8    * 1. Redistributions of source code must retain the above copyright notice,
9    *    this list of conditions and the following disclaimer.
10   * 2. Redistributions in binary form must reproduce the above copyright notice,
11   *    this list of conditions and the following disclaimer in the documentation
12   *    and/or other materials provided with the distribution.
13   * 3. Neither the name of pdfbox; nor the names of its
14   *    contributors may be used to endorse or promote products derived from this
15   *    software without specific prior written permission.
16   *
17   * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
18   * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19   * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
20   * DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY
21   * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
22   * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
23   * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
24   * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25   * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
26   * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27   *
28   * http://www.pdfbox.org
29   *
30   */
31  package org.pdfbox.util;
32  
33  import java.io.IOException  ;
34  import java.io.StringWriter  ;
35  import java.io.Writer  ;
36  
37  import java.util.ArrayList  ;
38  import java.util.Collections  ;
39  import java.util.HashMap  ;
40  import java.util.Iterator  ;
41  import java.util.List  ;
42  import java.util.Map  ;
43  import java.util.Properties  ;
44  import java.util.Vector  ;
45  
46  import org.pdfbox.cos.COSDocument;
47  import org.pdfbox.cos.COSStream;
48  
49  import org.pdfbox.pdmodel.PDDocument;
50  import org.pdfbox.pdmodel.PDPage;
51  
52  import org.pdfbox.pdmodel.common.PDRectangle;
53  import org.pdfbox.pdmodel.common.PDStream;
54  
55  import org.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItem;
56  import org.pdfbox.pdmodel.interactive.pagenavigation.PDThreadBead;
57  
58  import org.pdfbox.exceptions.CryptographyException;
59  import org.pdfbox.exceptions.InvalidPasswordException;
60  
61  
62  /**
63   * This class will take a pdf document and strip out all of the text and ignore the
64   * formatting and such.
65   *
66   * @author <a HREF="mailto:ben@benlitchfield.com">Ben Litchfield</a>
67   * @version $Revision: 1.69 $
68   */
69  public class PDFTextStripper extends PDFStreamEngine
70  {   
71      private int currentPageNo = 0;
72      private int startPage = 1;
73      private int endPage = Integer.MAX_VALUE;
74      private PDOutlineItem startBookmark = null;
75      private int startBookmarkPageNumber = -1;
76      private PDOutlineItem endBookmark = null;
77      private int endBookmarkPageNumber = -1;
78      private PDDocument document;
79      private boolean suppressDuplicateOverlappingText = true;
80      private boolean shouldSeparateByBeads = true;
81      private boolean sortByPosition = false;
82      
83      private List   pageArticles = null;
84      /**
85       * The charactersByArticle is used to extract text by article divisions.  For example
86       * a PDF that has two columns like a newspaper, we want to extract the first column and
87       * then the second column.  In this example the PDF would have 2 beads(or articles), one for
88       * each column.  The size of the charactersByArticle would be 5, because not all text on the 
89       * screen will fall into one of the articles.  The five divisions are shown below
90       * 
91       * Text before first article
92       * first article text
93       * text between first article and second article
94       * second article text
95       * text after second article
96       * 
97       * Most PDFs won't have any beads, so charactersByArticle will contain a single entry.
98       */
99      protected Vector   charactersByArticle = new Vector  ();
100     
101     private Map   characterListMapping = new HashMap  ();
102     
103     private String   lineSeparator = System.getProperty("line.separator");
104     private String   pageSeparator = System.getProperty("line.separator");
105     private String   wordSeparator = " ";
106     
107     /**
108      * The stream to write the output to.
109      */
110     protected Writer   output;
111     
/**
 * Creates a text stripper whose operator mappings are read from the
 * <code>Resources/PDFTextStripper.properties</code> resource.
 *
 * @throws IOException If the properties resource cannot be loaded.
 */
public PDFTextStripper() throws IOException
{
    super(ResourceLoader.loadProperties("Resources/PDFTextStripper.properties"));
}
121     
/**
 * Creates a text stripper that takes its operator mappings from the supplied
 * properties instead of the default resource file.
 *
 * @param props The properties that map operators to PDFOperator classes;
 * handed unchanged to the superclass constructor.
 *
 * @throws IOException If the superclass fails to read the properties.
 */
public PDFTextStripper(Properties props) throws IOException
{
    super(props);
}
135 
136     /**
137      * This will return the text of a document.  See writeText. <br />
138      * NOTE: The document must not be encrypted when coming into this method.
139      *
140      * @param doc The document to get the text from.
141      *
142      * @return The text of the PDF document.
143      *
144      * @throws IOException if the doc state is invalid or it is encrypted.
145      */
146     public String   getText( PDDocument doc ) throws IOException  
147     {
148         StringWriter   outputStream = new StringWriter  ();
149         writeText( doc, outputStream );
150         return outputStream.toString();
151     }
152 
153     /**
154      * @deprecated
155      * @see PDFTextStripper#getText( PDDocument )
156      * @param doc The document to extract the text from.
157      * @return The document text.
158      * @throws IOException If there is an error extracting the text.
159      */
160     public String   getText( COSDocument doc ) throws IOException  
161     {
162         return getText( new PDDocument( doc ) );
163     }
164 
165     /**
166      * @deprecated
167      * @see PDFTextStripper#writeText( PDDocument, Writer )
168      * @param doc The document to extract the text.
169      * @param outputStream The stream to write the text to.
170      * @throws IOException If there is an error extracting the text.
171      */
172     public void writeText( COSDocument doc, Writer   outputStream ) throws IOException  
173     {
174         writeText( new PDDocument( doc ), outputStream );
175     }
176 
177     /**
178      * This will take a PDDocument and write the text of that document to the print writer.
179      *
180      * @param doc The document to get the data from.
181      * @param outputStream The location to put the text.
182      *
183      * @throws IOException If the doc is in an invalid state.
184      */
185     public void writeText( PDDocument doc, Writer   outputStream ) throws IOException  
186     {
187         resetEngine();
188 
189         currentPageNo = 0;
190         document = doc;
191         output = outputStream;
192         startDocument(document);
193 
194         if( document.isEncrypted() )
195         {
196             // We are expecting non-encrypted documents here, but it is common
197             // for users to pass in a document that is encrypted with an empty
198             // password (such a document appears to not be encrypted by
199             // someone viewing the document, thus the confusion).  We will
200             // attempt to decrypt with the empty password to handle this case.
201             //
202             try
203             {
204                 document.decrypt("");
205             }
206             catch (CryptographyException e)
207             {
208                 throw new IOException  ("Error decrypting document, details: " + e.getMessage());
209             }
210             catch (InvalidPasswordException e)
211             {
212                 throw new IOException  ("Error: document is encrypted");
213             }
214         }
215 
216         processPages( document.getDocumentCatalog().getAllPages() );
217         endDocument(document);
218     }
219 
220     /**
221      * This will process all of the pages and the text that is in them.
222      *
223      * @param pages The pages object in the document.
224      *
225      * @throws IOException If there is an error parsing the text.
226      */
227     protected void processPages( List   pages ) throws IOException  
228     {
229         if( startBookmark != null )
230         {
231             startBookmarkPageNumber = getPageNumber( startBookmark, pages );
232         }
233         
234         if( endBookmark != null )
235         {
236             endBookmarkPageNumber = getPageNumber( endBookmark, pages );
237         }
238         
239         if( startBookmarkPageNumber == -1 && startBookmark != null &&
240             endBookmarkPageNumber == -1 && endBookmark != null &&
241             startBookmark.getCOSObject() == endBookmark.getCOSObject() )
242         {
243             //this is a special case where both the start and end bookmark
244             //are the same but point to nothing.  In this case
245             //we will not extract any text.
246             startBookmarkPageNumber = 0;
247             endBookmarkPageNumber = 0;
248         }
249         
250 
251         Iterator   pageIter = pages.iterator();
252         while( pageIter.hasNext() )
253         {
254             PDPage nextPage = (PDPage)pageIter.next();
255             PDStream contentStream = nextPage.getContents();
256             if( contentStream != null )
257             {
258                 COSStream contents = contentStream.getStream();
259                 processPage( nextPage, contents );
260             }
261         }
262     }
263     
264     private int getPageNumber( PDOutlineItem bookmark, List   allPages ) throws IOException   
265     {
266         int pageNumber = -1;
267         PDPage page = bookmark.findDestinationPage( document );
268         if( page != null )
269         {
270             pageNumber = allPages.indexOf( page )+1;//use one based indexing
271         }
272         return pageNumber;
273     }
274     
275     /**
276      * This method is available for subclasses of this class.  It will be called before processing
277      * of the document start.
278      * 
279      * @param pdf The PDF document that is being processed.
280      * @throws IOException If an IO error occurs.
281      */
282     protected void startDocument(PDDocument pdf) throws IOException   
283     {
284         // no default implementation, but available for subclasses    
285     }
286     
287     /**
288      * This method is available for subclasses of this class.  It will be called after processing
289      * of the document finishes.
290      * 
291      * @param pdf The PDF document that is being processed.
292      * @throws IOException If an IO error occurs.
293      */
294     protected void endDocument(PDDocument pdf ) throws IOException   
295     {
296         // no default implementation, but available for subclasses
297     }
298 
299     /**
300      * This will process the contents of a page.
301      *
302      * @param page The page to process.
303      * @param content The contents of the page.
304      *
305      * @throws IOException If there is an error processing the page.
306      */
307     protected void processPage( PDPage page, COSStream content ) throws IOException  
308     {
309         currentPageNo++;
310         if( currentPageNo >= startPage && currentPageNo <= endPage &&
311             (startBookmarkPageNumber == -1 || currentPageNo >= startBookmarkPageNumber ) && 
312             (endBookmarkPageNumber == -1 || currentPageNo <= endBookmarkPageNumber ))
313         {
314             startPage( page );
315             pageArticles = page.getThreadBeads();
316             int numberOfArticleSections = 1 + pageArticles.size() * 2;
317             if( !shouldSeparateByBeads )
318             {
319                 numberOfArticleSections = 1;
320             }
321             int originalSize = charactersByArticle.size();
322             charactersByArticle.setSize( numberOfArticleSections );
323             for( int i=0; i<numberOfArticleSections; i++ )
324             {
325                 if( numberOfArticleSections < originalSize )
326                 {
327                     ((List  )charactersByArticle.get( i )).clear();
328                 }
329                 else
330                 {
331                     charactersByArticle.set( i, new ArrayList  () );
332                 }
333             }
334             
335             characterListMapping.clear();
336             processStream( page, page.findResources(), content );
337             flushText();
338             endPage( page );
339         }
340         
341     }
342     
343     /**
344      * Start a new paragraph.  Default implementation is to do nothing.  Subclasses
345      * may provide additional information.
346      * 
347      * @throws IOException If there is any error writing to the stream.
348      */
349     protected void startParagraph() throws IOException  
350     {
351         //default is to do nothing.
352     }
353     
354     /**
355      * End a paragraph.  Default implementation is to do nothing.  Subclasses
356      * may provide additional information.
357      * 
358      * @throws IOException If there is any error writing to the stream.
359      */
360     protected void endParagraph() throws IOException  
361     {
362         //default is to do nothing
363     }
364     
365     /**
366      * Start a new page.  Default implementation is to do nothing.  Subclasses
367      * may provide additional information.
368      * 
369      * @param page The page we are about to process.
370      * 
371      * @throws IOException If there is any error writing to the stream.
372      */
373     protected void startPage( PDPage page ) throws IOException  
374     {
375         //default is to do nothing.
376     }
377     
378     /**
379      * End a page.  Default implementation is to do nothing.  Subclasses
380      * may provide additional information.
381      * 
382      * @param page The page we are about to process.
383      * 
384      * @throws IOException If there is any error writing to the stream.
385      */
386     protected void endPage( PDPage page ) throws IOException  
387     {
388         //default is to do nothing
389     }
390 
391     /**
392      * This will print the text to the output stream.
393      *
394      * @throws IOException If there is an error writing the text.
395      */
396     protected void flushText() throws IOException  
397     {
398         float currentY = -1;
399         float lastBaselineFontSize = -1;
400         float endOfLastTextX = -1;
401         float startOfNextWordX = -1;
402         float lastWordSpacing = -1;
403         TextPosition lastProcessedCharacter = null;
404         
405         for( int i=0; i<charactersByArticle.size(); i++)
406         {
407             startParagraph();
408             List   textList = (List  )charactersByArticle.get( i );
409             if( sortByPosition )
410             {
411                 TextPositionComparator comparator = new TextPositionComparator( getCurrentPage() );
412                 Collections.sort( textList, comparator );
413             }
414             Iterator   textIter = textList.iterator();
415             while( textIter.hasNext() )
416             {
417                 TextPosition position = (TextPosition)textIter.next();
418                 String   characterValue = position.getCharacter();
419                 
420                 //wordSpacing = position.getWordSpacing();
421                 float wordSpacing = 0;
422                 
423                 if( wordSpacing == 0 )
424                 {
425                     //try to get width of a space character
426                     wordSpacing = position.getWidthOfSpace();
427                     //if still zero fall back to getting the width of the current
428                     //character
429                     if( wordSpacing == 0 )
430                     {
431                         wordSpacing = position.getWidth();
432                     }
433                 }
434                 
435                 
436                 // RDD - We add a conservative approximation for space determination.
437                 // basically if there is a blank area between two characters that is
438                 //equal to some percentage of the word spacing then that will be the
439                 //start of the next word
440                 if( lastWordSpacing <= 0 )
441                 {
442                     startOfNextWordX = endOfLastTextX + (wordSpacing* 0.50f);
443                 }
444                 else
445                 {
446                     startOfNextWordX = endOfLastTextX + (((wordSpacing+lastWordSpacing)/2f)* 0.50f);
447                 }
448                 
449                 lastWordSpacing = wordSpacing;
450     
451                 // RDD - We will suppress text that is very close to the current line
452                 // and which overwrites previously rendered text on this line.
453                 // This is done specifically to handle a reasonably common situation
454                 // where an application (MS Word, in the case of my examples) renders
455                 // text four times at small (1 point) offsets in order to accomplish
456                 // bold printing.  You would not want to do this step if you were
457                 // going to render the TextPosition objects graphically.
458                 //
459                 /*if ((endOfLastTextX != -1 && position.getX() < endOfLastTextX) &&
460                     (currentY != -1 && Math.abs(position.getY() - currentY) < 1))
461                 {
462                     if (log.isDebugEnabled())
463                     {
464                         log.debug("Suppressing text overwrite" +
465                                   " x: " + position.getX() +
466                                   " endOfLastTextX: " + endOfLastTextX +
467                                   " string: " + position.getCharacter());
468                     }
469                     continue;
470                 }*/
471     
472                 // RDD - Here we determine whether this text object is on the current
473                 // line.  We use the lastBaselineFontSize to handle the superscript
474                 // case, and the size of the current font to handle the subscript case.
475                 // Text must overlap with the last rendered baseline text by at least
476                 // a small amount in order to be considered as being on the same line.
477                 //
478                 int verticalScaling = 1;
479                 if( lastBaselineFontSize < 0 || position.getFontSize() < 0 )
480                 {
481                     verticalScaling = -1;
482                 }
483                 if (currentY != -1 &&
484                     ((position.getY() < (currentY - (lastBaselineFontSize * 0.9f * verticalScaling))) ||
485                      (position.getY() > (currentY + (position.getFontSize() * 0.9f * verticalScaling)))))
486                 {
487                     output.write(getLineSeparator());
488                     endOfLastTextX = -1;
489                     startOfNextWordX = -1;
490                     currentY = -1;
491                     lastBaselineFontSize = -1;
492                 }
493     
494                 if (startOfNextWordX != -1 && startOfNextWordX < position.getX() &&
495                    lastProcessedCharacter != null &&
496                    //only bother adding a space if the last character was not a space
497                    lastProcessedCharacter.getCharacter() != null &&
498                    !lastProcessedCharacter.getCharacter().endsWith( " " ) )
499                 {
500                     output.write(getWordSeparator());
501                 }
502     
503                 if (currentY == -1)
504                 {
505                     currentY = position.getY();
506                 }
507     
508                 if (currentY == position.getY())
509                 {
510                     lastBaselineFontSize = position.getFontSize();
511                 }
512     
513                 // RDD - endX is what PDF considers to be the x coordinate of the
514                 // end position of the text.  We use it in computing our metrics below.
515                 //
516                 endOfLastTextX = position.getX() + position.getWidth();
517     
518     
519                 if (characterValue != null)
520                 {
521                     writeCharacters( position );
522                 }
523                 else
524                 {
525                     //Position.getString() is null so not writing anything
526                 }
527                 lastProcessedCharacter = position;
528             }
529             endParagraph();
530         }
531         
532 
533         // RDD - newline at end of flush - required for end of page (so that the top
534         // of the next page starts on its own line.
535         //
536         output.write(getPageSeparator());
537 
538         output.flush();
539     }
540     
541     /**
542      * Write the string to the output stream.
543      *  
544      * @param text The text to write to the stream.
545      * @throws IOException If there is an error when writing the text.
546      */
547     protected void writeCharacters( TextPosition text ) throws IOException  
548     {
549         output.write( text.getCharacter() );
550     }
551 
552     /**
553      * This will determine of two floating point numbers are within a specified variance.
554      *
555      * @param first The first number to compare to.
556      * @param second The second number to compare to.
557      * @param variance The allowed variance.
558      */
559     private boolean within( float first, float second, float variance )
560     {
561         return second > first - variance && second < first + variance;
562     }
563 
564     /**
565      * This will show add a character to the list of characters to be printed to
566      * the text file.
567      *
568      * @param text The description of the character to display.
569      */
570     protected void showCharacter( TextPosition text )
571     {
572         boolean showCharacter = true;
573         if( suppressDuplicateOverlappingText )
574         {
575             showCharacter = false;
576             String   textCharacter = text.getCharacter();
577             float textX = text.getX();
578             float textY = text.getY();
579             List   sameTextCharacters = (List  )characterListMapping.get( textCharacter );
580             if( sameTextCharacters == null )
581             {
582                 sameTextCharacters = new ArrayList  ();
583                 characterListMapping.put( textCharacter, sameTextCharacters );
584             }
585     
586             // RDD - Here we compute the value that represents the end of the rendered
587             // text.  This value is used to determine whether subsequent text rendered
588             // on the same line overwrites the current text.
589             //
590             // We subtract any positive padding to handle cases where extreme amounts
591             // of padding are applied, then backed off (not sure why this is done, but there
592             // are cases where the padding is on the order of 10x the character width, and
593             // the TJ just backs up to compensate after each character).  Also, we subtract
594             // an amount to allow for kerning (a percentage of the width of the last
595             // character).
596             //
597             boolean suppressCharacter = false;
598             float tolerance = (text.getWidth()/textCharacter.length())/3.0f;
599             for( int i=0; i<sameTextCharacters.size() && textCharacter != null; i++ )
600             {
601                 TextPosition character = (TextPosition)sameTextCharacters.get( i );
602                 String   charCharacter = character.getCharacter();
603                 float charX = character.getX();
604                 float charY = character.getY();
605                 //only want to suppress
606                 
607                 if( charCharacter != null &&
608                     //charCharacter.equals( textCharacter ) &&
609                     within( charX, textX, tolerance ) &&
610                     within( charY, 
611                             textY, 
612                             tolerance ) )
613                 {
614                     suppressCharacter = true;
615                 }
616             }
617             if( !suppressCharacter )
618             {
619                 sameTextCharacters.add( text );
620                 showCharacter = true;
621             }
622         }
623         
624         if( showCharacter )
625         {
626             //if we are showing the character then we need to determine which
627             //article it belongs to.
628             int foundArticleDivisionIndex = -1;
629             int notFoundButFirstLeftAndAboveArticleDivisionIndex = -1;
630             int notFoundButFirstLeftArticleDivisionIndex = -1;
631             int notFoundButFirstAboveArticleDivisionIndex = -1;
632             float x = text.getX();
633             float y = text.getY();
634             if( shouldSeparateByBeads )
635             {
636                 for( int i=0; i<pageArticles.size() && foundArticleDivisionIndex == -1; i++ )
637                 {
638                     PDThreadBead bead = (PDThreadBead)pageArticles.get( i );
639                     if( bead != null )
640                     {
641                         PDRectangle rect = bead.getRectangle();
642                         if( rect.contains( x, y ) )
643                         {
644                             foundArticleDivisionIndex = i*2+1;
645                         }
646                         else if( (x < rect.getLowerLeftX() ||
647                                   y < rect.getUpperRightY()) &&
648                             notFoundButFirstLeftAndAboveArticleDivisionIndex == -1)
649                         {
650                             notFoundButFirstLeftAndAboveArticleDivisionIndex = i*2;
651                         }
652                         else if( x < rect.getLowerLeftX() &&
653                                 notFoundButFirstLeftArticleDivisionIndex == -1)
654                         {
655                             notFoundButFirstLeftArticleDivisionIndex = i*2;
656                         }
657                         else if( y < rect.getUpperRightY() &&
658                                 notFoundButFirstAboveArticleDivisionIndex == -1)
659                         {
660                             notFoundButFirstAboveArticleDivisionIndex = i*2;
661                         }                        
662                     }
663                     else
664                     {
665                         foundArticleDivisionIndex = 0;
666                     }
667                 }
668             }
669             else
670             {
671                 foundArticleDivisionIndex = 0;
672             }
673             int articleDivisionIndex = -1;
674             if( foundArticleDivisionIndex != -1 )
675             {
676                 articleDivisionIndex = foundArticleDivisionIndex;
677             }
678             else if( notFoundButFirstLeftAndAboveArticleDivisionIndex != -1 )
679             {
680                 articleDivisionIndex = notFoundButFirstLeftAndAboveArticleDivisionIndex;
681             }
682             else if( notFoundButFirstLeftArticleDivisionIndex != -1 )
683             {
684                 articleDivisionIndex = notFoundButFirstLeftArticleDivisionIndex;
685             }
686             else if( notFoundButFirstAboveArticleDivisionIndex != -1 )
687             {
688                 articleDivisionIndex = notFoundButFirstAboveArticleDivisionIndex;
689             }
690             else
691             {
692                 articleDivisionIndex = charactersByArticle.size()-1;
693             }
694             List   textList = (List  ) charactersByArticle.get( articleDivisionIndex );
695             textList.add( text );
696         }
697     }
698 
699     /**
700      * This is the page that the text extraction will start on.  The pages start
701      * at page 1.  For example in a 5 page PDF document, if the start page is 1
702      * then all pages will be extracted.  If the start page is 4 then pages 4 and 5
703      * will be extracted.  The default value is 1.
704      *
705      * @return Value of property startPage.
706      */
707     public int getStartPage()
708     {
709         return startPage;
710     }
711 
712     /**
713      * This will set the first page to be extracted by this class.
714      *
715      * @param startPageValue New value of property startPage.
716      */
717     public void setStartPage(int startPageValue)
718     {
719         startPage = startPageValue;
720     }
721 
722     /**
723      * This will get the last page that will be extracted.  This is inclusive,
724      * for example if a 5 page PDF an endPage value of 5 would extract the
725      * entire document, an end page of 2 would extract pages 1 and 2.  This defaults
726      * to Integer.MAX_VALUE such that all pages of the pdf will be extracted.
727      *
728      * @return Value of property endPage.
729      */
730     public int getEndPage()
731     {
732         return endPage;
733     }
734 
735     /**
736      * This will set the last page to be extracted by this class.
737      *
738      * @param endPageValue New value of property endPage.
739      */
740     public void setEndPage(int endPageValue)
741     {
742         endPage = endPageValue;
743     }
744 
745     /**
746      * Set the desired line separator for output text.  The line.separator
747      * system property is used if the line separator preference is not set
748      * explicitly using this method.
749      *
750      * @param separator The desired line separator string.
751      */
752     public void setLineSeparator(String   separator)
753     {
754         lineSeparator = separator;
755     }
756 
757     /**
758      * This will get the line separator.
759      *
760      * @return The desired line separator string.
761      */
762     public String   getLineSeparator()
763     {
764         return lineSeparator;
765     }
766 
767     /**
768      * Set the desired page separator for output text.  The line.separator
769      * system property is used if the page separator preference is not set
770      * explicitly using this method.
771      *
772      * @param separator The desired page separator string.
773      */
774     public void setPageSeparator(String   separator)
775     {
776         pageSeparator = separator;
777     }
778 
779     /**
780      * This will get the word separator.
781      *
782      * @return The desired word separator string.
783      */
784     public String   getWordSeparator()
785     {
786         return wordSeparator;
787     }
788 
789     /**
790      * Set the desired word separator for output text.  The PDFBox text extraction
791      * algorithm will output a space character if there is enough space between
792      * two words.  By default a space character is used.  If you need and accurate
793      * count of characters that are found in a PDF document then you might want to
794      * set the word separator to the empty string.
795      *
796      * @param separator The desired page separator string.
797      */
798     public void setWordSeparator(String   separator)
799     {
800         wordSeparator = separator;
801     }
802 
803     /**
804      * This will get the page separator.
805      *
806      * @return The page separator string.
807      */
808     public String   getPageSeparator()
809     {
810         return pageSeparator;
811     }
812     /**
813      * @return Returns the suppressDuplicateOverlappingText.
814      */
815     public boolean shouldSuppressDuplicateOverlappingText()
816     {
817         return suppressDuplicateOverlappingText;
818     }
819     
820     /**
821      * Get the current page number that is being processed.
822      * 
823      * @return A 1 based number representing the current page.
824      */
825     protected int getCurrentPageNo() 
826     {
827         return currentPageNo;
828     }
829 
830     /**
831      * The output stream that is being written to.
832      * 
833      * @return The stream that output is being written to.
834      */
835     protected Writer   getOutput() 
836     {
837         return output;
838     }
839     
840     /**
841      * Character strings are grouped by articles.  It is quite common that there
842      * will only be a single article.  This returns a List that contains List objects,
843      * the inner lists will contain TextPosition objects.
844      * 
845      * @return A double List of TextPositions for all text strings on the page.
846      */
847     protected List   getCharactersByArticle()
848     {
849         return charactersByArticle;
850     }
851     
852     /**
853      * By default the text stripper will attempt to remove text that overlapps each other.
854      * Word paints the same character several times in order to make it look bold.  By setting
855      * this to false all text will be extracted, which means that certain sections will be 
856      * duplicated, but better performance will be noticed.
857      * 
858      * @param suppressDuplicateOverlappingTextValue The suppressDuplicateOverlappingText to set.
859      */
860     public void setSuppressDuplicateOverlappingText(
861             boolean suppressDuplicateOverlappingTextValue)
862     {
863         this.suppressDuplicateOverlappingText = suppressDuplicateOverlappingTextValue;
864     }
865     
866     /**
867      * This will tell if the text stripper should separate by beads.
868      * 
869      * @return If the text will be grouped by beads.
870      */
871     public boolean shouldSeparateByBeads()
872     {
873         return shouldSeparateByBeads;
874     }
875     
876     /**
877      * Set if the text stripper should group the text output by a list of beads.  The default value is true!
878      * 
879      * @param aShouldSeparateByBeads The new grouping of beads.
880      */
881     public void setShouldSeparateByBeads(boolean aShouldSeparateByBeads)
882     {
883         this.shouldSeparateByBeads = aShouldSeparateByBeads;
884     }
885     
886     /**
887      * Get the bookmark where text extraction should end, inclusive.  Default is null.
888      * 
889      * @return The ending bookmark.
890      */
891     public PDOutlineItem getEndBookmark()
892     {
893         return endBookmark;
894     }
895     
896     /**
897      * Set the bookmark where the text extraction should stop.
898      * 
899      * @param aEndBookmark The ending bookmark.
900      */
901     public void setEndBookmark(PDOutlineItem aEndBookmark)
902     {
903         endBookmark = aEndBookmark;
904     }
905     
906     /**
907      * Get the bookmark where text extraction should start, inclusive.  Default is null.
908      * 
909      * @return The starting bookmark.
910      */
911     public PDOutlineItem getStartBookmark()
912     {
913         return startBookmark;
914     }
915     
916     /**
917      * Set the bookmark where text extraction should start, inclusive.
918      * 
919      * @param aStartBookmark The starting bookmark.
920      */
921     public void setStartBookmark(PDOutlineItem aStartBookmark)
922     {
923         startBookmark = aStartBookmark;
924     }
925 
926     /**
927      * This will tell if the text stripper should sort the text tokens
928      * before writing to the stream.
929      * 
930      * @return true If the text tokens will be sorted before being written.
931      */
932     public boolean shouldSortByPosition() 
933     {
934         return sortByPosition;
935     }
936 
937     /**
938      * The order of the text tokens in a PDF file may not be in the same
939      * as they appear visually on the screen.  For example, a PDF writer may
940      * write out all text by font, so all bold or larger text, then make a second
941      * pass and write out the normal text.<br/>
942      * The default is to <b>not</b> sort by position.<br/>
943      * <br/>
944      * A PDF writer could choose to write each character in a different order.  By
945      * default PDFBox does <b>not</b> sort the text tokens before processing them due to
946      * performance reasons.
947      *     
948      * @param newSortByPosition Tell PDFBox to sort the text positions.
949      */
950     public void setSortByPosition(boolean newSortByPosition) 
951     {
952         sortByPosition = newSortByPosition;
953     }
954 }
A to Z: JavaDoc & Examples Daily Java News & Articles Open Source Projects Open Source Codes Free Computer Books Remove Frame
Popular Tags