KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > pdfbox > util > PDFTextStripper


1 /**
2  * Copyright (c) 2003-2005, www.pdfbox.org
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions are met:
7  *
8  * 1. Redistributions of source code must retain the above copyright notice,
9  * this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright notice,
11  * this list of conditions and the following disclaimer in the documentation
12  * and/or other materials provided with the distribution.
13  * 3. Neither the name of pdfbox; nor the names of its
14  * contributors may be used to endorse or promote products derived from this
15  * software without specific prior written permission.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
18  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
20  * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY
21  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
22  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
23  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
24  * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
26  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27  *
28  * http://www.pdfbox.org
29  *
30  */

31 package org.pdfbox.util;
32
33 import java.io.IOException JavaDoc;
34 import java.io.StringWriter JavaDoc;
35 import java.io.Writer JavaDoc;
36
37 import java.util.ArrayList JavaDoc;
38 import java.util.Collections JavaDoc;
39 import java.util.HashMap JavaDoc;
40 import java.util.Iterator JavaDoc;
41 import java.util.List JavaDoc;
42 import java.util.Map JavaDoc;
43 import java.util.Properties JavaDoc;
44 import java.util.Vector JavaDoc;
45
46 import org.pdfbox.cos.COSDocument;
47 import org.pdfbox.cos.COSStream;
48
49 import org.pdfbox.pdmodel.PDDocument;
50 import org.pdfbox.pdmodel.PDPage;
51
52 import org.pdfbox.pdmodel.common.PDRectangle;
53 import org.pdfbox.pdmodel.common.PDStream;
54
55 import org.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItem;
56 import org.pdfbox.pdmodel.interactive.pagenavigation.PDThreadBead;
57
58 import org.pdfbox.exceptions.CryptographyException;
59 import org.pdfbox.exceptions.InvalidPasswordException;
60
61
62 /**
63  * This class will take a pdf document and strip out all of the text and ignore the
64  * formatting and such.
65  *
66  * @author <a HREF="mailto:ben@benlitchfield.com">Ben Litchfield</a>
67  * @version $Revision: 1.69 $
68  */

69 public class PDFTextStripper extends PDFStreamEngine
70 {
71     private int currentPageNo = 0;
72     private int startPage = 1;
73     private int endPage = Integer.MAX_VALUE;
74     private PDOutlineItem startBookmark = null;
75     private int startBookmarkPageNumber = -1;
76     private PDOutlineItem endBookmark = null;
77     private int endBookmarkPageNumber = -1;
78     private PDDocument document;
79     private boolean suppressDuplicateOverlappingText = true;
80     private boolean shouldSeparateByBeads = true;
81     private boolean sortByPosition = false;
82     
83     private List JavaDoc pageArticles = null;
84     /**
85      * The charactersByArticle is used to extract text by article divisions. For example
86      * a PDF that has two columns like a newspaper, we want to extract the first column and
87      * then the second column. In this example the PDF would have 2 beads(or articles), one for
88      * each column. The size of the charactersByArticle would be 5, because not all text on the
89      * screen will fall into one of the articles. The five divisions are shown below
90      *
91      * Text before first article
92      * first article text
93      * text between first article and second article
94      * second article text
95      * text after second article
96      *
97      * Most PDFs won't have any beads, so charactersByArticle will contain a single entry.
98      */

99     protected Vector JavaDoc charactersByArticle = new Vector JavaDoc();
100     
101     private Map JavaDoc characterListMapping = new HashMap JavaDoc();
102     
103     private String JavaDoc lineSeparator = System.getProperty("line.separator");
104     private String JavaDoc pageSeparator = System.getProperty("line.separator");
105     private String JavaDoc wordSeparator = " ";
106     
107     /**
108      * The stream to write the output to.
109      */

110     protected Writer JavaDoc output;
111     
/**
 * Instantiate a new PDFTextStripper object. The operator mappings are loaded
 * from Resources/PDFTextStripper.properties through the ResourceLoader and
 * handed to the superclass constructor.
 *
 * @throws IOException If there is an error loading the properties.
 */
public PDFTextStripper() throws IOException
{
    super( ResourceLoader.loadProperties( "Resources/PDFTextStripper.properties" ) );
}
121     
/**
 * Instantiate a new PDFTextStripper object, taking all of the operator
 * mappings from the properties object that is passed in. The properties are
 * forwarded unchanged to the superclass constructor.
 *
 * @param props The properties containing the mapping of operators to PDFOperator
 * classes.
 *
 * @throws IOException If there is an error reading the properties.
 */
public PDFTextStripper( Properties props ) throws IOException
{
    super( props );
}
135
136     /**
137      * This will return the text of a document. See writeText. <br />
138      * NOTE: The document must not be encrypted when coming into this method.
139      *
140      * @param doc The document to get the text from.
141      *
142      * @return The text of the PDF document.
143      *
144      * @throws IOException if the doc state is invalid or it is encrypted.
145      */

146     public String JavaDoc getText( PDDocument doc ) throws IOException JavaDoc
147     {
148         StringWriter JavaDoc outputStream = new StringWriter JavaDoc();
149         writeText( doc, outputStream );
150         return outputStream.toString();
151     }
152
153     /**
154      * @deprecated
155      * @see PDFTextStripper#getText( PDDocument )
156      * @param doc The document to extract the text from.
157      * @return The document text.
158      * @throws IOException If there is an error extracting the text.
159      */

160     public String JavaDoc getText( COSDocument doc ) throws IOException JavaDoc
161     {
162         return getText( new PDDocument( doc ) );
163     }
164
165     /**
166      * @deprecated
167      * @see PDFTextStripper#writeText( PDDocument, Writer )
168      * @param doc The document to extract the text.
169      * @param outputStream The stream to write the text to.
170      * @throws IOException If there is an error extracting the text.
171      */

172     public void writeText( COSDocument doc, Writer JavaDoc outputStream ) throws IOException JavaDoc
173     {
174         writeText( new PDDocument( doc ), outputStream );
175     }
176
177     /**
178      * This will take a PDDocument and write the text of that document to the print writer.
179      *
180      * @param doc The document to get the data from.
181      * @param outputStream The location to put the text.
182      *
183      * @throws IOException If the doc is in an invalid state.
184      */

185     public void writeText( PDDocument doc, Writer JavaDoc outputStream ) throws IOException JavaDoc
186     {
187         resetEngine();
188
189         currentPageNo = 0;
190         document = doc;
191         output = outputStream;
192         startDocument(document);
193
194         if( document.isEncrypted() )
195         {
196             // We are expecting non-encrypted documents here, but it is common
197
// for users to pass in a document that is encrypted with an empty
198
// password (such a document appears to not be encrypted by
199
// someone viewing the document, thus the confusion). We will
200
// attempt to decrypt with the empty password to handle this case.
201
//
202
try
203             {
204                 document.decrypt("");
205             }
206             catch (CryptographyException e)
207             {
208                 throw new IOException JavaDoc("Error decrypting document, details: " + e.getMessage());
209             }
210             catch (InvalidPasswordException e)
211             {
212                 throw new IOException JavaDoc("Error: document is encrypted");
213             }
214         }
215
216         processPages( document.getDocumentCatalog().getAllPages() );
217         endDocument(document);
218     }
219
220     /**
221      * This will process all of the pages and the text that is in them.
222      *
223      * @param pages The pages object in the document.
224      *
225      * @throws IOException If there is an error parsing the text.
226      */

227     protected void processPages( List JavaDoc pages ) throws IOException JavaDoc
228     {
229         if( startBookmark != null )
230         {
231             startBookmarkPageNumber = getPageNumber( startBookmark, pages );
232         }
233         
234         if( endBookmark != null )
235         {
236             endBookmarkPageNumber = getPageNumber( endBookmark, pages );
237         }
238         
239         if( startBookmarkPageNumber == -1 && startBookmark != null &&
240             endBookmarkPageNumber == -1 && endBookmark != null &&
241             startBookmark.getCOSObject() == endBookmark.getCOSObject() )
242         {
243             //this is a special case where both the start and end bookmark
244
//are the same but point to nothing. In this case
245
//we will not extract any text.
246
startBookmarkPageNumber = 0;
247             endBookmarkPageNumber = 0;
248         }
249         
250
251         Iterator JavaDoc pageIter = pages.iterator();
252         while( pageIter.hasNext() )
253         {
254             PDPage nextPage = (PDPage)pageIter.next();
255             PDStream contentStream = nextPage.getContents();
256             if( contentStream != null )
257             {
258                 COSStream contents = contentStream.getStream();
259                 processPage( nextPage, contents );
260             }
261         }
262     }
263     
264     private int getPageNumber( PDOutlineItem bookmark, List JavaDoc allPages ) throws IOException JavaDoc
265     {
266         int pageNumber = -1;
267         PDPage page = bookmark.findDestinationPage( document );
268         if( page != null )
269         {
270             pageNumber = allPages.indexOf( page )+1;//use one based indexing
271
}
272         return pageNumber;
273     }
274     
275     /**
276      * This method is available for subclasses of this class. It will be called before processing
277      * of the document start.
278      *
279      * @param pdf The PDF document that is being processed.
280      * @throws IOException If an IO error occurs.
281      */

282     protected void startDocument(PDDocument pdf) throws IOException JavaDoc
283     {
284         // no default implementation, but available for subclasses
285
}
286     
287     /**
288      * This method is available for subclasses of this class. It will be called after processing
289      * of the document finishes.
290      *
291      * @param pdf The PDF document that is being processed.
292      * @throws IOException If an IO error occurs.
293      */

294     protected void endDocument(PDDocument pdf ) throws IOException JavaDoc
295     {
296         // no default implementation, but available for subclasses
297
}
298
299     /**
300      * This will process the contents of a page.
301      *
302      * @param page The page to process.
303      * @param content The contents of the page.
304      *
305      * @throws IOException If there is an error processing the page.
306      */

307     protected void processPage( PDPage page, COSStream content ) throws IOException JavaDoc
308     {
309         currentPageNo++;
310         if( currentPageNo >= startPage && currentPageNo <= endPage &&
311             (startBookmarkPageNumber == -1 || currentPageNo >= startBookmarkPageNumber ) &&
312             (endBookmarkPageNumber == -1 || currentPageNo <= endBookmarkPageNumber ))
313         {
314             startPage( page );
315             pageArticles = page.getThreadBeads();
316             int numberOfArticleSections = 1 + pageArticles.size() * 2;
317             if( !shouldSeparateByBeads )
318             {
319                 numberOfArticleSections = 1;
320             }
321             int originalSize = charactersByArticle.size();
322             charactersByArticle.setSize( numberOfArticleSections );
323             for( int i=0; i<numberOfArticleSections; i++ )
324             {
325                 if( numberOfArticleSections < originalSize )
326                 {
327                     ((List JavaDoc)charactersByArticle.get( i )).clear();
328                 }
329                 else
330                 {
331                     charactersByArticle.set( i, new ArrayList JavaDoc() );
332                 }
333             }
334             
335             characterListMapping.clear();
336             processStream( page, page.findResources(), content );
337             flushText();
338             endPage( page );
339         }
340         
341     }
342     
343     /**
344      * Start a new paragraph. Default implementation is to do nothing. Subclasses
345      * may provide additional information.
346      *
347      * @throws IOException If there is any error writing to the stream.
348      */

349     protected void startParagraph() throws IOException JavaDoc
350     {
351         //default is to do nothing.
352
}
353     
354     /**
355      * End a paragraph. Default implementation is to do nothing. Subclasses
356      * may provide additional information.
357      *
358      * @throws IOException If there is any error writing to the stream.
359      */

360     protected void endParagraph() throws IOException JavaDoc
361     {
362         //default is to do nothing
363
}
364     
365     /**
366      * Start a new page. Default implementation is to do nothing. Subclasses
367      * may provide additional information.
368      *
369      * @param page The page we are about to process.
370      *
371      * @throws IOException If there is any error writing to the stream.
372      */

373     protected void startPage( PDPage page ) throws IOException JavaDoc
374     {
375         //default is to do nothing.
376
}
377     
378     /**
379      * End a page. Default implementation is to do nothing. Subclasses
380      * may provide additional information.
381      *
382      * @param page The page we are about to process.
383      *
384      * @throws IOException If there is any error writing to the stream.
385      */

386     protected void endPage( PDPage page ) throws IOException JavaDoc
387     {
388         //default is to do nothing
389
}
390
391     /**
392      * This will print the text to the output stream.
393      *
394      * @throws IOException If there is an error writing the text.
395      */

396     protected void flushText() throws IOException JavaDoc
397     {
398         float currentY = -1;
399         float lastBaselineFontSize = -1;
400         float endOfLastTextX = -1;
401         float startOfNextWordX = -1;
402         float lastWordSpacing = -1;
403         TextPosition lastProcessedCharacter = null;
404         
405         for( int i=0; i<charactersByArticle.size(); i++)
406         {
407             startParagraph();
408             List JavaDoc textList = (List JavaDoc)charactersByArticle.get( i );
409             if( sortByPosition )
410             {
411                 TextPositionComparator comparator = new TextPositionComparator( getCurrentPage() );
412                 Collections.sort( textList, comparator );
413             }
414             Iterator JavaDoc textIter = textList.iterator();
415             while( textIter.hasNext() )
416             {
417                 TextPosition position = (TextPosition)textIter.next();
418                 String JavaDoc characterValue = position.getCharacter();
419                 
420                 //wordSpacing = position.getWordSpacing();
421
float wordSpacing = 0;
422                 
423                 if( wordSpacing == 0 )
424                 {
425                     //try to get width of a space character
426
wordSpacing = position.getWidthOfSpace();
427                     //if still zero fall back to getting the width of the current
428
//character
429
if( wordSpacing == 0 )
430                     {
431                         wordSpacing = position.getWidth();
432                     }
433                 }
434                 
435                 
436                 // RDD - We add a conservative approximation for space determination.
437
// basically if there is a blank area between two characters that is
438
//equal to some percentage of the word spacing then that will be the
439
//start of the next word
440
if( lastWordSpacing <= 0 )
441                 {
442                     startOfNextWordX = endOfLastTextX + (wordSpacing* 0.50f);
443                 }
444                 else
445                 {
446                     startOfNextWordX = endOfLastTextX + (((wordSpacing+lastWordSpacing)/2f)* 0.50f);
447                 }
448                 
449                 lastWordSpacing = wordSpacing;
450     
451                 // RDD - We will suppress text that is very close to the current line
452
// and which overwrites previously rendered text on this line.
453
// This is done specifically to handle a reasonably common situation
454
// where an application (MS Word, in the case of my examples) renders
455
// text four times at small (1 point) offsets in order to accomplish
456
// bold printing. You would not want to do this step if you were
457
// going to render the TextPosition objects graphically.
458
//
459
/*if ((endOfLastTextX != -1 && position.getX() < endOfLastTextX) &&
460                     (currentY != -1 && Math.abs(position.getY() - currentY) < 1))
461                 {
462                     if (log.isDebugEnabled())
463                     {
464                         log.debug("Suppressing text overwrite" +
465                                   " x: " + position.getX() +
466                                   " endOfLastTextX: " + endOfLastTextX +
467                                   " string: " + position.getCharacter());
468                     }
469                     continue;
470                 }*/

471     
472                 // RDD - Here we determine whether this text object is on the current
473
// line. We use the lastBaselineFontSize to handle the superscript
474
// case, and the size of the current font to handle the subscript case.
475
// Text must overlap with the last rendered baseline text by at least
476
// a small amount in order to be considered as being on the same line.
477
//
478
int verticalScaling = 1;
479                 if( lastBaselineFontSize < 0 || position.getFontSize() < 0 )
480                 {
481                     verticalScaling = -1;
482                 }
483                 if (currentY != -1 &&
484                     ((position.getY() < (currentY - (lastBaselineFontSize * 0.9f * verticalScaling))) ||
485                      (position.getY() > (currentY + (position.getFontSize() * 0.9f * verticalScaling)))))
486                 {
487                     output.write(getLineSeparator());
488                     endOfLastTextX = -1;
489                     startOfNextWordX = -1;
490                     currentY = -1;
491                     lastBaselineFontSize = -1;
492                 }
493     
494                 if (startOfNextWordX != -1 && startOfNextWordX < position.getX() &&
495                    lastProcessedCharacter != null &&
496                    //only bother adding a space if the last character was not a space
497
lastProcessedCharacter.getCharacter() != null &&
498                    !lastProcessedCharacter.getCharacter().endsWith( " " ) )
499                 {
500                     output.write(getWordSeparator());
501                 }
502     
503                 if (currentY == -1)
504                 {
505                     currentY = position.getY();
506                 }
507     
508                 if (currentY == position.getY())
509                 {
510                     lastBaselineFontSize = position.getFontSize();
511                 }
512     
513                 // RDD - endX is what PDF considers to be the x coordinate of the
514
// end position of the text. We use it in computing our metrics below.
515
//
516
endOfLastTextX = position.getX() + position.getWidth();
517     
518     
519                 if (characterValue != null)
520                 {
521                     writeCharacters( position );
522                 }
523                 else
524                 {
525                     //Position.getString() is null so not writing anything
526
}
527                 lastProcessedCharacter = position;
528             }
529             endParagraph();
530         }
531         
532
533         // RDD - newline at end of flush - required for end of page (so that the top
534
// of the next page starts on its own line.
535
//
536
output.write(getPageSeparator());
537
538         output.flush();
539     }
540     
541     /**
542      * Write the string to the output stream.
543      *
544      * @param text The text to write to the stream.
545      * @throws IOException If there is an error when writing the text.
546      */

547     protected void writeCharacters( TextPosition text ) throws IOException JavaDoc
548     {
549         output.write( text.getCharacter() );
550     }
551
552     /**
553      * This will determine of two floating point numbers are within a specified variance.
554      *
555      * @param first The first number to compare to.
556      * @param second The second number to compare to.
557      * @param variance The allowed variance.
558      */

559     private boolean within( float first, float second, float variance )
560     {
561         return second > first - variance && second < first + variance;
562     }
563
564     /**
565      * This will show add a character to the list of characters to be printed to
566      * the text file.
567      *
568      * @param text The description of the character to display.
569      */

570     protected void showCharacter( TextPosition text )
571     {
572         boolean showCharacter = true;
573         if( suppressDuplicateOverlappingText )
574         {
575             showCharacter = false;
576             String JavaDoc textCharacter = text.getCharacter();
577             float textX = text.getX();
578             float textY = text.getY();
579             List JavaDoc sameTextCharacters = (List JavaDoc)characterListMapping.get( textCharacter );
580             if( sameTextCharacters == null )
581             {
582                 sameTextCharacters = new ArrayList JavaDoc();
583                 characterListMapping.put( textCharacter, sameTextCharacters );
584             }
585     
586             // RDD - Here we compute the value that represents the end of the rendered
587
// text. This value is used to determine whether subsequent text rendered
588
// on the same line overwrites the current text.
589
//
590
// We subtract any positive padding to handle cases where extreme amounts
591
// of padding are applied, then backed off (not sure why this is done, but there
592
// are cases where the padding is on the order of 10x the character width, and
593
// the TJ just backs up to compensate after each character). Also, we subtract
594
// an amount to allow for kerning (a percentage of the width of the last
595
// character).
596
//
597
boolean suppressCharacter = false;
598             float tolerance = (text.getWidth()/textCharacter.length())/3.0f;
599             for( int i=0; i<sameTextCharacters.size() && textCharacter != null; i++ )
600             {
601                 TextPosition character = (TextPosition)sameTextCharacters.get( i );
602                 String JavaDoc charCharacter = character.getCharacter();
603                 float charX = character.getX();
604                 float charY = character.getY();
605                 //only want to suppress
606

607                 if( charCharacter != null &&
608                     //charCharacter.equals( textCharacter ) &&
609
within( charX, textX, tolerance ) &&
610                     within( charY,
611                             textY,
612                             tolerance ) )
613                 {
614                     suppressCharacter = true;
615                 }
616             }
617             if( !suppressCharacter )
618             {
619                 sameTextCharacters.add( text );
620                 showCharacter = true;
621             }
622         }
623         
624         if( showCharacter )
625         {
626             //if we are showing the character then we need to determine which
627
//article it belongs to.
628
int foundArticleDivisionIndex = -1;
629             int notFoundButFirstLeftAndAboveArticleDivisionIndex = -1;
630             int notFoundButFirstLeftArticleDivisionIndex = -1;
631             int notFoundButFirstAboveArticleDivisionIndex = -1;
632             float x = text.getX();
633             float y = text.getY();
634             if( shouldSeparateByBeads )
635             {
636                 for( int i=0; i<pageArticles.size() && foundArticleDivisionIndex == -1; i++ )
637                 {
638                     PDThreadBead bead = (PDThreadBead)pageArticles.get( i );
639                     if( bead != null )
640                     {
641                         PDRectangle rect = bead.getRectangle();
642                         if( rect.contains( x, y ) )
643                         {
644                             foundArticleDivisionIndex = i*2+1;
645                         }
646                         else if( (x < rect.getLowerLeftX() ||
647                                   y < rect.getUpperRightY()) &&
648                             notFoundButFirstLeftAndAboveArticleDivisionIndex == -1)
649                         {
650                             notFoundButFirstLeftAndAboveArticleDivisionIndex = i*2;
651                         }
652                         else if( x < rect.getLowerLeftX() &&
653                                 notFoundButFirstLeftArticleDivisionIndex == -1)
654                         {
655                             notFoundButFirstLeftArticleDivisionIndex = i*2;
656                         }
657                         else if( y < rect.getUpperRightY() &&
658                                 notFoundButFirstAboveArticleDivisionIndex == -1)
659                         {
660                             notFoundButFirstAboveArticleDivisionIndex = i*2;
661                         }
662                     }
663                     else
664                     {
665                         foundArticleDivisionIndex = 0;
666                     }
667                 }
668             }
669             else
670             {
671                 foundArticleDivisionIndex = 0;
672             }
673             int articleDivisionIndex = -1;
674             if( foundArticleDivisionIndex != -1 )
675             {
676                 articleDivisionIndex = foundArticleDivisionIndex;
677             }
678             else if( notFoundButFirstLeftAndAboveArticleDivisionIndex != -1 )
679             {
680                 articleDivisionIndex = notFoundButFirstLeftAndAboveArticleDivisionIndex;
681             }
682             else if( notFoundButFirstLeftArticleDivisionIndex != -1 )
683             {
684                 articleDivisionIndex = notFoundButFirstLeftArticleDivisionIndex;
685             }
686             else if( notFoundButFirstAboveArticleDivisionIndex != -1 )
687             {
688                 articleDivisionIndex = notFoundButFirstAboveArticleDivisionIndex;
689             }
690             else
691             {
692                 articleDivisionIndex = charactersByArticle.size()-1;
693             }
694             List JavaDoc textList = (List JavaDoc) charactersByArticle.get( articleDivisionIndex );
695             textList.add( text );
696         }
697     }
698
699     /**
700      * This is the page that the text extraction will start on. The pages start
701      * at page 1. For example in a 5 page PDF document, if the start page is 1
702      * then all pages will be extracted. If the start page is 4 then pages 4 and 5
703      * will be extracted. The default value is 1.
704      *
705      * @return Value of property startPage.
706      */

707     public int getStartPage()
708     {
709         return startPage;
710     }
711
712     /**
713      * This will set the first page to be extracted by this class.
714      *
715      * @param startPageValue New value of property startPage.
716      */

717     public void setStartPage(int startPageValue)
718     {
719         startPage = startPageValue;
720     }
721
722     /**
723      * This will get the last page that will be extracted. This is inclusive,
724      * for example if a 5 page PDF an endPage value of 5 would extract the
725      * entire document, an end page of 2 would extract pages 1 and 2. This defaults
726      * to Integer.MAX_VALUE such that all pages of the pdf will be extracted.
727      *
728      * @return Value of property endPage.
729      */

730     public int getEndPage()
731     {
732         return endPage;
733     }
734
735     /**
736      * This will set the last page to be extracted by this class.
737      *
738      * @param endPageValue New value of property endPage.
739      */

740     public void setEndPage(int endPageValue)
741     {
742         endPage = endPageValue;
743     }
744
745     /**
746      * Set the desired line separator for output text. The line.separator
747      * system property is used if the line separator preference is not set
748      * explicitly using this method.
749      *
750      * @param separator The desired line separator string.
751      */

752     public void setLineSeparator(String JavaDoc separator)
753     {
754         lineSeparator = separator;
755     }
756
757     /**
758      * This will get the line separator.
759      *
760      * @return The desired line separator string.
761      */

762     public String JavaDoc getLineSeparator()
763     {
764         return lineSeparator;
765     }
766
767     /**
768      * Set the desired page separator for output text. The line.separator
769      * system property is used if the page separator preference is not set
770      * explicitly using this method.
771      *
772      * @param separator The desired page separator string.
773      */

774     public void setPageSeparator(String JavaDoc separator)
775     {
776         pageSeparator = separator;
777     }
778
779     /**
780      * This will get the word separator.
781      *
782      * @return The desired word separator string.
783      */

784     public String JavaDoc getWordSeparator()
785     {
786         return wordSeparator;
787     }
788
789     /**
790      * Set the desired word separator for output text. The PDFBox text extraction
791      * algorithm will output a space character if there is enough space between
792      * two words. By default a space character is used. If you need and accurate
793      * count of characters that are found in a PDF document then you might want to
794      * set the word separator to the empty string.
795      *
796      * @param separator The desired page separator string.
797      */

798     public void setWordSeparator(String JavaDoc separator)
799     {
800         wordSeparator = separator;
801     }
802
803     /**
804      * This will get the page separator.
805      *
806      * @return The page separator string.
807      */

808     public String JavaDoc getPageSeparator()
809     {
810         return pageSeparator;
811     }
812     /**
813      * @return Returns the suppressDuplicateOverlappingText.
814      */

815     public boolean shouldSuppressDuplicateOverlappingText()
816     {
817         return suppressDuplicateOverlappingText;
818     }
819     
820     /**
821      * Get the current page number that is being processed.
822      *
823      * @return A 1 based number representing the current page.
824      */

825     protected int getCurrentPageNo()
826     {
827         return currentPageNo;
828     }
829
830     /**
831      * The output stream that is being written to.
832      *
833      * @return The stream that output is being written to.
834      */

835     protected Writer JavaDoc getOutput()
836     {
837         return output;
838     }
839     
840     /**
841      * Character strings are grouped by articles. It is quite common that there
842      * will only be a single article. This returns a List that contains List objects,
843      * the inner lists will contain TextPosition objects.
844      *
845      * @return A double List of TextPositions for all text strings on the page.
846      */

847     protected List JavaDoc getCharactersByArticle()
848     {
849         return charactersByArticle;
850     }
851     
852     /**
853      * By default the text stripper will attempt to remove text that overlapps each other.
854      * Word paints the same character several times in order to make it look bold. By setting
855      * this to false all text will be extracted, which means that certain sections will be
856      * duplicated, but better performance will be noticed.
857      *
858      * @param suppressDuplicateOverlappingTextValue The suppressDuplicateOverlappingText to set.
859      */

860     public void setSuppressDuplicateOverlappingText(
861             boolean suppressDuplicateOverlappingTextValue)
862     {
863         this.suppressDuplicateOverlappingText = suppressDuplicateOverlappingTextValue;
864     }
865     
866     /**
867      * This will tell if the text stripper should separate by beads.
868      *
869      * @return If the text will be grouped by beads.
870      */

871     public boolean shouldSeparateByBeads()
872     {
873         return shouldSeparateByBeads;
874     }
875     
876     /**
877      * Set if the text stripper should group the text output by a list of beads. The default value is true!
878      *
879      * @param aShouldSeparateByBeads The new grouping of beads.
880      */

881     public void setShouldSeparateByBeads(boolean aShouldSeparateByBeads)
882     {
883         this.shouldSeparateByBeads = aShouldSeparateByBeads;
884     }
885     
886     /**
887      * Get the bookmark where text extraction should end, inclusive. Default is null.
888      *
889      * @return The ending bookmark.
890      */

891     public PDOutlineItem getEndBookmark()
892     {
893         return endBookmark;
894     }
895     
896     /**
897      * Set the bookmark where the text extraction should stop.
898      *
899      * @param aEndBookmark The ending bookmark.
900      */

901     public void setEndBookmark(PDOutlineItem aEndBookmark)
902     {
903         endBookmark = aEndBookmark;
904     }
905     
906     /**
907      * Get the bookmark where text extraction should start, inclusive. Default is null.
908      *
909      * @return The starting bookmark.
910      */

911     public PDOutlineItem getStartBookmark()
912     {
913         return startBookmark;
914     }
915     
916     /**
917      * Set the bookmark where text extraction should start, inclusive.
918      *
919      * @param aStartBookmark The starting bookmark.
920      */

921     public void setStartBookmark(PDOutlineItem aStartBookmark)
922     {
923         startBookmark = aStartBookmark;
924     }
925
926     /**
927      * This will tell if the text stripper should sort the text tokens
928      * before writing to the stream.
929      *
930      * @return true If the text tokens will be sorted before being written.
931      */

932     public boolean shouldSortByPosition()
933     {
934         return sortByPosition;
935     }
936
937     /**
938      * The order of the text tokens in a PDF file may not be in the same
939      * as they appear visually on the screen. For example, a PDF writer may
940      * write out all text by font, so all bold or larger text, then make a second
941      * pass and write out the normal text.<br/>
942      * The default is to <b>not</b> sort by position.<br/>
943      * <br/>
944      * A PDF writer could choose to write each character in a different order. By
945      * default PDFBox does <b>not</b> sort the text tokens before processing them due to
946      * performance reasons.
947      *
948      * @param newSortByPosition Tell PDFBox to sort the text positions.
949      */

950     public void setSortByPosition(boolean newSortByPosition)
951     {
952         sortByPosition = newSortByPosition;
953     }
954 }
Popular Tags