KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > pdfbox > util > PDFHighlighter


/**
 * Copyright (c) 2006, www.pdfbox.org
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 *    this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 * 3. Neither the name of pdfbox; nor the names of its
 *    contributors may be used to endorse or promote products derived from this
 *    software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
 * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 * http://www.pdfbox.org
 *
 */

31 package org.pdfbox.util;
32
33 import java.io.ByteArrayOutputStream JavaDoc;
34 import java.io.IOException JavaDoc;
35 import java.io.OutputStreamWriter JavaDoc;
36 import java.io.Writer JavaDoc;
37 import java.util.regex.Matcher JavaDoc;
38 import java.util.regex.Pattern JavaDoc;
39
40 import org.pdfbox.pdmodel.PDDocument;
41 import org.pdfbox.pdmodel.PDPage;
42
43
44 /**
45  * Highlighting of words in a PDF document with an XML file.
46  *
47  * @author slagraulet (slagraulet@cardiweb.com)
48  * @author <a HREF="mailto:ben@benlitchfield.com">Ben Litchfield</a>
49  * @version $Revision: 1.7 $
50  *
51  * @see <a HREF="http://partners.adobe.com/public/developer/en/pdf/HighlightFileFormat.pdf">
52  * Adobe Highlight File Format</a>
53  */

54 public class PDFHighlighter extends PDFTextStripper
55 {
56     private Writer JavaDoc highlighterOutput = null;
57     //private Color highlightColor = Color.YELLOW;
58

59     private String JavaDoc[] searchedWords;
60     private ByteArrayOutputStream JavaDoc textOS = null;
61     private Writer JavaDoc textWriter = null;
62
63     /**
64      * Default constructor.
65      *
66      * @throws IOException If there is an error constructing this class.
67      */

68     public PDFHighlighter() throws IOException JavaDoc
69     {
70         super();
71         super.setLineSeparator( "" );
72         super.setPageSeparator( "" );
73         super.setWordSeparator( "" );
74         super.setShouldSeparateByBeads( false );
75         super.setSuppressDuplicateOverlappingText( false );
76     }
77     
78     /**
79      * Generate an XML highlight string based on the PDF.
80      *
81      * @param pdDocument The PDF to find words in.
82      * @param highlightWord The word to search for.
83      * @param xmlOutput The resulting output xml file.
84      *
85      * @throws IOException If there is an error reading from the PDF, or writing to the XML.
86      */

87     public void generateXMLHighlight(PDDocument pdDocument, String JavaDoc highlightWord, Writer JavaDoc xmlOutput ) throws IOException JavaDoc
88     {
89         generateXMLHighlight( pdDocument, new String JavaDoc[] { highlightWord }, xmlOutput );
90     }
91
92     /**
93      * Generate an XML highlight string based on the PDF.
94      *
95      * @param pdDocument The PDF to find words in.
96      * @param sWords The words to search for.
97      * @param xmlOutput The resulting output xml file.
98      *
99      * @throws IOException If there is an error reading from the PDF, or writing to the XML.
100      */

101     public void generateXMLHighlight(PDDocument pdDocument, String JavaDoc[] sWords, Writer JavaDoc xmlOutput ) throws IOException JavaDoc
102     {
103         highlighterOutput = xmlOutput;
104         searchedWords = sWords;
105         highlighterOutput.write("<XML>\n<Body units=characters " +
106                                 //color and mode are not implemented by the highlight spec
107
//so don't include them for now
108
//" color=#" + getHighlightColorAsString() +
109
//" mode=active " + */
110
" version=2>\n<Highlight>\n");
111         textOS = new ByteArrayOutputStream JavaDoc();
112         textWriter = new OutputStreamWriter JavaDoc( textOS, "UTF-16" );
113         writeText(pdDocument, textWriter);
114         highlighterOutput.write("</Highlight>\n</Body>\n</XML>");
115         highlighterOutput.flush();
116     }
117
118     /**
119      * {@inheritDoc}
120      */

121     protected void endPage( PDPage pdPage ) throws IOException JavaDoc
122     {
123         textWriter.flush();
124
125         String JavaDoc page = new String JavaDoc( textOS.toByteArray(), "UTF-16" );
126         textOS.reset();
127         //page = page.replaceAll( "\n", "" );
128
//page = page.replaceAll( "\r", "" );
129
//page = CCRStringUtil.stripChar(page, '\n');
130
//page = CCRStringUtil.stripChar(page, '\r');
131

132         // Traitement des listes à puces (caractères spéciaux)
133
if (page.indexOf("a") != -1)
134         {
135             page = page.replaceAll("a[0-9]{1,3}", ".");
136         }
137
138         for (int i = 0; i < searchedWords.length; i++)
139         {
140             Pattern JavaDoc pattern = Pattern.compile(searchedWords[i], Pattern.CASE_INSENSITIVE);
141             Matcher JavaDoc matcher = pattern.matcher(page);
142             while( matcher.find() )
143             {
144                 int begin = matcher.start();
145                 int end = matcher.end();
146                 highlighterOutput.write(" <loc " +
147                         "pg=" + (getCurrentPageNo()-1)
148                         + " pos=" + begin
149                         + " len="+ (end - begin)
150                         + ">\n");
151             }
152         }
153     }
154
155     /**
156      * Command line application.
157      *
158      * @param args The command line arguments to the application.
159      *
160      * @throws IOException If there is an error generating the highlight file.
161      */

162     public static void main(String JavaDoc[] args) throws IOException JavaDoc
163     {
164         PDFHighlighter xmlExtractor = new PDFHighlighter();
165         PDDocument doc = null;
166         try
167         {
168             if( args.length < 2 )
169             {
170                 usage();
171             }
172             String JavaDoc[] highlightStrings = new String JavaDoc[ args.length - 1];
173             System.arraycopy( args, 1, highlightStrings, 0, highlightStrings.length );
174             doc = PDDocument.load( args[0] );
175             
176             xmlExtractor.generateXMLHighlight(
177                 doc,
178                 highlightStrings,
179                 new OutputStreamWriter JavaDoc( System.out ) );
180         }
181         finally
182         {
183             if( doc != null )
184             {
185                 doc.close();
186             }
187         }
188     }
189     
190     private static void usage()
191     {
192         System.err.println( "usage: java " + PDFHighlighter.class.getName() + " <pdf file> word1 word2 word3 ..." );
193         System.exit( 1 );
194     }
195     
196     
197     /**
198      * Get the color to highlight the strings with. Default is Color.YELLOW.
199      *
200      * @return The color to highlight strings with.
201      */

202     /*public Color getHighlightColor()
203     {
204         return highlightColor;
205     }**/

206     
207     /**
208      * Get the color to highlight the strings with. Default is Color.YELLOW.
209      *
210      * @param color The color to highlight strings with.
211      */

212     /*public void setHighlightColor(Color color)
213     {
214         this.highlightColor = color;
215     }**/

216     
217     /**
218      * Set the highlight color using HTML like rgb string. The string must be 6 characters long.
219      *
220      * @param color The color to use for highlighting. Should be in the format of "FF0000".
221      */

222     /*public void setHighlightColor( String color )
223     {
224         highlightColor = Color.decode( color );
225     }**/

226     
227     /**
228      * Get the highlight color as an HTML like string. This will return a string of six characters.
229      *
230      * @return The current highlight color. For example FF0000
231      */

232     /*public String getHighlightColorAsString()
233     {
234         //BJL: kudos to anyone that has a cleaner way of doing this!
235         String red = Integer.toHexString( highlightColor.getRed() );
236         String green = Integer.toHexString( highlightColor.getGreen() );
237         String blue = Integer.toHexString( highlightColor.getBlue() );
238         
239         return (red.length() < 2 ? "0" + red : red) +
240                (green.length() < 2 ? "0" + green : green) +
241                (blue.length() < 2 ? "0" + blue : blue);
242     }**/

243 }
Popular Tags