// Listing source: KickJava - Java API By Example, From Geeks To Geeks.
//
// Java > Open Source Codes > org > pdfbox > util > PDFText2HTML


/**
 * Copyright (c) 2003-2004, www.pdfbox.org
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 * 3. Neither the name of pdfbox; nor the names of its
 * contributors may be used to endorse or promote products derived from this
 * software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
 * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 * http://www.pdfbox.org
 *
 */

package org.pdfbox.util;

import java.io.IOException;

import java.util.Iterator;
import java.util.List;

import org.pdfbox.pdmodel.PDDocument;

40 /**
41  * Wrap stripped text in simple HTML, trying to form HTML paragraphs.
42  * Paragraphs broken by pages, columns, or figures are not mended.
43  *
44  *
45  * @author jjb - http://www.johnjbarton.com
46  * @version $Revision: 1.3 $
47  */

48 public class PDFText2HTML extends PDFTextStripper
49 {
50     private static final int INITIAL_PDF_TO_HTML_BYTES = 8192;
51
52     private TextPosition beginTitle;
53     private TextPosition afterEndTitle;
54     private String JavaDoc titleGuess;
55     private boolean suppressParagraphs;
56     private boolean onFirstPage = true;
57    
58     /**
59      * Constructor.
60      *
61      * @throws IOException If there is an error during initialization.
62      */

63     public PDFText2HTML() throws IOException JavaDoc
64     {
65         titleGuess = "";
66         beginTitle = null;
67         afterEndTitle = null;
68         suppressParagraphs = false;
69     }
70
71     /**
72      * Write the header to the output document.
73      *
74      * @throws IOException If there is a problem writing out the header to the document.
75      */

76     protected void writeHeader() throws IOException JavaDoc
77     {
78         StringBuffer JavaDoc buf = new StringBuffer JavaDoc(INITIAL_PDF_TO_HTML_BYTES);
79         buf.append("<html><head>");
80         buf.append("<title>");
81         buf.append(getTitleGuess());
82         buf.append("</title>");
83         buf.append("</head>");
84         buf.append("<body>\n");
85         getOutput().write(buf.toString());
86     }
87    
88     /**
89      * The guess to the document title.
90      *
91      * @return A string that is the title of this document.
92      */

93     protected String JavaDoc getTitleGuess()
94     {
95         return titleGuess;
96     }
97    
98     /**
99      * {@inheritDoc}
100      */

101     protected void flushText() throws IOException JavaDoc
102     {
103         Iterator JavaDoc textIter = getCharactersByArticle().iterator();
104       
105         if (onFirstPage)
106         {
107             guessTitle(textIter);
108             writeHeader();
109             onFirstPage = false;
110         }
111         super.flushText();
112     }
113     
114     /**
115      * {@inheritDoc}
116      */

117     public void endDocument(PDDocument pdf) throws IOException JavaDoc
118     {
119         output.write("</body></html>");
120     }
121
122     /**
123      * This method will attempt to guess the title of the document.
124      *
125      * @param textIter The characters on the first page.
126      * @return The text position that is guessed to be the title.
127      */

128     protected TextPosition guessTitle(Iterator JavaDoc textIter)
129     {
130         float lastFontSize = -1.0f;
131         int stringsInFont = 0;
132         StringBuffer JavaDoc titleText = new StringBuffer JavaDoc();
133         while (textIter.hasNext())
134         {
135             Iterator JavaDoc textByArticle = ((List JavaDoc)textIter.next()).iterator();
136             while( textByArticle.hasNext() )
137             {
138                 TextPosition position = (TextPosition) textByArticle.next();
139                 float currentFontSize = position.getFontSize();
140                 if (currentFontSize != lastFontSize)
141                 {
142                     if (beginTitle != null)
143                     { // font change in candidate title.
144
if (stringsInFont == 0)
145                         {
146                             beginTitle = null; // false alarm
147
titleText.setLength(0);
148                         }
149                         else
150                         {
151                             // had a significant font with some words: call it a title
152
titleGuess = titleText.toString();
153                             afterEndTitle = position;
154                             return beginTitle;
155                         }
156                     }
157                     else
158                     { // font change and begin == null
159
if (currentFontSize > 13.0f)
160                         { // most body text is 12pt max I guess
161
beginTitle = position;
162                         }
163                     }
164          
165                     lastFontSize = currentFontSize;
166                     stringsInFont = 0;
167                 }
168                 stringsInFont++;
169                 if (beginTitle != null)
170                 {
171                     titleText.append(position.getCharacter()+" ");
172                 }
173             }
174         }
175         return beginTitle; // null
176
}
177     
178     /**
179      * Write out the paragraph separator.
180      *
181      * @throws IOException If there is an error writing to the stream.
182      */

183     protected void startParagraph() throws IOException JavaDoc
184     {
185         if (! suppressParagraphs)
186         {
187             getOutput().write("<p>");
188         }
189     }
190     /**
191      * Write out the paragraph separator.
192      *
193      * @throws IOException If there is an error writing to the stream.
194      */

195     protected void endParagraph() throws IOException JavaDoc
196     {
197         if (! suppressParagraphs)
198         {
199             getOutput().write("</p>");
200         }
201     }
202     
203     /**
204      * {@inheritDoc}
205      */

206     protected void writeCharacters(TextPosition position ) throws IOException JavaDoc
207     {
208         if (position == beginTitle)
209         {
210             output.write("<H1>");
211             suppressParagraphs = true;
212         }
213         if (position == afterEndTitle)
214         {
215             output.write("</H1>"); // end title and start first paragraph
216
suppressParagraphs = false;
217         }
218       
219         String JavaDoc chars = position.getCharacter();
220
221         for (int i = 0; i < chars.length(); i++)
222         {
223             char c = chars.charAt(i);
224             if ((c < 32) || (c > 126))
225             {
226                 int charAsInt = c;
227                 output.write("&#" + charAsInt + ";");
228             }
229             else
230             {
231                 switch (c)
232                 {
233                     case 34:
234                         output.write("&quot;");
235                         break;
236                     case 38:
237                         output.write("&amp;");
238                         break;
239                     case 60:
240                         output.write("&lt;");
241                         break;
242                     case 62:
243                         output.write("&gt;");
244                         break;
245                     default:
246                         output.write(c);
247                 }
248             }
249         }
250     }
251     
252     /**
253      * @return Returns the suppressParagraphs.
254      */

255     public boolean isSuppressParagraphs()
256     {
257         return suppressParagraphs;
258     }
259     /**
260      * @param shouldSuppressParagraphs The suppressParagraphs to set.
261      */

262     public void setSuppressParagraphs(boolean shouldSuppressParagraphs)
263     {
264         this.suppressParagraphs = shouldSuppressParagraphs;
265     }
266 }
// Popular Tags