package org.pdfbox.util;

import java.io.IOException;

import java.util.Iterator;
import java.util.List;

import org.pdfbox.pdmodel.PDDocument;

/**
 * Text stripper that wraps the extracted text in a minimal HTML document.
 *
 * <p>Before the first page is flushed the text is scanned for a title guess:
 * the first run of text positions that share one font size larger than
 * {@link #MIN_TITLE_FONT_SIZE}. That run is emitted inside an
 * <code>&lt;H1&gt;</code> element and is also used for the document
 * <code>&lt;title&gt;</code>. All text written to the output is HTML-escaped.</p>
 */
public class PDFText2HTML extends PDFTextStripper
{
    /** Initial capacity of the buffer used to assemble the HTML header. */
    private static final int INITIAL_PDF_TO_HTML_BYTES = 8192;

    /** A font-size run must be larger than this to be considered a title. */
    private static final float MIN_TITLE_FONT_SIZE = 13.0f;

    /** First text position of the guessed title, or null if none was found. */
    private TextPosition beginTitle;

    /** First text position after the guessed title, or null if the title was never terminated. */
    private TextPosition afterEndTitle;

    /** Raw (unescaped) text of the guessed title; empty until a title has been found. */
    private String titleGuess;

    /** When true, {@link #startParagraph()} and {@link #endParagraph()} write nothing. */
    private boolean suppressParagraphs;

    /** True until the first call to {@link #flushText()} has written the HTML header. */
    private boolean onFirstPage = true;

    /** True while an &lt;H1&gt; opened by {@link #writeCharacters(TextPosition)} has not been closed. */
    private boolean titleOpen = false;

    /**
     * Creates a stripper with no title guess and paragraph tags enabled.
     *
     * @throws IOException declared by the original constructor; presumably
     *         propagated from the superclass constructor.
     */
    public PDFText2HTML() throws IOException
    {
        titleGuess = "";
        beginTitle = null;
        afterEndTitle = null;
        suppressParagraphs = false;
    }

    /**
     * Writes the opening HTML, including a &lt;title&gt; element that holds the
     * HTML-escaped result of {@link #getTitleGuess()}.
     *
     * @throws IOException if writing to the output fails.
     */
    protected void writeHeader() throws IOException
    {
        StringBuffer buf = new StringBuffer(INITIAL_PDF_TO_HTML_BYTES);
        buf.append("<html><head>");
        buf.append("<title>");
        // The title is taken from document text, so it must be escaped like body text.
        buf.append(escape(getTitleGuess()));
        buf.append("</title>");
        buf.append("</head>");
        buf.append("<body>\n");
        getOutput().write(buf.toString());
    }

    /**
     * Returns the current title guess.
     *
     * @return the raw, unescaped title text; empty if no title has been found.
     */
    protected String getTitleGuess()
    {
        return titleGuess;
    }

    /**
     * Flushes the text of the current page. On the first call the title is
     * guessed and the HTML header is written before delegating to the superclass.
     *
     * @throws IOException if writing to the output fails.
     */
    protected void flushText() throws IOException
    {
        if (onFirstPage)
        {
            guessTitle(getCharactersByArticle().iterator());
            writeHeader();
            onFirstPage = false;
        }
        super.flushText();
    }

    /**
     * Writes the closing HTML. If a title heading is still open because no text
     * followed the title run, it is closed first so the markup stays balanced.
     *
     * @param pdf the document that has been processed (not used here).
     * @throws IOException if writing to the output fails.
     */
    public void endDocument(PDDocument pdf) throws IOException
    {
        if (titleOpen)
        {
            getOutput().write("</H1>");
            suppressParagraphs = false;
            titleOpen = false;
        }
        getOutput().write("</body></html>");
    }

    /**
     * Scans the text for the first run of positions in a single font size
     * larger than {@link #MIN_TITLE_FONT_SIZE}. When the font size changes
     * after such a run, the collected text becomes the title guess and the
     * position that follows the run is remembered as the end of the title.
     *
     * @param textIter iterator over lists of {@link TextPosition} objects,
     *        one list per article.
     * @return the first position of the title, or null if no title was found.
     *         If the text ends while a title run is still in progress, the
     *         start position is returned but the title guess is left unchanged.
     */
    protected TextPosition guessTitle(Iterator textIter)
    {
        float lastFontSize = -1.0f;
        int stringsInFont = 0;
        StringBuffer titleText = new StringBuffer();
        while (textIter.hasNext())
        {
            Iterator textByArticle = ((List) textIter.next()).iterator();
            while (textByArticle.hasNext())
            {
                TextPosition position = (TextPosition) textByArticle.next();
                float currentFontSize = position.getFontSize();
                if (currentFontSize != lastFontSize)
                {
                    if (beginTitle != null)
                    {
                        if (stringsInFont == 0)
                        {
                            // Kept from the original. The counter is incremented for every
                            // position, so this reset does not appear to be reachable.
                            beginTitle = null;
                            titleText.setLength(0);
                        }
                        else
                        {
                            // The font changed after a non-empty title run: the title is complete.
                            titleGuess = titleText.toString();
                            afterEndTitle = position;
                            return beginTitle;
                        }
                    }
                    else
                    {
                        if (currentFontSize > MIN_TITLE_FONT_SIZE)
                        {
                            beginTitle = position;
                        }
                    }

                    lastFontSize = currentFontSize;
                    stringsInFont = 0;
                }
                stringsInFont++;
                if (beginTitle != null)
                {
                    // NOTE(review): a space is appended after every text fragment, as in the
                    // original; the resulting title therefore ends with a trailing space.
                    titleText.append(position.getCharacter() + " ");
                }
            }
        }
        return beginTitle;
    }

    /**
     * Writes an opening paragraph tag unless paragraphs are suppressed.
     *
     * @throws IOException if writing to the output fails.
     */
    protected void startParagraph() throws IOException
    {
        if (!suppressParagraphs)
        {
            getOutput().write("<p>");
        }
    }

    /**
     * Writes a closing paragraph tag unless paragraphs are suppressed.
     *
     * @throws IOException if writing to the output fails.
     */
    protected void endParagraph() throws IOException
    {
        if (!suppressParagraphs)
        {
            getOutput().write("</p>");
        }
    }

    /**
     * Writes the characters of one text position, HTML-escaped. The position
     * that starts the guessed title is preceded by &lt;H1&gt; and the position
     * that follows the title is preceded by &lt;/H1&gt;; paragraph tags are
     * suppressed in between.
     *
     * @param position the text position to write.
     * @throws IOException if writing to the output fails.
     */
    protected void writeCharacters(TextPosition position) throws IOException
    {
        // Identity comparison is intentional: these are the exact objects remembered by guessTitle.
        if (position == beginTitle)
        {
            getOutput().write("<H1>");
            suppressParagraphs = true;
            titleOpen = true;
        }
        if (position == afterEndTitle)
        {
            getOutput().write("</H1>");
            suppressParagraphs = false;
            titleOpen = false;
        }

        getOutput().write(escape(position.getCharacter()));
    }

    /**
     * Escapes text for inclusion in HTML. Characters outside the range 32..126
     * become numeric character references; the double quote, ampersand,
     * less-than and greater-than characters become named entities.
     *
     * @param chars the text to escape; null is treated as empty.
     * @return the escaped text, never null.
     */
    private static String escape(String chars)
    {
        if (chars == null)
        {
            return "";
        }
        StringBuffer escaped = new StringBuffer(chars.length() + 16);
        for (int i = 0; i < chars.length(); i++)
        {
            char c = chars.charAt(i);
            if ((c < 32) || (c > 126))
            {
                int charAsInt = c;
                escaped.append("&#").append(charAsInt).append(';');
            }
            else
            {
                switch (c)
                {
                    case '"':
                        escaped.append("&quot;");
                        break;
                    case '&':
                        escaped.append("&amp;");
                        break;
                    case '<':
                        escaped.append("&lt;");
                        break;
                    case '>':
                        escaped.append("&gt;");
                        break;
                    default:
                        escaped.append(c);
                }
            }
        }
        return escaped.toString();
    }

    /**
     * Reports whether paragraph tags are currently suppressed.
     *
     * @return true if paragraph tags are not being written.
     */
    public boolean isSuppressParagraphs()
    {
        return suppressParagraphs;
    }

    /**
     * Enables or disables the writing of paragraph tags. Writing a guessed
     * title also changes this setting: it is switched on while the title is
     * written and switched off when the title ends.
     *
     * @param shouldSuppressParagraphs true to stop writing paragraph tags.
     */
    public void setSuppressParagraphs(boolean shouldSuppressParagraphs)
    {
        this.suppressParagraphs = shouldSuppressParagraphs;
    }
}