1 11 package org.eclipse.jface.internal.text.html; 12 13 import java.io.IOException ; 14 import java.io.PushbackReader ; 15 import java.io.Reader ; 16 import java.util.HashMap ; 17 import java.util.HashSet ; 18 import java.util.Map ; 19 import java.util.Set ; 20 21 import org.eclipse.swt.SWT; 22 import org.eclipse.swt.custom.StyleRange; 23 24 import org.eclipse.jface.text.TextPresentation; 25 26 27 33 public class HTML2TextReader extends SubstitutionTextReader { 34 35 private static final String EMPTY_STRING= ""; private static final Map fgEntityLookup; 37 private static final Set fgTags; 38 39 static { 40 41 fgTags= new HashSet (); 42 fgTags.add("b"); fgTags.add("br"); fgTags.add("br/"); fgTags.add("div"); fgTags.add("h1"); fgTags.add("h2"); fgTags.add("h3"); fgTags.add("h4"); fgTags.add("h5"); fgTags.add("p"); fgTags.add("dl"); fgTags.add("dt"); fgTags.add("dd"); fgTags.add("li"); fgTags.add("ul"); fgTags.add("pre"); fgTags.add("head"); 60 fgEntityLookup= new HashMap (7); 61 fgEntityLookup.put("lt", "<"); fgEntityLookup.put("gt", ">"); fgEntityLookup.put("nbsp", " "); fgEntityLookup.put("amp", "&"); fgEntityLookup.put("circ", "^"); fgEntityLookup.put("tilde", "~"); fgEntityLookup.put("quot", "\""); } 69 70 private int fCounter= 0; 71 private TextPresentation fTextPresentation; 72 private int fBold= 0; 73 private int fStartOffset= -1; 74 private boolean fInParagraph= false; 75 private boolean fIsPreformattedText= false; 76 private boolean fIgnore= false; 77 private boolean fHeaderDetected= false; 78 79 86 public HTML2TextReader(Reader reader, TextPresentation presentation) { 87 super(new PushbackReader (reader)); 88 fTextPresentation= presentation; 89 } 90 91 public int read() throws IOException { 92 int c= super.read(); 93 if (c != -1) 94 ++ fCounter; 95 return c; 96 } 97 98 protected void startBold() { 99 if (fBold == 0) 100 fStartOffset= fCounter; 101 ++ fBold; 102 } 103 104 protected void startPreformattedText() { 105 fIsPreformattedText= true; 106 setSkipWhitespace(false); 107 } 108 109 protected void stopPreformattedText() { 110 fIsPreformattedText= false; 111 setSkipWhitespace(true); 112 } 113 114 protected void stopBold() { 115 -- fBold; 116 if (fBold == 0) { 117 if (fTextPresentation != null) { 118 fTextPresentation.addStyleRange(new StyleRange(fStartOffset, fCounter - fStartOffset, null, null, SWT.BOLD)); 119 } 120 fStartOffset= -1; 121 } 122 } 123 124 127 protected String computeSubstitution(int c) throws IOException { 128 129 if (c == '<') 130 return processHTMLTag(); 131 else if (fIgnore) 132 return EMPTY_STRING; 133 else if (c == '&') 134 return processEntity(); 135 else if (fIsPreformattedText) 136 return processPreformattedText(c); 137 138 return null; 139 } 140 141 private String html2Text(String html) { 142 143 if (html == null || html.length() == 0) 144 return EMPTY_STRING; 145 146 html= html.toLowerCase(); 147 148 String tag= html; 149 if ('/' == tag.charAt(0)) 150 tag= tag.substring(1); 151 152 if (!fgTags.contains(tag)) 153 return EMPTY_STRING; 154 155 156 if ("pre".equals(html)) { startPreformattedText(); 158 return EMPTY_STRING; 159 } 160 161 if ("/pre".equals(html)) { stopPreformattedText(); 163 return EMPTY_STRING; 164 } 165 166 if (fIsPreformattedText) 167 return EMPTY_STRING; 168 169 if ("b".equals(html)) { startBold(); 171 return EMPTY_STRING; 172 } 173 174 if ((html.length() > 1 && html.charAt(0) == 'h' && Character.isDigit(html.charAt(1))) || "dt".equals(html)) { startBold(); 176 return EMPTY_STRING; 177 } 178 179 if ("dl".equals(html)) return LINE_DELIM; 181 182 if ("dd".equals(html)) return "\t"; 185 if ("li".equals(html)) return LINE_DELIM + HTMLMessages.getString("HTML2TextReader.listItemPrefix"); 189 if ("/b".equals(html)) { stopBold(); 191 return EMPTY_STRING; 192 } 193 194 if ("p".equals(html)) { fInParagraph= true; 196 return LINE_DELIM; 197 } 198 199 if ("br".equals(html) || "br/".equals(html) || "div".equals(html)) return LINE_DELIM; 201 202 if ("/p".equals(html)) { boolean inParagraph= fInParagraph; 204 fInParagraph= false; 205 return inParagraph ? EMPTY_STRING : LINE_DELIM; 206 } 207 208 if ((html.startsWith("/h") && html.length() > 2 && Character.isDigit(html.charAt(2))) || "/dt".equals(html)) { stopBold(); 210 return LINE_DELIM; 211 } 212 213 if ("/dd".equals(html)) return LINE_DELIM; 215 216 if ("head".equals(html) && !fHeaderDetected) { fHeaderDetected= true; 218 fIgnore= true; 219 return EMPTY_STRING; 220 } 221 222 if ("/head".equals(html) && fHeaderDetected && fIgnore) { fIgnore= false; 224 return EMPTY_STRING; 225 } 226 227 return EMPTY_STRING; 228 } 229 230 233 private String processHTMLTag() throws IOException { 234 235 StringBuffer buf= new StringBuffer (); 236 int ch; 237 do { 238 239 ch= nextChar(); 240 241 while (ch != -1 && ch != '>') { 242 buf.append(Character.toLowerCase((char) ch)); 243 ch= nextChar(); 244 if (ch == '"'){ 245 buf.append(Character.toLowerCase((char) ch)); 246 ch= nextChar(); 247 while (ch != -1 && ch != '"'){ 248 buf.append(Character.toLowerCase((char) ch)); 249 ch= nextChar(); 250 } 251 } 252 if (ch == '<'){ 253 unread(ch); 254 return '<' + buf.toString(); 255 } 256 } 257 258 if (ch == -1) 259 return null; 260 261 int tagLen= buf.length(); 262 if ((tagLen >= 3 && "!--".equals(buf.substring(0, 3))) && !(tagLen >= 5 && "--".equals(buf.substring(tagLen - 2)))) { buf.append(ch); 267 } else { 268 break; 269 } 270 } while (true); 271 272 return html2Text(buf.toString()); 273 } 274 275 private String processPreformattedText(int c) { 276 if (c == '\r' || c == '\n') 277 fCounter++; 278 return null; 279 } 280 281 282 private void unread(int ch) throws IOException { 283 ((PushbackReader ) getReader()).unread(ch); 284 } 285 286 protected String entity2Text(String symbol) { 287 if (symbol.length() > 1 && symbol.charAt(0) == '#') { 288 int ch; 289 try { 290 if (symbol.charAt(1) == 'x') { 291 ch= Integer.parseInt(symbol.substring(2), 16); 292 } else { 293 ch= Integer.parseInt(symbol.substring(1), 10); 294 } 295 return EMPTY_STRING + (char)ch; 296 } catch (NumberFormatException e) { 297 } 298 } else { 299 String str= (String ) fgEntityLookup.get(symbol); 300 if (str != null) { 301 return str; 302 } 303 } 304 return "&" + symbol; } 306 307 310 private String processEntity() throws IOException { 311 StringBuffer buf= new StringBuffer (); 312 int ch= nextChar(); 313 while (Character.isLetterOrDigit((char)ch) || ch == '#') { 314 buf.append((char) ch); 315 ch= nextChar(); 316 } 317 318 if (ch == ';') 319 return entity2Text(buf.toString()); 320 321 buf.insert(0, '&'); 322 if (ch != -1) 323 buf.append((char) ch); 324 return buf.toString(); 325 } 326 } 327 | Popular Tags |