KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > eclipse > jface > internal > text > html > HTML2TextReader


1 /*******************************************************************************
2  * Copyright (c) 2000, 2006 IBM Corporation and others.
3  * All rights reserved. This program and the accompanying materials
4  * are made available under the terms of the Eclipse Public License v1.0
5  * which accompanies this distribution, and is available at
6  * http://www.eclipse.org/legal/epl-v10.html
7  *
8  * Contributors:
9  * IBM Corporation - initial API and implementation
10  *******************************************************************************/

11 package org.eclipse.jface.internal.text.html;
12
13 import java.io.IOException JavaDoc;
14 import java.io.PushbackReader JavaDoc;
15 import java.io.Reader JavaDoc;
16 import java.util.HashMap JavaDoc;
17 import java.util.HashSet JavaDoc;
18 import java.util.Map JavaDoc;
19 import java.util.Set JavaDoc;
20
21 import org.eclipse.swt.SWT;
22 import org.eclipse.swt.custom.StyleRange;
23
24 import org.eclipse.jface.text.TextPresentation;
25
26
27 /**
28  * Reads the text contents from a reader of HTML contents and translates
29  * the tags or cut them out.
30  * <p>
31  * Moved into this package from <code>org.eclipse.jface.internal.text.revisions</code>.</p>
32  */

33 public class HTML2TextReader extends SubstitutionTextReader {
34
35     private static final String JavaDoc EMPTY_STRING= ""; //$NON-NLS-1$
36
private static final Map JavaDoc fgEntityLookup;
37     private static final Set JavaDoc fgTags;
38
39     static {
40
41         fgTags= new HashSet JavaDoc();
42         fgTags.add("b"); //$NON-NLS-1$
43
fgTags.add("br"); //$NON-NLS-1$
44
fgTags.add("br/"); //$NON-NLS-1$
45
fgTags.add("div"); //$NON-NLS-1$
46
fgTags.add("h1"); //$NON-NLS-1$
47
fgTags.add("h2"); //$NON-NLS-1$
48
fgTags.add("h3"); //$NON-NLS-1$
49
fgTags.add("h4"); //$NON-NLS-1$
50
fgTags.add("h5"); //$NON-NLS-1$
51
fgTags.add("p"); //$NON-NLS-1$
52
fgTags.add("dl"); //$NON-NLS-1$
53
fgTags.add("dt"); //$NON-NLS-1$
54
fgTags.add("dd"); //$NON-NLS-1$
55
fgTags.add("li"); //$NON-NLS-1$
56
fgTags.add("ul"); //$NON-NLS-1$
57
fgTags.add("pre"); //$NON-NLS-1$
58
fgTags.add("head"); //$NON-NLS-1$
59

60         fgEntityLookup= new HashMap JavaDoc(7);
61         fgEntityLookup.put("lt", "<"); //$NON-NLS-1$ //$NON-NLS-2$
62
fgEntityLookup.put("gt", ">"); //$NON-NLS-1$ //$NON-NLS-2$
63
fgEntityLookup.put("nbsp", " "); //$NON-NLS-1$ //$NON-NLS-2$
64
fgEntityLookup.put("amp", "&"); //$NON-NLS-1$ //$NON-NLS-2$
65
fgEntityLookup.put("circ", "^"); //$NON-NLS-1$ //$NON-NLS-2$
66
fgEntityLookup.put("tilde", "~"); //$NON-NLS-2$ //$NON-NLS-1$
67
fgEntityLookup.put("quot", "\""); //$NON-NLS-1$ //$NON-NLS-2$
68
}
69
70     private int fCounter= 0;
71     private TextPresentation fTextPresentation;
72     private int fBold= 0;
73     private int fStartOffset= -1;
74     private boolean fInParagraph= false;
75     private boolean fIsPreformattedText= false;
76     private boolean fIgnore= false;
77     private boolean fHeaderDetected= false;
78
79     /**
80      * Transforms the HTML text from the reader to formatted text.
81      *
82      * @param reader the reader
83      * @param presentation If not <code>null</code>, formattings will be applied to
84      * the presentation.
85     */

86     public HTML2TextReader(Reader JavaDoc reader, TextPresentation presentation) {
87         super(new PushbackReader JavaDoc(reader));
88         fTextPresentation= presentation;
89     }
90
91     public int read() throws IOException JavaDoc {
92         int c= super.read();
93         if (c != -1)
94             ++ fCounter;
95         return c;
96     }
97
98     protected void startBold() {
99         if (fBold == 0)
100             fStartOffset= fCounter;
101         ++ fBold;
102     }
103
104     protected void startPreformattedText() {
105         fIsPreformattedText= true;
106         setSkipWhitespace(false);
107     }
108
109     protected void stopPreformattedText() {
110         fIsPreformattedText= false;
111         setSkipWhitespace(true);
112     }
113
114     protected void stopBold() {
115         -- fBold;
116         if (fBold == 0) {
117             if (fTextPresentation != null) {
118                 fTextPresentation.addStyleRange(new StyleRange(fStartOffset, fCounter - fStartOffset, null, null, SWT.BOLD));
119             }
120             fStartOffset= -1;
121         }
122     }
123
124     /*
125      * @see org.eclipse.jdt.internal.ui.text.SubstitutionTextReader#computeSubstitution(int)
126      */

127     protected String JavaDoc computeSubstitution(int c) throws IOException JavaDoc {
128
129         if (c == '<')
130             return processHTMLTag();
131         else if (fIgnore)
132             return EMPTY_STRING;
133         else if (c == '&')
134             return processEntity();
135         else if (fIsPreformattedText)
136             return processPreformattedText(c);
137
138         return null;
139     }
140
141     private String JavaDoc html2Text(String JavaDoc html) {
142
143         if (html == null || html.length() == 0)
144             return EMPTY_STRING;
145
146         html= html.toLowerCase();
147         
148         String JavaDoc tag= html;
149         if ('/' == tag.charAt(0))
150             tag= tag.substring(1);
151
152         if (!fgTags.contains(tag))
153             return EMPTY_STRING;
154
155
156         if ("pre".equals(html)) { //$NON-NLS-1$
157
startPreformattedText();
158             return EMPTY_STRING;
159         }
160
161         if ("/pre".equals(html)) { //$NON-NLS-1$
162
stopPreformattedText();
163             return EMPTY_STRING;
164         }
165
166         if (fIsPreformattedText)
167             return EMPTY_STRING;
168
169         if ("b".equals(html)) { //$NON-NLS-1$
170
startBold();
171             return EMPTY_STRING;
172         }
173
174         if ((html.length() > 1 && html.charAt(0) == 'h' && Character.isDigit(html.charAt(1))) || "dt".equals(html)) { //$NON-NLS-1$
175
startBold();
176             return EMPTY_STRING;
177         }
178
179         if ("dl".equals(html)) //$NON-NLS-1$
180
return LINE_DELIM;
181
182         if ("dd".equals(html)) //$NON-NLS-1$
183
return "\t"; //$NON-NLS-1$
184

185         if ("li".equals(html)) //$NON-NLS-1$
186
// FIXME: this hard-coded prefix does not work for RTL languages, see https://bugs.eclipse.org/bugs/show_bug.cgi?id=91682
187
return LINE_DELIM + HTMLMessages.getString("HTML2TextReader.listItemPrefix"); //$NON-NLS-1$
188

189         if ("/b".equals(html)) { //$NON-NLS-1$
190
stopBold();
191             return EMPTY_STRING;
192         }
193
194         if ("p".equals(html)) { //$NON-NLS-1$
195
fInParagraph= true;
196             return LINE_DELIM;
197         }
198
199         if ("br".equals(html) || "br/".equals(html) || "div".equals(html)) //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$
200
return LINE_DELIM;
201
202         if ("/p".equals(html)) { //$NON-NLS-1$
203
boolean inParagraph= fInParagraph;
204             fInParagraph= false;
205             return inParagraph ? EMPTY_STRING : LINE_DELIM;
206         }
207
208         if ((html.startsWith("/h") && html.length() > 2 && Character.isDigit(html.charAt(2))) || "/dt".equals(html)) { //$NON-NLS-1$ //$NON-NLS-2$
209
stopBold();
210             return LINE_DELIM;
211         }
212
213         if ("/dd".equals(html)) //$NON-NLS-1$
214
return LINE_DELIM;
215         
216         if ("head".equals(html) && !fHeaderDetected) { //$NON-NLS-1$
217
fHeaderDetected= true;
218             fIgnore= true;
219             return EMPTY_STRING;
220         }
221         
222         if ("/head".equals(html) && fHeaderDetected && fIgnore) { //$NON-NLS-1$
223
fIgnore= false;
224             return EMPTY_STRING;
225         }
226
227         return EMPTY_STRING;
228     }
229
230     /*
231      * A '<' has been read. Process a html tag
232      */

233     private String JavaDoc processHTMLTag() throws IOException JavaDoc {
234
235         StringBuffer JavaDoc buf= new StringBuffer JavaDoc();
236         int ch;
237         do {
238
239             ch= nextChar();
240
241             while (ch != -1 && ch != '>') {
242                 buf.append(Character.toLowerCase((char) ch));
243                 ch= nextChar();
244                 if (ch == '"'){
245                     buf.append(Character.toLowerCase((char) ch));
246                     ch= nextChar();
247                     while (ch != -1 && ch != '"'){
248                         buf.append(Character.toLowerCase((char) ch));
249                         ch= nextChar();
250                     }
251                 }
252                 if (ch == '<'){
253                     unread(ch);
254                     return '<' + buf.toString();
255                 }
256             }
257
258             if (ch == -1)
259                 return null;
260
261             int tagLen= buf.length();
262             // needs special treatment for comments
263
if ((tagLen >= 3 && "!--".equals(buf.substring(0, 3))) //$NON-NLS-1$
264
&& !(tagLen >= 5 && "--".equals(buf.substring(tagLen - 2)))) { //$NON-NLS-1$
265
// unfinished comment
266
buf.append(ch);
267             } else {
268                 break;
269             }
270         } while (true);
271
272         return html2Text(buf.toString());
273     }
274
275     private String JavaDoc processPreformattedText(int c) {
276         if (c == '\r' || c == '\n')
277             fCounter++;
278         return null;
279     }
280
281
282     private void unread(int ch) throws IOException JavaDoc {
283         ((PushbackReader JavaDoc) getReader()).unread(ch);
284     }
285
286     protected String JavaDoc entity2Text(String JavaDoc symbol) {
287         if (symbol.length() > 1 && symbol.charAt(0) == '#') {
288             int ch;
289             try {
290                 if (symbol.charAt(1) == 'x') {
291                     ch= Integer.parseInt(symbol.substring(2), 16);
292                 } else {
293                     ch= Integer.parseInt(symbol.substring(1), 10);
294                 }
295                 return EMPTY_STRING + (char)ch;
296             } catch (NumberFormatException JavaDoc e) {
297             }
298         } else {
299             String JavaDoc str= (String JavaDoc) fgEntityLookup.get(symbol);
300             if (str != null) {
301                 return str;
302             }
303         }
304         return "&" + symbol; // not found //$NON-NLS-1$
305
}
306
307     /*
308      * A '&' has been read. Process a entity
309      */

310     private String JavaDoc processEntity() throws IOException JavaDoc {
311         StringBuffer JavaDoc buf= new StringBuffer JavaDoc();
312         int ch= nextChar();
313         while (Character.isLetterOrDigit((char)ch) || ch == '#') {
314             buf.append((char) ch);
315             ch= nextChar();
316         }
317
318         if (ch == ';')
319             return entity2Text(buf.toString());
320
321         buf.insert(0, '&');
322         if (ch != -1)
323             buf.append((char) ch);
324         return buf.toString();
325     }
326 }
327
Popular Tags