KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > eclipse > jface > internal > text > link > contentassist > HTML2TextReader


1 /*******************************************************************************
2  * Copyright (c) 2000, 2006 IBM Corporation and others.
3  * All rights reserved. This program and the accompanying materials
4  * are made available under the terms of the Eclipse Public License v1.0
5  * which accompanies this distribution, and is available at
6  * http://www.eclipse.org/legal/epl-v10.html
7  *
8  * Contributors:
9  * IBM Corporation - initial API and implementation
10  *******************************************************************************/

11 package org.eclipse.jface.internal.text.link.contentassist;
12
13
14 import java.io.IOException JavaDoc;
15 import java.io.PushbackReader JavaDoc;
16 import java.io.Reader JavaDoc;
17 import java.util.HashMap JavaDoc;
18 import java.util.HashSet JavaDoc;
19 import java.util.Map JavaDoc;
20 import java.util.Set JavaDoc;
21
22 import org.eclipse.swt.SWT;
23 import org.eclipse.swt.custom.StyleRange;
24
25 import org.eclipse.jface.text.TextPresentation;
26
27
28 /**
29  * Reads the text contents from a reader of HTML contents and translates
30  * the tags or cut them out.
31  */

32 public class HTML2TextReader extends SubstitutionTextReader {
33
34     private static final String JavaDoc EMPTY_STRING= ""; //$NON-NLS-1$
35
private static final Map JavaDoc fgEntityLookup;
36     private static final Set JavaDoc fgTags;
37
38     static {
39
40         fgTags= new HashSet JavaDoc();
41         fgTags.add("b"); //$NON-NLS-1$
42
fgTags.add("br"); //$NON-NLS-1$
43
fgTags.add("br/"); //$NON-NLS-1$
44
fgTags.add("div"); //$NON-NLS-1$
45
fgTags.add("h1"); //$NON-NLS-1$
46
fgTags.add("h2"); //$NON-NLS-1$
47
fgTags.add("h3"); //$NON-NLS-1$
48
fgTags.add("h4"); //$NON-NLS-1$
49
fgTags.add("h5"); //$NON-NLS-1$
50
fgTags.add("p"); //$NON-NLS-1$
51
fgTags.add("dl"); //$NON-NLS-1$
52
fgTags.add("dt"); //$NON-NLS-1$
53
fgTags.add("dd"); //$NON-NLS-1$
54
fgTags.add("li"); //$NON-NLS-1$
55
fgTags.add("ul"); //$NON-NLS-1$
56
fgTags.add("pre"); //$NON-NLS-1$
57
fgTags.add("head"); //$NON-NLS-1$
58

59         fgEntityLookup= new HashMap JavaDoc(7);
60         fgEntityLookup.put("lt", "<"); //$NON-NLS-1$ //$NON-NLS-2$
61
fgEntityLookup.put("gt", ">"); //$NON-NLS-1$ //$NON-NLS-2$
62
fgEntityLookup.put("nbsp", " "); //$NON-NLS-1$ //$NON-NLS-2$
63
fgEntityLookup.put("amp", "&"); //$NON-NLS-1$ //$NON-NLS-2$
64
fgEntityLookup.put("circ", "^"); //$NON-NLS-1$ //$NON-NLS-2$
65
fgEntityLookup.put("tilde", "~"); //$NON-NLS-2$ //$NON-NLS-1$
66
fgEntityLookup.put("quot", "\""); //$NON-NLS-1$ //$NON-NLS-2$
67
}
68
69     private int fCounter= 0;
70     private TextPresentation fTextPresentation;
71     private int fBold= 0;
72     private int fStartOffset= -1;
73     private boolean fInParagraph= false;
74     private boolean fIsPreformattedText= false;
75     private boolean fIgnore= false;
76
77     /**
78      * Transforms the HTML text from the reader to formatted text.
79      *
80      * @param reader the reader
81      * @param presentation If not <code>null</code>, formattings will be applied to
82      * the presentation.
83     */

84     public HTML2TextReader(Reader JavaDoc reader, TextPresentation presentation) {
85         super(new PushbackReader JavaDoc(reader));
86         fTextPresentation= presentation;
87     }
88
89     public int read() throws IOException JavaDoc {
90         int c= super.read();
91         if (c != -1)
92             ++ fCounter;
93         return c;
94     }
95
96     protected void startBold() {
97         if (fBold == 0)
98             fStartOffset= fCounter;
99         ++ fBold;
100     }
101
102     protected void startPreformattedText() {
103         fIsPreformattedText= true;
104         setSkipWhitespace(false);
105     }
106
107     protected void stopPreformattedText() {
108         fIsPreformattedText= false;
109         setSkipWhitespace(true);
110     }
111
112     protected void stopBold() {
113         -- fBold;
114         if (fBold == 0) {
115             if (fTextPresentation != null) {
116                 fTextPresentation.addStyleRange(new StyleRange(fStartOffset, fCounter - fStartOffset, null, null, SWT.BOLD));
117             }
118             fStartOffset= -1;
119         }
120     }
121
122     /*
123      * @see org.eclipse.jdt.internal.ui.text.SubstitutionTextReader#computeSubstitution(int)
124      */

125     protected String JavaDoc computeSubstitution(int c) throws IOException JavaDoc {
126
127         if (c == '<')
128             return processHTMLTag();
129         else if (fIgnore)
130             return EMPTY_STRING;
131         else if (c == '&')
132             return processEntity();
133         else if (fIsPreformattedText)
134             return processPreformattedText(c);
135
136         return null;
137     }
138
139     private String JavaDoc html2Text(String JavaDoc html) {
140
141         if (html == null || html.length() == 0)
142             return EMPTY_STRING;
143
144         html= html.toLowerCase();
145         
146         String JavaDoc tag= html;
147         if ('/' == tag.charAt(0))
148             tag= tag.substring(1);
149
150         if (!fgTags.contains(tag))
151             return EMPTY_STRING;
152
153
154         if ("pre".equals(html)) { //$NON-NLS-1$
155
startPreformattedText();
156             return EMPTY_STRING;
157         }
158
159         if ("/pre".equals(html)) { //$NON-NLS-1$
160
stopPreformattedText();
161             return EMPTY_STRING;
162         }
163
164         if (fIsPreformattedText)
165             return EMPTY_STRING;
166
167         if ("b".equals(html)) { //$NON-NLS-1$
168
startBold();
169             return EMPTY_STRING;
170         }
171
172         if ((html.length() > 1 && html.charAt(0) == 'h' && Character.isDigit(html.charAt(1))) || "dt".equals(html)) { //$NON-NLS-1$
173
startBold();
174             return EMPTY_STRING;
175         }
176
177         if ("dl".equals(html)) //$NON-NLS-1$
178
return LINE_DELIM;
179
180         if ("dd".equals(html)) //$NON-NLS-1$
181
return "\t"; //$NON-NLS-1$
182

183         if ("li".equals(html)) //$NON-NLS-1$
184
// FIXME: this hard-coded prefix does not work for RTL languages, see https://bugs.eclipse.org/bugs/show_bug.cgi?id=91682
185
return LINE_DELIM + ContentAssistMessages.getString("HTML2TextReader.listItemPrefix"); //$NON-NLS-1$
186

187         if ("/b".equals(html)) { //$NON-NLS-1$
188
stopBold();
189             return EMPTY_STRING;
190         }
191
192         if ("p".equals(html)) { //$NON-NLS-1$
193
fInParagraph= true;
194             return LINE_DELIM;
195         }
196
197         if ("br".equals(html) || "br/".equals(html) || "div".equals(html)) //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$
198
return LINE_DELIM;
199
200         if ("/p".equals(html)) { //$NON-NLS-1$
201
boolean inParagraph= fInParagraph;
202             fInParagraph= false;
203             return inParagraph ? EMPTY_STRING : LINE_DELIM;
204         }
205
206         if ((html.startsWith("/h") && html.length() > 2 && Character.isDigit(html.charAt(2))) || "/dt".equals(html)) { //$NON-NLS-1$ //$NON-NLS-2$
207
stopBold();
208             return LINE_DELIM;
209         }
210
211         if ("/dd".equals(html)) //$NON-NLS-1$
212
return LINE_DELIM;
213         
214         if ("head".equals(html)) { //$NON-NLS-1$
215
fIgnore= true;
216             return EMPTY_STRING;
217         }
218         
219         if ("/head".equals(html)) { //$NON-NLS-1$
220
fIgnore= false;
221             return EMPTY_STRING;
222         }
223
224         return EMPTY_STRING;
225     }
226
227     /*
228      * A '<' has been read. Process a html tag
229      */

230     private String JavaDoc processHTMLTag() throws IOException JavaDoc {
231
232         StringBuffer JavaDoc buf= new StringBuffer JavaDoc();
233         int ch;
234         do {
235
236             ch= nextChar();
237
238             while (ch != -1 && ch != '>') {
239                 buf.append(Character.toLowerCase((char) ch));
240                 ch= nextChar();
241                 if (ch == '"'){
242                     buf.append(Character.toLowerCase((char) ch));
243                     ch= nextChar();
244                     while (ch != -1 && ch != '"'){
245                         buf.append(Character.toLowerCase((char) ch));
246                         ch= nextChar();
247                     }
248                 }
249                 if (ch == '<'){
250                     unread(ch);
251                     return '<' + buf.toString();
252                 }
253             }
254
255             if (ch == -1)
256                 return null;
257
258             int tagLen= buf.length();
259             // needs special treatment for comments
260
if ((tagLen >= 3 && "!--".equals(buf.substring(0, 3))) //$NON-NLS-1$
261
&& !(tagLen >= 5 && "--".equals(buf.substring(tagLen - 2)))) { //$NON-NLS-1$
262
// unfinished comment
263
buf.append(ch);
264             } else {
265                 break;
266             }
267         } while (true);
268
269         return html2Text(buf.toString());
270     }
271
272     private String JavaDoc processPreformattedText(int c) {
273         if (c == '\r' || c == '\n')
274             fCounter++;
275         return null;
276     }
277
278
279     private void unread(int ch) throws IOException JavaDoc {
280         ((PushbackReader JavaDoc) getReader()).unread(ch);
281     }
282
283     protected String JavaDoc entity2Text(String JavaDoc symbol) {
284         if (symbol.length() > 1 && symbol.charAt(0) == '#') {
285             int ch;
286             try {
287                 if (symbol.charAt(1) == 'x') {
288                     ch= Integer.parseInt(symbol.substring(2), 16);
289                 } else {
290                     ch= Integer.parseInt(symbol.substring(1), 10);
291                 }
292                 return EMPTY_STRING + (char)ch;
293             } catch (NumberFormatException JavaDoc e) {
294             }
295         } else {
296             String JavaDoc str= (String JavaDoc) fgEntityLookup.get(symbol);
297             if (str != null) {
298                 return str;
299             }
300         }
301         return "&" + symbol; // not found //$NON-NLS-1$
302
}
303
304     /*
305      * A '&' has been read. Process a entity
306      */

307     private String JavaDoc processEntity() throws IOException JavaDoc {
308         StringBuffer JavaDoc buf= new StringBuffer JavaDoc();
309         int ch= nextChar();
310         while (Character.isLetterOrDigit((char)ch) || ch == '#') {
311             buf.append((char) ch);
312             ch= nextChar();
313         }
314
315         if (ch == ';')
316             return entity2Text(buf.toString());
317
318         buf.insert(0, '&');
319         if (ch != -1)
320             buf.append((char) ch);
321         return buf.toString();
322     }
323 }
324
Popular Tags