KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > eclipse > jdt > internal > ui > text > HTML2TextReader


1 /*******************************************************************************
2  * Copyright (c) 2000, 2006 IBM Corporation and others.
3  * All rights reserved. This program and the accompanying materials
4  * are made available under the terms of the Eclipse Public License v1.0
5  * which accompanies this distribution, and is available at
6  * http://www.eclipse.org/legal/epl-v10.html
7  *
8  * Contributors:
9  * IBM Corporation - initial API and implementation
10  *******************************************************************************/

11 package org.eclipse.jdt.internal.ui.text;
12
13 import java.io.IOException JavaDoc;
14 import java.io.PushbackReader JavaDoc;
15 import java.io.Reader JavaDoc;
16 import java.util.HashMap JavaDoc;
17 import java.util.HashSet JavaDoc;
18 import java.util.Map JavaDoc;
19 import java.util.Set JavaDoc;
20
21 import org.eclipse.swt.SWT;
22 import org.eclipse.swt.custom.StyleRange;
23
24 import org.eclipse.jface.text.TextPresentation;
25
26 import org.eclipse.jdt.internal.ui.JavaUIMessages;
27
28
29 /**
30  * Reads the text contents from a reader of HTML contents and translates
31  * the tags or cut them out.
32  */

33 public class HTML2TextReader extends SubstitutionTextReader {
34
35     private static final String JavaDoc EMPTY_STRING= ""; //$NON-NLS-1$
36
private static final Map JavaDoc fgEntityLookup;
37     private static final Set JavaDoc fgTags;
38
39     static {
40
41         fgTags= new HashSet JavaDoc();
42         fgTags.add("b"); //$NON-NLS-1$
43
fgTags.add("br"); //$NON-NLS-1$
44
fgTags.add("br/"); //$NON-NLS-1$
45
fgTags.add("div"); //$NON-NLS-1$
46
fgTags.add("h1"); //$NON-NLS-1$
47
fgTags.add("h2"); //$NON-NLS-1$
48
fgTags.add("h3"); //$NON-NLS-1$
49
fgTags.add("h4"); //$NON-NLS-1$
50
fgTags.add("h5"); //$NON-NLS-1$
51
fgTags.add("p"); //$NON-NLS-1$
52
fgTags.add("dl"); //$NON-NLS-1$
53
fgTags.add("dt"); //$NON-NLS-1$
54
fgTags.add("dd"); //$NON-NLS-1$
55
fgTags.add("li"); //$NON-NLS-1$
56
fgTags.add("ul"); //$NON-NLS-1$
57
fgTags.add("pre"); //$NON-NLS-1$
58
fgTags.add("head"); //$NON-NLS-1$
59

60         fgEntityLookup= new HashMap JavaDoc(7);
61         fgEntityLookup.put("lt", "<"); //$NON-NLS-1$ //$NON-NLS-2$
62
fgEntityLookup.put("gt", ">"); //$NON-NLS-1$ //$NON-NLS-2$
63
fgEntityLookup.put("nbsp", " "); //$NON-NLS-1$ //$NON-NLS-2$
64
fgEntityLookup.put("amp", "&"); //$NON-NLS-1$ //$NON-NLS-2$
65
fgEntityLookup.put("circ", "^"); //$NON-NLS-1$ //$NON-NLS-2$
66
fgEntityLookup.put("tilde", "~"); //$NON-NLS-2$ //$NON-NLS-1$
67
fgEntityLookup.put("quot", "\""); //$NON-NLS-1$ //$NON-NLS-2$
68
}
69
70     private int fCounter= 0;
71     private TextPresentation fTextPresentation;
72     private int fBold= 0;
73     private int fStartOffset= -1;
74     private boolean fInParagraph= false;
75     private boolean fIsPreformattedText= false;
76     private boolean fIgnore= false;
77
78     /**
79      * Transforms the HTML text from the reader to formatted text.
80      *
81      * @param reader the reader
82      * @param presentation If not <code>null</code>, formattings will be applied to
83      * the presentation.
84     */

85     public HTML2TextReader(Reader JavaDoc reader, TextPresentation presentation) {
86         super(new PushbackReader JavaDoc(reader));
87         fTextPresentation= presentation;
88     }
89
90     public int read() throws IOException JavaDoc {
91         int c= super.read();
92         if (c != -1)
93             ++ fCounter;
94         return c;
95     }
96
97     protected void startBold() {
98         if (fBold == 0)
99             fStartOffset= fCounter;
100         ++ fBold;
101     }
102
103     protected void startPreformattedText() {
104         fIsPreformattedText= true;
105         setSkipWhitespace(false);
106     }
107
108     protected void stopPreformattedText() {
109         fIsPreformattedText= false;
110         setSkipWhitespace(true);
111     }
112
113     protected void stopBold() {
114         -- fBold;
115         if (fBold == 0) {
116             if (fTextPresentation != null) {
117                 fTextPresentation.addStyleRange(new StyleRange(fStartOffset, fCounter - fStartOffset, null, null, SWT.BOLD));
118             }
119             fStartOffset= -1;
120         }
121     }
122
123     /*
124      * @see org.eclipse.jdt.internal.ui.text.SubstitutionTextReader#computeSubstitution(int)
125      */

126     protected String JavaDoc computeSubstitution(int c) throws IOException JavaDoc {
127
128         if (c == '<')
129             return processHTMLTag();
130         else if (fIgnore)
131             return EMPTY_STRING;
132         else if (c == '&')
133             return processEntity();
134         else if (fIsPreformattedText)
135             return processPreformattedText(c);
136
137         return null;
138     }
139
140     private String JavaDoc html2Text(String JavaDoc html) {
141
142         if (html == null || html.length() == 0)
143             return EMPTY_STRING;
144
145         html= html.toLowerCase();
146         
147         String JavaDoc tag= html;
148         if ('/' == tag.charAt(0))
149             tag= tag.substring(1);
150
151         if (!fgTags.contains(tag))
152             return EMPTY_STRING;
153
154
155         if ("pre".equals(html)) { //$NON-NLS-1$
156
startPreformattedText();
157             return EMPTY_STRING;
158         }
159
160         if ("/pre".equals(html)) { //$NON-NLS-1$
161
stopPreformattedText();
162             return EMPTY_STRING;
163         }
164
165         if (fIsPreformattedText)
166             return EMPTY_STRING;
167
168         if ("b".equals(html)) { //$NON-NLS-1$
169
startBold();
170             return EMPTY_STRING;
171         }
172
173         if ((html.length() > 1 && html.charAt(0) == 'h' && Character.isDigit(html.charAt(1))) || "dt".equals(html)) { //$NON-NLS-1$
174
startBold();
175             return EMPTY_STRING;
176         }
177
178         if ("dl".equals(html)) //$NON-NLS-1$
179
return LINE_DELIM;
180
181         if ("dd".equals(html)) //$NON-NLS-1$
182
return "\t"; //$NON-NLS-1$
183

184         if ("li".equals(html)) //$NON-NLS-1$
185
// FIXME: this hard-coded prefix does not work for RTL languages, see https://bugs.eclipse.org/bugs/show_bug.cgi?id=91682
186
return LINE_DELIM + JavaUIMessages.HTML2TextReader_listItemPrefix;
187
188         if ("/b".equals(html)) { //$NON-NLS-1$
189
stopBold();
190             return EMPTY_STRING;
191         }
192
193         if ("p".equals(html)) { //$NON-NLS-1$
194
fInParagraph= true;
195             return LINE_DELIM;
196         }
197
198         if ("br".equals(html) || "br/".equals(html) || "div".equals(html)) //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$
199
return LINE_DELIM;
200
201         if ("/p".equals(html)) { //$NON-NLS-1$
202
boolean inParagraph= fInParagraph;
203             fInParagraph= false;
204             return inParagraph ? EMPTY_STRING : LINE_DELIM;
205         }
206
207         if ((html.startsWith("/h") && html.length() > 2 && Character.isDigit(html.charAt(2))) || "/dt".equals(html)) { //$NON-NLS-1$ //$NON-NLS-2$
208
stopBold();
209             return LINE_DELIM;
210         }
211
212         if ("/dd".equals(html)) //$NON-NLS-1$
213
return LINE_DELIM;
214         
215         if ("head".equals(html)) { //$NON-NLS-1$
216
fIgnore= true;
217             return EMPTY_STRING;
218         }
219         
220         if ("/head".equals(html)) { //$NON-NLS-1$
221
fIgnore= false;
222             return EMPTY_STRING;
223         }
224
225         return EMPTY_STRING;
226     }
227
228     /*
229      * A '<' has been read. Process a html tag
230      */

231     private String JavaDoc processHTMLTag() throws IOException JavaDoc {
232
233         StringBuffer JavaDoc buf= new StringBuffer JavaDoc();
234         int ch;
235         do {
236
237             ch= nextChar();
238
239             while (ch != -1 && ch != '>') {
240                 buf.append(Character.toLowerCase((char) ch));
241                 ch= nextChar();
242                 if (ch == '"'){
243                     buf.append(Character.toLowerCase((char) ch));
244                     ch= nextChar();
245                     while (ch != -1 && ch != '"'){
246                         buf.append(Character.toLowerCase((char) ch));
247                         ch= nextChar();
248                     }
249                 }
250                 if (ch == '<'){
251                     unread(ch);
252                     return '<' + buf.toString();
253                 }
254             }
255
256             if (ch == -1)
257                 return null;
258
259             int tagLen= buf.length();
260             // needs special treatment for comments
261
if ((tagLen >= 3 && "!--".equals(buf.substring(0, 3))) //$NON-NLS-1$
262
&& !(tagLen >= 5 && "--".equals(buf.substring(tagLen - 2)))) { //$NON-NLS-1$
263
// unfinished comment
264
buf.append(ch);
265             } else {
266                 break;
267             }
268         } while (true);
269
270         return html2Text(buf.toString());
271     }
272
273     private String JavaDoc processPreformattedText(int c) {
274         if (c == '\r' || c == '\n')
275             fCounter++;
276         return null;
277     }
278
279
280     private void unread(int ch) throws IOException JavaDoc {
281         ((PushbackReader JavaDoc) getReader()).unread(ch);
282     }
283
284     protected String JavaDoc entity2Text(String JavaDoc symbol) {
285         if (symbol.length() > 1 && symbol.charAt(0) == '#') {
286             int ch;
287             try {
288                 if (symbol.charAt(1) == 'x') {
289                     ch= Integer.parseInt(symbol.substring(2), 16);
290                 } else {
291                     ch= Integer.parseInt(symbol.substring(1), 10);
292                 }
293                 return EMPTY_STRING + (char)ch;
294             } catch (NumberFormatException JavaDoc e) {
295             }
296         } else {
297             String JavaDoc str= (String JavaDoc) fgEntityLookup.get(symbol);
298             if (str != null) {
299                 return str;
300             }
301         }
302         return "&" + symbol; // not found //$NON-NLS-1$
303
}
304
305     /*
306      * A '&' has been read. Process a entity
307      */

308     private String JavaDoc processEntity() throws IOException JavaDoc {
309         StringBuffer JavaDoc buf= new StringBuffer JavaDoc();
310         int ch= nextChar();
311         while (Character.isLetterOrDigit((char)ch) || ch == '#') {
312             buf.append((char) ch);
313             ch= nextChar();
314         }
315
316         if (ch == ';')
317             return entity2Text(buf.toString());
318
319         buf.insert(0, '&');
320         if (ch != -1)
321             buf.append((char) ch);
322         return buf.toString();
323     }
324 }
325
Popular Tags