KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > opencms > util > CmsHtml2TextConverter


1
2 package org.opencms.util;
3
4
5 import java.util.HashMap JavaDoc;
6 import java.util.Iterator JavaDoc;
7 import java.util.List JavaDoc;
8 import java.util.Map JavaDoc;
9
10 import org.htmlparser.Tag;
11 import org.htmlparser.Text;
12 import org.htmlparser.util.Translate;
13
14 /**
15  * Extracts the HTML page content.<p>
16  */

17 public class CmsHtml2TextConverter extends CmsHtmlParser {
18
19     /** Indicated to append or store the next line breaks. */
20     private boolean m_appendBr;
21
22     /** Map of stored attributes that must bw written to the output when the tag closes. */
23     private Map JavaDoc m_attributeMap;
24
25     /** The last appended line break count. */
26     private int m_brCount;
27
28     /** The current indentation. */
29     private int m_indent;
30
31     /** The current line length. */
32     private int m_lineLength;
33
34     /** The marker String (for headlines, bullets etc.). */
35     private String JavaDoc m_marker;
36
37     /** The maximum line length. */
38     private int m_maxLineLength;
39
40     /** The last stored, but not appended line break count. */
41     private int m_storedBrCount;
42
43     /**
44      * Creates a new instance of the html converter.<p>
45      */

46     public CmsHtml2TextConverter() {
47
48         m_result = new StringBuffer JavaDoc(512);
49         m_maxLineLength = 100;
50         m_attributeMap = new HashMap JavaDoc(16);
51     }
52
53     /**
54      * Extracts the text from the given html content, assuming the given html encoding.<p>
55      *
56      * @param html the content to extract the plain text from
57      * @param encoding the encoding to use
58      *
59      * @return the text extracted from the given html content
60      *
61      * @throws Exception if something goes wrong
62      */

63     public static String JavaDoc html2text(String JavaDoc html, String JavaDoc encoding) throws Exception JavaDoc {
64
65         // create the converter instance
66
CmsHtml2TextConverter visitor = new CmsHtml2TextConverter();
67         return visitor.process(html, encoding);
68     }
69
70     /**
71      * @see org.htmlparser.visitors.NodeVisitor#visitEndTag(org.htmlparser.Tag)
72      */

73     public void visitEndTag(Tag tag) {
74
75         m_appendBr = false;
76         appendLinebreaks(tag, false);
77         String JavaDoc attribute = (String JavaDoc)m_attributeMap.remove(tag.getParent());
78         if (attribute != null) {
79             appendText(attribute);
80         }
81     }
82
83     /**
84      * @see org.htmlparser.visitors.NodeVisitor#visitStringNode(org.htmlparser.Text)
85      */

86     public void visitStringNode(Text text) {
87
88         appendText(text.toPlainTextString());
89     }
90
91     /**
92      * @see org.htmlparser.visitors.NodeVisitor#visitTag(org.htmlparser.Tag)
93      */

94     public void visitTag(Tag tag) {
95
96         m_appendBr = true;
97         appendLinebreaks(tag, true);
98
99         if (tag.getTagName().equals("IMG")) {
100             appendText("##IMG##");
101         }
102
103         String JavaDoc href = tag.getAttribute("href");
104         if (CmsStringUtil.isNotEmptyOrWhitespaceOnly(href)) {
105             appendAttribute(tag, " [" + href.trim() + "]");
106         }
107         String JavaDoc src = tag.getAttribute("src");
108         if (CmsStringUtil.isNotEmptyOrWhitespaceOnly(src)) {
109             appendAttribute(tag, " [" + src.trim() + "]");
110         }
111         String JavaDoc title = tag.getAttribute("title");
112         if (CmsStringUtil.isNotEmptyOrWhitespaceOnly(title)) {
113             appendAttribute(tag, " {" + title.trim() + "}");
114         }
115         String JavaDoc alt = tag.getAttribute("alt");
116         if (CmsStringUtil.isNotEmptyOrWhitespaceOnly(alt)) {
117             appendAttribute(tag, " {" + alt.trim() + "}");
118         }
119     }
120
121     private void appendAttribute(Tag tag, String JavaDoc text) {
122
123         if (tag.getTagName().equals("IMG")) {
124             appendText(text);
125         } else {
126             String JavaDoc current = (String JavaDoc)m_attributeMap.get(tag);
127             if (current != null) {
128                 text = current + text;
129             }
130             m_attributeMap.put(tag, text);
131         }
132     }
133
134     private void appendIndentation() {
135
136         if (m_lineLength <= m_indent) {
137             int len = (m_marker != null) ? m_indent - (m_marker.length() + 1) : m_indent;
138             for (int i = 0; i < len; i++) {
139                 m_result.append(' ');
140             }
141             if (m_marker != null) {
142                 m_result.append(m_marker);
143                 m_result.append(' ');
144                 m_marker = null;
145             }
146         }
147     }
148
149     private void appendLinebreak(int count) {
150
151         appendLinebreak(count, false);
152     }
153
154     private void appendLinebreak(int count, boolean force) {
155
156         if (m_appendBr) {
157             if (m_storedBrCount > count) {
158                 count = m_storedBrCount;
159             }
160             m_storedBrCount = 0;
161             if (force) {
162                 m_brCount = 0;
163             }
164             while (m_brCount < count) {
165                 m_result.append("\r\n");
166                 m_brCount++;
167             }
168             m_lineLength = m_indent;
169         } else {
170             while (m_storedBrCount < count) {
171                 m_storedBrCount++;
172             }
173         }
174     }
175
176     private void appendLinebreaks(Tag tag, boolean open) {
177
178         String JavaDoc name = tag.getTagName();
179         int pos = TAG_LIST.indexOf(name);
180
181         switch (pos) {
182             case 0: // H1
183
setMarker("=", open);
184                 setIndentation(2, open);
185                 appendLinebreak(2);
186                 break;
187             case 1: // H2
188
setMarker("==", open);
189                 setIndentation(3, open);
190                 appendLinebreak(2);
191                 break;
192             case 2: // H3
193
setMarker("===", open);
194                 setIndentation(4, open);
195                 appendLinebreak(2);
196                 break;
197             case 3: // H4
198
setMarker("====", open);
199                 setIndentation(5, open);
200                 appendLinebreak(2);
201                 break;
202             case 4: // H5
203
setMarker("=====", open);
204                 setIndentation(6, open);
205                 appendLinebreak(2);
206                 break;
207             case 5: // H6
208
setMarker("=======", open);
209                 setIndentation(7, open);
210                 appendLinebreak(2);
211                 break;
212             case 6: // P
213
case 7: // DIV
214
appendLinebreak(2);
215                 break;
216             case 8: // SPAN
217
break;
218             case 9: // BR
219
appendLinebreak(1, true);
220                 break;
221             case 10: // OL
222
case 11: // UL
223
appendLinebreak(2);
224                 break;
225             case 12: // LI
226
setMarker("*", open);
227                 setIndentation(5, open);
228                 appendLinebreak(1);
229                 break;
230             case 13: // TABLE
231
setIndentation(5, open);
232                 appendLinebreak(2);
233                 if (open) {
234                     appendLinebreak(1);
235                     appendText("-----");
236                     appendLinebreak(1);
237                 }
238                 break;
239             case 14: // TD
240
setMarker("--", open);
241                 appendLinebreak(2);
242                 break;
243             case 15: // TR
244
if (!open) {
245                     appendLinebreak(1);
246                     appendText("-----");
247                     appendLinebreak(1);
248                 }
249                 break;
250             case 16: // TH
251
case 17: // THEAD
252
case 18: // TBODY
253
case 19: // TFOOT
254
appendLinebreak(1);
255                 break;
256             default: // unknown tag (ignore)
257
}
258     }
259
260     private void appendText(String JavaDoc text) {
261
262         if (CmsStringUtil.isNotEmptyOrWhitespaceOnly(text)) {
263             text = Translate.decode(text);
264             text = collapse(text);
265         }
266         if (CmsStringUtil.isNotEmptyOrWhitespaceOnly(text)) {
267
268             if (m_storedBrCount > 0) {
269                 m_appendBr = true;
270                 appendLinebreak(m_storedBrCount);
271             }
272             appendIndentation();
273             m_brCount = 0;
274
275             List JavaDoc wordList = CmsStringUtil.splitAsList(text, ' ');
276             Iterator JavaDoc i = wordList.iterator();
277             while (i.hasNext()) {
278                 String JavaDoc word = (String JavaDoc)i.next();
279                 boolean hasNbsp = ((word.charAt(0) == 160) || (word.charAt(word.length() - 1) == 160));
280                 if ((word.length() + 1 + m_lineLength) > m_maxLineLength) {
281                     m_appendBr = true;
282                     appendLinebreak(1);
283                     appendIndentation();
284                     m_brCount = 0;
285                 } else {
286                     if (!hasNbsp
287                         && (m_lineLength > m_indent)
288                         && (m_result.charAt(m_result.length() - 1) != 160)
289                         && (m_result.charAt(m_result.length() - 1) != 32)) {
290
291                         m_result.append(' ');
292                         m_lineLength++;
293                     }
294                 }
295                 m_result.append(word);
296                 m_lineLength += word.length();
297             }
298         }
299     }
300
301     private void setIndentation(int length, boolean open) {
302
303         if (open) {
304             m_indent += length;
305         } else {
306             m_indent -= length;
307             if (m_indent < 0) {
308                 m_indent = 0;
309             }
310         }
311     }
312
313     private void setMarker(String JavaDoc marker, boolean open) {
314
315         if (open) {
316             m_marker = marker;
317         }
318     }
319 }
Popular Tags