KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > opencms > util > StringBean


1
2 package org.opencms.util;
3
import java.io.Serializable;
import java.util.Locale;

import org.htmlparser.Node;
import org.htmlparser.Tag;
import org.htmlparser.Text;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.util.Translate;
import org.htmlparser.visitors.NodeVisitor;
12
13 /**
14  * Extracts the HTML page content.<p>
15  */

16 public class StringBean extends NodeVisitor implements Serializable JavaDoc {
17
18     /**
19      * A newline.
20      */

21     private static final String JavaDoc NEWLINE = System.getProperty("line.separator");
22
23     /**
24      * The length of the NEWLINE.
25      */

26     private static final int NEWLINE_SIZE = NEWLINE.length();
27
28     private static final long serialVersionUID = 1596190888769126925L;
29
30     /**
31      * The buffer text is stored in while traversing the HTML.
32      */

33     protected StringBuffer JavaDoc m_buffer;
34
35     /**
36      * If <code>true</code> sequences of whitespace characters are replaced
37      * with a single space character.
38      */

39     protected boolean m_collapse;
40
41     /**
42      * Set <code>true</code> when traversing a PRE tag.
43      */

44     protected boolean m_isPre;
45
46     /**
47      * Set <code>true</code> when traversing a SCRIPT tag.
48      */

49     protected boolean m_isScript;
50
51     /**
52      * Set <code>true</code> when traversing a STYLE tag.
53      */

54     protected boolean m_isStyle;
55
56     /**
57      * If <code>true</code> the link URLs are embedded in the text output.
58      */

59     protected boolean m_links;
60
61     /**
62      * The strings extracted from the URL.
63      */

64     protected String JavaDoc m_strings;
65
66     /**
67      * Create a StringBean object.
68      * Default property values are set to 'do the right thing':
69      * <p><code>Links</code> is set <code>false</code> so text appears like a
70      * browser would display it, albeit without the colour or underline clues
71      * normally associated with a link.</p>
72      * <p><code>ReplaceNonBreakingSpaces</code> is set <code>true</code>, so
73      * that printing the text works, but the extra information regarding these
74      * formatting marks is available if you set it false.</p>
75      * <p><code>Collapse</code> is set <code>true</code>, so text appears
76      * compact like a browser would display it.</p>
77      */

78     public StringBean() {
79
80         super(true, true);
81         m_strings = null;
82         m_links = false;
83         m_collapse = true;
84         m_buffer = new StringBuffer JavaDoc(4096);
85         m_isScript = false;
86         m_isPre = false;
87         m_isStyle = false;
88     }
89
90     /**
91      * Get the current 'collapse whitespace' state.
92      * If set to <code>true</code> this emulates the operation of browsers
93      * in interpretting text where <quote>user agents should collapse input
94      * white space sequences when producing output inter-word space</quote>.
95      * See HTML specification section 9.1 White space
96      * <a HREF="http://www.w3.org/TR/html4/struct/text.html#h-9.1">
97      * http://www.w3.org/TR/html4/struct/text.html#h-9.1</a>.
98      * @return <code>true</code> if sequences of whitespace (space '&#92;u0020',
99      * tab '&#92;u0009', form feed '&#92;u000C', zero-width space '&#92;u200B',
100      * carriage-return '\r' and NEWLINE '\n') are to be replaced with a single
101      * space.
102      */

103     public boolean getCollapse() {
104
105         return (m_collapse);
106     }
107
108     /**
109      * Get the current 'include links' state.
110      * @return <code>true</code> if link text is included in the text extracted
111      * from the URL, <code>false</code> otherwise.
112      */

113     public boolean getLinks() {
114
115         return (m_links);
116     }
117
118     /**
119      * Return the textual contents of the URL.
120      * This is the primary output of the bean.
121      * @return The user visible (what would be seen in a browser) text.
122      */

123     public String JavaDoc getStrings() {
124
125         if (null == m_strings) {
126             if (0 == m_buffer.length()) {
127                 setStrings();
128             } else {
129                 updateStrings(m_buffer.toString());
130             }
131         }
132
133         return (m_strings);
134     }
135
136     /**
137      * Set the current 'collapse whitespace' state.
138      * If the setting is changed after the URL has been set, the text from the
139      * URL will be reacquired, which is possibly expensive.
140      * @param collapse If <code>true</code>, sequences of whitespace
141      * will be reduced to a single space.
142      */

143     public void setCollapse(boolean collapse) {
144
145         boolean oldValue = m_collapse;
146         if (oldValue != collapse) {
147             m_collapse = collapse;
148             setStrings();
149         }
150     }
151
152     /**
153      * Set the 'include links' state.
154      * If the setting is changed after the URL has been set, the text from the
155      * URL will be reacquired, which is possibly expensive.
156      * @param links Use <code>true</code> if link text is to be included in the
157      * text extracted from the URL, <code>false</code> otherwise.
158      */

159     public void setLinks(boolean links) {
160
161         boolean oldValue = m_links;
162         if (oldValue != links) {
163             m_links = links;
164             setStrings();
165         }
166     }
167
168     /**
169      * Resets the state of the PRE and SCRIPT flags.
170      * @param tag The end tag to process.
171      */

172     public void visitEndTag(Tag tag) {
173         
174         Node parent = tag.getParent();
175         if (parent instanceof LinkTag) {
176             if (getLinks()) { // appends the link as text between angle brackets to the output.
177
m_buffer.append(" <");
178                 m_buffer.append(((LinkTag)parent).getLink());
179                 m_buffer.append(">");
180             }
181         }
182         
183         String JavaDoc name = tag.getTagName().toUpperCase();
184         if (name.equals("PRE")) {
185             m_isPre = false;
186         } else if (name.equals("SCRIPT")) {
187             m_isScript = false;
188         } else if (name.equals("STYLE")) {
189             m_isStyle = false;
190         }
191         
192         if (isHeadTag(name)) {
193             carriageReturn();
194             carriageReturn(true);
195         }
196         
197         if (isTitleTag(name)) {
198             m_buffer.append(" ]");
199             carriageReturn();
200             carriageReturn(true);
201         }
202     }
203
204     private boolean isTitleTag(String JavaDoc name) {
205         
206         return "TITLE".equals(name);
207     }
208     
209     private boolean isHeadTag(String JavaDoc name) {
210
211         return "H1".equals(name)
212             || "H2".equals(name)
213             || "H3".equals(name)
214             || "H4".equals(name)
215             || "H5".equals(name)
216             || "H6".equals(name);
217     }
218
219     /**
220      * Appends the text to the output.
221      * @param string The text node.
222      */

223     public void visitStringNode(Text string) {
224
225         if (!m_isScript && !m_isStyle) {
226             String JavaDoc text = string.getText();
227             if (!m_isPre) {
228                 text = Translate.decode(text);
229                 text = text.replace('\u00a0', ' ');
230                 if (getCollapse()) {
231                     collapse(m_buffer, text);
232                 } else {
233                     m_buffer.append(text);
234                 }
235             } else {
236                 m_buffer.append(text);
237             }
238         }
239     }
240
241     /**
242      * Appends a NEWLINE to the output if the tag breaks flow, and
243      * possibly sets the state of the PRE and SCRIPT flags.
244      * @param tag The tag to examine.
245      */

246     public void visitTag(Tag tag) {
247             
248         String JavaDoc name = tag.getTagName();
249         if (name.equalsIgnoreCase("PRE")) {
250             m_isPre = true;
251         } else if (name.equalsIgnoreCase("SCRIPT")) {
252             m_isScript = true;
253         } else if (name.equalsIgnoreCase("STYLE")) {
254             m_isStyle = true;
255         }
256         
257         if (isHeadTag(name)) {
258             carriageReturn(true);
259             m_buffer.append("* ");
260         } else if (isTitleTag(name)) {
261             m_buffer.append("[ ");
262         } else {
263             if (tag.breaksFlow()) {
264                 carriageReturn();
265             }
266         }
267         
268     }
269
270     /**
271      * Appends a newline to the buffer if there isn't one there already.
272      * Except if the buffer is empty.
273      */

274     protected void carriageReturn() {
275
276         carriageReturn(false);
277     }
278
279     /**
280      * Appends a newline to the buffer if there isn't one there already.
281      * Except if the buffer is empty.
282      *
283      * @param check a parameter the developer forgot to comment
284      */

285     protected void carriageReturn(boolean check) {
286
287         int length;
288
289         length = m_buffer.length();
290         if ((0 != length) // don't append newlines to the beginning of a buffer
291
&& (check || ((NEWLINE_SIZE <= length) // not enough chars to hold a NEWLINE
292
&& (!m_buffer.substring(length - NEWLINE_SIZE, length).equals(NEWLINE))))) {
293
294             m_buffer.append(NEWLINE);
295         }
296     }
297     
298     /**
299      * Add the given text collapsing whitespace.
300      * Use a little finite state machine:
301      * <pre>
302      * state 0: whitepace was last emitted character
303      * state 1: in whitespace
304      * state 2: in word
305      * A whitespace character moves us to state 1 and any other character
306      * moves us to state 2, except that state 0 stays in state 0 until
307      * a non-whitespace and going from whitespace to word we emit a space
308      * before the character:
309      * input: whitespace other-character
310      * state\next
311      * 0 0 2
312      * 1 1 space then 2
313      * 2 1 2
314      * </pre>
315      * @param buffer The buffer to append to.
316      * @param string The string to append.
317      */

318     protected void collapse(StringBuffer JavaDoc buffer, String JavaDoc string) {
319
320         int chars;
321         int length;
322         int state;
323         char character;
324
325         chars = string.length();
326         if (0 != chars) {
327             length = buffer.length();
328             state = ((0 == length) || (buffer.charAt(length - 1) == ' ') || ((NEWLINE_SIZE <= length) && buffer.substring(
329                 length - NEWLINE_SIZE,
330                 length).equals(NEWLINE))) ? 0 : 1;
331             for (int i = 0; i < chars; i++) {
332                 character = string.charAt(i);
333                 switch (character) {
334                     // see HTML specification section 9.1 White space
335
// http://www.w3.org/TR/html4/struct/text.html#h-9.1
336
case '\u0020':
337                     case '\u0009':
338                     case '\u000C':
339                     case '\u200B':
340                     case '\r':
341                     case '\n':
342                         if (0 != state) {
343                             state = 1;
344                         }
345                         break;
346                     default:
347                         if (1 == state) {
348                             buffer.append(' ');
349                         }
350                         state = 2;
351                         buffer.append(character);
352                 }
353             }
354         }
355     }
356
357     /**
358      * Fetch the URL contents.
359      * Only do work if there is a valid parser with it's URL set.
360      */

361     protected void setStrings() {
362
363         m_strings = null;
364         m_buffer = new StringBuffer JavaDoc(4096);
365     }
366
367     /**
368      * Assign the <code>Strings</code> property, firing the property change.
369      * @param strings The new value of the <code>Strings</code> property.
370      */

371     protected void updateStrings(String JavaDoc strings) {
372
373         if ((null == m_strings) || !m_strings.equals(strings)) {
374             m_strings = strings;
375         }
376     }
377 }
378
Popular Tags