KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > jahia > services > search > BigTextFieldExtractor


1 package org.jahia.services.search;
2
3 import java.io.Serializable JavaDoc;
4
5 import org.htmlparser.Tag;
6 import org.htmlparser.Text;
7 import org.htmlparser.tags.LinkTag;
8 import org.htmlparser.util.Translate;
9 import org.htmlparser.visitors.NodeVisitor;
10
11 /**
12  * Extract strings from the specified HTML text. This class is a modification of
13  * the {@link org.htmlparser.beans.StringBean} bean from <code>htmlparser</code>
14  * package.
15  *
16  * @author Sergiy Shyrkov
17  * @see org.htmlparser.beans.StringBean
18  */

19 public class BigTextFieldExtractor extends NodeVisitor implements Serializable JavaDoc
20 {
21   /**
22    * A newline.
23    */

24   private static final String JavaDoc NEWLINE = System.getProperty("line.separator");
25
26   /**
27    * The length of the NEWLINE.
28    */

29   private static final int NEWLINE_SIZE = NEWLINE.length();
30
31   /**
32    * The buffer text is stored in while traversing the HTML.
33    */

34   private StringBuffer JavaDoc mBuffer;
35
36   /**
37    * If <code>true</code> sequences of whitespace characters are replaced with
38    * a single space character.
39    */

40   private boolean mCollapse;
41
42   /**
43    * Set <code>true</code> when traversing a PRE tag.
44    */

45   private boolean mIsPre;
46
47   /**
48    * Set <code>true</code> when traversing a SCRIPT tag.
49    */

50   private boolean mIsScript;
51
52   /**
53    * Set <code>true</code> when traversing a STYLE tag.
54    */

55   private boolean mIsStyle;
56
57   /**
58    * If <code>true</code> the link URLs are embedded in the text output.
59    */

60   private boolean mLinks;
61
62   /**
63    * If <code>true</code> regular space characters are substituted for
64    * non-breaking spaces in the text output.
65    */

66   private boolean mReplaceSpace;
67
68   /**
69    * Create a BigTextFieldExtractor object. Default property values are set to
70    * 'do the right thing':
71    * <p>
72    * <code>Links</code> is set <code>false</code> so text appears like a
73    * browser would display it, albeit without the colour or underline clues
74    * normally associated with a link.
75    * </p>
76    * <p>
77    * <code>ReplaceNonBreakingSpaces</code> is set <code>true</code>, so
78    * that printing the text works, but the extra information regarding these
79    * formatting marks is available if you set it false.
80    * </p>
81    * <p>
82    * <code>Collapse</code> is set <code>true</code>, so text appears
83    * compact like a browser would display it.
84    * </p>
85    */

86   public BigTextFieldExtractor()
87   {
88     super(true, true);
89     mLinks = false;
90     mReplaceSpace = true;
91     mCollapse = true;
92     mBuffer = new StringBuffer JavaDoc(4096);
93     mIsScript = false;
94     mIsPre = false;
95     mIsStyle = false;
96   }
97
98   /**
99    * Create a BigTextFieldExtractor object.
100    *
101    * @param links Use <code>true</code> if link text is to be included in the
102    * text extracted from the URL, <code>false</code> otherwise
103    * @param replace <code>true</code> if non-breaking spaces (character
104    * '&#92;u00a0', numeric character reference &amp;#160; or character
105    * entity reference &amp;nbsp;) are to be replaced with normal spaces
106    * (character '&#92;u0020')
107    * @param collapse If <code>true</code>, sequences of whitespace will be
108    * reduced to a single space
109    */

110   public BigTextFieldExtractor(boolean links, boolean replace, boolean collapse)
111   {
112     super(true, true);
113     mLinks = links;
114     mReplaceSpace = replace;
115     mCollapse = collapse;
116     mBuffer = new StringBuffer JavaDoc(4096);
117     mIsScript = false;
118     mIsPre = false;
119     mIsStyle = false;
120   }
121
122   /**
123    * Add the given text collapsing whitespace. Use a little finite state
124    * machine:
125    *
126    * <pre>
127    * state 0: whitepace was last emitted character
128    * state 1: in whitespace
129    * state 2: in word
130    * A whitespace character moves us to state 1 and any other character
131    * moves us to state 2, except that state 0 stays in state 0 until
132    * a non-whitespace and going from whitespace to word we emit a space
133    * before the character:
134    * input: whitespace other-character
135    * state\next
136    * 0 0 2
137    * 1 1 space then 2
138    * 2 1 2
139    * </pre>
140    *
141    * @param buffer The buffer to append to.
142    * @param string The string to append.
143    */

144   private void collapse(StringBuffer JavaDoc buffer, String JavaDoc string)
145   {
146     int chars;
147     int length;
148     int state;
149     char character;
150
151     chars = string.length();
152     if (0 != chars)
153     {
154       length = buffer.length();
155       state = ((0 == length) || (buffer.charAt(length - 1) == ' ') || ((NEWLINE_SIZE <= length) && buffer
156         .substring(length - NEWLINE_SIZE, length).equals(NEWLINE))) ? 0 : 1;
157       for (int i = 0; i < chars; i++)
158       {
159         character = string.charAt(i);
160         switch (character)
161         {
162           // see HTML specification section 9.1 White space
163
// http://www.w3.org/TR/html4/struct/text.html#h-9.1
164
case '\u0020':
165           case '\u0009':
166           case '\u000C':
167           case '\u200B':
168           case '\r':
169           case '\n':
170             if (0 != state)
171               state = 1;
172             break;
173           default:
174             if (1 == state)
175               buffer.append(' ');
176             state = 2;
177             buffer.append(character);
178         }
179       }
180     }
181   }
182
183   /**
184    * Return the extracted textual contents of the initial HTML.
185    *
186    * @return the extracted textual contents of the initial HTM
187    */

188   public String JavaDoc getExtractedText()
189   {
190     return mBuffer.toString();
191   }
192
193   /**
194    * Resets the extracted text.
195    */

196   public void reset()
197   {
198     mBuffer.delete(0, mBuffer.length());
199     mIsScript = false;
200     mIsPre = false;
201     mIsStyle = false;
202   }
203
204   /**
205    * Resets the state of the PRE and SCRIPT flags.
206    *
207    * @param tag The end tag to process.
208    */

209   public void visitEndTag(Tag tag)
210   {
211     String JavaDoc name;
212
213     name = tag.getTagName();
214     if (name.equalsIgnoreCase("PRE"))
215       mIsPre = false;
216     else if (name.equalsIgnoreCase("SCRIPT"))
217       mIsScript = false;
218     else if (name.equalsIgnoreCase("STYLE"))
219       mIsStyle = false;
220   }
221
222   /**
223    * Appends the text to the output.
224    *
225    * @param string The text node.
226    */

227   public void visitStringNode(Text string)
228   {
229     if (!mIsScript && !mIsStyle)
230     {
231       String JavaDoc text = string.getText();
232       if (!mIsPre)
233       {
234         text = Translate.decode(text);
235         if (mReplaceSpace)
236           text = text.replace('\u00a0', ' ');
237         if (mCollapse)
238           collapse(mBuffer, text);
239         else
240           mBuffer.append(text);
241       }
242       else
243         mBuffer.append(text);
244     }
245   }
246
247   /**
248    * Appends a NEWLINE to the output if the tag breaks flow, and possibly sets
249    * the state of the PRE and SCRIPT flags.
250    *
251    * @param tag The tag to examine.
252    */

253   public void visitTag(Tag tag)
254   {
255     String JavaDoc name;
256
257     if (tag instanceof LinkTag)
258       if (mLinks)
259       { // appends the link as text between angle brackets to the output.
260
mBuffer.append("<");
261         mBuffer.append(((LinkTag)tag).getLink());
262         mBuffer.append(">");
263       }
264     name = tag.getTagName();
265     if (name.equalsIgnoreCase("PRE"))
266       mIsPre = true;
267     else if (name.equalsIgnoreCase("SCRIPT"))
268       mIsScript = true;
269     else if (name.equalsIgnoreCase("STYLE"))
270       mIsStyle = true;
271     if (tag.breaksFlow())
272       mBuffer.append(' ');
273   }
274 }
275
Popular Tags