1 package org.jahia.services.search; 2 3 import java.io.Serializable ; 4 5 import org.htmlparser.Tag; 6 import org.htmlparser.Text; 7 import org.htmlparser.tags.LinkTag; 8 import org.htmlparser.util.Translate; 9 import org.htmlparser.visitors.NodeVisitor; 10 11 19 public class BigTextFieldExtractor extends NodeVisitor implements Serializable 20 { 21 24 private static final String NEWLINE = System.getProperty("line.separator"); 25 26 29 private static final int NEWLINE_SIZE = NEWLINE.length(); 30 31 34 private StringBuffer mBuffer; 35 36 40 private boolean mCollapse; 41 42 45 private boolean mIsPre; 46 47 50 private boolean mIsScript; 51 52 55 private boolean mIsStyle; 56 57 60 private boolean mLinks; 61 62 66 private boolean mReplaceSpace; 67 68 86 public BigTextFieldExtractor() 87 { 88 super(true, true); 89 mLinks = false; 90 mReplaceSpace = true; 91 mCollapse = true; 92 mBuffer = new StringBuffer (4096); 93 mIsScript = false; 94 mIsPre = false; 95 mIsStyle = false; 96 } 97 98 110 public BigTextFieldExtractor(boolean links, boolean replace, boolean collapse) 111 { 112 super(true, true); 113 mLinks = links; 114 mReplaceSpace = replace; 115 mCollapse = collapse; 116 mBuffer = new StringBuffer (4096); 117 mIsScript = false; 118 mIsPre = false; 119 mIsStyle = false; 120 } 121 122 144 private void collapse(StringBuffer buffer, String string) 145 { 146 int chars; 147 int length; 148 int state; 149 char character; 150 151 chars = string.length(); 152 if (0 != chars) 153 { 154 length = buffer.length(); 155 state = ((0 == length) || (buffer.charAt(length - 1) == ' ') || ((NEWLINE_SIZE <= length) && buffer 156 .substring(length - NEWLINE_SIZE, length).equals(NEWLINE))) ? 0 : 1; 157 for (int i = 0; i < chars; i++) 158 { 159 character = string.charAt(i); 160 switch (character) 161 { 162 case '\u0020': 165 case '\u0009': 166 case '\u000C': 167 case '\u200B': 168 case '\r': 169 case '\n': 170 if (0 != state) 171 state = 1; 172 break; 173 default: 174 if (1 == state) 175 buffer.append(' '); 176 state = 2; 177 buffer.append(character); 178 } 179 } 180 } 181 } 182 183 188 public String getExtractedText() 189 { 190 return mBuffer.toString(); 191 } 192 193 196 public void reset() 197 { 198 mBuffer.delete(0, mBuffer.length()); 199 mIsScript = false; 200 mIsPre = false; 201 mIsStyle = false; 202 } 203 204 209 public void visitEndTag(Tag tag) 210 { 211 String name; 212 213 name = tag.getTagName(); 214 if (name.equalsIgnoreCase("PRE")) 215 mIsPre = false; 216 else if (name.equalsIgnoreCase("SCRIPT")) 217 mIsScript = false; 218 else if (name.equalsIgnoreCase("STYLE")) 219 mIsStyle = false; 220 } 221 222 227 public void visitStringNode(Text string) 228 { 229 if (!mIsScript && !mIsStyle) 230 { 231 String text = string.getText(); 232 if (!mIsPre) 233 { 234 text = Translate.decode(text); 235 if (mReplaceSpace) 236 text = text.replace('\u00a0', ' '); 237 if (mCollapse) 238 collapse(mBuffer, text); 239 else 240 mBuffer.append(text); 241 } 242 else 243 mBuffer.append(text); 244 } 245 } 246 247 253 public void visitTag(Tag tag) 254 { 255 String name; 256 257 if (tag instanceof LinkTag) 258 if (mLinks) 259 { mBuffer.append("<"); 261 mBuffer.append(((LinkTag)tag).getLink()); 262 mBuffer.append(">"); 263 } 264 name = tag.getTagName(); 265 if (name.equalsIgnoreCase("PRE")) 266 mIsPre = true; 267 else if (name.equalsIgnoreCase("SCRIPT")) 268 mIsScript = true; 269 else if (name.equalsIgnoreCase("STYLE")) 270 mIsStyle = true; 271 if (tag.breaksFlow()) 272 mBuffer.append(' '); 273 } 274 } 275 | Popular Tags |