1 19 20 33 package org.htmlparser.parserHelper; 34 35 import java.util.StringTokenizer ; 36 37 import org.htmlparser.Node; 38 import org.htmlparser.NodeReader; 39 import org.htmlparser.tags.Tag; 40 import org.htmlparser.tags.data.TagData; 41 import org.htmlparser.util.ParserFeedback; 42 43 public class TagParser 44 { 45 public final static int TAG_BEFORE_PARSING_STATE = 1; 46 public final static int TAG_BEGIN_PARSING_STATE = 1 << 2; 47 public final static int TAG_FINISHED_PARSING_STATE = 1 << 3; 48 public final static int TAG_ILLEGAL_STATE = 1 << 4; 49 public final static int TAG_IGNORE_DATA_STATE = 1 << 5; 50 public final static int TAG_IGNORE_BEGIN_TAG_STATE = 1 << 6; 51 public final static int TAG_IGNORE_CHAR_SINGLE_QUOTE = 1 << 7; 52 53 public final static String ENCOUNTERED_QUERY_MESSAGE = 54 "TagParser : Encountered > after a query. Accepting without correction and continuing parsing"; 55 56 private ParserFeedback feedback; 57 58 public TagParser(ParserFeedback feedback) 59 { 60 this.feedback = feedback; 61 } 62 63 public Tag find(NodeReader reader, String input, int position) 64 { 65 int state = TAG_BEFORE_PARSING_STATE; 66 int i = position; 67 char ch; 68 char[] ignorechar = new char[1]; 69 Tag tag = 71 new Tag( 72 new TagData( 73 position, 74 0, 75 reader.getLastLineNumber(), 76 0, 77 "", 78 input, 79 "", 80 false)); 81 82 Bool encounteredQuery = new Bool(false); 83 while (i < tag.getTagLine().length() 84 && state != TAG_FINISHED_PARSING_STATE 85 && state != TAG_ILLEGAL_STATE) 86 { 87 ch = tag.getTagLine().charAt(i); 88 state = 89 automataInput( 90 encounteredQuery, 91 i, 92 state, 93 ch, 94 tag, 95 i, 96 ignorechar); 97 i = incrementCounter(i, reader, state, tag); 98 } 99 if (state == TAG_FINISHED_PARSING_STATE) 100 { 101 String tagLine = tag.getTagLine(); 102 if (i > 1 && tagLine.charAt(i - 2) == '/') 103 { 104 tag.setEmptyXmlTag(true); 105 String tagContents = tag.getText(); 106 tag.setText(tagContents.substring(0, tagContents.length() - 1)); 107 } 108 return tag; 109 } 110 else 111 return null; 112 } 113 114 private int automataInput( 115 Bool encounteredQuery, 116 int i, 117 int state, 118 char ch, 119 Tag tag, 120 int pos, 121 char[] ignorechar) 122 { 123 state = checkIllegalState(i, state, ch, tag); 124 state = checkFinishedState(encounteredQuery, i, state, ch, tag, pos); 125 state = toggleIgnoringState(state, ch, ignorechar); 126 if (state == TAG_BEFORE_PARSING_STATE && ch != '<') 127 { 128 state = TAG_ILLEGAL_STATE; 129 } 130 if (state == TAG_IGNORE_DATA_STATE && ch == '<') 131 { 132 if (!isWellFormedTag(tag, pos)) 135 state = TAG_IGNORE_BEGIN_TAG_STATE; 136 } 137 if (state == TAG_IGNORE_BEGIN_TAG_STATE && ch == '>') 138 { 139 state = TAG_IGNORE_DATA_STATE; 140 } 141 checkIfAppendable(encounteredQuery, state, ch, tag); 142 state = checkBeginParsingState(i, state, ch, tag); 143 144 return state; 145 } 146 147 private int checkBeginParsingState(int i, int state, char ch, Tag tag) 148 { 149 if (ch == '<' 150 && (state == TAG_BEFORE_PARSING_STATE || state == TAG_ILLEGAL_STATE)) 151 { 152 tag.setTagBegin(i); 154 state = TAG_BEGIN_PARSING_STATE; 155 } 156 return state; 157 } 158 159 private boolean isWellFormedTag(Tag tag, int pos) 160 { 161 String inputLine = tag.getTagLine(); 162 int closeTagPos = inputLine.indexOf('>', pos + 1); 163 int openTagPos = inputLine.indexOf('<', pos + 1); 164 return openTagPos > closeTagPos 165 || (openTagPos == -1 && closeTagPos != -1); 166 } 167 168 private int checkFinishedState( 169 Bool encounteredQuery, 170 int i, 171 int state, 172 char ch, 173 Tag tag, 174 int pos) 175 { 176 if (ch == '>') 177 { 178 if (state == TAG_BEGIN_PARSING_STATE) 179 { 180 state = TAG_FINISHED_PARSING_STATE; 181 tag.setTagEnd(i); 182 } 183 else if (state == TAG_IGNORE_DATA_STATE) 184 { 185 if (encounteredQuery.getBoolean()) 186 { 187 encounteredQuery.setBoolean(false); 188 feedback.info(ENCOUNTERED_QUERY_MESSAGE); 189 return state; 190 } 191 if (isWellFormedTag(tag, pos)) 194 return state; 195 196 state = TAG_FINISHED_PARSING_STATE; 197 tag.setTagEnd(i); 198 correctTag(tag); 202 203 StringBuffer msg = new StringBuffer (); 204 msg.append( 205 "HTMLTagParser : Encountered > inside inverted commas in line \n"); 206 msg.append(tag.getTagLine()); 207 msg.append(", location "); 208 msg.append(i); 209 msg.append("\n"); 210 for (int j = 0; j < i; j++) 211 msg.append(' '); 212 msg.append('^'); 213 msg.append("\nAutomatically corrected."); 214 feedback.warning(msg.toString()); 215 } 216 } 217 else if ( 218 ch == '<' 219 && state == TAG_BEGIN_PARSING_STATE 220 && tag.getText().charAt(0) != '%') 221 { 222 state = TAG_FINISHED_PARSING_STATE; 223 tag.setTagEnd(i - 1); 224 i--; 225 } 226 return state; 227 } 228 229 private void checkIfAppendable( 230 Bool encounteredQuery, 231 int state, 232 char ch, 233 Tag tag) 234 { 235 if (state == TAG_IGNORE_DATA_STATE 236 || state == TAG_BEGIN_PARSING_STATE 237 || state == TAG_IGNORE_BEGIN_TAG_STATE) 238 { 239 if (ch == '?') 240 encounteredQuery.setBoolean(true); 241 tag.append(ch); 242 } 243 } 244 245 private int checkIllegalState(int i, int state, char ch, Tag tag) 246 { 247 if (ch == '/' 248 && i > 0 249 && tag.getTagLine().charAt(i - 1) == '<' 250 && state != TAG_IGNORE_DATA_STATE 251 && state != TAG_IGNORE_BEGIN_TAG_STATE) 252 { 253 state = TAG_ILLEGAL_STATE; 254 } 255 256 return state; 257 } 258 259 public void correctTag(Tag tag) 260 { 261 String tempText = tag.getText(); 262 StringBuffer absorbedText = new StringBuffer (); 263 char c; 264 for (int j = 0; j < tempText.length(); j++) 265 { 266 c = tempText.charAt(j); 267 if (c != '"') 268 absorbedText.append(c); 269 } 270 StringBuffer result = insertInvertedCommasCorrectly(absorbedText); 272 tag.setText(result.toString()); 273 } 274 public StringBuffer insertInvertedCommasCorrectly(StringBuffer absorbedText) 275 { 276 StringBuffer result = new StringBuffer (); 277 StringTokenizer tok = 278 new StringTokenizer (absorbedText.toString(), "=", false); 279 String token; 280 token = (String ) tok.nextToken(); 281 result.append(token + "="); 282 for (; tok.hasMoreTokens();) 283 { 284 token = (String ) tok.nextToken(); 285 token = pruneSpaces(token); 286 result.append('"'); 287 int lastIndex = token.lastIndexOf(' '); 288 if (lastIndex != -1 && tok.hasMoreTokens()) 289 { 290 result.append(token.substring(0, lastIndex)); 291 result.append('"'); 292 result.append(token.substring(lastIndex, token.length())); 293 } 294 else 295 result.append(token + '"'); 296 if (tok.hasMoreTokens()) 297 result.append("="); 298 } 299 return result; 300 } 301 public static String pruneSpaces(String token) 302 { 303 int firstSpace; 304 int lastSpace; 305 firstSpace = token.indexOf(' '); 306 while (firstSpace == 0) 307 { 308 token = token.substring(1, token.length()); 309 firstSpace = token.indexOf(' '); 310 } 311 lastSpace = token.lastIndexOf(' '); 312 while (lastSpace == token.length() - 1) 313 { 314 token = token.substring(0, token.length() - 1); 315 lastSpace = token.lastIndexOf(' '); 316 } 317 return token; 318 } 319 320 327 private int toggleIgnoringState(int state, char ch, char[] ignorechar) 328 { 329 if (state == TAG_IGNORE_DATA_STATE) 330 { 331 if (ch == ignorechar[0]) 332 state = TAG_BEGIN_PARSING_STATE; 333 } 334 else if (state == TAG_BEGIN_PARSING_STATE) 335 if (ch == '"' || ch == '\'') 336 { 337 state = TAG_IGNORE_DATA_STATE; 338 ignorechar[0] = ch; 339 } 340 341 return (state); 342 } 343 344 public int incrementCounter(int i, NodeReader reader, int state, Tag tag) 345 { 346 String nextLine = null; 347 if ((state == TAG_BEGIN_PARSING_STATE 348 || state == TAG_IGNORE_DATA_STATE 349 || state == TAG_IGNORE_BEGIN_TAG_STATE) 350 && i == tag.getTagLine().length() - 1) 351 { 352 int numLinesAdvanced = 0; 356 do 357 { 358 nextLine = reader.getNextLine(); 359 numLinesAdvanced++; 360 } 361 while (nextLine != null && nextLine.length() == 0); 362 if (nextLine == null) 363 { 364 nextLine = ">"; 366 } 367 else 368 { 369 tag.append(Node.getLineSeparator()); 371 } 372 373 while (--numLinesAdvanced > 0) 375 tag.setTagLine(""); 376 377 tag.setTagLine(nextLine); 379 i = -1; 380 } 381 return ++i; 382 } 383 class Bool 385 { 386 private boolean boolValue; 387 388 Bool(boolean boolValue) 389 { 390 this.boolValue = boolValue; 391 } 392 393 public void setBoolean(boolean boolValue) 394 { 395 this.boolValue = boolValue; 396 } 397 398 public boolean getBoolean() 399 { 400 return boolValue; 401 } 402 } 403 } 404 | Popular Tags |