| 1 27 package org.htmlparser.util; 28 29 import java.io.UnsupportedEncodingException ; 30 import java.util.ArrayList ; 31 32 import org.htmlparser.Node; 33 import org.htmlparser.NodeFilter; 34 import org.htmlparser.Parser; 35 import org.htmlparser.Tag; 36 import org.htmlparser.filters.NodeClassFilter; 37 import org.htmlparser.filters.TagNameFilter; 38 import org.htmlparser.lexer.Lexer; 39 import org.htmlparser.lexer.Page; 40 import org.htmlparser.tags.CompositeTag; 41 import org.htmlparser.util.NodeList; 42 import org.htmlparser.util.ParserException; 43 44 45 public class ParserUtils 46 { 47 public static String removeChars(String s, char occur) { 48 StringBuffer newString = new StringBuffer (); 49 char ch; 50 for (int i = 0; i < s.length(); i++) { 51 ch = s.charAt(i); 52 if (ch != occur) 53 newString.append(ch); 54 } 55 return newString.toString(); 56 } 57 58 public static String removeEscapeCharacters(String inputString) { 59 inputString = ParserUtils.removeChars(inputString, '\r'); 60 inputString = ParserUtils.removeChars(inputString, '\n'); 61 inputString = ParserUtils.removeChars(inputString, '\t'); 62 return inputString; 63 } 64 65 public static String removeTrailingBlanks(String text) { 66 char ch = ' '; 67 while (ch == ' ') { 68 ch = text.charAt(text.length() - 1); 69 if (ch == ' ') 70 text = text.substring(0, text.length() - 1); 71 } 72 return text; 73 } 74 75 81 public static Node[] findTypeInNode(Node node, Class type) 82 { 83 NodeFilter filter; 84 NodeList ret; 85 86 ret = new NodeList (); 87 filter = new NodeClassFilter (type); 88 node.collectInto (ret, filter); 89 90 return (ret.toNodeArray ()); 91 } 92 93 103 public static String [] splitButDigits (String input, String charsDoNotBeRemoved) 104 { 105 106 ArrayList output = new ArrayList (); 107 int minCapacity = 0; 108 StringBuffer str = new StringBuffer (); 109 110 boolean charFound = false; 111 boolean toBeAdd = false; 112 for (int index=0; index<input.length(); index++) 113 { 114 charFound=false; 115 for (int charsCount=0; charsCount<charsDoNotBeRemoved.length(); charsCount++) 116 if (charsDoNotBeRemoved.charAt(charsCount)==input.charAt(index)) 117 charFound=true; 118 if ((Character.isDigit(input.charAt(index))) || (charFound)) 119 { 120 str.append(input.charAt(index)); 121 toBeAdd=false; 122 } 123 else 124 if (!toBeAdd) 125 toBeAdd=true; 126 if (toBeAdd && (str.length()!=0)) { 128 minCapacity++; 129 output.ensureCapacity(minCapacity); 130 if (output.add(str.toString())) 131 str = new StringBuffer (); 132 else 133 minCapacity--; 134 } 135 } 136 if (str.length()!=0) { 138 minCapacity++; 139 output.ensureCapacity(minCapacity); 140 if (output.add(str.toString())) 141 str = new StringBuffer (); 142 else 143 minCapacity--; 144 } 145 146 output.trimToSize(); 147 Object [] outputObj = output.toArray(); 148 String [] outputStr = new String [output.size()]; 149 for (int i=0; i<output.size(); i++) 150 outputStr[i] = new String ((String ) outputObj[i]); 151 return outputStr; 152 153 } 154 155 166 public static String trimButDigits (String input, String charsDoNotBeRemoved) 167 { 168 169 StringBuffer output = new StringBuffer (); 170 171 boolean charFound=false; 172 for (int index=0; index<input.length(); index++) 173 { 174 charFound=false; 175 for (int charsCount=0; charsCount<charsDoNotBeRemoved.length(); charsCount++) 176 if (charsDoNotBeRemoved.charAt(charsCount)==input.charAt(index)) 177 charFound=true; 178 if ((Character.isDigit(input.charAt(index))) || (charFound)) 179 output.append(input.charAt(index)); 180 } 181 182 return output.toString(); 183 184 } 185 186 198 public static String trimButDigitsBeginEnd (String input, String charsDoNotBeRemoved) 199 { 200 201 String output = new String (); 202 203 int begin=0; 204 int end=input.length()-1; 205 boolean charFound=false; 206 boolean ok=true; 207 for (int index=begin; (index<input.length()) && ok; index++) 208 { 209 charFound=false; 210 for (int charsCount=0; charsCount<charsDoNotBeRemoved.length(); charsCount++) 211 if (charsDoNotBeRemoved.charAt(charsCount)==input.charAt(index)) 212 charFound=true; 213 if ( (Character.isDigit(input.charAt(index))) || (charFound) ) 214 { 215 begin=index; 216 ok=false; 217 } 218 } 219 ok=true; 220 for (int index=end; (index>=0) && ok; index--) 221 { 222 charFound=false; 223 for (int charsCount=0; charsCount<charsDoNotBeRemoved.length(); charsCount++) 224 if (charsDoNotBeRemoved.charAt(charsCount)==input.charAt(index)) 225 charFound=true; 226 if ( (Character.isDigit(input.charAt(index))) || (charFound) ) 227 { 228 end=index; 229 ok=false; 230 } 231 } 232 output=input.substring(begin,end+1); 233 234 return output; 235 236 } 237 238 248 public static String [] splitSpaces (String input, String charsToBeRemoved) 249 { 250 251 ArrayList output = new ArrayList (); 252 int minCapacity = 0; 253 StringBuffer str = new StringBuffer (); 254 255 boolean charFound = false; 256 boolean toBeAdd = false; 257 for (int index=0; index<input.length(); index++) 258 { 259 charFound=false; 260 for (int charsCount=0; charsCount<charsToBeRemoved.length(); charsCount++) 261 if (charsToBeRemoved.charAt(charsCount)==input.charAt(index)) 262 charFound=true; 263 if (!((Character.isWhitespace(input.charAt(index))) || (Character.isSpaceChar(input.charAt(index))) || (charFound))) 264 { 265 str.append(input.charAt(index)); 266 toBeAdd=false; 267 } 268 else 269 if (!toBeAdd) 270 toBeAdd=true; 271 if (toBeAdd && (str.length()!=0)) { 273 minCapacity++; 274 output.ensureCapacity(minCapacity); 275 if (output.add(str.toString())) 276 str = new StringBuffer (); 277 else 278 minCapacity--; 279 } 280 } 281 if (str.length()!=0) { 283 minCapacity++; 284 output.ensureCapacity(minCapacity); 285 if (output.add(str.toString())) 286 str = new StringBuffer (); 287 else 288 minCapacity--; 289 } 290 291 output.trimToSize(); 292 Object [] outputObj = output.toArray(); 293 String [] outputStr = new String [output.size()]; 294 for (int i=0; i<output.size(); i++) 295 outputStr[i] = new String ((String ) outputObj[i]); 296 return outputStr; 297 298 } 299 300 311 public static String trimSpaces (String input, String charsToBeRemoved) 312 { 313 314 StringBuffer output = new StringBuffer (); 315 316 boolean charFound=false; 317 for (int index=0; index<input.length(); index++) 318 { 319 charFound=false; 320 for (int charsCount=0; charsCount<charsToBeRemoved.length(); charsCount++) 321 if (charsToBeRemoved.charAt(charsCount)==input.charAt(index)) 322 charFound=true; 323 if (!((Character.isWhitespace(input.charAt(index))) || (Character.isSpaceChar(input.charAt(index))) || (charFound))) 324 output.append(input.charAt(index)); 325 } 326 327 return output.toString(); 328 329 } 330 331 343 public static String trimSpacesBeginEnd (String input, String charsToBeRemoved) 344 { 345 346 String output = new String (); 347 348 int begin=0; 349 int end=input.length()-1; 350 boolean charFound=false; 351 boolean ok=true; 352 for (int index=begin; (index<input.length()) && ok; index++) 353 { 354 charFound=false; 355 for (int charsCount=0; charsCount<charsToBeRemoved.length(); charsCount++) 356 if (charsToBeRemoved.charAt(charsCount)==input.charAt(index)) 357 charFound=true; 358 if (!( (Character.isWhitespace(input.charAt(index))) || (Character.isSpaceChar(input.charAt(index))) || (charFound) )) 359 { 360 begin=index; 361 ok=false; 362 } 363 } 364 ok=true; 365 for (int index=end; (index>=0) && ok; index--) 366 { 367 charFound=false; 368 for (int charsCount=0; charsCount<charsToBeRemoved.length(); charsCount++) 369 if (charsToBeRemoved.charAt(charsCount)==input.charAt(index)) 370 charFound=true; 371 if (!( (Character.isWhitespace(input.charAt(index))) || (Character.isSpaceChar(input.charAt(index))) || (charFound) )) 372 { 373 end=index; 374 ok=false; 375 } 376 } 377 output=input.substring(begin,end+1); 378 379 return output; 380 381 } 382 383 393 public static String [] splitButChars (String input, String charsDoNotBeRemoved) 394 { 395 396 ArrayList output = new ArrayList (); 397 int minCapacity = 0; 398 StringBuffer str = new StringBuffer (); 399 400 boolean charFound = false; 401 boolean toBeAdd = false; 402 for (int index=0; index<input.length(); index++) 403 { 404 charFound=false; 405 for (int charsCount=0; charsCount<charsDoNotBeRemoved.length(); charsCount++) 406 if (charsDoNotBeRemoved.charAt(charsCount)==input.charAt(index)) 407 charFound=true; 408 if (charFound) 409 { 410 str.append(input.charAt(index)); 411 toBeAdd=false; 412 } 413 else 414 if (!toBeAdd) 415 toBeAdd=true; 416 if (toBeAdd && (str.length()!=0)) { 418 minCapacity++; 419 output.ensureCapacity(minCapacity); 420 if (output.add(str.toString())) 421 str = new StringBuffer (); 422 else 423 minCapacity--; 424 } 425 } 426 if (str.length()!=0) { 428 minCapacity++; 429 output.ensureCapacity(minCapacity); 430 if (output.add(str.toString())) 431 str = new StringBuffer (); 432 else 433 minCapacity--; 434 } 435 436 output.trimToSize(); 437 Object [] outputObj = output.toArray(); 438 String [] outputStr = new String [output.size()]; 439 for (int i=0; i<output.size(); i++) 440 outputStr[i] = new String ((String ) outputObj[i]); 441 return outputStr; 442 443 } 444 445 456 public static String trimButChars (String input, String charsDoNotBeRemoved) 457 { 458 459 StringBuffer output = new StringBuffer (); 460 461 boolean charFound=false; 462 for (int index=0; index<input.length(); index++) 463 { 464 charFound=false; 465 for (int charsCount=0; charsCount<charsDoNotBeRemoved.length(); charsCount++) 466 if (charsDoNotBeRemoved.charAt(charsCount)==input.charAt(index)) 467 charFound=true; 468 if (charFound) 469 output.append(input.charAt(index)); 470 } 471 472 return output.toString(); 473 474 } 475 476 488 public static String trimButCharsBeginEnd (String input, String charsDoNotBeRemoved) 489 { 490 491 String output = new String (); 492 493 int begin=0; 494 int end=input.length()-1; 495 boolean charFound=false; 496 boolean ok=true; 497 for (int index=begin; (index<input.length()) && ok; index++) 498 { 499 charFound=false; 500 for (int charsCount=0; charsCount<charsDoNotBeRemoved.length(); charsCount++) 501 if (charsDoNotBeRemoved.charAt(charsCount)==input.charAt(index)) 502 charFound=true; 503 if (charFound) 504 { 505 begin=index; 506 ok=false; 507 } 508 } 509 ok=true; 510 for (int index=end; (index>=0) && ok; index--) 511 { 512 charFound=false; 513 for (int charsCount=0; charsCount<charsDoNotBeRemoved.length(); charsCount++) 514 if (charsDoNotBeRemoved.charAt(charsCount)==input.charAt(index)) 515 charFound=true; 516 if (charFound) 517 { 518 end=index; 519 ok=false; 520 } 521 } 522 output=input.substring(begin,end+1); 523 524 return output; 525 526 } 527 528 537 public static String [] splitChars (String input, String charsToBeRemoved) 538 { 539 540 ArrayList output = new ArrayList (); 541 int minCapacity = 0; 542 StringBuffer str = new StringBuffer (); 543 544 boolean charFound = false; 545 boolean toBeAdd = false; 546 for (int index=0; index<input.length(); index++) 547 { 548 charFound=false; 549 for (int charsCount=0; charsCount<charsToBeRemoved.length(); charsCount++) 550 if (charsToBeRemoved.charAt(charsCount)==input.charAt(index)) 551 charFound=true; 552 if (!(charFound)) 553 { 554 str.append(input.charAt(index)); 555 toBeAdd=false; 556 } 557 else 558 if (!toBeAdd) 559 toBeAdd=true; 560 if (toBeAdd && (str.length()!=0)) { 562 minCapacity++; 563 output.ensureCapacity(minCapacity); 564 if (output.add(str.toString())) 565 str = new StringBuffer (); 566 else 567 minCapacity--; 568 } 569 } 570 if (str.length()!=0) { 572 minCapacity++; 573 output.ensureCapacity(minCapacity); 574 if (output.add(str.toString())) 575 str = new StringBuffer (); 576 else 577 minCapacity--; 578 } 579 580 output.trimToSize(); 581 Object [] outputObj = output.toArray(); 582 String [] outputStr = new String [output.size()]; 583 for (int i=0; i<output.size(); i++) 584 outputStr[i] = new String ((String ) outputObj[i]); 585 return outputStr; 586 587 } 588 589 599 public static String trimChars (String input, String charsToBeRemoved) 600 { 601 602 StringBuffer output = new StringBuffer (); 603 604 boolean charFound=false; 605 for (int index=0; index<input.length(); index++) 606 { 607 charFound=false; 608 for (int charsCount=0; charsCount<charsToBeRemoved.length(); charsCount++) 609 if (charsToBeRemoved.charAt(charsCount)==input.charAt(index)) 610 charFound=true; 611 if (!(charFound)) 612 output.append(input.charAt(index)); 613 } 614 615 return output.toString(); 616 617 } 618 619 630 public static String trimCharsBeginEnd (String input, String charsToBeRemoved) 631 { 632 633 String output = new String (); 634 635 int begin=0; 636 int end=input.length()-1; 637 boolean charFound=false; 638 boolean ok=true; 639 for (int index=begin; (index<input.length()) && ok; index++) 640 { 641 charFound=false; 642 for (int charsCount=0; charsCount<charsToBeRemoved.length(); charsCount++) 643 if (charsToBeRemoved.charAt(charsCount)==input.charAt(index)) 644 charFound=true; 645 if (!(charFound)) 646 { 647 begin=index; 648 ok=false; 649 } 650 } 651 ok=true; 652 for (int index=end; (index>=0) && ok; index--) 653 { 654 charFound=false; 655 for (int charsCount=0; charsCount<charsToBeRemoved.length(); charsCount++) 656 if (charsToBeRemoved.charAt(charsCount)==input.charAt(index)) 657 charFound=true; 658 if (!(charFound)) 659 { 660 end=index; 661 ok=false; 662 } 663 } 664 output=input.substring(begin,end+1); 665 666 return output; 667 668 } 669 670 675 public static String [] splitTags (String input, String [] tags) 676 throws ParserException, UnsupportedEncodingException  677 { 678 return splitTags (input, tags, true, true); 679 } 680 681 698 public static String [] splitTags (String input, String [] tags, boolean recursive, boolean insideTag) 699 throws ParserException, UnsupportedEncodingException  700 { 701 702 ArrayList outputArrayList = new ArrayList (); 703 int minCapacity = 0; 704 String output = new String (); 705 String inputModified = new String (input); 706 String [] outputStr = new String [] {}; 707 708 String dummyString = createDummyString (' ', input.length()); 709 710 for (int i=0; i<tags.length; i++) 712 { 713 714 NodeList links = getLinks (inputModified, tags[i], recursive); 716 for (int j=0; j<links.size(); j++) 717 { 718 CompositeTag beginTag = (CompositeTag)links.elementAt(j); 719 Tag endTag = beginTag.getEndTag(); 720 721 int beginTagBegin = beginTag.getStartPosition (); 723 int endTagBegin = beginTag.getEndPosition (); 724 int beginTagEnd = endTag.getStartPosition (); 725 int endTagEnd = endTag.getEndPosition (); 726 727 if (insideTag) 728 { 729 dummyString = modifyDummyString (new |