package org.htmlparser.util;

import java.io.UnsupportedEncodingException;
import java.util.ArrayList;

import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.Tag;
import org.htmlparser.filters.NodeClassFilter;
import org.htmlparser.filters.TagNameFilter;
import org.htmlparser.lexer.Lexer;
import org.htmlparser.lexer.Page;
import org.htmlparser.tags.CompositeTag;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;

/**
 * Static helpers for cleaning up strings and for cutting tags out of HTML text.
 *
 * <p>The character oriented methods come in families of three:
 * <ul>
 * <li><code>splitXxx</code> returns the maximal runs of characters that are kept,</li>
 * <li><code>trimXxx</code> returns the kept characters concatenated,</li>
 * <li><code>trimXxxBeginEnd</code> returns the substring between the first and the
 *     last kept character (inclusive).</li>
 * </ul>
 * Which characters are "kept" depends on the family (ButDigits, Spaces, ButChars, Chars).
 */
public class ParserUtils
{
    /** Keep digits and the characters listed by the caller. */
    private static final int KEEP_DIGITS_OR_LISTED = 0;

    /** Keep everything except whitespace, space characters and the listed characters. */
    private static final int KEEP_NON_SPACE_UNLISTED = 1;

    /** Keep only the characters listed by the caller. */
    private static final int KEEP_LISTED = 2;

    /** Keep everything except the characters listed by the caller. */
    private static final int KEEP_UNLISTED = 3;

    /**
     * Remove every occurrence of a character from a string.
     * @param s The string to filter.
     * @param occur The character to remove.
     * @return A copy of <code>s</code> without any <code>occur</code> character.
     */
    public static String removeChars (String s, char occur)
    {
        StringBuffer kept = new StringBuffer ();
        for (int i = 0; i < s.length (); i++)
        {
            char c = s.charAt (i);
            if (c != occur)
            {
                kept.append (c);
            }
        }
        return kept.toString ();
    }

    /**
     * Remove all carriage return, line feed and tab characters from a string.
     * @param inputString The string to filter.
     * @return The input without any <code>\r</code>, <code>\n</code> or <code>\t</code>.
     */
    public static String removeEscapeCharacters (String inputString)
    {
        StringBuffer kept = new StringBuffer ();
        for (int i = 0; i < inputString.length (); i++)
        {
            char c = inputString.charAt (i);
            if ((c != '\r') && (c != '\n') && (c != '\t'))
            {
                kept.append (c);
            }
        }
        return kept.toString ();
    }

    /**
     * Remove trailing blank (<code>' '</code>) characters from a string.
     * Only the space character is stripped, not tabs or line breaks.
     * An empty or all-blank input yields the empty string.
     * @param text The string to trim.
     * @return The text without trailing blanks.
     */
    public static String removeTrailingBlanks (String text)
    {
        // Scan by index so that empty and all-blank inputs never reach charAt(-1).
        int end = text.length ();
        while ((end > 0) && (text.charAt (end - 1) == ' '))
        {
            end--;
        }
        return text.substring (0, end);
    }

    /**
     * Collect the nodes of a given class reachable from a node.
     * The search is delegated to <code>node.collectInto()</code> with a
     * {@link NodeClassFilter} built from <code>type</code>.
     * @param node The node to search from.
     * @param type The node class to look for.
     * @return The collected nodes as an array.
     */
    public static Node[] findTypeInNode (Node node, Class type)
    {
        NodeList found = new NodeList ();
        node.collectInto (found, new NodeClassFilter (type));
        return found.toNodeArray ();
    }

    /**
     * Split the input into runs of digits and explicitly allowed characters.
     * Every other character acts as a separator; empty runs are not returned.
     * @param input The string to split.
     * @param charsDoNotBeRemoved Characters kept in addition to digits.
     * @return The maximal runs of kept characters, in order of appearance.
     */
    public static String[] splitButDigits (String input, String charsDoNotBeRemoved)
    {
        return splitRuns (input, charsDoNotBeRemoved, KEEP_DIGITS_OR_LISTED);
    }

    /**
     * Remove everything except digits and explicitly allowed characters.
     * @param input The string to filter.
     * @param charsDoNotBeRemoved Characters kept in addition to digits.
     * @return The kept characters, concatenated in their original order.
     */
    public static String trimButDigits (String input, String charsDoNotBeRemoved)
    {
        return filterKept (input, charsDoNotBeRemoved, KEEP_DIGITS_OR_LISTED);
    }

    /**
     * Cut the input down to the span between the first and the last character
     * that is a digit or an explicitly allowed character.
     * If no such character exists the input is returned unchanged.
     * @param input The string to trim.
     * @param charsDoNotBeRemoved Characters kept in addition to digits.
     * @return The trimmed substring.
     */
    public static String trimButDigitsBeginEnd (String input, String charsDoNotBeRemoved)
    {
        return trimEnds (input, charsDoNotBeRemoved, KEEP_DIGITS_OR_LISTED);
    }

    /**
     * Split the input at whitespace, space characters and the listed characters.
     * A character separates when <code>Character.isWhitespace()</code> or
     * <code>Character.isSpaceChar()</code> is true for it, or when it is listed.
     * Empty runs are not returned.
     * @param input The string to split.
     * @param charsToBeRemoved Characters treated as separators in addition to spaces.
     * @return The maximal runs of remaining characters, in order of appearance.
     */
    public static String[] splitSpaces (String input, String charsToBeRemoved)
    {
        return splitRuns (input, charsToBeRemoved, KEEP_NON_SPACE_UNLISTED);
    }

    /**
     * Remove all whitespace, space characters and listed characters.
     * @param input The string to filter.
     * @param charsToBeRemoved Characters removed in addition to spaces.
     * @return The remaining characters, concatenated in their original order.
     */
    public static String trimSpaces (String input, String charsToBeRemoved)
    {
        return filterKept (input, charsToBeRemoved, KEEP_NON_SPACE_UNLISTED);
    }

    /**
     * Strip whitespace, space characters and listed characters from both ends.
     * If the input consists only of such characters it is returned unchanged.
     * @param input The string to trim.
     * @param charsToBeRemoved Characters stripped in addition to spaces.
     * @return The trimmed substring.
     */
    public static String trimSpacesBeginEnd (String input, String charsToBeRemoved)
    {
        return trimEnds (input, charsToBeRemoved, KEEP_NON_SPACE_UNLISTED);
    }

    /**
     * Split the input into runs of explicitly allowed characters.
     * Every other character acts as a separator; empty runs are not returned.
     * @param input The string to split.
     * @param charsDoNotBeRemoved The only characters that are kept.
     * @return The maximal runs of kept characters, in order of appearance.
     */
    public static String[] splitButChars (String input, String charsDoNotBeRemoved)
    {
        return splitRuns (input, charsDoNotBeRemoved, KEEP_LISTED);
    }

    /**
     * Remove everything except the explicitly allowed characters.
     * @param input The string to filter.
     * @param charsDoNotBeRemoved The only characters that are kept.
     * @return The kept characters, concatenated in their original order.
     */
    public static String trimButChars (String input, String charsDoNotBeRemoved)
    {
        return filterKept (input, charsDoNotBeRemoved, KEEP_LISTED);
    }

    /**
     * Cut the input down to the span between the first and the last
     * explicitly allowed character.
     * If no such character exists the input is returned unchanged.
     * @param input The string to trim.
     * @param charsDoNotBeRemoved The characters that delimit the kept span.
     * @return The trimmed substring.
     */
    public static String trimButCharsBeginEnd (String input, String charsDoNotBeRemoved)
    {
        return trimEnds (input, charsDoNotBeRemoved, KEEP_LISTED);
    }

    /**
     * Split the input at the listed characters.
     * Empty runs are not returned.
     * @param input The string to split.
     * @param charsToBeRemoved The separator characters.
     * @return The maximal runs of remaining characters, in order of appearance.
     */
    public static String[] splitChars (String input, String charsToBeRemoved)
    {
        return splitRuns (input, charsToBeRemoved, KEEP_UNLISTED);
    }

    /**
     * Remove all listed characters.
     * @param input The string to filter.
     * @param charsToBeRemoved The characters to remove.
     * @return The remaining characters, concatenated in their original order.
     */
    public static String trimChars (String input, String charsToBeRemoved)
    {
        return filterKept (input, charsToBeRemoved, KEEP_UNLISTED);
    }

    /**
     * Strip the listed characters from both ends.
     * If the input consists only of listed characters it is returned unchanged.
     * @param input The string to trim.
     * @param charsToBeRemoved The characters to strip.
     * @return The trimmed substring.
     */
    public static String trimCharsBeginEnd (String input, String charsToBeRemoved)
    {
        return trimEnds (input, charsToBeRemoved, KEEP_UNLISTED);
    }

    /**
     * Split the input around the given tags, removing nested matches and tag content.
     * Same as <code>splitTags (input, tags, true, true)</code>.
     * @param input The HTML text to split.
     * @param tags The tag names to cut out.
     * @return The remaining pieces of text.
     * @exception ParserException Declared by the parsing calls.
     * @exception UnsupportedEncodingException Declared by the parsing calls.
     */
    public static String[] splitTags (String input, String[] tags)
        throws ParserException, UnsupportedEncodingException
    {
        return splitTags (input, tags, true, true);
    }

    /**
     * Split the input around the given tags.
     * The tag names are processed one after the other. After each pass the
     * remaining pieces are concatenated and that text is parsed again for the
     * next tag name, so the returned pieces are those produced by the pass for
     * the last tag name. An empty <code>tags</code> array yields an empty array.
     * @param input The HTML text to split.
     * @param tags The tag names to cut out.
     * @param recursive If <code>false</code>, matches nested strictly inside
     * another match are ignored.
     * @param insideTag If <code>true</code> the whole span from the start tag to
     * the end of its end tag is cut out, otherwise only the start tag and the
     * end tag themselves.
     * @return The remaining pieces of text.
     * @exception ParserException Declared by the parsing calls.
     * @exception UnsupportedEncodingException Declared by the parsing calls.
     */
    public static String[] splitTags (String input, String[] tags, boolean recursive, boolean insideTag)
        throws ParserException, UnsupportedEncodingException
    {
        String current = input;
        String[] pieces = new String[0];
        for (int i = 0; i < tags.length; i++)
        {
            NodeList links = getLinks (current, tags[i], recursive);
            boolean[] mask = maskTags (links, current.length (), insideTag);
            pieces = unmaskedRuns (current, mask);
            // The next tag name is searched in the text that survived this pass.
            current = join (pieces);
        }
        return pieces;
    }

    /**
     * Split the input around the nodes of the given class.
     * Same as <code>splitTags (input, nodeType, true, true)</code>.
     * @param input The HTML text to split.
     * @param nodeType The node class to cut out.
     * @return The remaining pieces of text.
     * @exception ParserException Declared by the parsing calls.
     * @exception UnsupportedEncodingException Declared by the parsing calls.
     */
    public static String[] splitTags (String input, Class nodeType)
        throws ParserException, UnsupportedEncodingException
    {
        return splitTags (input, new NodeClassFilter (nodeType), true, true);
    }

    /**
     * Split the input around the nodes of the given class.
     * @param input The HTML text to split.
     * @param nodeType The node class to cut out.
     * @param recursive If <code>false</code>, matches nested strictly inside
     * another match are ignored.
     * @param insideTag If <code>true</code> the content between start and end
     * tag is cut out too, otherwise only the two tags.
     * @return The remaining pieces of text.
     * @exception ParserException Declared by the parsing calls.
     * @exception UnsupportedEncodingException Declared by the parsing calls.
     */
    public static String[] splitTags (String input, Class nodeType, boolean recursive, boolean insideTag)
        throws ParserException, UnsupportedEncodingException
    {
        return splitTags (input, new NodeClassFilter (nodeType), recursive, insideTag);
    }

    /**
     * Split the input around the nodes accepted by the filter.
     * Same as <code>splitTags (input, filter, true, true)</code>.
     * @param input The HTML text to split.
     * @param filter Selects the nodes to cut out.
     * @return The remaining pieces of text.
     * @exception ParserException Declared by the parsing calls.
     * @exception UnsupportedEncodingException Declared by the parsing calls.
     */
    public static String[] splitTags (String input, NodeFilter filter)
        throws ParserException, UnsupportedEncodingException
    {
        return splitTags (input, filter, true, true);
    }

    /**
     * Split the input around the nodes accepted by the filter.
     * Every accepted node is cast to {@link CompositeTag}.
     * @param input The HTML text to split.
     * @param filter Selects the nodes to cut out.
     * @param recursive If <code>false</code>, matches nested strictly inside
     * another match are ignored.
     * @param insideTag If <code>true</code> the content between start and end
     * tag is cut out too, otherwise only the two tags.
     * @return The maximal runs of text that are not covered by a cut out span.
     * @exception ParserException Declared by the parsing calls.
     * @exception UnsupportedEncodingException Declared by the parsing calls.
     */
    public static String[] splitTags (String input, NodeFilter filter, boolean recursive, boolean insideTag)
        throws ParserException, UnsupportedEncodingException
    {
        NodeList links = getLinks (input, filter, recursive);
        boolean[] mask = maskTags (links, input.length (), insideTag);
        return unmaskedRuns (input, mask);
    }

    /**
     * Remove tags from the input by plain character scanning (no parsing).
     * <p>With <code>inside</code> set, everything from the first <code>'&lt;'</code>
     * up to and including the last <code>'&gt;'</code> is removed; if either
     * character is missing, or the last <code>'&gt;'</code> precedes the first
     * <code>'&lt;'</code>, the input is returned unchanged.
     * <p>Without <code>inside</code>, each span from a <code>'&lt;'</code> to the
     * next <code>'&gt;'</code> is removed; text after an unclosed
     * <code>'&lt;'</code> is dropped.
     * @param input The text to clean.
     * @param inside Whether the text between the outermost tags is removed too.
     * @return The text without the removed spans.
     */
    public static String trimAllTags (String input, boolean inside)
    {
        StringBuffer output = new StringBuffer ();

        if (inside)
        {
            int firstOpen = input.indexOf ('<');
            int lastClose = input.lastIndexOf ('>');
            if ((firstOpen == -1) || (lastClose == -1) || (lastClose < firstOpen))
            {
                output.append (input);
            }
            else
            {
                output.append (input.substring (0, firstOpen));
                output.append (input.substring (lastClose + 1));
            }
        }
        else
        {
            boolean outsideTag = true;
            for (int i = 0; i < input.length (); i++)
            {
                char c = input.charAt (i);
                if (outsideTag)
                {
                    if (c == '<')
                    {
                        outsideTag = false;
                    }
                    else
                    {
                        output.append (c);
                    }
                }
                else if (c == '>')
                {
                    outsideTag = true;
                }
            }
        }

        return output.toString ();
    }

    /**
     * Remove the given tags, nested matches and tag content from the input.
     * Same as <code>trimTags (input, tags, true, true)</code>.
     * @param input The HTML text to clean.
     * @param tags The tag names to cut out.
     * @return The remaining text.
     * @exception ParserException Declared by the parsing calls.
     * @exception UnsupportedEncodingException Declared by the parsing calls.
     */
    public static String trimTags (String input, String[] tags)
        throws ParserException, UnsupportedEncodingException
    {
        return trimTags (input, tags, true, true);
    }

    /**
     * Remove the given tags from the input.
     * The tag names are processed one after the other, each pass parsing the
     * text left over by the previous one. An empty <code>tags</code> array
     * yields the empty string.
     * @param input The HTML text to clean.
     * @param tags The tag names to cut out.
     * @param recursive If <code>false</code>, matches nested strictly inside
     * another match are ignored.
     * @param insideTag If <code>true</code> the content between start and end
     * tag is removed too, otherwise only the two tags.
     * @return The remaining text.
     * @exception ParserException Declared by the parsing calls.
     * @exception UnsupportedEncodingException Declared by the parsing calls.
     */
    public static String trimTags (String input, String[] tags, boolean recursive, boolean insideTag)
        throws ParserException, UnsupportedEncodingException
    {
        String current = input;
        String result = "";
        for (int i = 0; i < tags.length; i++)
        {
            NodeList links = getLinks (current, tags[i], recursive);
            boolean[] mask = maskTags (links, current.length (), insideTag);
            result = join (unmaskedRuns (current, mask));
            current = result;
        }
        return result;
    }

    /**
     * Remove the nodes of the given class from the input.
     * Same as <code>trimTags (input, nodeType, true, true)</code>.
     * @param input The HTML text to clean.
     * @param nodeType The node class to cut out.
     * @return The remaining text.
     * @exception ParserException Declared by the parsing calls.
     * @exception UnsupportedEncodingException Declared by the parsing calls.
     */
    public static String trimTags (String input, Class nodeType)
        throws ParserException, UnsupportedEncodingException
    {
        return trimTags (input, new NodeClassFilter (nodeType), true, true);
    }

    /**
     * Remove the nodes of the given class from the input.
     * @param input The HTML text to clean.
     * @param nodeType The node class to cut out.
     * @param recursive If <code>false</code>, matches nested strictly inside
     * another match are ignored.
     * @param insideTag If <code>true</code> the content between start and end
     * tag is removed too, otherwise only the two tags.
     * @return The remaining text.
     * @exception ParserException Declared by the parsing calls.
     * @exception UnsupportedEncodingException Declared by the parsing calls.
     */
    public static String trimTags (String input, Class nodeType, boolean recursive, boolean insideTag)
        throws ParserException, UnsupportedEncodingException
    {
        return trimTags (input, new NodeClassFilter (nodeType), recursive, insideTag);
    }

    /**
     * Remove the nodes accepted by the filter from the input.
     * Same as <code>trimTags (input, filter, true, true)</code>.
     * @param input The HTML text to clean.
     * @param filter Selects the nodes to cut out.
     * @return The remaining text.
     * @exception ParserException Declared by the parsing calls.
     * @exception UnsupportedEncodingException Declared by the parsing calls.
     */
    public static String trimTags (String input, NodeFilter filter)
        throws ParserException, UnsupportedEncodingException
    {
        return trimTags (input, filter, true, true);
    }

    /**
     * Remove the nodes accepted by the filter from the input.
     * Every accepted node is cast to {@link CompositeTag}.
     * @param input The HTML text to clean.
     * @param filter Selects the nodes to cut out.
     * @param recursive If <code>false</code>, matches nested strictly inside
     * another match are ignored.
     * @param insideTag If <code>true</code> the content between start and end
     * tag is removed too, otherwise only the two tags.
     * @return The text that is not covered by a cut out span.
     * @exception ParserException Declared by the parsing calls.
     * @exception UnsupportedEncodingException Declared by the parsing calls.
     */
    public static String trimTags (String input, NodeFilter filter, boolean recursive, boolean insideTag)
        throws ParserException, UnsupportedEncodingException
    {
        NodeList links = getLinks (input, filter, recursive);
        boolean[] mask = maskTags (links, input.length (), insideTag);
        return join (unmaskedRuns (input, mask));
    }

    /**
     * Create a parser whose lexer reads from the given string.
     * @param input The text to parse.
     * @return A parser set up with a new {@link Lexer} on a new {@link Page}
     * wrapping <code>input</code>.
     * @exception ParserException Declared for callers; see the parser API.
     * @exception UnsupportedEncodingException Declared for callers; see the parser API.
     */
    public static Parser createParserParsingAnInputString (String input)
        throws ParserException, UnsupportedEncodingException
    {
        Lexer lexer = new Lexer ();
        lexer.setPage (new Page (input));
        Parser parser = new Parser ();
        parser.setLexer (lexer);
        return parser;
    }

    /**
     * Find the tags with the given name in a piece of HTML text.
     * @param output The text to parse.
     * @param tag The tag name to look for.
     * @param recursive If <code>false</code>, nested matches are dropped.
     * @return The matching nodes.
     */
    private static NodeList getLinks (String output, String tag, boolean recursive)
        throws ParserException, UnsupportedEncodingException
    {
        return getLinks (output, new TagNameFilter (tag), recursive);
    }

    /**
     * Find the nodes accepted by a filter in a piece of HTML text.
     * @param output The text to parse.
     * @param filter Selects the nodes to return.
     * @param recursive If <code>false</code>, every match that lies strictly
     * inside another match is removed from the result.
     * @return The matching nodes.
     */
    private static NodeList getLinks (String output, NodeFilter filter, boolean recursive)
        throws ParserException, UnsupportedEncodingException
    {
        Parser parser = createParserParsingAnInputString (output);
        NodeList links = parser.extractAllNodesThatMatch (filter);

        if (!recursive)
        {
            for (int j = 0; j < links.size (); j++)
            {
                CompositeTag outer = (CompositeTag)links.elementAt (j);
                int outerStart = outer.getStartPosition ();
                int outerEnd = spanEnd (outer);
                for (int k = 0; k < links.size (); k++)
                {
                    if (k == j)
                    {
                        continue;
                    }
                    CompositeTag inner = (CompositeTag)links.elementAt (k);
                    if ((inner.getStartPosition () > outerStart) && (spanEnd (inner) < outerEnd))
                    {
                        links.remove (k);
                        // Only a removal in front of the outer tag shifts its index.
                        // Decrementing j unconditionally could drive it below zero.
                        if (k < j)
                        {
                            j--;
                        }
                        k--;
                    }
                }
            }
        }

        return links;
    }

    /**
     * Position where the span of a composite tag ends: the end of its end tag,
     * or, defensively, the end of the tag itself if no end tag is reported.
     */
    private static int spanEnd (CompositeTag tag)
    {
        Tag endTag = tag.getEndTag ();
        return (endTag != null) ? endTag.getEndPosition () : tag.getEndPosition ();
    }

    /**
     * Build a mask of the character positions covered by the given tags.
     * @param links The matched nodes; each is cast to {@link CompositeTag}.
     * @param length The length of the text the positions refer to.
     * @param insideTag If <code>true</code> the whole span of each tag is
     * masked, otherwise only its start tag and its end tag.
     * @return An array with <code>true</code> at every position to cut out.
     */
    private static boolean[] maskTags (NodeList links, int length, boolean insideTag)
    {
        boolean[] mask = new boolean[length];
        for (int j = 0; j < links.size (); j++)
        {
            CompositeTag startTag = (CompositeTag)links.elementAt (j);
            Tag endTag = startTag.getEndTag ();
            if (endTag == null)
            {
                // Defensive: nothing but the tag itself can be located.
                mark (mask, startTag.getStartPosition (), startTag.getEndPosition ());
            }
            else if (insideTag)
            {
                mark (mask, startTag.getStartPosition (), endTag.getEndPosition ());
            }
            else
            {
                mark (mask, startTag.getStartPosition (), startTag.getEndPosition ());
                mark (mask, endTag.getStartPosition (), endTag.getEndPosition ());
            }
        }
        return mask;
    }

    /** Set mask positions <code>from</code> (inclusive) to <code>to</code> (exclusive), clamped to the mask. */
    private static void mark (boolean[] mask, int from, int to)
    {
        int stop = Math.min (to, mask.length);
        for (int i = Math.max (from, 0); i < stop; i++)
        {
            mask[i] = true;
        }
    }

    /** Return the maximal runs of <code>text</code> whose positions are not masked. */
    private static String[] unmaskedRuns (String text, boolean[] mask)
    {
        ArrayList runs = new ArrayList ();
        int i = 0;
        while (i < text.length ())
        {
            if (mask[i])
            {
                i++;
                continue;
            }
            int start = i;
            while ((i < text.length ()) && !mask[i])
            {
                i++;
            }
            runs.add (text.substring (start, i));
        }
        return toStringArray (runs);
    }

    /** Concatenate the given strings in order. */
    private static String join (String[] parts)
    {
        StringBuffer joined = new StringBuffer ();
        for (int i = 0; i < parts.length; i++)
        {
            joined.append (parts[i]);
        }
        return joined.toString ();
    }

    /** Copy a list that contains only strings into a string array. */
    private static String[] toStringArray (ArrayList strings)
    {
        return (String[])strings.toArray (new String[strings.size ()]);
    }

    /**
     * Decide whether a character is kept by one of the split/trim families.
     * @param c The character to test.
     * @param chars The characters listed by the caller.
     * @param mode One of the <code>KEEP_*</code> constants.
     */
    private static boolean isKept (char c, String chars, int mode)
    {
        boolean listed = (chars.indexOf (c) != -1);
        switch (mode)
        {
            case KEEP_DIGITS_OR_LISTED:
                return Character.isDigit (c) || listed;
            case KEEP_NON_SPACE_UNLISTED:
                return !(Character.isWhitespace (c) || Character.isSpaceChar (c) || listed);
            case KEEP_LISTED:
                return listed;
            default:
                return !listed;
        }
    }

    /** Return the maximal runs of kept characters of <code>input</code>. */
    private static String[] splitRuns (String input, String chars, int mode)
    {
        ArrayList runs = new ArrayList ();
        StringBuffer run = new StringBuffer ();
        for (int i = 0; i < input.length (); i++)
        {
            char c = input.charAt (i);
            if (isKept (c, chars, mode))
            {
                run.append (c);
            }
            else if (run.length () != 0)
            {
                // A separator closes the current run.
                runs.add (run.toString ());
                run = new StringBuffer ();
            }
        }
        if (run.length () != 0)
        {
            runs.add (run.toString ());
        }
        return toStringArray (runs);
    }

    /** Return the kept characters of <code>input</code>, concatenated. */
    private static String filterKept (String input, String chars, int mode)
    {
        StringBuffer kept = new StringBuffer ();
        for (int i = 0; i < input.length (); i++)
        {
            char c = input.charAt (i);
            if (isKept (c, chars, mode))
            {
                kept.append (c);
            }
        }
        return kept.toString ();
    }

    /**
     * Return the substring from the first to the last kept character.
     * When no character is kept the bounds stay at the ends of the input,
     * so the input is returned unchanged.
     */
    private static String trimEnds (String input, String chars, int mode)
    {
        int begin = 0;
        int end = input.length () - 1;
        for (int i = 0; i < input.length (); i++)
        {
            if (isKept (input.charAt (i), chars, mode))
            {
                begin = i;
                break;
            }
        }
        for (int i = input.length () - 1; i >= 0; i--)
        {
            if (isKept (input.charAt (i), chars, mode))
            {
                end = i;
                break;
            }
        }
        return input.substring (begin, end + 1);
    }
}