package org.archive.util;

import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.PrintStream;
import java.io.Reader;
import java.util.Iterator;
import java.util.Locale;
import java.util.SortedSet;
import java.util.TreeSet;
import java.util.logging.Level;
import java.util.logging.Logger;

import org.apache.commons.httpclient.URIException;
import org.archive.net.UURI;
import org.archive.net.UURIFactory;
import org.archive.util.iterator.LineReadingIterator;
import org.archive.util.iterator.RegexpLineIterator;

/**
 * A sorted set of SURT-form prefix strings that keeps itself minimal:
 * {@link #add(String)} refuses a string that already has a prefix in the
 * set, and evicts any existing members that the new string is a prefix of.
 *
 * <p>Also offers helpers for loading prefixes from text sources, for
 * deriving a prefix from a plain URI, and a small command-line converter
 * ({@link #main(String[])}).
 */
public class SurtPrefixSet extends TreeSet<String> {

    private static final long serialVersionUID = 2598365040524933110L;

    private static final Logger LOGGER =
        Logger.getLogger(SurtPrefixSet.class.getName());

    /** Marker that introduces an explicit prefix line in a mixed source. */
    private static final String SURT_PREFIX_DIRECTIVE = "+";

    /**
     * Tests whether the set holds a prefix of the given string (the string
     * itself counts as its own prefix).
     *
     * @param s candidate string
     * @return true if the greatest member sorting before {@code s} is a
     *         prefix of {@code s}, or if {@code s} itself is a member
     */
    public boolean containsPrefixOf(String s) {
        SortedSet<String> lesser = headSet(s);
        // Only the nearest smaller member is examined: when the set is
        // kept minimal by add(), that is the only possible prefix.
        if (!lesser.isEmpty() && s.startsWith(lesser.last())) {
            return true;
        }
        return contains(s);
    }

    /**
     * Adds a prefix while keeping the set minimal.
     *
     * <p>If the nearest smaller member is already a prefix of {@code s},
     * nothing is added. Otherwise {@code s} is inserted and every member
     * sorting after it that starts with {@code s} is removed.
     *
     * @param s prefix to add
     * @return false if {@code s} was already covered by, or already
     *         present in, the set; true if it was newly inserted
     */
    @Override
    public boolean add(String s) {
        SortedSet<String> lesser = headSet(s);
        if (!lesser.isEmpty() && s.startsWith(lesser.last())) {
            // An existing, shorter prefix already covers s.
            return false;
        }
        boolean added = super.add(s);
        // View of everything strictly greater than s; removals through
        // the view write through to this set.
        SortedSet<String> greater = tailSet(s + "\0");
        while (!greater.isEmpty() && greater.first().startsWith(s)) {
            greater.remove(greater.first());
        }
        return added;
    }

    /**
     * Reads prefixes from the reader, one entry per line, lower-cases each
     * and adds it to the set. Entry extraction (comment handling etc.) is
     * delegated to {@link RegexpLineIterator}.
     *
     * @param r source of prefix lines
     */
    public void importFrom(Reader r) {
        Iterator<?> entries = entryIterator(r);
        while (entries.hasNext()) {
            String entry = (String) entries.next();
            add(lowerCase(entry));
        }
    }

    /**
     * Reads plain URIs from the reader, one entry per line, converts each
     * via {@link #prefixFromPlain(String)} and adds the result.
     *
     * @param r source of URI lines
     */
    public void importFromUris(Reader r) {
        Iterator<?> entries = entryIterator(r);
        while (entries.hasNext()) {
            addFromPlain((String) entries.next());
        }
    }

    /**
     * Reads a mixed source. Entries starting with the {@code "+"} directive
     * are always imported: if the remainder contains {@code '('} past its
     * first character it is lower-cased and added as-is, otherwise it is
     * converted as a plain URI. Entries without the directive are converted
     * as plain URIs only when {@code deduceFromSeeds} is true, and are
     * otherwise ignored.
     *
     * @param r source of lines
     * @param deduceFromSeeds whether undirected entries yield prefixes
     */
    public void importFromMixed(Reader r, boolean deduceFromSeeds) {
        Iterator<?> entries = entryIterator(r);
        while (entries.hasNext()) {
            String entry = (String) entries.next();
            if (entry.startsWith(SURT_PREFIX_DIRECTIVE)) {
                String u =
                    entry.substring(SURT_PREFIX_DIRECTIVE.length()).trim();
                if (u.indexOf("(") > 0) {
                    // Treated as already being in prefix form.
                    add(lowerCase(u));
                } else {
                    addFromPlain(u);
                }
            } else if (deduceFromSeeds) {
                addFromPlain(entry);
            }
        }
    }

    /**
     * Builds the line-entry iterator shared by the import methods.
     *
     * @param r source to read
     * @return iterator whose elements the import methods cast to String
     */
    private static Iterator<?> entryIterator(Reader r) {
        BufferedReader reader = new BufferedReader(r);
        return new RegexpLineIterator(
                new LineReadingIterator(reader),
                RegexpLineIterator.COMMENT_LINE,
                RegexpLineIterator.NONWHITESPACE_ENTRY_TRAILING_COMMENT,
                RegexpLineIterator.ENTRY);
    }

    /**
     * Lower-cases with a fixed locale so results do not vary with the
     * JVM default locale (e.g. Turkish dotless i).
     */
    private static String lowerCase(String s) {
        return s.toLowerCase(Locale.ENGLISH);
    }

    /**
     * Converts a plain URI to a prefix and adds it.
     *
     * @param u plain URI
     */
    private void addFromPlain(String u) {
        add(prefixFromPlain(u));
    }

    /**
     * Derives a prefix from a plain URI string.
     *
     * <p>Steps: {@link ArchiveUtils#addImpliedHttpIfNecessary(String)},
     * rewrite a leading {@code https://} to {@code http://}, pass through
     * {@link UURIFactory#getInstance(String)}, drop a trailing slash if
     * the input did not have one, convert with
     * {@link SURT#fromURI(String)}, then truncate to prefix form.
     *
     * <p>If {@code UURIFactory} rejects the URI the failure is logged and
     * processing continues with the string as it stood before that step.
     *
     * @param u plain URI
     * @return derived prefix
     */
    public static String prefixFromPlain(String u) {
        u = ArchiveUtils.addImpliedHttpIfNecessary(u);
        u = coerceFromHttpsForComparison(u);
        boolean trailingSlash = u.endsWith("/");
        try {
            u = UURIFactory.getInstance(u).toString();
        } catch (URIException e) {
            // Best effort: keep going with the un-normalized string.
            LOGGER.log(Level.WARNING,
                    "Unable to normalize URI, using as-is: " + u, e);
        }
        if (!trailingSlash && u.endsWith("/")) {
            // Undo a slash that the UURIFactory step appended.
            u = u.substring(0, u.length() - 1);
        }
        u = SURT.fromURI(u);
        u = SurtPrefixSet.asPrefix(u);
        return u;
    }

    /**
     * Rewrites a leading {@code https://} to {@code http://}; any other
     * string is returned unchanged.
     *
     * @param u string to examine
     * @return the string with the scheme coerced if it was https
     */
    private static String coerceFromHttpsForComparison(String u) {
        if (u.startsWith("https://")) {
            u = "http" + u.substring("https".length());
        }
        return u;
    }

    /**
     * Truncates a string to prefix form: anything after the last
     * {@code '/'} that follows a {@code "//"} is cut off; if the result
     * does not end in {@code '/'}, its last {@code ')'} is removed.
     *
     * @param s string to truncate
     * @return truncated string
     */
    private static String asPrefix(String s) {
        s = s.replaceAll("^(.*//.*/)[^/]*", "$1");
        if (!s.endsWith("/")) {
            s = s.replaceAll("^(.*)\\)", "$1");
        }
        return s;
    }

    /**
     * Computes the string to test against this kind of set for an object:
     * the {@code getSurtForm()} of {@link UURI#from(Object)}, with a
     * leading {@code https://} rewritten to {@code http://}.
     *
     * @param object object that {@code UURI.from} can interpret
     * @return candidate string, or null if {@code UURI.from} returns null
     */
    public static String getCandidateSurt(Object object) {
        UURI u = UURI.from(object);
        if (u == null) {
            return null;
        }
        return coerceFromHttpsForComparison(u.getSurtForm());
    }

    /**
     * Writes every member, in sorted order, followed by a newline.
     *
     * @param fw destination writer; not closed by this method
     * @throws IOException if writing fails
     */
    public void exportTo(FileWriter fw) throws IOException {
        for (String prefix : this) {
            fw.write(prefix + "\n");
        }
    }

    /**
     * Replaces each member with the result of
     * {@link #convertPrefixToHost(String)} wherever that differs.
     */
    public void convertAllPrefixesToHosts() {
        // Iterate over a snapshot because the set is modified in the loop.
        String[] snapshot = toArray(new String[0]);
        for (String prefix : snapshot) {
            String converted = convertPrefixToHost(prefix);
            if (!prefix.equals(converted)) {
                this.remove(prefix);
                this.add(converted);
            }
        }
    }

    /**
     * Converts a prefix so that it ends with {@code ')'}: a string
     * already ending so is returned as-is; one with no {@code ')'} gets
     * {@code ",)"} (or just {@code ")"} if it already ends in a comma)
     * appended; otherwise it is cut just after its first {@code ')'}.
     *
     * @param prefix prefix to convert
     * @return converted prefix
     */
    public static String convertPrefixToHost(String prefix) {
        if (prefix.endsWith(")")) {
            return prefix;
        }
        int close = prefix.indexOf(')');
        if (close < 0) {
            if (!prefix.endsWith(",")) {
                prefix += ",";
            }
            prefix += ")";
        } else {
            prefix = prefix.substring(0, close + 1);
        }
        return prefix;
    }

    /**
     * Replaces each member with the result of
     * {@link #convertPrefixToDomain(String)} wherever that differs.
     */
    public void convertAllPrefixesToDomains() {
        // Iterate over a snapshot because the set is modified in the loop.
        String[] snapshot = toArray(new String[0]);
        for (String prefix : snapshot) {
            String converted = convertPrefixToDomain(prefix);
            if (!prefix.equals(converted)) {
                this.remove(prefix);
                this.add(converted);
            }
        }
    }

    /**
     * Cuts a prefix just before its first {@code ')'} (if any), then
     * strips a trailing {@code "www,"} (if any).
     *
     * @param prefix prefix to convert
     * @return converted prefix
     */
    public static String convertPrefixToDomain(String prefix) {
        int close = prefix.indexOf(')');
        if (close >= 0) {
            prefix = prefix.substring(0, close);
        }
        if (prefix.endsWith("www,")) {
            prefix = prefix.substring(0, prefix.length() - 4);
        }
        return prefix;
    }

    /**
     * Command-line converter: reads lines, strips {@code #} comments and
     * surrounding whitespace, skips blanks, and prints
     * {@link #prefixFromPlain(String)} of each remaining line.
     *
     * <p>Input is the file named by {@code args[0]} or else stdin; output
     * is the file named by {@code args[1]} or else stdout. Both streams
     * are closed on exit, including when an exception is thrown.
     *
     * @param args optional input path, optional output path
     * @throws IOException if a file cannot be opened or reading fails
     */
    public static void main(String[] args) throws IOException {
        InputStream in = args.length > 0
                ? new BufferedInputStream(new FileInputStream(args[0]))
                : System.in;
        try {
            PrintStream out = args.length > 1
                    ? new PrintStream(new BufferedOutputStream(
                            new FileOutputStream(args[1])))
                    : System.out;
            try {
                BufferedReader br =
                    new BufferedReader(new InputStreamReader(in));
                String line;
                while ((line = br.readLine()) != null) {
                    // >= 0 so that a line beginning with '#' is treated
                    // as a comment too, rather than converted as a URI.
                    int hash = line.indexOf('#');
                    if (hash >= 0) {
                        line = line.substring(0, hash);
                    }
                    line = line.trim();
                    if (line.length() == 0) {
                        continue;
                    }
                    out.println(prefixFromPlain(line));
                }
            } finally {
                out.close();
            }
        } finally {
            in.close();
        }
    }
}