1 23 package org.archive.crawler.frontier; 24 25 import java.io.File ; 26 27 import org.archive.crawler.datamodel.CrawlURI; 28 import org.archive.net.UURI; 29 import org.archive.net.UURIFactory; 30 import org.archive.util.TmpDirTestCase; 31 import org.archive.util.FileUtils; 32 33 import com.sleepycat.bind.serial.StoredClassCatalog; 34 import com.sleepycat.je.DatabaseConfig; 35 import com.sleepycat.je.Environment; 36 import com.sleepycat.je.EnvironmentConfig; 37 38 47 public class AdaptiveRevisitHostQueueTest 48 extends TmpDirTestCase 49 implements AdaptiveRevisitAttributeConstants { 50 public void testHQ() throws Exception { 51 EnvironmentConfig envConfig = new EnvironmentConfig(); 52 envConfig.setTransactional(true); 53 envConfig.setAllowCreate(true); 54 File envDir = new File (getTmpDir(), "AR"); 55 if (envDir.exists()) { 56 FileUtils.deleteDir(envDir); 57 } 58 envDir.mkdirs(); 59 Environment env = new Environment(envDir, envConfig); 60 DatabaseConfig dbConfig = new DatabaseConfig(); 63 dbConfig.setAllowCreate(true); 64 StoredClassCatalog catalog = 65 new StoredClassCatalog(env.openDatabase(null, "classes", dbConfig)); 66 AdaptiveRevisitHostQueue hq = 67 new AdaptiveRevisitHostQueue("bok.hi.is", env, catalog, 1); 68 69 70 CrawlURI[] curis = {null,null,null,null}; 72 73 UURI uuri = UURIFactory.getInstance("http://bok.hi.is/1.html"); 74 curis[0] = new CrawlURI(uuri); 75 curis[0].setVia(null); 76 77 uuri = UURIFactory.getInstance("http://bok.hi.is/2.html"); 78 curis[1] = new CrawlURI(uuri); 79 curis[1].setVia(null); 80 81 uuri = UURIFactory.getInstance("http://bok.hi.is/3.html"); 82 curis[2] = new CrawlURI(uuri); 83 curis[2].setVia(null); 84 85 uuri = UURIFactory.getInstance("http://bok.hi.is/4.html"); 86 curis[3] = new CrawlURI(uuri); 87 curis[3].setVia(null); 88 89 assertTrue("HQ should be empty initially", 90 hq.getState() == AdaptiveRevisitHostQueue.HQSTATE_EMPTY); 91 assertEquals("Incorrect nextReadyTime on Empty", 92 Long.MAX_VALUE,hq.getNextReadyTime()); 93 assertEquals("Initial size of HQ should be 0",0,hq.getSize()); 94 95 assertEquals("Peek should return null when 'ready queue' is empty", 96 null, hq.peek()); 97 98 102 103 curis[0].putLong( 104 A_TIME_OF_NEXT_PROCESSING, 105 System.currentTimeMillis()); curis[1].putLong( 107 A_TIME_OF_NEXT_PROCESSING, 108 System.currentTimeMillis()+5000); curis[2].putLong( 110 A_TIME_OF_NEXT_PROCESSING, 111 System.currentTimeMillis()+20000); 113 hq.add(curis[0],false); 114 assertEquals("First CrawlURI should be top",curis[0].toString(), 115 hq.peek().toString()); 116 assertTrue("HQ should no longer be empty", 117 hq.getState()!=AdaptiveRevisitHostQueue.HQSTATE_EMPTY); 118 assertEquals("Size of HQ should now be 1",1,hq.getSize()); 119 120 126 CrawlURI curi = hq.next(); assertEquals("next() did not return 'top' URI", 128 curis[0].toString(),curi.toString()); 129 assertTrue("HQ should now be busy, is " + hq.getStateByName(), 130 hq.getState()==AdaptiveRevisitHostQueue.HQSTATE_BUSY); 131 try{ 132 hq.next(); 133 assertTrue("next() should throw an IllegalStateException if HQ " + 134 "not ready",false); 135 } catch(IllegalStateException e){ 136 } 138 assertEquals("New top URI should be null", 139 null,hq.peek()); 140 141 hq.add(curis[1],false); 142 assertEquals("Second CrawlURI should be top",curis[1].toString(), 143 hq.peek().toString()); 144 assertEquals("Size of HQ should now be 2",2,hq.getSize()); 145 146 curi.putLong(A_TIME_OF_NEXT_PROCESSING, 148 hq.peek().getLong(A_TIME_OF_NEXT_PROCESSING) 149 +100000); hq.update(curi,false,0); 151 assertEquals("Second CrawlURI should be still be top", 152 curis[1].toString(),hq.peek().toString()); 153 assertEquals("Size of HQ should still be 2",2,hq.getSize()); 154 155 hq.add(curis[2],false); 156 assertEquals("Second CrawlURI should still be top", 157 curis[1].toString(), hq.peek().toString()); 158 assertEquals("Size of HQ should now be 3",3,hq.getSize()); 159 160 166 assertTrue("HQ should be snoozed, is " + hq.getStateByName(), 167 hq.getState()==AdaptiveRevisitHostQueue.HQSTATE_SNOOZED); 168 synchronized(this){ 170 wait(hq.getNextReadyTime()-System.currentTimeMillis()+100); 171 } 172 assertTrue("HQ should now be ready, is " + hq.getStateByName(), 173 hq.getState()==AdaptiveRevisitHostQueue.HQSTATE_READY); 174 175 183 curis[2].putLong( 184 A_TIME_OF_NEXT_PROCESSING, 185 curis[1].getLong(A_TIME_OF_NEXT_PROCESSING) 186 -1000); hq.add(curis[2],true); 188 assertEquals("Size of HQ should still be 3",hq.getSize(),3); 189 assertEquals("Third CrawlURI should be now be top", 190 curis[2].toString(), hq.peek().toString()); 191 curis[2].putLong(A_TIME_OF_NEXT_PROCESSING, 192 curis[1].getLong(A_TIME_OF_NEXT_PROCESSING) 193 +10000); hq.add(curis[2],true); 195 assertEquals("Size of HQ should still be 3",hq.getSize(),3); 196 assertEquals("Third CrawlURI should still top", 197 curis[2].toString(), hq.peek().toString()); 198 199 200 206 curi = hq.next(); assertEquals("next() did not return 'top' URI", 208 curis[2].toString(),curi.toString()); 209 assertTrue("HQ should now be busy, is " + hq.getStateByName(), 210 hq.getState()==AdaptiveRevisitHostQueue.HQSTATE_BUSY); 211 try{ 212 hq.next(); 213 assertTrue("next() should throw an IllegalStateException if HQ " + 214 "not ready",false); 215 } catch(IllegalStateException e){ 216 } 218 assertEquals("New top URI", 219 curis[1].toString(),hq.peek().toString()); 220 221 225 226 curis[3].putLong(A_TIME_OF_NEXT_PROCESSING, 227 curis[1].getLong(A_TIME_OF_NEXT_PROCESSING) 228 - 1); hq.add(curis[3],false); 230 assertEquals("Size of HQ should now be 4",4,hq.getSize()); 231 232 233 244 245 try { 246 hq.update(curis[1],false,0); 247 assertTrue("update() should not accept URI",false); 248 } catch(IllegalStateException e){ 249 } 251 252 long timeOfPolitenessWakeUp = System.currentTimeMillis()+2000; 255 hq.update(curi,true,timeOfPolitenessWakeUp); assertTrue("HQ should be snoozed, is " + hq.getStateByName(), 257 hq.getState()==AdaptiveRevisitHostQueue.HQSTATE_SNOOZED); 258 259 try { 260 hq.update(curis[2],false,0); 261 assertTrue("update() should not accept URI",false); 262 } catch(IllegalStateException e){ 263 } 265 assertEquals("HQs time of next ready should reflect set wait time ", 266 timeOfPolitenessWakeUp, hq.getNextReadyTime()); 267 268 269 273 synchronized(this){ 275 wait(hq.getNextReadyTime()-System.currentTimeMillis()+100); 276 } 277 assertTrue("HQ should now be ready, is " + hq.getStateByName(), 278 hq.getState()==AdaptiveRevisitHostQueue.HQSTATE_READY); 279 assertEquals("HQs time of next ready should still be when it 'woken' " + 280 "up.", timeOfPolitenessWakeUp, hq.getNextReadyTime()); 281 282 291 curi = hq.next(); assertEquals("next() did not return 'top' URI", 293 curis[2].toString(),curi.toString()); 294 assertTrue("HQ should now be busy, is " + hq.getStateByName(), 295 hq.getState()==AdaptiveRevisitHostQueue.HQSTATE_BUSY); 296 hq.close(); 297 298 hq = new AdaptiveRevisitHostQueue("bok.hi.is", env, catalog, 2); 299 300 assertEquals("Size of HQ after reopening should now be 4", 301 4, hq.getSize()); 302 assertTrue("HQ should be ready on reopen, is " + hq.getStateByName(), 303 hq.getState()==AdaptiveRevisitHostQueue.HQSTATE_READY); 304 assertEquals("CrawlURI 'in processing' before should be top", 305 curi.toString(), hq.peek().toString()); 306 307 311 curi = hq.next(); assertEquals("next() did not return 'top' URI", 313 curis[2].toString(),curi.toString()); 314 assertTrue("HQ should still be ready, is " + hq.getStateByName(), 315 hq.getState()==AdaptiveRevisitHostQueue.HQSTATE_READY); 316 317 319 curi = hq.next(); assertEquals("next() did not return 'top' URI", 321 curis[3].toString(),curi.toString()); 322 assertTrue("HQ should be busy, is " + hq.getStateByName(), 323 hq.getState()==AdaptiveRevisitHostQueue.HQSTATE_BUSY); 324 assertEquals("Size of HQ should still be 4", 325 4, hq.getSize()); 326 327 328 331 hq.update(curi,false,0); 332 assertTrue("HQ should now be ready, is " + hq.getStateByName(), 333 hq.getState()==AdaptiveRevisitHostQueue.HQSTATE_READY); 334 assertEquals("'updated' CrawlURI before should be top", 335 curi.toString(), hq.peek().toString()); 336 337 338 342 343 hq.update(curis[2],true,System.currentTimeMillis() + 1000000); curis[3].putLong(A_TIME_OF_NEXT_PROCESSING, 345 curis[1].getLong(A_TIME_OF_NEXT_PROCESSING) 346 + 1000); assertTrue("HQ should still be ready, is " + hq.getStateByName(), 348 hq.getState()==AdaptiveRevisitHostQueue.HQSTATE_READY); 349 assertEquals("Top CrawlURI before should be unchanged", 350 curi.toString(), hq.peek().toString()); 351 352 353 355 358 hq.close(); 359 catalog.close(); 360 env.close(); 361 cleanUpOldFiles("AR"); 362 } 363 364 } 365 | Popular Tags |