KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > archive > crawler > frontier > AdaptiveRevisitHostQueueTest


1 /* ARHostQueueTest.java
2 *
3 * Created on Sep 13, 2004
4 *
5 * Copyright (C) 2004 Kristinn Sigurdsson.
6 *
7 * This file is part of the Heritrix web crawler (crawler.archive.org).
8 *
9 * Heritrix is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU Lesser Public License as published by
11 * the Free Software Foundation; either version 2.1 of the License, or
12 * any later version.
13 *
14 * Heritrix is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU Lesser Public License for more details.
18 *
19 * You should have received a copy of the GNU Lesser Public License
20 * along with Heritrix; if not, write to the Free Software
21 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
22 */

23 package org.archive.crawler.frontier;
24
25 import java.io.File JavaDoc;
26
27 import org.archive.crawler.datamodel.CrawlURI;
28 import org.archive.net.UURI;
29 import org.archive.net.UURIFactory;
30 import org.archive.util.TmpDirTestCase;
31 import org.archive.util.FileUtils;
32
33 import com.sleepycat.bind.serial.StoredClassCatalog;
34 import com.sleepycat.je.DatabaseConfig;
35 import com.sleepycat.je.Environment;
36 import com.sleepycat.je.EnvironmentConfig;
37
38 /**
39  * A JUnit test for {@link AdaptiveRevisitHostQueue AdaptiveRevisitHostQueue}
40  * class.
41  * <p>
42  * Since the ARHostQueue maintains significant state information there is only
43  * one Unit test described here that tests various different transitions.
44  *
45  * @author Kristinn Sigurdsson
46  */

47 public class AdaptiveRevisitHostQueueTest
48 extends TmpDirTestCase
49 implements AdaptiveRevisitAttributeConstants {
50     public void testHQ() throws Exception JavaDoc {
51         EnvironmentConfig envConfig = new EnvironmentConfig();
52         envConfig.setTransactional(true);
53         envConfig.setAllowCreate(true);
54         File JavaDoc envDir = new File JavaDoc(getTmpDir(), "AR");
55         if (envDir.exists()) {
56             FileUtils.deleteDir(envDir);
57         }
58         envDir.mkdirs();
59         Environment env = new Environment(envDir, envConfig);
60         // Open the class catalog database. Create it if it does not
61
// already exist.
62
DatabaseConfig dbConfig = new DatabaseConfig();
63         dbConfig.setAllowCreate(true);
64         StoredClassCatalog catalog =
65             new StoredClassCatalog(env.openDatabase(null, "classes", dbConfig));
66         AdaptiveRevisitHostQueue hq =
67             new AdaptiveRevisitHostQueue("bok.hi.is", env, catalog, 1);
68
69
70         // Make the CrawlUris
71
CrawlURI[] curis = {null,null,null,null};
72
73         UURI uuri = UURIFactory.getInstance("http://bok.hi.is/1.html");
74         curis[0] = new CrawlURI(uuri);
75         curis[0].setVia(null);
76         
77         uuri = UURIFactory.getInstance("http://bok.hi.is/2.html");
78         curis[1] = new CrawlURI(uuri);
79         curis[1].setVia(null);
80
81         uuri = UURIFactory.getInstance("http://bok.hi.is/3.html");
82         curis[2] = new CrawlURI(uuri);
83         curis[2].setVia(null);
84
85         uuri = UURIFactory.getInstance("http://bok.hi.is/4.html");
86         curis[3] = new CrawlURI(uuri);
87         curis[3].setVia(null);
88
89         assertTrue("HQ should be empty initially",
90                 hq.getState() == AdaptiveRevisitHostQueue.HQSTATE_EMPTY);
91         assertEquals("Incorrect nextReadyTime on Empty",
92                 Long.MAX_VALUE,hq.getNextReadyTime());
93         assertEquals("Initial size of HQ should be 0",0,hq.getSize());
94         
95         assertEquals("Peek should return null when 'ready queue' is empty",
96                 null, hq.peek());
97     
98         /*
99          * Add three CrawlURIs and ensures that the correct one is reported by
100          * peek(); All are added later then current time!
101          */

102
103         curis[0].putLong(
104                 A_TIME_OF_NEXT_PROCESSING,
105                 System.currentTimeMillis()); // now
106
curis[1].putLong(
107                 A_TIME_OF_NEXT_PROCESSING,
108                 System.currentTimeMillis()+5000); // in 5 sec
109
curis[2].putLong(
110                 A_TIME_OF_NEXT_PROCESSING,
111                 System.currentTimeMillis()+20000); // in 20 sec.
112

113         hq.add(curis[0],false);
114         assertEquals("First CrawlURI should be top",curis[0].toString(),
115                 hq.peek().toString());
116         assertTrue("HQ should no longer be empty",
117                 hq.getState()!=AdaptiveRevisitHostQueue.HQSTATE_EMPTY);
118         assertEquals("Size of HQ should now be 1",1,hq.getSize());
119         
120         /*
121          * Invoke next and ensure that the HQ is now busy (initial valence was
122          * set to 1). Also check for proper errors for a busy HQ. Such as when
123          * trying to reinvoke next().
124          *
125          */

126         CrawlURI curi = hq.next(); // Should return curis[2]
127
assertEquals("next() did not return 'top' URI",
128                 curis[0].toString(),curi.toString());
129         assertTrue("HQ should now be busy, is " + hq.getStateByName(),
130                 hq.getState()==AdaptiveRevisitHostQueue.HQSTATE_BUSY);
131         try{
132             hq.next();
133             assertTrue("next() should throw an IllegalStateException if HQ " +
134                     "not ready",false);
135         } catch(IllegalStateException JavaDoc e){
136             // This is supposed to happen.
137
}
138         assertEquals("New top URI should be null",
139                 null,hq.peek());
140         
141         hq.add(curis[1],false);
142         assertEquals("Second CrawlURI should be top",curis[1].toString(),
143                 hq.peek().toString());
144         assertEquals("Size of HQ should now be 2",2,hq.getSize());
145
146         // Return it with next fetch time in the future.
147
curi.putLong(A_TIME_OF_NEXT_PROCESSING,
148             hq.peek().getLong(A_TIME_OF_NEXT_PROCESSING)
149                         +100000); // 100 sec behind current top.
150
hq.update(curi,false,0);
151         assertEquals("Second CrawlURI should be still be top",
152                 curis[1].toString(),hq.peek().toString());
153         assertEquals("Size of HQ should still be 2",2,hq.getSize());
154         
155         hq.add(curis[2],false);
156         assertEquals("Second CrawlURI should still be top",
157                 curis[1].toString(), hq.peek().toString());
158         assertEquals("Size of HQ should now be 3",3,hq.getSize());
159
160         /*
161          * If there are no URIs ready, the queue should snooze, even though no
162          * politeness demand has been made.
163          * <p>
164          * Confirms this and that it wakes up.
165          */

166         assertTrue("HQ should be snoozed, is " + hq.getStateByName(),
167                 hq.getState()==AdaptiveRevisitHostQueue.HQSTATE_SNOOZED);
168         // Wait past wakeup time
169
synchronized(this){
170             wait(hq.getNextReadyTime()-System.currentTimeMillis()+100);
171         }
172         assertTrue("HQ should now be ready, is " + hq.getStateByName(),
173                 hq.getState()==AdaptiveRevisitHostQueue.HQSTATE_READY);
174     
175         /*
176          * Re-adds a URI with a lower ready time which should promote it to the
177          * top of the queue. Checks if this happens correctly.
178          *
179          * Then tests an add override which would demote it back, ensures that
180          * this fails as it should (i.e. URIs time of next processing remains
181          * unchanged).
182          */

183         curis[2].putLong(
184                 A_TIME_OF_NEXT_PROCESSING,
185                 curis[1].getLong(A_TIME_OF_NEXT_PROCESSING)
186                             -1000); // 1 sec. prior to current top
187
hq.add(curis[2],true);
188         assertEquals("Size of HQ should still be 3",hq.getSize(),3);
189         assertEquals("Third CrawlURI should be now be top",
190                 curis[2].toString(), hq.peek().toString());
191         curis[2].putLong(A_TIME_OF_NEXT_PROCESSING,
192                 curis[1].getLong(A_TIME_OF_NEXT_PROCESSING)
193                             +10000); // 10 sec. later
194
hq.add(curis[2],true);
195         assertEquals("Size of HQ should still be 3",hq.getSize(),3);
196         assertEquals("Third CrawlURI should still top",
197                 curis[2].toString(), hq.peek().toString());
198
199     
200         /*
201          * Invoke next and ensure that the HQ is now busy (initial valence was
202          * set to 1). Also check for proper errors for a busy HQ. Such as when
203          * trying to reinvoke next().
204          *
205          */

206         curi = hq.next(); // Should return curis[2]
207
assertEquals("next() did not return 'top' URI",
208                 curis[2].toString(),curi.toString());
209         assertTrue("HQ should now be busy, is " + hq.getStateByName(),
210                 hq.getState()==AdaptiveRevisitHostQueue.HQSTATE_BUSY);
211         try{
212             hq.next();
213             assertTrue("next() should throw an IllegalStateException if HQ " +
214                     "not ready",false);
215         } catch(IllegalStateException JavaDoc e){
216             // This is supposed to happen.
217
}
218         assertEquals("New top URI",
219                 curis[1].toString(),hq.peek().toString());
220         
221         /*
222          * Add a URI while HQ is busy. Check if this succeeds normally.
223          *
224          */

225         
226         curis[3].putLong(A_TIME_OF_NEXT_PROCESSING,
227                 curis[1].getLong(A_TIME_OF_NEXT_PROCESSING)
228                         - 1); // 1 msec. ahead of current top (order [2] 3 1 0)
229
hq.add(curis[3],false);
230         assertEquals("Size of HQ should now be 4",4,hq.getSize());
231         
232         
233         /*
234          * Invoke update, first with an invalid URI (not the one issued by
235          * next() earlier), this should fail. Then with the correct one, this
236          * should succeed. Then finally test update again with an invalid URI
237          * (i.e. when no HQ has no outstanding URIs, that should fail.
238          *
239          * At each step, proper checks are made of state and that methods give
240          * appropriate errors.
241          *
242          * Updated URI is given low time of next processing to put it 'in front'
243          */

244     
245         try {
246             hq.update(curis[1],false,0);
247             assertTrue("update() should not accept URI",false);
248         } catch(IllegalStateException JavaDoc e){
249             // This is supposed to happen
250
}
251         
252         // We do not change the 'time of next processing' on update
253
// so curis[2] should again be at top of queue.
254
long timeOfPolitenessWakeUp = System.currentTimeMillis()+2000;
255         hq.update(curi,true,timeOfPolitenessWakeUp); // Wake in 5 sec.
256
assertTrue("HQ should be snoozed, is " + hq.getStateByName(),
257                 hq.getState()==AdaptiveRevisitHostQueue.HQSTATE_SNOOZED);
258         
259         try {
260             hq.update(curis[2],false,0);
261             assertTrue("update() should not accept URI",false);
262         } catch(IllegalStateException JavaDoc e){
263             // This is supposed to happen
264
}
265         assertEquals("HQs time of next ready should reflect set wait time ",
266                 timeOfPolitenessWakeUp, hq.getNextReadyTime());
267         
268         
269         /*
270          * Check if the HQ wakes up from it's 'snoozing'
271          *
272          */

273         // Wait past wakeup time
274
synchronized(this){
275             wait(hq.getNextReadyTime()-System.currentTimeMillis()+100);
276         }
277         assertTrue("HQ should now be ready, is " + hq.getStateByName(),
278                 hq.getState()==AdaptiveRevisitHostQueue.HQSTATE_READY);
279         assertEquals("HQs time of next ready should still be when it 'woken' " +
280                 "up.", timeOfPolitenessWakeUp, hq.getNextReadyTime());
281    
282         /*
283          * Invoke next so that the HQ has a URI being processed. Then
284          * close the HQ and reopen it to ensure that this happens normally, i.e.
285          * state is recovered properly, including the restoration of the URI
286          * being processed, back to the regular queue (where it should be
287          * first).
288          *
289          * On recreating the HQ, set valence to 2.
290          */

291         curi = hq.next(); // Should return curis[2]
292
assertEquals("next() did not return 'top' URI",
293                 curis[2].toString(),curi.toString());
294         assertTrue("HQ should now be busy, is " + hq.getStateByName(),
295                 hq.getState()==AdaptiveRevisitHostQueue.HQSTATE_BUSY);
296         hq.close();
297         
298         hq = new AdaptiveRevisitHostQueue("bok.hi.is", env, catalog, 2);
299         
300         assertEquals("Size of HQ after reopening should now be 4",
301                 4, hq.getSize());
302         assertTrue("HQ should be ready on reopen, is " + hq.getStateByName(),
303                 hq.getState()==AdaptiveRevisitHostQueue.HQSTATE_READY);
304         assertEquals("CrawlURI 'in processing' before should be top",
305                 curi.toString(), hq.peek().toString());
306     
307         /* Check if valence higher then 1 is properly handled.
308          *
309          * Invoke next(), check if still ready and new top URI.
310          */

311         curi = hq.next(); // Should return curis[2]
312
assertEquals("next() did not return 'top' URI",
313                 curis[2].toString(),curi.toString());
314         assertTrue("HQ should still be ready, is " + hq.getStateByName(),
315                 hq.getState()==AdaptiveRevisitHostQueue.HQSTATE_READY);
316         
317         /* Invoke next() again, check if now busy.
318          */

319         curi = hq.next(); // Should return curis[3]
320
assertEquals("next() did not return 'top' URI",
321                 curis[3].toString(),curi.toString());
322         assertTrue("HQ should be busy, is " + hq.getStateByName(),
323                 hq.getState()==AdaptiveRevisitHostQueue.HQSTATE_BUSY);
324         assertEquals("Size of HQ should still be 4",
325                 4, hq.getSize());
326
327         
328         /* Update() second URI issued. Confirm HQ is now ready again. URI is
329          * given same time of next processing to put it 'in front'. (no snooze)
330          */

331         hq.update(curi,false,0);
332         assertTrue("HQ should now be ready, is " + hq.getStateByName(),
333                 hq.getState()==AdaptiveRevisitHostQueue.HQSTATE_READY);
334         assertEquals("'updated' CrawlURI before should be top",
335                 curi.toString(), hq.peek().toString());
336         
337         
338         /* Update() again, ensure proper state. URI is NOT placed at front of
339          * queue and snooze time is given. But the HQ should not enter a
340          * snoozed state because the 'other' slot is free.
341          */

342         
343         hq.update(curis[2],true,System.currentTimeMillis() + 1000000); // 10sec
344
curis[3].putLong(A_TIME_OF_NEXT_PROCESSING,
345                 curis[1].getLong(A_TIME_OF_NEXT_PROCESSING)
346                         + 1000); // 1 sec. behind of current top
347
assertTrue("HQ should still be ready, is " + hq.getStateByName(),
348                 hq.getState()==AdaptiveRevisitHostQueue.HQSTATE_READY);
349         assertEquals("Top CrawlURI before should be unchanged",
350                 curi.toString(), hq.peek().toString());
351         
352
353         // TODO: Test sorting with scheduling directives.
354

355         /*
356          * Close the ARHostQueue and the Environment
357          */

358         hq.close();
359         catalog.close();
360         env.close();
361         cleanUpOldFiles("AR");
362     }
363     
364 }
365
Popular Tags