/* Copyright (c) 2004 The Nutch Organization. All rights reserved. */
/* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */

package net.nutch.tools;

import java.io.*;
import java.util.Properties;
import java.util.Random;

import net.nutch.db.Page;
import net.nutch.fetcher.FetcherOutput;
import net.nutch.io.ArrayFile;
import net.nutch.io.MD5Hash;
import net.nutch.fs.*;
import net.nutch.segment.SegmentReader;
import net.nutch.segment.SegmentWriter;
import net.nutch.util.*;
import net.nutch.pagedb.FetchListEntry;
import net.nutch.parse.Outlink;
import net.nutch.parse.ParseData;
import net.nutch.parse.ParseText;
import net.nutch.protocol.Content;

import junit.framework.TestCase;

/** Unit tests for SegmentMergeTool methods. */
public class TestSegmentMergeTool extends TestCase {

  protected static final int SEGMENT_CNT = 10;

  protected static final int PAGE_CNT = 500;

  protected File testDir = null;

  public TestSegmentMergeTool(String name) {
    super(name);
  }

  /**
   * Create test directory.
   *
   * @see junit.framework.TestCase#setUp()
   */
  protected void setUp() throws Exception {
    super.setUp();
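    // File.createTempFile only guarantees a unique name; the file is then
    // deleted and recreated as a directory to obtain a unique temp directory.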
    testDir = File.createTempFile(".smttest", "");
    testDir.delete();
    testDir.mkdirs();
  }

  /**
   * Create test segment data.
   *
   * @param nfs filesystem to write to
   * @param dir segment directory
   * @param unique if true, use unique data per segment, otherwise use the
   *        same data
   * @throws Exception
   */
  protected void createSegmentData(NutchFileSystem nfs, File dir, boolean unique) throws Exception {
    SegmentWriter sw = new SegmentWriter(nfs, dir, true);
    Random r = new Random(System.currentTimeMillis());
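    // Write PAGE_CNT synthetic pages into the segment. When 'unique' is true,
    // a timestamp suffix makes URLs and content differ between segments, so
    // testUniqueMerge expects all SEGMENT_CNT * PAGE_CNT pages to survive the
    // merge, while identical data collapses to PAGE_CNT in testSameMerge.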
    for (int i = 0; i < PAGE_CNT; i++) {
      String url = "http://www.example.com/page-" + i;
      String rnd = "";
      if (unique) {
        rnd = "/" + System.currentTimeMillis();
        url += rnd;
      }
      url += "/example.html";
      FetchListEntry fle = new FetchListEntry(true, new Page(url, 1.0f), new String[] { "test" + rnd });
      FetcherOutput fo = new FetcherOutput(fle, MD5Hash.digest(url), FetcherOutput.SUCCESS);
      StringBuffer content = new StringBuffer("<html><body><h1>Hello from Page " + i + "</h1>");
      if (unique) {
        content.append("<p>Created at epoch time: " + System.currentTimeMillis() + ", " + r.nextLong() + "</p>");
      }
      for (int k = 0; k < 10; k++) {
        content.append("<p>" + k + " lines of text in the queue, " + k + " lines of text...</p>\n");
      }
      content.append("</body></html>");
      Properties meta = new Properties();
      meta.setProperty("Content-Type", "text/html");
      meta.setProperty("Host", "http://localhost");
      meta.setProperty("Connection", "Keep-alive, close");
      Content co = new Content(url, "http://www.example.com", content.toString().getBytes("UTF-8"), "text/html", meta);
      ParseData pd = new ParseData("Hello from Page " + i, new Outlink[0], meta);
      StringBuffer text = new StringBuffer("Hello from Page" + i);
      if (unique) {
        text.append("\nCreated at epoch time: " + System.currentTimeMillis() + ", " + r.nextLong());
      }
      for (int k = 0; k < 10; k++) {
        text.append(k + " lines of text in the queue, " + k + " lines of text...\n");
      }
      ParseText pt = new ParseText(text.toString());
      sw.append(fo, co, pt, pd);
    }
    sw.close();
  }

  /**
   * Remove test directory.
   *
   * @see junit.framework.TestCase#tearDown()
   */
  protected void tearDown() throws Exception {
    NutchFileSystem nfs = new LocalFileSystem();
    try {
      super.tearDown();
      try {
        FileUtil.fullyDelete(nfs, testDir);
      } catch (Exception e) {
        System.out.println("NON-FATAL: " + e.getMessage());
      }
    } finally {
      nfs.close();
    }
  }

  /**
   * Test merging segments with unique data. The output (merged segment)
   * should contain a number of pages exactly equal to the segment count
   * times the page count per segment.
   */
  public void testUniqueMerge() throws IOException {
    NutchFileSystem nfs = new LocalFileSystem();
    try {
      File dataDir = new File(testDir, "segments");
      File outSegment = new File(testDir, "output");
      try {
        for (int i = 0; i < SEGMENT_CNT; i++) {
          File f = new File(dataDir, "seg" + i);
          nfs.mkdirs(f);
          createSegmentData(nfs, f, true);
        }
        runTool(dataDir, outSegment);
        SegmentReader sr = new SegmentReader(outSegment.listFiles()[0]);
        assertEquals(SEGMENT_CNT * PAGE_CNT, sr.size);
        sr.close();
      } catch (Throwable e) {
        e.printStackTrace();
        fail(e.getMessage() + ", " + e.getStackTrace());
      }
    } finally {
      nfs.close();
    }
  }

  protected void runTool(File dataDir, File outSegment) throws Exception {
    SegmentMergeTool.main(
        new String[] { "-dir", dataDir.toString(), "-o", outSegment.toString(), "-ds" });
  }

  /**
   * Test merging segments with the same data. The output (merged segment)
   * should contain a number of pages exactly equal to the page count of a
   * single segment.
   */
  public void testSameMerge() throws IOException {
    NutchFileSystem nfs = new LocalFileSystem();
    try {
      File dataDir = new File(testDir, "segments");
      File outSegment = new File(testDir, "output");
      try {
        for (int i = 0; i < SEGMENT_CNT; i++) {
          File f = new File(dataDir, "seg" + i);
          nfs.mkdirs(f);
          createSegmentData(nfs, f, false);
        }
        runTool(dataDir, outSegment);
        SegmentReader sr = new SegmentReader(outSegment.listFiles()[0]);
        assertEquals(PAGE_CNT, sr.size);
        sr.close();
      } catch (Exception e) {
        e.printStackTrace();
        fail(e.getMessage());
      }
    } catch (Throwable ex) {
      ex.printStackTrace();
      fail(ex.getMessage());
    } finally {
      nfs.close();
    }
  }

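  /**
   * Test merging segments where some inputs are deliberately corrupted
   * (truncated data files, removed content, or missing index files). The
   * merge should still succeed, but produce fewer entries than the maximum
   * computed in the test body.
   */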
  public void testCorruptSegmentMerge() throws IOException {
    NutchFileSystem nfs = new LocalFileSystem();
    try {
      File dataDir = new File(testDir, "segments");
      File outSegment = new File(testDir, "output");
      try {
        for (int i = 0; i < SEGMENT_CNT; i++) {
          File f = new File(dataDir, "seg" + i);
          nfs.mkdirs(f);
          createSegmentData(nfs, f, true);
          // corrupt some segments in various ways... be creative :-)
          switch (i) {
            case 0:
              // truncate the fetcherOutput data file
              File data = new File(f, FetcherOutput.DIR_NAME);
              data = new File(data, "data");
              RandomAccessFile raf = new RandomAccessFile(data, "rws");
              raf.setLength(raf.length() - raf.length() / 4);
              raf.close();
              break;
            case 1:
              // truncate the Content data file
              data = new File(f, Content.DIR_NAME);
              data = new File(data, "data");
              raf = new RandomAccessFile(data, "rws");
              raf.setLength(raf.length() - raf.length() / 4);
              raf.close();
              break;
            case 2:
              // trash the whole content
              data = new File(f, Content.DIR_NAME);
              new File(data, "data").delete();
              new File(data, "index").delete();
              break;
            case 3:
              // remove the "index" files - this is a very typical symptom for
              // segments created by a crashed fetcher process. Such segments
              // should be automatically fixed and recovered.
              data = new File(f, FetcherOutput.DIR_NAME);
              new File(data, "index").delete();
              data = new File(f, Content.DIR_NAME);
              new File(data, "index").delete();
              data = new File(f, ParseData.DIR_NAME);
              new File(data, "index").delete();
              data = new File(f, ParseText.DIR_NAME);
              new File(data, "index").delete();
              break;
            default:
              // do nothing
              ;
          }
        }
        runTool(dataDir, outSegment);
        SegmentReader sr = new SegmentReader(outSegment.listFiles()[0]);
        // we arrive at this expression as follows:
        // 1. PAGE_CNT * (SEGMENT_CNT - 1): because we trash one whole segment
        // 2. - 2 * PAGE_CNT / 4: because for two segments we truncate 1/4 of the data file
        // 3. + 2 * (SEGMENT_CNT - 1): slack, because sometimes truncation falls
        //    on the boundary of the last entry
        int maxCnt = PAGE_CNT * (SEGMENT_CNT - 1) - 2 * PAGE_CNT / 4 + 2 * (SEGMENT_CNT - 1);
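        // With the defaults above (SEGMENT_CNT = 10, PAGE_CNT = 500) this is
        // 500 * 9 - 2 * 500 / 4 + 2 * 9 = 4500 - 250 + 18 = 4268.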
        // System.out.println("maxCnt=" + maxCnt + ", sr.size=" + sr.size);
        assertTrue(sr.size < maxCnt);
        sr.close();
      } catch (Exception e) {
        e.printStackTrace();
        fail(e.getMessage());
      }
    } catch (Throwable ex) {
      ex.printStackTrace();
      fail(ex.getMessage());
    } finally {
      nfs.close();
    }
  }
}
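
For reference, a JUnit 3.x test class like this one can be driven from a small
command-line runner. The sketch below is an assumption-based example (the
driver class name RunSegmentMergeTests is hypothetical); it relies only on the
standard junit.framework.TestSuite and junit.textui.TestRunner APIs that ship
with JUnit 3.

import junit.framework.Test;
import junit.framework.TestSuite;

public class RunSegmentMergeTests {

  /** Collect all test* methods from TestSegmentMergeTool into one suite. */
  public static Test suite() {
    return new TestSuite(net.nutch.tools.TestSegmentMergeTool.class);
  }

  public static void main(String[] args) {
    // Text-mode runner: prints progress and a summary to the console.
    junit.textui.TestRunner.run(suite());
  }
}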