1 2 3 4 package net.nutch.tools; 5 6 import java.io.*; 7 import java.util.Properties ; 8 import java.util.Random ; 9 10 import net.nutch.db.Page; 11 import net.nutch.fetcher.FetcherOutput; 12 import net.nutch.io.ArrayFile; 13 import net.nutch.io.MD5Hash; 14 import net.nutch.fs.*; 15 import net.nutch.segment.SegmentReader; 16 import net.nutch.segment.SegmentWriter; 17 import net.nutch.util.*; 18 import net.nutch.pagedb.FetchListEntry; 19 import net.nutch.parse.Outlink; 20 import net.nutch.parse.ParseData; 21 import net.nutch.parse.ParseText; 22 import net.nutch.protocol.Content; 23 24 import junit.framework.TestCase; 25 26 27 public class TestSegmentMergeTool extends TestCase { 28 29 protected static final int SEGMENT_CNT = 10; 30 31 protected static final int PAGE_CNT = 500; 32 33 protected File testDir = null; 34 35 public TestSegmentMergeTool(String name) { 36 super(name); 37 } 38 39 44 protected void setUp() throws Exception { 45 super.setUp(); 46 testDir = File.createTempFile(".smttest", ""); 47 testDir.delete(); 48 testDir.mkdirs(); 49 } 50 51 59 protected void createSegmentData(NutchFileSystem nfs, File dir, boolean unique) throws Exception { 60 SegmentWriter sw = new SegmentWriter(nfs, dir, true); 61 Random r = new Random (System.currentTimeMillis()); 62 for (int i = 0; i < PAGE_CNT; i++) { 63 String url = "http://www.example.com/page-" + i; 64 String rnd = ""; 65 if (unique) { 66 rnd = "/" + System.currentTimeMillis(); 67 url += rnd; 68 } 69 url += "/example.html"; 70 FetchListEntry fle = new FetchListEntry(true, new Page(url, 1.0f), new String [] { "test" + rnd }); 71 FetcherOutput fo = new FetcherOutput(fle, MD5Hash.digest(url), FetcherOutput.SUCCESS); 72 StringBuffer content = new StringBuffer ("<html><body><h1>Hello from Page " + i + "</h1>"); 73 if (unique) { 74 content.append("<p>Created at epoch time: " + System.currentTimeMillis() + ", " + r.nextLong() + "</p>"); 75 } 76 for (int k = 0; k < 10; k++) { 77 content.append("<p>" + k + " lines of text in the queue, " + k + " lines of text...</p>\n"); 78 } 79 content.append("</body></html>"); 80 Properties meta = new Properties (); 81 meta.setProperty("Content-Type", "text/html"); 82 meta.setProperty("Host", "http://localhost"); 83 meta.setProperty("Connection", "Keep-alive, close"); 84 Content co = new Content(url, "http://www.example.com", content.toString().getBytes("UTF-8"), "text/html", meta); 85 ParseData pd = new ParseData("Hello from Page " + i, new Outlink[0], meta); 86 StringBuffer text = new StringBuffer ("Hello from Page" + i); 87 if (unique) { 88 text.append("\nCreated at epoch time: " + System.currentTimeMillis() + ", " + r.nextLong()); 89 } 90 for (int k = 0; k < 10; k++) { 91 text.append(k + " lines of text in the queue, " + k + " lines of text...\n"); 92 } 93 ParseText pt = new ParseText(text.toString()); 94 sw.append(fo, co, pt, pd); 95 } 96 sw.close(); 97 } 98 99 104 protected void tearDown() throws Exception { 105 NutchFileSystem nfs = new LocalFileSystem(); 106 try { 107 super.tearDown(); 108 try { 109 FileUtil.fullyDelete(nfs, testDir); 110 } catch (Exception e) { 111 System.out.println("NON-FATAL: " + e.getMessage()); 112 } 113 } finally { 114 nfs.close(); 115 } 116 } 117 118 124 public void testUniqueMerge() throws IOException { 125 NutchFileSystem nfs = new LocalFileSystem(); 126 try { 127 File dataDir = new File(testDir, "segments"); 128 File outSegment = new File(testDir, "output"); 129 try { 130 for (int i = 0; i < SEGMENT_CNT; i++) { 131 File f = new File(dataDir, "seg" + i); 132 nfs.mkdirs(f); 133 createSegmentData(nfs, f, true); 134 } 135 runTool(dataDir, outSegment); 136 SegmentReader sr = new SegmentReader(outSegment.listFiles()[0]); 137 assertEquals(SEGMENT_CNT * PAGE_CNT, sr.size); 138 sr.close(); 139 } catch (Throwable e) { 140 e.printStackTrace(); 141 fail(e.getMessage() + ", " + e.getStackTrace()); 142 } 143 } finally { 144 nfs.close(); 145 } 146 } 147 148 protected void runTool(File dataDir, File outSegment) throws Exception { 149 SegmentMergeTool.main( 150 new String [] {"-dir", dataDir.toString(), "-o", outSegment.toString(), 151 "-ds"}); 152 } 153 154 160 public void testSameMerge() throws IOException { 161 NutchFileSystem nfs = new LocalFileSystem(); 162 try { 163 File dataDir = new File(testDir, "segments"); 164 File outSegment = new File(testDir, "output"); 165 try { 166 for (int i = 0; i < SEGMENT_CNT; i++) { 167 File f = new File(dataDir, "seg" + i); 168 nfs.mkdirs(f); 169 createSegmentData(nfs, f, false); 170 } 171 runTool(dataDir, outSegment); 172 SegmentReader sr = new SegmentReader(outSegment.listFiles()[0]); 173 assertEquals(PAGE_CNT, sr.size); 174 sr.close(); 175 } catch (Exception e) { 176 e.printStackTrace(); 177 fail(e.getMessage()); 178 } 179 } catch (Throwable ex) { 180 ex.printStackTrace(); 181 fail(ex.getMessage()); 182 } finally { 183 nfs.close(); 184 } 185 } 186 187 public void testCorruptSegmentMerge() throws IOException { 188 NutchFileSystem nfs = new LocalFileSystem(); 189 try { 190 File dataDir = new File(testDir, "segments"); 191 File outSegment = new File(testDir, "output"); 192 try { 193 for (int i = 0; i < SEGMENT_CNT; i++) { 194 File f = new File(dataDir, "seg" + i); 195 nfs.mkdirs(f); 196 createSegmentData(nfs, f, true); 197 switch (i) { 199 case 0: 200 File data = new File(f, FetcherOutput.DIR_NAME); 202 data = new File(data, "data"); 203 RandomAccessFile raf = new RandomAccessFile(data, "rws"); 204 raf.setLength(raf.length() - raf.length() / 4); 205 raf.close(); 206 break; 207 case 1: 208 data = new File(f, Content.DIR_NAME); 210 data = new File(data, "data"); 211 raf = new RandomAccessFile(data, "rws"); 212 raf.setLength(raf.length() - raf.length() / 4); 213 raf.close(); 214 break; 215 case 2: 216 data = new File(f, Content.DIR_NAME); 219 new File(data, "data").delete(); 220 new File(data, "index").delete(); 221 break; 222 case 3: 223 data = new File(f, FetcherOutput.DIR_NAME); 227 new File(data, "index").delete(); 228 data = new File(f, Content.DIR_NAME); 229 new File(data, "index").delete(); 230 data = new File(f, ParseData.DIR_NAME); 231 new File(data, "index").delete(); 232 data = new File(f, ParseText.DIR_NAME); 233 new File(data, "index").delete(); 234 break; 235 default: 236 ; 238 } 239 } 240 runTool(dataDir, outSegment); 241 SegmentReader sr = new SegmentReader(outSegment.listFiles()[0]); 242 int maxCnt = PAGE_CNT * (SEGMENT_CNT - 1) - 2 * PAGE_CNT / 4 + 2 * (SEGMENT_CNT -1); 249 assertTrue(sr.size < maxCnt); 251 sr.close(); 252 } catch (Exception e) { 253 e.printStackTrace(); 254 fail(e.getMessage()); 255 } 256 } catch (Throwable ex) { 257 ex.printStackTrace(); 258 fail(ex.getMessage()); 259 } finally { 260 nfs.close(); 261 } 262 } 263 } 264 | Popular Tags |