KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > net > nutch > db > DBKeyDivision


1 /* Copyright (c) 2003 The Nutch Organization. All rights reserved. */
2 /* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */
3 package net.nutch.db;
4
5 import java.io.*;
6 import java.util.*;
7 import net.nutch.io.*;
8
9 /************************************************
10  * DBKeyDivision exists for other DB classes to
11  * figure out how to find the right distributed-DB
12  * section.
13  *
14  * @author Mike Cafarella
15  ************************************************/

16 public class DBKeyDivision {
17     //
18
// We need to divide the URL_UTF keyspace into equal portions.
19
// Some URLs are more likely than others, so we choose the
20
// "dividing values" empirically. These numbers are derived
21
// from examining the DMOZ URL set.
22
//
23
public static String JavaDoc[] URL_KEYSPACE_DIVIDERS = {"",
24                                                     "http://devrando",
25                                                     "http://hoaxinh.",
26                                                     "http://magyn.co",
27                                                     "http://osha-enf",
28                                                     "http://templeem",
29                                                     "http://wiem.one",
30                                                     "http://www.aero",
31                                                     "http://www.anth",
32                                                     "http://www.baro",
33                                                     "http://www.bruc",
34                                                     "http://www.chen",
35                                                     "http://www.corh",
36                                                     "http://www.dive",
37                                                     "http://www.envi",
38                                                     "http://www.fort",
39                                                     "http://www.geoc",
40                                                     "http://www.gree",
41                                                     "http://www.htmi",
42                                                     "http://www.j103",
43                                                     "http://www.lake",
44                                                     "http://www.masc",
45                                                     "http://www.mrmp",
46                                                     "http://www.njdr",
47                                                     "http://www.pbs.",
48                                                     "http://www.quee",
49                                                     "http://www.saba",
50                                                     "http://www.skee",
51                                                     "http://www.surg",
52                                                     "http://www.tony",
53                                                     "http://www.uvm.",
54                                                     "http://www.wigg"};
55
56     //
57
// For the MD5, we expect an even distribution.
58
//
59
public static MD5Hash[] MD5_KEYSPACE_DIVIDERS =
60     {new MD5Hash("00000000000000000000000000000000"),
61      new MD5Hash("08000000000000000000000000000000"),
62      new MD5Hash("10000000000000000000000000000000"),
63      new MD5Hash("18000000000000000000000000000000"),
64      new MD5Hash("20000000000000000000000000000000"),
65      new MD5Hash("28000000000000000000000000000000"),
66      new MD5Hash("30000000000000000000000000000000"),
67      new MD5Hash("38000000000000000000000000000000"),
68      new MD5Hash("40000000000000000000000000000000"),
69      new MD5Hash("48000000000000000000000000000000"),
70      new MD5Hash("50000000000000000000000000000000"),
71      new MD5Hash("58000000000000000000000000000000"),
72      new MD5Hash("60000000000000000000000000000000"),
73      new MD5Hash("68000000000000000000000000000000"),
74      new MD5Hash("70000000000000000000000000000000"),
75      new MD5Hash("78000000000000000000000000000000"),
76      new MD5Hash("80000000000000000000000000000000"),
77      new MD5Hash("88000000000000000000000000000000"),
78      new MD5Hash("90000000000000000000000000000000"),
79      new MD5Hash("98000000000000000000000000000000"),
80      new MD5Hash("a0000000000000000000000000000000"),
81      new MD5Hash("a8000000000000000000000000000000"),
82      new MD5Hash("b0000000000000000000000000000000"),
83      new MD5Hash("b8000000000000000000000000000000"),
84      new MD5Hash("c0000000000000000000000000000000"),
85      new MD5Hash("c8000000000000000000000000000000"),
86      new MD5Hash("d0000000000000000000000000000000"),
87      new MD5Hash("d8000000000000000000000000000000"),
88      new MD5Hash("e0000000000000000000000000000000"),
89      new MD5Hash("e8000000000000000000000000000000"),
90      new MD5Hash("f0000000000000000000000000000000"),
91      new MD5Hash("f8000000000000000000000000000000")};
92
93     //
94
// I know it stinks that MAX_SECTIONS is only 32 right now.
95
// We'll increase it after we compute more values for the
96
// 'divider' arrays below.
97
//
98
public static int MAX_SECTIONS = URL_KEYSPACE_DIVIDERS.length;
99
100     /**
101      * Find the right section index for the given URL, and the number of
102      * sections in the db overall.
103      */

104     public static int findURLSection(String JavaDoc url, int numSections) {
105         if (numSections > URL_KEYSPACE_DIVIDERS.length) {
106             throw new IllegalArgumentException JavaDoc("Too many db sections. " + numSections + " is greater than max of " + URL_KEYSPACE_DIVIDERS.length);
107         }
108
109         return binarySearch(url, URL_KEYSPACE_DIVIDERS, numSections);
110     }
111
112     /**
113      * Find the right section index for the given MD5, and the number of
114      * sections in the db overall.
115      */

116     public static int findMD5Section(MD5Hash md5, int numSections) {
117         if (numSections > MD5_KEYSPACE_DIVIDERS.length) {
118             throw new IllegalArgumentException JavaDoc("Too many db sections. " + numSections + " is greater than max of " + MD5_KEYSPACE_DIVIDERS.length);
119         }
120
121         return binarySearch(md5, MD5_KEYSPACE_DIVIDERS, numSections);
122     }
123
124     /**
125      * Utility method for finding the right place in the section array.
126      */

127     static int binarySearch(Comparable JavaDoc obj, Comparable JavaDoc keyArray[], int numSections) {
128         double stepSize = keyArray.length / (numSections * 1.0);
129         int lastBest = -1;
130         int low = 0;
131         int high = numSections - 1;
132         
133         while (low <= high) {
134             int mid = (low + high) / 2;
135             Comparable JavaDoc midObj = keyArray[(int) Math.round(mid * stepSize)];
136             int cmp = obj.compareTo(midObj);
137             if (cmp < 0) {
138                 high = mid - 1;
139             } else if (cmp > 0) {
140                 lastBest = mid;
141                 low = mid + 1;
142             } else {
143                 return mid;
144             }
145         }
146         return lastBest;
147     }
148
149 }
150
Popular Tags