KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > archive > crawler > util > DiskFPMergeUriUniqFilter


1 /* DiskFPMergeUriUniqFilter
2 *
3 * $Id: DiskFPMergeUriUniqFilter.java,v 1.4 2006/07/13 06:04:11 gojomo Exp $
4 *
5 * Created on Dec 14, 2005
6 *
7 * Copyright (C) 2005 Internet Archive.
8 *
9 * This file is part of the Heritrix web crawler (crawler.archive.org).
10 *
11 * Heritrix is free software; you can redistribute it and/or modify
12 * it under the terms of the GNU Lesser Public License as published by
13 * the Free Software Foundation; either version 2.1 of the License, or
14 * any later version.
15 *
16 * Heritrix is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 * GNU Lesser Public License for more details.
20 *
21 * You should have received a copy of the GNU Lesser Public License
22 * along with Heritrix; if not, write to the Free Software
23 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
24 */

25 package org.archive.crawler.util;
26
27 import it.unimi.dsi.fastutil.longs.LongIterators;
28 import it.unimi.dsi.fastutil.longs.LongIterator;
29
30 import java.io.BufferedInputStream JavaDoc;
31 import java.io.BufferedOutputStream JavaDoc;
32 import java.io.DataInputStream JavaDoc;
33 import java.io.DataOutputStream JavaDoc;
34 import java.io.File JavaDoc;
35 import java.io.FileInputStream JavaDoc;
36 import java.io.FileNotFoundException JavaDoc;
37 import java.io.FileOutputStream JavaDoc;
38 import java.io.IOException JavaDoc;
39 import java.util.NoSuchElementException JavaDoc;
40
41 import org.archive.util.ArchiveUtils;
42
43 /**
44  * Crude FPMergeUriUniqFilter using a disk data file of raw longs as the
45  * overall FP record.
46  *
47  * @author gojomo
48  */

49 public class DiskFPMergeUriUniqFilter extends FPMergeUriUniqFilter {
50     long count = 0;
51     File JavaDoc scratchDir;
52     File JavaDoc currentFps;
53     File JavaDoc newFpsFile;
54     DataOutputStream JavaDoc newFps;
55     long newCount;
56     DataInputStream JavaDoc oldFps;
57     
58     public DiskFPMergeUriUniqFilter(File JavaDoc scratchDir) {
59         super();
60         this.scratchDir = scratchDir;
61         // TODO: Use two scratch locations, to allow IO to be split
62
// over separate disks
63
}
64     
65     /* (non-Javadoc)
66      * @see org.archive.crawler.util.FPMergeUriUniqFilter#beginFpMerge()
67      */

68     protected LongIterator beginFpMerge() {
69         newFpsFile = new File JavaDoc(scratchDir,ArchiveUtils.get17DigitDate()+".fp");
70         if(newFpsFile.exists()) {
71             throw new RuntimeException JavaDoc(newFpsFile+" exists");
72         }
73         try {
74             newFps = new DataOutputStream JavaDoc(new BufferedOutputStream JavaDoc(new FileOutputStream JavaDoc(newFpsFile)));
75         } catch (FileNotFoundException JavaDoc e) {
76             throw new RuntimeException JavaDoc(e);
77         }
78         newCount = 0;
79         if(currentFps==null) {
80             return LongIterators.EMPTY_ITERATOR;
81         }
82         try {
83             oldFps = new DataInputStream JavaDoc(new BufferedInputStream JavaDoc(new FileInputStream JavaDoc(currentFps)));
84         } catch (FileNotFoundException JavaDoc e1) {
85             throw new RuntimeException JavaDoc(e1);
86         }
87         return new DataFileLongIterator(oldFps);
88     }
89
90     /* (non-Javadoc)
91      * @see org.archive.crawler.util.FPMergeUriUniqFilter#addNewFp(long)
92      */

93     protected void addNewFp(long fp) {
94         try {
95             newFps.writeLong(fp);
96             newCount++;
97         } catch (IOException JavaDoc e) {
98             throw new RuntimeException JavaDoc(e);
99         }
100     }
101
102     /* (non-Javadoc)
103      * @see org.archive.crawler.util.FPMergeUriUniqFilter#finishFpMerge()
104      */

105     protected void finishFpMerge() {
106         try {
107             newFps.close();
108             File JavaDoc oldFpsFile = currentFps;
109             currentFps = newFpsFile;
110             if(oldFps!=null) {
111                 oldFps.close();
112             }
113             if(oldFpsFile!=null) {
114                 oldFpsFile.delete();
115             }
116         } catch (IOException JavaDoc e) {
117             throw new RuntimeException JavaDoc(e);
118         }
119         count = newCount;
120     }
121
122     /* (non-Javadoc)
123      * @see org.archive.crawler.datamodel.UriUniqFilter#count()
124      */

125     public long count() {
126         return count;
127     }
128
129     public class DataFileLongIterator implements LongIterator {
130         DataInputStream JavaDoc in;
131         long next;
132         boolean nextIsValid = false;
133         
134         /**
135          * Construct a long iterator reading from the given
136          * stream.
137          *
138          * @param disStream DataInputStream from which to read longs
139          */

140         public DataFileLongIterator(DataInputStream JavaDoc disStream) {
141             this.in = disStream;
142         }
143
144         /**
145          * Test whether any items remain; loads next item into
146          * holding 'next' field.
147          *
148          * @see java.util.Iterator#hasNext()
149          */

150         public boolean hasNext() {
151             return nextIsValid ? true: lookahead();
152         }
153         
154         /**
155          * Check if there's a next by trying to read it.
156          *
157          * @return true if 'next' field is filled with a valid next, false otherwise
158          */

159         protected boolean lookahead() {
160             try {
161                 next = in.readLong();
162             } catch (IOException JavaDoc e) {
163                 return false;
164             }
165             nextIsValid = true;
166             return true;
167         }
168
169         /**
170          * Return the next item.
171          *
172          * @see java.util.Iterator#next()
173          */

174         public Long JavaDoc next() {
175             if (!hasNext()) {
176                 throw new NoSuchElementException JavaDoc();
177             }
178             // 'next' is guaranteed set by a hasNext() which returned true
179
Long JavaDoc returnObj = new Long JavaDoc(this.next);
180             this.nextIsValid = false;
181             return returnObj;
182         }
183         
184         /* (non-Javadoc)
185          * @see java.util.Iterator#remove()
186          */

187         public void remove() {
188             throw new UnsupportedOperationException JavaDoc();
189         }
190         
191         
192         /* (non-Javadoc)
193          * @see it.unimi.dsi.fastutil.longs.LongIterator#nextLong()
194          */

195         public long nextLong() {
196             if (!hasNext()) {
197                 throw new NoSuchElementException JavaDoc();
198             }
199             // 'next' is guaranteed non-null by a hasNext() which returned true
200
this.nextIsValid = false; // after this return, 'next' needs refresh
201
return this.next;
202         }
203
204         /* (non-Javadoc)
205          * @see it.unimi.dsi.fastutil.longs.LongIterator#skip(int)
206          */

207         public int skip(int arg0) {
208             return 0;
209         }
210     }
211
212 }
213
Popular Tags