KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > net > nutch > parse > ParseData


1 /* Copyright (c) 2004 The Nutch Organization. All rights reserved. */
2 /* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */
3
4 package net.nutch.parse;
5
6 import java.io.*;
7 import java.util.*;
8
9 import net.nutch.io.*;
10 import net.nutch.fs.*;
11 import net.nutch.util.*;
12 import net.nutch.tools.UpdateDatabaseTool;
13
14
15 /** Data extracted from a page's content.
16  * @see Parse#getData()
17  */

18 public final class ParseData extends VersionedWritable {
19   public static final String JavaDoc DIR_NAME = "parse_data";
20
21   private final static byte VERSION = 1;
22
23   private String JavaDoc title;
24   private Outlink[] outlinks;
25   private Properties metadata;
26
27   public ParseData() {}
28
29   public ParseData(String JavaDoc title, Outlink[] outlinks, Properties metadata) {
30     this.title = title;
31     this.outlinks = outlinks;
32     this.metadata = metadata;
33   }
34
35   //
36
// Accessor methods
37
//
38

39   /** The title of the page. */
40   public String JavaDoc getTitle() { return title; }
41
42   /** The outlinks of the page. */
43   public Outlink[] getOutlinks() { return outlinks; }
44
45   /** Other page properties. This is the place to find format-specific
46    * properties. Different parser implementations for different content types
47    * will populate this differently. */

48   public Properties getMetadata() { return metadata; }
49
50   /** Return the value of a metadata property. */
51   public String JavaDoc get(String JavaDoc name) { return getMetadata().getProperty(name); }
52
53   //
54
// Writable methods
55
//
56

57   public byte getVersion() { return VERSION; }
58
59   public final void readFields(DataInput in) throws IOException {
60     super.readFields(in); // check version
61

62     title = UTF8.readString(in); // read title
63

64     int totalOutlinks = in.readInt(); // read outlinks
65
int outlinksToRead = Math.min(UpdateDatabaseTool.MAX_OUTLINKS_PER_PAGE,
66                                   totalOutlinks);
67     outlinks = new Outlink[outlinksToRead];
68     for (int i = 0; i < outlinksToRead; i++) {
69       outlinks[i] = Outlink.read(in);
70     }
71     for (int i = outlinksToRead; i < totalOutlinks; i++) {
72       Outlink.skip(in);
73     }
74     
75     int propertyCount = in.readInt(); // read metadata
76
metadata = new Properties();
77     for (int i = 0; i < propertyCount; i++) {
78       metadata.put(UTF8.readString(in), UTF8.readString(in));
79     }
80     
81   }
82
83   public final void write(DataOutput out) throws IOException {
84     super.write(out); // write version
85

86     UTF8.writeString(out, title); // write title
87

88     out.writeInt(outlinks.length); // write outlinks
89
for (int i = 0; i < outlinks.length; i++) {
90       outlinks[i].write(out);
91     }
92
93     out.writeInt(metadata.size()); // write metadata
94
Iterator i = metadata.entrySet().iterator();
95     while (i.hasNext()) {
96       Map.Entry e = (Map.Entry)i.next();
97       UTF8.writeString(out, (String JavaDoc)e.getKey());
98       UTF8.writeString(out, (String JavaDoc)e.getValue());
99     }
100   }
101
102   public static ParseData read(DataInput in) throws IOException {
103     ParseData parseText = new ParseData();
104     parseText.readFields(in);
105     return parseText;
106   }
107
108   //
109
// other methods
110
//
111

112   public boolean equals(Object JavaDoc o) {
113     if (!(o instanceof ParseData))
114       return false;
115     ParseData other = (ParseData)o;
116     return
117       this.title.equals(other.title) &&
118       Arrays.equals(this.outlinks, other.outlinks) &&
119       this.metadata.equals(other.metadata);
120   }
121
122   public String JavaDoc toString() {
123     StringBuffer JavaDoc buffer = new StringBuffer JavaDoc();
124
125     buffer.append("Title: " + title + "\n" );
126
127     buffer.append("Outlinks: " + outlinks.length + "\n" );
128     for (int i = 0; i < outlinks.length; i++) {
129        buffer.append(" outlink: " + outlinks[i] + "\n");
130     }
131
132     buffer.append("Metadata: " + metadata + "\n" );
133
134     return buffer.toString();
135   }
136
137   public static void main(String JavaDoc argv[]) throws Exception JavaDoc {
138     String JavaDoc usage = "ParseData (-local | -ndfs <namenode:port>) recno segment";
139     
140     if (argv.length < 3) {
141       System.out.println("usage:" + usage);
142       return;
143     }
144
145     NutchFileSystem nfs = NutchFileSystem.parseArgs(argv, 0);
146     try {
147       int recno = Integer.parseInt(argv[0]);
148       String JavaDoc segment = argv[1];
149
150       File file = new File(segment, DIR_NAME);
151       System.out.println("Reading from file: " + file);
152
153       ArrayFile.Reader parses = new ArrayFile.Reader(nfs, file.toString());
154
155       ParseData parseDatum = new ParseData();
156       parses.get(recno, parseDatum);
157
158       System.out.println("Retrieved " + recno + " from file " + file);
159       System.out.println(parseDatum);
160
161       parses.close();
162     } finally {
163       nfs.close();
164     }
165   }
166 }
167
Popular Tags