1 2 3 4 package net.nutch.parse; 5 6 import java.io.*; 7 import java.util.*; 8 9 import net.nutch.io.*; 10 import net.nutch.fs.*; 11 import net.nutch.util.*; 12 import net.nutch.tools.UpdateDatabaseTool; 13 14 15 18 public final class ParseData extends VersionedWritable { 19 public static final String DIR_NAME = "parse_data"; 20 21 private final static byte VERSION = 1; 22 23 private String title; 24 private Outlink[] outlinks; 25 private Properties metadata; 26 27 public ParseData() {} 28 29 public ParseData(String title, Outlink[] outlinks, Properties metadata) { 30 this.title = title; 31 this.outlinks = outlinks; 32 this.metadata = metadata; 33 } 34 35 39 40 public String getTitle() { return title; } 41 42 43 public Outlink[] getOutlinks() { return outlinks; } 44 45 48 public Properties getMetadata() { return metadata; } 49 50 51 public String get(String name) { return getMetadata().getProperty(name); } 52 53 57 public byte getVersion() { return VERSION; } 58 59 public final void readFields(DataInput in) throws IOException { 60 super.readFields(in); 62 title = UTF8.readString(in); 64 int totalOutlinks = in.readInt(); int outlinksToRead = Math.min(UpdateDatabaseTool.MAX_OUTLINKS_PER_PAGE, 66 totalOutlinks); 67 outlinks = new Outlink[outlinksToRead]; 68 for (int i = 0; i < outlinksToRead; i++) { 69 outlinks[i] = Outlink.read(in); 70 } 71 for (int i = outlinksToRead; i < totalOutlinks; i++) { 72 Outlink.skip(in); 73 } 74 75 int propertyCount = in.readInt(); metadata = new Properties(); 77 for (int i = 0; i < propertyCount; i++) { 78 metadata.put(UTF8.readString(in), UTF8.readString(in)); 79 } 80 81 } 82 83 public final void write(DataOutput out) throws IOException { 84 super.write(out); 86 UTF8.writeString(out, title); 88 out.writeInt(outlinks.length); for (int i = 0; i < outlinks.length; i++) { 90 outlinks[i].write(out); 91 } 92 93 out.writeInt(metadata.size()); Iterator i = metadata.entrySet().iterator(); 95 while (i.hasNext()) { 96 Map.Entry e = (Map.Entry)i.next(); 97 UTF8.writeString(out, (String )e.getKey()); 98 UTF8.writeString(out, (String )e.getValue()); 99 } 100 } 101 102 public static ParseData read(DataInput in) throws IOException { 103 ParseData parseText = new ParseData(); 104 parseText.readFields(in); 105 return parseText; 106 } 107 108 112 public boolean equals(Object o) { 113 if (!(o instanceof ParseData)) 114 return false; 115 ParseData other = (ParseData)o; 116 return 117 this.title.equals(other.title) && 118 Arrays.equals(this.outlinks, other.outlinks) && 119 this.metadata.equals(other.metadata); 120 } 121 122 public String toString() { 123 StringBuffer buffer = new StringBuffer (); 124 125 buffer.append("Title: " + title + "\n" ); 126 127 buffer.append("Outlinks: " + outlinks.length + "\n" ); 128 for (int i = 0; i < outlinks.length; i++) { 129 buffer.append(" outlink: " + outlinks[i] + "\n"); 130 } 131 132 buffer.append("Metadata: " + metadata + "\n" ); 133 134 return buffer.toString(); 135 } 136 137 public static void main(String argv[]) throws Exception { 138 String usage = "ParseData (-local | -ndfs <namenode:port>) recno segment"; 139 140 if (argv.length < 3) { 141 System.out.println("usage:" + usage); 142 return; 143 } 144 145 NutchFileSystem nfs = NutchFileSystem.parseArgs(argv, 0); 146 try { 147 int recno = Integer.parseInt(argv[0]); 148 String segment = argv[1]; 149 150 File file = new File(segment, DIR_NAME); 151 System.out.println("Reading from file: " + file); 152 153 ArrayFile.Reader parses = new ArrayFile.Reader(nfs, file.toString()); 154 155 ParseData parseDatum = new ParseData(); 156 parses.get(recno, parseDatum); 157 158 System.out.println("Retrieved " + recno + " from file " + file); 159 System.out.println(parseDatum); 160 161 parses.close(); 162 } finally { 163 nfs.close(); 164 } 165 } 166 } 167 | Popular Tags |