KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > net > nutch > parse > text > TextParser


1 /* Copyright (c) 2004 The Nutch Organization. All rights reserved. */
2 /* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */
3
4 package net.nutch.parse.text;
5
6 import java.util.Properties;
7
8 import net.nutch.protocol.Content;
9 import net.nutch.parse.*;
10 import net.nutch.util.*;
11
12 public class TextParser implements Parser {
13   public Parse getParse(Content content) throws ParseException {
14     // copy content meta data through
15
Properties JavaDoc metadata = new Properties JavaDoc();
16     metadata.putAll(content.getMetadata());
17
18     ParseData parseData = new ParseData("", new Outlink[0], metadata);
19
20     String JavaDoc encoding =
21       StringUtil.parseCharacterEncoding(content.getContentType());
22     String JavaDoc text;
23     if (encoding != null) { // found an encoding header
24
try { // try to use named encoding
25
text = new String JavaDoc(content.getContent(), encoding);
26       } catch (java.io.UnsupportedEncodingException JavaDoc e) {
27         throw new ParseException(e);
28       }
29     } else {
30       // FIXME: implement charset detector. This code causes problem when
31
// character set isn't specified in HTTP header.
32
text = new String JavaDoc(content.getContent()); // use default encoding
33
}
34
35     return new ParseImpl(text, parseData);
36   }
37 }
38
Popular Tags