KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > htmlparser > visitors > TextExtractingVisitor


// HTMLParser Library $Name: v1_5_20050313 $ - A java-based parser for HTML
// http://sourceforge.org/projects/htmlparser
// Copyright (C) 2004 Somik Raha
//
// Revision Control Information
//
// $Source: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/visitors/TextExtractingVisitor.java,v $
// $Author: derrickoswald $
// $Date: 2004/05/24 16:18:36 $
// $Revision: 1.42 $
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
// Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this library; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
//

27 package org.htmlparser.visitors;
28
29 import org.htmlparser.Text;
30 import org.htmlparser.Tag;
31 import org.htmlparser.util.Translate;
32
33
34 /**
35  * Extracts text from a web page.
36  * Usage:
37  * <code>
38  * Parser parser = new Parser(...);
39  * TextExtractingVisitor visitor = new TextExtractingVisitor();
40  * parser.visitAllNodesWith(visitor);
41  * String textInPage = visitor.getExtractedText();
42  * </code>
43  */

44 public class TextExtractingVisitor extends NodeVisitor {
45     private StringBuffer JavaDoc textAccumulator;
46     private boolean preTagBeingProcessed;
47
48     public TextExtractingVisitor() {
49         textAccumulator = new StringBuffer JavaDoc();
50         preTagBeingProcessed = false;
51     }
52
53     public String JavaDoc getExtractedText() {
54         return textAccumulator.toString();
55     }
56
57     public void visitStringNode(Text stringNode) {
58         String JavaDoc text = stringNode.getText();
59         if (!preTagBeingProcessed) {
60             text = Translate.decode(text);
61             text = replaceNonBreakingSpaceWithOrdinarySpace(text);
62         }
63         textAccumulator.append(text);
64     }
65
66     private String JavaDoc replaceNonBreakingSpaceWithOrdinarySpace(String JavaDoc text) {
67         return text.replace('\u00a0',' ');
68     }
69
70     public void visitTag(Tag tag)
71     {
72         if (isPreTag(tag))
73             preTagBeingProcessed = true;
74     }
75
76     public void visitEndTag(Tag tag)
77     {
78         if (isPreTag(tag))
79             preTagBeingProcessed = false;
80     }
81
82     private boolean isPreTag(Tag tag) {
83         return tag.getTagName().equals("PRE");
84     }
85
86 }
87
Popular Tags