KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > com > blandware > atleap > common > parsers > html > PlainTextParser


/*
 * Copyright 2005 Blandware (http://www.blandware.com)
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.blandware.atleap.common.parsers.html;

import com.blandware.atleap.common.Constants;

import java.io.IOException;
import java.io.InputStream;
import java.io.Reader;
import java.io.Writer;
import java.util.HashMap;
25
26
27 /**
28  * An HTML parser that extracts a plain text.
29  *
30  * @author Roman Puchkovskiy <a HREF="mailto:roman.puchkovskiy@blandware.com">
31  * &lt;roman.puchkovskiy@blandware.com&gt;</a>
32  * @version $Revision: 1.2 $ $Date: 2005/08/02 14:53:29 $
33  */

34 class PlainTextParser extends HTMLParser {
35
36     /**
37      * Constructs the PlainTextParser.
38      *
39      * @param reader the <code>Reader</code> that will supply an HTML to parse
40      * @param writer the <code>Writer</code> that will accept the extracted text
41      * @param lookingForEncoding if we want to find out the document encoding
42      * ourselves (from &lt;meta&gt; tags, for example)
43      * @param stream the <code>RewindableInputStreamWrapper</code> that supplies
44      * data for <code>reader</code>. It's needed only if
45      * <code>lookingForEncoding</code> is <code>true</code> to stop remembering
46      * bytes when it's clear that no encoding will be found.
47      */

48     public PlainTextParser(Reader JavaDoc reader, Writer JavaDoc writer, boolean lookingForEncoding,
49                                InputStream JavaDoc stream) {
50         super(reader, writer, lookingForEncoding, stream);
51     }
52
53     protected void addText(String JavaDoc text) throws IOException JavaDoc {
54         if (!inStyle && !inScript) {
55             if (lookingForEncoding) {
56                 resultedChars.write(text);
57             } else {
58                 output.write(text);
59             }
60         }
61     }
62
63     protected void addSpace() throws IOException JavaDoc {
64         if (!inStyle && !inScript) {
65             String JavaDoc space = afterTag ? Constants.EOL : " ";
66             if (lookingForEncoding) {
67                 resultedChars.write(space);
68             } else {
69                 output.write(space);
70             }
71         }
72     }
73
74     protected void processTag(String JavaDoc tagName, HashMap JavaDoc attributes,
75                               boolean closing) throws IOException JavaDoc {
76         checkoutAttribute(attributes, "title");
77         if (tagName.equalsIgnoreCase("<script")) {
78             inScript = true;
79         } else if (tagName.equalsIgnoreCase("</script")) {
80             inScript = false;
81         } /*else if (tagName.equalsIgnoreCase("<a")) {
82             checkoutAttribute(attributes, "title");
83         } */
else if (tagName.equalsIgnoreCase("<img")) {
84             checkoutAttribute(attributes, "alt");
85         } else if (tagName.equalsIgnoreCase("<meta")) {
86             checkoutNameContentPair(attributes, "keywords");
87             checkoutNameContentPair(attributes, "description");
88             checkoutNameContentPair(attributes, "copyright");
89             checkoutNameContentPair(attributes, "publisher");
90             checkoutNameContentPair(attributes, "author");
91             if (lookingForEncoding) {
92                 checkoutEncodingChange(attributes);
93             }
94         } else if (tagName.equalsIgnoreCase("</head")
95                    || tagName.equalsIgnoreCase("<body")) {
96             if (!headFinished) {
97                 // No valid METAs expected more, so the encoding won't change
98
lookingForEncoding = false;
99                 headFinished = true;
100                 if (initialStream instanceof RewindableInputStreamWrapper) {
101                     RewindableInputStreamWrapper stream = (RewindableInputStreamWrapper) initialStream;
102                     stream.stopRemembering();
103                     output.write(resultedChars.toCharArray());
104                     resultedChars = null;
105                 }
106             }
107         }
108     }
109
110     /**
111      * Considers text -- adds it
112      *
113      * @param text the text ro be considered
114      */

115     protected void considerText(String JavaDoc text) throws IOException JavaDoc {
116         addText(text);
117     }
118
119     /**
120      * Considers space -- adds it
121      *
122      * @throws java.io.IOException
123      */

124     protected void considerSpace() throws IOException JavaDoc {
125         addSpace();
126     }
127 }
128
Popular Tags