KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > outerj > daisy > htmlcleaner > HtmlCleaner


1 /*
2  * Copyright 2004 Outerthought bvba and Schaubroeck nv
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */

16 package org.outerj.daisy.htmlcleaner;
17
18 import java.io.OutputStream JavaDoc;
19 import java.io.ByteArrayOutputStream JavaDoc;
20 import org.outerj.daisy.xmlutil.SaxBuffer;
21
22 /**
23  * Performs cleanup of HTML documents to well-formed HTML-as-XML documents.
24  *
25  * <p>More information:
26  * <ul>
27  * <li>To instantiate: see {@link HtmlCleanerFactory} and {@link HtmlCleanerTemplate}
28  * <li>About cleanup procedure: see {@link NekoHtmlParser}, {@link HtmlRepairer}
29  * and {@link StylingHtmlSerializer}.
30  * </ul>
31  */

32 public class HtmlCleaner {
33     private HtmlCleanerTemplate template;
34
35     HtmlCleaner(HtmlCleanerTemplate template) {
36         this.template = template;
37     }
38
39     /**
40      * Parses and cleans up the HTML, writing the result to the given outputstream,
41      * encoded as UTF-8.
42      */

43     public void clean(String JavaDoc somethingWhichLooksLikeHtml, OutputStream JavaDoc outputStream) throws Exception JavaDoc {
44         NekoHtmlParser parser = new NekoHtmlParser();
45         SaxBuffer buffer = parser.parse(GeckoCorruptTagCleaner.clean(somethingWhichLooksLikeHtml));
46
47         StylingHtmlSerializer serializer = new StylingHtmlSerializer(template);
48         serializer.setOutputStream(outputStream);
49         HtmlRepairer repairer = new HtmlRepairer(template);
50
51         repairer.clean(buffer, new MergeCharacterEventsHandler(serializer));
52     }
53
54     public byte[] cleanToByteArray(String JavaDoc somethingWhichLooksLikeHtml) throws Exception JavaDoc {
55         ByteArrayOutputStream JavaDoc os = new ByteArrayOutputStream JavaDoc(10000);
56         clean(somethingWhichLooksLikeHtml, os);
57         return os.toByteArray();
58     }
59
60     public String JavaDoc cleanToString(String JavaDoc somethingWhichLooksLikeHtml) throws Exception JavaDoc {
61         ByteArrayOutputStream JavaDoc os = new ByteArrayOutputStream JavaDoc(10000);
62         clean(somethingWhichLooksLikeHtml, os);
63         return os.toString("UTF-8");
64     }
65 }
66
Popular Tags