KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > uk > ac > roe > antigen > utils > HtmlToTextParser


1 /*
2  * Created on 07-Feb-2005
3  */

4 package uk.ac.roe.antigen.utils;
5
6 import java.io.IOException JavaDoc;
7 import java.io.Reader JavaDoc;
8 import java.io.StringReader JavaDoc;
9 import java.util.HashMap JavaDoc;
10 import java.util.Map JavaDoc;
11
12 import javax.swing.text.MutableAttributeSet JavaDoc;
13 import javax.swing.text.html.HTML JavaDoc;
14 import javax.swing.text.html.HTMLEditorKit JavaDoc;
15 import javax.swing.text.html.parser.ParserDelegator JavaDoc;
16
/**
 * Converts HTML markup into a plain-text approximation.
 *
 * <p>Tags are dropped and a handful of them are rendered with simple text
 * conventions: bold text is wrapped in {@code *}, emphasised text in
 * {@code _}, H1-H3 headings are underlined, {@code <hr>} becomes a row of
 * underscores, list items become indented {@code o} bullets and the target
 * of a link is printed in square brackets in front of the link text.
 *
 * <p>Every call to {@link #parse(Reader)} uses its own callback and output
 * buffer, so no formatting state is carried over from one call to the next.
 */
public class HtmlToTextParser {

    /** Swing HTML parser front end, reused for every call to {@link #parse(Reader)}. */
    private final ParserDelegator parser = new ParserDelegator();

    /**
     * Parses the HTML supplied by {@code input} and returns its plain-text
     * rendering.
     *
     * @param input
     *            source of the HTML characters; must not be {@code null}
     * @return the text content of the document with the simple formatting
     *         described on this class applied
     * @throws IOException
     *             if reading from {@code input} fails
     * @throws NullPointerException
     *             if {@code input} is {@code null}
     */
    public String parse(Reader input) throws IOException {
        if (input == null) {
            throw new NullPointerException("input must not be null");
        }
        // A fresh callback per call: indentation and heading state can never
        // leak from an earlier (possibly aborted) parse into this one.
        TagRemovalParserCallback callback = new TagRemovalParserCallback();
        // The input is already decoded characters, so a charset declared in a
        // <meta> tag is irrelevant. With ignoreCharSet == false the parser
        // aborts such documents with a ChangedCharSetException.
        parser.parse(input, callback, true);
        return callback.getText();
    }

    /**
     * Simple test: converts a fixed HTML snippet and prints the result to
     * standard output.
     *
     * @param args
     *            ignored
     * @throws IOException
     *             if parsing the snippet fails
     */
    public static void main(String[] args) throws IOException {
        String htmlText = "<html><head></head><body>" + "<h1>Heading 1</h1>"
                + "<h2>Heading 2</h2>" + "Some <b>bold</b> test and a new<br>"
                + "line in <em>italics</em>" + "<p>A separate paragraph</p>"
                + "separated by a <hr> line, "
                + "a <a HREF='http://www.astrogrid.org'>link</a>, "
                + "and a <h3>third heading</h3> to finish.";

        Reader input = new StringReader(htmlText);
        HtmlToTextParser parser = new HtmlToTextParser();
        String output = parser.parse(input);
        System.out.println(output);
    }

    /**
     * Parser callback that accumulates the plain-text rendering of the
     * document in its own buffer.
     */
    private static final class TagRemovalParserCallback extends HTMLEditorKit.ParserCallback {

        /** Number of underscores used to draw a horizontal rule. */
        private static final int LINELENGTH = 40;

        /** Marker written before and after bold text. */
        private static final char BOLDCHAR = '*';

        /** Marker written before and after emphasised text. */
        private static final char ITALCHAR = '_';

        /** Value of {@link #headingStart} while no heading is open. */
        private static final int NO_HEADING = -1;

        /** Underline string, repeated once per heading character, for each supported heading tag. */
        private static final Map<HTML.Tag, String> HEADING_UNDERLINES = new HashMap<HTML.Tag, String>();

        static {
            HEADING_UNDERLINES.put(HTML.Tag.H1, "=");
            HEADING_UNDERLINES.put(HTML.Tag.H2, "-");
            HEADING_UNDERLINES.put(HTML.Tag.H3, ".");
        }

        /** Plain-text output collected so far. */
        private final StringBuilder text = new StringBuilder();

        /** Offset in {@link #text} where the currently open heading begins, or {@link #NO_HEADING}. */
        private int headingStart = NO_HEADING;

        /** Current depth of nested {@code <ul>} elements. */
        private int indentationLevel = 0;

        /**
         * @return everything written to the output buffer so far
         */
        String getText() {
            return text.toString();
        }

        /**
         * Renders tags that have no content: line and paragraph breaks
         * become a newline, a horizontal rule becomes a line of
         * {@link #LINELENGTH} underscores on its own line.
         */
        @Override
        public void handleSimpleTag(HTML.Tag tag, MutableAttributeSet attrs, int pos) {
            if (tag == HTML.Tag.BR || tag == HTML.Tag.P) {
                text.append('\n');
            }
            if (tag == HTML.Tag.HR) {
                text.append('\n');
                for (int i = 0; i < LINELENGTH; ++i) {
                    text.append('_');
                }
                text.append('\n');
            }
        }

        /**
         * Writes the opening decoration for the tags this converter knows
         * about; all other tags are silently dropped.
         */
        @Override
        public void handleStartTag(HTML.Tag tag, MutableAttributeSet attrs, int pos) {
            if (tag == HTML.Tag.B) {
                text.append(BOLDCHAR);
            }
            if (tag == HTML.Tag.EM) {
                text.append(ITALCHAR);
            }
            if (tag == HTML.Tag.P) {
                text.append('\n');
            }
            if (HEADING_UNDERLINES.containsKey(tag)) {
                text.append('\n');
                // Remember where the heading text starts so the underline
                // can be sized from what was actually written.
                headingStart = text.length();
            }
            if (tag == HTML.Tag.A) {
                // Anchors without a target (e.g. <a name="...">) carry no
                // HREF; print nothing rather than the string "[null]".
                Object href = attrs == null ? null : attrs.getAttribute(HTML.Attribute.HREF);
                if (href != null) {
                    text.append('[').append(href).append(']');
                }
            }
            if (tag == HTML.Tag.LI) {
                text.append('\n');
                for (int i = 0; i < indentationLevel; ++i) {
                    text.append(' ');
                }
                text.append("o ");
            }
            if (tag == HTML.Tag.UL) {
                indentationLevel++;
            }
        }

        /**
         * Writes the closing decoration for the tags this converter knows
         * about. A heading is finished with an underline as wide as the
         * heading text written to the output, including any bold or
         * emphasis markers.
         */
        @Override
        public void handleEndTag(HTML.Tag tag, int pos) {
            if (tag == HTML.Tag.B) {
                text.append(BOLDCHAR);
            }
            if (tag == HTML.Tag.EM) {
                text.append(ITALCHAR);
            }
            if (tag == HTML.Tag.P) {
                text.append('\n');
            }
            String underline = HEADING_UNDERLINES.get(tag);
            if (underline != null) {
                int width = headingStart == NO_HEADING ? 0 : text.length() - headingStart;
                text.append('\n');
                for (int i = 0; i < width; ++i) {
                    text.append(underline);
                }
                headingStart = NO_HEADING;
                text.append('\n');
            }
            if (tag == HTML.Tag.UL) {
                // Never let an unexpected extra end tag push the depth below zero.
                if (indentationLevel > 0) {
                    indentationLevel--;
                }
                text.append('\n');
            }
        }

        /**
         * Copies document text to the output unchanged.
         */
        @Override
        public void handleText(char[] data, int pos) {
            text.append(data);
        }

    }

}
Popular Tags