KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > outerj > daisy > summary > HtmlSummarizer


1 /*
2  * Copyright 2004 Outerthought bvba and Schaubroeck nv
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */

16 package org.outerj.daisy.summary;
17
18 import org.xmlpull.v1.XmlPullParser;
19 import org.xmlpull.mxp1.MXParser;
20 import org.outerj.daisy.xmlutil.XmlReader;
21
22 import java.io.InputStream JavaDoc;
23
24 public class HtmlSummarizer {
25     /**
26      *
27      * @param is an inputstream from which XML-well-formed HTML markup can be read.
28      */

29     public static String JavaDoc extractSummary(InputStream JavaDoc is, int summaryLength) throws Exception JavaDoc {
30         XmlPullParser parser = new MXParser();
31         parser.setFeature(XmlPullParser.FEATURE_PROCESS_NAMESPACES, true);
32         parser.setInput(new XmlReader(is));
33         int eventType = parser.getEventType();
34         StringBuffer JavaDoc summary = new StringBuffer JavaDoc(summaryLength);
35
36         while (eventType != XmlPullParser.END_DOCUMENT)
37         {
38             eventType = parser.next();
39             if (eventType == XmlPullParser.START_TAG)
40             {
41                 if (!parser.getName().equals("html"))
42                     return null;
43
44                 // run over the children of the html element
45
while (eventType != XmlPullParser.END_TAG)
46                 {
47                     eventType = parser.next();
48                     if (eventType == XmlPullParser.START_TAG)
49                     {
50                         if (parser.getName().equals("body")) {
51                             eventType = parser.next();
52                             int bodyElementNestingCount = 0;
53                             while (bodyElementNestingCount >= 0)
54                             {
55                                 if (eventType == XmlPullParser.START_TAG)
56                                     bodyElementNestingCount++;
57                                 else if (eventType == XmlPullParser.END_TAG)
58                                     bodyElementNestingCount--;
59
60                                 if (eventType == XmlPullParser.TEXT) {
61                                     String JavaDoc text = collapseWhitespace(parser.getText());
62                                     int interestingChars = Math.min(summaryLength - 3 - summary.length(), text.length());
63                                     summary.append(text.substring(0, interestingChars));
64                                     if (summary.length() == summaryLength - 3) {
65                                         summary.append("...");
66                                         return summary.toString();
67                                     }
68                                 }
69                                 eventType = parser.next();
70                             }
71                         } else {
72                             goToEndElement(parser);
73                         }
74                     }
75                 }
76             }
77         }
78
79         if (summary.length() > 0) {
80             return summary.toString();
81         } else {
82             return null;
83         }
84     }
85
86
87     private static void goToEndElement(XmlPullParser parser) throws Exception JavaDoc
88     {
89         // TODO rewrite this non-recursive
90
int eventType = parser.next();
91         while (eventType != XmlPullParser.END_TAG)
92         {
93             if (eventType == XmlPullParser.START_TAG)
94                 goToEndElement(parser);
95             eventType = parser.next();
96         }
97     }
98
99     private static String JavaDoc collapseWhitespace(String JavaDoc text) {
100         StringBuffer JavaDoc buffer = new StringBuffer JavaDoc(text.length());
101         boolean lastCharWasWhitespace = false;
102
103         for (int i = 0; i < text.length(); i++) {
104             char c = text.charAt(i);
105             switch (c) {
106                 case '\n':
107                 case '\r':
108                 case ' ':
109                     if (!lastCharWasWhitespace) {
110                         buffer.append(' ');
111                         lastCharWasWhitespace = true;
112                     }
113                     break;
114                 default:
115                     buffer.append(c);
116                     lastCharWasWhitespace = false;
117             }
118         }
119
120         return buffer.toString();
121     }
122
123 }
124
Popular Tags