package org.outerj.daisy.summary;

import org.xmlpull.v1.XmlPullParser;
import org.xmlpull.mxp1.MXParser;
import org.outerj.daisy.xmlutil.XmlReader;

import java.io.InputStream;

/**
 * Extracts a short plain-text summary from an HTML document that is
 * well-formed XML, using the character data found inside its
 * <code>body</code> element(s).
 */
public class HtmlSummarizer {

    /** Appended to a summary whose text filled all of the available room. */
    private static final String ELLIPSIS = "...";

    /**
     * Builds a summary of at most <code>summaryLength</code> characters from the
     * text inside the <code>body</code> children of the <code>html</code> root
     * element.
     *
     * <p>Runs of whitespace are collapsed to a single space (also across element
     * boundaries) and the summary carries no leading or trailing whitespace.
     * When the collected text reaches <code>summaryLength - 3</code> characters,
     * collection stops and <code>"..."</code> is appended.
     *
     * <p>Parsing stops at the end tag of the root element (or as soon as the
     * summary is full). This method does not itself close the stream.
     *
     * @param is            the document to summarize
     * @param summaryLength maximum length of the returned summary, including the
     *                      three characters of the ellipsis; must be at least 3
     * @return the summary, or <code>null</code> when the root element is not
     *         <code>html</code> or the body contains no non-whitespace text
     * @throws IllegalArgumentException if <code>summaryLength</code> is smaller than 3
     * @throws Exception                if the parser fails to read or parse the input
     */
    public static String extractSummary(InputStream is, int summaryLength) throws Exception {
        // Without this guard the text budget below becomes negative and the
        // substring call fails with an obscure StringIndexOutOfBoundsException.
        if (summaryLength < ELLIPSIS.length()) {
            throw new IllegalArgumentException("summaryLength must be at least " + ELLIPSIS.length()
                    + " to leave room for the trailing ellipsis, but was " + summaryLength);
        }

        XmlPullParser parser = new MXParser();
        parser.setFeature(XmlPullParser.FEATURE_PROCESS_NAMESPACES, true);
        parser.setInput(new XmlReader(is));

        // Advance to the root element.
        int eventType = parser.next();
        while (eventType != XmlPullParser.START_TAG) {
            if (eventType == XmlPullParser.END_DOCUMENT) {
                return null;
            }
            eventType = parser.next();
        }
        if (!"html".equals(parser.getName())) {
            return null;
        }

        int maxTextLength = summaryLength - ELLIPSIS.length();
        StringBuffer summary = new StringBuffer(summaryLength);

        // Walk the direct children of <html>: harvest text from <body>, skip
        // every other element (e.g. <head>) together with its content.
        eventType = parser.next();
        while (eventType != XmlPullParser.END_TAG && eventType != XmlPullParser.END_DOCUMENT) {
            if (eventType == XmlPullParser.START_TAG) {
                if ("body".equals(parser.getName())) {
                    if (appendElementText(parser, summary, maxTextLength)) {
                        summary.append(ELLIPSIS);
                        return summary.toString();
                    }
                } else {
                    goToEndElement(parser);
                }
            }
            // Both helpers leave the parser on the end tag of the element they
            // handled, so this always moves on to the next sibling event.
            eventType = parser.next();
        }

        // Collapsing leaves at most one trailing space; drop it.
        int length = summary.length();
        if (length > 0 && summary.charAt(length - 1) == ' ') {
            summary.setLength(length - 1);
        }
        return summary.length() > 0 ? summary.toString() : null;
    }

    /**
     * Appends the whitespace-collapsed text of the current element, including
     * the text of all nested elements, to <code>summary</code>.
     *
     * <p>The parser must be positioned on the element's start tag. Unless the
     * summary fills up first, the parser is left on the matching end tag.
     *
     * @param parser        parser positioned on a start tag
     * @param summary       buffer receiving the text
     * @param maxTextLength maximum number of characters <code>summary</code> may hold
     * @return <code>true</code> if <code>summary</code> reached
     *         <code>maxTextLength</code> and collection was cut short
     */
    private static boolean appendElementText(XmlPullParser parser, StringBuffer summary, int maxTextLength)
            throws Exception {
        int depth = 1;
        while (depth > 0) {
            int eventType = parser.next();
            if (eventType == XmlPullParser.START_TAG) {
                depth++;
            } else if (eventType == XmlPullParser.END_TAG) {
                depth--;
            } else if (eventType == XmlPullParser.END_DOCUMENT) {
                // Defensive: do not read past the end of the input.
                return false;
            } else if (eventType == XmlPullParser.TEXT) {
                String text = collapseWhitespace(parser.getText());

                // Whitespace is collapsed per text event; drop a leading space
                // that would start the summary or double up with a space
                // already at the end of it.
                boolean atWordBoundary = summary.length() == 0
                        || summary.charAt(summary.length() - 1) == ' ';
                if (atWordBoundary && text.length() > 0 && text.charAt(0) == ' ') {
                    text = text.substring(1);
                }
                if (text.length() == 0) {
                    continue;
                }

                int interestingChars = Math.min(maxTextLength - summary.length(), text.length());
                summary.append(text.substring(0, interestingChars));
                if (summary.length() == maxTextLength) {
                    return true;
                }
            }
        }
        return false;
    }

    /**
     * Skips the remainder of the current element, including nested elements.
     *
     * <p>The parser must be positioned on the element's start tag; on return it
     * is positioned on the matching end tag. Implemented with a depth counter
     * rather than recursion so that deeply nested input cannot exhaust the
     * call stack.
     */
    private static void goToEndElement(XmlPullParser parser) throws Exception {
        int depth = 1;
        while (depth > 0) {
            int eventType = parser.next();
            if (eventType == XmlPullParser.START_TAG) {
                depth++;
            } else if (eventType == XmlPullParser.END_TAG) {
                depth--;
            } else if (eventType == XmlPullParser.END_DOCUMENT) {
                // Defensive: do not read past the end of the input.
                return;
            }
        }
    }

    /**
     * Replaces every run of whitespace characters (space, tab, line feed,
     * carriage return) in <code>text</code> with a single space.
     *
     * @param text the text to normalize
     * @return the text with whitespace runs collapsed; leading and trailing
     *         runs are reduced to one space, not removed
     */
    private static String collapseWhitespace(String text) {
        StringBuffer buffer = new StringBuffer(text.length());
        boolean lastCharWasWhitespace = false;

        for (int i = 0; i < text.length(); i++) {
            char c = text.charAt(i);
            switch (c) {
                case '\n':
                case '\r':
                case '\t':
                case ' ':
                    if (!lastCharWasWhitespace) {
                        buffer.append(' ');
                        lastCharWasWhitespace = true;
                    }
                    break;
                default:
                    buffer.append(c);
                    lastCharWasWhitespace = false;
            }
        }

        return buffer.toString();
    }

}