import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.util.HashMap;
import java.util.Map;

import javax.swing.text.MutableAttributeSet;
import javax.swing.text.html.HTML;
import javax.swing.text.html.HTMLEditorKit;
import javax.swing.text.html.parser.ParserDelegator;

/**
 * Converts HTML into a plain-text approximation using the Swing HTML parser.
 *
 * <p>Tags are dropped and a few of them are replaced by text markers:
 * <ul>
 * <li>{@code <b>} text is wrapped in {@code *}, {@code <em>} text in {@code _};</li>
 * <li>{@code <h1>}, {@code <h2>} and {@code <h3>} headings are put on their own line and
 *     underlined with {@code =}, {@code -} and {@code .} respectively;</li>
 * <li>{@code <br>} and {@code <p>} produce line breaks, {@code <hr>} a row of underscores;</li>
 * <li>{@code <a href="...">} is rendered as {@code [url]} followed by the link text;</li>
 * <li>{@code <li>} items start on a new line with an {@code o} bullet, indented by one
 *     space per enclosing {@code <ul>}.</li>
 * </ul>
 */
public class HtmlToTextParser {

    private final ParserDelegator parser = new ParserDelegator();

    /**
     * Parses the HTML supplied by {@code input} and returns its plain-text rendering.
     *
     * <p>Every call works on a fresh callback and buffer, so the result of one call never
     * depends on documents parsed earlier with the same instance.
     *
     * @param input source of the HTML text
     * @return the text content of the document with the markers described on the class
     * @throws IOException propagated from the underlying parser
     */
    public String parse(Reader input) throws IOException {
        TagRemovalParserCallback callback = new TagRemovalParserCallback();
        // Third argument is ParserDelegator's ignoreCharSet flag, kept at false as before.
        parser.parse(input, callback, false);
        return callback.getText();
    }

    /**
     * Small demonstration: converts a fixed HTML snippet and prints the result.
     *
     * @param args ignored
     * @throws IOException propagated from {@link #parse(Reader)}
     */
    public static void main(String[] args) throws IOException {
        String htmlText = "<html><head></head><body>" + "<h1>Heading 1</h1>"
                + "<h2>Heading 2</h2>" + "Some <b>bold</b> test and a new<br>"
                + "line in <em>italics</em>" + "<p>A separate paragraph</p>"
                + "separated by a <hr> line, "
                + "a <a HREF='http://www.astrogrid.org'>link</a>, "
                + "and a <h3>third heading</h3> to finish.";

        Reader input = new StringReader(htmlText);
        HtmlToTextParser parser = new HtmlToTextParser();
        String output = parser.parse(input);
        System.out.println(output);
    }

    /**
     * Parser callback that accumulates the plain-text rendering of one document.
     * An instance holds all per-document state and is meant to be used for a single parse.
     */
    private static final class TagRemovalParserCallback extends HTMLEditorKit.ParserCallback {

        /** Width of the rule drawn for an {@code <hr>}. */
        private static final int LINELENGTH = 40;

        private static final char BOLDCHAR = '*';

        private static final char ITALCHAR = '_';

        /** Heading tag to the string repeated underneath it. Raw type: this file uses no generics. */
        private static final Map HEADINGS = new HashMap();

        static {
            HEADINGS.put(HTML.Tag.H1, "=");
            HEADINGS.put(HTML.Tag.H2, "-");
            HEADINGS.put(HTML.Tag.H3, ".");
        }

        private final StringBuffer contentBuffer = new StringBuffer();

        /** Characters of text seen since the current heading started; sets the underline length. */
        private int charCount = 0;

        /** Number of {@code <ul>} elements currently open. */
        private int indentationLevel = 0;

        /** Returns the text accumulated so far. */
        String getText() {
            return contentBuffer.toString();
        }

        public void handleSimpleTag(HTML.Tag tag, MutableAttributeSet attrs, int pos) {
            if (tag == HTML.Tag.BR || tag == HTML.Tag.P) {
                contentBuffer.append("\n");
            }
            if (tag == HTML.Tag.HR) {
                contentBuffer.append("\n");
                for (int i = 0; i < LINELENGTH; ++i) {
                    contentBuffer.append("_");
                }
                contentBuffer.append("\n");
            }
        }

        public void handleStartTag(HTML.Tag tag, MutableAttributeSet attrs, int pos) {
            if (tag == HTML.Tag.B) {
                contentBuffer.append(BOLDCHAR);
            }
            if (tag == HTML.Tag.EM) {
                contentBuffer.append(ITALCHAR);
            }
            if (tag == HTML.Tag.P) {
                contentBuffer.append('\n');
            }
            if (HEADINGS.containsKey(tag)) {
                contentBuffer.append('\n');
                charCount = 0;
            }
            if (tag == HTML.Tag.A) {
                // An anchor may carry no href (e.g. <a name="...">); print a target only
                // when there is one instead of emitting the text "[null]".
                Object link = attrs.getAttribute(HTML.Attribute.HREF);
                if (link != null) {
                    contentBuffer.append("[" + link + "]");
                }
            }
            if (tag == HTML.Tag.LI) {
                contentBuffer.append("\n");
                for (int i = 0; i < indentationLevel; ++i) {
                    contentBuffer.append(" ");
                }
                contentBuffer.append("o ");
            }
            if (tag == HTML.Tag.UL) {
                indentationLevel++;
            }
        }

        public void handleEndTag(HTML.Tag tag, int pos) {
            if (tag == HTML.Tag.B) {
                contentBuffer.append(BOLDCHAR);
            }
            if (tag == HTML.Tag.EM) {
                contentBuffer.append(ITALCHAR);
            }
            if (tag == HTML.Tag.P) {
                contentBuffer.append('\n');
            }
            if (HEADINGS.containsKey(tag)) {
                // Underline the heading with one marker per character of heading text.
                String underline = (String) HEADINGS.get(tag);
                contentBuffer.append('\n');
                for (int i = 0; i < charCount; ++i) {
                    contentBuffer.append(underline);
                }
                charCount = 0;
                contentBuffer.append('\n');
            }
            if (tag == HTML.Tag.UL) {
                // Never drop below zero, so an unmatched </ul> cannot skew later indentation.
                if (indentationLevel > 0) {
                    indentationLevel--;
                }
                contentBuffer.append('\n');
            }
        }

        public void handleText(char[] data, int pos) {
            contentBuffer.append(data);
            charCount += data.length;
        }
    }
}