1 16 package com.blandware.atleap.common.parsers.html; 17 18 19 import com.blandware.atleap.common.parsers.exception.PlainTextExtractorException; 20 import com.blandware.atleap.common.parsers.SpecificPlainTextExtractor; 21 import com.blandware.atleap.common.Constants; 22 23 import javax.swing.text.ChangedCharSetException ; 24 import java.io.*; 25 import java.util.StringTokenizer ; 26 import java.util.List ; 27 28 47 public class HTMLPlainTextExtractor implements SpecificPlainTextExtractor { 48 49 protected String usedEncoding = null; 50 51 54 public HTMLPlainTextExtractor() { 55 } 56 57 70 public void extract(InputStream input, Writer output, String encoding) 71 throws PlainTextExtractorException { 72 HTMLParser parser = null; 73 Reader reader = null; 74 75 try { 76 if (encoding == null || encoding.trim().length() == 0) { 77 encoding = Constants.DEFAULT_ENCODING; 78 } 79 usedEncoding = encoding; 80 81 RewindableInputStreamWrapper rewindableInputStreamWrapper 82 = new RewindableInputStreamWrapper(input); 83 reader = new InputStreamReader(rewindableInputStreamWrapper, encoding); 84 parser = new PlainTextParser(reader, output, true, 85 rewindableInputStreamWrapper); 86 try { 87 parser.parse(); 90 } catch (ChangedCharSetException e) { 91 String inputEncodingSpec = e.getCharSetSpec(); 93 String inputEncoding = (e.keyEqualsCharSet()) 94 ? inputEncodingSpec 95 : getEncodingFromSpec(inputEncodingSpec); 96 usedEncoding = inputEncoding; 97 reader = new BufferedReader(new InputStreamReader(new PrefilledInputStream(rewindableInputStreamWrapper), 98 inputEncoding)); 99 parser = new PlainTextParser(reader, output, false, null); 100 parser.parse(); 101 } 102 } catch (Exception e) { 103 throw new PlainTextExtractorException(e); 104 } 105 } 106 107 112 private String getEncodingFromSpec(String spec) { 113 String inputEncoding; 114 StringTokenizer tokenizer = new StringTokenizer (spec, " \t;="); 115 int tokenCount = tokenizer.countTokens(); 116 String [] tokens = new String [tokenCount]; 117 for (int i = 0; i < tokenCount; i++) { 118 tokens[i] = tokenizer.nextToken(); 119 } 120 if (tokenCount >= 3 && "charset".equalsIgnoreCase(tokens[1])) { 121 inputEncoding = tokens[2]; 122 } else { 123 inputEncoding = Constants.DEFAULT_ENCODING; 124 } 125 return inputEncoding; 126 } 127 128 138 public List extractInlineResources(InputStream input, String encoding) 139 throws PlainTextExtractorException { 140 try { 141 Reader reader = new InputStreamReader(input, encoding); 142 InlineResourcesParser parser = new InlineResourcesParser(reader); 143 parser.parse(); 144 return parser.getExtractedResources(); 145 } catch (IOException e) { 146 throw new PlainTextExtractorException(e); 147 } catch (ParseException e) { 148 throw new PlainTextExtractorException(e); 149 } 150 } 151 152 161 public List extractAllRefs(InputStream input, String encoding) 162 throws PlainTextExtractorException { 163 try { 164 Reader reader = new InputStreamReader(input, encoding); 165 RefsParser parser = new RefsParser(reader); 166 parser.parse(); 167 return parser.getExtractedRefs(); 168 } catch (IOException e) { 169 throw new PlainTextExtractorException(e); 170 } catch (ParseException e) { 171 throw new PlainTextExtractorException(e); 172 } 173 } 174 175 178 public String getUsedEncoding() { 179 return usedEncoding; 180 } 181 } | Popular Tags |