package org.opencms.search.extractors;

import org.opencms.util.CmsFileUtil;
import org.opencms.util.CmsStringUtil;

import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;

/**
 * Base class for text extractors that provides the convenience overloads of
 * <code>extractText</code> and a helper for cleaning up extracted text.<p>
 *
 * <b>Important:</b> in this class {@link #extractText(byte[], String)} and
 * {@link #extractText(InputStream, String)} delegate to each other. A concrete
 * subclass therefore has to override at least one of these two methods,
 * otherwise a call to any <code>extractText</code> overload recurses without end.<p>
 */
public abstract class A_CmsTextExtractor implements I_CmsTextExtractor {

    /**
     * The raw input most recently handed to {@link #extractText(byte[], String)} or
     * read by {@link #getStreamCopy(InputStream)}; <code>null</code> until one of
     * these has been called.
     */
    protected byte[] m_inputBuffer;

    /**
     * Extracts the text from the given byte array without specifying an encoding.<p>
     *
     * Delegates to {@link #extractText(byte[], String)} with a <code>null</code> encoding.<p>
     *
     * @param content the raw content to extract the text from
     *
     * @return the extraction result
     *
     * @throws Exception if the extraction fails
     */
    public I_CmsExtractionResult extractText(byte[] content) throws Exception {

        return extractText(content, null);
    }

    /**
     * Extracts the text from the given byte array.<p>
     *
     * The array is remembered in {@link #m_inputBuffer} and then wrapped in a stream
     * that is passed to {@link #extractText(InputStream, String)}.<p>
     *
     * @param content the raw content to extract the text from
     * @param encoding the encoding to use, may be <code>null</code>
     *
     * @return the extraction result
     *
     * @throws Exception if the extraction fails
     */
    public I_CmsExtractionResult extractText(byte[] content, String encoding) throws Exception {

        m_inputBuffer = content;
        return extractText(new ByteArrayInputStream(content), encoding);
    }

    /**
     * Extracts the text from the given stream without specifying an encoding.<p>
     *
     * Delegates to {@link #extractText(InputStream, String)} with a <code>null</code> encoding.<p>
     *
     * @param in the stream to extract the text from
     *
     * @return the extraction result
     *
     * @throws Exception if the extraction fails
     */
    public I_CmsExtractionResult extractText(InputStream in) throws Exception {

        return extractText(in, null);
    }

    /**
     * Extracts the text from the given stream.<p>
     *
     * The stream content is read into a byte array using <code>CmsFileUtil.readFully</code>
     * and passed on to {@link #extractText(byte[], String)}.<p>
     *
     * @param in the stream to extract the text from
     * @param encoding the encoding to use, may be <code>null</code>
     *
     * @return the extraction result
     *
     * @throws Exception if the extraction fails
     */
    public I_CmsExtractionResult extractText(InputStream in, String encoding) throws Exception {

        byte[] text = CmsFileUtil.readFully(in);
        return extractText(text, encoding);
    }

    /**
     * Returns a new stream over the buffered input of this extractor.<p>
     *
     * If {@link #m_inputBuffer} is already set, the given stream is <b>not</b> read and
     * a stream over the existing buffer is returned. Otherwise the given stream is read
     * using <code>CmsFileUtil.readFully</code>, the result is stored in the buffer, and a
     * stream over that buffer is returned.<p>
     *
     * NOTE(review): because the buffer is never reset in this class, reusing one extractor
     * instance for different inputs may return data of an earlier input - verify against callers.<p>
     *
     * @param in the stream to copy, only read if no input has been buffered yet
     *
     * @return a new stream over the buffered input
     *
     * @throws IOException in case reading the given stream fails
     */
    public InputStream getStreamCopy(InputStream in) throws IOException {

        if (m_inputBuffer == null) {
            m_inputBuffer = CmsFileUtil.readFully(in);
        }
        return new ByteArrayInputStream(m_inputBuffer);
    }

    /**
     * Removes unwanted characters from the given content.<p>
     *
     * Letters (except letter numbers), decimal digits, punctuation, currency symbols and
     * space separators are kept. A line separator is replaced by a line break. Every run of
     * other characters (controls, marks, math and other symbols, format characters, private
     * use, unassigned, unpaired surrogates, ...) is collapsed into a single line break; no
     * additional line break is added directly after a line separator.<p>
     *
     * The content is processed by Unicode code point, so characters outside the Basic
     * Multilingual Plane are classified by their real category instead of being discarded
     * as surrogates.<p>
     *
     * @param content the content to clean up
     *
     * @return the cleaned up content, or the empty String if
     *      <code>CmsStringUtil.isEmptyOrWhitespaceOnly</code> reports the content as empty
     */
    protected String removeControlChars(String content) {

        if (CmsStringUtil.isEmptyOrWhitespaceOnly(content)) {
            return "";
        }

        int length = content.length();
        StringBuilder result = new StringBuilder(length);
        // true while the last thing appended was a line break for unwanted input
        boolean wasUnwanted = false;
        int pos = 0;
        while (pos < length) {
            // read a full code point so that surrogate pairs are evaluated as one character
            int codePoint = content.codePointAt(pos);
            pos += Character.charCount(codePoint);

            switch (Character.getType(codePoint)) {

                // punctuation and currency
                case Character.CURRENCY_SYMBOL:
                case Character.CONNECTOR_PUNCTUATION:
                case Character.FINAL_QUOTE_PUNCTUATION:
                case Character.INITIAL_QUOTE_PUNCTUATION:
                case Character.DASH_PUNCTUATION:
                case Character.START_PUNCTUATION:
                case Character.END_PUNCTUATION:
                case Character.OTHER_PUNCTUATION:
                // letters
                case Character.OTHER_LETTER:
                case Character.MODIFIER_LETTER:
                case Character.UPPERCASE_LETTER:
                case Character.TITLECASE_LETTER:
                case Character.LOWERCASE_LETTER:
                // digits
                case Character.DECIMAL_DIGIT_NUMBER:
                // spaces
                case Character.SPACE_SEPARATOR:
                    result.appendCodePoint(codePoint);
                    wasUnwanted = false;
                    break;

                // line separators
                case Character.LINE_SEPARATOR:
                    result.append('\n');
                    wasUnwanted = true;
                    break;

                // everything else is unwanted: MATH_SYMBOL, OTHER_SYMBOL, CONTROL,
                // COMBINING_SPACING_MARK, ENCLOSING_MARK, FORMAT, LETTER_NUMBER,
                // MODIFIER_SYMBOL, NON_SPACING_MARK, PARAGRAPH_SEPARATOR, PRIVATE_USE,
                // SURROGATE (unpaired), UNASSIGNED, OTHER_NUMBER
                default:
                    if (!wasUnwanted) {
                        result.append('\n');
                        wasUnwanted = true;
                    }
                    break;
            }
        }

        return result.toString();
    }
}