package org.cyberneko.html.filters;

import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.io.UnsupportedEncodingException;

import org.cyberneko.html.HTMLConfiguration;
import org.cyberneko.html.HTMLElements;
import org.cyberneko.html.HTMLEntities;
import org.cyberneko.html.filters.DefaultFilter;

import org.apache.xerces.xni.Augmentations;
import org.apache.xerces.xni.NamespaceContext;
import org.apache.xerces.xni.QName;
import org.apache.xerces.xni.XMLAttributes;
import org.apache.xerces.xni.XMLLocator;
import org.apache.xerces.xni.XMLResourceIdentifier;
import org.apache.xerces.xni.XMLString;
import org.apache.xerces.xni.XNIException;
import org.apache.xerces.xni.parser.XMLDocumentFilter;
import org.apache.xerces.xni.parser.XMLInputSource;
import org.apache.xerces.xni.parser.XMLParserConfiguration;

/**
 * A document filter that serializes the events it receives as markup on a
 * {@link PrintWriter} and then forwards them to the next handler through
 * the {@link DefaultFilter} superclass.
 * <p>
 * While printing a <code>META</code> element whose <code>http-equiv</code>
 * attribute is <code>content-type</code>, the charset of its
 * <code>content</code> attribute is replaced with the output encoding of
 * this writer. The attribute is put back to its original value before the
 * event is forwarded.
 * <p>
 * The class can also be run from the command line; see
 * {@link #main(String[])}.
 */
public class Writer
    extends DefaultFilter {

    //
    // Constants
    //

    /** Feature identifier enabled on the parser by {@link #main(String[])}. */
    public static final String NOTIFY_CHAR_REFS = "http://apache.org/xml/features/scanner/notify-char-refs";

    /** Feature identifier enabled on the parser by {@link #main(String[])}. */
    public static final String NOTIFY_HTML_BUILTIN_REFS = "http://cyberneko.org/html/features/scanner/notify-builtin-refs";

    /** Augmentations feature identifier (not referenced by this class itself). */
    protected static final String AUGMENTATIONS = "http://cyberneko.org/html/features/augmentations";

    /** Property identifier used by {@link #main(String[])} to install the filter chain. */
    protected static final String FILTERS = "http://cyberneko.org/html/properties/filters";

    //
    // Data
    //

    /** Name of the output encoding; written into the META content-type charset. */
    protected String fEncoding;

    /** Destination of all serialized output. */
    protected PrintWriter fPrinter;

    /** True once the first start or empty element of the document has been seen. */
    protected boolean fSeenRootElement;

    /** True once a META element with http-equiv="content-type" has been printed. */
    protected boolean fSeenHttpEquiv;

    /** Number of elements currently open (start elements minus end elements). */
    protected int fElementDepth;

    /** Whether character data is currently printed with entity replacement. */
    protected boolean fNormalize;

    /** Whether character data is currently printed at all (false inside a general entity). */
    protected boolean fPrintChars;

    //
    // Constructors
    //

    /**
     * Constructs a writer that prints to <code>System.out</code> using the
     * UTF-8 encoding.
     */
    public Writer() {
        try {
            fEncoding = "UTF-8";
            fPrinter = new PrintWriter(new OutputStreamWriter(System.out, fEncoding));
        }
        catch (UnsupportedEncodingException e) {
            // Keep the original message but do not lose the cause.
            throw new RuntimeException(e.getMessage(), e);
        }
    } // <init>()

    /**
     * Constructs a writer that prints to the given byte stream.
     *
     * @param outputStream The stream to write to.
     * @param encoding     The name of the encoding used to convert characters
     *                     to bytes; also recorded as the output encoding.
     *
     * @throws UnsupportedEncodingException Thrown if the encoding name is not
     *                                      supported by the Java runtime.
     */
    public Writer(OutputStream outputStream, String encoding)
        throws UnsupportedEncodingException {
        this(new OutputStreamWriter(outputStream, encoding), encoding);
    } // <init>(OutputStream,String)

    /**
     * Constructs a writer that prints to the given character stream.
     *
     * @param writer   The character stream to write to. It is used directly
     *                 if it already is a {@link PrintWriter}; otherwise it is
     *                 wrapped in one.
     * @param encoding The encoding name recorded as the output encoding. It
     *                 is only used when rewriting the META content-type
     *                 charset; it does not affect how <code>writer</code>
     *                 encodes characters.
     */
    public Writer(java.io.Writer writer, String encoding) {
        fEncoding = encoding;
        if (writer instanceof PrintWriter) {
            fPrinter = (PrintWriter)writer;
        }
        else {
            fPrinter = new PrintWriter(writer);
        }
    } // <init>(java.io.Writer,String)

    //
    // XMLDocumentHandler methods
    //

    /** Resets the per-document state and forwards the event. */
    public void startDocument(XMLLocator locator, String encoding,
                              NamespaceContext nscontext, Augmentations augs)
        throws XNIException {
        fSeenRootElement = false;
        fSeenHttpEquiv = false;
        fElementDepth = 0;
        fNormalize = true;
        fPrintChars = true;
        super.startDocument(locator, encoding, nscontext, augs);
    } // startDocument(XMLLocator,String,NamespaceContext,Augmentations)

    /** Variant without a namespace context; delegates with a null context. */
    public void startDocument(XMLLocator locator, String encoding, Augmentations augs)
        throws XNIException {
        startDocument(locator, encoding, null, augs);
    } // startDocument(XMLLocator,String,Augmentations)

    /**
     * Prints a comment. A line break is printed before a comment that follows
     * the root element and after a comment that precedes it. The event is
     * not forwarded to the superclass.
     */
    public void comment(XMLString text, Augmentations augs)
        throws XNIException {
        if (fSeenRootElement && fElementDepth <= 0) {
            fPrinter.println();
        }
        fPrinter.print("<!--");
        printCharacters(text, false);
        fPrinter.print("-->");
        if (!fSeenRootElement) {
            fPrinter.println();
        }
        fPrinter.flush();
    } // comment(XMLString,Augmentations)

    /** Prints the start tag, then forwards the event. */
    public void startElement(QName element, XMLAttributes attributes, Augmentations augs)
        throws XNIException {
        fSeenRootElement = true;
        fElementDepth++;
        // Content of "special" elements is printed verbatim, without entity replacement.
        fNormalize = !HTMLElements.getElement(element.rawname).isSpecial();
        printStartElement(element, attributes);
        super.startElement(element, attributes, augs);
    } // startElement(QName,XMLAttributes,Augmentations)

    /** Prints the tag of an empty element, then forwards the event. */
    public void emptyElement(QName element, XMLAttributes attributes, Augmentations augs)
        throws XNIException {
        fSeenRootElement = true;
        printStartElement(element, attributes);
        super.emptyElement(element, attributes, augs);
    } // emptyElement(QName,XMLAttributes,Augmentations)

    /** Prints character data unless it is suppressed, then forwards the event. */
    public void characters(XMLString text, Augmentations augs)
        throws XNIException {
        if (fPrintChars) {
            printCharacters(text, fNormalize);
        }
        super.characters(text, augs);
    } // characters(XMLString,Augmentations)

    /** Prints the end tag, then forwards the event. */
    public void endElement(QName element, Augmentations augs)
        throws XNIException {
        fElementDepth--;
        fNormalize = true;
        printEndElement(element);
        super.endElement(element, augs);
    } // endElement(QName,Augmentations)

    /**
     * Prints an entity reference and suppresses character output until the
     * matching {@link #endGeneralEntity} so the replacement text is not
     * printed as well. A numeric character reference is replaced with the
     * entity name returned by {@link HTMLEntities#get(int)} when there is
     * one; the (possibly replaced) name is what gets forwarded.
     */
    public void startGeneralEntity(String name, XMLResourceIdentifier id, String encoding, Augmentations augs)
        throws XNIException {
        fPrintChars = false;
        if (name.startsWith("#")) {
            try {
                boolean hex = name.startsWith("#x");
                int offset = hex ? 2 : 1;
                int base = hex ? 16 : 10;
                int value = Integer.parseInt(name.substring(offset), base);
                String entity = HTMLEntities.get(value);
                if (entity != null) {
                    name = entity;
                }
            }
            catch (NumberFormatException e) {
                // Deliberately ignored: a malformed numeric reference is
                // printed back under the name it arrived with.
            }
        }
        printEntity(name);
        super.startGeneralEntity(name, id, encoding, augs);
    } // startGeneralEntity(String,XMLResourceIdentifier,String,Augmentations)

    /** Re-enables character output and forwards the event. */
    public void endGeneralEntity(String name, Augmentations augs)
        throws XNIException {
        fPrintChars = true;
        super.endGeneralEntity(name, augs);
    } // endGeneralEntity(String,Augmentations)

    //
    // Protected methods
    //

    /**
     * Prints an attribute value, replacing each double quote with
     * <code>&amp;quot;</code> so the value can be enclosed in double quotes.
     * No other character is escaped.
     *
     * @param text The attribute value to print.
     */
    protected void printAttributeValue(String text) {
        int length = text.length();
        for (int j = 0; j < length; j++) {
            char c = text.charAt(j);
            if (c == '"') {
                fPrinter.print("&quot;");
            }
            else {
                fPrinter.print(c);
            }
        }
        fPrinter.flush();
    } // printAttributeValue(String)

    /**
     * Prints character data.
     *
     * @param text      The characters to print.
     * @param normalize If true, each character that
     *                  {@link HTMLEntities#get(int)} knows a name for is
     *                  printed as an entity reference and each newline is
     *                  printed with <code>println()</code>; if false the
     *                  characters are printed unchanged.
     */
    protected void printCharacters(XMLString text, boolean normalize) {
        if (normalize) {
            for (int i = 0; i < text.length; i++) {
                char c = text.ch[text.offset + i];
                if (c != '\n') {
                    String entity = HTMLEntities.get(c);
                    if (entity != null) {
                        printEntity(entity);
                    }
                    else {
                        fPrinter.print(c);
                    }
                }
                else {
                    fPrinter.println();
                }
            }
        }
        else {
            for (int i = 0; i < text.length; i++) {
                char c = text.ch[text.offset + i];
                fPrinter.print(c);
            }
        }
        fPrinter.flush();
    } // printCharacters(XMLString,boolean)

    /**
     * Prints a start tag with its attributes.
     * <p>
     * For a <code>META</code> element with
     * <code>http-equiv="content-type"</code> and a <code>content</code>
     * attribute, the content value is lowercased and its charset is replaced
     * with (or, when absent, extended by) the output encoding for the
     * duration of the printing. The original value is restored afterwards.
     *
     * @param element    The element name.
     * @param attributes The element attributes; may be null.
     */
    protected void printStartElement(QName element, XMLAttributes attributes) {

        // modify META[@http-equiv='content-type']/@content value
        int contentIndex = -1;
        String originalContent = null;
        boolean contentRewritten = false;
        // equalsIgnoreCase is locale-independent, unlike toLowerCase(), which
        // maps 'I' to a dotless i under a Turkish default locale.
        if (attributes != null && "meta".equalsIgnoreCase(element.rawname)) {
            String httpEquiv = null;
            int length = attributes.getLength();
            for (int i = 0; i < length; i++) {
                String aname = attributes.getQName(i);
                if ("http-equiv".equalsIgnoreCase(aname)) {
                    httpEquiv = attributes.getValue(i);
                }
                else if ("content".equalsIgnoreCase(aname)) {
                    contentIndex = i;
                }
            }
            if ("content-type".equalsIgnoreCase(httpEquiv)) {
                fSeenHttpEquiv = true;
                if (contentIndex != -1) {
                    originalContent = attributes.getValue(contentIndex);
                }
                if (originalContent != null) {
                    String content = originalContent.toLowerCase(java.util.Locale.ENGLISH);
                    int charsetIndex = content.indexOf("charset=");
                    if (charsetIndex != -1) {
                        // keep everything up to and including "charset="
                        content = content.substring(0, charsetIndex + 8);
                    }
                    else {
                        content += ";charset=";
                    }
                    content += fEncoding;
                    attributes.setValue(contentIndex, content);
                    contentRewritten = true;
                }
            }
        }

        // print element
        fPrinter.print('<');
        fPrinter.print(element.rawname);
        int attrCount = attributes != null ? attributes.getLength() : 0;
        for (int i = 0; i < attrCount; i++) {
            String aname = attributes.getQName(i);
            String avalue = attributes.getValue(i);
            fPrinter.print(' ');
            fPrinter.print(aname);
            fPrinter.print("=\"");
            printAttributeValue(avalue);
            fPrinter.print('"');
        }
        fPrinter.print('>');
        fPrinter.flush();

        // Return the original META[@http-equiv]/@content value, but only if
        // it was actually replaced above; an untouched content attribute
        // (e.g. on <meta name="description" ...>) must not be overwritten.
        if (contentRewritten) {
            attributes.setValue(contentIndex, originalContent);
        }

    } // printStartElement(QName,XMLAttributes)

    /**
     * Prints an end tag.
     *
     * @param element The element name.
     */
    protected void printEndElement(QName element) {
        fPrinter.print("</");
        fPrinter.print(element.rawname);
        fPrinter.print('>');
        fPrinter.flush();
    } // printEndElement(QName)

    /**
     * Prints an entity reference of the form <code>&amp;name;</code>.
     *
     * @param name The entity name, without the ampersand and semicolon.
     */
    protected void printEntity(String name) {
        fPrinter.print('&');
        fPrinter.print(name);
        fPrinter.print(';');
        fPrinter.flush();
    } // printEntity(String)

    //
    // MAIN
    //

    /**
     * Command line entry point. Arguments are processed left to right: an
     * option affects the files that follow it, and every non-option argument
     * is parsed as an input file and written to <code>System.out</code>.
     *
     * @param argv The command line arguments; see {@link #printUsage()}.
     *
     * @throws Exception Propagated from parser configuration or parsing.
     */
    public static void main(String[] argv) throws Exception {
        if (argv.length == 0) {
            printUsage();
            System.exit(1);
        }
        XMLParserConfiguration parser = new HTMLConfiguration();
        parser.setFeature(NOTIFY_CHAR_REFS, true);
        parser.setFeature(NOTIFY_HTML_BUILTIN_REFS, true);
        String iencoding = null;
        String oencoding = "Windows-1252";
        boolean identity = false;
        boolean purify = false;
        for (int i = 0; i < argv.length; i++) {
            String arg = argv[i];
            if (arg.equals("-ie")) {
                if (++i >= argv.length) {
                    missingOptionValue(arg);
                }
                iencoding = argv[i];
                continue;
            }
            if (arg.equals("-e") || arg.equals("-oe")) {
                if (++i >= argv.length) {
                    missingOptionValue(arg);
                }
                oencoding = argv[i];
                continue;
            }
            if (arg.equals("-i")) {
                identity = true;
                continue;
            }
            if (arg.equals("-p")) {
                purify = true;
                continue;
            }
            if (arg.equals("-h")) {
                printUsage();
                System.exit(1);
            }
            // Anything else is an input file: build the filter chain from the
            // options seen so far and parse it.
            java.util.Vector filtersVector = new java.util.Vector(2);
            if (identity) {
                filtersVector.addElement(new Identity());
            }
            else if (purify) {
                filtersVector.addElement(new Purifier());
            }
            filtersVector.addElement(new Writer(System.out, oencoding));
            XMLDocumentFilter[] filters =
                new XMLDocumentFilter[filtersVector.size()];
            filtersVector.copyInto(filters);
            parser.setProperty(FILTERS, filters);
            XMLInputSource source = new XMLInputSource(null, arg, null);
            source.setEncoding(iencoding);
            parser.parse(source);
        }
    } // main(String[])

    /** Reports an option that is missing its value, prints the usage and exits. */
    private static void missingOptionValue(String option) {
        System.err.println("error: Missing argument to option "+option+".");
        printUsage();
        System.exit(1);
    } // missingOptionValue(String)

    /** Prints the command line usage to <code>System.err</code>. */
    private static void printUsage() {
        System.err.println("usage: java "+Writer.class.getName()+" (options) file ...");
        System.err.println();
        System.err.println("options:");
        System.err.println(" -ie name Specify IANA name of input encoding.");
        System.err.println(" -oe name Specify IANA name of output encoding.");
        System.err.println(" -i Perform identity transform.");
        System.err.println(" -p Purify output to ensure XML well-formedness.");
        System.err.println(" -h Display help screen.");
        System.err.println();
        System.err.println("notes:");
        System.err.println(" The -i and -p options are mutually exclusive.");
        System.err.println(" The -e option has been replaced with -oe.");
    } // printUsage()

} // class Writer