KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > w3c > tidy > Tidy


1 /*
2  * @(#)Tidy.java 1.11 2000/08/16
3  *
4  */

5
6 /*
7   HTML parser and pretty printer
8
9   Copyright (c) 1998-2000 World Wide Web Consortium (Massachusetts
10   Institute of Technology, Institut National de Recherche en
11   Informatique et en Automatique, Keio University). All Rights
12   Reserved.
13
14   Contributing Author(s):
15
16      Dave Raggett <dsr@w3.org>
17      Andy Quick <ac.quick@sympatico.ca> (translation to Java)
18
19   The contributing author(s) would like to thank all those who
20   helped with testing, bug fixes, and patience. This wouldn't
21   have been possible without all of you.
22
23   COPYRIGHT NOTICE:
24  
25   This software and documentation is provided "as is," and
26   the copyright holders and contributing author(s) make no
27   representations or warranties, express or implied, including
28   but not limited to, warranties of merchantability or fitness
29   for any particular purpose or that the use of the software or
30   documentation will not infringe any third party patents,
31   copyrights, trademarks or other rights.
32
33   The copyright holders and contributing author(s) will not be
34   liable for any direct, indirect, special or consequential damages
35   arising out of any use of the software or documentation, even if
36   advised of the possibility of such damage.
37
38   Permission is hereby granted to use, copy, modify, and distribute
39   this source code, or portions hereof, documentation and executables,
40   for any purpose, without fee, subject to the following restrictions:
41
42   1. The origin of this source code must not be misrepresented.
43   2. Altered versions must be plainly marked as such and must
44      not be misrepresented as being the original source.
45   3. This Copyright notice may not be removed or altered from any
46      source or altered source distribution.
47  
48   The copyright holders and contributing author(s) specifically
49   permit, without fee, and encourage the use of this source code
50   as a component for supporting the Hypertext Markup Language in
51   commercial products. If you use this source code in a product,
52   acknowledgment is not required but would be appreciated.
53 */

54
55 package org.w3c.tidy;
56
57 import java.io.PrintWriter JavaDoc;
58 import java.io.FileWriter JavaDoc;
59 import java.io.InputStream JavaDoc;
60 import java.io.FileInputStream JavaDoc;
61 import java.io.OutputStream JavaDoc;
62 import java.io.FileOutputStream JavaDoc;
63 import java.util.Properties JavaDoc;
64
65 import java.io.IOException JavaDoc;
66 import java.io.FileNotFoundException JavaDoc;
67
68 /**
69  *
70  * <p>HTML parser and pretty printer</p>
71  *
72  * <p>
73  * (c) 1998-2000 (W3C) MIT, INRIA, Keio University
74  * See Tidy.java for the copyright notice.
75  * Derived from <a HREF="http://www.w3.org/People/Raggett/tidy">
76  * HTML Tidy Release 4 Aug 2000</a>
77  * </p>
78  *
79  * <p>
80  * Copyright (c) 1998-2000 World Wide Web Consortium (Massachusetts
81  * Institute of Technology, Institut National de Recherche en
82  * Informatique et en Automatique, Keio University). All Rights
83  * Reserved.
84  * </p>
85  *
86  * <p>
87  * Contributing Author(s):<br>
88  * <a HREF="mailto:dsr@w3.org">Dave Raggett</a><br>
89  * <a HREF="mailto:ac.quick@sympatico.ca">Andy Quick</a> (translation to Java)
90  * </p>
91  *
92  * <p>
93  * The contributing author(s) would like to thank all those who
94  * helped with testing, bug fixes, and patience. This wouldn't
95  * have been possible without all of you.
96  * </p>
97  *
98  * <p>
99  * COPYRIGHT NOTICE:<br>
100  *
101  * This software and documentation is provided "as is," and
102  * the copyright holders and contributing author(s) make no
103  * representations or warranties, express or implied, including
104  * but not limited to, warranties of merchantability or fitness
105  * for any particular purpose or that the use of the software or
106  * documentation will not infringe any third party patents,
107  * copyrights, trademarks or other rights.
108  * </p>
109  *
110  * <p>
111  * The copyright holders and contributing author(s) will not be
112  * liable for any direct, indirect, special or consequential damages
113  * arising out of any use of the software or documentation, even if
114  * advised of the possibility of such damage.
115  * </p>
116  *
117  * <p>
118  * Permission is hereby granted to use, copy, modify, and distribute
119  * this source code, or portions hereof, documentation and executables,
120  * for any purpose, without fee, subject to the following restrictions:
121  * </p>
122  *
123  * <p>
124  * <ol>
125  * <li>The origin of this source code must not be misrepresented.</li>
126  * <li>Altered versions must be plainly marked as such and must
127  * not be misrepresented as being the original source.</li>
128  * <li>This Copyright notice may not be removed or altered from any
129  * source or altered source distribution.</li>
130  * </ol>
131  * </p>
132  *
133  * <p>
134  * The copyright holders and contributing author(s) specifically
135  * permit, without fee, and encourage the use of this source code
136  * as a component for supporting the Hypertext Markup Language in
137  * commercial products. If you use this source code in a product,
138  * acknowledgment is not required but would be appreciated.
139  * </p>
140  *
141  * @author Dave Raggett <dsr@w3.org>
142  * @author Andy Quick <ac.quick@sympatico.ca> (translation to Java)
143  * @version 1.0, 1999/05/22
144  * @version 1.0.1, 1999/05/29
145  * @version 1.1, 1999/06/18 Java Bean
146  * @version 1.2, 1999/07/10 Tidy Release 7 Jul 1999
147  * @version 1.3, 1999/07/30 Tidy Release 26 Jul 1999
148  * @version 1.4, 1999/09/04 DOM support
149  * @version 1.5, 1999/10/23 Tidy Release 27 Sep 1999
150  * @version 1.6, 1999/11/01 Tidy Release 22 Oct 1999
151  * @version 1.7, 1999/12/06 Tidy Release 30 Nov 1999
152  * @version 1.8, 2000/01/22 Tidy Release 13 Jan 2000
153  * @version 1.9, 2000/06/03 Tidy Release 30 Apr 2000
154  * @version 1.10, 2000/07/22 Tidy Release 8 Jul 2000
155  * @version 1.11, 2000/08/16 Tidy Release 4 Aug 2000
156  *
157  */

158
159 public class Tidy implements java.io.Serializable JavaDoc {
160
161     static final long serialVersionUID = -2794371560623987718L;
162
163     private boolean initialized = false;
164     private PrintWriter JavaDoc errout = null; /* error output stream */
165     private PrintWriter JavaDoc stderr = null;
166     private Configuration configuration = null;
167     private String JavaDoc inputStreamName = "InputStream";
168     private int parseErrors = 0;
169     private int parseWarnings = 0;
170
171     public Tidy()
172     {
173         init();
174     }
175
176     public Configuration getConfiguration()
177     {
178         return configuration;
179     }
180
181     public PrintWriter JavaDoc getStderr()
182     {
183         return stderr;
184     }
185
186     /**
187      * ParseErrors - the number of errors that occurred in the most
188      * recent parse operation
189      */

190
191     public int getParseErrors()
192     {
193         return parseErrors;
194     }
195
196     /**
197      * ParseWarnings - the number of warnings that occurred in the most
198      * recent parse operation
199      */

200
201     public int getParseWarnings()
202     {
203         return parseWarnings;
204     }
205
206     /**
207      * Errout - the error output stream
208      */

209
210     public PrintWriter JavaDoc getErrout()
211     {
212         return errout;
213     }
214
215     public void setErrout(PrintWriter JavaDoc errout)
216     {
217         this.errout = errout;
218     }
219
220     /**
221      * Spaces - default indentation
222      * @see org.w3c.tidy.Configuration#spaces
223      */

224
225     public void setSpaces(int spaces)
226     {
227         configuration.spaces = spaces;
228     }
229
230     public int getSpaces()
231     {
232         return configuration.spaces;
233     }
234
235     /**
236      * Wraplen - default wrap margin
237      * @see org.w3c.tidy.Configuration#wraplen
238      */

239
240     public void setWraplen(int wraplen)
241     {
242         configuration.wraplen = wraplen;
243     }
244
245     public int getWraplen()
246     {
247         return configuration.wraplen;
248     }
249
250     /**
251      * CharEncoding
252      * @see org.w3c.tidy.Configuration#CharEncoding
253      */

254
255     public void setCharEncoding(int charencoding)
256     {
257         configuration.CharEncoding = charencoding;
258     }
259
260     public int getCharEncoding()
261     {
262         return configuration.CharEncoding;
263     }
264
265     /**
266      * Tabsize
267      * @see org.w3c.tidy.Configuration#tabsize
268      */

269
270     public void setTabsize(int tabsize)
271     {
272         configuration.tabsize = tabsize;
273     }
274
275     public int getTabsize()
276     {
277         return configuration.tabsize;
278     }
279
280     /**
281      * Errfile - file name to write errors to
282      * @see org.w3c.tidy.Configuration#errfile
283      */

284
285     public void setErrfile(String JavaDoc errfile)
286     {
287         configuration.errfile = errfile;
288     }
289
290     public String JavaDoc getErrfile()
291     {
292         return configuration.errfile;
293     }
294
295     /**
296      * Writeback - if true then output tidied markup
297      * NOTE: this property is ignored when parsing from an InputStream.
298      * @see org.w3c.tidy.Configuration#writeback
299      */

300
301     public void setWriteback(boolean writeback)
302     {
303         configuration.writeback = writeback;
304     }
305
306     public boolean getWriteback()
307     {
308         return configuration.writeback;
309     }
310
311     /**
312      * OnlyErrors - if true normal output is suppressed
313      * @see org.w3c.tidy.Configuration#OnlyErrors
314      */

315
316     public void setOnlyErrors(boolean OnlyErrors)
317     {
318         configuration.OnlyErrors = OnlyErrors;
319     }
320
321     public boolean getOnlyErrors()
322     {
323         return configuration.OnlyErrors;
324     }
325
326     /**
327      * ShowWarnings - however errors are always shown
328      * @see org.w3c.tidy.Configuration#ShowWarnings
329      */

330
331     public void setShowWarnings(boolean ShowWarnings)
332     {
333         configuration.ShowWarnings = ShowWarnings;
334     }
335
336     public boolean getShowWarnings()
337     {
338         return configuration.ShowWarnings;
339     }
340
341     /**
342      * Quiet - no 'Parsing X', guessed DTD or summary
343      * @see org.w3c.tidy.Configuration#Quiet
344      */

345
346     public void setQuiet(boolean Quiet)
347     {
348         configuration.Quiet = Quiet;
349     }
350
351     public boolean getQuiet()
352     {
353         return configuration.Quiet;
354     }
355
356     /**
357      * IndentContent - indent content of appropriate tags
358      * @see org.w3c.tidy.Configuration#IndentContent
359      */

360
361     public void setIndentContent(boolean IndentContent)
362     {
363         configuration.IndentContent = IndentContent;
364     }
365
366     public boolean getIndentContent()
367     {
368         return configuration.IndentContent;
369     }
370
371     /**
372      * SmartIndent - does text/block level content effect indentation
373      * @see org.w3c.tidy.Configuration#SmartIndent
374      */

375
376     public void setSmartIndent(boolean SmartIndent)
377     {
378         configuration.SmartIndent = SmartIndent;
379     }
380
381     public boolean getSmartIndent()
382     {
383         return configuration.SmartIndent;
384     }
385
386     /**
387      * HideEndTags - suppress optional end tags
388      * @see org.w3c.tidy.Configuration#HideEndTags
389      */

390
391     public void setHideEndTags(boolean HideEndTags)
392     {
393         configuration.HideEndTags = HideEndTags;
394     }
395
396     public boolean getHideEndTags()
397     {
398         return configuration.HideEndTags;
399     }
400
401     /**
402      * XmlTags - treat input as XML
403      * @see org.w3c.tidy.Configuration#XmlTags
404      */

405
406     public void setXmlTags(boolean XmlTags)
407     {
408         configuration.XmlTags = XmlTags;
409     }
410
411     public boolean getXmlTags()
412     {
413         return configuration.XmlTags;
414     }
415
416     /**
417      * XmlOut - create output as XML
418      * @see org.w3c.tidy.Configuration#XmlOut
419      */

420
421     public void setXmlOut(boolean XmlOut)
422     {
423         configuration.XmlOut = XmlOut;
424     }
425
426     public boolean getXmlOut()
427     {
428         return configuration.XmlOut;
429     }
430
431     /**
432      * XHTML - output extensible HTML
433      * @see org.w3c.tidy.Configuration#xHTML
434      */

435
436     public void setXHTML(boolean xHTML)
437     {
438         configuration.xHTML = xHTML;
439     }
440
441     public boolean getXHTML()
442     {
443         return configuration.xHTML;
444     }
445
446     /**
447      * RawOut - avoid mapping values > 127 to entities
448      * @see org.w3c.tidy.Configuration#RawOut
449      */

450
451     public void setRawOut(boolean RawOut)
452     {
453         configuration.RawOut = RawOut;
454     }
455
456     public boolean getRawOut()
457     {
458         return configuration.RawOut;
459     }
460
461     /**
462      * UpperCaseTags - output tags in upper not lower case
463      * @see org.w3c.tidy.Configuration#UpperCaseTags
464      */

465
466     public void setUpperCaseTags(boolean UpperCaseTags)
467     {
468         configuration.UpperCaseTags = UpperCaseTags;
469     }
470
471     public boolean getUpperCaseTags()
472     {
473         return configuration.UpperCaseTags;
474     }
475
476     /**
477      * UpperCaseAttrs - output attributes in upper not lower case
478      * @see org.w3c.tidy.Configuration#UpperCaseAttrs
479      */

480
481     public void setUpperCaseAttrs(boolean UpperCaseAttrs)
482     {
483         configuration.UpperCaseAttrs = UpperCaseAttrs;
484     }
485
486     public boolean getUpperCaseAttrs()
487     {
488         return configuration.UpperCaseAttrs;
489     }
490
491     /**
492      * MakeClean - remove presentational clutter
493      * @see org.w3c.tidy.Configuration#MakeClean
494      */

495
496     public void setMakeClean(boolean MakeClean)
497     {
498         configuration.MakeClean = MakeClean;
499     }
500
501     public boolean getMakeClean()
502     {
503         return configuration.MakeClean;
504     }
505
506     /**
507      * BreakBeforeBR - o/p newline before &lt;br&gt; or not?
508      * @see org.w3c.tidy.Configuration#BreakBeforeBR
509      */

510
511     public void setBreakBeforeBR(boolean BreakBeforeBR)
512     {
513         configuration.BreakBeforeBR = BreakBeforeBR;
514     }
515
516     public boolean getBreakBeforeBR()
517     {
518         return configuration.BreakBeforeBR;
519     }
520
521     /**
522      * BurstSlides - create slides on each h2 element
523      * @see org.w3c.tidy.Configuration#BurstSlides
524      */

525
526     public void setBurstSlides(boolean BurstSlides)
527     {
528         configuration.BurstSlides = BurstSlides;
529     }
530
531     public boolean getBurstSlides()
532     {
533         return configuration.BurstSlides;
534     }
535
536     /**
537      * NumEntities - use numeric entities
538      * @see org.w3c.tidy.Configuration#NumEntities
539      */

540
541     public void setNumEntities(boolean NumEntities)
542     {
543         configuration.NumEntities = NumEntities;
544     }
545
546     public boolean getNumEntities()
547     {
548         return configuration.NumEntities;
549     }
550
551     /**
552      * QuoteMarks - output " marks as &amp;quot;
553      * @see org.w3c.tidy.Configuration#QuoteMarks
554      */

555
556     public void setQuoteMarks(boolean QuoteMarks)
557     {
558         configuration.QuoteMarks = QuoteMarks;
559     }
560
561     public boolean getQuoteMarks()
562     {
563         return configuration.QuoteMarks;
564     }
565
566     /**
567      * QuoteNbsp - output non-breaking space as entity
568      * @see org.w3c.tidy.Configuration#QuoteNbsp
569      */

570
571     public void setQuoteNbsp(boolean QuoteNbsp)
572     {
573         configuration.QuoteNbsp = QuoteNbsp;
574     }
575
576     public boolean getQuoteNbsp()
577     {
578         return configuration.QuoteNbsp;
579     }
580
581     /**
582      * QuoteAmpersand - output naked ampersand as &amp;
583      * @see org.w3c.tidy.Configuration#QuoteAmpersand
584      */

585
586     public void setQuoteAmpersand(boolean QuoteAmpersand)
587     {
588         configuration.QuoteAmpersand = QuoteAmpersand;
589     }
590
591     public boolean getQuoteAmpersand()
592     {
593         return configuration.QuoteAmpersand;
594     }
595
596     /**
597      * WrapAttVals - wrap within attribute values
598      * @see org.w3c.tidy.Configuration#WrapAttVals
599      */

600
601     public void setWrapAttVals(boolean WrapAttVals)
602     {
603         configuration.WrapAttVals = WrapAttVals;
604     }
605
606     public boolean getWrapAttVals()
607     {
608         return configuration.WrapAttVals;
609     }
610
611     /**
612      * WrapScriptlets - wrap within JavaScript string literals
613      * @see org.w3c.tidy.Configuration#WrapScriptlets
614      */

615
616     public void setWrapScriptlets(boolean WrapScriptlets)
617     {
618         configuration.WrapScriptlets = WrapScriptlets;
619     }
620
621     public boolean getWrapScriptlets()
622     {
623         return configuration.WrapScriptlets;
624     }
625
626     /**
627      * WrapSection - wrap within &lt;![ ... ]&gt; section tags
628      * @see org.w3c.tidy.Configuration#WrapSection
629      */

630
631     public void setWrapSection(boolean WrapSection)
632     {
633         configuration.WrapSection = WrapSection;
634     }
635
636     public boolean getWrapSection()
637     {
638         return configuration.WrapSection;
639     }
640
641     /**
642      * AltText - default text for alt attribute
643      * @see org.w3c.tidy.Configuration#altText
644      */

645
646     public void setAltText(String JavaDoc altText)
647     {
648         configuration.altText = altText;
649     }
650
651     public String JavaDoc getAltText()
652     {
653         return configuration.altText;
654     }
655
656     /**
657      * Slidestyle - style sheet for slides
658      * @see org.w3c.tidy.Configuration#slidestyle
659      */

660
661     public void setSlidestyle(String JavaDoc slidestyle)
662     {
663         configuration.slidestyle = slidestyle;
664     }
665
666     public String JavaDoc getSlidestyle()
667     {
668         return configuration.slidestyle;
669     }
670
671     /**
672      * XmlPi - add &lt;?xml?&gt; for XML docs
673      * @see org.w3c.tidy.Configuration#XmlPi
674      */

675
676     public void setXmlPi(boolean XmlPi)
677     {
678         configuration.XmlPi = XmlPi;
679     }
680
681     public boolean getXmlPi()
682     {
683         return configuration.XmlPi;
684     }
685
686     /**
687      * DropFontTags - discard presentation tags
688      * @see org.w3c.tidy.Configuration#DropFontTags
689      */

690
691     public void setDropFontTags(boolean DropFontTags)
692     {
693         configuration.DropFontTags = DropFontTags;
694     }
695
696     public boolean getDropFontTags()
697     {
698         return configuration.DropFontTags;
699     }
700
701     /**
702      * DropEmptyParas - discard empty p elements
703      * @see org.w3c.tidy.Configuration#DropEmptyParas
704      */

705
706     public void setDropEmptyParas(boolean DropEmptyParas)
707     {
708         configuration.DropEmptyParas = DropEmptyParas;
709     }
710
711     public boolean getDropEmptyParas()
712     {
713         return configuration.DropEmptyParas;
714     }
715
716     /**
717      * FixComments - fix comments with adjacent hyphens
718      * @see org.w3c.tidy.Configuration#FixComments
719      */

720
721     public void setFixComments(boolean FixComments)
722     {
723         configuration.FixComments = FixComments;
724     }
725
726     public boolean getFixComments()
727     {
728         return configuration.FixComments;
729     }
730
731     /**
732      * WrapAsp - wrap within ASP pseudo elements
733      * @see org.w3c.tidy.Configuration#WrapAsp
734      */

735
736     public void setWrapAsp(boolean WrapAsp)
737     {
738         configuration.WrapAsp = WrapAsp;
739     }
740
741     public boolean getWrapAsp()
742     {
743         return configuration.WrapAsp;
744     }
745
746     /**
747      * WrapJste - wrap within JSTE pseudo elements
748      * @see org.w3c.tidy.Configuration#WrapJste
749      */

750
751     public void setWrapJste(boolean WrapJste)
752     {
753         configuration.WrapJste = WrapJste;
754     }
755
756     public boolean getWrapJste()
757     {
758         return configuration.WrapJste;
759     }
760
761     /**
762      * WrapPhp - wrap within PHP pseudo elements
763      * @see org.w3c.tidy.Configuration#WrapPhp
764      */

765
766     public void setWrapPhp(boolean WrapPhp)
767     {
768         configuration.WrapPhp = WrapPhp;
769     }
770
771     public boolean getWrapPhp()
772     {
773         return configuration.WrapPhp;
774     }
775
776     /**
777      * FixBackslash - fix URLs by replacing \ with /
778      * @see org.w3c.tidy.Configuration#FixBackslash
779      */

780
781     public void setFixBackslash(boolean FixBackslash)
782     {
783         configuration.FixBackslash = FixBackslash;
784     }
785
786     public boolean getFixBackslash()
787     {
788         return configuration.FixBackslash;
789     }
790
791     /**
792      * IndentAttributes - newline+indent before each attribute
793      * @see org.w3c.tidy.Configuration#IndentAttributes
794      */

795
796     public void setIndentAttributes(boolean IndentAttributes)
797     {
798         configuration.IndentAttributes = IndentAttributes;
799     }
800
801     public boolean getIndentAttributes()
802     {
803         return configuration.IndentAttributes;
804     }
805
806     /**
807      * DocType - user specified doctype
808      * omit | auto | strict | loose | <i>fpi</i>
809      * where the <i>fpi</i> is a string similar to
810      * &quot;-//ACME//DTD HTML 3.14159//EN&quot;
811      * Note: for <i>fpi</i> include the double-quotes in the string.
812      * @see org.w3c.tidy.Configuration#docTypeStr
813      * @see org.w3c.tidy.Configuration#docTypeMode
814      */

815
816     public void setDocType(String JavaDoc doctype)
817     {
818         if (doctype != null)
819             configuration.docTypeStr = configuration.parseDocType(doctype, "doctype");
820     }
821
822     public String JavaDoc getDocType()
823     {
824         String JavaDoc result = null;
825         switch (configuration.docTypeMode) {
826         case Configuration.DOCTYPE_OMIT:
827             result = "omit";
828             break;
829         case Configuration.DOCTYPE_AUTO:
830             result = "auto";
831             break;
832         case Configuration.DOCTYPE_STRICT:
833             result = "strict";
834             break;
835         case Configuration.DOCTYPE_LOOSE:
836             result = "loose";
837             break;
838         case Configuration.DOCTYPE_USER:
839             result = configuration.docTypeStr;
840             break;
841         }
842         return result;
843     }
844
845     /**
846      * LogicalEmphasis - replace i by em and b by strong
847      * @see org.w3c.tidy.Configuration#LogicalEmphasis
848      */

849
850     public void setLogicalEmphasis(boolean LogicalEmphasis)
851     {
852         configuration.LogicalEmphasis = LogicalEmphasis;
853     }
854
855     public boolean getLogicalEmphasis()
856     {
857         return configuration.LogicalEmphasis;
858     }
859
860     /**
861      * XmlPIs - if set to true PIs must end with ?>
862      * @see org.w3c.tidy.Configuration#XmlPIs
863      */

864
865     public void setXmlPIs(boolean XmlPIs)
866     {
867         configuration.XmlPIs = XmlPIs;
868     }
869
870     public boolean getXmlPIs()
871     {
872         return configuration.XmlPIs;
873     }
874
875     /**
876      * EncloseText - if true text at body is wrapped in &lt;p&gt;'s
877      * @see org.w3c.tidy.Configuration#EncloseBodyText
878      */

879
880     public void setEncloseText(boolean EncloseText)
881     {
882         configuration.EncloseBodyText = EncloseText;
883     }
884
885     public boolean getEncloseText()
886     {
887         return configuration.EncloseBodyText;
888     }
889
890     /**
891      * EncloseBlockText - if true text in blocks is wrapped in &lt;p&gt;'s
892      * @see org.w3c.tidy.Configuration#EncloseBlockText
893      */

894
895     public void setEncloseBlockText(boolean EncloseBlockText)
896     {
897         configuration.EncloseBlockText = EncloseBlockText;
898     }
899
900     public boolean getEncloseBlockText()
901     {
902         return configuration.EncloseBlockText;
903     }
904
905     /**
906      * KeepFileTimes - if true last modified time is preserved<br>
907      * <b>this is NOT supported at this time.</b>
908      * @see org.w3c.tidy.Configuration#KeepFileTimes
909      */

910
911     public void setKeepFileTimes(boolean KeepFileTimes)
912     {
913         configuration.KeepFileTimes = KeepFileTimes;
914     }
915
916     public boolean getKeepFileTimes()
917     {
918         return configuration.KeepFileTimes;
919     }
920
921     /**
922      * Word2000 - draconian cleaning for Word2000
923      * @see org.w3c.tidy.Configuration#Word2000
924      */

925
926     public void setWord2000(boolean Word2000)
927     {
928         configuration.Word2000 = Word2000;
929     }
930
931     public boolean getWord2000()
932     {
933         return configuration.Word2000;
934     }
935
936     /**
937      * TidyMark - add meta element indicating tidied doc
938      * @see org.w3c.tidy.Configuration#TidyMark
939      */

940
941     public void setTidyMark(boolean TidyMark)
942     {
943         configuration.TidyMark = TidyMark;
944     }
945
946     public boolean getTidyMark()
947     {
948         return configuration.TidyMark;
949     }
950
951     /**
952      * XmlSpace - if set to yes adds xml:space attr as needed
953      * @see org.w3c.tidy.Configuration#XmlSpace
954      */

955
956     public void setXmlSpace(boolean XmlSpace)
957     {
958         configuration.XmlSpace = XmlSpace;
959     }
960
961     public boolean getXmlSpace()
962     {
963         return configuration.XmlSpace;
964     }
965
966     /**
967      * Emacs - if true format error output for GNU Emacs
968      * @see org.w3c.tidy.Configuration#Emacs
969      */

970
971     public void setEmacs(boolean Emacs)
972     {
973         configuration.Emacs = Emacs;
974     }
975
976     public boolean getEmacs()
977     {
978         return configuration.Emacs;
979     }
980
981     /**
982      * LiteralAttribs - if true attributes may use newlines
983      * @see org.w3c.tidy.Configuration#LiteralAttribs
984      */

985
986     public void setLiteralAttribs(boolean LiteralAttribs)
987     {
988         configuration.LiteralAttribs = LiteralAttribs;
989     }
990
991     public boolean getLiteralAttribs()
992     {
993         return configuration.LiteralAttribs;
994     }
995
996     /**
997      * InputStreamName - the name of the input stream (printed in the
998      * header information).
999      */

1000    public void setInputStreamName(String JavaDoc name)
1001    {
1002        if (name != null)
1003            inputStreamName = name;
1004    }
1005
1006    public String JavaDoc getInputStreamName()
1007    {
1008        return inputStreamName;
1009    }
1010
1011    /**
1012     * Sets the configuration from a configuration file.
1013     */

1014
1015    public void setConfigurationFromFile(String JavaDoc filename)
1016    {
1017        configuration.parseFile(filename);
1018    }
1019
1020    /**
1021     * Sets the configuration from a properties object.
1022     */

1023
1024    public void setConfigurationFromProps(Properties JavaDoc props)
1025    {
1026        configuration.addProps(props);
1027    }
1028
1029    /**
1030     * first time initialization which should
1031     * precede reading the command line
1032     */

1033
1034    private void init()
1035    {
1036        configuration = new Configuration();
1037        if (configuration == null) return;
1038
1039        AttributeTable at = AttributeTable.getDefaultAttributeTable();
1040        if (at == null) return;
1041        TagTable tt = new TagTable();
1042        if (tt == null) return;
1043        tt.setConfiguration(configuration);
1044        configuration.tt = tt;
1045        EntityTable et = EntityTable.getDefaultEntityTable();
1046        if (et == null) return;
1047
1048        /* Unnecessary - same initial values in Configuration
1049        Configuration.XmlTags = false;
1050        Configuration.XmlOut = false;
1051        Configuration.HideEndTags = false;
1052        Configuration.UpperCaseTags = false;
1053        Configuration.MakeClean = false;
1054        Configuration.writeback = false;
1055        Configuration.OnlyErrors = false;
1056        */

1057
1058        configuration.errfile = null;
1059        stderr = new PrintWriter JavaDoc(System.err, true);
1060        errout = stderr;
1061        initialized = true;
1062    }
1063
1064    /**
1065     * Parses InputStream in and returns the root Node.
1066     * If out is non-null, pretty prints to OutputStream out.
1067     */

1068
1069    public Node parse(InputStream JavaDoc in, OutputStream JavaDoc out)
1070    {
1071        Node document = null;
1072
1073        try
1074        {
1075          document = parse(in, null, out);
1076        }
1077        catch (FileNotFoundException JavaDoc fnfe) {}
1078        catch (IOException JavaDoc e) {}
1079
1080        return document;
1081    }
1082
1083
1084    /**
1085     * Internal routine that actually does the parsing. The caller
1086     * can pass either an InputStream or file name. If both are passed,
1087     * the file name is preferred.
1088     */

1089
1090    private Node parse(InputStream JavaDoc in, String JavaDoc file, OutputStream JavaDoc out)
1091                  throws FileNotFoundException JavaDoc, IOException JavaDoc
1092    {
1093        Lexer lexer;
1094        Node document = null;
1095        Node doctype;
1096        Out o = new OutImpl(); /* normal output stream */
1097        PPrint pprint;
1098
1099        if (!initialized)
1100            return null;
1101
1102        if (errout == null)
1103            return null;
1104
1105        parseErrors = 0;
1106        parseWarnings = 0;
1107
1108        /* ensure config is self-consistent */
1109        configuration.adjust();
1110
1111        if (file != null)
1112        {
1113            in = new FileInputStream JavaDoc(file);
1114            inputStreamName = file;
1115        }
1116        else if (in == null)
1117        {
1118            in = System.in;
1119            inputStreamName = "stdin";
1120        }
1121
1122        if (in != null)
1123        {
1124            lexer = new Lexer(new StreamInImpl(in,
1125                                               configuration.CharEncoding,
1126                                               configuration.tabsize),
1127                              configuration);
1128            lexer.errout = errout;
1129
1130            /*
1131              store pointer to lexer in input stream
1132              to allow character encoding errors to be
1133              reported
1134            */

1135            lexer.in.lexer = lexer;
1136
1137            /* Tidy doesn't alter the doctype for generic XML docs */
1138            if (configuration.XmlTags)
1139                document = ParserImpl.parseXMLDocument(lexer);
1140            else
1141            {
1142                lexer.warnings = 0;
1143                if (!configuration.Quiet)
1144                    Report.helloMessage(errout, Report.RELEASE_DATE, inputStreamName);
1145
1146                document = ParserImpl.parseDocument(lexer);
1147
1148                if (!document.checkNodeIntegrity())
1149                {
1150                    Report.badTree(errout);
1151                    return null;
1152                }
1153
1154                Clean cleaner = new Clean(configuration.tt);
1155
1156                /* simplifies <b><b> ... </b> ...</b> etc. */
1157                cleaner.nestedEmphasis(document);
1158
1159                /* cleans up <dir>indented text</dir> etc. */
1160                cleaner.list2BQ(document);
1161                cleaner.bQ2Div(document);
1162
1163                /* replaces i by em and b by strong */
1164                if (configuration.LogicalEmphasis)
1165                    cleaner.emFromI(document);
1166
1167                if (configuration.Word2000 && cleaner.isWord2000(document, configuration.tt))
1168                {
1169                    /* prune Word2000's <![if ...]> ... <![endif]> */
1170                    cleaner.dropSections(lexer, document);
1171
1172                    /* drop style & class attributes and empty p, span elements */
1173                    cleaner.cleanWord2000(lexer, document);
1174                }
1175
1176                /* replaces presentational markup by style rules */
1177                if (configuration.MakeClean || configuration.DropFontTags)
1178                    cleaner.cleanTree(lexer, document);
1179
1180                if (!document.checkNodeIntegrity())
1181                {
1182                    Report.badTree(errout);
1183                    return null;
1184                }
1185                doctype = document.findDocType();
1186                if (document.content != null)
1187                {
1188                    if (configuration.xHTML)
1189                        lexer.setXHTMLDocType(document);
1190                    else
1191                        lexer.fixDocType(document);
1192
1193                    if (configuration.TidyMark)
1194                        lexer.addGenerator(document);
1195                }
1196
1197                /* ensure presence of initial <?XML version="1.0"?> */
1198                if (configuration.XmlOut && configuration.XmlPi)
1199                    lexer.fixXMLPI(document);
1200
1201                if(!configuration.Quiet && document.content != null)
1202                {
1203                    Report.reportVersion(errout, lexer, inputStreamName, doctype);
1204                    Report.reportNumWarnings(errout, lexer);
1205                }
1206            }
1207
1208            parseWarnings = lexer.warnings;
1209            parseErrors = lexer.errors;
1210
1211            // Try to close the InputStream but only if if we created it.
1212

1213            if ( (file != null) && (in != System.in) )
1214            {
1215                try
1216                {
1217                    in.close();
1218                }
1219                catch (IOException JavaDoc e ) {}
1220            }
1221
1222            if (lexer.errors > 0)
1223                Report.needsAuthorIntervention(errout);
1224
1225            o.state = StreamIn.FSM_ASCII;
1226            o.encoding = configuration.CharEncoding;
1227
1228            if (!configuration.OnlyErrors && lexer.errors == 0)
1229            {
1230                if (configuration.BurstSlides)
1231                {
1232                    Node body;
1233
1234                    body = null;
1235                    /*
1236                       remove doctype to avoid potential clash with
1237                       markup introduced when bursting into slides
1238                    */

1239                    /* discard the document type */
1240                    doctype = document.findDocType();
1241
1242                    if (doctype != null)
1243                        Node.discardElement(doctype);
1244
1245                    /* slides use transitional features */
1246                    lexer.versions |= Dict.VERS_HTML40_LOOSE;
1247
1248                    /* and patch up doctype to match */
1249                    if (configuration.xHTML)
1250                        lexer.setXHTMLDocType(document);
1251                    else
1252                        lexer.fixDocType(document);
1253
1254                    /* find the body element which may be implicit */
1255                    body = document.findBody(configuration.tt);
1256
1257                    if (body != null)
1258                    {
1259                        pprint = new PPrint(configuration);
1260                        Report.reportNumberOfSlides(errout, pprint.countSlides(body));
1261                        pprint.createSlides(lexer, document);
1262                    }
1263                    else
1264                        Report.missingBody(errout);
1265                }
1266                else if (configuration.writeback && (file != null))
1267                {
1268                    try
1269                    {
1270                        pprint = new PPrint(configuration);
1271                        o.out = new FileOutputStream JavaDoc(file);
1272
1273                        if (configuration.XmlTags)
1274                            pprint.printXMLTree(o, (short)0, 0, lexer, document);
1275                        else
1276                            pprint.printTree(o, (short)0, 0, lexer, document);
1277
1278                        pprint.flushLine(o, 0);
1279                        o.out.close();
1280                    }
1281                    catch (IOException JavaDoc e)
1282                    {
1283                        errout.println(file + e.toString());
1284                    }
1285                }
1286                else if (out != null)
1287                {
1288                    pprint = new PPrint(configuration);
1289                    o.out = out;
1290
1291                    if (configuration.XmlTags)
1292                        pprint.printXMLTree(o, (short)0, 0, lexer, document);
1293                    else
1294                        pprint.printTree(o, (short)0, 0, lexer, document);
1295
1296                    pprint.flushLine(o, 0);
1297                }
1298
1299            }
1300
1301            Report.errorSummary(lexer);
1302        }
1303        return document;
1304    }
1305
1306
1307    /**
1308     * Parses InputStream in and returns a DOM Document node.
1309     * If out is non-null, pretty prints to OutputStream out.
1310     */

1311
1312    public org.w3c.dom.Document JavaDoc parseDOM(InputStream JavaDoc in, OutputStream JavaDoc out)
1313    {
1314        Node document = parse(in, out);
1315        if (document != null)
1316            return (org.w3c.dom.Document JavaDoc)document.getAdapter();
1317        else
1318            return null;
1319    }
1320
1321    /**
1322     * Creates an empty DOM Document.
1323     */

1324
1325    public static org.w3c.dom.Document JavaDoc createEmptyDocument()
1326    {
1327        Node document = new Node(Node.RootNode, new byte[0], 0, 0);
1328        Node node = new Node(Node.StartTag, new byte[0], 0, 0, "html", new TagTable());
1329        if (document != null && node != null)
1330        {
1331            Node.insertNodeAtStart(document, node);
1332            return (org.w3c.dom.Document JavaDoc)document.getAdapter();
1333        } else {
1334            return null;
1335        }
1336    }
1337
1338    /**
1339     * Pretty-prints a DOM Document.
1340     */

1341
1342    public void pprint(org.w3c.dom.Document JavaDoc doc, OutputStream JavaDoc out)
1343    {
1344        Out o = new OutImpl();
1345        PPrint pprint;
1346        Node document;
1347
1348        if (!(doc instanceof DOMDocumentImpl)) {
1349            return;
1350        }
1351        document = ((DOMDocumentImpl)doc).adaptee;
1352
1353        o.state = StreamIn.FSM_ASCII;
1354        o.encoding = configuration.CharEncoding;
1355
1356        if (out != null)
1357        {
1358            pprint = new PPrint(configuration);
1359            o.out = out;
1360
1361            if (configuration.XmlTags)
1362                pprint.printXMLTree(o, (short)0, 0, null, document);
1363            else
1364                pprint.printTree(o, (short)0, 0, null, document);
1365
1366            pprint.flushLine(o, 0);
1367        }
1368    }
1369
1370    /**
1371     * Command line interface to parser and pretty printer.
1372     */

1373
1374    public static void main(String JavaDoc[] argv)
1375    {
1376        int totalerrors = 0;
1377        int totalwarnings = 0;
1378        String JavaDoc file;
1379        InputStream JavaDoc in;
1380        String JavaDoc prog = "Tidy";
1381        Node document;
1382        Node doctype;
1383        Lexer lexer;
1384        String JavaDoc s;
1385        Out out = new OutImpl(); /* normal output stream */
1386        PPrint pprint;
1387        int argc = argv.length + 1;
1388        int argIndex = 0;
1389        Tidy tidy;
1390        Configuration configuration;
1391        String JavaDoc arg;
1392        String JavaDoc current_errorfile = "stderr";
1393
1394        tidy = new Tidy();
1395        configuration = tidy.getConfiguration();
1396
1397        /* read command line */
1398
1399        while (argc > 0)
1400        {
1401            if (argc > 1 && argv[argIndex].startsWith("-"))
1402            {
1403                /* support -foo and --foo */
1404                arg = argv[argIndex].substring(1);
1405
1406                if (arg.length() > 0 && arg.charAt(0) == '-')
1407                    arg = arg.substring(1);
1408
1409                if (arg.equals("xml"))
1410                    configuration.XmlTags = true;
1411                else if (arg.equals("asxml") || arg.equals("asxhtml"))
1412                    configuration.xHTML = true;
1413                else if (arg.equals("indent"))
1414                {
1415                    configuration.IndentContent = true;
1416                    configuration.SmartIndent = true;
1417                }
1418                else if (arg.equals("omit"))
1419                    configuration.HideEndTags = true;
1420                else if (arg.equals("upper"))
1421                    configuration.UpperCaseTags = true;
1422                else if (arg.equals("clean"))
1423                    configuration.MakeClean = true;
1424                else if (arg.equals("raw"))
1425                    configuration.CharEncoding = Configuration.RAW;
1426                else if (arg.equals("ascii"))
1427                    configuration.CharEncoding = Configuration.ASCII;
1428                else if (arg.equals("latin1"))
1429                    configuration.CharEncoding = Configuration.LATIN1;
1430                else if (arg.equals("utf8"))
1431                    configuration.CharEncoding = Configuration.UTF8;
1432                else if (arg.equals("iso2022"))
1433                    configuration.CharEncoding = Configuration.ISO2022;
1434                else if (arg.equals("mac"))
1435                    configuration.CharEncoding = Configuration.MACROMAN;
1436                else if (arg.equals("numeric"))
1437                    configuration.NumEntities = true;
1438                else if (arg.equals("modify"))
1439                    configuration.writeback = true;
1440                else if (arg.equals("change")) /* obsolete */
1441                    configuration.writeback = true;
1442                else if (arg.equals("update")) /* obsolete */
1443                    configuration.writeback = true;
1444                else if (arg.equals("errors"))
1445                    configuration.OnlyErrors = true;
1446                else if (arg.equals("quiet"))
1447                    configuration.Quiet = true;
1448                else if (arg.equals("slides"))
1449                    configuration.BurstSlides = true;
1450                else if (arg.equals("help") ||
1451                         argv[argIndex].charAt(1) == '?'||
1452                         argv[argIndex].charAt(1) == 'h')
1453                {
1454                    Report.helpText(new PrintWriter JavaDoc(System.out, true), prog);
1455                    System.exit(1);
1456                }
1457                else if (arg.equals("config"))
1458                {
1459                    if (argc >= 3)
1460                    {
1461                        configuration.parseFile(argv[argIndex + 1]);
1462                        --argc;
1463                        ++argIndex;
1464                    }
1465                }
1466                else if (argv[argIndex].equals("-file") ||
1467                         argv[argIndex].equals("--file") ||
1468                            argv[argIndex].equals("-f"))
1469                {
1470                    if (argc >= 3)
1471                    {
1472                        configuration.errfile = argv[argIndex + 1];
1473                        --argc;
1474                        ++argIndex;
1475                    }
1476                }
1477                else if (argv[argIndex].equals("-wrap") ||
1478                         argv[argIndex].equals("--wrap") ||
1479                            argv[argIndex].equals("-w"))
1480                {
1481                    if (argc >= 3)
1482                    {
1483                        configuration.wraplen =
1484                            Integer.parseInt(argv[argIndex + 1]);
1485                        --argc;
1486                        ++argIndex;
1487                    }
1488                }
1489                else if (argv[argIndex].equals("-version") ||
1490                         argv[argIndex].equals("--version") ||
1491                            argv[argIndex].equals("-v"))
1492                {
1493                    Report.showVersion(tidy.getErrout());
1494                    System.exit(0);
1495                }
1496                else
1497                {
1498                    s = argv[argIndex];
1499
1500                    for (int i = 1; i < s.length(); i++)
1501                    {
1502                        if (s.charAt(i) == 'i')
1503                        {
1504                            configuration.IndentContent = true;
1505                            configuration.SmartIndent = true;
1506                        }
1507                        else if (s.charAt(i) == 'o')
1508                            configuration.HideEndTags = true;
1509                        else if (s.charAt(i) == 'u')
1510                            configuration.UpperCaseTags = true;
1511                        else if (s.charAt(i) == 'c')
1512                            configuration.MakeClean = true;
1513                        else if (s.charAt(i) == 'n')
1514                            configuration.NumEntities = true;
1515                        else if (s.charAt(i) == 'm')
1516                            configuration.writeback = true;
1517                        else if (s.charAt(i) == 'e')
1518                            configuration.OnlyErrors = true;
1519                        else if (s.charAt(i) == 'q')
1520                            configuration.Quiet = true;
1521                        else
1522                            Report.unknownOption(tidy.getErrout(), s.charAt(i));
1523                    }
1524                }
1525
1526                --argc;
1527                ++argIndex;
1528                continue;
1529            }
1530
1531            /* ensure config is self-consistent */
1532            configuration.adjust();
1533
1534            /* user specified error file */
1535            if (configuration.errfile != null)
1536            {
1537                /* is it same as the currently opened file? */
1538                if (!configuration.errfile.equals(current_errorfile))
1539                {
1540                    /* no so close previous error file */
1541
1542                    if (tidy.getErrout() != tidy.getStderr())
1543                        tidy.getErrout().close();
1544
1545                    /* and try to open the new error file */
1546                    try
1547                    {
1548                        tidy.setErrout(
1549                            new PrintWriter JavaDoc(
1550                                new FileWriter JavaDoc(configuration.errfile), true));
1551                        current_errorfile = configuration.errfile;
1552                    }
1553                    catch (IOException JavaDoc e)
1554                    {
1555                        /* can't be opened so fall back to stderr */
1556                        current_errorfile = "stderr";
1557                        tidy.setErrout(tidy.getStderr());
1558                    }
1559                }
1560            }
1561
1562            if (argc > 1)
1563            {
1564                file = argv[argIndex];
1565            }
1566            else
1567            {
1568                file = "stdin";
1569            }
1570
1571            try
1572            {
1573                document = tidy.parse(null, file, System.out);
1574                totalwarnings += tidy.parseWarnings;
1575                totalerrors += tidy.parseErrors;
1576            }
1577            catch (FileNotFoundException JavaDoc fnfe)
1578            {
1579                Report.unknownFile(tidy.getErrout(), prog, file);
1580            }
1581            catch (IOException JavaDoc ioe)
1582            {
1583                Report.unknownFile(tidy.getErrout(), prog, file);
1584            }
1585
1586            --argc;
1587            ++argIndex;
1588
1589            if (argc <= 1)
1590                break;
1591        }
1592
1593        if (totalerrors + totalwarnings > 0)
1594            Report.generalInfo(tidy.getErrout());
1595
1596        if (tidy.getErrout() != tidy.getStderr())
1597            tidy.getErrout().close();
1598
1599        /* return status can be used by scripts */
1600
1601        if (totalerrors > 0)
1602            System.exit(2);
1603
1604        if (totalwarnings > 0)
1605            System.exit(1);
1606
1607        /* 0 signifies all is ok */
1608        System.exit(0);
1609    }
1610}
1611
Popular Tags