KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > w3c > tidy > Clean


1 /*
2  * @(#)Clean.java 1.11 2000/08/16
3  *
4  */

5
6 package org.w3c.tidy;
7
8 /**
9  *
10  * Clean up misuse of presentation markup
11  *
12  * (c) 1998-2000 (W3C) MIT, INRIA, Keio University
13  * See Tidy.java for the copyright notice.
14  * Derived from <a HREF="http://www.w3.org/People/Raggett/tidy">
15  * HTML Tidy Release 4 Aug 2000</a>
16  *
17  * @author Dave Raggett <dsr@w3.org>
18  * @author Andy Quick <ac.quick@sympatico.ca> (translation to Java)
19  * @version 1.0, 1999/05/22
20  * @version 1.0.1, 1999/05/29
21  * @version 1.1, 1999/06/18 Java Bean
22  * @version 1.2, 1999/07/10 Tidy Release 7 Jul 1999
23  * @version 1.3, 1999/07/30 Tidy Release 26 Jul 1999
24  * @version 1.4, 1999/09/04 DOM support
25  * @version 1.5, 1999/10/23 Tidy Release 27 Sep 1999
26  * @version 1.6, 1999/11/01 Tidy Release 22 Oct 1999
27  * @version 1.7, 1999/12/06 Tidy Release 30 Nov 1999
28  * @version 1.8, 2000/01/22 Tidy Release 13 Jan 2000
29  * @version 1.9, 2000/06/03 Tidy Release 30 Apr 2000
30  * @version 1.10, 2000/07/22 Tidy Release 8 Jul 2000
31  * @version 1.11, 2000/08/16 Tidy Release 4 Aug 2000
32  */

33
34 /*
35   Filters from other formats such as Microsoft Word
36   often make excessive use of presentation markup such
37   as font tags, B, I, and the align attribute. By applying
38   a set of production rules, it is straightforward to
39   transform this to use CSS.
40
41   Some rules replace some of the children of an element by
42   style properties on the element, e.g.
43
44   <p><b>...</b></p> -> <p style="font-weight: bold">...</p>
45
46   Such rules are applied to the element's content and then
47   to the element itself until no more of the rules apply.
48   Having applied all the rules to an element, it will have
49   a style attribute with one or more properties.
50
51   Other rules strip the element they apply to, replacing
52   it by style properties on the contents, e.g.
53   
54   <dir><li><p>...</li></dir> -> <p style="margin-left: 1em">...
55       
56   These rules are applied to an element before processing
57   its content and replace the current element by the first
58   element in the exposed content.
59
60   After applying both sets of rules, you can replace the
61   style attribute by a class value and style rule in the
62   document head. To support this, an association of styles
63   and class names is built.
64
65   A naive approach is to rely on string matching to test
66   when two property lists are the same. A better approach
67   would be to first sort the properties before matching.
68 */

69
70 public class Clean {
71
72     private int classNum = 1;
73
74     private TagTable tt;
75
76     public Clean(TagTable tt)
77     {
78       this.tt = tt;
79     }
80
81     private StyleProp insertProperty(StyleProp props, String JavaDoc name,
82                                             String JavaDoc value)
83     {
84         StyleProp first, prev, prop;
85         int cmp;
86
87         prev = null;
88         first = props;
89
90         while (props != null)
91         {
92             cmp = props.name.compareTo(name);
93
94             if (cmp == 0)
95             {
96                 /* this property is already defined, ignore new value */
97                 return first;
98             }
99
100             if (cmp > 0) // props.name > name
101
{
102                 /* insert before this */
103
104                 prop = new StyleProp(name, value, props);
105
106                 if (prev != null)
107                     prev.next = prop;
108                 else
109                     first = prop;
110
111                 return first;
112             }
113
114             prev = props;
115             props = props.next;
116         }
117
118         prop = new StyleProp(name, value);
119
120         if (prev != null)
121             prev.next = prop;
122         else
123             first = prop;
124
125         return first;
126     }
127
128     /*
129      Create sorted linked list of properties from style string
130      The string is scanned for the ':' and ';' delimiters and
131      each property name and value is extracted with substring(),
132      so the input string is never modified (the in-place null
133      termination used by the C original is not needed in Java).
134     */

135     private StyleProp createProps(StyleProp prop, String JavaDoc style)
136     {
137         int name_end;
138         int value_end;
139         int value_start = 0;
140         int name_start = 0;
141         boolean more;
142
143         name_start = 0;
144         while (name_start < style.length())
145         {
146             while (name_start < style.length() &&
147                        style.charAt(name_start) == ' ')
148                 ++name_start;
149
150             name_end = name_start;
151
152             while (name_end < style.length())
153             {
154                 if (style.charAt(name_end) == ':')
155                 {
156                     value_start = name_end + 1;
157                     break;
158                 }
159
160                 ++name_end;
161             }
162
163             if (name_end >= style.length() || style.charAt(name_end) != ':')
164                 break;
165
166             while (value_start < style.length() &&
167                        style.charAt(value_start) == ' ')
168                 ++value_start;
169
170             value_end = value_start;
171             more = false;
172
173             while (value_end < style.length())
174             {
175                 if (style.charAt(value_end) == ';')
176                 {
177                     more = true;
178                     break;
179                 }
180
181                 ++value_end;
182             }
183
184             prop = insertProperty(prop,
185                                   style.substring(name_start, name_end),
186                                   style.substring(value_start, value_end));
187
188             if (more)
189             {
190                 name_start = value_end + 1;
191                 continue;
192             }
193
194             break;
195         }
196
197         return prop;
198     }
199
200     private String JavaDoc createPropString(StyleProp props)
201     {
202         String JavaDoc style = "";
203         int len;
204         StyleProp prop;
205
206         /* compute length */
207
208         for (len = 0, prop = props; prop != null; prop = prop.next)
209         {
210             len += prop.name.length() + 2;
211             len += prop.value.length() + 2;
212         }
213
214         for (prop = props; prop != null; prop = prop.next)
215         {
216             style = style.concat(prop.name);
217             style = style.concat(": ");
218
219             style = style.concat(prop.value);
220
221             if (prop.next == null)
222                 break;
223
224             style = style.concat("; ");
225         }
226
227         return style;
228     }
229
230     /*
231       create string with merged properties
232     */

233     private String JavaDoc addProperty(String JavaDoc style, String JavaDoc property)
234     {
235         StyleProp prop;
236
237         prop = createProps(null, style);
238         prop = createProps(prop, property);
239         style = createPropString(prop);
240         return style;
241     }
242
243     private String JavaDoc gensymClass(String JavaDoc tag)
244     {
245         String JavaDoc str;
246
247         str = "c" + classNum;
248         classNum++;
249         return str;
250     }
251
252     private String JavaDoc findStyle(Lexer lexer, String JavaDoc tag, String JavaDoc properties)
253     {
254         Style style;
255
256         for (style = lexer.styles; style != null; style=style.next)
257         {
258             if (style.tag.equals(tag) &&
259                 style.properties.equals(properties))
260                 return style.tagClass;
261         }
262
263         style = new Style(tag, gensymClass(tag), properties, lexer.styles);
264         lexer.styles = style;
265         return style.tagClass;
266     }
267
268     /*
269      Find style attribute in node, and replace it
270      by corresponding class attribute. Search for
271      class in style dictionary otherwise gensym
272      new class and add to dictionary.
273
274      If the node already has a class attribute, the new class name is appended to it
275     */

276     private void style2Rule(Lexer lexer, Node node)
277     {
278         AttVal styleattr, classattr;
279         String JavaDoc classname;
280
281         styleattr = node.getAttrByName("style");
282
283         if (styleattr != null)
284         {
285                 classname = findStyle(lexer, node.element, styleattr.value);
286                 classattr = node.getAttrByName("class");
287
288                 /*
289          if there already is a class attribute
290          then append class name after a space
291         */

292                 if (classattr != null)
293         {
294                         classattr.value = classattr.value + " " + classname;
295                         node.removeAttribute(styleattr);
296         }
297         else /* reuse style attribute for class attribute */
298         {
299                         styleattr.attribute = "class";
300                         styleattr.value = classname;
301         }
302         }
303     }
304
305     private void addColorRule(Lexer lexer, String JavaDoc selector, String JavaDoc color)
306     {
307         if (color != null)
308         {
309             lexer.addStringLiteral(selector);
310             lexer.addStringLiteral(" { color: ");
311             lexer.addStringLiteral(color);
312             lexer.addStringLiteral(" }\n");
313         }
314     }
315
316     /*
317      move presentation attribs from body to style element
318
319      background="foo" -> body { background-image: url(foo) }
320      bgcolor="foo" -> body { background-color: foo }
321      text="foo" -> body { color: foo }
322      link="foo" -> :link { color: foo }
323      vlink="foo" -> :visited { color: foo }
324      alink="foo" -> :active { color: foo }
325     */

326     private void cleanBodyAttrs(Lexer lexer, Node body)
327     {
328         AttVal attr;
329         String JavaDoc bgurl = null;
330         String JavaDoc bgcolor = null;
331         String JavaDoc color = null;
332     
333         attr = body.getAttrByName("background");
334
335         if (attr != null)
336         {
337             bgurl = attr.value;
338             attr.value = null;
339             body.removeAttribute(attr);
340         }
341
342         attr = body.getAttrByName("bgcolor");
343
344         if (attr != null)
345         {
346             bgcolor = attr.value;
347             attr.value = null;
348             body.removeAttribute(attr);
349         }
350
351         attr = body.getAttrByName("text");
352
353         if (attr != null)
354         {
355             color = attr.value;
356             attr.value = null;
357             body.removeAttribute(attr);
358         }
359
360         if (bgurl != null || bgcolor != null || color != null)
361         {
362             lexer.addStringLiteral(" body {\n");
363
364             if (bgurl != null)
365             {
366                 lexer.addStringLiteral(" background-image: url(");
367                 lexer.addStringLiteral(bgurl);
368                 lexer.addStringLiteral(");\n");
369             }
370
371             if (bgcolor != null)
372             {
373                 lexer.addStringLiteral(" background-color: ");
374                 lexer.addStringLiteral(bgcolor);
375                 lexer.addStringLiteral(";\n");
376             }
377
378             if (color != null)
379             {
380                 lexer.addStringLiteral(" color: ");
381                 lexer.addStringLiteral(color);
382                 lexer.addStringLiteral(";\n");
383             }
384
385             lexer.addStringLiteral(" }\n");
386         }
387
388         attr = body.getAttrByName("link");
389
390         if (attr != null)
391         {
392             addColorRule(lexer, " :link", attr.value);
393             body.removeAttribute(attr);
394         }
395
396         attr = body.getAttrByName("vlink");
397
398         if (attr != null)
399         {
400             addColorRule(lexer, " :visited", attr.value);
401             body.removeAttribute(attr);
402         }
403
404         attr = body.getAttrByName("alink");
405
406         if (attr != null)
407         {
408             addColorRule(lexer, " :active", attr.value);
409             body.removeAttribute(attr);
410         }
411     }
412
413     private boolean niceBody(Lexer lexer, Node doc)
414     {
415         Node body = doc.findBody(lexer.configuration.tt);
416
417         if (body != null)
418         {
419             if (
420                 body.getAttrByName("background") != null ||
421                 body.getAttrByName("bgcolor") != null ||
422                 body.getAttrByName("text") != null ||
423                 body.getAttrByName("link") != null ||
424                 body.getAttrByName("vlink") != null ||
425                 body.getAttrByName("alink") != null
426                )
427             {
428                 lexer.badLayout |= Report.USING_BODY;
429                 return false;
430             }
431         }
432
433         return true;
434     }
435
436     /* create style element using rules from dictionary */
437     private void createStyleElement(Lexer lexer, Node doc)
438     {
439         Node node, head, body;
440         Style style;
441         AttVal av;
442
443         if (lexer.styles == null && niceBody(lexer, doc))
444             return;
445
446         node = lexer.newNode(Node.StartTag, null, 0, 0, "style");
447         node.implicit = true;
448
449         /* insert type attribute */
450         av = new AttVal(null, null, '"', "type", "text/css");
451         av.dict = AttributeTable.getDefaultAttributeTable().findAttribute(av);
452         node.attributes = av;
453
454         body = doc.findBody(lexer.configuration.tt);
455
456         lexer.txtstart = lexer.lexsize;
457
458         if (body != null)
459             cleanBodyAttrs(lexer, body);
460
461         for (style = lexer.styles; style != null; style = style.next)
462         {
463             lexer.addCharToLexer(' ');
464             lexer.addStringLiteral(style.tag);
465             lexer.addCharToLexer('.');
466             lexer.addStringLiteral(style.tagClass);
467             lexer.addCharToLexer(' ');
468             lexer.addCharToLexer('{');
469             lexer.addStringLiteral(style.properties);
470             lexer.addCharToLexer('}');
471             lexer.addCharToLexer('\n');
472         }
473
474         lexer.txtend = lexer.lexsize;
475
476         Node.insertNodeAtEnd(node,
477                              lexer.newNode(Node.TextNode,
478                                       lexer.lexbuf,
479                                       lexer.txtstart,
480                                       lexer.txtend));
481
482         /*
483          now insert style element into document head
484
485          doc is root node. search its children for html node
486          the head node should be first child of html node
487         */

488
489         head = doc.findHEAD(lexer.configuration.tt);
490     
491         if (head != null)
492             Node.insertNodeAtEnd(head, node);
493     }
494
495     /* ensure bidirectional links are consistent */
496     private void fixNodeLinks(Node node)
497     {
498         Node child;
499
500         if (node.prev != null)
501             node.prev.next = node;
502         else
503             node.parent.content = node;
504
505         if (node.next != null)
506             node.next.prev = node;
507         else
508             node.parent.last = node;
509
510         for (child = node.content; child != null; child = child.next)
511             child.parent = node;
512     }
513
514     /*
515      used to strip child of node when
516      the node has one and only one child
517     */

518     private void stripOnlyChild(Node node)
519     {
520         Node child;
521
522         child = node.content;
523         node.content = child.content;
524         node.last = child.last;
525         child.content = null;
526
527         for (child = node.content; child != null; child = child.next)
528             child.parent = node;
529     }
530
531     /* used to strip font start and end tags */
532     private void discardContainer(Node element, MutableObject pnode)
533     {
534         Node node;
535         Node parent = element.parent;
536
537         if (element.content != null)
538         {
539             element.last.next = element.next;
540
541             if (element.next != null)
542             {
543                 element.next.prev = element.last;
544                 element.last.next = element.next;
545             }
546             else
547                 parent.last = element.last;
548
549             if (element.prev != null)
550             {
551                 element.content.prev = element.prev;
552                 element.prev.next = element.content;
553             }
554             else
555                 parent.content = element.content;
556
557             for (node = element.content; node != null; node = node.next)
558                 node.parent = parent;
559
560             pnode.setObject(element.content);
561         }
562         else
563         {
564             if (element.next != null)
565                 element.next.prev = element.prev;
566             else
567                 parent.last = element.prev;
568
569             if (element.prev != null)
570                 element.prev.next = element.next;
571             else
572                 parent.content = element.next;
573
574             pnode.setObject(element.next);
575         }
576
577         element.next = null;
578         element.content = null;
579     }
580
581     /*
582      Add style property to element, creating style
583      attribute as needed and adding ; delimiter
584     */

585     private void addStyleProperty(Node node, String JavaDoc property)
586     {
587         AttVal av;
588
589         for (av = node.attributes; av != null; av = av.next)
590         {
591             if (av.attribute.equals("style"))
592                 break;
593         }
594
595         /* if style attribute already exists then insert property */
596
597         if (av != null)
598         {
599             String JavaDoc s;
600
601             s = addProperty(av.value, property);
602             av.value = s;
603         }
604         else /* else create new style attribute */
605         {
606             av = new AttVal(node.attributes, null, '"', "style", property);
607             av.dict = AttributeTable.getDefaultAttributeTable().findAttribute(av);
608             node.attributes = av;
609         }
610     }
611
612     /*
613       Create new string that consists of the
614       combined style properties in s1 and s2
615
616       To merge property lists, we build a linked
617       list of property/values and insert properties
618       into the list in order, merging values for
619       the same property name.
620     */

621     private String JavaDoc mergeProperties(String JavaDoc s1, String JavaDoc s2)
622     {
623         String JavaDoc s;
624         StyleProp prop;
625
626         prop = createProps(null, s1);
627         prop = createProps(prop, s2);
628         s = createPropString(prop);
629         return s;
630     }
631
632     private void mergeStyles(Node node, Node child)
633     {
634         AttVal av;
635         String JavaDoc s1, s2, style;
636
637         for (s2 = null, av = child.attributes; av != null; av = av.next)
638         {
639             if (av.attribute.equals("style"))
640             {
641                 s2 = av.value;
642                 break;
643             }
644         }
645
646         for (s1 = null, av = node.attributes; av != null; av = av.next)
647         {
648             if (av.attribute.equals("style"))
649             {
650                 s1 = av.value;
651                 break;
652             }
653         }
654
655         if (s1 != null)
656         {
657             if (s2 != null) /* merge styles from both */
658             {
659                 style = mergeProperties(s1, s2);
660                 av.value = style;
661             }
662         }
663         else if (s2 != null) /* copy style of child */
664         {
665             av = new AttVal(node.attributes, null, '"', "style", s2);
666             av.dict = AttributeTable.getDefaultAttributeTable().findAttribute(av);
667             node.attributes = av;
668         }
669     }
670
671     private String JavaDoc fontSize2Name(String JavaDoc size)
672     {
673         /*
674         String[] sizes =
675         {
676             "50%",
677             "60%",
678             "80%",
679             null,
680             "120%",
681             "150%",
682             "200%"
683         };
684         */

685
686         String JavaDoc[] sizes =
687         {
688             "60%",
689             "70%",
690             "80%",
691             null,
692             "120%",
693             "150%",
694             "200%"
695         };
696         String JavaDoc buf;
697
698         if (size.length() > 0 &&
699             '0' <= size.charAt(0) && size.charAt(0) <= '6')
700         {
701             int n = size.charAt(0) - '0';
702             return sizes[n];
703         }
704
705         if (size.length() > 0 && size.charAt(0) == '-')
706         {
707             if (size.length() > 1 &&
708                 '0' <= size.charAt(1) && size.charAt(1) <= '6')
709             {
710                 int n = size.charAt(1) - '0';
711                 double x;
712
713                 for (x = 1.0; n > 0; --n)
714                     x *= 0.8;
715
716                 x *= 100.0;
717                 buf = "" + (int)x + "%";
718
719                 return buf;
720             }
721
722             return "smaller"; /*"70%"; */
723         }
724
725         if (size.length() > 1 &&
726             '0' <= size.charAt(1) && size.charAt(1) <= '6')
727         {
728             int n = size.charAt(1) - '0';
729             double x;
730
731             for (x = 1.0; n > 0; --n)
732                 x *= 1.2;
733
734             x *= 100.0;
735             buf = "" + (int)x + "%";
736
737             return buf;
738         }
739
740         return "larger"; /* "140%" */
741     }
742
743     private void addFontFace(Node node, String JavaDoc face)
744     {
745         addStyleProperty(node, "font-family: " + face);
746     }
747
748     private void addFontSize(Node node, String JavaDoc size)
749     {
750         String JavaDoc value;
751
752         if (size.equals("6") && node.tag == tt.tagP)
753         {
754             node.element = "h1";
755             tt.findTag(node);
756             return;
757         }
758
759         if (size.equals("5") && node.tag == tt.tagP)
760         {
761             node.element = "h2";
762             tt.findTag(node);
763             return;
764         }
765
766         if (size.equals("4") && node.tag == tt.tagP)
767         {
768             node.element = "h3";
769             tt.findTag(node);
770             return;
771         }
772
773         value = fontSize2Name(size);
774
775         if (value != null)
776         {
777             addStyleProperty(node, "font-size: " + value);
778         }
779     }
780
781     private void addFontColor(Node node, String JavaDoc color)
782     {
783         addStyleProperty(node, "color: " + color);
784     }
785
786     private void addAlign(Node node, String JavaDoc align)
787     {
788         /* force alignment value to lower case */
789         addStyleProperty(node, "text-align: " + align.toLowerCase());
790     }
791
792     /*
793      add style properties to node corresponding to
794      the font face, size and color attributes
795     */

796     private void addFontStyles(Node node, AttVal av)
797     {
798         while (av != null)
799         {
800             if (av.attribute.equals("face"))
801                 addFontFace(node, av.value);
802             else if (av.attribute.equals("size"))
803                 addFontSize(node, av.value);
804             else if (av.attribute.equals("color"))
805                 addFontColor(node, av.value);
806
807             av = av.next;
808         }
809     }
810
811     /*
812         Symptom: <p align=center>
813         Action: <p style="text-align: center">
814     */

815     private void textAlign(Lexer lexer, Node node)
816     {
817         AttVal av, prev;
818
819         prev = null;
820
821         for (av = node.attributes; av != null; av = av.next)
822         {
823             if (av.attribute.equals("align"))
824             {
825                 if (prev != null)
826                     prev.next = av.next;
827                 else
828                     node.attributes = av.next;
829
830                 if (av.value != null)
831                 {
832                     addAlign(node, av.value);
833                 }
834
835                 break;
836             }
837
838             prev = av;
839         }
840     }
841
842     /*
843        The clean up rules use the pnode argument to return the
844        next node when the original node has been deleted
845     */

846
847     /*
848         Symptom: <dir> <li> where <li> is only child
849         Action: coerce <dir> <li> to <div> with indent.
850     */

851
852     private boolean dir2Div(Lexer lexer, Node node, MutableObject pnode)
853     {
854         Node child;
855
856         if (node.tag == tt.tagDir ||
857             node.tag == tt.tagUl ||
858             node.tag == tt.tagOl)
859         {
860             child = node.content;
861
862             if (child == null)
863                 return false;
864
865             /* check child has no peers */
866
867             if (child.next != null)
868                 return false;
869
870             if (child.tag != tt.tagLi)
871                 return false;
872
873             if (!child.implicit)
874                 return false;
875
876             /* coerce dir to div */
877
878             node.tag = tt.tagDiv;
879             node.element = "div";
880             addStyleProperty(node, "margin-left: 2em");
881             stripOnlyChild(node);
882             return true;
883
884 //#if 0
885
//Node content;
886
//Node last;
887
//content = child.content;
888
//last = child.last;
889
//child.content = null;
890

891             /* adjust parent and set margin on contents of <li> */
892
893             //for (child = content; child != null; child = child.next)
894
//{
895
// child.parent = node.parent;
896
// addStyleProperty(child, "margin-left: 1em");
897
//}
898

899             /* hook first/last into sequence */
900
901             //if (content != null)
902
//{
903
// content.prev = node.prev;
904
// last.next = node.next;
905
// fixNodeLinks(content);
906
// fixNodeLinks(last);
907
//}
908

909             //node.next = null;
910

911             /* ensure that new node is cleaned */
912             //pnode.setObject(cleanNode(lexer, content));
913
//return true;
914
//#endif
915
}
916
917         return false;
918     }
919
920     /*
921         Symptom: <center>
922         Action: replace <center> by <div style="text-align: center">
923     */

924
925     private boolean center2Div(Lexer lexer, Node node, MutableObject pnode)
926     {
927         if (node.tag == tt.tagCenter)
928         {
929             if (lexer.configuration.DropFontTags)
930             {
931                 if (node.content != null)
932                 {
933                     Node last = node.last;
934                     Node parent = node.parent;
935
936                     discardContainer(node, pnode);
937
938                     node = lexer.inferredTag("br");
939
940                     if (last.next != null)
941                         last.next.prev = node;
942
943                     node.next = last.next;
944                     last.next = node;
945                     node.prev = last;
946
947                     if (parent.last == last)
948                         parent.last = node;
949
950                     node.parent = parent;
951                 }
952                 else
953