KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > w3c > tidy > Clean


1 /*
2  * @(#)Clean.java 1.11 2000/08/16
3  *
4  */

5
6 package org.w3c.tidy;
7
8 /**
9  *
10  * Clean up misuse of presentation markup
11  *
12  * (c) 1998-2000 (W3C) MIT, INRIA, Keio University
13  * See Tidy.java for the copyright notice.
14  * Derived from <a HREF="http://www.w3.org/People/Raggett/tidy">
15  * HTML Tidy Release 4 Aug 2000</a>
16  *
17  * @author Dave Raggett <dsr@w3.org>
18  * @author Andy Quick <ac.quick@sympatico.ca> (translation to Java)
19  * @version 1.0, 1999/05/22
20  * @version 1.0.1, 1999/05/29
21  * @version 1.1, 1999/06/18 Java Bean
22  * @version 1.2, 1999/07/10 Tidy Release 7 Jul 1999
23  * @version 1.3, 1999/07/30 Tidy Release 26 Jul 1999
24  * @version 1.4, 1999/09/04 DOM support
25  * @version 1.5, 1999/10/23 Tidy Release 27 Sep 1999
26  * @version 1.6, 1999/11/01 Tidy Release 22 Oct 1999
27  * @version 1.7, 1999/12/06 Tidy Release 30 Nov 1999
28  * @version 1.8, 2000/01/22 Tidy Release 13 Jan 2000
29  * @version 1.9, 2000/06/03 Tidy Release 30 Apr 2000
30  * @version 1.10, 2000/07/22 Tidy Release 8 Jul 2000
31  * @version 1.11, 2000/08/16 Tidy Release 4 Aug 2000
32  */

33
34 /*
35   Filters from other formats such as Microsoft Word
36   often make excessive use of presentation markup such
37   as font tags, B, I, and the align attribute. By applying
38   a set of production rules, it is straightforward to
39   transform this to use CSS.
40
41   Some rules replace some of the children of an element by
42   style properties on the element, e.g.
43
44   <p><b>...</b></p> -> <p style="font-weight: bold">...</p>
45
46   Such rules are applied to the element's content and then
47   to the element itself until no more of the rules apply.
48   Having applied all the rules to an element, it will have
49   a style attribute with one or more properties.
50
51   Other rules strip the element they apply to, replacing
52   it by style properties on the contents, e.g.
53   
54   <dir><li><p>...</li></dir> -> <p style="margin-left: 1em">...
55       
56   These rules are applied to an element before processing
57   its content and replace the current element by the first
58   element in the exposed content.
59
60   After applying both sets of rules, you can replace the
61   style attribute by a class value and style rule in the
62   document head. To support this, an association of styles
63   and class names is built.
64
65   A naive approach is to rely on string matching to test
66   when two property lists are the same. A better approach
67   would be to first sort the properties before matching.
68 */

69
70 public class Clean {
71
72     private int classNum = 1;
73
74     private TagTable tt;
75
/**
 * Creates a cleaner bound to the given tag table.
 *
 * @param tt tag table stored in this instance and consulted by the
 *           clean-up rules when they compare or replace node tags
 */
public Clean(TagTable tt) {
    this.tt = tt;
}
80
81     private StyleProp insertProperty(StyleProp props, String JavaDoc name,
82                                             String JavaDoc value)
83     {
84         StyleProp first, prev, prop;
85         int cmp;
86
87         prev = null;
88         first = props;
89
90         while (props != null)
91         {
92             cmp = props.name.compareTo(name);
93
94             if (cmp == 0)
95             {
96                 /* this property is already defined, ignore new value */
97                 return first;
98             }
99
100             if (cmp > 0) // props.name > name
101
{
102                 /* insert before this */
103
104                 prop = new StyleProp(name, value, props);
105
106                 if (prev != null)
107                     prev.next = prop;
108                 else
109                     first = prop;
110
111                 return first;
112             }
113
114             prev = props;
115             props = props.next;
116         }
117
118         prop = new StyleProp(name, value);
119
120         if (prev != null)
121             prev.next = prop;
122         else
123             first = prop;
124
125         return first;
126     }
127
128     /*
129      Create sorted linked list of properties from style string.
130      The description of temporarily placing nulls in place of ':'
131      and ';' (in a copy of the string) applies to the C code this
132      was translated from; this Java version leaves the string as is
133      and extracts each property name and value with substring().
134     */

135     private StyleProp createProps(StyleProp prop, String JavaDoc style)
136     {
137         int name_end;
138         int value_end;
139         int value_start = 0;
140         int name_start = 0;
141         boolean more;
142
143         name_start = 0;
144         while (name_start < style.length())
145         {
146             while (name_start < style.length() &&
147                        style.charAt(name_start) == ' ')
148                 ++name_start;
149
150             name_end = name_start;
151
152             while (name_end < style.length())
153             {
154                 if (style.charAt(name_end) == ':')
155                 {
156                     value_start = name_end + 1;
157                     break;
158                 }
159
160                 ++name_end;
161             }
162
163             if (name_end >= style.length() || style.charAt(name_end) != ':')
164                 break;
165
166             while (value_start < style.length() &&
167                        style.charAt(value_start) == ' ')
168                 ++value_start;
169
170             value_end = value_start;
171             more = false;
172
173             while (value_end < style.length())
174             {
175                 if (style.charAt(value_end) == ';')
176                 {
177                     more = true;
178                     break;
179                 }
180
181                 ++value_end;
182             }
183
184             prop = insertProperty(prop,
185                                   style.substring(name_start, name_end),
186                                   style.substring(value_start, value_end));
187
188             if (more)
189             {
190                 name_start = value_end + 1;
191                 continue;
192             }
193
194             break;
195         }
196
197         return prop;
198     }
199
200     private String JavaDoc createPropString(StyleProp props)
201     {
202         String JavaDoc style = "";
203         int len;
204         StyleProp prop;
205
206         /* compute length */
207
208         for (len = 0, prop = props; prop != null; prop = prop.next)
209         {
210             len += prop.name.length() + 2;
211             len += prop.value.length() + 2;
212         }
213
214         for (prop = props; prop != null; prop = prop.next)
215         {
216             style = style.concat(prop.name);
217             style = style.concat(": ");
218
219             style = style.concat(prop.value);
220
221             if (prop.next == null)
222                 break;
223
224             style = style.concat("; ");
225         }
226
227         return style;
228     }
229
230     /*
231       create string with merged properties
232     */

233     private String JavaDoc addProperty(String JavaDoc style, String JavaDoc property)
234     {
235         StyleProp prop;
236
237         prop = createProps(null, style);
238         prop = createProps(prop, property);
239         style = createPropString(prop);
240         return style;
241     }
242
243     private String JavaDoc gensymClass(String JavaDoc tag)
244     {
245         String JavaDoc str;
246
247         str = "c" + classNum;
248         classNum++;
249         return str;
250     }
251
252     private String JavaDoc findStyle(Lexer lexer, String JavaDoc tag, String JavaDoc properties)
253     {
254         Style style;
255
256         for (style = lexer.styles; style != null; style=style.next)
257         {
258             if (style.tag.equals(tag) &&
259                 style.properties.equals(properties))
260                 return style.tagClass;
261         }
262
263         style = new Style(tag, gensymClass(tag), properties, lexer.styles);
264         lexer.styles = style;
265         return style.tagClass;
266     }
267
268     /*
269      Find style attribute in node, and replace it
270      by corresponding class attribute. Search for
271      class in style dictionary otherwise gensym
272      new class and add to dictionary.
273
274      If the node already has a class attribute, the new class name is appended to it
275     */

276     private void style2Rule(Lexer lexer, Node node)
277     {
278         AttVal styleattr, classattr;
279         String JavaDoc classname;
280
281         styleattr = node.getAttrByName("style");
282
283         if (styleattr != null)
284         {
285                 classname = findStyle(lexer, node.element, styleattr.value);
286                 classattr = node.getAttrByName("class");
287
288                 /*
289          if there already is a class attribute
290          then append class name after a space
291         */

292                 if (classattr != null)
293         {
294                         classattr.value = classattr.value + " " + classname;
295                         node.removeAttribute(styleattr);
296         }
297         else /* reuse style attribute for class attribute */
298         {
299                         styleattr.attribute = "class";
300                         styleattr.value = classname;
301         }
302         }
303     }
304
305     private void addColorRule(Lexer lexer, String JavaDoc selector, String JavaDoc color)
306     {
307         if (color != null)
308         {
309             lexer.addStringLiteral(selector);
310             lexer.addStringLiteral(" { color: ");
311             lexer.addStringLiteral(color);
312             lexer.addStringLiteral(" }\n");
313         }
314     }
315
316     /*
317      move presentation attribs from body to style element
318
319      background="foo" -> body { background-image: url(foo) }
320      bgcolor="foo" -> body { background-color: foo }
321      text="foo" -> body { color: foo }
322      link="foo" -> :link { color: foo }
323      vlink="foo" -> :visited { color: foo }
324      alink="foo" -> :active { color: foo }
325     */

326     private void cleanBodyAttrs(Lexer lexer, Node body)
327     {
328         AttVal attr;
329         String JavaDoc bgurl = null;
330         String JavaDoc bgcolor = null;
331         String JavaDoc color = null;
332     
333         attr = body.getAttrByName("background");
334
335         if (attr != null)
336         {
337             bgurl = attr.value;
338             attr.value = null;
339             body.removeAttribute(attr);
340         }
341
342         attr = body.getAttrByName("bgcolor");
343
344         if (attr != null)
345         {
346             bgcolor = attr.value;
347             attr.value = null;
348             body.removeAttribute(attr);
349         }
350
351         attr = body.getAttrByName("text");
352
353         if (attr != null)
354         {
355             color = attr.value;
356             attr.value = null;
357             body.removeAttribute(attr);
358         }
359
360         if (bgurl != null || bgcolor != null || color != null)
361         {
362             lexer.addStringLiteral(" body {\n");
363
364             if (bgurl != null)
365             {
366                 lexer.addStringLiteral(" background-image: url(");
367                 lexer.addStringLiteral(bgurl);
368                 lexer.addStringLiteral(");\n");
369             }
370
371             if (bgcolor != null)
372             {
373                 lexer.addStringLiteral(" background-color: ");
374                 lexer.addStringLiteral(bgcolor);
375                 lexer.addStringLiteral(";\n");
376             }
377
378             if (color != null)
379             {
380                 lexer.addStringLiteral(" color: ");
381                 lexer.addStringLiteral(color);
382                 lexer.addStringLiteral(";\n");
383             }
384
385             lexer.addStringLiteral(" }\n");
386         }
387
388         attr = body.getAttrByName("link");
389
390         if (attr != null)
391         {
392             addColorRule(lexer, " :link", attr.value);
393             body.removeAttribute(attr);
394         }
395
396         attr = body.getAttrByName("vlink");
397
398         if (attr != null)
399         {
400             addColorRule(lexer, " :visited", attr.value);
401             body.removeAttribute(attr);
402         }
403
404         attr = body.getAttrByName("alink");
405
406         if (attr != null)
407         {
408             addColorRule(lexer, " :active", attr.value);
409             body.removeAttribute(attr);
410         }
411     }
412
413     private boolean niceBody(Lexer lexer, Node doc)
414     {
415         Node body = doc.findBody(lexer.configuration.tt);
416
417         if (body != null)
418         {
419             if (
420                 body.getAttrByName("background") != null ||
421                 body.getAttrByName("bgcolor") != null ||
422                 body.getAttrByName("text") != null ||
423                 body.getAttrByName("link") != null ||
424                 body.getAttrByName("vlink") != null ||
425                 body.getAttrByName("alink") != null
426                )
427             {
428                 lexer.badLayout |= Report.USING_BODY;
429                 return false;
430             }
431         }
432
433         return true;
434     }
435
436     /* create style element using rules from dictionary */
437     private void createStyleElement(Lexer lexer, Node doc)
438     {
439         Node node, head, body;
440         Style style;
441         AttVal av;
442
443         if (lexer.styles == null && niceBody(lexer, doc))
444             return;
445
446         node = lexer.newNode(Node.StartTag, null, 0, 0, "style");
447         node.implicit = true;
448
449         /* insert type attribute */
450         av = new AttVal(null, null, '"', "type", "text/css");
451         av.dict = AttributeTable.getDefaultAttributeTable().findAttribute(av);
452         node.attributes = av;
453
454         body = doc.findBody(lexer.configuration.tt);
455
456         lexer.txtstart = lexer.lexsize;
457
458         if (body != null)
459             cleanBodyAttrs(lexer, body);
460
461         for (style = lexer.styles; style != null; style = style.next)
462         {
463             lexer.addCharToLexer(' ');
464             lexer.addStringLiteral(style.tag);
465             lexer.addCharToLexer('.');
466             lexer.addStringLiteral(style.tagClass);
467             lexer.addCharToLexer(' ');
468             lexer.addCharToLexer('{');
469             lexer.addStringLiteral(style.properties);
470             lexer.addCharToLexer('}');
471             lexer.addCharToLexer('\n');
472         }
473
474         lexer.txtend = lexer.lexsize;
475
476         Node.insertNodeAtEnd(node,
477                              lexer.newNode(Node.TextNode,
478                                       lexer.lexbuf,
479                                       lexer.txtstart,
480                                       lexer.txtend));
481
482         /*
483          now insert style element into document head
484
485          doc is root node. search its children for html node
486          the head node should be first child of html node
487         */

488
489         head = doc.findHEAD(lexer.configuration.tt);
490     
491         if (head != null)
492             Node.insertNodeAtEnd(head, node);
493     }
494
495     /* ensure bidirectional links are consistent */
496     private void fixNodeLinks(Node node)
497     {
498         Node child;
499
500         if (node.prev != null)
501             node.prev.next = node;
502         else
503             node.parent.content = node;
504
505         if (node.next != null)
506             node.next.prev = node;
507         else
508             node.parent.last = node;
509
510         for (child = node.content; child != null; child = child.next)
511             child.parent = node;
512     }
513
514     /*
515      used to strip child of node when
516      the node has one and only one child
517     */

518     private void stripOnlyChild(Node node)
519     {
520         Node child;
521
522         child = node.content;
523         node.content = child.content;
524         node.last = child.last;
525         child.content = null;
526
527         for (child = node.content; child != null; child = child.next)
528             child.parent = node;
529     }
530
531     /* used to strip font start and end tags */
532     private void discardContainer(Node element, MutableObject pnode)
533     {
534         Node node;
535         Node parent = element.parent;
536
537         if (element.content != null)
538         {
539             element.last.next = element.next;
540
541             if (element.next != null)
542             {
543                 element.next.prev = element.last;
544                 element.last.next = element.next;
545             }
546             else
547                 parent.last = element.last;
548
549             if (element.prev != null)
550             {
551                 element.content.prev = element.prev;
552                 element.prev.next = element.content;
553             }
554             else
555                 parent.content = element.content;
556
557             for (node = element.content; node != null; node = node.next)
558                 node.parent = parent;
559
560             pnode.setObject(element.content);
561         }
562         else
563         {
564             if (element.next != null)
565                 element.next.prev = element.prev;
566             else
567                 parent.last = element.prev;
568
569             if (element.prev != null)
570                 element.prev.next = element.next;
571             else
572                 parent.content = element.next;
573
574             pnode.setObject(element.next);
575         }
576
577         element.next = null;
578         element.content = null;
579     }
580
581     /*
582      Add style property to element, creating style
583      attribute as needed and adding ; delimiter
584     */

585     private void addStyleProperty(Node node, String JavaDoc property)
586     {
587         AttVal av;
588
589         for (av = node.attributes; av != null; av = av.next)
590         {
591             if (av.attribute.equals("style"))
592                 break;
593         }
594
595         /* if style attribute already exists then insert property */
596
597         if (av != null)
598         {
599             String JavaDoc s;
600
601             s = addProperty(av.value, property);
602             av.value = s;
603         }
604         else /* else create new style attribute */
605         {
606             av = new AttVal(node.attributes, null, '"', "style", property);
607             av.dict = AttributeTable.getDefaultAttributeTable().findAttribute(av);
608             node.attributes = av;
609         }
610     }
611
612     /*
613       Create new string that consists of the
614       combined style properties in s1 and s2
615
616       To merge property lists, we build a linked
617       list of property/values and insert properties
618       into the list in order; when a property name
619       occurs more than once, the first value is kept.
620     */

621     private String JavaDoc mergeProperties(String JavaDoc s1, String JavaDoc s2)
622     {
623         String JavaDoc s;
624         StyleProp prop;
625
626         prop = createProps(null, s1);
627         prop = createProps(prop, s2);
628         s = createPropString(prop);
629         return s;
630     }
631
632     private void mergeStyles(Node node, Node child)
633     {
634         AttVal av;
635         String JavaDoc s1, s2, style;
636
637         for (s2 = null, av = child.attributes; av != null; av = av.next)
638         {
639             if (av.attribute.equals("style"))
640             {
641                 s2 = av.value;
642                 break;
643             }
644         }
645
646         for (s1 = null, av = node.attributes; av != null; av = av.next)
647         {
648             if (av.attribute.equals("style"))
649             {
650                 s1 = av.value;
651                 break;
652             }
653         }
654
655         if (s1 != null)
656         {
657             if (s2 != null) /* merge styles from both */
658             {
659                 style = mergeProperties(s1, s2);
660                 av.value = style;
661             }
662         }
663         else if (s2 != null) /* copy style of child */
664         {
665             av = new AttVal(node.attributes, null, '"', "style", s2);
666             av.dict = AttributeTable.getDefaultAttributeTable().findAttribute(av);
667             node.attributes = av;
668         }
669     }
670
671     private String JavaDoc fontSize2Name(String JavaDoc size)
672     {
673         /*
674         String[] sizes =
675         {
676             "50%",
677             "60%",
678             "80%",
679             null,
680             "120%",
681             "150%",
682             "200%"
683         };
684         */

685
686         String JavaDoc[] sizes =
687         {
688             "60%",
689             "70%",
690             "80%",
691             null,
692             "120%",
693             "150%",
694             "200%"
695         };
696         String JavaDoc buf;
697
698         if (size.length() > 0 &&
699             '0' <= size.charAt(0) && size.charAt(0) <= '6')
700         {
701             int n = size.charAt(0) - '0';
702             return sizes[n];
703         }
704
705         if (size.length() > 0 && size.charAt(0) == '-')
706         {
707             if (size.length() > 1 &&
708                 '0' <= size.charAt(1) && size.charAt(1) <= '6')
709             {
710                 int n = size.charAt(1) - '0';
711                 double x;
712
713                 for (x = 1.0; n > 0; --n)
714                     x *= 0.8;
715
716                 x *= 100.0;
717                 buf = "" + (int)x + "%";
718
719                 return buf;
720             }
721
722             return "smaller"; /*"70%"; */
723         }
724
725         if (size.length() > 1 &&
726             '0' <= size.charAt(1) && size.charAt(1) <= '6')
727         {
728             int n = size.charAt(1) - '0';
729             double x;
730
731             for (x = 1.0; n > 0; --n)
732                 x *= 1.2;
733
734             x *= 100.0;
735             buf = "" + (int)x + "%";
736
737             return buf;
738         }
739
740         return "larger"; /* "140%" */
741     }
742
743     private void addFontFace(Node node, String JavaDoc face)
744     {
745         addStyleProperty(node, "font-family: " + face);
746     }
747
748     private void addFontSize(Node node, String JavaDoc size)
749     {
750         String JavaDoc value;
751
752         if (size.equals("6") && node.tag == tt.tagP)
753         {
754             node.element = "h1";
755             tt.findTag(node);
756             return;
757         }
758
759         if (size.equals("5") && node.tag == tt.tagP)
760         {
761             node.element = "h2";
762             tt.findTag(node);
763             return;
764         }
765
766         if (size.equals("4") && node.tag == tt.tagP)
767         {
768             node.element = "h3";
769             tt.findTag(node);
770             return;
771         }
772
773         value = fontSize2Name(size);
774
775         if (value != null)
776         {
777             addStyleProperty(node, "font-size: " + value);
778         }
779     }
780
781     private void addFontColor(Node node, String JavaDoc color)
782     {
783         addStyleProperty(node, "color: " + color);
784     }
785
786     private void addAlign(Node node, String JavaDoc align)
787     {
788         /* force alignment value to lower case */
789         addStyleProperty(node, "text-align: " + align.toLowerCase());
790     }
791
792     /*
793      add style properties to node corresponding to
794      the font face, size and color attributes
795     */

796     private void addFontStyles(Node node, AttVal av)
797     {
798         while (av != null)
799         {
800             if (av.attribute.equals("face"))
801                 addFontFace(node, av.value);
802             else if (av.attribute.equals("size"))
803                 addFontSize(node, av.value);
804             else if (av.attribute.equals("color"))
805                 addFontColor(node, av.value);
806
807             av = av.next;
808         }
809     }
810
811     /*
812         Symptom: <p align=center>
813         Action: <p style="text-align: center">
814     */

815     private void textAlign(Lexer lexer, Node node)
816     {
817         AttVal av, prev;
818
819         prev = null;
820
821         for (av = node.attributes; av != null; av = av.next)
822         {
823             if (av.attribute.equals("align"))
824             {
825                 if (prev != null)
826                     prev.next = av.next;
827                 else
828                     node.attributes = av.next;
829
830                 if (av.value != null)
831                 {
832                     addAlign(node, av.value);
833                 }
834
835                 break;
836             }
837
838             prev = av;
839         }
840     }
841
842     /*
843        The clean up rules use the pnode argument to return the
844        next node when the original node has been deleted
845     */

846
847     /*
848         Symptom: <dir> <li> where <li> is only child
849         Action: coerce <dir> <li> to <div> with indent.
850     */

851
852     private boolean dir2Div(Lexer lexer, Node node, MutableObject pnode)
853     {
854         Node child;
855
856         if (node.tag == tt.tagDir ||
857             node.tag == tt.tagUl ||
858             node.tag == tt.tagOl)
859         {
860             child = node.content;
861
862             if (child == null)
863                 return false;
864
865             /* check child has no peers */
866
867             if (child.next != null)
868                 return false;
869
870             if (child.tag != tt.tagLi)
871                 return false;
872
873             if (!child.implicit)
874                 return false;
875
876             /* coerce dir to div */
877
878             node.tag = tt.tagDiv;
879             node.element = "div";
880             addStyleProperty(node, "margin-left: 2em");
881             stripOnlyChild(node);
882             return true;
883
884 //#if 0
885
//Node content;
886
//Node last;
887
//content = child.content;
888
//last = child.last;
889
//child.content = null;
890

891             /* adjust parent and set margin on contents of <li> */
892
893             //for (child = content; child != null; child = child.next)
894
//{
895
// child.parent = node.parent;
896
// addStyleProperty(child, "margin-left: 1em");
897
//}
898

899             /* hook first/last into sequence */
900
901             //if (content != null)
902
//{
903
// content.prev = node.prev;
904
// last.next = node.next;
905
// fixNodeLinks(content);
906
// fixNodeLinks(last);
907
//}
908

909             //node.next = null;
910

911             /* ensure that new node is cleaned */
912             //pnode.setObject(cleanNode(lexer, content));
913
//return true;
914
//#endif
915
}
916
917         return false;
918     }
919
920     /*
921         Symptom: <center>
922         Action: replace <center> by <div style="text-align: center">
923     */

924
925     private boolean center2Div(Lexer lexer, Node node, MutableObject pnode)
926     {
927         if (node.tag == tt.tagCenter)
928         {
929             if (lexer.configuration.DropFontTags)
930             {
931                 if (node.content != null)
932                 {
933                     Node last = node.last;
934                     Node parent = node.parent;
935
936                     discardContainer(node, pnode);
937
938                     node = lexer.inferredTag("br");
939
940                     if (last.next != null)
941                         last.next.prev = node;
942
943                     node.next = last.next;
944                     last.next = node;
945                     node.prev = last;
946
947                     if (parent.last == last)
948                         parent.last = node;
949
950                     node.parent = parent;
951                 }
952                 else
953                 {
954                     Node prev = node.prev;
955                     Node next = node.next;
956                     Node parent = node.parent;
957                     discardContainer(node, pnode);
958
959                     node = lexer.inferredTag("br");
960                     node.next = next;
961                     node.prev = prev;
962                     node.parent = parent;
963
964                     if (next != null)
965                         next.prev = node;
966                     else
967                         parent.last = node;
968
969                     if (prev != null)
970                         prev.next = node;
971                     else
972                         parent.content = node;
973                 }
974
975                 return true;
976             }
977             node.tag = tt.tagDiv;
978             node.element = "div";
979             addStyleProperty(node, "text-align: center");
980             return true;
981         }
982
983         return false;
984     }
985
986     /*
987         Symptom: <div><div>...</div></div>
988         Action: merge the two divs
989
990       This is useful after nested <dir>s used by Word
991       for indenting have been converted to <div>s
992     */

993     private boolean mergeDivs(Lexer lexer, Node node, MutableObject pnode)
994     {
995         Node child;
996
997         if (node.tag != tt.tagDiv)
998             return false;
999
1000        child = node.content;
1001
1002        if (child == null)
1003            return false;
1004
1005        if (child.tag != tt.tagDiv)
1006            return false;
1007
1008        if (child.next != null)
1009            return false;
1010
1011        mergeStyles(node, child);
1012        stripOnlyChild(node);
1013        return true;
1014    }
1015
1016    /*
1017        Symptom: <ul><li><ul>...</ul></li></ul>
1018        Action: discard outer list
1019    */

1020
1021    private boolean nestedList(Lexer lexer, Node node, MutableObject pnode)
1022    {
1023        Node child, list;
1024
1025        if (node.tag == tt.tagUl || node.tag == tt.tagOl)
1026        {
1027            child = node.content;
1028
1029            if (child == null)
1030                return false;
1031
1032            /* check child has no peers */
1033
1034            if (child.next != null)
1035                return false;
1036
1037            list = child.content;
1038
1039            if (list == null)
1040                return false;
1041
1042            if (list.tag != node.tag)
1043                return false;
1044
1045            pnode.setObject(node.next);
1046
1047            /* move inner list node into position of outer node */
1048            list.prev = node.prev;
1049            list.next = node.next;
1050            list.parent = node.parent;
1051            fixNodeLinks(list);
1052
1053            /* get rid of outer ul and its li */
1054            child.content = null;
1055            node.content = null;
1056            node.next = null;
1057
1058            /*
1059              If prev node was a list the chances are this node
1060              should be appended to that list. Word has no way of
1061              recognizing nested lists and just uses indents
1062            */

1063
1064            if (list.prev != null)
1065            {
1066                node = list;
1067                list = node.prev;
1068
1069                if (list.tag == tt.tagUl || list.tag == tt.tagOl)
1070                {
1071                    list.next = node.next;
1072
1073                    if (list.next != null)
1074                        list.next.prev = list;
1075
1076                    child = list.last; /* <li> */
1077
1078                    node.parent = child;
1079                    node.next = null;
1080                    node.prev = child.last;
1081                    fixNodeLinks(node);
1082                }
1083            }
1084
1085            cleanNode(lexer, node);
1086            return true;
1087        }
1088
1089        return false;
1090    }
1091
1092    /*
1093        Symptom: the only child of a block-level element is a
1094        presentation element such as B, I or FONT
1095
1096        Action: add style "font-weight: bold" to the block and
1097        strip the <b> element, leaving its children.
1098
1099      example:
1100
1101        <p>
1102          <b><font face="Arial" size="6">Draft Recommended Practice</font></b>
1103        </p>
1104
1105      becomes:
1106
1107          <p style="font-weight: bold; font-family: Arial; font-size: 6">
1108            Draft Recommended Practice
1109          </p>
1110
1111      This code also replaces the align attribute by a style attribute.
1112      However, to avoid CSS problems with Navigator 4, this isn't done
1113      for the elements: caption, tr and table
1114    */

1115    private boolean blockStyle(Lexer lexer, Node node, MutableObject pnode)
1116    {
1117        Node child;
1118
1119        if ((node.tag.model & (Dict.CM_BLOCK | Dict.CM_LIST | Dict.CM_DEFLIST | Dict.CM_TABLE)) != 0)
1120        {
1121            if (node.tag != tt.tagTable
1122                    && node.tag != tt.tagTr
1123                    && node.tag != tt.tagLi)
1124            {
1125                /* check for align attribute */
1126                if (node.tag != tt.tagCaption)
1127                    textAlign(lexer, node);
1128
1129                child = node.content;
1130
1131                if (child == null)
1132                    return false;
1133
1134                /* check child has no peers */
1135
1136                if (child.next != null)
1137                    return false;
1138
1139                if (child.tag == tt.tagB)
1140                {
1141                    mergeStyles(node, child);
1142                    addStyleProperty(node, "font-weight: bold");
1143                    stripOnlyChild(node);
1144                    return true;
1145                }
1146
1147                if (child.tag == tt.tagI)
1148                {
1149                    mergeStyles(node, child);
1150                    addStyleProperty(node, "font-style: italic");
1151                    stripOnlyChild(node);
1152                    return true;
1153                }
1154
1155                if (child.tag == tt.tagFont)
1156                {
1157                    mergeStyles(node, child);
1158                    addFontStyles(node, child.attributes);
1159                    stripOnlyChild(node);
1160                    return true;
1161                }
1162            }
1163        }
1164
1165        return false;
1166    }
1167
1168    /* the only child of table cell or an inline element such as em */
1169    private boolean inlineStyle(Lexer lexer, Node node, MutableObject pnode)
1170    {
1171        Node child;
1172
1173        if (node.tag != tt.tagFont && (node.tag.model & (Dict.CM_INLINE|Dict.CM_ROW)) != 0)
1174        {
1175            child = node.content;
1176
1177            if (child == null)
1178                return false;
1179
1180            /* check child has no peers */
1181
1182            if (child.next != null)
1183                return false;
1184
1185            if (child.tag == tt.tagB && lexer.configuration.LogicalEmphasis)
1186            {
1187                mergeStyles(node, child);
1188                addStyleProperty(node, "font-weight: bold");
1189                stripOnlyChild(node);
1190                return true;
1191            }
1192
1193            if (child.tag == tt.tagI && lexer.configuration.LogicalEmphasis)
1194            {
1195                mergeStyles(node, child);
1196                addStyleProperty(node, "font-style: italic");
1197                stripOnlyChild(node);
1198                return true;
1199            }
1200
1201            if (child.tag == tt.tagFont)
1202            {
1203                mergeStyles(node, child);
1204                addFontStyles(node, child.attributes);
1205                stripOnlyChild(node);
1206                return true;
1207            }
1208        }
1209
1210        return false;
1211    }
1212
1213    /*
1214      Replace font elements by span elements, deleting
1215      the font element's attributes and replacing them
1216      by a single style attribute.
1217    */

1218    private boolean font2Span(Lexer lexer, Node node, MutableObject pnode)
1219    {
1220        AttVal av, style, next;
1221
1222        if (node.tag == tt.tagFont)
1223        {
1224            if (lexer.configuration.DropFontTags)
1225            {
1226                discardContainer(node, pnode);
1227                return false;
1228            }
1229
1230            /* if FONT is only child of parent element then leave alone */
1231            if (node.parent.content == node
1232                && node.next == null)
1233                return false;
1234
1235            addFontStyles(node, node.attributes);
1236
1237            /* extract style attribute and free the rest */
1238            av = node.attributes;
1239            style = null;
1240
1241            while (av != null)
1242            {
1243                next = av.next;
1244
1245                if (av.attribute.equals("style"))
1246                {
1247                    av.next = null;
1248                    style = av;
1249                }
1250
1251                av = next;
1252            }
1253
1254            node.attributes = style;
1255
1256            node.tag = tt.tagSpan;
1257            node.element = "span";
1258
1259            return true;
1260        }
1261
1262        return false;
1263    }
1264
1265    /*
1266      Applies all matching rules to a node.
1267    */

1268    private Node cleanNode(Lexer lexer, Node node)
1269    {
1270        Node next = null;
1271        MutableObject o = new MutableObject();
1272        boolean b = false;
1273
1274        for (next = node; node.isElement(); node = next)
1275        {
1276            o.setObject(next);
1277
1278            b = dir2Div(lexer, node, o);
1279            next = (Node)o.getObject();
1280            if (b)
1281                continue;
1282
1283            b = nestedList(lexer, node, o);
1284            next = (Node)o.getObject();
1285            if (b)
1286                continue;
1287
1288            b = center2Div(lexer, node, o);
1289            next = (Node)o.getObject();
1290            if (b)
1291                continue;
1292
1293            b = mergeDivs(lexer, node, o);
1294            next = (Node)o.getObject();
1295            if (b)
1296                continue;
1297
1298            b = blockStyle(lexer, node, o);
1299            next = (Node)o.getObject();
1300            if (b)
1301                continue;
1302
1303            b = inlineStyle(lexer, node, o);
1304            next = (Node)o.getObject();
1305            if (b)
1306                continue;
1307
1308            b = font2Span(lexer, node, o);
1309            next = (Node)o.getObject();
1310            if (b)
1311                continue;
1312
1313            break;
1314        }
1315
1316        return next;
1317    }
1318
1319    private Node createStyleProperties(Lexer lexer, Node node)
1320    {
1321        Node child;
1322
1323        if (node.content != null)
1324        {
1325            for (child = node.content; child != null; child = child.next)
1326            {
1327                child = createStyleProperties(lexer, child);
1328            }
1329        }
1330
1331        return cleanNode(lexer, node);
1332    }
1333
1334    private void defineStyleRules(Lexer lexer, Node node)
1335    {
1336        Node child;
1337
1338        if (node.content != null)
1339        {
1340            for (child = node.content;
1341                    child != null; child = child.next)
1342            {
1343                defineStyleRules(lexer, child);
1344            }
1345        }
1346
1347        style2Rule(lexer, node);
1348    }
1349
1350    public void cleanTree(Lexer lexer, Node doc)
1351    {
1352        doc = createStyleProperties(lexer, doc);
1353
1354        if (!lexer.configuration.MakeClean)
1355        {
1356            defineStyleRules(lexer, doc);
1357            createStyleElement(lexer, doc);
1358        }
1359    }
1360
1361    /* simplifies <b><b> ... </b> ...</b> etc. */
1362    public void nestedEmphasis(Node node)
1363    {
1364        MutableObject o = new MutableObject();
1365        Node next;
1366
1367        while (node != null)
1368        {
1369            next = node.next;
1370
1371            if ((node.tag == tt.tagB || node.tag == tt.tagI)
1372                && node.parent != null && node.parent.tag == node.tag)
1373            {
1374                /* strip redundant inner element */
1375                o.setObject(next);
1376                discardContainer(node, o);
1377                next = (Node)o.getObject();
1378                node = next;
1379                continue;
1380            }
1381
1382            if (node.content != null)
1383                nestedEmphasis(node.content);
1384
1385            node = next;
1386        }
1387    }
1388
1389    /* replace i by em and b by strong */
1390    public void emFromI(Node node)
1391    {
1392        while (node != null)
1393        {
1394            if (node.tag == tt.tagI)
1395            {
1396                node.element = tt.tagEm.name;
1397                node.tag = tt.tagEm;
1398            }
1399            else if (node.tag == tt.tagB)
1400            {
1401                node.element = tt.tagStrong.name;
1402                node.tag = tt.tagStrong;
1403            }
1404
1405            if (node.content != null)
1406                emFromI(node.content);
1407
1408            node = node.next;
1409        }
1410    }
1411
1412    /*
1413     Some people use dir or ul without an li
1414     to indent the content. The pattern to
1415     look for is a list with a single implicit
1416     li. This is recursively replaced by an
1417     implicit blockquote.
1418    */

1419    public void list2BQ(Node node)
1420    {
1421        while (node != null)
1422        {
1423            if (node.content != null)
1424                list2BQ(node.content);
1425
1426            if (node.tag != null && node.tag.parser == ParserImpl.getParseList() &&
1427                node.hasOneChild() && node.content.implicit)
1428            {
1429                stripOnlyChild(node);
1430                node.element = tt.tagBlockquote.name;
1431                node.tag = tt.tagBlockquote;
1432                node.implicit = true;
1433            }
1434
1435            node = node.next;
1436        }
1437    }
1438
1439    /*
1440     Replace implicit blockquote by div with an indent
1441     taking care to reduce nested blockquotes to a single
1442     div with the indent set to match the nesting depth
1443    */

1444    public void bQ2Div(Node node)
1445    {
1446        int indent;
1447        String JavaDoc indent_buf;
1448
1449        while (node != null)
1450        {
1451            if (node.tag == tt.tagBlockquote && node.implicit)
1452            {
1453                indent = 1;
1454
1455                while(node.hasOneChild() &&
1456                      node.content.tag == tt.tagBlockquote &&
1457                      node.implicit)
1458                {
1459                    ++indent;
1460                    stripOnlyChild(node);
1461                }
1462
1463                if (node.content != null)
1464                    bQ2Div(node.content);
1465
1466                indent_buf = "margin-left: " +
1467                             (new Integer JavaDoc(2*indent)).toString() + "em";
1468
1469                node.element = tt.tagDiv.name;
1470                node.tag = tt.tagDiv;
1471                node.addAttribute("style", indent_buf);
1472            }
1473            else if (node.content != null)
1474                bQ2Div(node.content);
1475
1476
1477            node = node.next;
1478        }
1479    }
1480
1481    /* node is <![if ...]> prune up to <![endif]> */
1482    public Node pruneSection(Lexer lexer, Node node)
1483    {
1484        for (;;)
1485        {
1486            /* discard node and returns next */
1487            node = Node.discardElement(node);
1488
1489            if (node == null)
1490                return null;
1491        
1492            if (node.type == Node.SectionTag)
1493            {
1494                if ((Lexer.getString(node.textarray, node.start, 2)).equals("if"))
1495                {
1496                    node = pruneSection(lexer, node);
1497                    continue;
1498                }
1499
1500                if ((Lexer.getString(node.textarray, node.start, 5)).equals("endif"))
1501                {
1502                    node = Node.discardElement(node);
1503                    break;
1504                }
1505            }
1506        }
1507
1508        return node;
1509    }
1510
1511    public void dropSections(Lexer lexer, Node node)
1512    {
1513        while (node != null)
1514        {
1515            if (node.type == Node.SectionTag)
1516            {
1517                /* prune up to matching endif */
1518                if ((Lexer.getString(node.textarray, node.start, 2)).equals("if"))
1519                {
1520                    node = pruneSection(lexer, node);
1521                    continue;
1522                }
1523
1524                /* discard others as well */
1525                node = Node.discardElement(node);
1526                continue;
1527            }
1528
1529            if (node.content != null)
1530                dropSections(lexer, node.content);
1531
1532            node = node.next;
1533        }
1534    }
1535
1536    public void purgeAttributes(Node node)
1537    {
1538        AttVal attr = node.attributes;
1539        AttVal next = null;
1540        AttVal prev = null;
1541
1542        while (attr != null)
1543        {
1544            next = attr.next;
1545
1546            /* special check for class="Code" denoting pre text */
1547            if (attr.attribute != null &&
1548                attr.value != null &&
1549                attr.attribute.equals("class") &&
1550                attr.value.equals("Code"))
1551            {
1552                prev = attr;
1553            }
1554            else if (attr.attribute != null &&
1555                (attr.attribute.equals("class") ||
1556                 attr.attribute.equals("style") ||
1557                 attr.attribute.equals("lang") ||
1558                 attr.attribute.startsWith("x:") ||
1559                 ((attr.attribute.equals("height") || attr.attribute.equals("width")) &&
1560                    (node.tag == tt.tagTd || node.tag == tt.tagTr || node.tag == tt.tagTh))))
1561            {
1562                if (prev != null)
1563                    prev.next = next;
1564                else
1565                    node.attributes = next;
1566
1567            }
1568            else
1569                prev = attr;
1570
1571            attr = next;
1572        }
1573    }
1574
1575    /* Word2000 uses span excessively, so we strip span out */
1576    public Node stripSpan(Lexer lexer, Node span)
1577    {
1578        Node node;
1579        Node prev = null;
1580        Node content;
1581
1582        /*
1583         deal with span elements that have content
1584         by splicing the content in place of the span
1585         after having processed it
1586        */

1587
1588        cleanWord2000(lexer, span.content);
1589        content = span.content;
1590
1591        if (span.prev != null)
1592            prev = span.prev;
1593        else if (content != null)
1594        {
1595            node = content;
1596            content = content.next;
1597            Node.removeNode(node);
1598            Node.insertNodeBeforeElement(span, node);
1599            prev = node;
1600        }
1601
1602        while (content != null)
1603        {
1604            node = content;
1605            content = content.next;
1606            Node.removeNode(node);
1607            Node.insertNodeAfterElement(prev, node);
1608            prev = node;
1609        }
1610
1611        if (span.next == null)
1612            span.parent.last = prev;
1613
1614        node = span.next;
1615        span.content = null;
1616        Node.discardElement(span);
1617        return node;
1618    }
1619
1620    /* map non-breaking spaces to regular spaces */
1621    private void normalizeSpaces(Lexer lexer, Node node)
1622    {
1623        while (node != null)
1624        {
1625            if (node.content != null)
1626                normalizeSpaces(lexer, node.content);
1627
1628            if (node.type == Node.TextNode)
1629            {
1630                int i;
1631                MutableInteger c = new MutableInteger();
1632                int p = node.start;
1633
1634                for (i = node.start; i < node.end; ++i)
1635                {
1636                    c.value = (int)node.textarray[i];
1637
1638                    /* look for UTF-8 multibyte character */
1639                    if (c.value > 0x7F)
1640                        i += PPrint.getUTF8(node.textarray, i, c);
1641
1642                    if (c.value == 160)
1643                        c.value = ' ';
1644
1645                    p = PPrint.putUTF8(node.textarray, p, c.value);
1646                }
1647            }
1648
1649            node = node.next;
1650        }
1651    }
1652
1653    /*
1654     This is a major clean up to strip out all the extra stuff you get
1655     when you save as web page from Word 2000. It doesn't yet know what
1656     to do with VML tags, but these will appear as errors unless you
1657     declare them as new tags, such as o:p which needs to be declared
1658     as inline.
1659    */

1660    public void cleanWord2000(Lexer lexer, Node node)
1661    {
1662        /* used to a list from a sequence of bulletted p's */
1663        Node list = null;
1664
1665        while (node != null)
1666        {
1667            /* discard Word's style verbiage */
1668            if (node.tag == tt.tagStyle ||
1669                node.tag == tt.tagMeta ||
1670                node.type == Node.CommentTag)
1671            {
1672                node = Node.discardElement(node);
1673                continue;
1674            }
1675
1676            /* strip out all span tags Word scatters so liberally! */
1677            if (node.tag == tt.tagSpan)
1678            {
1679                node = stripSpan(lexer, node);
1680                continue;
1681            }
1682
1683            /* get rid of Word's xmlns attributes */
1684            if (node.tag == tt.tagHtml)
1685            {
1686                /* check that it's a Word 2000 document */
1687                if (node.getAttrByName("xmlns:o") == null)
1688                    return;
1689            }
1690
1691            if (node.tag == tt.tagLink)
1692            {
1693                AttVal attr = node.getAttrByName("rel");
1694
1695                if (attr != null && attr.value != null &&
1696                    attr.value.equals("File-List"))
1697                {
1698                    node = Node.discardElement(node);
1699                    continue;
1700                }
1701            }
1702
1703            /* discard empty paragraphs */
1704            if (node.content == null && node.tag == tt.tagP)
1705            {
1706                node = Node.discardElement(node);
1707                continue;
1708            }
1709
1710            if (node.tag == tt.tagP)
1711            {
1712                AttVal attr = node.getAttrByName("class");
1713
1714                /* map sequence of <p class="MsoListBullet"> to <ul>...</ul> */
1715                if (attr != null && attr.value != null &&
1716                    attr.value.equals("MsoListBullet"))
1717                {
1718                    Node.coerceNode(lexer, node, tt.tagLi);
1719
1720                    if (list == null || list.tag != tt.tagUl)
1721                    {
1722                        list = lexer.inferredTag("ul");
1723                        Node.insertNodeBeforeElement(node, list);
1724                    }
1725
1726                    purgeAttributes(node);
1727
1728                    if (node.content != null)
1729                        cleanWord2000(lexer, node.content);
1730
1731                    /* remove node and append to contents of list */
1732                    Node.removeNode(node);
1733                    Node.insertNodeAtEnd(list, node);
1734                    node = list.next;
1735                }
1736                /* map sequence of <p class="Code"> to <pre>...</pre> */
1737                else if (attr != null && attr.value != null &&
1738                         attr.value.equals("Code"))
1739                {
1740                    Node br = lexer.newLineNode();
1741                    normalizeSpaces(lexer, node);
1742
1743                    if (list == null || list.tag != tt.tagPre)
1744                    {
1745                        list = lexer.inferredTag("pre");
1746                        Node.insertNodeBeforeElement(node, list);
1747                    }
1748
1749                    /* remove node and append to contents of list */
1750                    Node.removeNode(node);
1751                    Node.insertNodeAtEnd(list, node);
1752                    stripSpan(lexer, node);
1753                    Node.insertNodeAtEnd(list, br);
1754                    node = list.next;
1755                }
1756                else
1757                    list = null;
1758            }
1759            else
1760                list = null;
1761
1762            /* strip out style and class attributes */
1763            if (node.type == Node.StartTag || node.type == Node.StartEndTag)
1764                purgeAttributes(node);
1765
1766            if (node.content != null)
1767                cleanWord2000(lexer, node.content);
1768
1769            node = node.next;
1770        }
1771    }
1772
1773    public boolean isWord2000(Node root, TagTable tt)
1774    {
1775        Node html = root.findHTML(tt);
1776
1777        return (html != null && html.getAttrByName("xmlns:o") != null);
1778    }
1779}
1780
Popular Tags