KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > htmlparser > util > Generate


1 // $Header: /home/cvs/jakarta-jmeter/src/htmlparser/org/htmlparser/util/Generate.java,v 1.2 2004/02/11 02:16:59 woolfel Exp $
2
/*
3  * ====================================================================
4  * Copyright 2002-2004 The Apache Software Foundation.
5  *
6  * Licensed under the Apache License, Version 2.0 (the "License");
7  * you may not use this file except in compliance with the License.
8  * You may obtain a copy of the License at
9  *
10  * http://www.apache.org/licenses/LICENSE-2.0
11  *
12  * Unless required by applicable law or agreed to in writing, software
13  * distributed under the License is distributed on an "AS IS" BASIS,
14  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15  * See the License for the specific language governing permissions and
16  * limitations under the License.
17  *
18  */

19
20 // The developers of JMeter and Apache are grateful to the developers
21
// of HTMLParser for giving Apache Software Foundation a non-exclusive
22
// license. The performance benefits of HTMLParser are clear and the
23
// users of JMeter will benefit from the hard work of the HTMLParser
24
// team. For detailed information about HTMLParser, the project is
25
// hosted on sourceforge at http://htmlparser.sourceforge.net/.
26
//
27
// HTMLParser was originally created by Somik Raha in 2000. Since then
28
// a healthy community of users has formed and helped refine the
29
// design so that it is able to tackle the difficult task of parsing
30
// dirty HTML. Derrick Oswald is the current lead developer and was kind
31
// enough to assist JMeter.
32
//
33
// This class was contributed by
34
// Derrick Oswald
35
//
36

37 package org.htmlparser.util;
38
39 import org.htmlparser.Node;
40 import org.htmlparser.Parser;
41 import org.htmlparser.RemarkNode;
42 import org.htmlparser.StringNode;
43 import org.htmlparser.tags.EndTag;
44 import org.htmlparser.tags.LinkTag;
45 import org.htmlparser.tags.Tag;
46
47 /**
48  * Create a character reference translation class source file.
49  * Usage:
50  * <pre>
51  * java -classpath .:lib/htmlparser.jar Generate > Translate.java
52  * </pre>
53  * Derived from HTMLStringFilter.java provided as an example with the
54  * htmlparser.jar file available at
55  * <a HREF="http://htmlparser.sourceforge.net">htmlparser.sourceforge.net</a>
56  * written by Somik Raha (
57  * <a HREF='mailto:somik@industriallogic.com?
58  * subject=htmlparser'>somik@industriallogic. com</a>
59  * <a HREF="http://industriallogic.com">http://industriallogic.com</a>).
60  * @author <a HREF='mailto:DerrickOswald@users.sourceforge.net?subject=Character Reference Translation class'>Derrick Oswald</a>
61  */

62 public class Generate
63 {
64     /**
65      * The working parser.
66      */

67     protected Parser parser;
68
69     /**
70      * The system specific line separator string.
71      */

72     protected static final String JavaDoc nl =
73         System.getProperty("line.separator", "\n");
74
75     /**
76      * Create a Generate object.
77      * Sets up the generation by creating a new <code>Parser</code> pointed
78      * at <a HREF="http://www.w3.org/TR/REC-html40/sgml/entities.html">http://www.w3.org/TR/REC-html40/sgml/entities.html</a>
79      * with the standard scanners registered.
80      */

81     public Generate() throws ParserException
82     {
83         parser =
84             new Parser("http://www.w3.org/TR/REC-html40/sgml/entities.html");
85         parser.registerScanners();
86     }
87
88     /**
89      * Translate character references.
90      * After generating the Translate class we could use it
91      * to do this job, but that would involve a bootstrap
92      * problem, so this method does the reference conversion
93      * for a very tiny subset (enough to understand the w3.org
94      * page).
95      * @param string The raw string.
96      * @return The string with character references fixed.
97      */

98     public String JavaDoc translate(String JavaDoc string)
99     {
100         int index;
101         int amp;
102         StringBuffer JavaDoc ret;
103
104         ret = new StringBuffer JavaDoc(4096);
105
106         index = 0;
107         while ((index < string.length())
108             && (-1 != (amp = string.indexOf('&', index))))
109         {
110             // include the part before the special character
111
ret.append(string.substring(index, amp));
112             if (string.startsWith("&nbsp;", amp))
113             {
114                 ret.append(" ");
115                 index = amp + 6;
116             }
117             else if (string.startsWith("&lt;", amp))
118             {
119                 ret.append("<");
120                 index = amp + 4;
121             }
122             else if (string.startsWith("&gt;", amp))
123             {
124                 ret.append(">");
125                 index = amp + 4;
126             }
127             else if (string.startsWith("&amp;", amp))
128             {
129                 ret.append("&");
130                 index = amp + 5;
131             }
132             else if (string.startsWith("&quote;", amp))
133             {
134                 ret.append("\"");
135                 index = amp + 7;
136             }
137             else if (string.startsWith("&divide;", amp))
138             {
139                 ret.append('\u00F7');
140                 index = amp + 8;
141             }
142             else if (string.startsWith("&copy;", amp))
143             {
144                 ret.append('\u00A9');
145                 index = amp + 6;
146             }
147             else
148             {
149                 System.out.println(
150                     "unknown special character starting with "
151                         + string.substring(amp, amp + 7));
152                 ret.append("&");
153                 index = amp + 1;
154             }
155         }
156         ret.append(string.substring(index));
157
158         return (ret.toString());
159     }
160
161     /**
162      * Pull out text elements from the HTML.
163      */

164     public void parse() throws ParserException
165     {
166         Node node;
167         StringBuffer JavaDoc buffer = new StringBuffer JavaDoc(4096);
168
169         // Run through an enumeration of html elements, and pick up
170
// only those that are plain string.
171
for (NodeIterator e = parser.elements(); e.hasMoreNodes();)
172         {
173             node = e.nextNode();
174
175             if (node instanceof StringNode)
176             {
177                 // Node is a plain string
178
// Cast it to an HTMLStringNode
179
StringNode stringNode = (StringNode) node;
180                 // Retrieve the data from the object
181
buffer.append(stringNode.getText());
182             }
183             else if (node instanceof LinkTag)
184             {
185                 // Node is a link
186
// Cast it to an HTMLLinkTag
187
LinkTag linkNode = (LinkTag) node;
188                 // Retrieve the data from the object and print it
189
buffer.append(linkNode.getLinkText());
190             }
191             else if (node instanceof Tag)
192             {
193                 String JavaDoc contents = ((Tag) node).getText();
194                 if (contents.equals("BR") || contents.equals("P"))
195                     buffer.append(nl);
196             }
197             else if (node instanceof EndTag)
198             {
199                 String JavaDoc contents = ((EndTag) node).getText();
200                 if (contents.equals("BR") || contents.equals("P"))
201                     buffer.append(nl);
202             }
203             else if (node instanceof RemarkNode)
204             {
205             }
206             else
207             {
208                 System.out.println();
209                 System.out.println(node.toString());
210             }
211         }
212
213         String JavaDoc text = translate(buffer.toString());
214         sgml(text);
215     }
216
217     /**
218      * Find the lowest index of whitespace (space or newline).
219      * @param string The string to look in.
220      * @param index Where to start looking.
221      * @return -1 if there is no whitespace, the minimum index otherwise.
222      */

223     public int indexOfWhitespace(String JavaDoc string, int index)
224     {
225         int space;
226         int cr;
227         int ret;
228
229         space = string.indexOf(" ", index);
230         cr = string.indexOf(nl, index);
231         if (-1 == space)
232             ret = cr;
233         else if (-1 == cr)
234             ret = space;
235         else
236             ret = Math.min(space, cr);
237
238         return (ret);
239     }
240
241     /**
242      * Rewrite the comment string.
243      * In the sgml table, the comments are of the form:
244      * <pre>
245      * -- latin capital letter I with diaeresis,
246      * U+00CF ISOlat1
247      * </pre>
248      * so we just want to make a one-liner without the spaces and newlines.
249      * @param string The raw comment.
250      * @return The single line comment.
251      */

252     public String JavaDoc pack(String JavaDoc string)
253     {
254         int index;
255         int spaces;
256         StringBuffer JavaDoc ret;
257
258         ret = new StringBuffer JavaDoc(string.length());
259
260         if (string.startsWith("-- "))
261             string = string.substring(3);
262         // remove doublespaces
263
index = 0;
264         while ((index < string.length())
265             && (-1 != (spaces = indexOfWhitespace(string, index))))
266         {
267             ret.append(string.substring(index, spaces));
268             ret.append(" ");
269             while ((spaces < string.length())
270                 && Character.isWhitespace(string.charAt(spaces)))
271                 spaces++;
272             index = spaces;
273         }
274         if (index < string.length())
275             ret.append(string.substring(index));
276
277         return (ret.toString());
278     }
279
280     /**
281      * Pretty up a comment string.
282      * @param string The comment to operate on.
283      * @return The beautiful comment string.
284      */

285     public String JavaDoc pretty(String JavaDoc string)
286     {
287         int index;
288         int spaces;
289         StringBuffer JavaDoc ret;
290
291         ret = new StringBuffer JavaDoc(string.length());
292
293         // newline instead of doublespaces
294
index = 0;
295         while ((index < string.length())
296             && (-1 != (spaces = string.indexOf(" ", index))))
297         {
298             ret.append(" // " + string.substring(index, spaces));
299             if (!string.substring(index, spaces).endsWith(nl))
300                 ret.append(nl);
301             while ((spaces < string.length())
302                 && Character.isWhitespace(string.charAt(spaces)))
303                 spaces++;
304             index = spaces;
305         }
306         if (index < string.length())
307             ret.append(" // " + string.substring(index));
308
309         return (ret.toString());
310     }
311
312     /**
313      * Pad a string on the left with the given character to the length specified.
314      * @param string The string to pad
315      * @param character The character to pad with.
316      * @param length The size to pad to.
317      * @return The padded string.
318      */

319     public String JavaDoc pad(String JavaDoc string, char character, int length)
320     {
321         StringBuffer JavaDoc ret;
322
323         ret = new StringBuffer JavaDoc(length);
324         ret.append(string);
325         while (length > ret.length())
326             ret.insert(0, character);
327
328         return (ret.toString());
329     }
330
331     /**
332      * Convert the textual representation of the numeric character reference to a character.
333      * @param string The numeric character reference (in quotes).
334      * @return The character represented by the numeric character reference.
335      *
336      */

337     public String JavaDoc unicode(String JavaDoc string)
338     {
339         int code;
340
341         if (string.startsWith("\"&#") && string.endsWith(";\""))
342         {
343             string = string.substring(3, string.length() - 2);
344             try
345             {
346                 code = Integer.parseInt(string);
347                 string =
348                     "new Character ('\\u"
349                         + pad(Integer.toHexString(code), '0', 4)
350                         + "')";
351             }
352             catch (Exception JavaDoc e)
353             {
354                 e.printStackTrace();
355             }
356             return (string);
357         }
358         else
359             return (string);
360     }
361
362     /**
363      * Parse the sgml declaration for character entity reference
364      * name, equivalent numeric character reference and a comment.
365      * Emit a java hash table 'put' with the name as the key, the
366      * numeric character as the value and comment the insertion
367      * with the comment.
368      * @param string The contents of the sgml declaration.
369      */

370     public void extract(String JavaDoc string)
371     {
372         int space;
373         String JavaDoc token;
374         String JavaDoc code;
375         int comment;
376         String JavaDoc description;
377
378         if (string.startsWith("<!--"))
379             System.out.println(
380                 pretty(string.substring(4, string.length() - 3).trim()));
381         else if (string.startsWith("<!ENTITY"))
382         {
383             string = string.substring(8, string.length() - 3).trim();
384             if (-1 != (space = string.indexOf(" ")))
385             {
386                 token = string.substring(0, space);
387                 string = string.substring(space).trim();
388                 if (string.startsWith("CDATA"))
389                 {
390                     string = string.substring(5).trim();
391                     if (-1 != (space = string.indexOf(" ")))
392                     {
393                         code = string.substring(0, space).trim();
394                         code = unicode(code);
395                         string = string.substring(space).trim();
396                         System
397                             .out
398                             .println(" mRefChar.put (\"" + token + "\","
399                         // no token is larger than 8 characters - yet
400
+pad(code, ' ', code.length() + 9 - token.length())
401                             + ");"
402                             + " // "
403                             + pack(string));
404                     }
405                     else
406                         System.out.println(string);
407                 }
408                 else
409                     System.out.println(string);
410             }
411             else
412                 System.out.println(string);
413         }
414         else
415             System.out.println(string);
416     }
417
418     /**
419      * Extract special characters.
420      * Scan the string looking for substrings of the form:
421      * <pre>
422      * &lt;!ENTITY nbsp CDATA "&amp;#160;" -- no-break space = non-breaking space, U+00A0 ISOnum --&gt;
423      * </pre>
424      * and emit a java definition for each.
425      * @param string The raw string from w3.org.
426      */

427     public void sgml(String JavaDoc string)
428     {
429         int index;
430         int begin;
431         int end;
432         StringBuffer JavaDoc ret;
433
434         ret = new StringBuffer JavaDoc(4096);
435
436         index = 0;
437         while (-1 != (begin = string.indexOf("<", index)))
438         {
439             if (-1 != (end = string.indexOf("-->", begin)))
440             {
441                 extract(string.substring(begin, end + 3));
442                 index = end + 3;
443             }
444             else
445                 index = begin + 1;
446         }
447     }
448
449     /**
450      * Generator program.
451      * <pre>
452      * java -classpath .:lib/htmlparser.jar Generate > Translate.java
453      * </pre>
454      * @param args <em>Not used.</em>
455      */

456     public static void main(String JavaDoc[] args) throws ParserException
457     {
458         Generate filter = new Generate();
459         System.out.println("import java.util.Hashtable;");
460         System.out.println("import java.util.Iterator;");
461         System.out.println();
462         System.out.println("/**");
463         System.out.println(
464             " * Translate numeric character references and character entity references to unicode characters.");
465         System.out.println(
466             " * Based on tables found at <a HREF=\"http://www.w3.org/TR/REC-html40/sgml/entities.html\">");
467         System.out.println(
468             " * http://www.w3.org/TR/REC-html40/sgml/entities.html</a>");
469         System.out.println(
470             " * <p><b>Note: Do not edit! This class is created by the Generate class.</b>");
471         System.out.println(" * <p>Typical usage:");
472         System.out.println(" * <pre>");
473         System.out.println(
474             " * String s = Translate.decode (getTextFromHtmlPage ());");
475         System.out.println(" * </pre>");
476         System.out.println(
477             " * @author <a HREF='mailto:DerrickOswald@users.sourceforge.net?subject=Character Reference Translation class'>Derrick Oswald</a>");
478         System.out.println(" */");
479         System.out.println("public class Translate");
480         System.out.println("{");
481         System.out.println(" /**");
482         System.out.println(
483             " * Table mapping entity reference kernel to character.");
484         System.out.println(
485             " * <p><code>String</code>-><code>Character</code>");
486         System.out.println(" */");
487         System.out.println(" protected static Hashtable mRefChar;");
488         System.out.println(" static");
489         System.out.println(" {");
490         System.out.println(" mRefChar = new Hashtable (1000);");
491         System.out.println();
492         filter.parse();
493         System.out.println(" }");
494         System.out.println();
495         System.out.println(" /**");
496         System.out.println(
497             " * Table mapping character to entity reference kernel.");
498         System.out.println(
499             " * <p><code>Character</code>-><code>String</code>");
500         System.out.println(" */");
501         System.out.println(" protected static Hashtable mCharRef;");
502         System.out.println(" static");
503         System.out.println(" {");
504         System.out.println(
505             " mCharRef = new Hashtable (mRefChar.size ());");
506         System.out.println();
507         System.out.println(
508             " Iterator iterator = mRefChar.keySet ().iterator ();");
509         System.out.println(" while (iterator.hasNext ())");
510         System.out.println(" {");
511         System.out.println(
512             " String key = (String)iterator.next ();");
513         System.out.println(
514             " Character character = (Character)mRefChar.get (key);");
515         System.out.println(" mCharRef.put (character, key);");
516         System.out.println(" }");
517         System.out.println(" }");
518         System.out.println();
519         System.out.println(" /**");
520         System.out.println(" * Private constructor.");
521         System.out.println(
522             " * This class is fully static and thread safe.");
523         System.out.println(" */");
524         System.out.println(" private Translate ()");
525         System.out.println(" {");
526         System.out.println(" }");
527         System.out.println();
528         System.out.println(" /**");
529         System.out.println(
530             " * Convert a reference to a unicode character.");
531         System.out.println(
532             " * Convert a single numeric character reference or character entity reference");
533         System.out.println(" * to a unicode character.");
534         System.out.println(
535             " * @param string The string to convert. Of the form &xxxx; or &amp;#xxxx; with");
536         System.out.println(
537             " * or without the leading ampersand or trailing semi-colon.");
538         System.out.println(
539             " * @return The converted character or '\\0' (zero) if the string is an");
540         System.out.println(" * invalid reference.");
541         System.out.println(" */");
542         System.out.println(
543             " public static char convertToChar (String string)");
544         System.out.println(" {");
545         System.out.println(" int length;");
546         System.out.println(" Character item;");
547         System.out.println(" char ret;");
548         System.out.println();
549         System.out.println(" ret = 0;");
550         System.out.println();
551         System.out.println(" length = string.length ();");
552         System.out.println(" if (0 < length)");
553         System.out.println(" {");
554         System.out.println(" if ('&' == string.charAt (0))");
555         System.out.println(" {");
556         System.out.println(" string = string.substring (1);");
557         System.out.println(" length--;");
558         System.out.println(" }");
559         System.out.println(" if (0 < length)");
560         System.out.println(" {");
561         System.out.println(
562             " if (';' == string.charAt (length - 1))");
563         System.out.println(
564             " string = string.substring (0, --length);");
565         System.out.println(" if (0 < length)");
566         System.out.println(" {");
567         System.out.println(" if ('#' == string.charAt (0))");
568         System.out.println(" try");
569         System.out.println(" {");
570         System.out.println(
571             " ret = (char)Integer.parseInt (string.substring (1));");
572         System.out.println(" }");
573         System.out.println(
574             " catch (NumberFormatException nfe)");
575         System.out.println(" {");
576         System.out.println(
577             " /* failed conversion, return 0 */");
578         System.out.println(" }");
579         System.out.println(" else");
580         System.out.println(" {");
581         System.out.println(
582             " item = (Character)refChar.get (string);");
583         System.out.println(" if (null != item)");
584         System.out.println(
585             " ret = item.charValue ();");
586         System.out.println(" }");
587         System.out.println(" }");
588         System.out.println(" }");
589         System.out.println(" }");
590         System.out.println();
591         System.out.println(" return (ret);");
592         System.out.println(" }");
593         System.out.println();
594         System.out.println(" /**");
595         System.out.println(" * Decode a string containing references.");
596         System.out.println(
597             " * Change all numeric character reference and character entity references");
598         System.out.println(" * to unicode characters.");
599         System.out.println(" * @param string The string to translate.");
600         System.out.println(" */");
601         System.out.println(" public static String decode (String string)");
602         System.out.println(" {");
603         System.out.println(" int index;");
604         System.out.println(" int length;");
605         System.out.println(" int amp;");
606         System.out.println(" int semi;");
607         System.out.println(" String code;");
608         System.out.println(" char character;");
609         System.out.println(" StringBuffer ret;");
610         System.out.println();
611         System.out.println(
612             " ret = new StringBuffer (string.length ());");
613         System.out.println();
614         System.out.println(" index = 0;");
615         System.out.println(" length = string.length ();");
616         System.out.println(
617             " while ((index < length) && (-1 != (amp = string.indexOf ('&', index))))");
618         System.out.println(" {");
619         System.out.println(
620             " ret.append (string.substring (index, amp));");
621         System.out.println(" index = amp + 1;");
622         System.out.println(" if (amp < length - 1)");
623         System.out.println(" {");
624         System.out.println(" semi = string.indexOf (';', amp);");
625         System.out.println(" if (-1 != semi)");
626         System.out.println(
627             " code = string.substring (amp, semi + 1);");
628         System.out.println(" else");
629         System.out.println(
630             " code = string.substring (amp);");
631         System.out.println(
632             " if (0 != (character = convertToChar (code)))");
633         System.out.println(" index += code.length () - 1;");
634         System.out.println(" else");
635         System.out.println(" character = '&';");
636         System.out.println(" }");
637         System.out.println(" else");
638         System.out.println(" character = '&';");
639         System.out.println(" ret.append (character);");
640         System.out.println(" }");
641         System.out.println(" if (index < length)");
642         System.out.println(
643             " ret.append (string.substring (index));");
644         System.out.println();
645         System.out.println(" return (ret.toString ());");
646         System.out.println(" }");
647         System.out.println();
648         System.out.println(" /**");
649         System.out.println(
650             " * Convert a character to a character entity reference.");
651         System.out.println(
652             " * Convert a unicode character to a character entity reference of");
653         System.out.println(" * the form &xxxx;.");
654         System.out.println(" * @param character The character to convert.");
655         System.out.println(
656             " * @return The converted character or <code>null</code> if the character");
657         System.out.println(" * is not one of the known entity references.");
658         System.out.println(" */");
659         System.out.println(
660             " public static String convertToString (Character character)");
661         System.out.println(" {");
662         System.out.println(" StringBuffer buffer;");
663         System.out.println(" String ret;");
664         System.out.println();
665         System.out.println(
666             " if (null != (ret = (String)mCharRef.get (character)))");
667         System.out.println(" {");
668         System.out.println(
669             " buffer = new StringBuffer (ret.length () + 2);");
670         System.out.println(" buffer.append ('&');");
671         System.out.println(" buffer.append (ret);");
672         System.out.println(" buffer.append (';');");
673         System.out.println(" ret = buffer.toString ();");
674         System.out.println(" }");
675         System.out.println();
676         System.out.println(" return (ret);");
677         System.out.println(" }");
678         System.out.println();
679         System.out.println(" /**");
680         System.out.println(
681             " * Convert a character to a numeric character reference.");
682         System.out.println(
683             " * Convert a unicode character to a numeric character reference of");
684         System.out.println(" * the form &amp;#xxxx;.");
685         System.out.println(" * @param character The character to convert.");
686         System.out.println(" * @return The converted character.");
687         System.out.println(" */");
688         System.out.println(
689             " public static String convertToString (int character)");
690         System.out.println(" {");
691         System.out.println(" StringBuffer ret;");
692         System.out.println();
693         System.out.println(
694             " ret = new StringBuffer (13); /* &#2147483647; */");
695         System.out.println(" ret.append (\"&#\");");
696         System.out.println(" ret.append (character);");
697         System.out.println(" ret.append (';');");
698         System.out.println();
699         System.out.println(" return (ret.toString ());");
700         System.out.println(" }");
701         System.out.println();
702         System.out.println(" /**");
703         System.out.println(" * Encode a string to use references.");
704         System.out.println(
705             " * Change all characters that are not ASCII to their numeric character");
706         System.out.println(" * reference or character entity reference.");
707         System.out.println(
708             " * This implementation is inefficient, allocating a new");
709         System.out.println(
710             " * <code>Character</code> for each character in the string,");
711         System.out.println(
712             " * but this class is primarily intended to decode strings");
713         System.out.println(
714             " * so efficiency and speed in the encoding was not a priority.");
715         System.out.println(" * @param string The string to translate.");
716         System.out.println(" */");
717         System.out.println(" public static String encode (String string)");
718         System.out.println(" {");
719         System.out.println(" int length;");
720         System.out.println(" char c;");
721         System.out.println(" Character character;");
722         System.out.println(" String value;");
723         System.out.println(" StringBuffer ret;");
724         System.out.println();
725         System.out.println(
726             " ret = new StringBuffer (string.length () * 6);");
727         System.out.println(" length = string.length ();");
728         System.out.println(" for (int i = 0; i < length; i++)");
729         System.out.println(" {");
730         System.out.println(" c = string.charAt (i);");
731         System.out.println(" character = new Character (c);");
732         System.out.println(
733             " if (null != (value = convertToString (character)))");
734         System.out.println(" ret.append (value);");
735         System.out.println(
736             " else if (!((c > 0x001F) && (c < 0x007F)))");
737         System.out.println(" {");
738         System.out.println(" value = convertToString (c);");
739         System.out.println(" ret.append (value);");
740         System.out.println(" }");
741         System.out.println(" else");
742         System.out.println(" ret.append (character);");
743         System.out.println(" }");
744         System.out.println();
745         System.out.println(" return (ret.toString ());");
746         System.out.println(" }");
747         System.out.println("}");
748     }
749 }
750
Popular Tags