KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > enhydra > oyster > util > HtmlAnalyzer


1 /**
2  * Title: Oyster Project
3  * Description: Creating S/MIME email transport capabilities.
4  * Copyright: Copyright (c) 2001
5  * @Author Vladimir Radisic
6  * @Version 2.1.5
7  */

8
9 package org.enhydra.oyster.util;
10
11 import org.enhydra.oyster.exception.SMIMEException;
12 import java.util.Vector JavaDoc;
13 import java.net.MalformedURLException JavaDoc;
14 import java.net.URL JavaDoc;
15 import java.io.InputStream JavaDoc;
16 import java.io.ByteArrayOutputStream JavaDoc;
17 import java.io.File JavaDoc;
18 import org.w3c.dom.Attr JavaDoc;
19 import org.w3c.dom.Document JavaDoc;
20 import org.w3c.dom.NamedNodeMap JavaDoc;
21 import org.w3c.dom.Node JavaDoc;
22 import org.w3c.dom.NodeList JavaDoc;
23 import org.w3c.tidy.Tidy;
24
25
26 /**
27  * HtmlAnalyzer class is used for parsing html code which has to become content
28  * of the message. For parsing is used JTidy parser. As result of parsing, DOM
29  * (Document Object Model) structure is obtained. It is tree-like construction
30  * with nodes and hierarchical structures that descripts input html code. This
31  * structure is easy for browsing and searching for specific html elements and
32  * attributes. By using DOM, all references to resources (image, movie, sound... ),
33  * defined in "src" and "background" attributes, are explored and swapped with
34  * generated unique Content-ID values which are necessary in forming
35  * "multipart/related" MimeMultipart object.<BR>
36  * <BR>
37  * DOM, generated inside of the object of this class, is also used in the process of
38  * generation plain/text message based on, and derived from the given html code.
39  * This plain text is later used in creation of "multipart/alternative"
40  * MimeMultipart object.
41  */

42 public class HtmlAnalyzer {
43
44 /**
45  * plain/text representation of page
46  */

47   private String JavaDoc plainText = "";
48
49 /**
50  * Enable/disable p tag in text/html to text/plain conversion.
51  */

52   private boolean pTagEnable = true;
53
54 /**
55  * Path to html file or prefix path to the embeded resource's adresses in
56  * html code (for example for "src" attribute of IMG tag). Can be null which
57  * means that prefix won't be added to resources location in the process of
58  * searching for specific adress attributes given in html code.
59  */

60   private String JavaDoc absolutPath = null;
61
62 /**
63  * Container for parsed html document in DOM (Document Object Model)
64  * representation.
65  */

66   private Document JavaDoc doc;
67
68 /**
69  * Indent from left margin pointer. This information is used in the process of
70  * generation plain text message based on html code.
71  */

72   private int indent = 0;
73
74 /**
75  * Current sequential number of OL (ordered list) html element. This information
76  * is used in the process of generation plain text message based on html code.
77  */

78   private int olNumber = 1;
79
80 /**
81  * Current html element is OL (ordered list), UN (unordered list) or something
82  * else. This information is used in the process of generation plain text message based
83  * on html code.
84  */

85   private String JavaDoc ul_ol = "";
86
87 /**
88  * Constant used in generating indent from left side. This information is used in
89  * the process of generation plain text message based on html code.
90  */

91   private final String JavaDoc indentString =
92         "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t";
93 /**
94  * Container for storing pairs of replaced url or file addresses and
95  * corresponding generated Content-ID values.
96  */

97   private Vector JavaDoc sourceLinks = new Vector JavaDoc(0,1);
98
99 /**
100  * Enable/disable swapping resource references in html code with generated
101  * value for Content-ID message bodypart header line. Default value is true
102  * (enable swapping)
103  */

104  private boolean enableSwapping = true;
105
106 /**
107  * Constructs HtmlAnalyzer from data given from InputStream. This constructor
108  * parses html code from input stream withouth swaping resources' locations from
109  * atribute's "src" and "background" value with generated Content-ID values. Also,
110  * depending on second argument, it is performed generation of plain text message
111  * based on html code. If second argument is null then autogeneration of plain
112  * text message is performed. Otherwise, second String argument is used as
113  * external prepared plain text message.
114  * @param content0 html code given as InputStream
115  * @param externalPlainText0 external plain text message represented as String.
116  * If this argument has value null then autogeneration of text plain message is
117  * performed according to passed html code via content0 argument.
118  * @exception SMIMEException caused by its private method analyze().
119  */

120   public HtmlAnalyzer(InputStream JavaDoc content0, String JavaDoc externalPlainText0) throws SMIMEException
121   {
122     Tidy tidy = new Tidy();
123     tidy.setWraplen(1000);
124     tidy.setShowWarnings(false);
125     tidy.setUpperCaseTags(true);
126     doc = (tidy.parseDOM(content0, null));
127     enableSwapping = false;
128     if(externalPlainText0 == null) {
129       analyze(doc);
130       plainText = plainText + "\r\n";
131     }
132     else {
133       analyzeLight(doc);
134       plainText = new String JavaDoc(externalPlainText0);
135       plainText = plainText + "\r\n";
136     }
137   }
138
139
140 /**
141  * Constructs HtmlAnalyzer from data given from InputStream. This constructor
142  * parses html code from input stream with swaping resources' locations from
143  * atribute's "src and "background" value with generated Content-ID values. In
144  * that process, it is used given second paremeter "path0" which represents
145  * common path to all resources in html code with relative path adresses. Also,
146  * depended on third argument, it is performed generation of plain text message
147  * based on html code. If third argument is null then autogeneration of plain
148  * text message is performed. Otherwise, third String argument is used as
149  * external prepared plain text message.
150  * @param content0 html code given as InputStream.
151  * @param path0 common path used for resolving all resources in html code with
152  * relative path adresses.
153  * @param externalPlainText0 external plain text message represented as String.
154  * If this argument has value null then autogeneration of text plain message is
155  * performed according to passed html code via content0 argument.
156  * @exception SMIMEException caused by its private method analyze().
157  */

158   public HtmlAnalyzer(InputStream JavaDoc content0, String JavaDoc path0, String JavaDoc externalPlainText0)
159       throws SMIMEException
160   {
161     if(path0 != null) {
162       absolutPath = new String JavaDoc(path0);
163       if(absolutPath.charAt( absolutPath.length()-1) == '\\' ||
164          absolutPath.charAt( absolutPath.length()-1) == '/' )
165          absolutPath = absolutPath.substring(0,absolutPath.length()-1);
166
167       absolutPath = absolutPath.replace('/', File.separatorChar);
168       absolutPath = absolutPath.replace('\\', File.separatorChar) + File.separator;
169     }
170
171     Tidy tidy = new Tidy();
172     tidy.setWraplen(1000);
173     tidy.setShowWarnings(false);
174     tidy.setUpperCaseTags(true);
175     doc = (tidy.parseDOM(content0, null));
176     if(externalPlainText0 == null) {
177       analyze(doc);
178       plainText = plainText + "\r\n";
179     }
180     else {
181       analyzeLight(doc);
182       plainText = new String JavaDoc(externalPlainText0);
183       plainText = plainText + "\r\n";
184     }
185   }
186
187 /**
188  * Returns pairs of swapped resource URL adresses or File paths and appropriate
189  * generated Content IDs.
190  * @return Vector object whose even (and 0) indexes contain resource addresses
191  * as File or String objects, and whose odd indexes contain appropriate
192  * swapped Content-ID values.
193  */

194   public Vector JavaDoc getSwappedAdresses() {
195     return sourceLinks;
196   }
197
198 /**
199  * Returns plain/text representation of given html code document
200  * @return html document transformed to plain/text.
201  */

202   public String JavaDoc getPlainText() {
203     return plainText;
204   }
205
206 /**
207  * Returns html/text document passed throught JTidy html parser. All resource
208  * references which were accessible on the file system are swapped with
209  * generated content ID value. Also, all virtual references to appropriate
210  * InputStream resources (see setContent methods in classes from package
211  * org.enhydra.oyster.smime) are also swapped with generated Content-ID
212  * value.
213  * @return parsed html/text document.
214  * @exception SMIMEException caused by non SMIMEException which is:
215  * UnsupportedEncodingException.
216  */

217   public String JavaDoc getHtmlText() throws SMIMEException {
218     String JavaDoc returnString;
219
220     Tidy tidy = new Tidy();
221     tidy.setWraplen(1000);
222     ByteArrayOutputStream JavaDoc out = new ByteArrayOutputStream JavaDoc();
223
224     tidy.pprint(doc,out);
225
226     try {
227       returnString = out.toString("ISO-8859-1");
228       out.close();
229     }
230     catch(Exception JavaDoc e) {
231       throw SMIMEException.getInstance(this, e, "getHtmlText" );
232     }
233
234     return returnString;
235   }
236
237
238 /**
239  * Analyzes html code and creates alternative plain/text message from html code.
240  * Also, it creates Vector with corresponding pairs of resource locations discovered
241  * in html code (values of "background" and "src" attributes) and generated
242  * Content-ID values.
243  * @param node0 node element got from JTidy parser.
244  * @exception SMIMEException caused by MimeAssist.generateID() method or by
245  * its private method existenceOfResource().
246  */

247   private void analyze(Node JavaDoc node0) throws SMIMEException {
248
249     if ( node0 == null ) {
250        return;
251     }
252     String JavaDoc brLine = "\r\n";
253     int type = node0.getNodeType();
254
255     boolean pTagEnable_old = true;
256     int indent_old = 0;
257     int olNumber_old = 1;
258     String JavaDoc ul_ol_old = "";
259
260     switch (type) {
261       case Node.DOCUMENT_NODE: // Document node
262
analyze(((Document JavaDoc)node0).getDocumentElement());
263         break;
264
265       case Node.ELEMENT_NODE: // Element node
266
String JavaDoc elName = node0.getNodeName();
267         if (elName.equalsIgnoreCase("br")) {
268           plainText = plainText + brLine;
269           if(indent > 0)
270             plainText = plainText +
271             indentString.substring(0,indent-1);
272         }
273         else if(elName.equalsIgnoreCase("hr")) {
274           plainText = plainText + brLine +
275                       "==================================================" +
276                       brLine;
277         }
278         else if(elName.equalsIgnoreCase("p")) {
279           if(pTagEnable) {
280             plainText = plainText + brLine + brLine;
281             if(indent > 0)
282               plainText = plainText +
283               indentString.substring(0,indent-1);
284           }
285           pTagEnable = true;
286         }
287         else if(elName.equalsIgnoreCase("ul")) {
288           pTagEnable_old = pTagEnable;
289           pTagEnable = false;
290           ul_ol_old = ul_ol;
291           ul_ol = elName;
292           indent_old = indent;
293           indent++;
294         }
295         else if(elName.equalsIgnoreCase("ol")) {
296           pTagEnable_old = pTagEnable;
297           pTagEnable = false;
298           ul_ol_old = ul_ol;
299           ul_ol = elName;
300           indent_old = indent;
301           indent++;
302           olNumber_old = olNumber;
303         }
304         else if(elName.equalsIgnoreCase("li")) {
305           pTagEnable = false;
306           if (ul_ol.equalsIgnoreCase("ul")) {
307             plainText = plainText + brLine +
308             indentString.substring(0,indent-1) +
309             ">> ";
310           }
311           else if (ul_ol.equalsIgnoreCase("ol")) {
312             plainText = plainText + brLine +
313             indentString.substring(0,indent-1) +
314             olNumber + ". ";
315             olNumber++;
316           }
317         }
318         else if(elName.equalsIgnoreCase("blockquote")) {
319           pTagEnable_old = pTagEnable;
320           pTagEnable = false;
321           indent_old = indent;
322           indent++;
323           plainText = plainText + brLine +
324           indentString.substring(0,indent);
325         }
326         else if(elName.equalsIgnoreCase("q")) {
327           pTagEnable_old = pTagEnable;
328           pTagEnable = false;
329           plainText = plainText + "\"";
330         }
331         else if(elName.equalsIgnoreCase("table")) {
332           plainText = plainText + brLine +
333           "**************************************************" + brLine +
334           "--------------------------------------------------" + brLine +
335           "-- -- -- -- -- -- -- -- -- -- -- -- --" + brLine;
336         }
337         else if(elName.equalsIgnoreCase("tr")) {
338           plainText = plainText + brLine;
339         }
340         else if(elName.equalsIgnoreCase("td")) {
341           plainText = plainText + brLine;
342         }
343 // attributes handling
344
NamedNodeMap JavaDoc attrs = node0.getAttributes();
345         for ( int i = 0; i < attrs.getLength(); i++ ) {
346           attrs.item(i).getNodeName().toUpperCase();
347           if( enableSwapping &&
348               ( (attrs.item(i).getNodeName()).equalsIgnoreCase("src") ||
349                 (attrs.item(i).getNodeName()).equalsIgnoreCase("background")) ) {
350             String JavaDoc resource = attrs.item(i).getNodeValue();
351             String JavaDoc cid = null;
352 //*****nnn<virtual_file_name> <-- resources got from byte array input stream
353
if (resource.substring(0,5).equalsIgnoreCase("*****")) {
354               for(int j = 0; j<sourceLinks.size() & cid == null; j=j+2) {
355                 if (sourceLinks.elementAt(j) instanceof String JavaDoc &&
356                    ((String JavaDoc)sourceLinks.elementAt(j)).equals(resource) )
357                   cid = (String JavaDoc)sourceLinks.elementAt(j+1);
358               }
359               if(cid == null) {
360                 cid = MimeAssist.generateID();
361                 sourceLinks.add(resource);
362                 sourceLinks.add(cid);
363               }
364               attrs.item(i).setNodeValue("cid:"+cid);
365             }
366             else {
367               File JavaDoc fRes = existenceOfResource(resource);
368               if (fRes!=null) {
369                 for(int j = 0; j<sourceLinks.size() & cid == null; j=j+2) {
370                   if (sourceLinks.elementAt(j) instanceof File JavaDoc &&
371                      ((File JavaDoc)sourceLinks.elementAt(j)).compareTo(fRes) == 0 )
372                     cid = (String JavaDoc)sourceLinks.elementAt(j+1);
373                 }
374                 if(cid == null) {
375                   cid = MimeAssist.generateID();
376                   sourceLinks.add(fRes);
377                   sourceLinks.add(cid);
378                 }
379                 attrs.item(i).setNodeValue("cid:"+cid);
380               }
381             }
382           }
383         }
384 // finish of opening particular element tag
385
NodeList JavaDoc children = node0.getChildNodes(); //Passing through the node tree
386
if ( children != null ) {
387           int len = children.getLength();
388           for ( int i = 0; i < len; i++ ) {
389              analyze(children.item(i));
390           }
391         }
392 // start of closing particular element tag
393
if(elName.equalsIgnoreCase("ul")) {
394           pTagEnable = pTagEnable_old;
395           ul_ol = ul_ol_old;
396           indent = indent_old;
397         }
398         else if(elName.equalsIgnoreCase("ol")) {
399           pTagEnable = pTagEnable_old;
400           ul_ol = ul_ol_old;
401           indent = indent_old;
402           olNumber = olNumber_old;
403         }
404         else if(elName.equalsIgnoreCase("table")) {
405           plainText = plainText + brLine +
406           "**************************************************";
407         }
408         else if(elName.equalsIgnoreCase("tr")) {
409           plainText = plainText + brLine +
410           "--------------------------------------------------";
411         }
412         else if(elName.equalsIgnoreCase("td")) {
413           plainText = plainText + brLine +
414           "-- -- -- -- -- -- -- -- -- -- -- -- --";
415         }
416         else if(elName.equalsIgnoreCase("blockquote")) {
417           indent = indent_old;
418           pTagEnable = pTagEnable_old;
419         }
420         else if(elName.equalsIgnoreCase("q")) {
421           plainText = plainText + "\"";
422           pTagEnable = pTagEnable_old;
423         }
424
425         break;
426
427       case Node.TEXT_NODE:
428         String JavaDoc nodeVal = node0.getNodeValue();
429         plainText = plainText + nodeVal;
430         break;
431     }
432
433   }
434
435   /**
436    * Analyzes html code withouth generation of alternative plain/text message
437    * from html code. It only creates Vector with corresponding pairs of
438    * resource locations discovered in html code (values of "background" and "src"
439    * attributes), and generated Content-ID values.
440    * @param node0 node element got from JTidy parser.
441    * @exception SMIMEException caused by MimeAssist.generateID() method or by
442    * its private method existenceOfResource().
443    */

444     private void analyzeLight(Node JavaDoc node0) throws SMIMEException {
445
446       if ( node0 == null ) {
447          return;
448       }
449       int type = node0.getNodeType();
450
451       switch (type) {
452         case Node.DOCUMENT_NODE: // Document node
453
analyzeLight(((Document JavaDoc)node0).getDocumentElement());
454           break;
455
456         case Node.ELEMENT_NODE: // Element node
457

458 // attributes handling
459
NamedNodeMap JavaDoc attrs = node0.getAttributes();
460           for ( int i = 0; i < attrs.getLength(); i++ ) {
461             attrs.item(i).getNodeName().toUpperCase();
462             if( enableSwapping &&
463                 ( (attrs.item(i).getNodeName()).equalsIgnoreCase("src") ||
464                   (attrs.item(i).getNodeName()).equalsIgnoreCase("background")) ) {
465               String JavaDoc resource = attrs.item(i).getNodeValue();
466               String JavaDoc cid = null;
467 //*****nnn<virtual_file_name> <-- resources got from byte array input stream
468
if (resource.substring(0,5).equalsIgnoreCase("*****")) {
469                 for(int j = 0; j<sourceLinks.size() & cid == null; j=j+2) {
470                   if (sourceLinks.elementAt(j) instanceof String JavaDoc &&
471                      ((String JavaDoc)sourceLinks.elementAt(j)).equals(resource) )
472                     cid = (String JavaDoc)sourceLinks.elementAt(j+1);
473                 }
474                 if(cid == null) {
475                   cid = MimeAssist.generateID();
476                   sourceLinks.add(resource);
477                   sourceLinks.add(cid);
478                 }
479                 attrs.item(i).setNodeValue("cid:"+cid);
480               }
481               else {
482                 File JavaDoc fRes = existenceOfResource(resource);
483                 if (fRes!=null) {
484                   for(int j = 0; j<sourceLinks.size() & cid == null; j=j+2) {
485                     if (sourceLinks.elementAt(j) instanceof File JavaDoc &&
486                        ((File JavaDoc)sourceLinks.elementAt(j)).compareTo(fRes) == 0 )
487                       cid = (String JavaDoc)sourceLinks.elementAt(j+1);
488                   }
489                   if(cid == null) {
490                     cid = MimeAssist.generateID();
491                     sourceLinks.add(fRes);
492                     sourceLinks.add(cid);
493                   }
494                   attrs.item(i).setNodeValue("cid:"+cid);
495                 }
496               }
497             }
498           }
499 // finish of opening particular element tag
500
NodeList JavaDoc children = node0.getChildNodes(); //Passing through the node tree
501
if ( children != null ) {
502             int len = children.getLength();
503             for ( int i = 0; i < len; i++ ) {
504                analyzeLight(children.item(i));
505             }
506           }
507           break;
508
509         case Node.TEXT_NODE:
510           break;
511       }
512
513     }
514
515
516
517 /**
518  * Method checks if it is given a resource reachable in the destination file system.
519  * @param resource0 can be absolute or relative path with specified file name
520  * or adress of file in URL form (example "file:///c:/temp/example.gif" )
521  * @return object of class File which represents existance of the resource file
522  * or null if resource does not exist on the destination in file system.
523  * @SMIMEException caused by non SMIMEException which is IOException.
524  */

525   private File JavaDoc existenceOfResource(String JavaDoc resource0) throws SMIMEException {
526
527     boolean resourceIsUrl = true;
528     String JavaDoc resource = new String JavaDoc(resource0);
529     URL JavaDoc url = null;
530
531     try {
532       url = new URL JavaDoc(resource0);
533     }
534     catch(MalformedURLException JavaDoc e) {
535       resourceIsUrl = false;
536     }
537
538     if( resourceIsUrl == true && (!url.getProtocol().equalsIgnoreCase("file")) )
539       return null;
540     else if( resourceIsUrl == true && url.getProtocol().equalsIgnoreCase("file") ) {
541       resource = url.getFile();
542     }
543
544     resource= replaceHex(resource);
545     resource = resource.replace('/', File.separatorChar);
546     resource = resource.replace('\\', File.separatorChar);
547     File JavaDoc fRes = new File JavaDoc(resource);
548
549     try {
550       if(fRes.exists())
551         return fRes.getAbsoluteFile().getCanonicalFile();
552
553       fRes = new File JavaDoc(absolutPath + resource);
554       if(fRes.exists())
555         return fRes.getAbsoluteFile().getCanonicalFile();
556
557       fRes = new File JavaDoc(absolutPath + resource);
558       if(fRes.exists())
559         return fRes.getAbsoluteFile().getCanonicalFile();
560     }
561     catch(Exception JavaDoc e) {
562       throw SMIMEException.getInstance(this, e, "existenceOfResource");
563     }
564
565     return null;
566   }
567
568 /**
569  * Replaces possible hexadecimal representation of blank characters (presented
570  * with "%20") from resource String representation, with blank character.
571  * @param resources0 resource which is examined for hex representation of blank
572  * characters.
573  * @return String with replaced hexadecimal representation of blank characters.
574  */

575   private String JavaDoc replaceHex(String JavaDoc resources0) {
576     while(resources0.indexOf("%20")!=-1) {
577       resources0 = resources0.substring(0, resources0.indexOf("%20")) + " " +
578                    resources0.substring(resources0.indexOf("%20")+3);
579     }
580     return resources0;
581   }
582
583
584 }
Popular Tags