HtmlAnalyzer


1   /**
2    * Title:        Oyster Project
3    * Description:  Creating S/MIME email transport capabilities.
4    * Copyright:    Copyright (c) 2001
5    * @Author       Vladimir Radisic
6    * @Version      2.1.5
7    */
8   
9   package org.enhydra.oyster.util;
10  
11  import org.enhydra.oyster.exception.SMIMEException;
12  import java.util.Vector  ;
13  import java.net.MalformedURLException  ;
14  import java.net.URL  ;
15  import java.io.InputStream  ;
16  import java.io.ByteArrayOutputStream  ;
17  import java.io.File  ;
18  import org.w3c.dom.Attr  ;
19  import org.w3c.dom.Document  ;
20  import org.w3c.dom.NamedNodeMap  ;
21  import org.w3c.dom.Node  ;
22  import org.w3c.dom.NodeList  ;
23  import org.w3c.tidy.Tidy;
24  
25  
26  /**
27   * HtmlAnalyzer class is used for parsing html code which has to become content
28   * of the message. For parsing is used JTidy parser. As result of parsing, DOM
29   * (Document Object Model) structure is obtained. It is tree-like construction
30   * with nodes and hierarchical structures that descripts input html code. This
31   * structure is easy for browsing and searching for specific html elements and
32   * attributes. By using DOM, all references to resources (image, movie, sound... ),
33   * defined in "src" and "background" attributes, are explored and swapped with
34   * generated unique Content-ID values which are necessary in forming
35   * "multipart/related" MimeMultipart object.<BR>
36   * <BR>
37   * DOM, generated inside of the object of this class, is also used in the process of
38   * generation plain/text message based on, and derived from the given html code.
39   * This plain text is later used in creation of "multipart/alternative"
40   * MimeMultipart object.
41   */
42  public class HtmlAnalyzer {
43  
44  /**
45   * plain/text representation of page
46   */
47    private String   plainText = "";
48  
49  /**
50   * Enable/disable p tag in text/html to text/plain conversion.
51   */
52    private boolean pTagEnable = true;
53  
54  /**
55   * Path to html file or prefix path to the embeded resource's adresses in
56   * html code (for example for "src" attribute of IMG tag). Can be null which
57   * means that prefix won't be added to resources location in the process of
58   * searching for specific adress attributes given in html code.
59   */
60    private String   absolutPath = null;
61  
62  /**
63   * Container for parsed html document in DOM (Document Object Model)
64   * representation.
65   */
66    private Document   doc;
67  
68  /**
69   * Indent from left margin pointer. This information is used in the process of
70   * generation plain text message based on html code.
71   */
72    private int indent = 0;
73  
74  /**
75   * Current sequential number of OL (ordered list) html element. This information
76   * is used in the process of generation plain text message based on html code.
77   */
78    private int olNumber = 1;
79  
80  /**
81   * Current html element is OL (ordered list), UN (unordered list) or something
82   * else. This information is used in the process of generation plain text message based
83   * on html code.
84   */
85    private String   ul_ol = "";
86  
87  /**
88   * Constant used in generating indent from left side. This information is used in
89   * the process of generation plain text message based on html code.
90   */
91    private final String   indentString =
92          "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t";
93  /**
94   * Container for storing pairs of replaced url or file addresses and
95   * corresponding generated Content-ID values.
96   */
97    private Vector   sourceLinks = new Vector  (0,1);
98  
99  /**
100  * Enable/disable swapping resource references in html code with generated
101  * value for Content-ID message bodypart header line. Default value is true
102  * (enable swapping)
103  */
104  private boolean enableSwapping = true;
105 
106 /**
107  * Constructs HtmlAnalyzer from data given from InputStream. This constructor
108  * parses html code from input stream withouth swaping resources' locations from
109  * atribute's "src" and "background" value with generated Content-ID values. Also,
110  * depending on second argument, it is performed generation of plain text message
111  * based on html code. If second argument is null then autogeneration of plain
112  * text message is performed. Otherwise, second String argument is used as
113  * external prepared plain text message.
114  * @param content0 html code given as InputStream
115  * @param externalPlainText0 external plain text message represented as String.
116  * If this argument has value null then autogeneration of text plain message is
117  * performed according to passed html code via content0 argument.
118  * @exception SMIMEException caused by its private method analyze().
119  */
120   public HtmlAnalyzer(InputStream   content0, String   externalPlainText0) throws SMIMEException
121   {
122     Tidy tidy = new Tidy();
123     tidy.setWraplen(1000);
124     tidy.setShowWarnings(false);
125     tidy.setUpperCaseTags(true);
126     doc = (tidy.parseDOM(content0, null));
127     enableSwapping = false;
128     if(externalPlainText0 == null) {
129       analyze(doc);
130       plainText = plainText + "\r\n";
131     }
132     else {
133       analyzeLight(doc);
134       plainText = new String  (externalPlainText0);
135       plainText = plainText + "\r\n";
136     }
137   }
138 
139 
140 /**
141  * Constructs HtmlAnalyzer from data given from InputStream. This constructor
142  * parses html code from input stream with swaping resources' locations from
143  * atribute's "src and "background" value with generated Content-ID values. In
144  * that process, it is used given second paremeter "path0" which represents
145  * common path to all resources in html code with relative path adresses.  Also,
146  * depended on third argument, it is performed generation of plain text message
147  * based on html code. If third argument is null then autogeneration of plain
148  * text message is performed. Otherwise, third String argument is used as
149  * external prepared plain text message.
150  * @param content0 html code given as InputStream.
151  * @param path0 common path used for resolving all resources in html code with
152  * relative path adresses.
153  * @param externalPlainText0 external plain text message represented as String.
154  * If this argument has value null then autogeneration of text plain message is
155  * performed according to passed html code via content0 argument.
156  * @exception SMIMEException caused by its private method analyze().
157  */
158   public HtmlAnalyzer(InputStream   content0, String   path0, String   externalPlainText0)
159       throws SMIMEException
160   {
161     if(path0 != null) {
162       absolutPath = new String  (path0);
163       if(absolutPath.charAt( absolutPath.length()-1) == '\\' ||
164          absolutPath.charAt( absolutPath.length()-1) == '/' )
165          absolutPath = absolutPath.substring(0,absolutPath.length()-1);
166 
167       absolutPath = absolutPath.replace('/', File.separatorChar);
168       absolutPath = absolutPath.replace('\\', File.separatorChar) + File.separator;
169     }
170 
171     Tidy tidy = new Tidy();
172     tidy.setWraplen(1000);
173     tidy.setShowWarnings(false);
174     tidy.setUpperCaseTags(true);
175     doc = (tidy.parseDOM(content0, null));
176     if(externalPlainText0 == null) {
177       analyze(doc);
178       plainText = plainText + "\r\n";
179     }
180     else {
181       analyzeLight(doc);
182       plainText = new String  (externalPlainText0);
183       plainText = plainText + "\r\n";
184     }
185   }
186 
187 /**
188  * Returns pairs of swapped resource URL adresses or File paths and appropriate
189  * generated Content IDs.
190  * @return Vector object whose even (and 0) indexes contain resource addresses
191  * as File or String objects, and whose odd indexes contain appropriate
192  * swapped Content-ID values.
193  */
194   public Vector   getSwappedAdresses() {
195     return sourceLinks;
196   }
197 
198 /**
199  * Returns plain/text representation of given html code document
200  * @return html document transformed to plain/text.
201  */
202   public String   getPlainText() {
203     return plainText;
204   }
205 
206 /**
207  * Returns html/text document passed throught JTidy html parser. All resource
208  * references which were accessible on the file system are swapped with
209  * generated content ID value. Also, all virtual references to appropriate
210  * InputStream resources (see setContent methods in classes from package
211  * org.enhydra.oyster.smime) are also swapped with generated Content-ID
212  * value.
213  * @return parsed html/text document.
214  * @exception SMIMEException caused by non SMIMEException which is:
215  * UnsupportedEncodingException.
216  */
217   public String   getHtmlText() throws SMIMEException {
218     String   returnString;
219 
220     Tidy tidy = new Tidy();
221     tidy.setWraplen(1000);
222     ByteArrayOutputStream   out = new ByteArrayOutputStream  ();
223 
224     tidy.pprint(doc,out);
225 
226     try {
227       returnString = out.toString("ISO-8859-1");
228       out.close();
229     }
230     catch(Exception   e) {
231       throw SMIMEException.getInstance(this, e, "getHtmlText" );
232     }
233 
234     return returnString;
235   }
236 
237 
238 /**
239  * Analyzes html code and creates alternative plain/text message from html code.
240  * Also, it creates Vector with corresponding pairs of resource locations discovered
241  * in html code (values of "background" and "src" attributes) and generated
242  * Content-ID values.
243  * @param node0 node element got from JTidy parser.
244  * @exception SMIMEException caused by MimeAssist.generateID() method or by
245  * its private method existenceOfResource().
246  */
247   private void analyze(Node   node0) throws SMIMEException {
248 
249     if ( node0 == null ) {
250        return;
251     }
252     String   brLine = "\r\n";
253     int type = node0.getNodeType();
254 
255     boolean pTagEnable_old = true;
256     int indent_old = 0;
257     int olNumber_old = 1;
258     String   ul_ol_old = "";
259 
260     switch (type) {
261       case Node.DOCUMENT_NODE: // Document node
262         analyze(((Document  )node0).getDocumentElement());
263         break;
264 
265       case Node.ELEMENT_NODE: // Element node
266         String   elName = node0.getNodeName();
267         if (elName.equalsIgnoreCase("br")) {
268           plainText = plainText + brLine;
269           if(indent > 0)
270             plainText = plainText +
271             indentString.substring(0,indent-1);
272         }
273         else if(elName.equalsIgnoreCase("hr")) {
274           plainText = plainText + brLine +
275                       "==================================================" +
276                       brLine;
277         }
278         else if(elName.equalsIgnoreCase("p")) {
279           if(pTagEnable) {
280             plainText = plainText + brLine + brLine;
281             if(indent > 0)
282               plainText = plainText +
283               indentString.substring(0,indent-1);
284           }
285           pTagEnable = true;
286         }
287         else if(elName.equalsIgnoreCase("ul")) {
288           pTagEnable_old = pTagEnable;
289           pTagEnable = false;
290           ul_ol_old = ul_ol;
291           ul_ol = elName;
292           indent_old = indent;
293           indent++;
294         }
295         else if(elName.equalsIgnoreCase("ol")) {
296           pTagEnable_old = pTagEnable;
297           pTagEnable = false;
298           ul_ol_old = ul_ol;
299           ul_ol = elName;
300           indent_old = indent;
301           indent++;
302           olNumber_old = olNumber;
303         }
304         else if(elName.equalsIgnoreCase("li")) {
305           pTagEnable = false;
306           if (ul_ol.equalsIgnoreCase("ul")) {
307             plainText = plainText + brLine +
308             indentString.substring(0,indent-1) +
309             ">> ";
310           }
311           else if (ul_ol.equalsIgnoreCase("ol")) {
312             plainText = plainText + brLine +
313             indentString.substring(0,indent-1) +
314             olNumber + ". ";
315             olNumber++;
316           }
317         }
318         else if(elName.equalsIgnoreCase("blockquote")) {
319           pTagEnable_old = pTagEnable;
320           pTagEnable = false;
321           indent_old = indent;
322           indent++;
323           plainText = plainText + brLine +
324           indentString.substring(0,indent);
325         }
326         else if(elName.equalsIgnoreCase("q")) {
327           pTagEnable_old = pTagEnable;
328           pTagEnable = false;
329           plainText = plainText + "\"";
330         }
331         else if(elName.equalsIgnoreCase("table")) {
332           plainText = plainText + brLine +
333           "**************************************************" + brLine +
334           "--------------------------------------------------" + brLine +
335           "--  --  --  --  --  --  --  --  --  --  --  --  --" + brLine;
336         }
337         else if(elName.equalsIgnoreCase("tr")) {
338           plainText = plainText + brLine;
339         }
340         else if(elName.equalsIgnoreCase("td")) {
341           plainText = plainText + brLine;
342         }
343 // attributes handling
344         NamedNodeMap   attrs = node0.getAttributes();
345         for ( int i = 0; i < attrs.getLength(); i++ ) {
346           attrs.item(i).getNodeName().toUpperCase();
347           if( enableSwapping &&
348               ( (attrs.item(i).getNodeName()).equalsIgnoreCase("src") ||
349                 (attrs.item(i).getNodeName()).equalsIgnoreCase("background")) ) {
350             String   resource = attrs.item(i).getNodeValue();
351             String   cid = null;
352 //*****nnn<virtual_file_name>   <-- resources got from byte array input stream
353             if (resource.substring(0,5).equalsIgnoreCase("*****")) {
354               for(int j = 0; j<sourceLinks.size() & cid == null; j=j+2) {
355                 if (sourceLinks.elementAt(j) instanceof String   &&
356                    ((String  )sourceLinks.elementAt(j)).equals(resource) )
357                   cid = (String  )sourceLinks.elementAt(j+1);
358               }
359               if(cid == null) {
360                 cid = MimeAssist.generateID();
361                 sourceLinks.add(resource);
362                 sourceLinks.add(cid);
363               }
364               attrs.item(i).setNodeValue("cid:"+cid);
365             }
366             else {
367               File   fRes = existenceOfResource(resource);
368               if (fRes!=null) {
369                 for(int j = 0; j<sourceLinks.size() & cid == null; j=j+2) {
370                   if (sourceLinks.elementAt(j) instanceof File   &&
371                      ((File  )sourceLinks.elementAt(j)).compareTo(fRes) == 0 )
372                     cid = (String  )sourceLinks.elementAt(j+1);
373                 }
374                 if(cid == null) {
375                   cid = MimeAssist.generateID();
376                   sourceLinks.add(fRes);
377                   sourceLinks.add(cid);
378                 }
379                 attrs.item(i).setNodeValue("cid:"+cid);
380               }
381             }
382           }
383         }
384 // finish of opening particular element tag
385         NodeList   children = node0.getChildNodes(); //Passing through the node tree
386         if ( children != null ) {
387           int len = children.getLength();
388           for ( int i = 0; i < len; i++ ) {
389              analyze(children.item(i));
390           }
391         }
392 // start of closing particular element tag
393         if(elName.equalsIgnoreCase("ul")) {
394           pTagEnable = pTagEnable_old;
395           ul_ol = ul_ol_old;
396           indent = indent_old;
397         }
398         else if(elName.equalsIgnoreCase("ol")) {
399           pTagEnable = pTagEnable_old;
400           ul_ol = ul_ol_old;
401           indent = indent_old;
402           olNumber = olNumber_old;
403         }
404         else if(elName.equalsIgnoreCase("table")) {
405           plainText = plainText + brLine +
406           "**************************************************";
407         }
408         else if(elName.equalsIgnoreCase("tr")) {
409           plainText = plainText + brLine +
410           "--------------------------------------------------";
411         }
412         else if(elName.equalsIgnoreCase("td")) {
413           plainText = plainText + brLine +
414           "--  --  --  --  --  --  --  --  --  --  --  --  --";
415         }
416         else if(elName.equalsIgnoreCase("blockquote")) {
417           indent = indent_old;
418           pTagEnable = pTagEnable_old;
419         }
420         else if(elName.equalsIgnoreCase("q")) {
421           plainText = plainText + "\"";
422           pTagEnable = pTagEnable_old;
423         }
424 
425         break;
426 
427       case Node.TEXT_NODE:
428         String   nodeVal = node0.getNodeValue();
429         plainText = plainText + nodeVal;
430         break;
431     }
432 
433   }
434 
435   /**
436    * Analyzes html code withouth generation of alternative plain/text message
437    * from html code. It only creates Vector with corresponding pairs of
438    * resource locations discovered in html code (values of "background" and "src"
439    * attributes), and generated Content-ID values.
440    * @param node0 node element got from JTidy parser.
441    * @exception SMIMEException caused by MimeAssist.generateID() method or by
442    * its private method existenceOfResource().
443    */
444     private void analyzeLight(Node   node0) throws SMIMEException {
445 
446       if ( node0 == null ) {
447          return;
448       }
449       int type = node0.getNodeType();
450 
451       switch (type) {
452         case Node.DOCUMENT_NODE: // Document node
453           analyzeLight(((Document  )node0).getDocumentElement());
454           break;
455 
456         case Node.ELEMENT_NODE: // Element node
457 
458 // attributes handling
459           NamedNodeMap   attrs = node0.getAttributes();
460           for ( int i = 0; i < attrs.getLength(); i++ ) {
461             attrs.item(i).getNodeName().toUpperCase();
462             if( enableSwapping &&
463                 ( (attrs.item(i).getNodeName()).equalsIgnoreCase("src") ||
464                   (attrs.item(i).getNodeName()).equalsIgnoreCase("background")) ) {
465               String   resource = attrs.item(i).getNodeValue();
466               String   cid = null;
467 //*****nnn<virtual_file_name>   <-- resources got from byte array input stream
468               if (resource.substring(0,5).equalsIgnoreCase("*****")) {
469                 for(int j = 0; j<sourceLinks.size() & cid == null; j=j+2) {
470                   if (sourceLinks.elementAt(j) instanceof String   &&
471                      ((String  )sourceLinks.elementAt(j)).equals(resource) )
472                     cid = (String  )sourceLinks.elementAt(j+1);
473                 }
474                 if(cid == null) {
475                   cid = MimeAssist.generateID();
476                   sourceLinks.add(resource);
477                   sourceLinks.add(cid);
478                 }
479                 attrs.item(i).setNodeValue("cid:"+cid);
480               }
481               else {
482                 File   fRes = existenceOfResource(resource);
483                 if (fRes!=null) {
484                   for(int j = 0; j<sourceLinks.size() & cid == null; j=j+2) {
485                     if (sourceLinks.elementAt(j) instanceof File   &&
486                        ((File  )sourceLinks.elementAt(j)).compareTo(fRes) == 0 )
487                       cid = (String  )sourceLinks.elementAt(j+1);
488                   }
489                   if(cid == null) {
490                     cid = MimeAssist.generateID();
491                     sourceLinks.add(fRes);
492                     sourceLinks.add(cid);
493                   }
494                   attrs.item(i).setNodeValue("cid:"+cid);
495                 }
496               }
497             }
498           }
499 // finish of opening particular element tag
500           NodeList   children = node0.getChildNodes(); //Passing through the node tree
501           if ( children != null ) {
502             int len = children.getLength();
503             for ( int i = 0; i < len; i++ ) {
504                analyzeLight(children.item(i));
505             }
506           }
507           break;
508 
509         case Node.TEXT_NODE:
510           break;
511       }
512 
513     }
514 
515 
516 
517 /**
518  * Method checks if it is given a resource reachable in the destination file system.
519  * @param resource0 can be absolute or relative path with specified file name
520  * or adress of file in URL form (example "file:///c:/temp/example.gif" )
521  * @return object of class File which represents existance of the resource file
522  * or null if resource does not exist on the destination in file system.
523  * @SMIMEException caused by non SMIMEException which is IOException.
524  */
525   private File   existenceOfResource(String   resource0) throws SMIMEException {
526 
527     boolean resourceIsUrl = true;
528     String   resource = new String  (resource0);
529     URL   url = null;
530 
531     try {
532       url = new URL  (resource0);
533     }
534     catch(MalformedURLException   e) {
535       resourceIsUrl = false;
536     }
537 
538     if( resourceIsUrl == true && (!url.getProtocol().equalsIgnoreCase("file")) )
539       return null;
540     else if( resourceIsUrl == true && url.getProtocol().equalsIgnoreCase("file") ) {
541       resource = url.getFile();
542     }
543 
544     resource= replaceHex(resource);
545     resource = resource.replace('/', File.separatorChar);
546     resource = resource.replace('\\', File.separatorChar);
547     File   fRes = new File  (resource);
548 
549     try {
550       if(fRes.exists())
551         return fRes.getAbsoluteFile().getCanonicalFile();
552 
553       fRes = new File  (absolutPath + resource);
554       if(fRes.exists())
555         return fRes.getAbsoluteFile().getCanonicalFile();
556 
557       fRes = new File  (absolutPath + resource);
558       if(fRes.exists())
559         return fRes.getAbsoluteFile().getCanonicalFile();
560     }
561     catch(Exception   e) {
562       throw SMIMEException.getInstance(this, e, "existenceOfResource");
563     }
564 
565     return null;
566   }
567 
568 /**
569  * Replaces possible hexadecimal representation of blank characters (presented
570  * with "%20") from resource String representation, with blank character.
571  * @param resources0 resource which is examined for hex representation of blank
572  * characters.
573  * @return String with replaced hexadecimal representation of blank characters.
574  */
575   private String   replaceHex(String   resources0) {
576     while(resources0.indexOf("%20")!=-1) {
577       resources0 = resources0.substring(0, resources0.indexOf("%20")) + " " +
578                    resources0.substring(resources0.indexOf("%20")+3);
579     }
580     return resources0;
581   }
582 
583 
584 }
A to Z: JavaDoc & Examples Daily Java News & Articles Open Source Projects Open Source Codes Free Computer Books Remove Frame
Popular Tags