CmsHtmlImportConverter


1   /*
2    * File   :
3    * Date   : 
4    * Version: 
5    *
6    * This library is part of OpenCms -
7    * the Open Source Content Management System
8    *
9    * Copyright (c) 2005 Alkacon Software GmbH (http://www.alkacon.com)
10   *
11   * This library is free software; you can redistribute it and/or
12   * modify it under the terms of the GNU Lesser General Public
13   * License as published by the Free Software Foundation; either
14   * version 2.1 of the License, or (at your option) any later version.
15   *
16   * This library is distributed in the hope that it will be useful,
17   * but WITHOUT ANY WARRANTY; without even the implied warranty of
18   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19   * Lesser General Public License for more details.
20   *
21   * For further information about Alkacon Software GmbH, please see the
22   * company website: http://www.alkacon.com
23   *
24   * For further information about OpenCms, please see the
25   * project website: http://www.opencms.org
26   * 
27   * You should have received a copy of the GNU Lesser General Public
28   * License along with this library; if not, write to the Free Software
29   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
30   */
31  
32  package org.opencms.workplace.tools.database;
33  
34  import org.opencms.file.CmsPropertyDefinition;
35  import org.opencms.i18n.CmsEncoder;
36  import org.opencms.main.CmsLog;
37  import org.opencms.util.CmsStringUtil;
38  
39  import java.io.ByteArrayInputStream  ;
40  import java.io.ByteArrayOutputStream  ;
41  import java.io.IOException  ;
42  import java.io.InputStream  ;
43  import java.io.PrintWriter  ;
44  import java.io.Reader  ;
45  import java.io.StringReader  ;
46  import java.io.StringWriter  ;
47  import java.io.UnsupportedEncodingException  ;
48  import java.io.Writer  ;
49  import java.util.HashSet  ;
50  import java.util.Hashtable  ;
51  import java.util.StringTokenizer  ;
52  import java.util.regex.Matcher  ;
53  import java.util.regex.Pattern  ;
54  
55  import org.w3c.dom.Document  ;
56  import org.w3c.dom.NamedNodeMap  ;
57  import org.w3c.dom.Node  ;
58  import org.w3c.dom.NodeList  ;
59  import org.w3c.tidy.Tidy;
60  
61  /**
62   * This class implements Html-converting routines based on tidy to modify the
63   * Html code of the imported Html pages.<p>
64   * 
65   * @author Michael Emmerich 
66   * 
67   * @version $Revision: 1.10 $ 
68   * 
69   * @since 6.0.0 
70   */
71  public class CmsHtmlImportConverter {
72  
73      /** defintition of the alt attribute. */
74      private static final String   ATTRIB_ALT = "alt";
75  
76      /** defintition of the content attribute. */
77      private static final String   ATTRIB_CONTENT = "content";
78  
79      /** defintition of the href attribute.  */
80      private static final String   ATTRIB_HREF = "href";
81  
82      /** defintition of the name attribute. */
83      private static final String   ATTRIB_NAME = "name";
84  
85      /** defintition of the src attribute. */
86      private static final String   ATTRIB_SRC = "src";
87  
88      /** defintition of the &lt;BODY&gt;&lt;/BODY&gt; node. */
89      private static final String   NODE_BODY = "body";
90  
91      /** defintition of the &lt;HEAD&gt;&lt;/HEAD&gt; node. */
92      private static final String   NODE_HEAD = "head";
93  
94      /** defintition of the &lt;A&gt;&lt;/A&gt; node. */
95      private static final String   NODE_HREF = "a";
96  
97      /** defintition of the &lt;HTML&gt;&lt;/HTML&gt; node. */
98      private static final String   NODE_HTML = "html";
99  
100     /** defintition of the &lt;IMG&gt;&lt;/IMG&gt; node. */
101     private static final String   NODE_IMG = "img";
102 
103     /** defintition of the &lt;META&gt;&lt;/META&gt; node. */
104     private static final String   NODE_META = "meta";
105 
106     /** defintition of the &lt;TITLE&gt;&lt;/TITLE&gt;  node. */
107     private static final String   NODE_TITLE = "title";
108 
109     /**
110      * HashMap stores tag names, after the end-tag, a "\n" is added to the output.<p>
111      */
112     private HashSet   m_enterTags = new HashSet  ();
113 
114     /** 
115      * the absolute path in the real filesystem of the file to convert. 
116      */
117     private String   m_filename;
118 
119     /**
120      * reference to the HtmlImport object, required to access the link translation.
121      */
122     private CmsHtmlImport m_htmlImport;
123 
124     /** 
125      * temporary buffer used in transformation method. 
126      */
127     private StringBuffer   m_tempString;
128 
129     /** instance of JTidy. */
130     private Tidy m_tidy = new Tidy();
131 
132     /** flag to write the output. */
133     private boolean m_write;
134 
/**
 * Creates a new HtmlConverter and configures its JTidy instance.<p>
 *
 * JTidy is set up to emit no tidy mark and no warnings, to run quietly and
 * to force output; in XML mode the XML tag and XML space options are
 * switched on in addition.<p>
 *
 * @param htmlImport reference to the htmlimport
 * @param xmlMode switch for setting the import to HTML or XML mode
 */
public CmsHtmlImportConverter(CmsHtmlImport htmlImport, boolean xmlMode) {

    // general JTidy behaviour, identical for both modes
    m_tidy.setTidyMark(false);
    m_tidy.setShowWarnings(false);
    m_tidy.setQuiet(true);
    m_tidy.setForceOutput(true);

    // additional options that only apply to the XML mode
    if (xmlMode) {
        m_tidy.setXmlTags(true);
        m_tidy.setXmlSpace(true);
    }

    // fill the set of tags that get a line break after their end tag
    initialiseTags();

    m_htmlImport = htmlImport;
}
156 
157     /**
158      * Extracts the content of a HTML page.<p>
159      * 
160      * This method should be pretty robust and work even if the input HTML does not contains
161      * the specified matchers.<p> 
162      * 
163      * @param content the content to extract the body from
164      * @param startpoint the point where matching starts
165      * @param endpoint the point where matching ends
166      * @return the extracted body tag content
167      */
168     public static String   extractHtml(String   content, String   startpoint, String   endpoint) {
169 
170         /** Regex that matches a start body tag. */
171         Pattern   startPattern = Pattern.compile(startpoint, Pattern.CASE_INSENSITIVE);
172 
173         /** Regex that matches an end body tag. */
174         Pattern   endPattern = Pattern.compile(endpoint, Pattern.CASE_INSENSITIVE);
175 
176         Matcher   startMatcher = startPattern.matcher(content);
177         Matcher   endMatcher = endPattern.matcher(content);
178 
179         int start = 0;
180         int end = content.length();
181 
182         if (startMatcher.find()) {
183             start = startMatcher.end();
184         }
185 
186         if (endMatcher.find(start)) {
187             end = endMatcher.start();
188         }
189 
190         return content.substring(start, end);
191     }
192 
193     /**
194      * Transforms HTML code into user defined output.<p>
195      * 
196      * @param input Reader with HTML code
197      * @param output Writer with transformed code
198      * @param startPattern the start pattern definition for content extracting
199      * @param endPattern the end pattern definition for content extracting 
200      * @param properties the file properties
201      */
202     public void convertHTML(Reader   input, Writer   output, String   startPattern, String   endPattern, Hashtable   properties) {
203 
204         /* local variables */
205         StringBuffer   htmlString = new StringBuffer  ();
206         Node   node;
207         String   outString = "";
208 
209         try {
210             /* write InputStream input in StringBuffer htmlString */
211             int c;
212             while ((c = input.read()) != -1) {
213                 htmlString.append((char)c);
214             }
215         } catch (IOException   e) {
216             if (CmsLog.INIT.isWarnEnabled()) {
217                 CmsLog.INIT.warn(Messages.get().getBundle().key(Messages.LOG_HTMLIMPORT_CONVERSION_ERROR_0, e.getLocalizedMessage()));
218             }
219             return;
220         }
221         outString = htmlString.toString();
222         // extract from html if even both patterns are defined
223         if (CmsStringUtil.isNotEmpty(startPattern) && CmsStringUtil.isNotEmpty(endPattern)) {
224             String   extractMain = extractHtml(outString, startPattern, endPattern);
225             if (extractMain.length() != outString.length()) {
226                 String   extractHead = extractHtml(outString, "<html>", CmsStringUtil.BODY_START_REGEX);
227                 //String extractHead = extractHtml(extractMain, "<html>", CmsStringUtil.C_BODY_START_REGEX);     
228                 StringBuffer   buffer = new StringBuffer  (extractHead.length() + extractMain.length() + 255);
229                 buffer.append("<html>");
230                 buffer.append(extractHead);
231                 buffer.append("<body>");
232                 buffer.append(extractMain);
233                 buffer.append("</body></html>");
234                 outString = buffer.toString();
235             }
236         }
237 
238         /* convert htmlString in InputStream for parseDOM */
239         InputStream   in;
240         try {
241             in = new ByteArrayInputStream  (outString.getBytes(CmsEncoder.ENCODING_UTF_8));
242         } catch (UnsupportedEncodingException   e) {
243             // this should never happen since UTF-8 is always supported
244             in = new ByteArrayInputStream  (outString.getBytes());
245         }
246         m_tidy.setInputEncoding(CmsEncoder.ENCODING_UTF_8);
247         m_tidy.setOutputEncoding(CmsEncoder.ENCODING_UTF_8);
248 
249         // hold tidy error information into a new PrintWriter Object
250         PrintWriter   errorLog = new PrintWriter  (new ByteArrayOutputStream  (), true);
251         m_tidy.setErrout(errorLog);
252 
253         node = m_tidy.parseDOM(in, null);
254         /* check if html code has errors */
255         if (m_tidy.getParseErrors() != 0) {
256             if (CmsLog.INIT.isWarnEnabled()) {
257                 CmsLog.INIT.warn(Messages.get().getBundle().key(Messages.LOG_HTMLIMPORT_CONVERSION_ERROR_0));
258             }
259         }
260         /* second step: create transformed output with printDocument from DOM */
261         this.printDocument(node, properties);
262 
263         try {
264             String   content = m_tempString.toString();
265             content = CmsStringUtil.substitute(content, "<br></br>", "<br>");
266             content = CmsStringUtil.substitutePerl(content, "</a>(\\w+)", "</a> $1", "g");
267             output.write(content);
268             output.close();
269 
270         } catch (IOException   e) {
271             if (CmsLog.INIT.isWarnEnabled()) {
272                 CmsLog.INIT.warn(Messages.get().getBundle().key(Messages.LOG_HTMLIMPORT_CONVERSION_ERROR_1, e.getLocalizedMessage()));
273             }
274             return;
275         }
276     }
277 
278     /**
279      * Transforms HTML code into user defined output.<p>
280      * 
281      * @param filename the absolute path in the real filesystem of the file to convert
282      * @param inString String with HTML code
283      * @param startPattern the start pattern definition for content extracting
284      * @param endPattern the end pattern definition for content extracting 
285      * @param properties the file properties
286      * @return String with transformed code
287      */
288     public String   convertHTML(
289         String   filename,
290         String   inString,
291         String   startPattern,
292         String   endPattern,
293         Hashtable   properties) {
294 
295         m_tempString = new StringBuffer  ();
296         m_write = true;
297         m_filename = filename.replace('\\', '/');
298         Reader   in = new StringReader  (inString);
299         Writer   out = new StringWriter  ();
300         convertHTML(in, out, startPattern, endPattern, properties);
301         return out.toString();
302     }
303 
304     /**
305      * Initialises Vector m_enterTags with tag names.<p>
306      */
307     private void initialiseTags() {
308 
309         StringTokenizer   T = new StringTokenizer  (
310             "p,table,tr,td,body,head,script,pre,title,style,h1,h2,h3,h4,h5,h6,ul,ol,li",
311             ",");
312         while (T.hasMoreTokens()) {
313             m_enterTags.add(new String  (T.nextToken()));
314         }
315     }
316 
317     /**
318      * Private method to parse DOM and create user defined output.<p>
319      * 
320      * @param node Node of DOM from HTML code
321      * @param properties the file properties
322      */
323     private void printDocument(Node   node, Hashtable   properties) {
324 
325         // if node is empty do nothing... (Recursion)
326         if (node == null) {
327             return;
328         }
329         // initialise local variables
330         int type = node.getNodeType();
331         String   name = node.getNodeName();
332 
333         // detect node type
334         switch (type) {
335             case Node.DOCUMENT_NODE:
336 
337                 this.printDocument(((Document  )node).getDocumentElement(), properties);
338                 break;
339             case Node.ELEMENT_NODE:
340 
341                 // check if its the <head> node. Nothing inside the <head> node
342                 // must be
343                 // part of the output, but we must scan the content of this
344                 // node to get all
345                 // <meta> tags
346                 if (name.equals(NODE_HEAD)) {
347                     m_write = false;
348                 }
349                 // scan element node; if a block has to be removed or replaced,
350                 // break and discard child nodes
351                 transformStartElement(node, properties);
352 
353                 // test if node has children
354                 NodeList   children = node.getChildNodes();
355                 if (children != null) {
356                     int len = children.getLength();
357                     for (int i = 0; i < len; i++) {
358                         // recursively call printDocument with all child nodes
359                         this.printDocument(children.item(i), properties);
360                     }
361                 }
362                 break;
363             case Node.TEXT_NODE:
364 
365                 // replace subStrings in text nodes
366                 transformTextNode(node);
367                 break;
368             default:
369 
370                 break;
371         }
372         // end of recursion, add eventual endtags and suffixes
373         switch (type) {
374             case Node.ELEMENT_NODE:
375                 // analyse endtags and add them to output
376                 transformEndElement(node);
377                 if (node.getNodeName().equals(NODE_HEAD)) {
378                     m_write = true;
379                 }
380                 break;
381             case Node.DOCUMENT_NODE:
382                 break;
383             default:
384                 break;
385         }
386     }
387 
388     /**
389      * Transform element nodes and create end tags in output.<p>
390      * 
391      * @param node actual element node
392      */
393     private void transformEndElement(Node   node) {
394 
395         // check hat kind of node we have
396         String   nodeName = node.getNodeName();
397 
398         // the <HTML> and <BODY> node must be skipped
399         if (nodeName.equals(NODE_HTML) || nodeName.equals(NODE_BODY)) {
400             // do nothing here
401         } else {
402             // only do some output if we are in writing mode
403             if (m_write) {
404                 m_tempString.append("</");
405                 m_tempString.append(nodeName);
406                 m_tempString.append(">");
407 
408                 // append a "\n" to output String if possible
409                 if (m_enterTags.contains(node.getNodeName())) {
410                     m_tempString.append("\n");
411                 }
412             }
413         }
414     }
415 
416     /**
417      * Transforms element nodes and create start tags in output. <p>
418      * 
419      * @param node actual element node
420      * @param properties the file properties
421      */
422     private void transformStartElement(Node   node, Hashtable   properties) {
423 
424         // check hat kind of node we have
425         String   nodeName = node.getNodeName();
426 
427         // the <HTML> and <BODY> node must be skipped
428         if (nodeName.equals(NODE_HTML) || nodeName.equals(NODE_BODY)) {
429             // the <TITLE> node must be read and its value set as properties to
430             // the imported file
431 
432         } else if (nodeName.equals(NODE_TITLE)) {
433 
434             writeTitleProperty(node, properties);
435 
436         } else if (nodeName.equals(NODE_META)) {
437 
438             writeMetaTagProperty(node, properties);
439 
440         } else if (nodeName.equals(NODE_HREF)) {
441 
442             // only do some output if we are in writing mode
443             if (m_write) {
444                 m_tempString.append("<");
445                 m_tempString.append(nodeName);
446                 NamedNodeMap   attrs = node.getAttributes();
447                 // look through all attribs to find the reference
448                 for (int i = attrs.getLength() - 1; i >= 0; i--) {
449                     String   name = attrs.item(i).getNodeName();
450                     String   value = attrs.item(i).getNodeValue();
451 
452                     if (name.equals(ATTRIB_HREF)) {
453 
454                         // check if this is an external link
455                         if (value.indexOf("://") > 0) {
456                             // store it for later creation of an entry in the
457                             // link gallery
458                             String   externalLinkFile = m_htmlImport.storeExternalLink(value);
459                             if (externalLinkFile != null) {
460                                 value = m_htmlImport.getLinkGallery() + externalLinkFile;
461                             }
462                         } else if (!value.startsWith("mailto:") && !value.startsWith("javascript:")) {
463 
464                             // save an existing anchor link for later use
465                             //                            if (value.indexOf("#") > 0) {
466                             //                                String anchor = value.substring(value.indexOf("#"), value.length());
467                             //                            }
468                             // get the new link into the VFS
469                             String   internalUri = m_htmlImport.getAbsoluteUri(value, m_filename.substring(
470                                 0,
471                                 m_filename.lastIndexOf("/") + 1));
472 
473                             value = m_htmlImport.translateLink(internalUri);
474                         }
475                     }
476 
477                     m_tempString.append(" ");
478                     m_tempString.append(name);
479                     m_tempString.append("=\"");
480                     m_tempString.append(value);
481                     m_tempString.append("\"");
482                 }
483                 m_tempString.append(">");
484             }
485 
486             // this is a imasge, its reference must be converted
487         } else if (nodeName.equals(NODE_IMG)) {
488 
489             // only do some output if we are in writing mode
490             if (m_write) {
491                 m_tempString.append("<");
492                 m_tempString.append(nodeName);
493                 NamedNodeMap   attrs = node.getAttributes();
494                 // look through all attribs to find the src and alt attributes
495                 String   imagename = "";
496                 String   altText = "";
497                 for (int i = attrs.getLength() - 1; i >= 0; i--) {
498                     String   name = attrs.item(i).getNodeName();
499                     String   value = attrs.item(i).getNodeValue();
500                     if (name.equals(ATTRIB_SRC)) {
501                         // we found the src. now check if it refers to an
502                         // external image.
503                         // if not, we must get the correct location in the VFS
504                         if (value.indexOf("://") <= 0) {
505                             imagename = m_htmlImport.getAbsoluteUri(value, m_filename.substring(
506                                 0,
507                                 m_filename.lastIndexOf("/") + 1));
508                             value = m_htmlImport.translateLink(imagename);
509                         }
510                     } else if (name.equals(ATTRIB_ALT)) {
511                         altText = value;
512                     }
513 
514                     m_tempString.append(" ");
515                     m_tempString.append(name);
516                     m_tempString.append("=\"");
517                     m_tempString.append(value);
518                     m_tempString.append("\"");
519                 }
520 
521                 //store the alt tag of this image for later use
522                 m_htmlImport.storeImageInfo(imagename, altText);
523 
524                 m_tempString.append(">");
525             }
526         } else {
527 
528             // only do some output if we are in writing mode
529             if (m_write) {
530 
531                 m_tempString.append("<");
532                 m_tempString.append(nodeName);
533                 NamedNodeMap   attrs = node.getAttributes();
534                 for (int i = attrs.getLength() - 1; i >= 0; i--) {
535                     m_tempString.append(" " + attrs.item(i).getNodeName() + "=" + "\"");
536                     /* scan attribute values and replace subStrings */
537                     m_tempString.append(attrs.item(i).getNodeValue() + "\"");
538                 }
539                 m_tempString.append(">");
540             }
541         }
542     }
543 
544     /**
545      * Private method to transform text nodes.<p>
546      * 
547      * @param node actual text node
548      */
549     private void transformTextNode(Node   node) {
550 
551         // only do some output if we are in writing mode
552         if (m_write) {
553             String   helpString = node.getNodeValue();
554             m_tempString.append(helpString);
555         }
556     }
557 
558     /**
559      * Writes meta tags as cms properties by analyzing the meta tags nodes.<p>
560      * 
561      * @param node the meta tag node in html document
562      * @param properties the properties hashtable
563      */
564     private void writeMetaTagProperty(Node   node, Hashtable   properties) {
565 
566         NamedNodeMap   attrs = node.getAttributes();
567         String   metaName = "";
568         String   metaContent = "";
569         // look through all attribs to find the name and content attributes
570         for (int i = attrs.getLength() - 1; i >= 0; i--) {
571             String   name = attrs.item(i).getNodeName();
572             String   value = attrs.item(i).getNodeValue();
573             if (name.equals(ATTRIB_NAME)) {
574                 metaName = value;
575             } else if (name.equals(ATTRIB_CONTENT)) {
576                 metaContent = value;
577             }
578         }
579         // check if we have valid entries for this <META> node, store them
580         // in the properties
581         if (metaName.length() > 0 && metaContent.length() > 0) {
582             properties.put(metaName, CmsStringUtil.substitute(metaContent, "{subst}", "&#"));
583         }
584     }
585 
586     /**
587      * Sets the Property title by analyzing the title node.<p>
588      * 
589      * @param node the title node in html document
590      * @param properties the properties hashtable
591      */
592     private void writeTitleProperty(Node   node, Hashtable   properties) {
593 
594         String   title = "";
595         // the title string is stored in the first child node
596         NodeList   children = node.getChildNodes();
597         if (children != null) {
598             Node   titleNode = children.item(0);
599             if (titleNode != null) {
600                 title = titleNode.getNodeValue();
601             }
602         }
603         // add the title property if we have one
604         if ((title != null) && (title.length() > 0)) {
605 
606             properties.put(CmsPropertyDefinition.PROPERTY_TITLE, CmsStringUtil.substitute(title, "{subst}", "&#"));
607             // the title will be used as navtext if no other navtext is
608             // given
609             if (properties.get(CmsPropertyDefinition.PROPERTY_NAVTEXT) == null) {
610                 properties.put(CmsPropertyDefinition.PROPERTY_NAVTEXT, CmsStringUtil.substitute(title, "{subst}", "&#"));
611             }
612         }
613 
614     }
615 
616 }
617
A to Z: JavaDoc & Examples Daily Java News & Articles Open Source Projects Open Source Codes Free Computer Books Remove Frame
Popular Tags