KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > opencms > workplace > tools > database > CmsHtmlImportConverter


/*
 * File :
 * Date :
 * Version:
 *
 * This library is part of OpenCms -
 * the Open Source Content Management System
 *
 * Copyright (c) 2005 Alkacon Software GmbH (http://www.alkacon.com)
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * For further information about Alkacon Software GmbH, please see the
 * company website: http://www.alkacon.com
 *
 * For further information about OpenCms, please see the
 * project website: http://www.opencms.org
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 */

package org.opencms.workplace.tools.database;

import org.opencms.file.CmsPropertyDefinition;
import org.opencms.i18n.CmsEncoder;
import org.opencms.main.CmsLog;
import org.opencms.util.CmsStringUtil;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.PrintWriter;
import java.io.Reader;
import java.io.StringReader;
import java.io.StringWriter;
import java.io.UnsupportedEncodingException;
import java.io.Writer;
import java.util.HashSet;
import java.util.Hashtable;
import java.util.Set;
import java.util.StringTokenizer;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.w3c.dom.Document;
import org.w3c.dom.NamedNodeMap;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.w3c.tidy.Tidy;

61 /**
62  * This class implements Html-converting routines based on tidy to modify the
63  * Html code of the imported Html pages.<p>
64  *
65  * @author Michael Emmerich
66  *
67  * @version $Revision: 1.10 $
68  *
69  * @since 6.0.0
70  */

71 public class CmsHtmlImportConverter {
72
73     /** defintition of the alt attribute. */
74     private static final String JavaDoc ATTRIB_ALT = "alt";
75
76     /** defintition of the content attribute. */
77     private static final String JavaDoc ATTRIB_CONTENT = "content";
78
79     /** defintition of the href attribute. */
80     private static final String JavaDoc ATTRIB_HREF = "href";
81
82     /** defintition of the name attribute. */
83     private static final String JavaDoc ATTRIB_NAME = "name";
84
85     /** defintition of the src attribute. */
86     private static final String JavaDoc ATTRIB_SRC = "src";
87
88     /** defintition of the &lt;BODY&gt;&lt;/BODY&gt; node. */
89     private static final String JavaDoc NODE_BODY = "body";
90
91     /** defintition of the &lt;HEAD&gt;&lt;/HEAD&gt; node. */
92     private static final String JavaDoc NODE_HEAD = "head";
93
94     /** defintition of the &lt;A&gt;&lt;/A&gt; node. */
95     private static final String JavaDoc NODE_HREF = "a";
96
97     /** defintition of the &lt;HTML&gt;&lt;/HTML&gt; node. */
98     private static final String JavaDoc NODE_HTML = "html";
99
100     /** defintition of the &lt;IMG&gt;&lt;/IMG&gt; node. */
101     private static final String JavaDoc NODE_IMG = "img";
102
103     /** defintition of the &lt;META&gt;&lt;/META&gt; node. */
104     private static final String JavaDoc NODE_META = "meta";
105
106     /** defintition of the &lt;TITLE&gt;&lt;/TITLE&gt; node. */
107     private static final String JavaDoc NODE_TITLE = "title";
108
109     /**
110      * HashMap stores tag names, after the end-tag, a "\n" is added to the output.<p>
111      */

112     private HashSet JavaDoc m_enterTags = new HashSet JavaDoc();
113
114     /**
115      * the absolute path in the real filesystem of the file to convert.
116      */

117     private String JavaDoc m_filename;
118
119     /**
120      * reference to the HtmlImport object, required to access the link translation.
121      */

122     private CmsHtmlImport m_htmlImport;
123
124     /**
125      * temporary buffer used in transformation method.
126      */

127     private StringBuffer JavaDoc m_tempString;
128
129     /** instance of JTidy. */
130     private Tidy m_tidy = new Tidy();
131
132     /** flag to write the output. */
133     private boolean m_write;
134
135     /**
136      * Default constructor, creates a new HtmlConverter.<p>
137      *
138      * @param htmlImport reference to the htmlimport
139      * @param xmlMode switch for setting the import to HTML or XML mode
140      */

141     public CmsHtmlImportConverter(CmsHtmlImport htmlImport, boolean xmlMode) {
142
143         m_tidy.setTidyMark(false);
144         m_tidy.setShowWarnings(false);
145         m_tidy.setQuiet(true);
146         m_tidy.setForceOutput(true);
147
148         if (xmlMode) {
149             m_tidy.setXmlTags(xmlMode);
150             m_tidy.setXmlSpace(true);
151         }
152
153         initialiseTags();
154         m_htmlImport = htmlImport;
155     }
156
157     /**
158      * Extracts the content of a HTML page.<p>
159      *
160      * This method should be pretty robust and work even if the input HTML does not contains
161      * the specified matchers.<p>
162      *
163      * @param content the content to extract the body from
164      * @param startpoint the point where matching starts
165      * @param endpoint the point where matching ends
166      * @return the extracted body tag content
167      */

168     public static String JavaDoc extractHtml(String JavaDoc content, String JavaDoc startpoint, String JavaDoc endpoint) {
169
170         /** Regex that matches a start body tag. */
171         Pattern JavaDoc startPattern = Pattern.compile(startpoint, Pattern.CASE_INSENSITIVE);
172
173         /** Regex that matches an end body tag. */
174         Pattern JavaDoc endPattern = Pattern.compile(endpoint, Pattern.CASE_INSENSITIVE);
175
176         Matcher JavaDoc startMatcher = startPattern.matcher(content);
177         Matcher JavaDoc endMatcher = endPattern.matcher(content);
178
179         int start = 0;
180         int end = content.length();
181
182         if (startMatcher.find()) {
183             start = startMatcher.end();
184         }
185
186         if (endMatcher.find(start)) {
187             end = endMatcher.start();
188         }
189
190         return content.substring(start, end);
191     }
192
193     /**
194      * Transforms HTML code into user defined output.<p>
195      *
196      * @param input Reader with HTML code
197      * @param output Writer with transformed code
198      * @param startPattern the start pattern definition for content extracting
199      * @param endPattern the end pattern definition for content extracting
200      * @param properties the file properties
201      */

202     public void convertHTML(Reader JavaDoc input, Writer JavaDoc output, String JavaDoc startPattern, String JavaDoc endPattern, Hashtable JavaDoc properties) {
203
204         /* local variables */
205         StringBuffer JavaDoc htmlString = new StringBuffer JavaDoc();
206         Node JavaDoc node;
207         String JavaDoc outString = "";
208
209         try {
210             /* write InputStream input in StringBuffer htmlString */
211             int c;
212             while ((c = input.read()) != -1) {
213                 htmlString.append((char)c);
214             }
215         } catch (IOException JavaDoc e) {
216             if (CmsLog.INIT.isWarnEnabled()) {
217                 CmsLog.INIT.warn(Messages.get().getBundle().key(Messages.LOG_HTMLIMPORT_CONVERSION_ERROR_0, e.getLocalizedMessage()));
218             }
219             return;
220         }
221         outString = htmlString.toString();
222         // extract from html if even both patterns are defined
223
if (CmsStringUtil.isNotEmpty(startPattern) && CmsStringUtil.isNotEmpty(endPattern)) {
224             String JavaDoc extractMain = extractHtml(outString, startPattern, endPattern);
225             if (extractMain.length() != outString.length()) {
226                 String JavaDoc extractHead = extractHtml(outString, "<html>", CmsStringUtil.BODY_START_REGEX);
227                 //String extractHead = extractHtml(extractMain, "<html>", CmsStringUtil.C_BODY_START_REGEX);
228
StringBuffer JavaDoc buffer = new StringBuffer JavaDoc(extractHead.length() + extractMain.length() + 255);
229                 buffer.append("<html>");
230                 buffer.append(extractHead);
231                 buffer.append("<body>");
232                 buffer.append(extractMain);
233                 buffer.append("</body></html>");
234                 outString = buffer.toString();
235             }
236         }
237
238         /* convert htmlString in InputStream for parseDOM */
239         InputStream JavaDoc in;
240         try {
241             in = new ByteArrayInputStream JavaDoc(outString.getBytes(CmsEncoder.ENCODING_UTF_8));
242         } catch (UnsupportedEncodingException JavaDoc e) {
243             // this should never happen since UTF-8 is always supported
244
in = new ByteArrayInputStream JavaDoc(outString.getBytes());
245         }
246         m_tidy.setInputEncoding(CmsEncoder.ENCODING_UTF_8);
247         m_tidy.setOutputEncoding(CmsEncoder.ENCODING_UTF_8);
248
249         // hold tidy error information into a new PrintWriter Object
250
PrintWriter JavaDoc errorLog = new PrintWriter JavaDoc(new ByteArrayOutputStream JavaDoc(), true);
251         m_tidy.setErrout(errorLog);
252
253         node = m_tidy.parseDOM(in, null);
254         /* check if html code has errors */
255         if (m_tidy.getParseErrors() != 0) {
256             if (CmsLog.INIT.isWarnEnabled()) {
257                 CmsLog.INIT.warn(Messages.get().getBundle().key(Messages.LOG_HTMLIMPORT_CONVERSION_ERROR_0));
258             }
259         }
260         /* second step: create transformed output with printDocument from DOM */
261         this.printDocument(node, properties);
262
263         try {
264             String JavaDoc content = m_tempString.toString();
265             content = CmsStringUtil.substitute(content, "<br></br>", "<br>");
266             content = CmsStringUtil.substitutePerl(content, "</a>(\\w+)", "</a> $1", "g");
267             output.write(content);
268             output.close();
269
270         } catch (IOException JavaDoc e) {
271             if (CmsLog.INIT.isWarnEnabled()) {
272                 CmsLog.INIT.warn(Messages.get().getBundle().key(Messages.LOG_HTMLIMPORT_CONVERSION_ERROR_1, e.getLocalizedMessage()));
273             }
274             return;
275         }
276     }
277
278     /**
279      * Transforms HTML code into user defined output.<p>
280      *
281      * @param filename the absolute path in the real filesystem of the file to convert
282      * @param inString String with HTML code
283      * @param startPattern the start pattern definition for content extracting
284      * @param endPattern the end pattern definition for content extracting
285      * @param properties the file properties
286      * @return String with transformed code
287      */

288     public String JavaDoc convertHTML(
289         String JavaDoc filename,
290         String JavaDoc inString,
291         String JavaDoc startPattern,
292         String JavaDoc endPattern,
293         Hashtable JavaDoc properties) {
294
295         m_tempString = new StringBuffer JavaDoc();
296         m_write = true;
297         m_filename = filename.replace('\\', '/');
298         Reader JavaDoc in = new StringReader JavaDoc(inString);
299         Writer JavaDoc out = new StringWriter JavaDoc();
300         convertHTML(in, out, startPattern, endPattern, properties);
301         return out.toString();
302     }
303
304     /**
305      * Initialises Vector m_enterTags with tag names.<p>
306      */

307     private void initialiseTags() {
308
309         StringTokenizer JavaDoc T = new StringTokenizer JavaDoc(
310             "p,table,tr,td,body,head,script,pre,title,style,h1,h2,h3,h4,h5,h6,ul,ol,li",
311             ",");
312         while (T.hasMoreTokens()) {
313             m_enterTags.add(new String JavaDoc(T.nextToken()));
314         }
315     }
316
317     /**
318      * Private method to parse DOM and create user defined output.<p>
319      *
320      * @param node Node of DOM from HTML code
321      * @param properties the file properties
322      */

323     private void printDocument(Node JavaDoc node, Hashtable JavaDoc properties) {
324
325         // if node is empty do nothing... (Recursion)
326
if (node == null) {
327             return;
328         }
329         // initialise local variables
330
int type = node.getNodeType();
331         String JavaDoc name = node.getNodeName();
332
333         // detect node type
334
switch (type) {
335             case Node.DOCUMENT_NODE:
336
337                 this.printDocument(((Document JavaDoc)node).getDocumentElement(), properties);
338                 break;
339             case Node.ELEMENT_NODE:
340
341                 // check if its the <head> node. Nothing inside the <head> node
342
// must be
343
// part of the output, but we must scan the content of this
344
// node to get all
345
// <meta> tags
346
if (name.equals(NODE_HEAD)) {
347                     m_write = false;
348                 }
349                 // scan element node; if a block has to be removed or replaced,
350
// break and discard child nodes
351
transformStartElement(node, properties);
352
353                 // test if node has children
354
NodeList JavaDoc children = node.getChildNodes();
355                 if (children != null) {
356                     int len = children.getLength();
357                     for (int i = 0; i < len; i++) {
358                         // recursively call printDocument with all child nodes
359
this.printDocument(children.item(i), properties);
360                     }
361                 }
362                 break;
363             case Node.TEXT_NODE:
364
365                 // replace subStrings in text nodes
366
transformTextNode(node);
367                 break;
368             default:
369
370                 break;
371         }
372         // end of recursion, add eventual endtags and suffixes
373
switch (type) {
374             case Node.ELEMENT_NODE:
375                 // analyse endtags and add them to output
376
transformEndElement(node);
377                 if (node.getNodeName().equals(NODE_HEAD)) {
378                     m_write = true;
379                 }
380                 break;
381             case Node.DOCUMENT_NODE:
382                 break;
383             default:
384                 break;
385         }
386     }
387
388     /**
389      * Transform element nodes and create end tags in output.<p>
390      *
391      * @param node actual element node
392      */

393     private void transformEndElement(Node JavaDoc node) {
394
395         // check hat kind of node we have
396
String JavaDoc nodeName = node.getNodeName();
397
398         // the <HTML> and <BODY> node must be skipped
399
if (nodeName.equals(NODE_HTML) || nodeName.equals(NODE_BODY)) {
400             // do nothing here
401
} else {
402             // only do some output if we are in writing mode
403
if (m_write) {
404                 m_tempString.append("</");
405                 m_tempString.append(nodeName);
406                 m_tempString.append(">");
407
408                 // append a "\n" to output String if possible
409
if (m_enterTags.contains(node.getNodeName())) {
410                     m_tempString.append("\n");
411                 }
412             }
413         }
414     }
415
416     /**
417      * Transforms element nodes and create start tags in output. <p>
418      *
419      * @param node actual element node
420      * @param properties the file properties
421      */

422     private void transformStartElement(Node JavaDoc node, Hashtable JavaDoc properties) {
423
424         // check hat kind of node we have
425
String JavaDoc nodeName = node.getNodeName();
426
427         // the <HTML> and <BODY> node must be skipped
428
if (nodeName.equals(NODE_HTML) || nodeName.equals(NODE_BODY)) {
429             // the <TITLE> node must be read and its value set as properties to
430
// the imported file
431

432         } else if (nodeName.equals(NODE_TITLE)) {
433
434             writeTitleProperty(node, properties);
435
436         } else if (nodeName.equals(NODE_META)) {
437
438             writeMetaTagProperty(node, properties);
439
440         } else if (nodeName.equals(NODE_HREF)) {
441
442             // only do some output if we are in writing mode
443
if (m_write) {
444                 m_tempString.append("<");
445                 m_tempString.append(nodeName);
446                 NamedNodeMap JavaDoc attrs = node.getAttributes();
447                 // look through all attribs to find the reference
448
for (int i = attrs.getLength() - 1; i >= 0; i--) {
449                     String JavaDoc name = attrs.item(i).getNodeName();
450                     String JavaDoc value = attrs.item(i).getNodeValue();
451
452                     if (name.equals(ATTRIB_HREF)) {
453
454                         // check if this is an external link
455
if (value.indexOf("://") > 0) {
456                             // store it for later creation of an entry in the
457
// link gallery
458
String JavaDoc externalLinkFile = m_htmlImport.storeExternalLink(value);
459                             if (externalLinkFile != null) {
460                                 value = m_htmlImport.getLinkGallery() + externalLinkFile;
461                             }
462                         } else if (!value.startsWith("mailto:") && !value.startsWith("javascript:")) {
463
464                             // save an existing anchor link for later use
465
// if (value.indexOf("#") > 0) {
466
// String anchor = value.substring(value.indexOf("#"), value.length());
467
// }
468
// get the new link into the VFS
469
String JavaDoc internalUri = m_htmlImport.getAbsoluteUri(value, m_filename.substring(
470                                 0,
471                                 m_filename.lastIndexOf("/") + 1));
472
473                             value = m_htmlImport.translateLink(internalUri);
474                         }
475                     }
476
477                     m_tempString.append(" ");
478                     m_tempString.append(name);
479                     m_tempString.append("=\"");
480                     m_tempString.append(value);
481                     m_tempString.append("\"");
482                 }
483                 m_tempString.append(">");
484             }
485
486             // this is a imasge, its reference must be converted
487
} else if (nodeName.equals(NODE_IMG)) {
488
489             // only do some output if we are in writing mode
490
if (m_write) {
491                 m_tempString.append("<");
492                 m_tempString.append(nodeName);
493                 NamedNodeMap JavaDoc attrs = node.getAttributes();
494                 // look through all attribs to find the src and alt attributes
495
String JavaDoc imagename = "";
496                 String JavaDoc altText = "";
497                 for (int i = attrs.getLength() - 1; i >= 0; i--) {
498                     String JavaDoc name = attrs.item(i).getNodeName();
499                     String JavaDoc value = attrs.item(i).getNodeValue();
500                     if (name.equals(ATTRIB_SRC)) {
501                         // we found the src. now check if it refers to an
502
// external image.
503
// if not, we must get the correct location in the VFS
504
if (value.indexOf("://") <= 0) {
505                             imagename = m_htmlImport.getAbsoluteUri(value, m_filename.substring(
506                                 0,
507                                 m_filename.lastIndexOf("/") + 1));
508                             value = m_htmlImport.translateLink(imagename);
509                         }
510                     } else if (name.equals(ATTRIB_ALT)) {
511                         altText = value;
512                     }
513
514                     m_tempString.append(" ");
515                     m_tempString.append(name);
516                     m_tempString.append("=\"");
517                     m_tempString.append(value);
518                     m_tempString.append("\"");
519                 }
520
521                 //store the alt tag of this image for later use
522
m_htmlImport.storeImageInfo(imagename, altText);
523
524                 m_tempString.append(">");
525             }
526         } else {
527
528             // only do some output if we are in writing mode
529
if (m_write) {
530
531                 m_tempString.append("<");
532                 m_tempString.append(nodeName);
533                 NamedNodeMap JavaDoc attrs = node.getAttributes();
534                 for (int i = attrs.getLength() - 1; i >= 0; i--) {
535                     m_tempString.append(" " + attrs.item(i).getNodeName() + "=" + "\"");
536                     /* scan attribute values and replace subStrings */
537                     m_tempString.append(attrs.item(i).getNodeValue() + "\"");
538                 }
539                 m_tempString.append(">");
540             }
541         }
542     }
543
544     /**
545      * Private method to transform text nodes.<p>
546      *
547      * @param node actual text node
548      */

549     private void transformTextNode(Node JavaDoc node) {
550
551         // only do some output if we are in writing mode
552
if (m_write) {
553             String JavaDoc helpString = node.getNodeValue();
554             m_tempString.append(helpString);
555         }
556     }
557
558     /**
559      * Writes meta tags as cms properties by analyzing the meta tags nodes.<p>
560      *
561      * @param node the meta tag node in html document
562      * @param properties the properties hashtable
563      */

564     private void writeMetaTagProperty(Node JavaDoc node, Hashtable JavaDoc properties) {
565
566         NamedNodeMap JavaDoc attrs = node.getAttributes();
567         String JavaDoc metaName = "";
568         String JavaDoc metaContent = "";
569         // look through all attribs to find the name and content attributes
570
for (int i = attrs.getLength() - 1; i >= 0; i--) {
571             String JavaDoc name = attrs.item(i).getNodeName();
572             String JavaDoc value = attrs.item(i).getNodeValue();
573             if (name.equals(ATTRIB_NAME)) {
574                 metaName = value;
575             } else if (name.equals(ATTRIB_CONTENT)) {
576                 metaContent = value;
577             }
578         }
579         // check if we have valid entries for this <META> node, store them
580
// in the properties
581
if (metaName.length() > 0 && metaContent.length() > 0) {
582             properties.put(metaName, CmsStringUtil.substitute(metaContent, "{subst}", "&#"));
583         }
584     }
585
586     /**
587      * Sets the Property title by analyzing the title node.<p>
588      *
589      * @param node the title node in html document
590      * @param properties the properties hashtable
591      */

592     private void writeTitleProperty(Node JavaDoc node, Hashtable JavaDoc properties) {
593
594         String JavaDoc title = "";
595         // the title string is stored in the first child node
596
NodeList JavaDoc children = node.getChildNodes();
597         if (children != null) {
598             Node JavaDoc titleNode = children.item(0);
599             if (titleNode != null) {
600                 title = titleNode.getNodeValue();
601             }
602         }
603         // add the title property if we have one
604
if ((title != null) && (title.length() > 0)) {
605
606             properties.put(CmsPropertyDefinition.PROPERTY_TITLE, CmsStringUtil.substitute(title, "{subst}", "&#"));
607             // the title will be used as navtext if no other navtext is
608
// given
609
if (properties.get(CmsPropertyDefinition.PROPERTY_NAVTEXT) == null) {
610                 properties.put(CmsPropertyDefinition.PROPERTY_NAVTEXT, CmsStringUtil.substitute(title, "{subst}", "&#"));
611             }
612         }
613
614     }
615
616 }
617
Popular Tags