1 31 32 package org.opencms.workplace.tools.database; 33 34 import org.opencms.file.CmsPropertyDefinition; 35 import org.opencms.i18n.CmsEncoder; 36 import org.opencms.main.CmsLog; 37 import org.opencms.util.CmsStringUtil; 38 39 import java.io.ByteArrayInputStream ; 40 import java.io.ByteArrayOutputStream ; 41 import java.io.IOException ; 42 import java.io.InputStream ; 43 import java.io.PrintWriter ; 44 import java.io.Reader ; 45 import java.io.StringReader ; 46 import java.io.StringWriter ; 47 import java.io.UnsupportedEncodingException ; 48 import java.io.Writer ; 49 import java.util.HashSet ; 50 import java.util.Hashtable ; 51 import java.util.StringTokenizer ; 52 import java.util.regex.Matcher ; 53 import java.util.regex.Pattern ; 54 55 import org.w3c.dom.Document ; 56 import org.w3c.dom.NamedNodeMap ; 57 import org.w3c.dom.Node ; 58 import org.w3c.dom.NodeList ; 59 import org.w3c.tidy.Tidy; 60 61 71 public class CmsHtmlImportConverter { 72 73 74 private static final String ATTRIB_ALT = "alt"; 75 76 77 private static final String ATTRIB_CONTENT = "content"; 78 79 80 private static final String ATTRIB_HREF = "href"; 81 82 83 private static final String ATTRIB_NAME = "name"; 84 85 86 private static final String ATTRIB_SRC = "src"; 87 88 89 private static final String NODE_BODY = "body"; 90 91 92 private static final String NODE_HEAD = "head"; 93 94 95 private static final String NODE_HREF = "a"; 96 97 98 private static final String NODE_HTML = "html"; 99 100 101 private static final String NODE_IMG = "img"; 102 103 104 private static final String NODE_META = "meta"; 105 106 107 private static final String NODE_TITLE = "title"; 108 109 112 private HashSet m_enterTags = new HashSet (); 113 114 117 private String m_filename; 118 119 122 private CmsHtmlImport m_htmlImport; 123 124 127 private StringBuffer m_tempString; 128 129 130 private Tidy m_tidy = new Tidy(); 131 132 133 private boolean m_write; 134 135 141 public CmsHtmlImportConverter(CmsHtmlImport htmlImport, boolean xmlMode) { 142 143 m_tidy.setTidyMark(false); 144 m_tidy.setShowWarnings(false); 145 m_tidy.setQuiet(true); 146 m_tidy.setForceOutput(true); 147 148 if (xmlMode) { 149 m_tidy.setXmlTags(xmlMode); 150 m_tidy.setXmlSpace(true); 151 } 152 153 initialiseTags(); 154 m_htmlImport = htmlImport; 155 } 156 157 168 public static String extractHtml(String content, String startpoint, String endpoint) { 169 170 171 Pattern startPattern = Pattern.compile(startpoint, Pattern.CASE_INSENSITIVE); 172 173 174 Pattern endPattern = Pattern.compile(endpoint, Pattern.CASE_INSENSITIVE); 175 176 Matcher startMatcher = startPattern.matcher(content); 177 Matcher endMatcher = endPattern.matcher(content); 178 179 int start = 0; 180 int end = content.length(); 181 182 if (startMatcher.find()) { 183 start = startMatcher.end(); 184 } 185 186 if (endMatcher.find(start)) { 187 end = endMatcher.start(); 188 } 189 190 return content.substring(start, end); 191 } 192 193 202 public void convertHTML(Reader input, Writer output, String startPattern, String endPattern, Hashtable properties) { 203 204 205 StringBuffer htmlString = new StringBuffer (); 206 Node node; 207 String outString = ""; 208 209 try { 210 211 int c; 212 while ((c = input.read()) != -1) { 213 htmlString.append((char)c); 214 } 215 } catch (IOException e) { 216 if (CmsLog.INIT.isWarnEnabled()) { 217 CmsLog.INIT.warn(Messages.get().getBundle().key(Messages.LOG_HTMLIMPORT_CONVERSION_ERROR_0, e.getLocalizedMessage())); 218 } 219 return; 220 } 221 outString = htmlString.toString(); 222 if (CmsStringUtil.isNotEmpty(startPattern) && CmsStringUtil.isNotEmpty(endPattern)) { 224 String extractMain = extractHtml(outString, startPattern, endPattern); 225 if (extractMain.length() != outString.length()) { 226 String extractHead = extractHtml(outString, "<html>", CmsStringUtil.BODY_START_REGEX); 227 StringBuffer buffer = new StringBuffer (extractHead.length() + extractMain.length() + 255); 229 buffer.append("<html>"); 230 buffer.append(extractHead); 231 buffer.append("<body>"); 232 buffer.append(extractMain); 233 buffer.append("</body></html>"); 234 outString = buffer.toString(); 235 } 236 } 237 238 239 InputStream in; 240 try { 241 in = new ByteArrayInputStream (outString.getBytes(CmsEncoder.ENCODING_UTF_8)); 242 } catch (UnsupportedEncodingException e) { 243 in = new ByteArrayInputStream (outString.getBytes()); 245 } 246 m_tidy.setInputEncoding(CmsEncoder.ENCODING_UTF_8); 247 m_tidy.setOutputEncoding(CmsEncoder.ENCODING_UTF_8); 248 249 PrintWriter errorLog = new PrintWriter (new ByteArrayOutputStream (), true); 251 m_tidy.setErrout(errorLog); 252 253 node = m_tidy.parseDOM(in, null); 254 255 if (m_tidy.getParseErrors() != 0) { 256 if (CmsLog.INIT.isWarnEnabled()) { 257 CmsLog.INIT.warn(Messages.get().getBundle().key(Messages.LOG_HTMLIMPORT_CONVERSION_ERROR_0)); 258 } 259 } 260 261 this.printDocument(node, properties); 262 263 try { 264 String content = m_tempString.toString(); 265 content = CmsStringUtil.substitute(content, "<br></br>", "<br>"); 266 content = CmsStringUtil.substitutePerl(content, "</a>(\\w+)", "</a> $1", "g"); 267 output.write(content); 268 output.close(); 269 270 } catch (IOException e) { 271 if (CmsLog.INIT.isWarnEnabled()) { 272 CmsLog.INIT.warn(Messages.get().getBundle().key(Messages.LOG_HTMLIMPORT_CONVERSION_ERROR_1, e.getLocalizedMessage())); 273 } 274 return; 275 } 276 } 277 278 288 public String convertHTML( 289 String filename, 290 String inString, 291 String startPattern, 292 String endPattern, 293 Hashtable properties) { 294 295 m_tempString = new StringBuffer (); 296 m_write = true; 297 m_filename = filename.replace('\\', '/'); 298 Reader in = new StringReader (inString); 299 Writer out = new StringWriter (); 300 convertHTML(in, out, startPattern, endPattern, properties); 301 return out.toString(); 302 } 303 304 307 private void initialiseTags() { 308 309 StringTokenizer T = new StringTokenizer ( 310 "p,table,tr,td,body,head,script,pre,title,style,h1,h2,h3,h4,h5,h6,ul,ol,li", 311 ","); 312 while (T.hasMoreTokens()) { 313 m_enterTags.add(new String (T.nextToken())); 314 } 315 } 316 317 323 private void printDocument(Node node, Hashtable properties) { 324 325 if (node == null) { 327 return; 328 } 329 int type = node.getNodeType(); 331 String name = node.getNodeName(); 332 333 switch (type) { 335 case Node.DOCUMENT_NODE: 336 337 this.printDocument(((Document )node).getDocumentElement(), properties); 338 break; 339 case Node.ELEMENT_NODE: 340 341 if (name.equals(NODE_HEAD)) { 347 m_write = false; 348 } 349 transformStartElement(node, properties); 352 353 NodeList children = node.getChildNodes(); 355 if (children != null) { 356 int len = children.getLength(); 357 for (int i = 0; i < len; i++) { 358 this.printDocument(children.item(i), properties); 360 } 361 } 362 break; 363 case Node.TEXT_NODE: 364 365 transformTextNode(node); 367 break; 368 default: 369 370 break; 371 } 372 switch (type) { 374 case Node.ELEMENT_NODE: 375 transformEndElement(node); 377 if (node.getNodeName().equals(NODE_HEAD)) { 378 m_write = true; 379 } 380 break; 381 case Node.DOCUMENT_NODE: 382 break; 383 default: 384 break; 385 } 386 } 387 388 393 private void transformEndElement(Node node) { 394 395 String nodeName = node.getNodeName(); 397 398 if (nodeName.equals(NODE_HTML) || nodeName.equals(NODE_BODY)) { 400 } else { 402 if (m_write) { 404 m_tempString.append("</"); 405 m_tempString.append(nodeName); 406 m_tempString.append(">"); 407 408 if (m_enterTags.contains(node.getNodeName())) { 410 m_tempString.append("\n"); 411 } 412 } 413 } 414 } 415 416 422 private void transformStartElement(Node node, Hashtable properties) { 423 424 String nodeName = node.getNodeName(); 426 427 if (nodeName.equals(NODE_HTML) || nodeName.equals(NODE_BODY)) { 429 432 } else if (nodeName.equals(NODE_TITLE)) { 433 434 writeTitleProperty(node, properties); 435 436 } else if (nodeName.equals(NODE_META)) { 437 438 writeMetaTagProperty(node, properties); 439 440 } else if (nodeName.equals(NODE_HREF)) { 441 442 if (m_write) { 444 m_tempString.append("<"); 445 m_tempString.append(nodeName); 446 NamedNodeMap attrs = node.getAttributes(); 447 for (int i = attrs.getLength() - 1; i >= 0; i--) { 449 String name = attrs.item(i).getNodeName(); 450 String value = attrs.item(i).getNodeValue(); 451 452 if (name.equals(ATTRIB_HREF)) { 453 454 if (value.indexOf("://") > 0) { 456 String externalLinkFile = m_htmlImport.storeExternalLink(value); 459 if (externalLinkFile != null) { 460 value = m_htmlImport.getLinkGallery() + externalLinkFile; 461 } 462 } else if (!value.startsWith("mailto:") && !value.startsWith("javascript:")) { 463 464 String internalUri = m_htmlImport.getAbsoluteUri(value, m_filename.substring( 470 0, 471 m_filename.lastIndexOf("/") + 1)); 472 473 value = m_htmlImport.translateLink(internalUri); 474 } 475 } 476 477 m_tempString.append(" "); 478 m_tempString.append(name); 479 m_tempString.append("=\""); 480 m_tempString.append(value); 481 m_tempString.append("\""); 482 } 483 m_tempString.append(">"); 484 } 485 486 } else if (nodeName.equals(NODE_IMG)) { 488 489 if (m_write) { 491 m_tempString.append("<"); 492 m_tempString.append(nodeName); 493 NamedNodeMap attrs = node.getAttributes(); 494 String imagename = ""; 496 String altText = ""; 497 for (int i = attrs.getLength() - 1; i >= 0; i--) { 498 String name = attrs.item(i).getNodeName(); 499 String value = attrs.item(i).getNodeValue(); 500 if (name.equals(ATTRIB_SRC)) { 501 if (value.indexOf("://") <= 0) { 505 imagename = m_htmlImport.getAbsoluteUri(value, m_filename.substring( 506 0, 507 m_filename.lastIndexOf("/") + 1)); 508 value = m_htmlImport.translateLink(imagename); 509 } 510 } else if (name.equals(ATTRIB_ALT)) { 511 altText = value; 512 } 513 514 m_tempString.append(" "); 515 m_tempString.append(name); 516 m_tempString.append("=\""); 517 m_tempString.append(value); 518 m_tempString.append("\""); 519 } 520 521 m_htmlImport.storeImageInfo(imagename, altText); 523 524 m_tempString.append(">"); 525 } 526 } else { 527 528 if (m_write) { 530 531 m_tempString.append("<"); 532 m_tempString.append(nodeName); 533 NamedNodeMap attrs = node.getAttributes(); 534 for (int i = attrs.getLength() - 1; i >= 0; i--) { 535 m_tempString.append(" " + attrs.item(i).getNodeName() + "=" + "\""); 536 537 m_tempString.append(attrs.item(i).getNodeValue() + "\""); 538 } 539 m_tempString.append(">"); 540 } 541 } 542 } 543 544 549 private void transformTextNode(Node node) { 550 551 if (m_write) { 553 String helpString = node.getNodeValue(); 554 m_tempString.append(helpString); 555 } 556 } 557 558 564 private void writeMetaTagProperty(Node node, Hashtable properties) { 565 566 NamedNodeMap attrs = node.getAttributes(); 567 String metaName = ""; 568 String metaContent = ""; 569 for (int i = attrs.getLength() - 1; i >= 0; i--) { 571 String name = attrs.item(i).getNodeName(); 572 String value = attrs.item(i).getNodeValue(); 573 if (name.equals(ATTRIB_NAME)) { 574 metaName = value; 575 } else if (name.equals(ATTRIB_CONTENT)) { 576 metaContent = value; 577 } 578 } 579 if (metaName.length() > 0 && metaContent.length() > 0) { 582 properties.put(metaName, CmsStringUtil.substitute(metaContent, "{subst}", "&#")); 583 } 584 } 585 586 592 private void writeTitleProperty(Node node, Hashtable properties) { 593 594 String title = ""; 595 NodeList children = node.getChildNodes(); 597 if (children != null) { 598 Node titleNode = children.item(0); 599 if (titleNode != null) { 600 title = titleNode.getNodeValue(); 601 } 602 } 603 if ((title != null) && (title.length() > 0)) { 605 606 properties.put(CmsPropertyDefinition.PROPERTY_TITLE, CmsStringUtil.substitute(title, "{subst}", "&#")); 607 if (properties.get(CmsPropertyDefinition.PROPERTY_NAVTEXT) == null) { 610 properties.put(CmsPropertyDefinition.PROPERTY_NAVTEXT, CmsStringUtil.substitute(title, "{subst}", "&#")); 611 } 612 } 613 614 } 615 616 } 617 | Popular Tags |