1 16 package org.outerj.daisy.htmlcleaner; 17 18 import org.xml.sax.InputSource ; 19 import org.w3c.dom.*; 20 21 import javax.xml.parsers.DocumentBuilderFactory ; 22 import javax.xml.parsers.DocumentBuilder ; 23 import java.util.ArrayList ; 24 25 35 public class HtmlCleanerFactory { 36 private boolean handledCleanup = false; 37 private boolean handledSerialization = false; 38 HtmlCleanerTemplate template = new HtmlCleanerTemplate(); 39 40 public HtmlCleanerTemplate buildTemplate(InputSource is) throws Exception { 41 DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance(); 42 dbf.setNamespaceAware(true); 43 DocumentBuilder db = dbf.newDocumentBuilder(); 44 Document document = db.parse(is); 45 document.normalize(); 46 47 Element docEl = document.getDocumentElement(); 48 if (!(docEl.getLocalName().equals("htmlcleaner") && docEl.getNamespaceURI() == null)) { 49 throw new Exception ("Htmlcleaner config file should have root elemnet 'htmlcleaner'."); 50 } 51 52 NodeList nodeList = docEl.getChildNodes(); 53 for (int i = 0; i < nodeList.getLength(); i++) { 54 Node node = nodeList.item(i); 55 56 if (node instanceof Element) { 57 if (node.getNamespaceURI() == null && node.getLocalName().equals("cleanup")) { 58 handleCleanupNode((Element)node); 59 } else if (node.getNamespaceURI() == null && node.getLocalName().equals("serialization")) { 60 handleSerializationNode((Element)node); 61 } else { 62 throw new Exception ("Error in htmlcleaner config: unexpected element: " + node.getNodeName()); 63 } 64 } 65 } 66 template.initialize(); 67 return template; 68 } 69 70 private void handleCleanupNode(Element cleanupEl) throws Exception { 71 if (handledCleanup) 72 throw new Exception ("Error in htmlcleaner config: cleanup element is only allowed once"); 73 handledCleanup = true; 74 75 NodeList cleanupNodes = cleanupEl.getChildNodes(); 76 for (int k = 0; k < cleanupNodes.getLength(); k++) { 77 Node node = cleanupNodes.item(k); 78 if (node instanceof Element) { 79 if (node.getNamespaceURI() == null && node.getLocalName().equals("allowed-span-classes")) { 80 String [] classes = getClassChildren((Element)node); 81 for (int z = 0; z < classes.length; z++) 82 template.addAllowedSpanClass(classes[z]); 83 } else if (node.getNamespaceURI() == null && node.getLocalName().equals("allowed-div-classes")) { 84 String [] classes = getClassChildren((Element)node); 85 for (int z = 0; z < classes.length; z++) 86 template.addAllowedDivClass(classes[z]); 87 } else if (node.getNamespaceURI() == null && node.getLocalName().equals("allowed-para-classes")) { 88 String [] classes = getClassChildren((Element)node); 89 for (int z = 0; z < classes.length; z++) 90 template.addAllowedParaClass(classes[z]); 91 } else if (node.getNamespaceURI() == null && node.getLocalName().equals("allowed-pre-classes")) { 92 String [] classes = getClassChildren((Element)node); 93 for (int z = 0; z < classes.length; z++) 94 template.addAllowedPreClass(classes[z]); 95 } else if (node.getNamespaceURI() == null && node.getLocalName().equals("allowed-elements")) { 96 handleAllowedElementsNode((Element)node); 97 } else if (node.getNamespaceURI() == null && node.getLocalName().equals("img-alternate-src-attr")) { 98 String name = ((Element)node).getAttribute("name"); 99 if (name.equals("")) 100 throw new Exception ("Error in htmlcleaner config: missing name attribute on img-alternate-src-attr"); 101 template.setImgAlternateSrcAttr(name); 102 } else if (node.getNamespaceURI() == null && node.getLocalName().equals("link-alternate-href-attr")) { 103 String name = ((Element)node).getAttribute("name"); 104 if (name.equals("")) 105 throw new Exception ("Error in htmlcleaner config: missing name attribute on link-alternate-href-attr"); 106 template.setLinkAlternateHrefAttr(name); 107 } else { 108 throw new Exception ("Error in htmlcleaner config: unexpected element " + node.getNodeName() + " inside " + cleanupEl.getNodeName()); 109 } 110 } 111 } 112 113 } 114 115 private String [] getClassChildren(Element element) throws Exception { 116 ArrayList classes = new ArrayList (); 117 NodeList nodeList = element.getChildNodes(); 118 for (int i = 0; i < nodeList.getLength(); i++) { 119 Node node = nodeList.item(i); 120 if (node instanceof Element) { 121 if (node.getNamespaceURI() == null && node.getLocalName().equals("class")) { 122 Node text = node.getFirstChild(); 123 if (text instanceof Text) { 124 classes.add(((Text)text).getData()); 125 } else { 126 throw new Exception ("Error in htmlcleaner: element class does not have a text node child"); 127 } 128 } else { 129 throw new Exception ("Error in htmlcleaner config: unexpected element: " + node.getNodeName() + " as child of " + element.getNodeName()); 130 } 131 } 132 } 133 return (String [])classes.toArray(new String [classes.size()]); 134 } 135 136 private void handleAllowedElementsNode(Element element) throws Exception { 137 NodeList children = element.getChildNodes(); 138 for (int i = 0; i < children.getLength(); i++) { 139 Node node = children.item(i); 140 141 if (node instanceof Element) { 142 if (node.getNamespaceURI() == null && node.getLocalName().equals("element")) { 143 String name = ((Element)node).getAttribute("name"); 144 if (name.equals("")) 145 throw new Exception ("Error in htmlcleaner config: missing name attribute on 'element' element"); 146 String [] attributes = getAttributeChildren((Element)node); 147 template.addAllowedElement(name, attributes); 148 } else { 149 throw new Exception ("Error in htmlcleaner config: unexpected element: '" + node.getNodeName() + "' as child of " + element.getNodeName()); 150 } 151 } 152 } 153 } 154 155 private String [] getAttributeChildren(Element element) throws Exception { 156 ArrayList names = new ArrayList (); 157 NodeList children = element.getChildNodes(); 158 for (int i = 0; i < children.getLength(); i++) { 159 Node node = children.item(i); 160 if (node instanceof Element) { 161 if (node.getNamespaceURI() == null && node.getLocalName().equals("attribute")) { 162 String name = ((Element)node).getAttribute("name"); 163 if (name.equals("")) 164 throw new Exception ("Error in htmlcleaner config: missing name attribute on attribute element"); 165 names.add(name); 166 } else { 167 throw new Exception ("Error in htmlcleaner config: unexpected element: '" + node.getNodeName() + "' as child of " + element.getNodeName()); 168 } 169 } 170 } 171 return (String [])names.toArray(new String [names.size()]); 172 } 173 174 private void handleSerializationNode(Element element) throws Exception { 175 if (handledSerialization) 176 throw new Exception ("Error in htmlcleaner config: serialization element is only allowed once"); 177 handledSerialization = true; 178 179 NodeList children = element.getChildNodes(); 180 for (int i = 0; i < children.getLength(); i++) { 181 Node node = children.item(i); 182 if (node instanceof Element) { 183 if (node.getNamespaceURI() == null && node.getLocalName().equals("linewidth")) { 184 String value = ((Element)node).getAttribute("value"); 185 if (value.equals("")) 186 throw new Exception ("Error in htmlcleaner config: missing value attribute on linewidth element."); 187 int intValue = Integer.parseInt(value); 188 template.setMaxLineWidth(intValue); 189 } else if (node.getNamespaceURI() == null && node.getLocalName().equals("elements")) { 190 handleElementsNode((Element)node); 191 } else { 192 throw new Exception ("Error in htmlcleaner config: unexpected element '" + node.getNodeName() + "' as child of " + element.getNodeName()); 193 } 194 } 195 } 196 } 197 198 private void handleElementsNode(Element element) throws Exception { 199 NodeList children = element.getChildNodes(); 200 for (int i = 0; i < children.getLength(); i++) { 201 Node node = children.item(i); 202 if (node instanceof Element) { 203 if (node.getNamespaceURI() == null && node.getLocalName().equals("element")) { 204 Element childEl = (Element)node; 205 String name = childEl.getAttribute("name"); 206 if (name.equals("")) 207 throw new Exception ("Error in htmlcleaner config: missing name attribute on 'element' element."); 208 String beforeOpenAttr = childEl.getAttribute("beforeOpen"); 209 String afterOpenAttr = childEl.getAttribute("afterOpen"); 210 String beforeCloseAttr = childEl.getAttribute("beforeClose"); 211 String afterCloseAttr = childEl.getAttribute("afterClose"); 212 int beforeOpen = 0, afterOpen = 0, beforeClose = 0, afterClose = 0; 213 if (!beforeOpenAttr.equals("")) 214 beforeOpen = Integer.parseInt(beforeOpenAttr); 215 if (!afterOpenAttr.equals("")) 216 afterOpen = Integer.parseInt(afterOpenAttr); 217 if (!beforeCloseAttr.equals("")) 218 beforeClose = Integer.parseInt(beforeCloseAttr); 219 if (!afterCloseAttr.equals("")) 220 afterClose = Integer.parseInt(afterCloseAttr); 221 boolean inline = "true".equals(childEl.getAttribute("inline")); 222 template.addOutputElement(name, beforeOpen, afterOpen, beforeClose, afterClose, inline); 223 } 224 } 225 } 226 } 227 } 228 | Popular Tags |