1 16 package org.outerj.daisy.jspwiki_import; 17 18 import org.apache.commons.httpclient.HttpClient; 19 import org.apache.commons.httpclient.HttpMethod; 20 import org.apache.commons.httpclient.HttpStatus; 21 import org.apache.commons.httpclient.methods.GetMethod; 22 import org.apache.xerces.parsers.DOMParser; 23 import org.cyberneko.html.HTMLConfiguration; 24 import org.xml.sax.*; 25 import org.xml.sax.helpers.AttributesImpl ; 26 import org.jaxen.dom.DOMXPath; 27 import org.w3c.dom.Element ; 28 import org.w3c.dom.NodeList ; 29 import org.w3c.dom.Node ; 30 import org.outerj.daisy.htmlcleaner.HtmlCleanerFactory; 31 import org.outerj.daisy.htmlcleaner.HtmlCleanerTemplate; 32 import org.outerj.daisy.htmlcleaner.HtmlCleaner; 33 import org.outerj.daisy.repository.*; 34 import org.outerj.daisy.repository.clientimpl.RemoteRepositoryManager; 35 36 import javax.xml.transform.dom.DOMSource ; 37 import javax.xml.transform.Transformer ; 38 import javax.xml.transform.stream.StreamResult ; 39 import javax.xml.transform.sax.SAXTransformerFactory ; 40 import javax.xml.transform.sax.TransformerHandler ; 41 import javax.xml.transform.sax.SAXResult ; 42 import javax.xml.parsers.DocumentBuilderFactory ; 43 import javax.xml.parsers.DocumentBuilder ; 44 import javax.xml.parsers.SAXParserFactory ; 45 import javax.xml.parsers.SAXParser ; 46 import java.util.*; 47 import java.io.*; 48 import java.net.URLDecoder ; 49 50 66 public class JspWikiImporter { 67 private String wikiPageURL = "http://wiki.cocoondev.org/Wiki.jsp?page="; 68 private String collectionName = "cocoon"; 69 private String daisyUser = "jspwiki-import"; 70 private String daisyPassword = "topsecret"; 71 private HashSet allPageNames = new HashSet(); 72 private DocumentBuilder documentBuilder; 73 private HtmlCleanerTemplate htmlCleanerTemplate; 74 private SAXTransformerFactory transformerFactory = (SAXTransformerFactory )SAXTransformerFactory.newInstance(); 75 private Repository repository; 76 private HashMap importPages = new HashMap(); 77 private HashMap importedImages = new HashMap(); 78 private HashMap importedAttachments = new HashMap(); 79 private DocumentCollection collection; 80 private static HashSet skipPages = new HashSet(); 81 static { 82 skipPages.add("UndefinedPages"); 83 skipPages.add("UnusedPages"); 84 skipPages.add("IndexPage"); 85 skipPages.add("RecentChanges"); 86 skipPages.add("FullRecentChanges"); 87 } 88 89 public static void main(String [] args) throws Exception { 90 new JspWikiImporter().run(); 91 } 92 93 public void run() throws Exception { 94 System.out.println("Doing preparations..."); 96 documentBuilder = DocumentBuilderFactory.newInstance().newDocumentBuilder(); 97 File htmlCleanerConfig = new File("../daisywiki/frontend/src/cocoon/webapp/daisy/resources/conf/htmlcleaner.xml"); 98 htmlCleanerTemplate = new HtmlCleanerFactory().buildTemplate(new InputSource(new FileInputStream(htmlCleanerConfig))); 99 100 System.out.println("Connecting to daisy..."); 102 Credentials credentials = new Credentials(daisyUser, daisyPassword); 103 RepositoryManager repositoryManager = new RemoteRepositoryManager("http://localhost:9263", credentials); 104 repository = repositoryManager.getRepository(credentials); 105 collection = repository.getCollectionManager().getCollectionByName(collectionName, false); 106 107 System.out.println("Fetching list of all pages on the wiki..."); 109 loadPageNames(); 110 System.out.println(allPageNames.size() + " pages found on the wiki."); 111 System.out.println(); 112 113 String [] pages = (String [])allPageNames.toArray(new String [allPageNames.size()]); 114 for (int i = 0; i < pages.length; i++) { 115 if (pages[i].startsWith("Wyona")) { 116 System.out.println("Skipping page " + pages[i]); 117 } else if (skipPages.contains(pages[i])) { 118 System.out.println("Skipping page " + pages[i]); 119 } else { 120 System.out.println("Fetching page " + pages[i] + "... (" + i + " of " + pages.length + ")"); 121 byte[] pageData = fetchPage(pages[i]); 122 123 System.out.println("Parsing and cleaning HTML..."); 124 org.w3c.dom.Document pageDocument = parseHtml(pageData); 125 DOMXPath xpath = new DOMXPath("//div[@class='content']"); 126 Element contentDiv = (Element )xpath.selectSingleNode(pageDocument); 127 if (contentDiv == null) 128 throw new Exception ("No content found in page " + pages[i]); 129 String contentData = serialize(contentDivToDoc(contentDiv)); 130 byte[] cleanedContent = clean(contentData); 131 132 System.out.println("Storing page in Daisy..."); 133 Document document = repository.createDocument(pages[i], "SimpleDocument"); 134 document.setPart("SimpleDocumentContent", "text/xml", cleanedContent); 135 document.addToCollection(collection); 136 document.save(); 137 importPages.put(pages[i], new Long (document.getId())); 138 System.out.println("Done\n"); 139 } 140 } 141 142 System.out.println("\n\nWILL NOW START LINK TRANSLATION\n\n"); 143 144 Iterator importPagesIt = importPages.entrySet().iterator(); 145 while (importPagesIt.hasNext()) { 146 Map.Entry entry = (Map.Entry)importPagesIt.next(); 147 String pageName = (String )entry.getKey(); 148 long pageId = ((Long )entry.getValue()).longValue(); 149 150 System.out.println("Translating links for document " + pageName + "..."); 151 Document document = repository.getDocument(pageId, true); 152 byte[] pageData = document.getPart("SimpleDocumentContent").getData(); 153 byte[] newData = clean(translateLinks(pageData)); 154 document.setPart("SimpleDocumentContent", "text/xml", newData); 155 document.save(); 156 System.out.println("Done\n"); 157 } 158 159 } 160 161 private byte[] clean(String htmlData) throws Exception { 162 HtmlCleaner cleaner = htmlCleanerTemplate.newHtmlCleaner(); 163 return cleaner.cleanToByteArray(htmlData); 164 } 165 166 private org.w3c.dom.Document contentDivToDoc(Element contentDiv) { 167 org.w3c.dom.Document doc = documentBuilder.newDocument(); 168 Element htmlEl = doc.createElementNS(null, "html"); 169 doc.appendChild(htmlEl); 170 Element bodyEl = doc.createElementNS(null, "body"); 171 htmlEl.appendChild(bodyEl); 172 NodeList childNodes = contentDiv.getChildNodes(); 173 for (int i = 0; i < childNodes.getLength(); i++) { 174 Node node = childNodes.item(i); 175 boolean append = true; 176 if (node instanceof Element && node.getLocalName().equals("h1")) { 177 Element divEl = (Element )node; 178 if (divEl.getAttribute("class").equals("pagename")) { 179 append = false; 180 } 181 } else if (node instanceof Element && node.getLocalName().equals("div")) { 182 Element divEl = (Element )node; 183 if (divEl.getAttribute("class").equals("bottom")) { 185 return doc; 186 } 187 } 188 if (append) 189 bodyEl.appendChild(doc.importNode(node, true)); 190 } 191 return doc; 192 } 193 194 private String serialize(org.w3c.dom.Document doc) throws Exception { 195 TransformerHandler serializer = transformerFactory.newTransformerHandler(); 196 StringWriter writer = new StringWriter(); 197 serializer.setResult(new StreamResult (writer)); 198 199 Transformer streamer = transformerFactory.newTransformer(); 200 streamer.transform(new DOMSource (doc), new SAXResult (new ExtraCleanup(serializer))); 201 return writer.toString(); 202 } 203 204 private void loadPageNames() throws Exception { 205 byte[] indexPageData = fetchPage("IndexPage"); 206 org.w3c.dom.Document document = parseHtml(indexPageData); 207 DOMXPath xpath = new DOMXPath("//a[@class='wikipage']"); 208 List nodes = xpath.selectNodes(document); 209 Iterator nodesIt = nodes.iterator(); 210 while (nodesIt.hasNext()) { 211 Element element = (Element )nodesIt.next(); 212 String href = element.getAttribute("href"); 213 if (href.startsWith(wikiPageURL)) 214 allPageNames.add(href.substring(wikiPageURL.length())); 215 } 216 } 217 218 private byte[] fetchPage(String pageName) throws Exception { 219 HttpClient client = new HttpClient(); 220 HttpMethod method = new GetMethod(wikiPageURL + pageName); 221 int status = client.executeMethod(method); 222 if (status != HttpStatus.SC_OK) 223 throw new Exception ("Problem retrieving wiki page " + pageName + " : " + method.getStatusCode() + " : " + HttpStatus.getStatusText(method.getStatusCode())); 224 return method.getResponseBody(); 225 } 226 227 private org.w3c.dom.Document parseHtml(byte[] data) throws Exception { 228 DOMParser parser = new DOMParser(new HTMLConfiguration()); 229 parser.setFeature("http://xml.org/sax/features/namespaces", true); 230 parser.setFeature("http://cyberneko.org/html/features/override-namespaces", false); 231 parser.setFeature("http://cyberneko.org/html/features/insert-namespaces", false); 232 parser.setProperty("http://cyberneko.org/html/properties/names/elems", "lower"); 233 parser.setProperty("http://cyberneko.org/html/properties/names/attrs", "lower"); 234 235 parser.parse(new InputSource(new ByteArrayInputStream(data))); 236 return parser.getDocument(); 237 } 238 239 private String translateLinks(byte[] data) throws Exception { 240 TransformerHandler serializer = transformerFactory.newTransformerHandler(); 241 StringWriter writer = new StringWriter(); 242 serializer.setResult(new StreamResult (writer)); 243 244 SAXParserFactory parserFactory = SAXParserFactory.newInstance(); 245 parserFactory.setNamespaceAware(true); 246 SAXParser parser = parserFactory.newSAXParser(); 247 parser.getXMLReader().setContentHandler(new LinkTranslator(serializer)); 248 parser.getXMLReader().parse(new InputSource(new ByteArrayInputStream(data))); 249 250 return writer.toString(); 251 } 252 253 class AbstractTransformer implements ContentHandler { 254 protected ContentHandler consumer; 255 256 public AbstractTransformer(ContentHandler consumer) { 257 this.consumer = consumer; 258 } 259 260 public void endDocument() 261 throws SAXException { 262 consumer.endDocument(); 263 } 264 265 public void startDocument () 266 throws SAXException { 267 consumer.startDocument(); 268 } 269 270 public void characters (char ch[], int start, int length) 271 throws SAXException { 272 consumer.characters(ch, start, length); 273 } 274 275 public void ignorableWhitespace (char ch[], int start, int length) 276 throws SAXException { 277 consumer.ignorableWhitespace(ch, start, length); 278 } 279 280 public void endPrefixMapping (String prefix) 281 throws SAXException { 282 consumer.endPrefixMapping(prefix); 283 } 284 285 public void skippedEntity (String name) 286 throws SAXException { 287 consumer.skippedEntity(name); 288 } 289 290 public void setDocumentLocator (Locator locator) { 291 consumer.setDocumentLocator(locator); 292 } 293 294 public void processingInstruction (String target, String data) 295 throws SAXException { 296 consumer.processingInstruction(target, data); 297 } 298 299 public void startPrefixMapping (String prefix, String uri) 300 throws SAXException { 301 consumer.startPrefixMapping(prefix, uri); 302 } 303 304 public void endElement (String namespaceURI, String localName, 305 String qName) 306 throws SAXException { 307 consumer.endElement(namespaceURI, localName, qName); 308 } 309 310 public void startElement (String namespaceURI, String localName, 311 String qName, Attributes atts) 312 throws SAXException { 313 consumer.startElement(namespaceURI, localName, qName, atts); 314 } 315 } 316 317 class LinkTranslator extends AbstractTransformer { 318 319 public LinkTranslator(ContentHandler consumer) { 320 super(consumer); 321 } 322 323 public void startElement(String uri, String localName, String qName, Attributes attributes) throws SAXException { 324 if (uri.equals("") && localName.equals("a")) { 325 int index = attributes.getIndex("href"); 326 String href = (index != -1 ? attributes.getValue(index) : null); 327 if (href != null && href.startsWith(wikiPageURL)) { 328 String linkedPage = href.substring(wikiPageURL.length()); 329 Long linkedPageId = (Long )importPages.get(linkedPage); 330 System.out.println("attempt translation of " + linkedPage + " to " + linkedPageId); 331 if (linkedPageId != null) { 332 AttributesImpl newAttrs = new AttributesImpl (attributes); 333 newAttrs.setAttribute(newAttrs.getIndex("href"), "", "href", "href", "CDATA", "daisy:" + linkedPageId.longValue()); 334 attributes = newAttrs; 335 } 336 } 337 } 338 consumer.startElement(uri, localName, qName, attributes); 339 } 340 } 341 342 class ExtraCleanup extends AbstractTransformer { 343 private boolean dropNextImgEndTag = false; 344 345 public ExtraCleanup(ContentHandler consumer) { 346 super(consumer); 347 } 348 349 public void startElement(String namespaceURI, String localName, String qName, Attributes atts) throws SAXException { 350 if (namespaceURI.equals("") && localName.equals("img") && ("http://wiki.cocoondev.org/images/out.png".equals(atts.getValue("src")) || "images/attachment_small.png".equals(atts.getValue("src")))) { 351 dropNextImgEndTag = true; 352 } else if (namespaceURI.equals("") && localName.equals("img")) { 354 String src = atts.getValue("src"); 355 if (src != null) { 356 if (importedImages.containsKey(src)) { 357 AttributesImpl newAttrs = new AttributesImpl (); 358 newAttrs.addAttribute("", "src", "src", "CDATA", "daisy:" + importedImages.get(src)); 359 } else { 360 try { 361 HttpClient client = new HttpClient(); 362 HttpMethod method = new GetMethod(src); 363 int status = client.executeMethod(method); 364 if (status >= 300 && status < 400) { 365 method = new GetMethod(method.getResponseHeader("location").getValue()); 366 status = client.executeMethod(method); 367 } 368 if (status != HttpStatus.SC_OK) 369 throw new Exception ("Problem retrieving image " + src + " : " + method.getStatusCode() + " : " + HttpStatus.getStatusText(method.getStatusCode())); 370 byte[] data = method.getResponseBody(); 371 String name = getImageName(src); 372 Document imageDocument = repository.createDocument(name, "Image"); 373 imageDocument.setPart("ImageData", method.getResponseHeader("Content-Type").getValue(), data); 374 imageDocument.addToCollection(collection); 375 imageDocument.save(); 376 importedImages.put(src, String.valueOf(imageDocument.getId())); 377 AttributesImpl newAttrs = new AttributesImpl (); 378 newAttrs.addAttribute("", "src", "src", "CDATA", "daisy:" + imageDocument.getId()); 379 super.startElement("", "img", "img", newAttrs); 380 System.out.println("Imported image " + src + " as " + name); 381 } catch (Exception e) { 382 throw new SAXException("Error getting image " + src, e); 383 } 384 } 385 } 386 } else if (namespaceURI.equals("") && localName.equals("a") && "attachment".equals(atts.getValue("class"))) { 387 String src = atts.getValue("href"); 388 String decodedSrc = null; 389 try { 390 decodedSrc = URLDecoder.decode(src, "UTF-8"); 391 } catch (UnsupportedEncodingException e) { 392 throw new SAXException(e); 393 } 394 if (importedAttachments.containsKey(src)) { 395 AttributesImpl newAttrs = new AttributesImpl (); 396 newAttrs.addAttribute("", "src", "src", "CDATA", "daisy:" + importedAttachments.get(src)); 397 } else { 398 try { 399 HttpClient client = new HttpClient(); 400 HttpMethod method = new GetMethod(src); 401 int status = client.executeMethod(method); 402 if (status != HttpStatus.SC_OK) 403 throw new Exception ("Problem retrieving attachment " + src + " : " + method.getStatusCode() + " : " + HttpStatus.getStatusText(method.getStatusCode())); 404 byte[] data = method.getResponseBody(); 405 String name = getImageName(decodedSrc); 406 Document attachmentDocument = repository.createDocument(name, "Attachment"); 407 attachmentDocument.setPart("AttachmentData", method.getResponseHeader("Content-Type").getValue(), data); 408 attachmentDocument.addToCollection(collection); 409 attachmentDocument.save(); 410 importedAttachments.put(src, String.valueOf(attachmentDocument.getId())); 411 AttributesImpl newAttrs = new AttributesImpl (); 412 newAttrs.addAttribute("", "href", "href", "CDATA", "daisy:" + attachmentDocument.getId()); 413 super.startElement("", "a", "a", newAttrs); 414 System.out.println("Imported attachment " + src + " as " + name); 415 } catch (Exception e) { 416 throw new SAXException("Error getting attachment " + src, e); 417 } 418 } 419 } else { 420 super.startElement(namespaceURI, localName, qName, atts); 421 } 422 } 423 424 private String getImageName(String src) { 425 String name = src.substring(src.lastIndexOf('/') + 1); 426 int dotpos = name.lastIndexOf('.'); 427 if (dotpos != -1) { 428 name = name.substring(0, dotpos); 429 } 430 return name; 431 } 432 433 public void endElement(String namespaceURI, String localName, String qName) throws SAXException { 434 if (dropNextImgEndTag && namespaceURI.equals("") && localName.equals("img")) { 435 dropNextImgEndTag = false; 437 } else { 439 super.endElement(namespaceURI, localName, qName); 440 } 441 } 442 } 443 } 444 | Popular Tags |