KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > outerj > daisy > jspwiki_import > JspWikiImporter


/*
 * Copyright 2004 Outerthought bvba and Schaubroeck nv
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.outerj.daisy.jspwiki_import;

import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.HttpMethod;
import org.apache.commons.httpclient.HttpStatus;
import org.apache.commons.httpclient.methods.GetMethod;
import org.apache.xerces.parsers.DOMParser;
import org.cyberneko.html.HTMLConfiguration;
import org.xml.sax.*;
import org.xml.sax.helpers.AttributesImpl;
import org.jaxen.dom.DOMXPath;
import org.w3c.dom.Element;
import org.w3c.dom.NodeList;
import org.w3c.dom.Node;
import org.outerj.daisy.htmlcleaner.HtmlCleanerFactory;
import org.outerj.daisy.htmlcleaner.HtmlCleanerTemplate;
import org.outerj.daisy.htmlcleaner.HtmlCleaner;
import org.outerj.daisy.repository.*;
import org.outerj.daisy.repository.clientimpl.RemoteRepositoryManager;

import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.Transformer;
import javax.xml.transform.stream.StreamResult;
import javax.xml.transform.sax.SAXTransformerFactory;
import javax.xml.transform.sax.TransformerHandler;
import javax.xml.transform.sax.SAXResult;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.SAXParserFactory;
import javax.xml.parsers.SAXParser;
import java.util.*;
import java.io.*;
import java.net.URLDecoder;
49
50 /**
51  * Standalone app to import contents of a JSP Wiki into daisy. Currently
52  * only written with the purpose of importing the Cocoon Wiki content to
53  * have some meaningful, and meaningful-sized testdata.
54  *
55  * <p>The import runs in two passes: first all wiki pages are imported
56  * into daisy, then links are translated from wiki page names to daisy
57  * document ids.
58  *
59  * <p>To run, after maven build, execute target/runimport.sh.
60  *
61  * <p>To make this usable as a generic utility, at least the hardcoded
62  * wiki location and daisy username, collection and url should be specifiable
63  * using command line parameters.
64  *
65  */

66 public class JspWikiImporter {
67     private String JavaDoc wikiPageURL = "http://wiki.cocoondev.org/Wiki.jsp?page=";
68     private String JavaDoc collectionName = "cocoon";
69     private String JavaDoc daisyUser = "jspwiki-import";
70     private String JavaDoc daisyPassword = "topsecret";
71     private HashSet allPageNames = new HashSet();
72     private DocumentBuilder JavaDoc documentBuilder;
73     private HtmlCleanerTemplate htmlCleanerTemplate;
74     private SAXTransformerFactory JavaDoc transformerFactory = (SAXTransformerFactory JavaDoc)SAXTransformerFactory.newInstance();
75     private Repository repository;
76     private HashMap importPages = new HashMap();
77     private HashMap importedImages = new HashMap();
78     private HashMap importedAttachments = new HashMap();
79     private DocumentCollection collection;
80     private static HashSet skipPages = new HashSet();
81     static {
82         skipPages.add("UndefinedPages");
83         skipPages.add("UnusedPages");
84         skipPages.add("IndexPage");
85         skipPages.add("RecentChanges");
86         skipPages.add("FullRecentChanges");
87     }
88
89     public static void main(String JavaDoc[] args) throws Exception JavaDoc {
90         new JspWikiImporter().run();
91     }
92
93     public void run() throws Exception JavaDoc {
94         // initialize some stuff
95
System.out.println("Doing preparations...");
96         documentBuilder = DocumentBuilderFactory.newInstance().newDocumentBuilder();
97         File htmlCleanerConfig = new File("../daisywiki/frontend/src/cocoon/webapp/daisy/resources/conf/htmlcleaner.xml");
98         htmlCleanerTemplate = new HtmlCleanerFactory().buildTemplate(new InputSource(new FileInputStream(htmlCleanerConfig)));
99
100         // connect to daisy
101
System.out.println("Connecting to daisy...");
102         Credentials credentials = new Credentials(daisyUser, daisyPassword);
103         RepositoryManager repositoryManager = new RemoteRepositoryManager("http://localhost:9263", credentials);
104         repository = repositoryManager.getRepository(credentials);
105         collection = repository.getCollectionManager().getCollectionByName(collectionName, false);
106
107         // load wiki page names
108
System.out.println("Fetching list of all pages on the wiki...");
109         loadPageNames();
110         System.out.println(allPageNames.size() + " pages found on the wiki.");
111         System.out.println();
112
113         String JavaDoc[] pages = (String JavaDoc[])allPageNames.toArray(new String JavaDoc[allPageNames.size()]);
114         for (int i = 0; i < pages.length; i++) {
115             if (pages[i].startsWith("Wyona")) {
116                 System.out.println("Skipping page " + pages[i]);
117             } else if (skipPages.contains(pages[i])) {
118                 System.out.println("Skipping page " + pages[i]);
119             } else {
120                 System.out.println("Fetching page " + pages[i] + "... (" + i + " of " + pages.length + ")");
121                 byte[] pageData = fetchPage(pages[i]);
122
123                 System.out.println("Parsing and cleaning HTML...");
124                 org.w3c.dom.Document JavaDoc pageDocument = parseHtml(pageData);
125                 DOMXPath xpath = new DOMXPath("//div[@class='content']");
126                 Element JavaDoc contentDiv = (Element JavaDoc)xpath.selectSingleNode(pageDocument);
127                 if (contentDiv == null)
128                     throw new Exception JavaDoc("No content found in page " + pages[i]);
129                 String JavaDoc contentData = serialize(contentDivToDoc(contentDiv));
130                 byte[] cleanedContent = clean(contentData);
131
132                 System.out.println("Storing page in Daisy...");
133                 Document document = repository.createDocument(pages[i], "SimpleDocument");
134                 document.setPart("SimpleDocumentContent", "text/xml", cleanedContent);
135                 document.addToCollection(collection);
136                 document.save();
137                 importPages.put(pages[i], new Long JavaDoc(document.getId()));
138                 System.out.println("Done\n");
139             }
140         }
141
142         System.out.println("\n\nWILL NOW START LINK TRANSLATION\n\n");
143
144         Iterator importPagesIt = importPages.entrySet().iterator();
145         while (importPagesIt.hasNext()) {
146             Map.Entry entry = (Map.Entry)importPagesIt.next();
147             String JavaDoc pageName = (String JavaDoc)entry.getKey();
148             long pageId = ((Long JavaDoc)entry.getValue()).longValue();
149
150             System.out.println("Translating links for document " + pageName + "...");
151             Document document = repository.getDocument(pageId, true);
152             byte[] pageData = document.getPart("SimpleDocumentContent").getData();
153             byte[] newData = clean(translateLinks(pageData));
154             document.setPart("SimpleDocumentContent", "text/xml", newData);
155             document.save();
156             System.out.println("Done\n");
157         }
158
159     }
160
161     private byte[] clean(String JavaDoc htmlData) throws Exception JavaDoc {
162         HtmlCleaner cleaner = htmlCleanerTemplate.newHtmlCleaner();
163         return cleaner.cleanToByteArray(htmlData);
164     }
165
166     private org.w3c.dom.Document JavaDoc contentDivToDoc(Element JavaDoc contentDiv) {
167         org.w3c.dom.Document JavaDoc doc = documentBuilder.newDocument();
168         Element JavaDoc htmlEl = doc.createElementNS(null, "html");
169         doc.appendChild(htmlEl);
170         Element JavaDoc bodyEl = doc.createElementNS(null, "body");
171         htmlEl.appendChild(bodyEl);
172         NodeList JavaDoc childNodes = contentDiv.getChildNodes();
173         for (int i = 0; i < childNodes.getLength(); i++) {
174             Node JavaDoc node = childNodes.item(i);
175             boolean append = true;
176             if (node instanceof Element JavaDoc && node.getLocalName().equals("h1")) {
177                 Element JavaDoc divEl = (Element JavaDoc)node;
178                 if (divEl.getAttribute("class").equals("pagename")) {
179                     append = false;
180                 }
181             } else if (node instanceof Element JavaDoc && node.getLocalName().equals("div")) {
182                 Element JavaDoc divEl = (Element JavaDoc)node;
183                 // detect end of content by presence of a div with class bottom.
184
if (divEl.getAttribute("class").equals("bottom")) {
185                     return doc;
186                 }
187             }
188             if (append)
189                 bodyEl.appendChild(doc.importNode(node, true));
190         }
191         return doc;
192     }
193
194     private String JavaDoc serialize(org.w3c.dom.Document JavaDoc doc) throws Exception JavaDoc {
195         TransformerHandler JavaDoc serializer = transformerFactory.newTransformerHandler();
196         StringWriter writer = new StringWriter();
197         serializer.setResult(new StreamResult JavaDoc(writer));
198
199         Transformer JavaDoc streamer = transformerFactory.newTransformer();
200         streamer.transform(new DOMSource JavaDoc(doc), new SAXResult JavaDoc(new ExtraCleanup(serializer)));
201         return writer.toString();
202     }
203
204     private void loadPageNames() throws Exception JavaDoc {
205         byte[] indexPageData = fetchPage("IndexPage");
206         org.w3c.dom.Document JavaDoc document = parseHtml(indexPageData);
207         DOMXPath xpath = new DOMXPath("//a[@class='wikipage']");
208         List JavaDoc nodes = xpath.selectNodes(document);
209         Iterator nodesIt = nodes.iterator();
210         while (nodesIt.hasNext()) {
211             Element JavaDoc element = (Element JavaDoc)nodesIt.next();
212             String JavaDoc href = element.getAttribute("href");
213             if (href.startsWith(wikiPageURL))
214                 allPageNames.add(href.substring(wikiPageURL.length()));
215         }
216     }
217
218     private byte[] fetchPage(String JavaDoc pageName) throws Exception JavaDoc {
219         HttpClient client = new HttpClient();
220         HttpMethod method = new GetMethod(wikiPageURL + pageName);
221         int status = client.executeMethod(method);
222         if (status != HttpStatus.SC_OK)
223             throw new Exception JavaDoc("Problem retrieving wiki page " + pageName + " : " + method.getStatusCode() + " : " + HttpStatus.getStatusText(method.getStatusCode()));
224         return method.getResponseBody();
225     }
226
227     private org.w3c.dom.Document JavaDoc parseHtml(byte[] data) throws Exception JavaDoc {
228         DOMParser parser = new DOMParser(new HTMLConfiguration());
229         parser.setFeature("http://xml.org/sax/features/namespaces", true);
230         parser.setFeature("http://cyberneko.org/html/features/override-namespaces", false);
231         parser.setFeature("http://cyberneko.org/html/features/insert-namespaces", false);
232         parser.setProperty("http://cyberneko.org/html/properties/names/elems", "lower");
233         parser.setProperty("http://cyberneko.org/html/properties/names/attrs", "lower");
234
235         parser.parse(new InputSource(new ByteArrayInputStream(data)));
236         return parser.getDocument();
237     }
238
239     private String JavaDoc translateLinks(byte[] data) throws Exception JavaDoc {
240         TransformerHandler JavaDoc serializer = transformerFactory.newTransformerHandler();
241         StringWriter writer = new StringWriter();
242         serializer.setResult(new StreamResult JavaDoc(writer));
243
244         SAXParserFactory JavaDoc parserFactory = SAXParserFactory.newInstance();
245         parserFactory.setNamespaceAware(true);
246         SAXParser JavaDoc parser = parserFactory.newSAXParser();
247         parser.getXMLReader().setContentHandler(new LinkTranslator(serializer));
248         parser.getXMLReader().parse(new InputSource(new ByteArrayInputStream(data)));
249
250         return writer.toString();
251     }
252
253     class AbstractTransformer implements ContentHandler {
254         protected ContentHandler consumer;
255
256         public AbstractTransformer(ContentHandler consumer) {
257             this.consumer = consumer;
258         }
259
260         public void endDocument()
261         throws SAXException {
262             consumer.endDocument();
263         }
264
265         public void startDocument ()
266         throws SAXException {
267             consumer.startDocument();
268         }
269
270         public void characters (char ch[], int start, int length)
271         throws SAXException {
272             consumer.characters(ch, start, length);
273         }
274
275         public void ignorableWhitespace (char ch[], int start, int length)
276         throws SAXException {
277             consumer.ignorableWhitespace(ch, start, length);
278         }
279
280         public void endPrefixMapping (String JavaDoc prefix)
281         throws SAXException {
282             consumer.endPrefixMapping(prefix);
283         }
284
285         public void skippedEntity (String JavaDoc name)
286         throws SAXException {
287             consumer.skippedEntity(name);
288         }
289
290         public void setDocumentLocator (Locator locator) {
291             consumer.setDocumentLocator(locator);
292         }
293
294         public void processingInstruction (String JavaDoc target, String JavaDoc data)
295         throws SAXException {
296             consumer.processingInstruction(target, data);
297         }
298
299         public void startPrefixMapping (String JavaDoc prefix, String JavaDoc uri)
300         throws SAXException {
301             consumer.startPrefixMapping(prefix, uri);
302         }
303
304         public void endElement (String JavaDoc namespaceURI, String JavaDoc localName,
305                     String JavaDoc qName)
306         throws SAXException {
307             consumer.endElement(namespaceURI, localName, qName);
308         }
309
310         public void startElement (String JavaDoc namespaceURI, String JavaDoc localName,
311                       String JavaDoc qName, Attributes atts)
312         throws SAXException {
313             consumer.startElement(namespaceURI, localName, qName, atts);
314         }
315     }
316
317     class LinkTranslator extends AbstractTransformer {
318
319         public LinkTranslator(ContentHandler consumer) {
320             super(consumer);
321         }
322
323         public void startElement(String JavaDoc uri, String JavaDoc localName, String JavaDoc qName, Attributes attributes) throws SAXException {
324             if (uri.equals("") && localName.equals("a")) {
325                 int index = attributes.getIndex("href");
326                 String JavaDoc href = (index != -1 ? attributes.getValue(index) : null);
327                 if (href != null && href.startsWith(wikiPageURL)) {
328                     String JavaDoc linkedPage = href.substring(wikiPageURL.length());
329                     Long JavaDoc linkedPageId = (Long JavaDoc)importPages.get(linkedPage);
330                     System.out.println("attempt translation of " + linkedPage + " to " + linkedPageId);
331                     if (linkedPageId != null) {
332                         AttributesImpl JavaDoc newAttrs = new AttributesImpl JavaDoc(attributes);
333                         newAttrs.setAttribute(newAttrs.getIndex("href"), "", "href", "href", "CDATA", "daisy:" + linkedPageId.longValue());
334                         attributes = newAttrs;
335                     }
336                 }
337             }
338             consumer.startElement(uri, localName, qName, attributes);
339         }
340     }
341
342     class ExtraCleanup extends AbstractTransformer {
343         private boolean dropNextImgEndTag = false;
344
345         public ExtraCleanup(ContentHandler consumer) {
346             super(consumer);
347         }
348
349         public void startElement(String JavaDoc namespaceURI, String JavaDoc localName, String JavaDoc qName, Attributes atts) throws SAXException {
350             if (namespaceURI.equals("") && localName.equals("img") && ("http://wiki.cocoondev.org/images/out.png".equals(atts.getValue("src")) || "images/attachment_small.png".equals(atts.getValue("src")))) {
351                 dropNextImgEndTag = true;
352                 // skip element
353
} else if (namespaceURI.equals("") && localName.equals("img")) {
354                 String JavaDoc src = atts.getValue("src");
355                 if (src != null) {
356                     if (importedImages.containsKey(src)) {
357                         AttributesImpl JavaDoc newAttrs = new AttributesImpl JavaDoc();
358                         newAttrs.addAttribute("", "src", "src", "CDATA", "daisy:" + importedImages.get(src));
359                     } else {
360                         try {
361                             HttpClient client = new HttpClient();
362                             HttpMethod method = new GetMethod(src);
363                             int status = client.executeMethod(method);
364                             if (status >= 300 && status < 400) {
365                                 method = new GetMethod(method.getResponseHeader("location").getValue());
366                                 status = client.executeMethod(method);
367                             }
368                             if (status != HttpStatus.SC_OK)
369                                 throw new Exception JavaDoc("Problem retrieving image " + src + " : " + method.getStatusCode() + " : " + HttpStatus.getStatusText(method.getStatusCode()));
370                             byte[] data = method.getResponseBody();
371                             String JavaDoc name = getImageName(src);
372                             Document imageDocument = repository.createDocument(name, "Image");
373                             imageDocument.setPart("ImageData", method.getResponseHeader("Content-Type").getValue(), data);
374                             imageDocument.addToCollection(collection);
375                             imageDocument.save();
376                             importedImages.put(src, String.valueOf(imageDocument.getId()));
377                             AttributesImpl JavaDoc newAttrs = new AttributesImpl JavaDoc();
378                             newAttrs.addAttribute("", "src", "src", "CDATA", "daisy:" + imageDocument.getId());
379                             super.startElement("", "img", "img", newAttrs);
380                             System.out.println("Imported image " + src + " as " + name);
381                         } catch (Exception JavaDoc e) {
382                             throw new SAXException("Error getting image " + src, e);
383                         }
384                     }
385                 }
386             } else if (namespaceURI.equals("") && localName.equals("a") && "attachment".equals(atts.getValue("class"))) {
387                 String JavaDoc src = atts.getValue("href");
388                 String JavaDoc decodedSrc = null;
389                 try {
390                     decodedSrc = URLDecoder.decode(src, "UTF-8");
391                 } catch (UnsupportedEncodingException e) {
392                     throw new SAXException(e);
393                 }
394                 if (importedAttachments.containsKey(src)) {
395                     AttributesImpl JavaDoc newAttrs = new AttributesImpl JavaDoc();
396                     newAttrs.addAttribute("", "src", "src", "CDATA", "daisy:" + importedAttachments.get(src));
397                 } else {
398                     try {
399                         HttpClient client = new HttpClient();
400                         HttpMethod method = new GetMethod(src);
401                         int status = client.executeMethod(method);
402                         if (status != HttpStatus.SC_OK)
403                             throw new Exception JavaDoc("Problem retrieving attachment " + src + " : " + method.getStatusCode() + " : " + HttpStatus.getStatusText(method.getStatusCode()));
404                         byte[] data = method.getResponseBody();
405                         String JavaDoc name = getImageName(decodedSrc);
406                         Document attachmentDocument = repository.createDocument(name, "Attachment");
407                         attachmentDocument.setPart("AttachmentData", method.getResponseHeader("Content-Type").getValue(), data);
408                         attachmentDocument.addToCollection(collection);
409                         attachmentDocument.save();
410                         importedAttachments.put(src, String.valueOf(attachmentDocument.getId()));
411                         AttributesImpl JavaDoc newAttrs = new AttributesImpl JavaDoc();
412                         newAttrs.addAttribute("", "href", "href", "CDATA", "daisy:" + attachmentDocument.getId());
413                         super.startElement("", "a", "a", newAttrs);
414                         System.out.println("Imported attachment " + src + " as " + name);
415                     } catch (Exception JavaDoc e) {
416                         throw new SAXException("Error getting attachment " + src, e);
417                     }
418                 }
419             } else {
420                 super.startElement(namespaceURI, localName, qName, atts);
421             }
422         }
423
424         private String JavaDoc getImageName(String JavaDoc src) {
425             String JavaDoc name = src.substring(src.lastIndexOf('/') + 1);
426             int dotpos = name.lastIndexOf('.');
427             if (dotpos != -1) {
428                 name = name.substring(0, dotpos);
429             }
430             return name;
431         }
432
433         public void endElement(String JavaDoc namespaceURI, String JavaDoc localName, String JavaDoc qName) throws SAXException {
434             if (dropNextImgEndTag && namespaceURI.equals("") && localName.equals("img")) {
435                 // skip
436
dropNextImgEndTag = false;
437                 // note that this code assumes img elements are never nested.
438
} else {
439                 super.endElement(namespaceURI, localName, qName);
440             }
441         }
442     }
443 }
444
Popular Tags