1 16 package org.outerj.daisy.textextraction.impl; 17 18 import java.io.InputStream ; 19 import java.util.zip.ZipEntry ; 20 import java.util.zip.ZipInputStream ; 21 22 import org.xmlpull.mxp1.MXParser; 23 import org.xmlpull.v1.XmlPullParser; 24 import org.outerj.daisy.xmlutil.XmlReader; 25 26 29 public class OpenOfficeTextExtractor implements MimetypeTextExtractor { 30 private static final String TEXTNAMESPACE="http://openoffice.org/2000/text"; 31 32 public String getText(InputStream is) throws Exception { 33 39 40 ZipInputStream zis = new ZipInputStream (is); 41 42 ZipEntry ze = null; 43 String zipEntryName = null; 44 StringBuffer text = new StringBuffer (); 45 46 while ((ze = zis.getNextEntry()) != null 47 && !(zipEntryName = ze.getName()).equals("content.xml")) { 48 } 49 50 if (zipEntryName != null && zipEntryName.equals("content.xml")) { 51 57 58 67 68 XmlPullParser parser = new MXParser(); 69 parser.setFeature(XmlPullParser.FEATURE_PROCESS_NAMESPACES, true); 70 parser.setInput(new XmlReader(zis)); 71 boolean inText = false; 72 73 int eventType = parser.getEventType(); 74 while (eventType != XmlPullParser.END_DOCUMENT) 75 { 76 eventType = parser.next(); 77 if (eventType == XmlPullParser.START_TAG) 78 { 79 if (parser.getName().equals("p") && 80 parser.getNamespace().equals(TEXTNAMESPACE)) { 81 text.append(' '); 82 inText = true; 83 } 84 } else if (eventType == XmlPullParser.END_TAG) { 85 if (parser.getName().equals("p") && 86 parser.getNamespace().equals(TEXTNAMESPACE)) { 87 inText = false; 88 } 89 } else if (eventType == XmlPullParser.TEXT) { 90 if (inText) { 91 String gotText = parser.getText(); 92 text.append(gotText); 93 } 94 } 95 } 96 97 } else { 98 throw new Exception ("Invalid OpenOffice document format (content.xml not found)"); 99 } 100 101 return text.toString(); 102 } 103 } 104 | Popular Tags |