KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > outerj > daisy > textextraction > impl > OpenOfficeTextExtractor


1 /*
2  * Copyright 2004 Outerthought bvba and Schaubroeck nv
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */

16 package org.outerj.daisy.textextraction.impl;
17
18 import java.io.InputStream JavaDoc;
19 import java.util.zip.ZipEntry JavaDoc;
20 import java.util.zip.ZipInputStream JavaDoc;
21
22 import org.xmlpull.mxp1.MXParser;
23 import org.xmlpull.v1.XmlPullParser;
24 import org.outerj.daisy.xmlutil.XmlReader;
25
26 /**
27  * Extracts all text from an OpenOffice document.
28  */

29 public class OpenOfficeTextExtractor implements MimetypeTextExtractor {
30     private static final String JavaDoc TEXTNAMESPACE="http://openoffice.org/2000/text";
31     
32     public String JavaDoc getText(InputStream JavaDoc is) throws Exception JavaDoc {
33         /*
34          * the byte array we receive here is in fact a ZIP containing the
35          * content.xml, styles.xml,meta.xml and META-INF/manifest.xml files. We
36          * are only interested in the content.xml because that's the file
37          * containing the actual content (duh)
38          */

39
40         ZipInputStream JavaDoc zis = new ZipInputStream JavaDoc(is);
41
42         ZipEntry JavaDoc ze = null;
43         String JavaDoc zipEntryName = null;
44         StringBuffer JavaDoc text = new StringBuffer JavaDoc();
45
46         while ((ze = zis.getNextEntry()) != null
47             && !(zipEntryName = ze.getName()).equals("content.xml")) {
48         }
49
50         if (zipEntryName != null && zipEntryName.equals("content.xml")) {
51             /*
52              * we found the correct zip entry. This means the "read pointer" of
53              * the zipinputstream points correctly to the beginning of this zip
54              * entry and we can pass it to the xml parser like this (will
55              * return -1 as soon as the end of the zip entry is reached)
56              */

57             
58             /* We are using this XmlPullParser because it was impossible to work
59              * with a sax parser. The sax parser always wanted to have access to the
60              * openoffice dtd. Even tried to write our own entityresolver to work
61              * around this problem but didnt work out. In order not to pin ourselves
62              * down to a specific sax implementor (where we eg. would be able to
63              * specify that we explicitly don't want any check at all against a dtd)
64              * we choose not to use sax at all and use a very lightweight type of
65              * parsing for this specific goal.
66              */

67             
68             XmlPullParser parser = new MXParser();
69             parser.setFeature(XmlPullParser.FEATURE_PROCESS_NAMESPACES, true);
70             parser.setInput(new XmlReader(zis));
71             boolean inText = false;
72
73             int eventType = parser.getEventType();
74             while (eventType != XmlPullParser.END_DOCUMENT)
75             {
76                 eventType = parser.next();
77                 if (eventType == XmlPullParser.START_TAG)
78                 {
79                     if (parser.getName().equals("p") &&
80                             parser.getNamespace().equals(TEXTNAMESPACE)) {
81                         text.append(' ');
82                         inText = true;
83                     }
84                 } else if (eventType == XmlPullParser.END_TAG) {
85                     if (parser.getName().equals("p") &&
86                             parser.getNamespace().equals(TEXTNAMESPACE)) {
87                         inText = false;
88                     }
89                 } else if (eventType == XmlPullParser.TEXT) {
90                     if (inText) {
91                         String JavaDoc gotText = parser.getText();
92                         text.append(gotText);
93                     }
94                 }
95             }
96             
97         } else {
98             throw new Exception JavaDoc("Invalid OpenOffice document format (content.xml not found)");
99         }
100
101         return text.toString();
102     }
103 }
104
Popular Tags