OpenOfficeTextExtractor


1   /*
2    * Copyright 2004 Outerthought bvba and Schaubroeck nv
3    *
4    * Licensed under the Apache License, Version 2.0 (the "License");
5    * you may not use this file except in compliance with the License.
6    * You may obtain a copy of the License at
7    *
8    *     http://www.apache.org/licenses/LICENSE-2.0
9    *
10   * Unless required by applicable law or agreed to in writing, software
11   * distributed under the License is distributed on an "AS IS" BASIS,
12   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13   * See the License for the specific language governing permissions and
14   * limitations under the License.
15   */
16  package org.outerj.daisy.textextraction.impl;
17  
18  import java.io.InputStream  ;
19  import java.util.zip.ZipEntry  ;
20  import java.util.zip.ZipInputStream  ;
21  
22  import org.xmlpull.mxp1.MXParser;
23  import org.xmlpull.v1.XmlPullParser;
24  import org.outerj.daisy.xmlutil.XmlReader;
25  
26  /**
27   * Extracts all text from an OpenOffice document.
28   */
29  public class OpenOfficeTextExtractor implements MimetypeTextExtractor {
30      private static final String   TEXTNAMESPACE="http://openoffice.org/2000/text";
31      
32      public String   getText(InputStream   is) throws Exception   {
33          /*
34           * the byte array we receive here is in fact a ZIP containing the
35           * content.xml, styles.xml,meta.xml and META-INF/manifest.xml files. We
36           * are only interested in the content.xml because that's the file
37           * containing the actual content (duh)
38           */
39  
40          ZipInputStream   zis = new ZipInputStream  (is);
41  
42          ZipEntry   ze = null;
43          String   zipEntryName = null;
44          StringBuffer   text = new StringBuffer  ();
45  
46          while ((ze = zis.getNextEntry()) != null
47              && !(zipEntryName = ze.getName()).equals("content.xml")) {
48          }
49  
50          if (zipEntryName != null && zipEntryName.equals("content.xml")) {
51              /*
52               * we found the correct zip entry. This means the "read pointer" of
53               * the zipinputstream points correctly to the beginning of this zip
54               * entry and we can pass it to the xml parser like this (will
55               * return -1 as soon as the end of the zip entry is reached)
56               */            
57              
58              /* We are using this XmlPullParser because it was impossible to work
59               * with a sax parser. The sax parser always wanted to have access to the
60               * openoffice dtd. Even tried to write our own entityresolver to work
61               * around this problem but didnt work out. In order not to pin ourselves
62               * down to a specific sax implementor (where we eg. would be able to 
63               * specify that we explicitly don't want any check at all against a dtd)
64               * we choose not to use sax at all and use a very lightweight type of 
65               * parsing for this specific goal. 
66               */
67              
68              XmlPullParser parser = new MXParser();
69              parser.setFeature(XmlPullParser.FEATURE_PROCESS_NAMESPACES, true);
70              parser.setInput(new XmlReader(zis));
71              boolean inText = false;
72  
73              int eventType = parser.getEventType();
74              while (eventType != XmlPullParser.END_DOCUMENT)
75              {
76                  eventType = parser.next();
77                  if (eventType == XmlPullParser.START_TAG)
78                  {
79                      if (parser.getName().equals("p") &&
80                              parser.getNamespace().equals(TEXTNAMESPACE)) {
81                          text.append(' ');
82                          inText = true;
83                      }
84                  } else if (eventType == XmlPullParser.END_TAG) {
85                      if (parser.getName().equals("p") &&
86                              parser.getNamespace().equals(TEXTNAMESPACE)) {
87                          inText = false;
88                      }
89                  } else if (eventType == XmlPullParser.TEXT) {
90                      if (inText) {
91                          String   gotText = parser.getText();
92                          text.append(gotText);
93                      }
94                  }
95              }
96              
97          } else {
98              throw new Exception  ("Invalid OpenOffice document format (content.xml not found)");
99          }
100 
101         return text.toString();
102     }
103 }
104
A to Z: JavaDoc & Examples Daily Java News & Articles Open Source Projects Open Source Codes Free Computer Books Remove Frame
Popular Tags