OpenDocumentTextInputStream


1   /*
2       OpenDocumentTextInputStream extracts raw text from an OpenDocument
3       text file.
4       Copyright (C) 2005  J. David Eisenberg
5   
6       This library is free software; you can redistribute it and/or
7       modify it under the terms of the GNU Lesser General Public
8       License as published by the Free Software Foundation; either
9       version 2.1 of the License, or (at your option) any later version.
10  
11      This library is distributed in the hope that it will be useful,
12      but WITHOUT ANY WARRANTY; without even the implied warranty of
13      MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14      Lesser General Public License for more details.
15  
16      You should have received a copy of the GNU Lesser General Public
17      License along with this library; if not, write to the Free Software
18      Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
19      
20      Author: J. David Eisenberg
21      Contact: catcode@catcode.com
22  
23  */
24  package com.catcode.odf;
25  
26  import java.io.InputStream  ;
27  import java.io.IOException  ;
28  
29  import java.util.ArrayList  ;
30  import java.util.Collections  ;
31  import java.util.regex.Matcher  ;
32  import java.util.regex.Pattern  ;
33  import java.io.FilterInputStream  ;
34  
35  /**
36   * OpenDocumentTextInputStream reads the content of an
37   * OASIS Open Document Format text (word processing) file.
38   * <p>
39   * Limitations/restrictions:
40   * </p>
41   * <ul>
42   * <li>The namespaces must all be in the root element.</li>
43   * <li>No data is returned for embedded objects.</li>
44   * <li>Will not properly handle XML comments that contain elements.</li>
45   * </ul>
46   *
47   * <p>
48   * You can set two lists of element names (using the
49   * <code>OpenDocumentElement</code> class). The capture list is the
50   * list of elements whose text you want; the omit list is the
51   * list of elements within which text is never output.  The default
52   * value for the capture list is <code>&lt;text:p&gt;</code> and
53   * <code>&lt;text:h</code>. The default value for the omit list
54   * is <code>&lt;text:tracked-changes&gt;</code>.
55   * </p>
56   *
57   *  @author     J. David Eisenberg
58   *  @version    0.1, 2005-10-16
59  */
60  
61  public class OpenDocumentTextInputStream extends FilterInputStream  
62  {
63      private StringBuffer   tagBuffer; // collect the tag
64      
65      private String   textNamespace;   // the namespace prefix for <text:...>
66      private static final Pattern   elementNamePattern =
67          Pattern.compile("^/?(?:([\\p{L}\\p{N}_.-]+):)?([\\p{L}\\p{N}_.-]+)");
68      
69      /*
70       * If the source file has a Unicode character whose value is
71       * >= 0x80, then we have to split it into several bytes and
72       * parcel them out one at time when read() is called.
73       * The utf8Ouput buffer is the holding area for those bytes.
74       */
75      private int[] utf8Output;
76      private int utf8OutputPosition;
77      private int utf8OutputLength;
78      
79      /*
80       * We are interested only in text within "capture" elements.
81       * We also keep track of how deeply nested we are in 
82       * capture elements.
83       *
84       * The capture list must be kept in sorted order.
85       */
86      private static ArrayList   captureList;
87      private int captureDepth;
88      
89      /*
90       * If we are insite an "omit" element, then we never
91       * output its text, even if we encounter a capture element
92       * inside.
93       *
94       * The omit list must be kept in sorted order.
95       */
96      private static ArrayList   omitList;
97      private int omitDepth;
98  
99      private boolean rootElement;
100     /*
101      * The standard five (and only recognized!) entities
102      * and their corresponding characters
103      */
104     private static final String  [] stdFiveEntities = {
105         "apos", "quot", "lt", "gt", "amp"
106     };
107     static final byte[] stdFiveValues = {
108         '\'', '"', '<', '>', '&'
109     };
110     
111     /**
112      * Constructs an OASIS Open Document Text input stream.
113      *
114      * @param   in              the actual input stream
115      */
116     public OpenDocumentTextInputStream( InputStream   in )
117     {
118         this( in, null, null );
119     }
120 
121     /**
122      * Constructs an OASIS Open Document Text input stream.
123      * This constructor lets you provide a list of "capture" elements
124      * whose content you wish to examine. and "omit" elements whose
125      * content will always be omitted. These lists <em>must</em>
126      * be sorted into Unicode order, since it will be searched with
127      * <code>binarySearch()</code>.
128      * <p>
129      * If you want an empty list for either one of these, pass in
130      * an empty <code>ArrayList</code>. Passing in <code>null</code>
131      * will set you up with the default capture or omit list.
132      * </p>
133      *
134      * @param   in          the actual input stream
135      * @param   capture     an <code>ArrayList</code> of
136      *                      elements whose content will be
137      *                      read by this stream
138      * @param   omit        An <code>ArrayList</code> of element
139      *                      whose content will be ignored by ths stream.
140      */
141     public OpenDocumentTextInputStream( InputStream   in,
142         ArrayList   capture, ArrayList   omit )
143     {
144         super( in );
145         
146         /* initialize variables */
147         utf8Output = new int[4];
148         utf8OutputPosition = 0;
149         utf8OutputLength = 0;
150         rootElement = true;
151         
152         if (capture == null)
153         {
154             captureList = new ArrayList  (4);
155             captureList.add( new ElementPostProcess( "h", '\n') );
156             captureList.add( new ElementPostProcess( "p", '\n' ) );
157             captureList.add( new ElementPostProcess( "tab", '\t' ) );
158             captureList.add( new ElementPostProcess( "s", ' ') );
159         }
160         else
161         {
162             this.captureList = capture;
163         }
164         if (omit == null)
165         {
166             omitList = new ArrayList  (1);
167             omitList.add( new ElementPostProcess( "tracked-changes", '\0' ) );
168         }
169         else
170         {
171             this.omitList = omit;
172         }
173         captureDepth = 0;
174         omitDepth = 0;
175     }
176 
177     /**
178      * Reads the next byte of data from this input stream.
179      * The value byte is returned as an <code>int</code> in the range 0 to 255.
180      * If no byte is available because the end of the stream has been reached,
181      * the value -1 is returned. Only bytes within "relevant" elements (as
182      * listed in the <code>relevantElement</code> list) are returned.
183      * This method blocks until input data is available, the end of the stream
184      * is detected, or an exception is thrown.
185      *
186      *  @return the next byte of data, or <code>-1</code>
187      *          if the end of the stream is reached.
188      *  @throws IOException if an I/O error occurs.
189      * 
190      */
191     public int read( ) throws IOException  
192     {
193         int theByte = 0;
194         int result = 0;
195         while (theByte == 0)
196         {
197             /*
198                 If we still have a UTF-8 sequence in progress, emit it.
199             */
200             if (utf8OutputPosition < utf8OutputLength)
201             {
202                 theByte = utf8Output[utf8OutputPosition++];
203             }
204             else
205             {
206                 theByte = in.read( );
207                 if (theByte == '<')
208                 {
209                     collectTag();
210                     theByte = 0;
211                 }
212                 else if (theByte == '&')
213                 {
214                     collectEntity();
215                     theByte = 0;
216                 }
217                 else if ((omitDepth > 0 || captureDepth == 0) && theByte != -1)
218                 {
219                     theByte = 0;
220                 }
221             }               
222         }
223         return theByte;
224     }
225     
226     /**
227      * Reads some number of bytes from the input stream and stores them into
228      * the buffer array <code>b</code>. The number of bytes actually read is
229      * returned as an integer.
230      */
231     public int read(byte b[]) throws IOException  
232     {
233         return read(b, 0, b.length);
234     }
235 
236     /**
237      * Reads up to <code>len</code> bytes of data from the input stream into
238      * an array of bytes. The number of bytes actually read is
239      * returned as an integer. See <code>InputStream</code> for details.
240      * In fact, this code is copied straight from that file.
241      */
242     public int read(byte b[], int off, int len) throws IOException  
243     {
244         if (b == null)
245         {
246             throw new NullPointerException  ();
247         }
248         else if ((off < 0) || (off > b.length) || (len < 0) ||
249            ((off + len) > b.length) || ((off + len) < 0))
250         {
251             throw new IndexOutOfBoundsException  ();
252         }
253         else if (len == 0)
254         {
255             return 0;
256         }
257 
258         int c = read();
259 
260         if (c == -1)
261         {
262             return -1;
263         }
264         
265         b[off] = (byte) c;
266     
267         int i = 1;
268         try
269         {
270             for (; i < len ; i++)
271             {
272                 c = read();
273                 if (c == -1)
274                 {
275                     break;
276                 }
277                 if (b != null)
278                 {
279                     b[off + i] = (byte) c;
280                 }
281             }
282         }
283         catch (IOException   ee)
284         {
285         }
286         return i;
287     }
288 
289      /**
290      * Skips specified number of bytes in the current ODT file entry.
291      * @param n the number of bytes to skip
292      * @return the actual number of bytes skipped
293      * @exception IOException if an I/O error has occurred
294      * @exception IllegalArgumentException if n < 0
295      */
296     public long skip(long n) throws IOException  
297     {
298         byte[] tmpbuf = new byte[512];
299         long remaining = 0;
300 
301         if (n < 0)
302         {
303             throw new IllegalArgumentException  ("negative skip length");
304         }
305         int max = (int) Math.min(n, Integer.MAX_VALUE);
306         int total = 0;
307         while (total < max)
308         {
309             int len = max - total;
310             if (len > tmpbuf.length)
311             {
312                 len = tmpbuf.length;
313             }
314             len = read(tmpbuf, 0, len);
315             if (len == -1)
316             {
317                 break;
318             }
319             total += len;
320         }
321         return total;
322     }
323 
324     /**
325      * Collect all characters up to and including the ending semicolon
326      * of the entity.
327      *
328      * Accepts entities in form &#nnn; &#xnnn; &alpha;, but checks to see
329      * that alpha entities are only the "big five".
330      * <p>
331      * This method will fill the <code>utf8Output[]</code> array,
332      * set <code>utf8OutputLength</code> appropriately, and
333      * set <code>utf8OutputPosition</code> to zero.
334      * </p>
335      * <p>
336      * If we hit the end of file, put <code>-1</code> in the utf8 buffer;
337      * the main loop in <code>read()</code> will emit it the next time through.
338      * </p>
339      *
340      * @throws  IOException if I/O error occurs while reading bytes.
341      */
342     protected void collectEntity( ) throws IOException  
343     {
344         StringBuffer   strBuf = new StringBuffer  (10);
345         String   entityString;
346         int entityValue;
347         int b;
348         int i;  // ubiquitous counter
349         
350         b = super.read();
351         while (b != ';' && b != -1)
352         {
353             if (b != -1)
354             {
355                 strBuf.append( (char) b );
356                 b = super.read();
357             }
358         }
359         if (b != -1)
360         {
361             if (strBuf.charAt(0) == '#')
362             {
363                 /* numeric entity; leading "x" means hex */
364                 entityString = strBuf.substring(1).toLowerCase();
365                 if (entityString.startsWith("x"))
366                 {
367                     entityString = entityString.substring(1);
368                     entityValue = Integer.parseInt( entityString, 16 );
369                 }
370                 else
371                 {
372                     entityValue = Integer.parseInt( entityString, 10 );
373                 }
374                 createUTF8Output( entityValue );
375             }
376             else
377             {
378                 /* alphabetic entity */
379                 entityString = strBuf.toString();
380                 for (i=0; i < stdFiveEntities.length &&
381                     !entityString.equals( stdFiveEntities[i] ); i++)
382                     // do nothing; all action is in the loop count
383                     ;
384                 if (i == stdFiveEntities.length)
385                 {
386                     throw new IllegalArgumentException  ( "Unknown entity &"
387                         + entityString + ";" );
388                 }
389                 utf8Output[0] = stdFiveValues[i];
390                 utf8OutputLength = 1;
391             }
392         }
393         else
394         {
395             utf8Output[0] = -1;
396             utf8OutputLength = 1;
397         }
398         utf8OutputPosition = 0;
399     }
400     
401     /**
402      * Split a Unicode value into UTF-8 bytes.
403      * Puts bytes into <code>utf8Output[]</code> and sets the
404      * <code>utf8OutputLength</code> appropriately.
405      */
406     protected void createUTF8Output( int value )
407     {
408         /*
409          *   Char. number range  |        UTF-8 octet sequence
410          *      (hexadecimal)    |              (binary)
411          *   --------------------+----------------------------------
412          *   0000 0000-0000 007F | 0xxxxxxx
413          *   0000 0080-0000 07FF | 110xxxxx 10xxxxxx
414          *   0000 0800-0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
415          *   0001 0000-0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
416          */
417         if (value > 0x10ffff)
418         {
419             throw new IllegalArgumentException  (
420                 value + " outside Unicode range."
421             );
422         }
423         if (value <= 0x7f)
424         {
425             utf8Output[0] = value & 0x7f;
426             utf8OutputLength = 1;
427         }
428         else if (value <= 0x7ff)
429         {
430             utf8Output[0] = 0xc0 | ((value >> 6) & 0x1f);
431             utf8Output[1] = 0x80 | (value & 0x3f);
432             utf8OutputLength = 2;
433         }
434         else if (value <= 0xffff)
435         {
436             utf8Output[0] = 0xe0 | ((value >> 12) & 0xf);
437             utf8Output[1] = 0x80 | ((value >> 6) & 0x3f);
438             utf8Output[2] = 0x80 | (value  & 0x3f);
439             utf8OutputLength = 3;
440         }
441         else
442         {
443             utf8Output[0] = 0xf0 | ((value >> 18) & 0x7);
444             utf8Output[1] = 0x80 | ((value >> 12) & 0x3f);
445             utf8Output[2] = 0x80 | ((value >> 6) & 0x3f);
446             utf8Output[3] = 0x80 | (value & 0x3f);
447             utf8OutputLength = 4;
448         }
449     }
450     
451     /**
452      * Collects information between angle brackets into a string buffer.
453      *
454      * <p>
455      * Reads from file until encountering a &gt; symbol.  If a byte
456      * has a value greater than 127, then call <code>collectUTF8()</code>
457      * to combine it and the following bytes into a Unicode character.
458      * </p>
459      * <p>
460      * If we hit the end of file, put <code>-1</code> in the utf8 buffer;
461      * the main loop in <code>read()</code> will emit it the next time through.
462      * @throws  IOException if I/O error occurs while reading bytes.
463      * </p>
464      */
465     protected void collectTag() throws IOException  
466     {
467         int b = 0;
468         int nUTF8;
469 
470         tagBuffer = new StringBuffer  (50);
471         b = super.read();
472         while (b != '>' && b != -1)
473         {
474             if (b > 127)
475             {
476                 b = collectUTF8( b );
477             }
478             /* replace whitespace characters with blanks */
479             if (b == 0x09 || b == 0x0a || b == 0x0d || b == 0x0085
480                 || b == 0x2028 || b == 0x2029)
481             {
482                 b = 0x20;
483             }
484             tagBuffer.append( (char) b );
485             b = super.read();
486         }
487         if (b != -1)
488         {
489             analyzeTag( tagBuffer.toString() );
490         }
491         else
492         {
493             utf8Output[0] = -1;
494             utf8OutputLength = 1;
495             utf8OutputPosition = 0;
496         }
497     }
498 
499     /**
500      * Create a UTF-8 character from individual bytes.
501      *
502      * @param startByte the starting byte of a UTF-8 sequence.
503      * @return a UTF-8 character.
504      */
505     protected int collectUTF8( int startByte ) throws IOException  
506     {
507         int highBits = (startByte >> 4) & 0x0f;
508         int nUTF8;
509         int[] utf8Buf = new int[4];
510         int oneByte = 0;
511         int result;
512         int i;
513         
514         utf8Buf[0] = startByte;
515         if (highBits == 12 || highBits == 13)
516         {
517             nUTF8 = 1;
518         }
519         else if (highBits == 14)
520         {
521             nUTF8 = 2;
522         }
523         else
524         {
525             nUTF8 = 3;
526         }
527         for (i=0; i < nUTF8 && oneByte != -1; i++)
528         {
529             oneByte = super.read();
530             if (oneByte != -1)
531             {
532                 utf8Buf[i+1] = oneByte;
533             }
534         }
535         if (oneByte != -1)
536         {
537             result = 0;
538             switch (highBits)
539             {
540                 case 12:
541                 case 13:
542                     result = ((utf8Buf[0] & 0x1f) << 6)
543                         | (utf8Buf[1] & 0x3f);
544                     break;
545                 case 14:
546                     result = ((utf8Buf[0] & 0x0f) << 12)
547                         | ((utf8Buf[1] & 0x3f) << 6)
548                         | (utf8Buf[2] & 0x3f);
549                     break;
550                 case 15:
551                     result = ((utf8Buf[0] & 0x07) << 18)
552                         | ((utf8Buf[1] & 0x3f) << 12)
553                         | ((utf8Buf[2] & 0x3f) << 6)
554                         | (utf8Buf[3] & 0x3f);
555                     break;          
556             }
557         }
558         else
559         {
560             result = -1;
561         }
562         return result;
563     }
564     
565     /**
566      * Set flags to accept or reject characters in this tag.
567      *
568      * @param tag the tag to be analyzed
569      */
570     protected void analyzeTag( String   tag )
571     {
572         Matcher   m;
573         String   prefix;
574         String   name;
575         boolean isOpeningTag;
576         boolean isClosingTag;
577         int position;
578 
579         if ( !tag.startsWith("!") && !tag.startsWith( "?" ) )
580         {
581             m = elementNamePattern.matcher( tag );
582             if (m.find())
583             {
584                 prefix = m.group(1);
585                 name = m.group(2);
586                 
587                 /*
588                  * If this is the root element, it will have the text
589                  * namespace in it
590                  */
591                 if (rootElement && !tag.startsWith("?") &&
592                     !tag.startsWith("!"))
593                 {
594                     Pattern   textURI =
595                     Pattern.compile("xmlns:?([\\p{L}\\p{N}_.-]*)\\s*=\\s*" +
596                         "\"urn:oasis:names:tc:opendocument:xmlns:text:1.0\"");
597                     m = textURI.matcher( tag );
598                     if (m.find())
599                     {
600                         textNamespace = m.group(1);
601                     }
602                     else
603                     {
604                         throw new IllegalArgumentException  (
605                             "Cannot find namespace for text"
606                         );
607                     }
608                     rootElement = false;
609                 }
610                 
611                 isOpeningTag = !tag.startsWith("/");
612                 isClosingTag = tag.startsWith("/") || tag.endsWith("/");
613                 if (prefix.equals(textNamespace))
614                 {
615                     position = findTag( omitList, name );
616                     if (position >= 0)
617                     {
618                         if (isOpeningTag)
619                         {
620                             omitDepth++;
621                         }
622                         if (isClosingTag)
623                         {
624                             omitDepth--;
625                         }
626                     }
627 
628                     position = findTag( captureList, name );
629                     if (position >= 0)
630                     {
631                         ElementPostProcess elementInfo =
632                             (ElementPostProcess) captureList.get(position);
633                         if (isOpeningTag)
634                         {
635                             captureDepth++;
636                         }
637                         if (isClosingTag)
638                         {
639                             if ( elementInfo.getPostProcess() != '\0' &&
640                                 omitDepth == 0)
641                             {
642                                 utf8Output[0] = elementInfo.getPostProcess();
643                                 utf8OutputLength = 1;
644                                 utf8OutputPosition = 0;
645                             }
646                             captureDepth--;
647                         }
648                     }
649                 }
650             }
651             else
652             {
653                 throw new IllegalArgumentException  ( "Unknown tag <" +
654                     tag + ">");
655             }
656         }
657     }
658     
659     /**
660      * Locates a tag name within a list of <code>ElementPostProcess</code>.
661      *
662      * @param list an ArrayList of ElementPostProcess objects.
663      * @param name the name to search for.
664      * @return the position in the list, or -1 if not found.
665      */
666     private int findTag( ArrayList   list, String   name )
667     {
668         int result = -1;
669         int i = 0;
670         while (i < list.size() && result == -1)
671         {
672             if (((ElementPostProcess)list.get(i)).getName().equals(name))
673             {
674                 result = i;
675             }
676             else
677             {
678                 i++;
679             }
680         }
681         return result;
682     }
683 }
684 
685
A to Z: JavaDoc & Examples Daily Java News & Articles Open Source Projects Open Source Codes Free Computer Books Remove Frame
Popular Tags