SSTDeserializer


1   
2   /* ====================================================================
3      Copyright 2002-2004   Apache Software Foundation
4   
5      Licensed under the Apache License, Version 2.0 (the "License");
6      you may not use this file except in compliance with the License.
7      You may obtain a copy of the License at
8   
9          http://www.apache.org/licenses/LICENSE-2.0
10  
11     Unless required by applicable law or agreed to in writing, software
12     distributed under the License is distributed on an "AS IS" BASIS,
13     WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14     See the License for the specific language governing permissions and
15     limitations under the License.
16  ==================================================================== */
17          
18  
19  package org.apache.poi.hssf.record;
20  
21  import org.apache.poi.util.BinaryTree;
22  import org.apache.poi.util.LittleEndian;
23  import org.apache.poi.util.LittleEndianConsts;
24  
25  /**
26   * Handles the task of deserializing a SST string.  The two main entry points are
27   *
28   * @author Glen Stampoultzis (glens at apache.org)
29   * @author Jason Height (jheight at apache.org)
30   */
31  class SSTDeserializer
32  {
33  
34      private BinaryTree strings;
35      /** this is the number of characters that have been read prior to the continuation */
36      private int continuationReadChars;
37      /** this is the string we were working on before hitting the end of the current record. This string is NOT finished. */
38      private String   unfinishedString;
39      /** this is true if the string uses wide characters */
40      private boolean wideChar;
41      /** this is true if the string is a rich text string */
42      private boolean richText;
43      /** this is true if the string is a far east string or some other wierd string */
44      private boolean extendedText;
45      /** Number of formatting runs in this rich text field */
46      private short runCount;
47      /** Number of characters in current string */
48      private int charCount;
49      private int extensionLength;
50      private int continueSkipBytes = 0;
51  
52  
53      public SSTDeserializer( BinaryTree strings )
54      {
55          this.strings = strings;
56          initVars();
57      }
58  
59      private void initVars()
60      {
61          runCount = 0;
62          continuationReadChars = 0;
63          unfinishedString = "";
64  //        bytesInCurrentSegment = 0;
65  //        stringDataOffset = 0;
66          wideChar = false;
67          richText = false;
68          extendedText = false;
69          continueSkipBytes = 0;
70      }
71  
72      /**
73       * This is the starting point where strings are constructed.  Note that
74       * strings may span across multiple continuations. Read the SST record
75       * carefully before beginning to hack.
76       */
77      public void manufactureStrings( final byte[] data, final int initialOffset)
78      {
79          initVars();
80  
81          int offset = initialOffset;
82          final int dataSize = data.length;
83          while ( offset < dataSize )
84          {
85              int remaining = dataSize - offset;
86  
87              if ( ( remaining > 0 ) && ( remaining < LittleEndianConsts.SHORT_SIZE ) )
88              {
89                  throw new RecordFormatException( "Cannot get length of the last string in SSTRecord" );
90              }
91              if ( remaining == LittleEndianConsts.SHORT_SIZE )
92              {
93                //JMH Dont know about this
94                  setContinuationCharsRead( 0 );//LittleEndian.getUShort( data, offset ) );
95                  unfinishedString = "";
96                  break;
97              }
98              charCount = LittleEndian.getUShort( data, offset );
99              int charsRead = charCount;
100             readStringHeader( data, offset );
101             boolean stringContinuesOverContinuation = remaining < totalStringSize();
102             if ( stringContinuesOverContinuation )
103             {
104                 int remainingBytes = dataSize - offset - stringHeaderOverhead();
105                 //Only read the size of the string or whatever is left before the
106                 //continuation
107                 charsRead = Math.min(charsRead, calculateCharCount( remainingBytes ));
108                 setContinuationCharsRead( charsRead );                
109                 if (charsRead == charCount) {
110                   //Since all of the characters will have been read, but the entire string (including formatting runs etc)
111                   //hasnt, Compute the number of bytes to skip when the continue record starts
112                   continueSkipBytes = offsetForContinuedRecord(0) - (remainingBytes - calculateByteCount(charsRead));
113                 }
114             }
115             processString( data, offset, charsRead );
116             offset += totalStringSize();
117             if ( stringContinuesOverContinuation )
118             {
119                 break;
120             }
121         }
122     }
123 
124 //    private void dump( final byte[] data, int offset, int length )
125 //    {
126 //        try
127 //        {
128 //            System.out.println( "------------------- SST DUMP -------------------------" );
129 //            HexDump.dump( (byte[]) data, offset, System.out, offset, length );
130 //        }
131 //        catch ( IOException e )
132 //        {
133 //        }
134 //        catch ( ArrayIndexOutOfBoundsException e )
135 //        {
136 //        }
137 //        catch ( IllegalArgumentException e )
138 //        {
139 //        }
140 //    }
141 
142     /**
143      * Detemines the option types for the string (ie, compressed or uncompressed unicode, rich text string or
144      * plain string etc) and calculates the length and offset for the string.
145      *
146      */
147     private void readStringHeader( final byte[] data, final int index )
148     {
149 
150         byte optionFlag = data[index + LittleEndianConsts.SHORT_SIZE];
151 
152         wideChar = ( optionFlag & 1 ) == 1;
153         extendedText = ( optionFlag & 4 ) == 4;
154         richText = ( optionFlag & 8 ) == 8;
155         runCount = 0;
156         if ( richText )
157         {
158             runCount = LittleEndian.getShort( data, index + SSTRecord.STRING_MINIMAL_OVERHEAD );
159         }
160         extensionLength = 0;
161         if ( extendedText )
162         {
163             extensionLength = LittleEndian.getInt( data, index + SSTRecord.STRING_MINIMAL_OVERHEAD
164                     + (richText ? LittleEndianConsts.SHORT_SIZE : 0) );
165         }
166 
167     }
168 
169 
170     /**
171      * Reads a string or the first part of a string.
172      *
173      * @param characters the number of characters to write.
174      *
175      * @return the number of bytes written.
176      */
177     private int processString( final byte[] data, final int dataIndex, final int characters )
178     {
179 
180         // length is the length we store it as.  not the length that is read.
181         int length = SSTRecord.STRING_MINIMAL_OVERHEAD + calculateByteCount( characters );
182         byte[] unicodeStringBuffer = new byte[length];
183 
184         int offset = 0;
185 
186         // Set the length in characters
187         LittleEndian.putUShort( unicodeStringBuffer, offset, characters );
188         offset += LittleEndianConsts.SHORT_SIZE;
189         // Set the option flags
190         unicodeStringBuffer[offset] = data[dataIndex + offset];
191         // Copy in the string data
192         int bytesRead = unicodeStringBuffer.length - SSTRecord.STRING_MINIMAL_OVERHEAD;
193         arraycopy( data, dataIndex + stringHeaderOverhead(), unicodeStringBuffer, SSTRecord.STRING_MINIMAL_OVERHEAD, bytesRead );
194         // Create the unicode string
195         UnicodeString string = new UnicodeString( UnicodeString.sid,
196                 (short) unicodeStringBuffer.length,
197                 unicodeStringBuffer );
198         setContinuationCharsRead( calculateCharCount(bytesRead));
199 
200         if ( isStringFinished() )
201         {
202             Integer   integer = new Integer  ( strings.size() );
203             addToStringTable( strings, integer, string );
204         }
205         else
206         {
207             unfinishedString = string.getString();
208         }
209 
210         return bytesRead;
211     }
212 
213     private boolean isStringFinished()
214     {
215         return getContinuationCharsRead() == charCount;
216     }
217 
218     /**
219      * Okay, we are doing some major cheating here. Because we can't handle rich text strings properly
220      * we end up getting duplicate strings.  To get around this I'm doing two things: 1. Converting rich
221      * text to normal text and 2. If there's a duplicate I'm adding a space onto the end.  Sneaky perhaps
222      * but it gets the job done until we can handle this a little better.
223      */
224     static public void addToStringTable( BinaryTree strings, Integer   integer, UnicodeString string )
225     {
226 
227         if ( string.isRichText() )
228             string.setOptionFlags( (byte) ( string.getOptionFlags() & ( ~8 ) ) );
229         if ( string.isExtendedText() )
230             string.setOptionFlags( (byte) ( string.getOptionFlags() & ( ~4 ) ) );
231 
232         boolean added = false;
233         while ( added == false )
234         {
235             try
236             {
237                 strings.put( integer, string );
238                 added = true;
239             }
240             catch ( Exception   ignore )
241             {
242                 string.setString( string.getString() + " " );
243             }
244         }
245 
246     }
247 
248 
249     private int calculateCharCount( final int byte_count )
250     {
251         return byte_count / ( wideChar ? LittleEndianConsts.SHORT_SIZE : LittleEndianConsts.BYTE_SIZE );
252     }
253 
254     /**
255      * Process a Continue record. A Continue record for an SST record
256      * contains the same kind of data that the SST record contains,
257      * with the following exceptions:
258      * <P>
259      * <OL>
260      * <LI>The string counts at the beginning of the SST record are
261      *     not in the Continue record
262      * <LI>The first string in the Continue record might NOT begin
263      *     with a size. If the last string in the previous record is
264      *     continued in this record, the size is determined by that
265      *     last string in the previous record; the first string will
266      *     begin with a flag byte, followed by the remaining bytes (or
267      *     words) of the last string from the previous
268      *     record. Otherwise, the first string in the record will
269      *     begin with a string length
270      * </OL>
271      *
272      * @param record the Continue record's byte data
273      */
274     public void processContinueRecord( final byte[] record )
275     {
276         if ( isStringFinished() )
277         {
278             final int offset = continueSkipBytes;
279             initVars();
280             manufactureStrings( record, offset);
281         }
282         else
283         {
284             // reset the wide bit because that can change across a continuation. the fact that it's
285             // actually rich text doesn't change across continuations even though the rich text
286             // may on longer be set in the "new" option flag.  confusing huh?
287             wideChar = ( record[0] & 1 ) == 1;
288 
289             if ( stringSpansContinuation( record.length - LittleEndianConsts.BYTE_SIZE ) )
290             {
291                 processEntireContinuation( record );
292             }
293             else
294             {
295                 readStringRemainder( record );
296             }
297         }
298 
299     }
300 
301     /**
302      * Reads the remainder string and any subsequent strings from the continuation record.
303      *
304      * @param record  The entire continuation record data.
305      */
306     private void readStringRemainder( final byte[] record )
307     {
308         int stringRemainderSizeInBytes = calculateByteCount( charCount-getContinuationCharsRead() );
309         byte[] unicodeStringData = new byte[SSTRecord.STRING_MINIMAL_OVERHEAD
310                 + stringRemainderSizeInBytes];
311 
312         // write the string length
313         LittleEndian.putShort( unicodeStringData, 0, (short) (charCount-getContinuationCharsRead()) );
314 
315         // write the options flag
316         unicodeStringData[LittleEndianConsts.SHORT_SIZE] = createOptionByte( wideChar, richText, extendedText );
317 
318         // copy the bytes/words making up the string; skipping
319         // past all the overhead of the str_data array
320         arraycopy( record, LittleEndianConsts.BYTE_SIZE, unicodeStringData,
321                 SSTRecord.STRING_MINIMAL_OVERHEAD,
322                 stringRemainderSizeInBytes );
323 
324         // use special constructor to create the final string
325         UnicodeString string = new UnicodeString( UnicodeString.sid,
326                 (short) unicodeStringData.length, unicodeStringData,
327                 unfinishedString );
328         Integer   integer = new Integer  ( strings.size() );
329 
330         addToStringTable( strings, integer, string );
331 
332         int newOffset = offsetForContinuedRecord( stringRemainderSizeInBytes );
333         manufactureStrings( record, newOffset);
334     }
335 
336     /**
337      * Calculates the size of the string in bytes based on the character width
338      */
339     private int stringSizeInBytes()
340     {
341         return calculateByteCount( charCount );
342     }
343 
344     /**
345      * Calculates the size of the string in byes.  This figure includes all the over
346      * heads for the string.
347      */
348     private int totalStringSize()
349     {
350         return stringSizeInBytes()
351                 + stringHeaderOverhead()
352                 + LittleEndianConsts.INT_SIZE * runCount
353                 + extensionLength;
354     }
355 
356     private int stringHeaderOverhead()
357     {
358         return SSTRecord.STRING_MINIMAL_OVERHEAD
359                 + ( richText ? LittleEndianConsts.SHORT_SIZE : 0 )
360                 + ( extendedText ? LittleEndianConsts.INT_SIZE : 0 );
361     }
362 
363     private int offsetForContinuedRecord( int stringRemainderSizeInBytes )
364     {
365         int offset = stringRemainderSizeInBytes + runCount * LittleEndianConsts.INT_SIZE + extensionLength;        
366         if (stringRemainderSizeInBytes != 0)
367           //If a portion of the string remains then the wideChar options byte is repeated,
368           //so need to skip this.
369           offset += + LittleEndianConsts.BYTE_SIZE;
370         return offset;  
371     }
372 
373     private byte createOptionByte( boolean wideChar, boolean richText, boolean farEast )
374     {
375         return (byte) ( ( wideChar ? 1 : 0 ) + ( farEast ? 4 : 0 ) + ( richText ? 8 : 0 ) );
376     }
377 
378     /**
379      * If the continued record is so long is spans into the next continue then
380      * simply suck the remaining string data into the existing <code>unfinishedString</code>.
381      *
382      * @param record    The data from the continuation record.
383      */
384     private void processEntireContinuation( final byte[] record )
385     {
386         // create artificial data to create a UnicodeString
387         int dataLengthInBytes = record.length - LittleEndianConsts.BYTE_SIZE;
388         byte[] unicodeStringData = new byte[record.length + LittleEndianConsts.SHORT_SIZE];
389 
390         int charsRead = calculateCharCount( dataLengthInBytes );
391         LittleEndian.putShort( unicodeStringData, (byte) 0, (short) charsRead );
392         arraycopy( record, 0, unicodeStringData, LittleEndianConsts.SHORT_SIZE, record.length );
393         UnicodeString ucs = new UnicodeString( UnicodeString.sid, (short) unicodeStringData.length, unicodeStringData, unfinishedString);
394 
395         unfinishedString = ucs.getString();
396         setContinuationCharsRead( getContinuationCharsRead() + charsRead );
397         if (getContinuationCharsRead() == charCount) {
398           Integer   integer = new Integer  ( strings.size() );
399           addToStringTable( strings, integer, ucs );
400         }
401     }
402 
403     private boolean stringSpansContinuation( int continuationSizeInBytes )
404     {
405         return calculateByteCount( charCount - getContinuationCharsRead() ) > continuationSizeInBytes;
406     }
407 
408     /**
409      * @return the number of characters we expect in the first
410      *         sub-record in a subsequent continuation record
411      */
412 
413     int getContinuationCharsRead()
414     {
415         return continuationReadChars;
416     }
417 
418     private void setContinuationCharsRead( final int count )
419     {
420         continuationReadChars = count;
421     }
422 
423     private int calculateByteCount( final int character_count )
424     {
425         return character_count * ( wideChar ? LittleEndianConsts.SHORT_SIZE : LittleEndianConsts.BYTE_SIZE );
426     }
427 
428 
429     /**
430      * Copies an array from the specified source array, beginning at the
431      * specified position, to the specified position of the destination array.
432      * A subsequence of array components are copied from the source
433      * array referenced by <code>src</code> to the destination array
434      * referenced by <code>dst</code>. The number of components copied is
435      * equal to the <code>length</code> argument. The components at
436      * positions <code>srcOffset</code> through
437      * <code>srcOffset+length-1</code> in the source array are copied into
438      * positions <code>dstOffset</code> through
439      * <code>dstOffset+length-1</code>, respectively, of the destination
440      * array.
441      * <p>
442      * If the <code>src</code> and <code>dst</code> arguments refer to the
443      * same array object, then the copying is performed as if the
444      * components at positions <code>srcOffset</code> through
445      * <code>srcOffset+length-1</code> were first copied to a temporary
446      * array with <code>length</code> components and then the contents of
447      * the temporary array were copied into positions
448      * <code>dstOffset</code> through <code>dstOffset+length-1</code> of the
449      * destination array.
450      * <p>
451      * If <code>dst</code> is <code>null</code>, then a
452      * <code>NullPointerException</code> is thrown.
453      * <p>
454      * If <code>src</code> is <code>null</code>, then a
455      * <code>NullPointerException</code> is thrown and the destination
456      * array is not modified.
457      * <p>
458      * Otherwise, if any of the following is true, an
459      * <code>ArrayStoreException</code> is thrown and the destination is
460      * not modified:
461      * <ul>
462      * <li>The <code>src</code> argument refers to an object that is not an
463      *     array.
464      * <li>The <code>dst</code> argument refers to an object that is not an
465      *     array.
466      * <li>The <code>src</code> argument and <code>dst</code> argument refer to
467      *     arrays whose component types are different primitive types.
468      * <li>The <code>src</code> argument refers to an array with a primitive
469      *     component type and the <code>dst</code> argument refers to an array
470      *     with a reference component type.
471      * <li>The <code>src</code> argument refers to an array with a reference
472      *     component type and the <code>dst</code> argument refers to an array
473      *     with a primitive component type.
474      * </ul>
475      * <p>
476      * Otherwise, if any of the following is true, an
477      * <code>IndexOutOfBoundsException</code> is
478      * thrown and the destination is not modified:
479      * <ul>
480      * <li>The <code>srcOffset</code> argument is negative.
481      * <li>The <code>dstOffset</code> argument is negative.
482      * <li>The <code>length</code> argument is negative.
483      * <li><code>srcOffset+length</code> is greater than
484      *     <code>src.length</code>, the length of the source array.
485      * <li><code>dstOffset+length</code> is greater than
486      *     <code>dst.length</code>, the length of the destination array.
487      * </ul>
488      * <p>
489      * Otherwise, if any actual component of the source array from
490      * position <code>srcOffset</code> through
491      * <code>srcOffset+length-1</code> cannot be converted to the component
492      * type of the destination array by assignment conversion, an
493      * <code>ArrayStoreException</code> is thrown. In this case, let
494      * <b><i>k</i></b> be the smallest nonnegative integer less than
495      * length such that <code>src[srcOffset+</code><i>k</i><code>]</code>
496      * cannot be converted to the component type of the destination
497      * array; when the exception is thrown, source array components from
498      * positions <code>srcOffset</code> through
499      * <code>srcOffset+</code><i>k</i><code>-1</code>
500      * will already have been copied to destination array positions
501      * <code>dstOffset</code> through
502      * <code>dstOffset+</code><i>k</I><code>-1</code> and no other
503      * positions of the destination array will have been modified.
504      * (Because of the restrictions already itemized, this
505      * paragraph effectively applies only to the situation where both
506      * arrays have component types that are reference types.)
507      *
508      * @param      src          the source array.
509      * @param      src_position start position in the source array.
510      * @param      dst          the destination array.
511      * @param      dst_position pos   start position in the destination data.
512      * @param      length       the number of array elements to be copied.
513      * @exception  IndexOutOfBoundsException  if copying would cause
514      *               access of data outside array bounds.
515      * @exception  ArrayStoreException  if an element in the <code>src</code>
516      *               array could not be stored into the <code>dest</code> array
517      *               because of a type mismatch.
518      * @exception  NullPointerException if either <code>src</code> or
519      *               <code>dst</code> is <code>null</code>.
520      */
521     private void arraycopy( byte[] src, int src_position,
522                             byte[] dst, int dst_position,
523                             int length )
524     {
525         System.arraycopy( src, src_position, dst, dst_position, length );
526     }
527 
528     /**
529      * @return the unfinished string
530      */
531     String   getUnfinishedString()
532     {
533         return unfinishedString;
534     }
535 
536     /**
537      * @return true if current string uses wide characters
538      */
539     boolean isWideChar()
540     {
541         return wideChar;
542     }
543 
544 
545 }
546
A to Z: JavaDoc & Examples Daily Java News & Articles Open Source Projects Open Source Codes Free Computer Books Remove Frame
Popular Tags