|                                                                                                              1
 15
 16  package net.nutch.parse.msword;
 17
 18  import net.nutch.parse.msword.chp.*;
 19
 20  import org.apache.poi.util.LittleEndian;
 21  import org.apache.poi.hwpf.model.*;
 22  import org.apache.poi.hwpf.sprm.*;
 23
 24  import java.util.*;
 25  import java.io.*;
 26
 27
 34  class Word6Extractor
 35  {
 36
 37    public Word6Extractor()
 38    {
 39    }
 40
 41
 49    public String
  extractText(byte[] mainStream) throws Exception  50    {
 51      int fcMin = LittleEndian.getInt(mainStream, 0x18);
 52      int fcMax = LittleEndian.getInt(mainStream, 0x1C);
 53
 54      int chpTableOffset = LittleEndian.getInt(mainStream, 0xb8);
 55      int chpTableSize = LittleEndian.getInt(mainStream, 0xbc);
 56
 57          Word6CHPBinTable chpTable = new Word6CHPBinTable(mainStream, chpTableOffset,
 59        chpTableSize, fcMin);
 60      List textRuns = chpTable.getTextRuns();
 61
 62          WordTextBuffer finalTextBuf = new WordTextBuffer();
 64      Iterator runsIt = textRuns.iterator();
 65      while(runsIt.hasNext())
 66      {
 67        CHPX chpx = (CHPX)runsIt.next();
 68        int runStart = chpx.getStart() + fcMin;
 69        int runEnd = chpx.getEnd() + fcMin;
 70
 71        if (!isDeleted(chpx.getGrpprl()))
 72        {
 73          String
  s = new String  (mainStream, runStart, Math.min(runEnd, fcMax) - runStart, "Cp1252"); 74          finalTextBuf.append(s);
 75          if (runEnd >= fcMax)
 76          {
 77            break;
 78          }
 79        }
 80      }
 81
 82      return finalTextBuf.toString();
 83    }
 84
 85
 90    private boolean isDeleted(byte[] grpprl)
 91    {
 92      int offset = 0;
 93      boolean deleted = false;
 94      while (offset < grpprl.length)
 95      {
 96        switch (LittleEndian.getUnsignedByte(grpprl, offset++))
 97        {
 98          case 65:
 99            deleted = grpprl[offset++] != 0;
 100           break;
 101         case 66:
 102           offset++;
 103           break;
 104         case 67:
 105           offset++;
 106           break;
 107         case 68:
 108           offset += grpprl[offset];
 109           break;
 110         case 69:
 111           offset += 2;
 112           break;
 113         case 70:
 114           offset += 4;
 115           break;
 116         case 71:
 117           offset++;
 118           break;
 119         case 72:
 120           offset += 2;
 121           break;
 122         case 73:
 123           offset += 3;
 124           break;
 125         case 74:
 126           offset += grpprl[offset];
 127           break;
 128         case 75:
 129           offset++;
 130           break;
 131         case 80:
 132           offset += 2;
 133           break;
 134         case 81:
 135           offset += grpprl[offset];
 136           break;
 137         case 82:
 138           offset += grpprl[offset];
 139           break;
 140         case 83:
 141           break;
 142         case 85:
 143           offset++;
 144           break;
 145         case 86:
 146           offset++;
 147           break;
 148         case 87:
 149           offset++;
 150           break;
 151         case 88:
 152           offset++;
 153           break;
 154         case 89:
 155           offset++;
 156           break;
 157         case 90:
 158           offset++;
 159           break;
 160         case 91:
 161           offset++;
 162           break;
 163         case 92:
 164           offset++;
 165           break;
 166         case 93:
 167           offset += 2;
 168           break;
 169         case 94:
 170           offset++;
 171           break;
 172         case 95:
 173           offset += 3;
 174           break;
 175         case 96:
 176           offset += 2;
 177           break;
 178         case 97:
 179           offset += 2;
 180           break;
 181         case 98:
 182           offset++;
 183           break;
 184         case 99:
 185           offset++;
 186           break;
 187         case 100:
 188           offset++;
 189           break;
 190         case 101:
 191           offset++;
 192           break;
 193         case 102:
 194           offset++;
 195           break;
 196         case 103:
 197           offset += grpprl[offset];
 198           break;
 199         case 104:
 200           offset++;
 201           break;
 202         case 105:
 203           offset += grpprl[offset];
 204           break;
 205         case 106:
 206           offset += grpprl[offset];
 207           break;
 208         case 107:
 209           offset += 2;
 210           break;
 211         case 108:
 212           offset += grpprl[offset];
 213           break;
 214         case 109:
 215           offset += 2;
 216           break;
 217         case 110:
 218           offset += 2;
 219           break;
 220         case 117:
 221           offset++;
 222           break;
 223         case 118:
 224           offset++;
 225           break;
 226
 227       }
 228     }
 229     return deleted;
 230   }
 231 }
 232
                                                                                                                                                                                                             |                                                                       
 
 
 
 
 
                                                                                   Popular Tags                                                                                                                                                                                              |