1 15 16 package net.nutch.parse.msword.chp; 17 18 import java.util.List ; 19 import java.util.ArrayList ; 20 import java.io.OutputStream ; 21 import java.io.IOException ; 22 23 import org.apache.poi.poifs.common.POIFSConstants; 24 import org.apache.poi.util.LittleEndian; 25 import org.apache.poi.hwpf.model.io.*; 26 import org.apache.poi.hwpf.model.*; 27 28 34 public class Word6CHPBinTable 35 { 36 37 ArrayList _textRuns = new ArrayList (); 38 39 47 public Word6CHPBinTable(byte[] documentStream, int offset, 48 int size, int fcMin) 49 { 50 PlexOfCps binTable = new PlexOfCps(documentStream, offset, size, 2); 51 52 int length = binTable.length(); 53 for (int x = 0; x < length; x++) 54 { 55 GenericPropertyNode node = binTable.getProperty(x); 56 57 int pageNum = LittleEndian.getShort((byte[])node.getBytes()); 58 int pageOffset = POIFSConstants.BIG_BLOCK_SIZE * pageNum; 59 60 CHPFormattedDiskPage cfkp = new CHPFormattedDiskPage(documentStream, 61 pageOffset, fcMin); 62 63 int fkpSize = cfkp.size(); 64 65 for (int y = 0; y < fkpSize; y++) 66 { 67 _textRuns.add(cfkp.getCHPX(y)); 68 } 69 } 70 } 71 72 public List getTextRuns() 73 { 74 return _textRuns; 75 } 76 77 } 78 | Popular Tags |