package org.outerj.daisy.textextraction.impl;

import org.apache.poi.util.LittleEndian;
import org.apache.poi.poifs.eventfilesystem.POIFSReaderListener;
import org.apache.poi.poifs.eventfilesystem.POIFSReaderEvent;
import org.apache.poi.poifs.eventfilesystem.POIFSReader;
import org.apache.poi.poifs.filesystem.DocumentInputStream;
import java.io.*;

/**
 * Extracts plain text from a binary PowerPoint file.
 *
 * <p>The file is read as a POIFS (OLE2) container. Every stream in it is
 * scanned byte by byte for records of type 4008 (TextBytesAtom in the MS-PPT
 * format), and the payload of each such record is collected as text.
 *
 * <p>The collected text is kept in an instance field while a document is being
 * read, so a single instance must not be used by several threads at once.
 */
public class MSPowerPointTextExtractor implements MimetypeTextExtractor, POIFSReaderListener {

    /** Record type whose payload is collected (TextBytesAtom). */
    private static final int TEXT_BYTES_ATOM = 4008;

    /** Record header layout: 2 bytes version/instance, 2 bytes type, 4 bytes payload length. */
    private static final int RECORD_HEADER_SIZE = 8;

    /**
     * Each payload byte is the low byte of a UTF-16 code unit whose high byte
     * is zero, which is exactly what ISO-8859-1 decoding produces.
     */
    private static final String TEXT_BYTES_CHARSET = "ISO-8859-1";

    /** Written after every non-empty text run so adjacent runs do not merge into one word. */
    private static final int RUN_SEPARATOR = '\n';

    private ByteArrayOutputStream writer = new ByteArrayOutputStream();

    /**
     * Reads a PowerPoint document from the given stream and returns its text.
     *
     * @param is stream positioned at the start of the OLE2 container
     * @return the text runs found, each followed by a newline; empty if none were found
     * @throws Exception if the container cannot be read
     */
    public String getText(InputStream is) throws Exception {
        // Drop anything left over from a previous call; otherwise the text of
        // earlier documents would be returned again in front of this one.
        writer.reset();

        POIFSReader reader = new POIFSReader();
        reader.registerListener(this);
        reader.read(is);

        // Decode with a fixed charset instead of the platform default.
        return writer.toString(TEXT_BYTES_CHARSET);
    }

    /**
     * Called by the POIFS reader once for every stream in the container.
     * Loads the stream into memory and collects the text records found in it.
     *
     * @param event describes the stream that was just encountered
     */
    public void processPOIFSReaderEvent(POIFSReaderEvent event) {
        try {
            // Typed as InputStream: only the generic stream contract is used here.
            InputStream input = event.getStream();

            int expected = input.available();
            byte[] buffer = new byte[expected];

            // A single read() is not guaranteed to fill the buffer.
            int filled = 0;
            while (filled < expected) {
                int count = input.read(buffer, filled, expected - filled);
                if (count <= 0) {
                    break;
                }
                filled += count;
            }

            collectTextRuns(buffer, filled, writer);
        }
        catch (IOException ignored) {
            // Best effort: a stream that cannot be read contributes no text,
            // but text already collected from other streams is kept.
        }
    }

    /**
     * Scans {@code data[0..length)} for text records and appends their payload to {@code out}.
     *
     * <p>The scan tries every byte offset as a possible record start. A match
     * whose declared payload length does not fit in the remaining data is
     * treated as a false positive and skipped.
     */
    private static void collectTextRuns(byte[] data, int length, ByteArrayOutputStream out) {
        int pos = 0;
        while (pos + RECORD_HEADER_SIZE <= length) {
            long type = LittleEndian.getUShort(data, pos + 2);
            long size = LittleEndian.getUInt(data, pos + 4);
            long remaining = (long) length - pos - RECORD_HEADER_SIZE;

            if (type == TEXT_BYTES_ATOM && size <= remaining) {
                // size <= remaining <= Integer.MAX_VALUE, so the cast is safe.
                int textLength = (int) size;
                if (textLength > 0) {
                    // The payload starts right after the 8-byte header.
                    out.write(data, pos + RECORD_HEADER_SIZE, textLength);
                    out.write(RUN_SEPARATOR);
                }
                // Continue behind the complete record.
                pos += RECORD_HEADER_SIZE + textLength;
            } else {
                pos++;
            }
        }
    }
}