1 31 package org.pdfbox.util; 32 33 import java.awt.geom.Rectangle2D ; 34 import java.io.IOException ; 35 import java.io.StringWriter ; 36 import java.util.ArrayList ; 37 import java.util.HashMap ; 38 import java.util.Iterator ; 39 import java.util.List ; 40 import java.util.Map ; 41 import java.util.Vector ; 42 43 import org.pdfbox.cos.COSStream; 44 import org.pdfbox.pdmodel.PDPage; 45 import org.pdfbox.pdmodel.common.PDStream; 46 47 53 public class PDFTextStripperByArea extends PDFTextStripper 54 { 55 private List regions = new ArrayList (); 56 private Map regionArea = new HashMap (); 57 private Map regionCharacterList = new HashMap (); 58 private Map regionText = new HashMap (); 59 60 64 public PDFTextStripperByArea() throws IOException 65 { 66 super(); 67 setPageSeparator( "" ); 68 } 69 70 76 public void addRegion( String regionName, Rectangle2D rect ) 77 { 78 regions.add( regionName ); 79 regionArea.put( regionName, rect ); 80 } 81 82 87 public List getRegions() 88 { 89 return regions; 90 } 91 92 98 public String getTextForRegion( String regionName ) 99 { 100 StringWriter text = (StringWriter )regionText.get( regionName ); 101 return text.toString(); 102 } 103 104 110 public void extractRegions( PDPage page ) throws IOException 111 { 112 Iterator regionIter = regions.iterator(); 113 while( regionIter.hasNext() ) 114 { 115 String regionName = (String )regionIter.next(); 118 Vector regionCharactersByArticle = new Vector (); 119 regionCharactersByArticle.add( new ArrayList () ); 120 regionCharacterList.put( regionName, regionCharactersByArticle ); 121 regionText.put( regionName, new StringWriter () ); 122 } 123 124 PDStream contentStream = page.getContents(); 125 if( contentStream != null ) 126 { 127 COSStream contents = contentStream.getStream(); 128 processPage( page, contents ); 129 } 130 } 131 132 135 protected void showCharacter( TextPosition text ) 136 { 137 Iterator regionIter = regionArea.keySet().iterator(); 138 while( regionIter.hasNext() ) 139 { 140 String region = (String )regionIter.next(); 141 Rectangle2D rect = (Rectangle2D )regionArea.get( region ); 142 if( rect.contains( text.getX(), text.getY() ) ) 143 { 144 charactersByArticle = (Vector )regionCharacterList.get( region ); 145 super.showCharacter( text ); 146 } 147 } 148 } 149 150 155 protected void flushText() throws IOException 156 { 157 Iterator regionIter = regionArea.keySet().iterator(); 158 while( regionIter.hasNext() ) 159 { 160 String region = (String )regionIter.next(); 161 charactersByArticle = (Vector )regionCharacterList.get( region ); 162 output = (StringWriter )regionText.get( region ); 163 super.flushText(); 164 } 165 } 166 } 167 | Popular Tags |