KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > archive > util > ms > Doc


1 /* Doc
2 *
3 * Created on September 12, 2006
4 *
5 * Copyright (C) 2006 Internet Archive.
6 *
7 * This file is part of the Heritrix web crawler (crawler.archive.org).
8 *
9 * Heritrix is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU Lesser Public License as published by
11 * the Free Software Foundation; either version 2.1 of the License, or
12 * any later version.
13 *
14 * Heritrix is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU Lesser Public License for more details.
18 *
19 * You should have received a copy of the GNU Lesser Public License
20 * along with Heritrix; if not, write to the Free Software
21 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
22 */

23 package org.archive.util.ms;
24
25
26 import java.io.File JavaDoc;
27 import java.io.IOException JavaDoc;
28 import java.io.RandomAccessFile JavaDoc;
29 import java.util.List JavaDoc;
30 import java.util.logging.Level JavaDoc;
31 import java.util.logging.Logger JavaDoc;
32
33 import org.archive.io.Endian;
34 import org.archive.io.RandomAccessInputStream;
35 import org.archive.io.SeekInputStream;
36 import org.archive.io.SeekReader;
37
38
39 /**
40  * Reads .doc files.
41  *
42  * @author pjack
43  */

44 public class Doc {
45     
46     
47     final private static Logger JavaDoc LOGGER = Logger.getLogger(Doc.class.getName());
48     
49
50     /**
51      * Static utility library, do not instantiate.
52      */

53     private Doc() {
54     }
55
56
57     /**
58      * Returns the text of the .doc file with the given file name.
59      *
60      * @param docFilename the name of the file whose text to return
61      * @return the text of that file
62      * @throws IOException if an IO error occurs
63      */

64     public static SeekReader getText(String JavaDoc docFilename) throws IOException JavaDoc {
65         return getText(new File JavaDoc(docFilename));
66     }
67
68
69     /**
70      * Returns the text of the given .doc file.
71      *
72      * @param doc the .doc file whose text to return
73      * @return the text of that file
74      * @throws IOException if an IO error occurs
75      */

76     public static SeekReader getText(File JavaDoc doc) throws IOException JavaDoc {
77         RandomAccessFile JavaDoc raf = new RandomAccessFile JavaDoc(doc, "r");
78         RandomAccessInputStream rais = new RandomAccessInputStream(raf);
79         return getText(rais);
80     }
81
82     
83     /**
84      * Returns the text of the given .doc file.
85      *
86      * @param doc the .doc file whose text to return
87      * @return the text of that file
88      * @throws IOException if an IO error occurs
89      */

90     public static SeekReader getText(SeekInputStream doc) throws IOException JavaDoc {
91         BlockFileSystem bfs = new DefaultBlockFileSystem(doc, 16);
92         return getText(bfs, 20);
93     }
94
95     
96     /**
97      * Returns the text for the given .doc file. The given cacheSize refers
98      * to the number of the .doc file's piece table entries to cache. Most
99      * .doc files only have 1 piece table entry; however, a "fast-saved"
100      * .doc file might have several. A cacheSize of 20 should be ample for
101      * most .doc files in the world. Since piece table entries are small --
102      * only 12 bytes each -- caching them prevents many otherwise necessary
103      * file pointer repositionings.
104      *
105      * @param wordDoc the .doc file as a BlockFileSystem
106      * @param cacheSize the number of piece table entries to cache
107      * @return a reader that will return the text in the file
108      * @throws IOException if an IO error occurs
109      */

110     public static SeekReader getText(BlockFileSystem wordDoc, int cacheSize)
111     throws IOException JavaDoc {
112         List JavaDoc<Entry> entries = wordDoc.getRoot().list();
113         Entry main = find(entries, "WordDocument");
114         SeekInputStream mainStream = main.open();
115         
116         mainStream.position(10);
117         int flags = Endian.littleChar(mainStream);
118         boolean complex = (flags & 0x0004) == 0x0004;
119         boolean tableOne = (flags & 0x0200) == 0x0200;
120         String JavaDoc tableName = tableOne ? "1Table" : "0Table";
121         Entry table = find(entries, tableName);
122         if (LOGGER.isLoggable(Level.FINEST)) {
123             LOGGER.finest("Main entry: " + main);
124             LOGGER.finest("Table entry: " + table);
125         }
126         SeekInputStream tableStream = table.open();
127         
128         mainStream.position(24);
129         int fcMin = Endian.littleInt(mainStream);
130         int fcMax = Endian.littleInt(mainStream);
131         
132         mainStream.position(76);
133         int cppText = Endian.littleInt(mainStream);
134         
135         mainStream.position(418);
136         int fcClx = Endian.littleInt(mainStream);
137         int fcSz = Endian.littleInt(mainStream);
138         
139         if (LOGGER.isLoggable(Level.FINE)) {
140             LOGGER.fine("fcMin: " + fcMin);
141             LOGGER.fine("fcMax: " + fcMax);
142             LOGGER.fine("FcClx: " + fcClx);
143             LOGGER.fine("szClx: " + fcSz);
144             LOGGER.fine("complex: " + complex);
145             LOGGER.fine("cppText: " + cppText);
146         }
147         PieceTable pt = new PieceTable(tableStream, fcClx, fcMax - fcMin, cacheSize);
148         return new PieceReader(pt, mainStream);
149     }
150
151
152     private static Entry find(List JavaDoc<Entry> entries, String JavaDoc name) {
153         for (Entry e: entries) {
154             if (e.getName().equals(name)) {
155                 return e;
156             }
157         }
158         return null;
159     }
160
161 }
162
Popular Tags