KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > archive > util > ms > DocTest


1 /* DocTest
2 *
3 * Created on September 12, 2006
4 *
5 * Copyright (C) 2006 Internet Archive.
6 *
7 * This file is part of the Heritrix web crawler (crawler.archive.org).
8 *
9 * Heritrix is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU Lesser Public License as published by
11 * the Free Software Foundation; either version 2.1 of the License, or
12 * any later version.
13 *
14 * Heritrix is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU Lesser Public License for more details.
18 *
19 * You should have received a copy of the GNU Lesser Public License
20 * along with Heritrix; if not, write to the Free Software
21 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
22 */

23 package org.archive.util.ms;
24
25 import java.io.Closeable JavaDoc;
26 import java.io.File JavaDoc;
27 import java.io.FileInputStream JavaDoc;
28 import java.io.FileOutputStream JavaDoc;
29 import java.io.IOException JavaDoc;
30 import java.io.InputStreamReader JavaDoc;
31 import java.io.OutputStreamWriter JavaDoc;
32 import java.io.Reader JavaDoc;
33 import java.io.Writer JavaDoc;
34
35 import org.apache.poi.hdf.extractor.WordDocument;
36
37 import junit.framework.TestCase;
38
39
40 public class DocTest extends TestCase {
41
42     
43     final private static File JavaDoc TEST_DIR = new File JavaDoc("testdata/ms");
44
45     
46     // Rename to testAgainstPOI to actually run the test.
47
public void testAgainstPOI() throws IOException JavaDoc {
48         int errors = 0;
49         long start = System.currentTimeMillis();
50         for (File JavaDoc f: TEST_DIR.listFiles()) try {
51             start = System.currentTimeMillis();
52             if (f.getName().endsWith(".doc")) {
53                 errors += runDoc(f);
54             }
55         } finally {
56             long duration = System.currentTimeMillis() - start;
57             System.out.println("Duration in milliseconds: " + duration);
58         }
59         if (errors > 0) {
60             throw new IOException JavaDoc(errors + " errors, see stdout.");
61         }
62     }
63
64     
65     private int runDoc(File JavaDoc doc) throws IOException JavaDoc {
66         System.out.println("===== Now processing " + doc.getName());
67         String JavaDoc name = doc.getName();
68         int p = name.lastIndexOf('.');
69         String JavaDoc expectedName = name.substring(0, p) + ".txt";
70         File JavaDoc expectedFile = new File JavaDoc(TEST_DIR, expectedName);
71         if (!expectedFile.exists()) {
72             createExpectedOutput(doc, expectedFile);
73         }
74         return runFiles(doc, expectedFile);
75     }
76     
77     
78     private void createExpectedOutput(File JavaDoc doc, File JavaDoc output)
79     throws IOException JavaDoc {
80         FileInputStream JavaDoc finp = new FileInputStream JavaDoc(doc);
81         FileOutputStream JavaDoc fout = new FileOutputStream JavaDoc(output);
82
83         try {
84             WordDocument wd = new WordDocument(finp);
85             Writer JavaDoc writer = new OutputStreamWriter JavaDoc(fout, "UTF-16BE");
86             wd.writeAllText(writer);
87         } finally {
88             close(finp);
89             close(fout);
90         }
91     }
92     
93     
94     private static void close(Closeable JavaDoc c) {
95         try {
96             c.close();
97         } catch (IOException JavaDoc e) {
98             e.printStackTrace();
99         }
100     }
101
102     
103     private int runFiles(File JavaDoc doc, File JavaDoc expected)
104     throws IOException JavaDoc {
105         FileInputStream JavaDoc expectedIn = new FileInputStream JavaDoc(expected);
106         Reader JavaDoc expectedReader = new InputStreamReader JavaDoc(expectedIn, "UTF-16BE");
107         Reader JavaDoc docReader = Doc.getText(doc);
108         try {
109             return runReaders(docReader, expectedReader);
110         } finally {
111             close(docReader);
112             close(expectedReader);
113         }
114     }
115     
116     
117     private int runReaders(Reader JavaDoc doc, Reader JavaDoc expected)
118     throws IOException JavaDoc {
119         int count = 0;
120         int errors = 0;
121         boolean go = true;
122         while (go) {
123             int ch = doc.read();
124             int expectedCh = correctPOI(expected.read());
125             if ((ch < 0) || (expectedCh < 0)) {
126                 go = false;
127                 if ((ch >= 0) || (expectedCh >= 0)) {
128                     errors++;
129                     System.out.println("File lengths differ.");
130                 }
131             }
132             if (ch != expectedCh) {
133                 errors += 1;
134                 report(count, expectedCh, ch);
135             }
136             count++;
137         }
138         return errors;
139     }
140
141     
142     private void report(int count, int expected, int actual) {
143         StringBuilder JavaDoc msg = new StringBuilder JavaDoc("#").append(count);
144         msg.append(": Expected ");
145         msg.append(expected).append(" (").append(toChar(expected));
146         msg.append(") but got ").append(actual).append(" (");
147         msg.append(toChar(actual)).append(").");
148         System.out.println(msg);
149     }
150
151
152     private static String JavaDoc toChar(int ch) {
153         if (ch < 0) {
154             return "EOF";
155         } else {
156             return Character.toString((char)ch);
157         }
158     }
159     
160     /**
161      * Corrects POI's Cp1252 output. There's a bug somewhere in POI that
162      * makes it produce incorrect characters. Not sure where and don't have
163      * time to track it down. But I have visually checked the input
164      * documents to verify that Doc is producing the right character, and
165      * that POI is not.
166      *
167      * @param ch the POI-produced character to check
168      * @return the corrected character
169      */

170     private static int correctPOI(int ch) {
171         switch (ch) {
172             case 8734:
173                 // POI produced the infinity sign when it should have
174
// produced the degrees sign.
175
return 176;
176             case 214:
177                 // POI produced an umat O instead of an ellipses mark.
178
return 8230;
179             case 237:
180                 // POI produced an acute i instead of a fancy single quote
181
return 8217;
182             case 236:
183                 // POI produced a reverse acute i instead of fancy double quote
184
return 8220;
185             case 238:
186                 // POI produced a caret i instead of fancy double quote
187
return 8221;
188             default:
189                 return ch;
190         }
191     }
192
193     
194 }
195
Popular Tags