KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > alfresco > repo > content > transform > TextMiningContentTransformer


1 /*
2  * Copyright (C) 2005 Alfresco, Inc.
3  *
4  * Licensed under the Mozilla Public License version 1.1
5  * with a permitted attribution clause. You may obtain a
6  * copy of the License at
7  *
8  * http://www.alfresco.org/legal/license.txt
9  *
10  * Unless required by applicable law or agreed to in writing,
11  * software distributed under the License is distributed on an
12  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
13  * either express or implied. See the License for the specific
14  * language governing permissions and limitations under the
15  * License.
16  */

17 package org.alfresco.repo.content.transform;
18
19 import java.io.IOException JavaDoc;
20 import java.io.InputStream JavaDoc;
21 import java.util.Map JavaDoc;
22
23 import org.alfresco.repo.content.MimetypeMap;
24 import org.alfresco.service.cmr.repository.ContentReader;
25 import org.alfresco.service.cmr.repository.ContentWriter;
26 import org.textmining.text.extraction.WordExtractor;
27
28 /**
29  * Makes use of the <a href="http://www.textmining.org/">TextMining</a> library to
30  * perform conversions from MSWord documents to text.
31  *
32  * @author Derek Hulley
33  */

34 public class TextMiningContentTransformer extends AbstractContentTransformer
35 {
36     private WordExtractor wordExtractor;
37     
38     public TextMiningContentTransformer()
39     {
40         this.wordExtractor = new WordExtractor();
41     }
42     
43     /**
44      * Currently the only transformation performed is that of text extraction from Word documents.
45      */

46     public double getReliability(String JavaDoc sourceMimetype, String JavaDoc targetMimetype)
47     {
48         if (!MimetypeMap.MIMETYPE_WORD.equals(sourceMimetype) ||
49                 !MimetypeMap.MIMETYPE_TEXT_PLAIN.equals(targetMimetype))
50         {
51             // only support DOC -> Text
52
return 0.0;
53         }
54         else
55         {
56             return 1.0;
57         }
58     }
59
60     public void transformInternal(ContentReader reader, ContentWriter writer, Map JavaDoc<String JavaDoc, Object JavaDoc> options)
61             throws Exception JavaDoc
62     {
63         InputStream JavaDoc is = null;
64         String JavaDoc text = null;
65         try
66         {
67             is = reader.getContentInputStream();
68             text = wordExtractor.extractText(is);
69         }
70         catch (IOException JavaDoc e)
71         {
72             // check if this is an error caused by the fact that the .doc is in fact
73
// one of Word's temp non-documents
74
if (e.getMessage().contains("Unable to read entire header"))
75             {
76                 // just assign an empty string
77
text = "";
78             }
79         }
80         finally
81         {
82             if (is != null)
83             {
84                 is.close();
85             }
86         }
87         // dump the text out. This will close the writer automatically.
88
writer.putContent(text);
89     }
90 }
91
Popular Tags