I_CmsTextExtractor


1   /*
2    * File   : $Source: /usr/local/cvs/opencms/src/org/opencms/search/extractors/I_CmsTextExtractor.java,v $
3    * Date   : $Date: 2005/06/23 11:11:28 $
4    * Version: $Revision: 1.5 $
5    *
6    * This library is part of OpenCms -
7    * the Open Source Content Management System
8    *
9    * Copyright (c) 2005 Alkacon Software GmbH (http://www.alkacon.com)
10   *
11   * This library is free software; you can redistribute it and/or
12   * modify it under the terms of the GNU Lesser General Public
13   * License as published by the Free Software Foundation; either
14   * version 2.1 of the License, or (at your option) any later version.
15   *
16   * This library is distributed in the hope that it will be useful,
17   * but WITHOUT ANY WARRANTY; without even the implied warranty of
18   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19   * Lesser General Public License for more details.
20   *
21   * For further information about Alkacon Software GmbH, please see the
22   * company website: http://www.alkacon.com
23   *
24   * For further information about OpenCms, please see the
25   * project website: http://www.opencms.org
26   * 
27   * You should have received a copy of the GNU Lesser General Public
28   * License along with this library; if not, write to the Free Software
29   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
30   */
31  
32  package org.opencms.search.extractors;
33  
34  import java.io.InputStream  ;
35  
36  /**
37   * Allows extraction of the indexable "plain" text plus (optional) meta information from a given binary 
38   * input document format.<p>
39   * 
40   * @author Alexander Kandzior 
41   * 
42   * @version $Revision: 1.5 $ 
43   * 
44   * @since 6.0.0 
45   */
46  public interface I_CmsTextExtractor {
47  
48      /**
49       * Extracts the text and meta information from the given binary document.<p> 
50       * 
51       * The encoding of the input stream is either not required (the document type may have 
52       * one common default encoding) or the extractor is able to divine the encoding 
53       * from the provided binary array automatically.<p>
54       * 
55       * Delivers is the same result as calling <code>{@link #extractText(byte[], String)}</code>
56       * when <code>String == null</code>.<p>
57       * 
58       * @param content the binary content of the document to extract the text from
59       * @return the extracted text
60       * 
61       * @throws Exception if the text extration fails
62       */
63      I_CmsExtractionResult extractText(byte[] content) throws Exception  ;
64  
65      /**
66       * Extracts the text and meta information from the given binary document, using the specified content encoding.<p> 
67       * 
68       * The encoding is a hint for the text extractor, if the value given is <code>null</code> then 
69       * the text extractor should try to figure out the encoding itself.<p>
70       * 
71       * @param content the binary content of the document to extract the text from
72       * @param encoding the encoding to use
73       * 
74       * @return the extracted text
75       * 
76       * @throws Exception if the text extration fails
77       */
78      I_CmsExtractionResult extractText(byte[] content, String   encoding) throws Exception  ;
79  
80      /**
81       * Extracts the text and meta information from the document on the input stream.<p> 
82       * 
83       * The encoding of the input stream is either not required (the document type may have 
84       * one common default encoding) or the extractor is able to divine the encoding 
85       * from the provided input stream automatically.<p>
86       * 
87       * Delivers is the same result as calling <code>{@link #extractText(InputStream, String)}</code>
88       * when <code>String == null</code>.<p>
89       * 
90       * @param in the input stream for the document to extract the text from
91       * @return the extracted text and meta information 
92       * 
93       * @throws Exception if the text extration fails
94       */
95      I_CmsExtractionResult extractText(InputStream   in) throws Exception  ;
96  
97      /**
98       * Extracts the text and meta information from the document on the input stream, using the specified content encoding.<p> 
99       * 
100      * The encoding is a hint for the text extractor, if the value given is <code>null</code> then 
101      * the text extractor should try to figure out the encoding itself.<p>
102      * 
103      * @param in the input stream for the document to extract the text from
104      * @param encoding the encoding to use
105      * 
106      * @return the extracted text and meta information 
107      * 
108      * @throws Exception if the text extration fails
109      */
110     I_CmsExtractionResult extractText(InputStream   in, String   encoding) throws Exception  ;
111 }
A to Z: JavaDoc & Examples Daily Java News & Articles Open Source Projects Open Source Codes Free Computer Books Remove Frame
Popular Tags