KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > jahia > utils > fileparsers > CharsetDetection


1 package org.jahia.utils.fileparsers;
2
3 import java.io.*;
4 import java.net.*;
5
6 import org.mozilla.intl.chardet.*;
7
8 /**
9  * <p>Title: Char set detection , based on jcharset lib</p>
10  * <p>Description: </p>
11  * <p>Copyright: Copyright (c) 2002</p>
12  * <p>Company: </p>
13  *
14  * @author Khue Nguyen
15  * @version 1.0
16  */

17 public class CharsetDetection implements nsICharsetDetectionObserver {
18
19     private static org.apache.log4j.Logger logger =
20             org.apache.log4j.Logger.getLogger (CharsetDetection.class);
21
22     private String JavaDoc charSet = null;
23
24     public CharsetDetection(){
25
26     }
27
28     /**
29      * Returns the charset. You must call charsetDetection(...) first
30      *
31      * @return String
32      */

33     public String JavaDoc getCharset(){
34         return this.charSet;
35     }
36
37     /**
38      * Returns true if only ascii
39      *
40      * @param ins InputStream
41      * @throws IOException
42      * @return int 1 = only ascii, 0 = not ascii, -1 = unknown
43      */

44     public int charsetDetection(InputStream ins) throws IOException {
45         return charsetDetection(nsPSMDetector.ALL , ins);
46     }
47
48     /**
49      * Returns true if only ascii
50      *
51      * @param url URL
52      * @throws IOException
53      * @return int 1 = only ascii, 0 = not ascii, -1 = unknown
54      */

55     public int charsetDetection(URL url) throws IOException {
56
57         if ( url == null ){
58             return -1;
59         }
60         return charsetDetection(nsPSMDetector.ALL , url.openStream());
61     }
62
63     /**
64      * Returns true if only ascii
65      *
66      * @param lang int
67      * @param url URL
68      * @throws IOException
69      * @return int 1 = only ascii, 0 = not ascii, -1 = unknown
70      */

71     public int charsetDetection(int lang, URL url) throws IOException {
72
73         if ( url == null ){
74             return -1;
75         }
76         return charsetDetection(lang, url.openStream());
77     }
78
79     /**
80      * Returns true if only ascii
81      *
82      * @param lang int
83      * @param ins InputStream
84      * @throws IOException
85      * @return int 1 = only ascii, 0 = not ascii, -1 = unknown
86      */

87     public int charsetDetection(int lang, InputStream ins) throws IOException {
88
89         if ( ins == null ){
90             return -1;
91         }
92         nsDetector det = new nsDetector(lang);
93         det.Init(this);
94         BufferedInputStream imp = new BufferedInputStream(ins);
95
96         byte[] buf = new byte[1024];
97         int len;
98         boolean done = false;
99         boolean isAscii = true;
100
101         while ( (len = imp.read(buf, 0, buf.length)) != -1) {
102
103             // Check if the stream is only ascii.
104
if (isAscii)
105                 isAscii = det.isAscii(buf, len);
106
107             // DoIt if non-ascii and not done yet.
108
if (!isAscii && !done)
109                 done = det.DoIt(buf, len, true);
110         }
111         det.DataEnd();
112
113         String JavaDoc[]charSets = det.getProbableCharsets();
114         /*
115         for ( int i=0; i<charSets.length ; i++ ){
116             logger.debug(
117                 "Charset detection notification , PROBABLE CHARSET FOUND = "
118                 + charSets[i]);
119         }*/

120         if ( charSets.length>0 ){
121             this.charSet = charSets[0]; // get the first
122
}
123
124         return (isAscii?1:0);
125     }
126
127     /**
128      * nsICharsetDetectionObserver charset detection implementation
129      *
130      * @param charset String
131      */

132     public void Notify(String JavaDoc charset)
133     {
134         // We can't rely on this, we should prefer probableCharsets use.
135

136         /*
137         this.charSet = charset;
138         logger.debug("Charset detection notification , CHARSET FOUND = "
139                      + charset);
140         */

141     }
142
143 }
144
Popular Tags