KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > au > id > jericho > lib > html > EncodedSource


// Jericho HTML Parser - Java based library for analysing and manipulating HTML
// Version 2.2
// Copyright (C) 2006 Martin Jericho
// http://sourceforge.net/projects/jerichohtml/
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
// http://www.gnu.org/copyleft/lesser.html
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
// Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this library; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA

21 package au.id.jericho.lib.html;
22
23 import java.util.*;
24 import java.io.*;
25 import java.net.*;
26
27 /**
28  * Based on information in:
29  * http://www.w3.org/TR/REC-xml/#sec-guessing-no-ext-info
30  * http://www.w3.org/TR/html401/charset.html#h-5.2
31  */

32 final class EncodedSource {
33     public final Reader Reader;
34     public final String JavaDoc Encoding;
35     public final String JavaDoc EncodingSpecificationInfo;
36     public final HttpURLConnection HttpURLConnection;
37     
38     private static final int PREVIEW_BUFFER_SIZE=2048;
39     private static final int PREVIEW_MAX_BYTES=PREVIEW_BUFFER_SIZE*4; // Cater for each character in the preview buffer requiring an average of 4 bytes, which is twice what would reasonably be expected but ensures the reset() call on the BufferedInputStream doesn't fail.
40

41     private static final String JavaDoc UTF_32="UTF-32"; // not supported in Java, will throw an exception.
42
private static final String JavaDoc UTF_16="UTF-16";
43     private static final String JavaDoc UTF_16BE="UTF-16BE";
44     private static final String JavaDoc UTF_16LE="UTF-16LE";
45     private static final String JavaDoc UTF_8="UTF-8";
46     private static final String JavaDoc EBCDIC="Cp037";
47     private static final String JavaDoc ISO_8859_1="ISO-8859-1";
48     
49     EncodedSource(final InputStream inputStream, final String JavaDoc encoding, final String JavaDoc encodingSpecificationInfo, final HttpURLConnection httpURLConnection) throws UnsupportedEncodingException {
50         if (encoding==null)
51             Reader=new InputStreamReader(inputStream); // Reader will be empty so the encoding is arbitrary.
52
else
53             Reader=new InputStreamReader(inputStream,encoding);
54         Encoding=encoding;
55         EncodingSpecificationInfo=encodingSpecificationInfo;
56         HttpURLConnection=httpURLConnection;
57     }
58
59     public static EncodedSource construct(final URL url) throws IOException {
60         final URLConnection urlConnection=url.openConnection();
61         final HttpURLConnection httpURLConnection=(urlConnection instanceof HttpURLConnection) ? (HttpURLConnection)urlConnection : null;
62         // urlConnection.setRequestProperty("Accept-Charset","UTF-8, ISO-8859-1;q=0"); // used for debugging
63
final InputStream inputStream=urlConnection.getInputStream();
64         final String JavaDoc contentType=urlConnection.getContentType();
65         if (contentType!=null) {
66             final String JavaDoc charset=Source.getCharsetParameterFromHttpHeaderValue(contentType);
67             if (charset!=null) return new EncodedSource(inputStream,charset,"HTTP header Content-Type: "+contentType,httpURLConnection);
68         }
69         return construct(inputStream,httpURLConnection);
70     }
71     
72     public static EncodedSource construct(final InputStream inputStream, final HttpURLConnection httpURLConnection) throws IOException {
73         final BufferedInputStream in=(inputStream instanceof BufferedInputStream) ? (BufferedInputStream)inputStream : new BufferedInputStream(inputStream);
74         in.mark(PREVIEW_MAX_BYTES);
75         final String JavaDoc preliminaryEncoding=getPreliminaryEncoding(in);
76         if (preliminaryEncoding==null) return new EncodedSource(in,null,"empty input stream",httpURLConnection);
77         in.reset();
78         final Source previewSource=getPreviewSource(in,preliminaryEncoding);
79         in.reset();
80         if (previewSource.getEncoding()!=null) return new EncodedSource(in,previewSource.encoding,previewSource.encodingSpecificationInfo,httpURLConnection);
81         // No explicit encoding specified in document
82
// If the document is not XML and is being loaded using HTTP, use the default specified by HTTP which is ISO-8859-1.
83
// For the encoding to be ISO-8859-1, the preliminary encoding must be UTF-8.
84
if (httpURLConnection!=null && preliminaryEncoding==UTF_8 && !previewSource.isXML())
85             return new EncodedSource(in,ISO_8859_1,"HTTP default 8-bit encoding for non-XML document",httpURLConnection);
86         // Just use the preliminary encoding (UTF-8 or UTF-16), which must be the case for an XML document without an XML declaration.
87
return new EncodedSource(in,preliminaryEncoding,"XML default matching first four bytes of input stream",httpURLConnection);
88     }
89
90     private static String JavaDoc getPreliminaryEncoding(BufferedInputStream bufferedInputStream) throws IOException {
91         final int b1=bufferedInputStream.read();
92         if (b1==-1) return null;
93         final int b2=bufferedInputStream.read();
94         final int b3=bufferedInputStream.read();
95         final int b4=bufferedInputStream.read();
96         if ((b1&0xFE)==0xFE && b2==(b1^1)) { // first two bytes are FEFF or FFFE
97
return (b3==0) ? UTF_32 : UTF_16;
98         } else if (b1==0) {
99             if (b2==0 || b4==0) return UTF_32;
100             return UTF_16BE;
101         } else if (b2==0) {
102             return (b3==0) ? UTF_32 : UTF_16LE;
103         } else if (b1==0x4C && b2==0x6F && b3==0xA7 && b4==0x94) return EBCDIC; // This only recognises "<?xm", not sure how straight HMTL documents in EBCDIC can be detected easily.
104
return UTF_8;
105     }
106
107     private static Source getPreviewSource(BufferedInputStream bufferedInputStream, String JavaDoc preliminaryEncoding) throws IOException {
108         final BufferedReader preliminaryReader=new BufferedReader(new InputStreamReader(bufferedInputStream,preliminaryEncoding),PREVIEW_BUFFER_SIZE);
109         StringBuffer JavaDoc sb=new StringBuffer JavaDoc(PREVIEW_BUFFER_SIZE);
110         for (int i=0; i<PREVIEW_BUFFER_SIZE; i++) {
111             final int ch=preliminaryReader.read();
112             if (ch==-1) break;
113             sb.append((char)ch);
114         }
115         return new Source(sb);
116     }
117 }
118
Popular Tags