1 21 package au.id.jericho.lib.html; 22 23 import java.util.*; 24 import java.io.*; 25 import java.net.*; 26 27 32 final class EncodedSource { 33 public final Reader Reader; 34 public final String Encoding; 35 public final String EncodingSpecificationInfo; 36 public final HttpURLConnection HttpURLConnection; 37 38 private static final int PREVIEW_BUFFER_SIZE=2048; 39 private static final int PREVIEW_MAX_BYTES=PREVIEW_BUFFER_SIZE*4; 41 private static final String UTF_32="UTF-32"; private static final String UTF_16="UTF-16"; 43 private static final String UTF_16BE="UTF-16BE"; 44 private static final String UTF_16LE="UTF-16LE"; 45 private static final String UTF_8="UTF-8"; 46 private static final String EBCDIC="Cp037"; 47 private static final String ISO_8859_1="ISO-8859-1"; 48 49 EncodedSource(final InputStream inputStream, final String encoding, final String encodingSpecificationInfo, final HttpURLConnection httpURLConnection) throws UnsupportedEncodingException { 50 if (encoding==null) 51 Reader=new InputStreamReader(inputStream); else 53 Reader=new InputStreamReader(inputStream,encoding); 54 Encoding=encoding; 55 EncodingSpecificationInfo=encodingSpecificationInfo; 56 HttpURLConnection=httpURLConnection; 57 } 58 59 public static EncodedSource construct(final URL url) throws IOException { 60 final URLConnection urlConnection=url.openConnection(); 61 final HttpURLConnection httpURLConnection=(urlConnection instanceof HttpURLConnection) ? (HttpURLConnection)urlConnection : null; 62 final InputStream inputStream=urlConnection.getInputStream(); 64 final String contentType=urlConnection.getContentType(); 65 if (contentType!=null) { 66 final String charset=Source.getCharsetParameterFromHttpHeaderValue(contentType); 67 if (charset!=null) return new EncodedSource(inputStream,charset,"HTTP header Content-Type: "+contentType,httpURLConnection); 68 } 69 return construct(inputStream,httpURLConnection); 70 } 71 72 public static EncodedSource construct(final InputStream inputStream, final HttpURLConnection httpURLConnection) throws IOException { 73 final BufferedInputStream in=(inputStream instanceof BufferedInputStream) ? (BufferedInputStream)inputStream : new BufferedInputStream(inputStream); 74 in.mark(PREVIEW_MAX_BYTES); 75 final String preliminaryEncoding=getPreliminaryEncoding(in); 76 if (preliminaryEncoding==null) return new EncodedSource(in,null,"empty input stream",httpURLConnection); 77 in.reset(); 78 final Source previewSource=getPreviewSource(in,preliminaryEncoding); 79 in.reset(); 80 if (previewSource.getEncoding()!=null) return new EncodedSource(in,previewSource.encoding,previewSource.encodingSpecificationInfo,httpURLConnection); 81 if (httpURLConnection!=null && preliminaryEncoding==UTF_8 && !previewSource.isXML()) 85 return new EncodedSource(in,ISO_8859_1,"HTTP default 8-bit encoding for non-XML document",httpURLConnection); 86 return new EncodedSource(in,preliminaryEncoding,"XML default matching first four bytes of input stream",httpURLConnection); 88 } 89 90 private static String getPreliminaryEncoding(BufferedInputStream bufferedInputStream) throws IOException { 91 final int b1=bufferedInputStream.read(); 92 if (b1==-1) return null; 93 final int b2=bufferedInputStream.read(); 94 final int b3=bufferedInputStream.read(); 95 final int b4=bufferedInputStream.read(); 96 if ((b1&0xFE)==0xFE && b2==(b1^1)) { return (b3==0) ? UTF_32 : UTF_16; 98 } else if (b1==0) { 99 if (b2==0 || b4==0) return UTF_32; 100 return UTF_16BE; 101 } else if (b2==0) { 102 return (b3==0) ? UTF_32 : UTF_16LE; 103 } else if (b1==0x4C && b2==0x6F && b3==0xA7 && b4==0x94) return EBCDIC; return UTF_8; 105 } 106 107 private static Source getPreviewSource(BufferedInputStream bufferedInputStream, String preliminaryEncoding) throws IOException { 108 final BufferedReader preliminaryReader=new BufferedReader(new InputStreamReader(bufferedInputStream,preliminaryEncoding),PREVIEW_BUFFER_SIZE); 109 StringBuffer sb=new StringBuffer (PREVIEW_BUFFER_SIZE); 110 for (int i=0; i<PREVIEW_BUFFER_SIZE; i++) { 111 final int ch=preliminaryReader.read(); 112 if (ch==-1) break; 113 sb.append((char)ch); 114 } 115 return new Source(sb); 116 } 117 } 118 | Popular Tags |