package net.sf.saxon.functions;

import net.sf.saxon.Err;
import net.sf.saxon.expr.Expression;
import net.sf.saxon.expr.StaticContext;
import net.sf.saxon.expr.XPathContext;
import net.sf.saxon.om.Item;
import net.sf.saxon.om.FastStringBuffer;
import net.sf.saxon.om.XMLChar;
import net.sf.saxon.trans.DynamicError;
import net.sf.saxon.trans.XPathException;
import net.sf.saxon.value.StringValue;
import net.sf.saxon.value.BooleanValue;

import java.io.*;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;


/**
 * Implements two related functions that read an external resource as plain text.
 * Which one is executed is selected by comparing the inherited <code>operation</code>
 * field with {@link #UNPARSED_TEXT} or {@link #UNPARSED_TEXT_AVAILABLE}:
 * the former returns the text of the resource as a string, the latter returns a
 * boolean saying whether the text could be read without error.
 */
public class UnparsedText extends SystemFunction implements XSLTFunction {

    /** Base URI captured from the static context; relative hrefs are resolved against it. */
    String expressionBaseURI = null;

    public static final int UNPARSED_TEXT = 0;
    public static final int UNPARSED_TEXT_AVAILABLE = 1;

    /** Number of leading bytes examined when the encoding has to be guessed. */
    private static final int SNIFF_LENGTH = 100;

    /** Number of characters requested from the reader on each read. */
    private static final int CHUNK_SIZE = 2048;

    /**
     * Performs the superclass argument checks and records the base URI of the
     * static context. The work is done only once: later calls are no-ops because
     * <code>expressionBaseURI</code> is already set.
     *
     * @param env the static context of the function call
     * @throws XPathException if the superclass check fails
     */
    public void checkArguments(StaticContext env) throws XPathException {
        if (expressionBaseURI == null) {
            super.checkArguments(env);
            expressionBaseURI = env.getBaseURI();
        }
    }

    /**
     * Returns this expression unchanged.
     * NOTE(review): presumably this overrides a superclass default that would
     * evaluate the call at compile time when the arguments are constant - confirm.
     *
     * @param env the static context (unused)
     * @return this expression
     */
    public Expression preEvaluate(StaticContext env) {
        return this;
    }

    /**
     * Evaluates the function call.
     *
     * @param context the dynamic evaluation context
     * @return <code>null</code> if the first argument evaluates to <code>null</code>;
     *         otherwise, for {@link #UNPARSED_TEXT}, the content of the resource as a
     *         {@link StringValue}; for {@link #UNPARSED_TEXT_AVAILABLE},
     *         {@link BooleanValue#TRUE} if the resource was read successfully and
     *         {@link BooleanValue#FALSE} if any {@link XPathException} occurred
     * @throws XPathException for {@link #UNPARSED_TEXT} only, if evaluating the
     *         arguments or reading the resource fails
     */
    public Item evaluateItem(XPathContext context) throws XPathException {
        StringValue result;
        try {
            StringValue hrefVal = (StringValue)argument[0].evaluateItem(context);
            if (hrefVal == null) {
                return null;
            }
            String href = hrefVal.getStringValue();

            String encoding = null;
            if (getNumberOfArguments() == 2) {
                // A null item is treated like an omitted argument rather than
                // being dereferenced (which used to cause a NullPointerException).
                Item encodingItem = argument[1].evaluateItem(context);
                if (encodingItem != null) {
                    encoding = encodingItem.getStringValue();
                }
            }

            result = new StringValue(readFile(href, expressionBaseURI, encoding));
        } catch (XPathException err) {
            if (operation == UNPARSED_TEXT_AVAILABLE) {
                return BooleanValue.FALSE;
            } else {
                throw err;
            }
        }
        if (operation == UNPARSED_TEXT_AVAILABLE) {
            return BooleanValue.TRUE;
        } else {
            return result;
        }
    }

    /**
     * Reads the resource identified by a URI reference and returns its content.
     *
     * <p>If no encoding is supplied, one is chosen as follows: for protocols other
     * than <code>file</code> the <code>charset</code> parameter of the Content-Type
     * header is used if present; otherwise the first {@value #SNIFF_LENGTH} bytes are
     * inspected by {@link #inferEncoding}. If that inspection fails with an
     * I/O error, UTF-8 is assumed.</p>
     *
     * @param href     the URI reference of the resource
     * @param baseURI  the base URI for resolving <code>href</code>, or <code>null</code>
     *                 if <code>href</code> must be absolute
     * @param encoding the encoding requested by the caller, or <code>null</code>
     * @return the content of the resource, without a leading byte order mark
     * @throws XPathException with code XTDE1170 if the URI cannot be resolved or the
     *         resource cannot be read, XTDE1180 if the content contains a character
     *         rejected by <code>XMLChar.isValid</code>, XTDE1190 if the encoding is
     *         not supported
     */
    private CharSequence readFile(String href, String baseURI, String encoding)
            throws XPathException {

        URL absoluteURL = resolveURL(href, baseURI);
        InputStream is = null;
        try {
            if (encoding != null) {
                is = absoluteURL.openStream();
            } else {
                URLConnection connection = absoluteURL.openConnection();
                connection.connect();
                is = connection.getInputStream();

                try {
                    // mark/reset is needed to look at the first bytes without consuming them
                    if (!is.markSupported()) {
                        is = new BufferedInputStream(is);
                    }

                    // The content type is consulted only for non-file protocols.
                    // NOTE(review): presumably because the content type reported for
                    // local files is a guess - confirm.
                    if (!"file".equals(connection.getURL().getProtocol())) {
                        encoding = charsetFromContentType(connection.getContentType());
                    }

                    if (encoding == null) {
                        is.mark(SNIFF_LENGTH);
                        byte[] start = new byte[SNIFF_LENGTH];
                        int read = is.read(start, 0, SNIFF_LENGTH);
                        is.reset();
                        encoding = inferEncoding(start, read);
                    }
                } catch (IOException e) {
                    // Deliberate best effort: if sniffing fails, fall back to UTF-8
                    // and let the main read below report any persistent I/O problem.
                    encoding = "UTF-8";
                }
            }

            Reader reader = new BufferedReader(new InputStreamReader(is, encoding));
            CharSequence content = readContent(reader);
            reader.close();
            return content;
        } catch (java.io.UnsupportedEncodingException encErr) {
            DynamicError e = new DynamicError("Unknown encoding " + Err.wrap(encoding), encErr);
            e.setErrorCode("XTDE1190");
            throw e;
        } catch (java.io.IOException ioErr) {
            DynamicError e = new DynamicError("Failed to read input file", ioErr);
            e.setErrorCode("XTDE1170");
            e.setLocator(this);
            throw e;
        } finally {
            // Release the stream on every exit path, including the error paths.
            if (is != null) {
                try {
                    is.close();
                } catch (IOException ignored) {
                    // nothing useful can be done; do not mask the primary outcome
                }
            }
        }
    }

    /**
     * Turns the supplied URI reference into an absolute URL.
     *
     * @param href    the URI reference
     * @param baseURI the base URI, or <code>null</code> if <code>href</code> must be absolute
     * @return the absolute URL
     * @throws XPathException with code XTDE1170 if a URL cannot be constructed
     */
    private URL resolveURL(String href, String baseURI) throws XPathException {
        try {
            if (baseURI == null) {
                return new URL(href);
            }
            return new URL(new URL(baseURI), href);
        } catch (MalformedURLException err) {
            String message = (baseURI == null)
                    ? "Cannot resolve absolute URI"
                    : "Cannot resolve relative URI";
            DynamicError e = new DynamicError(message, err);
            e.setErrorCode("XTDE1170");
            throw e;
        }
    }

    /**
     * Extracts the value of the <code>charset</code> parameter from a Content-Type
     * header value. Anything from a following <code>;</code> or <code>(</code>
     * onwards is dropped, and surrounding double quotes are removed.
     *
     * @param contentType the header value, may be <code>null</code>
     * @return the charset name, or <code>null</code> if the header is absent, has no
     *         <code>charset=</code> parameter, or the parameter value is empty
     */
    private String charsetFromContentType(String contentType) {
        if (contentType == null) {
            return null;
        }
        int pos = contentType.indexOf("charset");
        if (pos < 0) {
            return null;
        }
        pos = contentType.indexOf('=', pos + 7);
        if (pos < 0) {
            // "charset" without a value: nothing usable
            return null;
        }
        String value = contentType.substring(pos + 1);

        int end = value.indexOf(';');
        if (end >= 0) {
            value = value.substring(0, end);
        }
        end = value.indexOf('(');
        if (end >= 0) {
            value = value.substring(0, end);
        }
        value = value.trim();

        int open = value.indexOf('"');
        if (open >= 0) {
            // Tolerate a missing closing quote instead of failing on substring(.., -1)
            int close = value.indexOf('"', open + 1);
            value = (close > open)
                    ? value.substring(open + 1, close)
                    : value.substring(open + 1);
            value = value.trim();
        }
        return (value.length() == 0) ? null : value;
    }

    /**
     * Reads all characters from the reader, checks each one with
     * <code>XMLChar.isValid</code>, and strips a byte order mark (U+FEFF) if it is
     * the very first character. The reader is not closed by this method.
     *
     * @param reader the source of characters
     * @return the accumulated content
     * @throws IOException    if reading fails
     * @throws XPathException with code XTDE1180 if an invalid character is found
     */
    private CharSequence readContent(Reader reader) throws IOException, XPathException {
        FastStringBuffer sb = new FastStringBuffer(CHUNK_SIZE);
        // One spare slot so that the low half of a surrogate pair split by a full
        // read can be appended to the same chunk.
        char[] buffer = new char[CHUNK_SIZE + 1];
        boolean first = true;
        int line = 1;
        int column = 1;
        while (true) {
            int actual = reader.read(buffer, 0, CHUNK_SIZE);
            if (actual < 0) {
                break;
            }
            if (actual == 0) {
                continue;
            }
            // Never let a chunk end in the middle of a surrogate pair.
            if (XMLChar.isHighSurrogate(buffer[actual - 1])) {
                int extra = reader.read(buffer, actual, 1);
                if (extra > 0) {
                    actual += extra;
                }
            }
            for (int c = 0; c < actual;) {
                int ch32 = buffer[c++];
                if (ch32 == '\n') {
                    line++;
                    column = 0;
                }
                column++;
                // The bounds check covers a lone high surrogate at end of input; it is
                // then passed uncombined to the validity test below.
                // NOTE(review): relies on XMLChar.isValid rejecting surrogate code
                // units - confirm.
                if (XMLChar.isHighSurrogate(ch32) && c < actual) {
                    char low = buffer[c++];
                    ch32 = XMLChar.supplemental((char)ch32, low);
                }
                if (!XMLChar.isValid(ch32)) {
                    DynamicError err = new DynamicError(
                            "The unparsed-text file contains a character illegal in XML (line=" +
                            line + " column=" + column + " value=hex " + Integer.toHexString(ch32) + ')');
                    err.setErrorCode("XTDE1180");
                    throw err;
                }
            }
            if (first) {
                first = false;
                if (buffer[0] == '\ufeff') {
                    // drop the byte order mark
                    sb.append(buffer, 1, actual - 1);
                } else {
                    sb.append(buffer, 0, actual);
                }
            } else {
                sb.append(buffer, 0, actual);
            }
        }
        return sb.condense();
    }

    /**
     * Guesses the encoding of a resource from its first bytes. The checks are made
     * in this order:
     * <ol>
     * <li>byte order mark FE FF gives "UTF-16", FF FE gives "UTF-16LE",
     *     EF BB BF gives "UTF-8";</li>
     * <li>if the bytes start with <code>&lt;?xml</code>, the value following the
     *     word <code>encoding</code> is returned if one is found;</li>
     * <li>if every even-indexed byte among the first eight is zero the result is
     *     "UTF-16", if every odd-indexed one is zero it is "UTF-16LE";</li>
     * <li>otherwise "UTF-8".</li>
     * </ol>
     * Only bytes that were actually read are inspected.
     *
     * @param start the buffer holding the leading bytes
     * @param read  the number of valid bytes in <code>start</code>; may be zero or
     *              negative when the resource is empty
     * @return the name of the inferred encoding, never <code>null</code>
     */
    private String inferEncoding(byte[] start, int read) {
        if (read >= 2) {
            if (ch(start[0]) == 0xFE && ch(start[1]) == 0xFF) {
                return "UTF-16";
            } else if (ch(start[0]) == 0xFF && ch(start[1]) == 0xFE) {
                return "UTF-16LE";
            }
        }
        if (read >= 3) {
            if (ch(start[0]) == 0xEF && ch(start[1]) == 0xBB && ch(start[2]) == 0xBF) {
                return "UTF-8";
            }
        }
        // Five bytes are compared, so five bytes must have been read.
        if (read >= 5 &&
                ch(start[0]) == '<' && ch(start[1]) == '?' &&
                ch(start[2]) == 'x' && ch(start[3]) == 'm' && ch(start[4]) == 'l') {
            FastStringBuffer sb = new FastStringBuffer(read);
            for (int b = 0; b < read; b++) {
                sb.append((char)ch(start[b]));
            }
            String p = sb.toString();
            int v = p.indexOf("encoding");
            if (v >= 0) {
                v += 8;
                // skip whitespace, '=' and the opening quote
                while (v < p.length() && " \n\r\t=\"'".indexOf(p.charAt(v)) >= 0) {
                    v++;
                }
                sb.setLength(0);
                while (v < p.length() && p.charAt(v) != '"' && p.charAt(v) != '\'') {
                    sb.append(p.charAt(v++));
                }
                if (sb.length() > 0) {
                    return sb.toString();
                }
            }
        }
        // No byte order mark: a run of zero bytes in alternate positions suggests
        // UTF-16 text made of characters below U+0100.
        int limit = Math.min(read, 8);
        if (alternateBytesAreZero(start, 0, limit)) {
            return "UTF-16";
        }
        if (alternateBytesAreZero(start, 1, limit)) {
            return "UTF-16LE";
        }
        return "UTF-8";
    }

    /**
     * Tests whether every second byte, starting at <code>first</code> and stopping
     * before <code>limit</code>, is zero.
     *
     * @param bytes the bytes to examine
     * @param first index of the first byte to test
     * @param limit index one past the last byte that may be tested
     * @return <code>true</code> if at least one byte was tested and all tested bytes
     *         are zero
     */
    private boolean alternateBytesAreZero(byte[] bytes, int first, int limit) {
        if (first >= limit) {
            return false;
        }
        for (int i = first; i < limit; i += 2) {
            if (bytes[i] != 0) {
                return false;
            }
        }
        return true;
    }

    /**
     * Converts a byte to its unsigned value.
     *
     * @param b the byte
     * @return the value of <code>b</code> in the range 0 to 255
     */
    private int ch(byte b) {
        return ((int)b) & 0xff;
    }

}