KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > htmlcleaner > Utils


/* Copyright (c) 2006-2007, Vladimir Nikic
    All rights reserved.

    Redistribution and use of this software in source and binary forms,
    with or without modification, are permitted provided that the following
    conditions are met:

    * Redistributions of source code must retain the above
      copyright notice, this list of conditions and the
      following disclaimer.

    * Redistributions in binary form must reproduce the above
      copyright notice, this list of conditions and the
      following disclaimer in the documentation and/or other
      materials provided with the distribution.

    * The name of HtmlCleaner may not be used to endorse or promote
      products derived from this software without specific prior
      written permission.

    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
    AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
    IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
    ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
    LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
    CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
    SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
    INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
    CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
    ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
    POSSIBILITY OF SUCH DAMAGE.

    You can contact Vladimir Nikic by sending e-mail to
    nikic_vladimir@yahoo.com. Please include the word "HtmlCleaner" in the
    subject line.
*/

package org.htmlcleaner;

import java.io.*;
import java.net.URL;
42
43 /**
44  * <p>Common utilities.</p>
45  *
46  * Created by: Vladimir Nikic<br/>
47  * Date: November, 2006.
48  */

49 public class Utils {
50
51     /**
52      * Trims specified string from left.
53      * @param s
54      */

55     public static String JavaDoc ltrim(String JavaDoc s) {
56         if (s == null) {
57             return null;
58         }
59
60         int index = 0;
61         int len = s.length();
62
63         while ( index < len && Character.isWhitespace(s.charAt(index)) ) {
64             index++;
65         }
66
67         return (index >= len) ? "" : s.substring(index);
68     }
69
70     /**
71      * Trims specified string from right.
72      * @param s
73      */

74     public static String JavaDoc rtrim(String JavaDoc s) {
75         if (s == null) {
76             return null;
77         }
78
79         int len = s.length();
80         int index = len;
81
82         while ( index > 0 && Character.isWhitespace(s.charAt(index-1)) ) {
83             index--;
84         }
85
86         return (index <= 0) ? "" : s.substring(0, index);
87     }
88     
89     /**
90      * Reads content from the specified URL with specified charset into string
91      * @param url
92      * @param charset
93      * @throws IOException
94      */

95     public static StringBuffer JavaDoc readUrl(URL JavaDoc url, String JavaDoc charset) throws IOException {
96         StringBuffer JavaDoc buffer = new StringBuffer JavaDoc(1024);
97
98         Object JavaDoc content = url.getContent();
99         if (content instanceof InputStream) {
100             InputStreamReader reader = new InputStreamReader((InputStream)content, charset);
101             char[] charArray = new char[1024];
102
103             int charsRead = 0;
104             do {
105                 charsRead = reader.read(charArray);
106                 if (charsRead >= 0) {
107                     buffer.append(charArray, 0, charsRead);
108                 }
109             } while (charsRead > 0);
110         }
111
112         return buffer;
113     }
114
115     public static boolean isHexadecimalDigit(char ch) {
116         return Character.isDigit(ch) ||
117                ch == 'A' || ch == 'a' || ch == 'B' || ch == 'b' || ch == 'C' || ch == 'c' ||
118                ch == 'D' || ch == 'd' || ch == 'E' || ch == 'e' || ch == 'F' || ch == 'f';
119     }
120     
121     /**
122      * Escapes XML string.
123      */

124     public static String JavaDoc escapeXml(String JavaDoc s, boolean advanced, boolean recognizeUnicodeChars, boolean translateSpecialEntities) {
125         if (s != null) {
126             int len = s.length();
127             StringBuffer JavaDoc result = new StringBuffer JavaDoc(len);
128             
129             for (int i = 0; i < len; i++) {
130                 char ch = s.charAt(i);
131                 
132                 if (ch == '&') {
133                     if ( recognizeUnicodeChars && (i < len-1) && (s.charAt(i+1) == '#') ) {
134                         int charIndex = i + 2;
135                         String JavaDoc unicode = "";
136                         while ( charIndex < len &&
137                                 (isHexadecimalDigit(s.charAt(charIndex)) || s.charAt(charIndex) == 'x' || s.charAt(charIndex) == 'X')
138                               ) {
139                             unicode += s.charAt(charIndex);
140                             charIndex++;
141                         }
142                         if (charIndex == len || !"".equals(unicode)) {
143                             try {
144                                 char unicodeChar = unicode.toLowerCase().startsWith("x") ?
145                                                         (char)Integer.parseInt(unicode.substring(1), 16) :
146                                                         (char)Integer.parseInt(unicode);
147                                 if ( "&<>\'\"".indexOf(unicodeChar) < 0 ) {
148                                     int replaceChunkSize = (charIndex < len && s.charAt(charIndex) == ';') ? unicode.length()+1 : unicode.length();
149                                     result.append( String.valueOf(unicodeChar) );
150                                     i += replaceChunkSize + 1;
151                                 } else {
152                                     i = charIndex;
153                                     result.append("&amp;#" + unicode + ";");
154                                 }
155                             } catch (NumberFormatException JavaDoc e) {
156                                 i = charIndex;
157                                 result.append("&amp;#" + unicode + ";");
158                             }
159                         } else {
160                             result.append("&amp;");
161                         }
162                     } else {
163                         if (translateSpecialEntities) {
164                             // get following sequence of most 10 characters
165
String JavaDoc seq = s.substring(i, i+Math.min(10, len-i));
166                             int semiIndex = seq.indexOf(';');
167                             if (semiIndex > 0) {
168                                 String JavaDoc entity = seq.substring(1, semiIndex);
169                                 Integer JavaDoc code = (Integer JavaDoc) SpecialEntities.entities.get(entity);
170                                 if (code != null) {
171                                     int entityLen = entity.length();
172                                     result.append( (char)code.intValue() );
173                                     i += entityLen + 1;
174                                     continue;
175                                 }
176                             }
177                         }
178                         
179                         if (advanced) {
180                             String JavaDoc sub = s.substring(i);
181                             if ( sub.startsWith("&amp;") ) {
182                                 result.append("&amp;");
183                                 i += 4;
184                             } else if ( sub.startsWith("&apos;") ) {
185                                 result.append("&apos;");
186                                 i += 5;
187                             } else if ( sub.startsWith("&gt;") ) {
188                                 result.append("&gt;");
189                                 i += 3;
190                             } else if ( sub.startsWith("&lt;") ) {
191                                 result.append("&lt;");
192                                 i += 3;
193                             } else if ( sub.startsWith("&quot;") ) {
194                                 result.append("&quot;");
195                                 i += 5;
196                             } else {
197                                 result.append("&amp;");
198                             }
199                             
200                             continue;
201                         }
202                         
203                         result.append("&amp;");
204                     }
205                 } else if (ch == '\'') {
206                     result.append("&apos;");
207                 } else if (ch == '>') {
208                     result.append("&gt;");
209                 } else if (ch == '<') {
210                     result.append("&lt;");
211                 } else if (ch == '\"') {
212                     result.append("&quot;");
213                 } else {
214                     result.append(ch);
215                 }
216             }
217             
218             return result.toString();
219         }
220         
221         return null;
222     }
223
224 }
Popular Tags