KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > archive > util > TextUtils


1 /* Copyright (C) 2003 Internet Archive.
2  *
3  * This file is part of the Heritrix web crawler (crawler.archive.org).
4  *
5  * Heritrix is free software; you can redistribute it and/or modify
6  * it under the terms of the GNU Lesser Public License as published by
7  * the Free Software Foundation; either version 2.1 of the License, or
8  * any later version.
9  *
10  * Heritrix is distributed in the hope that it will be useful,
11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  * GNU Lesser Public License for more details.
14  *
15  * You should have received a copy of the GNU Lesser Public License
16  * along with Heritrix; if not, write to the Free Software
17  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
18  */

19 package org.archive.util;
20
21 import java.io.BufferedReader JavaDoc;
22 import java.io.IOException JavaDoc;
23 import java.io.PrintWriter JavaDoc;
24 import java.io.StringReader JavaDoc;
25 import java.io.StringWriter JavaDoc;
26 import java.util.HashMap JavaDoc;
27 import java.util.Map JavaDoc;
28 import java.util.regex.Matcher JavaDoc;
29 import java.util.regex.Pattern JavaDoc;
30
31 import javax.servlet.jsp.JspWriter JavaDoc;
32
33 import org.apache.commons.lang.StringEscapeUtils;
34
35 public class TextUtils {
36     private static final String JavaDoc FIRSTWORD = "^([^\\s]*).*$";
37     
38     /**
39      * Allowable range between & and ;
40      */

41     private static final int MAX_ENTITY_WIDTH = 9;
42     
43     private static final ThreadLocal JavaDoc<Map JavaDoc<String JavaDoc,Matcher JavaDoc>> TL_MATCHER_MAP
44      = new ThreadLocal JavaDoc<Map JavaDoc<String JavaDoc,Matcher JavaDoc>>() {
45         protected Map JavaDoc<String JavaDoc,Matcher JavaDoc> initialValue() {
46             return new HashMap JavaDoc<String JavaDoc,Matcher JavaDoc>(50);
47         }
48     };
49
50     /**
51      * Get a matcher object for a precompiled regex pattern.
52      *
53      * This method tries to reuse Matcher objects for efficiency.
54      * It can hold for recycling one Matcher per pattern per thread.
55      *
56      * Matchers retrieved should be returned for reuse via the
57      * recycleMatcher() method, but no errors will occur if they
58      * are not.
59      *
60      * This method is a hotspot frequently accessed.
61      *
62      * @param pattern the string pattern to use
63      * @param input the character sequence the matcher should be using
64      * @return a matcher object loaded with the submitted character sequence
65      */

66     public static Matcher JavaDoc getMatcher(String JavaDoc pattern, CharSequence JavaDoc input) {
67         if (pattern == null) {
68             throw new IllegalArgumentException JavaDoc("String 'pattern' must not be null");
69         }
70         final Map JavaDoc<String JavaDoc,Matcher JavaDoc> matchers = TL_MATCHER_MAP.get();
71         Matcher JavaDoc m = (Matcher JavaDoc)matchers.get(pattern);
72         if(m == null) {
73             m = Pattern.compile(pattern).matcher(input);
74         } else {
75             matchers.put(pattern,null);
76             m.reset(input);
77         }
78         return m;
79     }
80
81     public static void recycleMatcher(Matcher JavaDoc m) {
82         final Map JavaDoc<String JavaDoc,Matcher JavaDoc> matchers = TL_MATCHER_MAP.get();
83         matchers.put(m.pattern().pattern(),m);
84     }
85     
86     /**
87      * Utility method using a precompiled pattern instead of using the
88      * replaceAll method of the String class. This method will also be reusing
89      * Matcher objects.
90      *
91      * @see java.util.regex.Pattern
92      * @param pattern precompiled Pattern to match against
93      * @param input the character sequence to check
94      * @param replacement the String to substitute every match with
95      * @return the String with all the matches substituted
96      */

97     public static String JavaDoc replaceAll(
98             String JavaDoc pattern, CharSequence JavaDoc input, String JavaDoc replacement) {
99         Matcher JavaDoc m = getMatcher(pattern, input);
100         String JavaDoc res = m.replaceAll(replacement);
101         recycleMatcher(m);
102         return res;
103     }
104
105     /**
106      * Utility method using a precompiled pattern instead of using the
107      * replaceFirst method of the String class. This method will also be reusing
108      * Matcher objects.
109      *
110      * @see java.util.regex.Pattern
111      * @param pattern precompiled Pattern to match against
112      * @param input the character sequence to check
113      * @param replacement the String to substitute the first match with
114      * @return the String with the first match substituted
115      */

116     public static String JavaDoc replaceFirst(
117             String JavaDoc pattern, CharSequence JavaDoc input, String JavaDoc replacement) {
118         Matcher JavaDoc m = getMatcher(pattern, input);
119         String JavaDoc res = m.replaceFirst(replacement);
120         recycleMatcher(m);
121         return res;
122     }
123
124     /**
125      * Utility method using a precompiled pattern instead of using the matches
126      * method of the String class. This method will also be reusing Matcher
127      * objects.
128      *
129      * @see java.util.regex.Pattern
130      * @param pattern precompiled Pattern to match against
131      * @param input the character sequence to check
132      * @return true if character sequence matches
133      */

134     public static boolean matches(String JavaDoc pattern, CharSequence JavaDoc input) {
135         Matcher JavaDoc m = getMatcher(pattern, input);
136         boolean res = m.matches();
137         recycleMatcher(m);
138         return res;
139     }
140
141     /**
142      * Utility method using a precompiled pattern instead of using the split
143      * method of the String class.
144      *
145      * @see java.util.regex.Pattern
146      * @param pattern precompiled Pattern to split by
147      * @param input the character sequence to split
148      * @return array of Strings split by pattern
149      */

150     public static String JavaDoc[] split(String JavaDoc pattern, CharSequence JavaDoc input) {
151         Matcher JavaDoc m = getMatcher(pattern,input);
152         String JavaDoc[] retVal = m.pattern().split(input);
153         recycleMatcher(m);
154         return retVal;
155     }
156     
157     /**
158      * @param s String to find first word in (Words are delimited by
159      * whitespace).
160      * @return First word in the passed string else null if no word found.
161      */

162     public static String JavaDoc getFirstWord(String JavaDoc s) {
163         Matcher JavaDoc m = getMatcher(FIRSTWORD, s);
164         String JavaDoc retVal = (m != null && m.matches())? m.group(1): null;
165         recycleMatcher(m);
166         return retVal;
167     }
168
169     /**
170      * Escapes a string so that it can be passed as an argument to a javscript
171      * in a JSP page. This method takes a string and returns the same string
172      * with any single quote escaped by prepending the character with a
173      * backslash. Linebreaks are also replaced with '\n'. Also,
174      * less-than signs and ampersands are replaced with HTML entities.
175      *
176      * @param s The string to escape
177      * @return The same string escaped.
178      */

179     public static String JavaDoc escapeForHTMLJavascript(String JavaDoc s) {
180         return escapeForHTML(StringEscapeUtils.escapeJavaScript(s));
181     }
182     
183     /**
184      * Escapes a string so that it can be placed inside XML/HTML attribute.
185      * Replaces ampersand, less-than, greater-than, single-quote, and
186      * double-quote with escaped versions.
187      * @param s The string to escape
188      * @return The same string escaped.
189      */

190     public static String JavaDoc escapeForMarkupAttribute(String JavaDoc s) {
191         return StringEscapeUtils.escapeXml(s);
192     }
193     
194     /**
195      * Minimally escapes a string so that it can be placed inside XML/HTML
196      * attribute.
197      * Escapes lt and amp.
198      * @param s The string to escape
199      * @return The same string escaped.
200      */

201     public static String JavaDoc escapeForHTML(String JavaDoc s) {
202         // TODO: do this in a single pass instead of creating 5 junk strings
203
String JavaDoc escaped = s.replaceAll("&","&amp;");
204         return escaped.replaceAll("<","&lt;");
205     }
206
207     /**
208      * Utility method for writing a (potentially large) String to a JspWriter,
209      * escaping it for HTML display, without constructing another large String
210      * of the whole content.
211      * @param s String to write
212      * @param out destination JspWriter
213      * @throws IOException
214      */

215     public static void writeEscapedForHTML(String JavaDoc s, JspWriter JavaDoc out)
216     throws IOException JavaDoc {
217         BufferedReader JavaDoc reader = new BufferedReader JavaDoc(new StringReader JavaDoc(s));
218         String JavaDoc line;
219         while((line=reader.readLine()) != null){
220             out.println(StringEscapeUtils.escapeHtml(line));
221         }
222     }
223     
224     /**
225      * Replaces HTML Entity Encodings.
226      * @param cs The CharSequence to remove html codes from
227      * @return the same CharSequence or an escaped String.
228      */

229     public static CharSequence JavaDoc unescapeHtml(final CharSequence JavaDoc cs) {
230         if (cs == null) {
231             return cs;
232         }
233         
234         // If both of these do not equal zero, then cs has entity code
235
int startEntityCode = -1;
236         int endEntityCode = -1;
237     
238         // Check for encodings, make sure start and end are within certain range
239
for (int i = 0; i < cs.length(); i++) {
240             if (cs.charAt(i) == '&') {
241                 startEntityCode = i;
242             } else if (cs.charAt(i) == ';' && startEntityCode >= 0 &&
243                     i > startEntityCode &&
244                     ((i - startEntityCode) < MAX_ENTITY_WIDTH)) {
245                 endEntityCode = i;
246             }
247         }
248
249         return (startEntityCode != 0 && endEntityCode != 0)?
250             StringEscapeUtils.unescapeHtml(cs.toString()): cs;
251     }
252     
253     /**
254      * @param message Message to put at top of the string returned. May be
255      * null.
256      * @param e Exception to write into a string.
257      * @return Return formatted string made of passed message and stack trace
258      * of passed exception.
259      */

260     public static String JavaDoc exceptionToString(String JavaDoc message, Throwable JavaDoc e) {
261         StringWriter JavaDoc sw = new StringWriter JavaDoc();
262         if (message == null || message.length() == 0) {
263             sw.write(message);
264             sw.write("\n");
265         }
266         e.printStackTrace(new PrintWriter JavaDoc(sw));
267         return sw.toString();
268     }
269 }
Popular Tags