KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > au > id > jericho > lib > html > ParseText


1 // Jericho HTML Parser - Java based library for analysing and manipulating HTML
2
// Version 2.2
3
// Copyright (C) 2006 Martin Jericho
4
// http://sourceforge.net/projects/jerichohtml/
5
//
6
// This library is free software; you can redistribute it and/or
7
// modify it under the terms of the GNU Lesser General Public
8
// License as published by the Free Software Foundation; either
9
// version 2.1 of the License, or (at your option) any later version.
10
// http://www.gnu.org/copyleft/lesser.html
11
//
12
// This library is distributed in the hope that it will be useful,
13
// but WITHOUT ANY WARRANTY; without even the implied warranty of
14
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15
// Lesser General Public License for more details.
16
//
17
// You should have received a copy of the GNU Lesser General Public
18
// License along with this library; if not, write to the Free Software
19
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20

21 package au.id.jericho.lib.html;
22
23 /**
24  * Represents the text from the {@linkplain Source source} document that is to be parsed.
25  * <p>
26  * This class is normally only of interest to users who wish to create <a HREF="TagType.html#Custom">custom tag types</a>.
27  * <p>
28  * The parse text is defined as the entire text of the source document in lower case, with all
29  * {@linkplain Segment#ignoreWhenParsing() ignored} segments replaced by space characters.
30  * <p>
31  * The text is stored in lower case to make case insensitive parsing as efficient as possible.
32  * <p>
33  * This class provides many methods which are also provided by the <code>java.lang.String</code> class,
34  * but adds an extra parameter called <code>breakAtIndex</code> to the various <code>indexOf</code> methods.
35  * This parameter allows a search on only a specified segment of the text, which is not possible using the normal <code>String</code> class.
36  * <p>
37  * <code>ParseText</code> instances are obtained using the {@link Source#getParseText()} method.
38  */

39 public final class ParseText implements CharSequence JavaDoc {
40     private final char[] text;
41
42     /** A value to use as the <code>breakAtIndex</code> argument in certain methods to indicate that the search should continue to the start or end of the parse text. */
43     public static final int NO_BREAK=-1;
44
45     /**
46      * Constructs a new <code>ParseText</code> object based on the specified <code>CharSequence</code>.
47      * @param charSequence the character sequence upon which the parse text is based.
48      */

49     ParseText(final CharSequence JavaDoc charSequence) {
50         text=new char[charSequence.length()];
51         for (int i=0; i<text.length; i++) text[i]=Character.toLowerCase(charSequence.charAt(i));
52     }
53
54     /**
55      * Constructs a new <code>ParseText</code> object based on the specified {@link OutputDocument}.
56      * @param outputDocument the {@link OutputDocument} upon which the parse text is based.
57      */

58     ParseText(final OutputDocument outputDocument) {
59         this(outputDocument.toString());
60     }
61
62     /**
63      * Indicates whether this parse text contains the specified string at the specified position.
64      * <p>
65      * This method is analogous to the <code>java.lang.String.startsWith(String prefix, int toffset)</code> method.
66      *
67      * @param str a string.
68      * @param pos the position (index) in this parse text at which to check for the specified string.
69      * @return <code>true</code> if this parse text contains the specified string at the specified position, otherwise <code>false</code>.
70      */

71     public boolean containsAt(final String JavaDoc str, final int pos) {
72         for (int i=0; i<str.length(); i++)
73             if (str.charAt(i)!=text[pos+i]) return false;
74         return true;
75     }
76
77     /**
78      * Returns the character at the specified index.
79      * @param index the index of the character.
80      * @return the character at the specified index, which is always in lower case.
81      */

82     public char charAt(final int index) {
83         return text[index];
84     }
85
86     /**
87      * Returns the index within this parse text of the first occurrence of the specified character,
88      * starting the search at the position specified by <code>fromIndex</code>.
89      * <p>
90      * If the specified character is not found then -1 is returned.
91      *
92      * @param searchChar a character.
93      * @param fromIndex the index to start the search from.
94      * @return the index within this parse text of the first occurrence of the specified character within the specified range, or -1 if the character is not found.
95      */

96     public int indexOf(final char searchChar, final int fromIndex) {
97         return indexOf(searchChar,fromIndex,NO_BREAK);
98     }
99     
100     /**
101      * Returns the index within this parse text of the first occurrence of the specified character,
102      * starting the search at the position specified by <code>fromIndex</code>,
103      * and breaking the search at the index specified by <code>breakAtIndex</code>.
104      * <p>
105      * The position specified by <code>breakAtIndex</code> is not included in the search.
106      * <p>
107      * If the search is to continue to the end of the text,
108      * the value {@link #NO_BREAK ParseText.NO_BREAK} should be specified as the <code>breakAtIndex</code>.
109      * <p>
110      * If the specified character is not found then -1 is returned.
111      *
112      * @param searchChar a character.
113      * @param fromIndex the index to start the search from.
114      * @param breakAtIndex the index at which to break off the search, or {@link #NO_BREAK} if the search is to continue to the end of the text.
115      * @return the index within this parse text of the first occurrence of the specified character within the specified range, or -1 if the character is not found.
116      */

117     public int indexOf(final char searchChar, final int fromIndex, final int breakAtIndex) {
118         final int actualBreakAtIndex=(breakAtIndex==NO_BREAK || breakAtIndex>text.length ? text.length : breakAtIndex);
119         for (int i=(fromIndex<0 ? 0 : fromIndex); i<actualBreakAtIndex; i++)
120             if (text[i]==searchChar) return i;
121         return -1;
122     }
123
124     /**
125      * Returns the index within this parse text of the last occurrence of the specified character,
126      * searching backwards starting at the position specified by <code>fromIndex</code>.
127      * <p>
128      * If the specified character is not found then -1 is returned.
129      *
130      * @param searchChar a character.
131      * @param fromIndex the index to start the search from.
132      * @return the index within this parse text of the last occurrence of the specified character within the specified range, or -1 if the character is not found.
133      */

134     public int lastIndexOf(final char searchChar, final int fromIndex) {
135         return lastIndexOf(searchChar,fromIndex,NO_BREAK);
136     }
137     
138     /**
139      * Returns the index within this parse text of the last occurrence of the specified character,
140      * searching backwards starting at the position specified by <code>fromIndex</code>,
141      * and breaking the search at the index specified by <code>breakAtIndex</code>.
142      * <p>
143      * The position specified by <code>breakAtIndex</code> is not included in the search.
144      * <p>
145      * If the search is to continue to the start of the text,
146      * the value {@link #NO_BREAK ParseText.NO_BREAK} should be specified as the <code>breakAtIndex</code>.
147      * <p>
148      * If the specified character is not found then -1 is returned.
149      *
150      * @param searchChar a character.
151      * @param fromIndex the index to start the search from.
152      * @param breakAtIndex the index at which to break off the search, or {@link #NO_BREAK} if the search is to continue to the start of the text.
153      * @return the index within this parse text of the last occurrence of the specified character within the specified range, or -1 if the character is not found.
154      */

155     public int lastIndexOf(final char searchChar, final int fromIndex, final int breakAtIndex) {
156         for (int i=(fromIndex>text.length ? text.length : fromIndex); i>breakAtIndex; i--)
157             if (text[i]==searchChar) return i;
158         return -1;
159     }
160
161     /**
162      * Returns the index within this parse text of the first occurrence of the specified string,
163      * starting the search at the position specified by <code>fromIndex</code>.
164      * <p>
165      * If the specified string is not found then -1 is returned.
166      *
167      * @param searchString a string.
168      * @param fromIndex the index to start the search from.
169      * @return the index within this parse text of the first occurrence of the specified string within the specified range, or -1 if the string is not found.
170      */

171     public int indexOf(final String JavaDoc searchString, final int fromIndex) {
172         return (searchString.length()==1)
173             ? indexOf(searchString.charAt(0),fromIndex,NO_BREAK)
174             : indexOf(searchString.toCharArray(),fromIndex,NO_BREAK);
175     }
176
177     /**
178      * Returns the index within this parse text of the first occurrence of the specified character array,
179      * starting the search at the position specified by <code>fromIndex</code>.
180      * <p>
181      * If the specified character array is not found then -1 is returned.
182      *
183      * @param searchCharArray a character array.
184      * @param fromIndex the index to start the search from.
185      * @return the index within this parse text of the first occurrence of the specified character array within the specified range, or -1 if the character array is not found.
186      */

187     public int indexOf(final char[] searchCharArray, final int fromIndex) {
188         return indexOf(searchCharArray,fromIndex,NO_BREAK);
189     }
190     
191     /**
192      * Returns the index within this parse text of the first occurrence of the specified string,
193      * starting the search at the position specified by <code>fromIndex</code>,
194      * and breaking the search at the index specified by <code>breakAtIndex</code>.
195      * <p>
196      * The position specified by <code>breakAtIndex</code> is not included in the search.
197      * <p>
198      * If the search is to continue to the end of the text,
199      * the value {@link #NO_BREAK ParseText.NO_BREAK} should be specified as the <code>breakAtIndex</code>.
200      * <p>
201      * If the specified string is not found then -1 is returned.
202      *
203      * @param searchString a string.
204      * @param fromIndex the index to start the search from.
205      * @param breakAtIndex the index at which to break off the search, or {@link #NO_BREAK} if the search is to continue to the end of the text.
206      * @return the index within this parse text of the first occurrence of the specified string within the specified range, or -1 if the string is not found.
207      */

208     public int indexOf(final String JavaDoc searchString, final int fromIndex, final int breakAtIndex) {
209         return (searchString.length()==1)
210             ? indexOf(searchString.charAt(0),fromIndex,breakAtIndex)
211             : indexOf(searchString.toCharArray(),fromIndex,breakAtIndex);
212     }
213
214     /**
215      * Returns the index within this parse text of the first occurrence of the specified character array,
216      * starting the search at the position specified by <code>fromIndex</code>,
217      * and breaking the search at the index specified by <code>breakAtIndex</code>.
218      * <p>
219      * The position specified by <code>breakAtIndex</code> is not included in the search.
220      * <p>
221      * If the search is to continue to the end of the text,
222      * the value {@link #NO_BREAK ParseText.NO_BREAK} should be specified as the <code>breakAtIndex</code>.
223      * <p>
224      * If the specified character array is not found then -1 is returned.
225      *
226      * @param searchCharArray a character array.
227      * @param fromIndex the index to start the search from.
228      * @param breakAtIndex the index at which to break off the search, or {@link #NO_BREAK} if the search is to continue to the end of the text.
229      * @return the index within this parse text of the first occurrence of the specified character array within the specified range, or -1 if the character array is not found.
230      */

231     public int indexOf(final char[] searchCharArray, final int fromIndex, final int breakAtIndex) {
232         if (searchCharArray.length==0) return fromIndex;
233         final char firstChar=searchCharArray[0];
234         final int lastPossibleBreakAtIndex=text.length-searchCharArray.length+1;
235         final int actualBreakAtIndex=(breakAtIndex==NO_BREAK || breakAtIndex>lastPossibleBreakAtIndex) ? lastPossibleBreakAtIndex : breakAtIndex;
236         outerLoop: for (int i=(fromIndex<0 ? 0 : fromIndex); i<actualBreakAtIndex; i++) {
237             if (text[i]==firstChar) {
238                 for (int j=1; j<searchCharArray.length; j++)
239                     if (searchCharArray[j]!=text[j+i]) continue outerLoop;
240                 return i;
241             }
242         }
243         return -1;
244     }
245
246     /**
247      * Returns the index within this parse text of the last occurrence of the specified string,
248      * searching backwards starting at the position specified by <code>fromIndex</code>.
249      * <p>
250      * If the specified string is not found then -1 is returned.
251      *
252      * @param searchString a string.
253      * @param fromIndex the index to start the search from.
254      * @return the index within this parse text of the last occurrence of the specified string within the specified range, or -1 if the string is not found.
255      */

256     public int lastIndexOf(final String JavaDoc searchString, final int fromIndex) {
257         return (searchString.length()==1)
258             ? lastIndexOf(searchString.charAt(0),fromIndex,NO_BREAK)
259             : lastIndexOf(searchString.toCharArray(),fromIndex,NO_BREAK);
260     }
261
262     /**
263      * Returns the index within this parse text of the last occurrence of the specified character array,
264      * searching backwards starting at the position specified by <code>fromIndex</code>.
265      * <p>
266      * If the specified character array is not found then -1 is returned.
267      *
268      * @param searchCharArray a character array.
269      * @param fromIndex the index to start the search from.
270      * @return the index within this parse text of the last occurrence of the specified character array within the specified range, or -1 if the character array is not found.
271      */

272     public int lastIndexOf(final char[] searchCharArray, final int fromIndex) {
273         return lastIndexOf(searchCharArray,fromIndex,NO_BREAK);
274     }
275
276     /**
277      * Returns the index within this parse text of the last occurrence of the specified string,
278      * searching backwards starting at the position specified by <code>fromIndex</code>,
279      * and breaking the search at the index specified by <code>breakAtIndex</code>.
280      * <p>
281      * The position specified by <code>breakAtIndex</code> is not included in the search.
282      * <p>
283      * If the search is to continue to the start of the text,
284      * the value {@link #NO_BREAK ParseText.NO_BREAK} should be specified as the <code>breakAtIndex</code>.
285      * <p>
286      * If the specified string is not found then -1 is returned.
287      *
288      * @param searchString a string.
289      * @param fromIndex the index to start the search from.
290      * @param breakAtIndex the index at which to break off the search, or {@link #NO_BREAK} if the search is to continue to the start of the text.
291      * @return the index within this parse text of the last occurrence of the specified string within the specified range, or -1 if the string is not found.
292      */

293     public int lastIndexOf(final String JavaDoc searchString, final int fromIndex, final int breakAtIndex) {
294         return (searchString.length()==1)
295             ? lastIndexOf(searchString.charAt(0),fromIndex,breakAtIndex)
296             : lastIndexOf(searchString.toCharArray(),fromIndex,breakAtIndex);
297     }
298     
299     /**
300      * Returns the index within this parse text of the last occurrence of the specified character array,
301      * searching backwards starting at the position specified by <code>fromIndex</code>,
302      * and breaking the search at the index specified by <code>breakAtIndex</code>.
303      * <p>
304      * The position specified by <code>breakAtIndex</code> is not included in the search.
305      * <p>
306      * If the search is to continue to the start of the text,
307      * the value {@link #NO_BREAK ParseText.NO_BREAK} should be specified as the <code>breakAtIndex</code>.
308      * <p>
309      * If the specified character array is not found then -1 is returned.
310      *
311      * @param searchCharArray a character array.
312      * @param fromIndex the index to start the search from.
313      * @param breakAtIndex the index at which to break off the search, or {@link #NO_BREAK} if the search is to continue to the start of the text.
314      * @return the index within this parse text of the last occurrence of the specified character array within the specified range, or -1 if the character array is not found.
315      */

316     public int lastIndexOf(final char[] searchCharArray, int fromIndex, final int breakAtIndex) {
317         if (searchCharArray.length==0) return fromIndex;
318         final int rightIndex=text.length-searchCharArray.length;
319         if (breakAtIndex>rightIndex) return -1;
320         if (fromIndex>rightIndex) fromIndex=rightIndex;
321         final int lastCharIndex=searchCharArray.length-1;
322         final char lastChar=searchCharArray[lastCharIndex];
323         final int actualBreakAtPos=breakAtIndex+lastCharIndex;
324         outerLoop: for (int i=fromIndex+lastCharIndex; i>actualBreakAtPos; i--) {
325             if (text[i]==lastChar) {
326                 final int startIndex=i-lastCharIndex;
327                 for (int j=lastCharIndex-1; j>=0; j--)
328                     if (searchCharArray[j]!=text[j+startIndex]) continue outerLoop;
329                 return startIndex;
330             }
331         }
332         return -1;
333     }
334
335     /**
336      * Returns the length of the parse text.
337      * @return the length of the parse text.
338      */

339     public int length() {
340         return text.length;
341     }
342
343     /**
344      * Returns a new string that is a substring of this parse text.
345      * <p>
346      * The substring begins at the specified <code>beginIndex</code> and extends to the character at index <code>endIndex</code> - 1.
347      * Thus the length of the substring is <code>endIndex-beginIndex</code>.
348      *
349      * @param beginIndex the begin index, inclusive.
350      * @param endIndex the end index, exclusive.
351      * @return a new string that is a substring of this parse text.
352      */

353     public String JavaDoc substring(final int beginIndex, final int endIndex) {
354         return new String JavaDoc(text,beginIndex,endIndex-beginIndex);
355     }
356
357     /**
358      * Returns a new character sequence that is a subsequence of this sequence.
359      * <p>
360      * This is equivalent to {@link #substring(int,int) substring(beginIndex,endIndex)}.
361      *
362      * @param beginIndex the begin index, inclusive.
363      * @param endIndex the end index, exclusive.
364      * @return a new character sequence that is a subsequence of this sequence.
365      */

366     public CharSequence JavaDoc subSequence(final int beginIndex, final int endIndex) {
367         return substring(beginIndex,endIndex);
368     }
369
370     /**
371      * Returns the content of the parse text as a <code>String</code>.
372      * @return the content of the parse text as a <code>String</code>.
373      */

374     public String JavaDoc toString() {
375         return new String JavaDoc(text);
376     }
377 }
378
Popular Tags