StringSearch


1   /*
2    *******************************************************************************
3    * Copyright (C) 1996-2006, International Business Machines Corporation and    *
4    * others. All Rights Reserved.                                                *
5    *******************************************************************************
6    */
7   
8   package com.ibm.icu.text;
9   
10  import java.text.CharacterIterator  ;
11  import java.text.StringCharacterIterator  ;
12  import java.util.Locale  ;
13  
14  import com.ibm.icu.impl.CharacterIteratorWrapper;
15  import com.ibm.icu.impl.NormalizerImpl;
16  import com.ibm.icu.lang.UCharacter;
17  import com.ibm.icu.util.ULocale;
18  
19  /**
20   * <p>
21   * <code>StringSearch</code> is the concrete subclass of 
22   * <code>SearchIterator</code> that provides language-sensitive text searching 
23   * based on the comparison rules defined in a {@link RuleBasedCollator} object.
24   * </p>
25   * <p>
26   * <code>StringSearch</code> uses a version of the fast Boyer-Moore search
27   * algorithm that has been adapted to work with the large character set of
28   * Unicode. Refer to 
29   * <a HREF="http://icu.sourceforge.net/docs/papers/efficient_text_searching_in_java.html">
30   * "Efficient Text Searching in Java"</a>, published in the 
31   * <i>Java Report</i> on February, 1999, for further information on the 
32   * algorithm.
33   * </p>
34   * <p>
35   * Users are also strongly encouraged to read the section on 
36   * <a HREF="http://icu.sourceforge.net/userguide/searchString.html">
37   * String Search</a> and 
38   * <a HREF="http://icu.sourceforge.net/userguide/Collate_Intro.html">
39   * Collation</a> in the user guide before attempting to use this class.
40   * </p>
41   * <p>
42   * String searching gets alittle complicated when accents are encountered at
43   * match boundaries. If a match is found and it has preceding or trailing 
44   * accents not part of the match, the result returned will include the 
45   * preceding accents up to the first base character, if the pattern searched 
46   * for starts an accent. Likewise, 
47   * if the pattern ends with an accent, all trailing accents up to the first
48   * base character will be included in the result.
49   * </p>
50   * <p>
51   * For example, if a match is found in target text "a&#92;u0325&#92;u0300" for 
52   * the pattern
53   * "a&#92;u0325", the result returned by StringSearch will be the index 0 and
54   * length 3 &lt;0, 3&gt;. If a match is found in the target 
55   * "a&#92;u0325&#92;u0300" 
56   * for the pattern "&#92;u0300", then the result will be index 1 and length 2 
57   * <1, 2>.
58   * </p>
59   * <p>
60   * In the case where the decomposition mode is on for the RuleBasedCollator,
61   * all matches that starts or ends with an accent will have its results include 
62   * preceding or following accents respectively. For example, if pattern "a" is
63   * looked for in the target text "&aacute;&#92;u0325", the result will be
64   * index 0 and length 2 &lt;0, 2&gt;.
65   * </p>
66   * <p>
67   * The StringSearch class provides two options to handle accent matching 
68   * described below:
69   * </p>
70   * <p>
71   * Let S' be the sub-string of a text string S between the offsets start and 
72   * end &lt;start, end&gt;.
73   * <br>
74   * A pattern string P matches a text string S at the offsets &lt;start, 
75   * length&gt; 
76   * <br>
77   * if
78   * <pre> 
79   * option 1. P matches some canonical equivalent string of S'. Suppose the 
80   *           RuleBasedCollator used for searching has a collation strength of 
81   *           TERTIARY, all accents are non-ignorable. If the pattern 
82   *           "a&#92;u0300" is searched in the target text 
83   *           "a&#92;u0325&#92;u0300", 
84   *           a match will be found, since the target text is canonically 
85   *           equivalent to "a&#92;u0300&#92;u0325"
86   * option 2. P matches S' and if P starts or ends with a combining mark, 
87   *           there exists no non-ignorable combining mark before or after S' 
88   *           in S respectively. Following the example above, the pattern 
89   *           "a&#92;u0300" will not find a match in "a&#92;u0325&#92;u0300", 
90   *           since
91   *           there exists a non-ignorable accent '&#92;u0325' in the middle of 
92   *           'a' and '&#92;u0300'. Even with a target text of 
93   *           "a&#92;u0300&#92;u0325" a match will not be found because of the 
94   *           non-ignorable trailing accent &#92;u0325.
95   * </pre>
96   * Option 2. will be the default mode for dealing with boundary accents unless
97   * specified via the API setCanonical(boolean).
98   * One restriction is to be noted for option 1. Currently there are no 
99   * composite characters that consists of a character with combining class > 0 
100  * before a character with combining class == 0. However, if such a character 
101  * exists in the future, the StringSearch may not work correctly with option 1
102  * when such characters are encountered.
103  * </p>
104  * <p>
105  * <tt>SearchIterator</tt> provides APIs to specify the starting position 
106  * within the text string to be searched, e.g. <tt>setIndex</tt>,
107  * <tt>preceding</tt> and <tt>following</tt>. Since the starting position will 
108  * be set as it is specified, please take note that there are some dangerous 
109  * positions which the search may render incorrect results:
110  * <ul>
111  * <li> The midst of a substring that requires decomposition.
112  * <li> If the following match is to be found, the position should not be the
113  *      second character which requires to be swapped with the preceding 
114  *      character. Vice versa, if the preceding match is to be found, 
115  *      position to search from should not be the first character which 
116  *      requires to be swapped with the next character. E.g certain Thai and
117  *      Lao characters require swapping.
118  * <li> If a following pattern match is to be found, any position within a 
119  *      contracting sequence except the first will fail. Vice versa if a 
120  *      preceding pattern match is to be found, a invalid starting point 
121  *      would be any character within a contracting sequence except the last.
122  * </ul>
123  * </p>
124  * <p>
125  * Though collator attributes will be taken into consideration while 
126  * performing matches, there are no APIs provided in StringSearch for setting 
127  * and getting the attributes. These attributes can be set by getting the 
128  * collator from <tt>getCollator</tt> and using the APIs in 
129  * <tt>com.ibm.icu.text.Collator</tt>. To update StringSearch to the new 
130  * collator attributes, <tt>reset()</tt> or 
131  * <tt>setCollator(RuleBasedCollator)</tt> has to be called.
132  * </p>
133  * <p>
134  * Consult the 
135  * <a HREF="http://icu.sourceforge.net/userguide/searchString.html">
136  * String Search</a> user guide and the <code>SearchIterator</code> 
137  * documentation for more information and examples of use.
138  * </p>
139  * <p>
140  * This class is not subclassable
141  * </p>
142  * @see SearchIterator
143  * @see RuleBasedCollator
144  * @author Laura Werner, synwee
145  * @stable ICU 2.0
146  */
147 // internal notes: all methods do not guarantee the correct status of the 
148 // characteriterator. the caller has to maintain the original index position
149 // if necessary. methods could change the index position as it deems fit
150 public final class StringSearch extends SearchIterator
151 {
152     
153     // public constructors --------------------------------------------------
154     
155     /**
156      * Initializes the iterator to use the language-specific rules defined in 
157      * the argument collator to search for argument pattern in the argument 
158      * target text. The argument breakiter is used to define logical matches.
159      * See super class documentation for more details on the use of the target 
160      * text and BreakIterator.
161      * @param pattern text to look for.
162      * @param target target text to search for pattern. 
163      * @param collator RuleBasedCollator that defines the language rules
164      * @param breakiter A {@link BreakIterator} that is used to determine the 
165      *                boundaries of a logical match. This argument can be null.
166      * @exception IllegalArgumentException thrown when argument target is null,
167      *            or of length 0
168      * @see BreakIterator
169      * @see RuleBasedCollator
170      * @see SearchIterator
171      * @stable ICU 2.0
172      */
173     public StringSearch(String   pattern, CharacterIterator   target,
174                         RuleBasedCollator collator, BreakIterator breakiter) 
175     {
176         super(target, breakiter);
177         m_textBeginOffset_ = targetText.getBeginIndex();
178         m_textLimitOffset_ = targetText.getEndIndex();
179         m_collator_ = collator;
180         m_colEIter_ = m_collator_.getCollationElementIterator(target);
181         m_utilColEIter_ = collator.getCollationElementIterator("");
182         m_ceMask_ = getMask(m_collator_.getStrength());
183         m_isCanonicalMatch_ = false;
184         m_pattern_ = new Pattern(pattern);
185         m_matchedIndex_ = DONE;
186         
187         initialize();
188     }
189 
190     /**
191      * Initializes the iterator to use the language-specific rules defined in 
192      * the argument collator to search for argument pattern in the argument 
193      * target text. No BreakIterators are set to test for logical matches.
194      * @param pattern text to look for.
195      * @param target target text to search for pattern. 
196      * @param collator RuleBasedCollator that defines the language rules
197      * @exception IllegalArgumentException thrown when argument target is null,
198      *            or of length 0
199      * @see RuleBasedCollator
200      * @see SearchIterator
201      * @stable ICU 2.0
202      */
203     public StringSearch(String   pattern, CharacterIterator   target,
204                         RuleBasedCollator collator) 
205     {
206         this(pattern, target, collator, BreakIterator.getCharacterInstance());
207     }
208 
209     /**
210      * Initializes the iterator to use the language-specific rules and 
211      * break iterator rules defined in the argument locale to search for 
212      * argument pattern in the argument target text. 
213      * See super class documentation for more details on the use of the target 
214      * text and BreakIterator.
215      * @param pattern text to look for.
216      * @param target target text to search for pattern. 
217      * @param locale locale to use for language and break iterator rules
218      * @exception IllegalArgumentException thrown when argument target is null,
219      *            or of length 0. ClassCastException thrown if the collator for 
220      *            the specified locale is not a RuleBasedCollator.
221      * @see BreakIterator
222      * @see RuleBasedCollator
223      * @see SearchIterator
224      * @stable ICU 2.0
225      */
226     public StringSearch(String   pattern, CharacterIterator   target, Locale   locale)
227     {
228         this(pattern, target, ULocale.forLocale(locale));
229     }
230 
231     /**
232      * Initializes the iterator to use the language-specific rules and 
233      * break iterator rules defined in the argument locale to search for 
234      * argument pattern in the argument target text. 
235      * See super class documentation for more details on the use of the target 
236      * text and BreakIterator.
237      * @param pattern text to look for.
238      * @param target target text to search for pattern. 
239      * @param locale ulocale to use for language and break iterator rules
240      * @exception IllegalArgumentException thrown when argument target is null,
241      *            or of length 0. ClassCastException thrown if the collator for 
242      *            the specified locale is not a RuleBasedCollator.
243      * @see BreakIterator
244      * @see RuleBasedCollator
245      * @see SearchIterator
246      * @draft ICU 3.2
247      * @provisional This API might change or be removed in a future release.
248      */
249     public StringSearch(String   pattern, CharacterIterator   target, ULocale locale)
250     {
251         this(pattern, target, (RuleBasedCollator)Collator.getInstance(locale),
252              BreakIterator.getCharacterInstance(locale));
253     }
254 
255     /**
256      * Initializes the iterator to use the language-specific rules and 
257      * break iterator rules defined in the default locale to search for 
258      * argument pattern in the argument target text. 
259      * See super class documentation for more details on the use of the target 
260      * text and BreakIterator.
261      * @param pattern text to look for.
262      * @param target target text to search for pattern. 
263      * @exception IllegalArgumentException thrown when argument target is null,
264      *            or of length 0. ClassCastException thrown if the collator for 
265      *            the default locale is not a RuleBasedCollator.
266      * @see BreakIterator
267      * @see RuleBasedCollator
268      * @see SearchIterator
269      * @stable ICU 2.0
270      */
271     public StringSearch(String   pattern, String   target) 
272     {
273         this(pattern, new StringCharacterIterator(target),
274              (RuleBasedCollator)Collator.getInstance(),
275              BreakIterator.getCharacterInstance());
276     }
277 
278     // public getters -----------------------------------------------------
279     
280     /**
281      * <p>
282      * Gets the RuleBasedCollator used for the language rules.
283      * </p>
284      * <p>
285      * Since StringSearch depends on the returned RuleBasedCollator, any 
286      * changes to the RuleBasedCollator result should follow with a call to 
287      * either StringSearch.reset() or 
288      * StringSearch.setCollator(RuleBasedCollator) to ensure the correct 
289      * search behaviour.
290      * </p>
291      * @return RuleBasedCollator used by this StringSearch
292      * @see RuleBasedCollator
293      * @see #setCollator
294      * @stable ICU 2.0
295      */
296     public RuleBasedCollator getCollator() 
297     {
298         return m_collator_;
299     }
300     
301     /**
302      * Returns the pattern for which StringSearch is searching for.
303      * @return the pattern searched for
304      * @stable ICU 2.0
305      */
306     public String   getPattern() 
307     {
308         return m_pattern_.targetText;
309     }
310     
311     /**
312      * Return the index in the target text where the iterator is currently 
313      * positioned at. 
314      * If the iteration has gone past the end of the target text or past 
315      * the beginning for a backwards search, {@link #DONE} is returned.
316      * @return index in the target text where the iterator is currently 
317      *         positioned at
318      * @stable ICU 2.8
319      */
320     public int getIndex() 
321     {
322         int result = m_colEIter_.getOffset();
323         if (isOutOfBounds(m_textBeginOffset_, m_textLimitOffset_, result)) {
324             return DONE;
325         }
326         return result;
327     }
328     
329     /**
330      * Determines whether canonical matches (option 1, as described in the 
331      * class documentation) is set.
332      * See setCanonical(boolean) for more information.
333      * @see #setCanonical
334      * @return true if canonical matches is set, false otherwise
335      * @stable ICU 2.8
336      */
337     public boolean isCanonical() 
338     {
339         return m_isCanonicalMatch_;
340     }
341     
342     // public setters -----------------------------------------------------
343     
344     /**
345      * <p>
346      * Sets the RuleBasedCollator to be used for language-specific searching.
347      * </p>
348      * <p>
349      * This method causes internal data such as Boyer-Moore shift tables
350      * to be recalculated, but the iterator's position is unchanged.
351      * </p>
352      * @param collator to use for this StringSearch
353      * @exception IllegalArgumentException thrown when collator is null
354      * @see #getCollator
355      * @stable ICU 2.0
356      */
357     public void setCollator(RuleBasedCollator collator) 
358     {
359         if (collator == null) {
360             throw new IllegalArgumentException  ("Collator can not be null");
361         }
362         m_collator_ = collator;
363         m_ceMask_ = getMask(m_collator_.getStrength());
364         // if status is a failure, ucol_getAttribute returns UCOL_DEFAULT
365         initialize();
366         m_colEIter_.setCollator(m_collator_);
367         m_utilColEIter_.setCollator(m_collator_);
368     }
369     
370     /**
371      * <p>
372      * Set the pattern to search for.  
373      * </p>
374      * <p>
375      * This method causes internal data such as Boyer-Moore shift tables
376      * to be recalculated, but the iterator's position is unchanged.
377      * </p>
378      * @param pattern for searching
379      * @see #getPattern
380      * @exception IllegalArgumentException thrown if pattern is null or of
381      *               length 0
382      * @stable ICU 2.0
383      */
384     public void setPattern(String   pattern) 
385     {
386         if (pattern == null || pattern.length() <= 0) {
387             throw new IllegalArgumentException  (
388                     "Pattern to search for can not be null or of length 0");
389         }
390         m_pattern_.targetText = pattern;
391         initialize();
392     }
393     
394     /**
395       * Set the target text to be searched. Text iteration will hence begin at 
396      * the start of the text string. This method is useful if you want to 
397      * re-use an iterator to search within a different body of text.
398      * @param text new text iterator to look for match, 
399      * @exception IllegalArgumentException thrown when text is null or has
400      *            0 length
401      * @see #getTarget
402      * @stable ICU 2.8
403      */
404     public void setTarget(CharacterIterator   text)
405     {
406         super.setTarget(text);
407         m_textBeginOffset_ = targetText.getBeginIndex();
408         m_textLimitOffset_ = targetText.getEndIndex();
409         m_colEIter_.setText(targetText);
410     }
411     
412     /**
413      * <p>
414      * Sets the position in the target text which the next search will start 
415      * from to the argument. This method clears all previous states.
416      * </p>
417      * <p>
418      * This method takes the argument position and sets the position in the 
419      * target text accordingly, without checking if position is pointing to a 
420      * valid starting point to begin searching.
421      * </p>
422      * <p>
423      * Search positions that may render incorrect results are highlighted in 
424      * the class documentation.
425      * </p>
426      * @param position index to start next search from.
427      * @exception IndexOutOfBoundsException thrown if argument position is out
428      *            of the target text range.
429      * @see #getIndex
430      * @stable ICU 2.8
431      */
432     public void setIndex(int position)
433     {
434         super.setIndex(position);
435         m_matchedIndex_ = DONE;
436         m_colEIter_.setExactOffset(position);
437     }
438     
439     /**
440      * <p>
441      * Set the canonical match mode. See class documentation for details.
442      * The default setting for this property is false.
443      * </p>
444      * @param allowCanonical flag indicator if canonical matches are allowed
445      * @see #isCanonical
446      * @stable ICU 2.8
447      */
448     public void setCanonical(boolean allowCanonical)
449     {
450         m_isCanonicalMatch_ = allowCanonical;
451         if (m_isCanonicalMatch_ == true) {
452             if (m_canonicalPrefixAccents_ == null) {
453                 m_canonicalPrefixAccents_ = new StringBuffer  ();
454             }
455             else {
456                 m_canonicalPrefixAccents_.delete(0, 
457                                             m_canonicalPrefixAccents_.length());
458             }
459             if (m_canonicalSuffixAccents_ == null) {
460                 m_canonicalSuffixAccents_ = new StringBuffer  ();
461             }
462             else {
463                 m_canonicalSuffixAccents_.delete(0, 
464                                             m_canonicalSuffixAccents_.length());
465             }
466         }
467     }
468     
469     // public miscellaneous methods -----------------------------------------
470     
471     /** 
472      * <p>
473      * Resets the search iteration. All properties will be reset to the 
474      * default value.
475      * </p>
476      * <p>
477      * Search will begin at the start of the target text if a forward iteration 
478      * is initiated before a backwards iteration. Otherwise if a 
479      * backwards iteration is initiated before a forwards iteration, the search 
480      * will begin at the end of the target text.
481      * </p>
482      * <p>
483      * Canonical match option will be reset to false, ie an exact match.
484      * </p>
485      * @stable ICU 2.8
486      */
487     public void reset()
488     {
489         // reset is setting the attributes that are already in string search, 
490         // hence all attributes in the collator should be retrieved without any 
491         // problems
492         super.reset();
493         m_isCanonicalMatch_ = false;
494         m_ceMask_ = getMask(m_collator_.getStrength());
495         // if status is a failure, ucol_getAttribute returns UCOL_DEFAULT
496         initialize();
497         m_colEIter_.setCollator(m_collator_);
498         m_colEIter_.reset();
499         m_utilColEIter_.setCollator(m_collator_);
500     }
501 
502     // protected methods -----------------------------------------------------
503     
504     /**
505      * <p>
506      * Concrete method to provide the mechanism 
507      * for finding the next <b>forwards</b> match in the target text.
508      * See super class documentation for its use.
509      * </p>  
510      * @param start index in the target text at which the forwards search 
511      *        should begin.
512      * @return the starting index of the next forwards match if found, DONE 
513      *         otherwise
514      * @see #handlePrevious(int)
515      * @see #DONE
516      * @stable ICU 2.8
517      */
518     protected int handleNext(int start)
519     {
520         if (m_pattern_.m_CELength_ == 0) {
521             matchLength = 0;
522             if (m_matchedIndex_ == DONE && start == m_textBeginOffset_) {
523                 m_matchedIndex_ = start;
524                 return m_matchedIndex_;
525             }
526             
527             targetText.setIndex(start);
528             char ch = targetText.current();
529             // ch can never be done, it is handled by next()
530             char ch2 = targetText.next();
531             if (ch2 == CharacterIterator.DONE) {
532                 m_matchedIndex_ = DONE;    
533             }
534             else {
535                 m_matchedIndex_ = targetText.getIndex();
536             }
537             if (UTF16.isLeadSurrogate(ch) && UTF16.isTrailSurrogate(ch2)) {
538                 targetText.next();
539                 m_matchedIndex_ = targetText.getIndex();
540             }
541         }
542         else {
543             if (matchLength <= 0) {
544                 // we must have reversed direction after we reached the start
545                 // of the target text
546                 // see SearchIterator next(), it checks the bounds and returns
547                 // if it exceeds the range. It does not allow setting of
548                 // m_matchedIndex
549                 if (start == m_textBeginOffset_) {
550                     m_matchedIndex_ = DONE;
551                 }
552                 else {
553                     // for boundary check purposes. this will ensure that the
554                     // next match will not preceed the current offset
555                     // note search->matchedIndex will always be set to something
556                     // in the code
557                     m_matchedIndex_ = start - 1;
558                 }
559             }
560     
561             // status checked below
562             if (m_isCanonicalMatch_) {
563                 // can't use exact here since extra accents are allowed.
564                 handleNextCanonical(start);
565             }
566             else {
567                 handleNextExact(start);
568             }
569         }
570         if (m_matchedIndex_ == DONE) {
571             targetText.setIndex(m_textLimitOffset_);
572         }
573         else {
574             targetText.setIndex(m_matchedIndex_);
575         }
576         return m_matchedIndex_;
577     }
578     
579     /**
580      * <p>
581      * Concrete method to provide the mechanism 
582      * for finding the next <b>backwards</b> match in the target text.
583      * See super class documentation for its use.
584      * </p>  
585      * @param start index in the target text at which the backwards search 
586      *        should begin.
587      * @return the starting index of the next backwards match if found, DONE 
588      *         otherwise
589      * @see #handleNext(int)
590      * @see #DONE
591      * @stable ICU 2.8
592      */
593     protected int handlePrevious(int start)
594     {
595         if (m_pattern_.m_CELength_ == 0) {
596             matchLength = 0;
597             // start can never be DONE or 0, it is handled in previous
598             targetText.setIndex(start);
599             char ch = targetText.previous();
600             if (ch == CharacterIterator.DONE) {
601                 m_matchedIndex_ = DONE;
602             }
603             else {
604                 m_matchedIndex_ = targetText.getIndex();
605                 if (UTF16.isTrailSurrogate(ch)) {
606                     if (UTF16.isLeadSurrogate(targetText.previous())) {
607                         m_matchedIndex_ = targetText.getIndex();
608                     }
609                 }
610             }            
611         }
612         else {
613             if (matchLength == 0) {
614                 // we must have reversed direction after we reached the end
615                 // of the target text
616                 // see SearchIterator next(), it checks the bounds and returns
617                 // if it exceeds the range. It does not allow setting of
618                 // m_matchedIndex
619                 m_matchedIndex_ = DONE;
620             }
621             if (m_isCanonicalMatch_) {
622                 // can't use exact here since extra accents are allowed.
623                 handlePreviousCanonical(start);
624             }
625             else {
626                 handlePreviousExact(start);
627             }
628         }
629 
630         if (m_matchedIndex_ == DONE) {
631             targetText.setIndex(m_textBeginOffset_);
632         }
633         else {
634             targetText.setIndex(m_matchedIndex_);
635         }
636         return m_matchedIndex_;
637     }
638 
639     // private static inner classes ----------------------------------------
640     
641     private static class Pattern 
642     {
643         // protected methods -----------------------------------------------
644         
645         /**
646          * Pattern string
647          */
648         protected String   targetText;
649         /**
650          * Array containing the collation elements of targetText
651          */
652         protected int m_CE_[];
653         /**
654          * Number of collation elements in m_CE_
655          */
656         protected int m_CELength_; 
657         /**
658          * Flag indicator if targetText starts with an accent
659          */
660         protected boolean m_hasPrefixAccents_;
661         /**
662          * Flag indicator if targetText ends with an accent
663          */
664         protected boolean m_hasSuffixAccents_;
665         /**
666          * Default number of characters to shift for Boyer Moore
667          */
668         protected int m_defaultShiftSize_;
669         /**
670          * Number of characters to shift for Boyer Moore, depending on the
671          * source text to search
672          */
673         protected char m_shift_[];
674         /**
675          * Number of characters to shift backwards for Boyer Moore, depending 
676          * on the source text to search
677          */
678         protected char m_backShift_[];
679         
680         // protected constructors ------------------------------------------
681         
682         /**
683          * Empty constructor 
684          */
685         protected Pattern(String   pattern) 
686         {
687             targetText = pattern;
688             m_CE_ = new int[INITIAL_ARRAY_SIZE_];    
689             m_CELength_ = 0;
690             m_hasPrefixAccents_ = false;
691             m_hasSuffixAccents_ = false;
692             m_defaultShiftSize_ = 1;        
693             m_shift_ = new char[MAX_TABLE_SIZE_];
694             m_backShift_ = new char[MAX_TABLE_SIZE_];
695         }
696     };
697 
698 
699     // private data members ------------------------------------------------
700     
701     /**
702      * target text begin offset. Each targetText has a valid contiguous region 
703      * to iterate and this data member is the offset to the first such
704      * character in the region.
705      */
706     private int m_textBeginOffset_;
707     /**
708      * target text limit offset. Each targetText has a valid contiguous region 
709      * to iterate and this data member is the offset to 1 after the last such
710      * character in the region.
711      */
712     private int m_textLimitOffset_;
713     /**
714      * Upon completion of a search, m_matchIndex_ will store starting offset in
715      * m_text for the match. The Value DONE is the default value. 
716      * If we are not at the start of the text or the end of the text and 
717      * m_matchedIndex_ is DONE it means that we can find any more matches in 
718      * that particular direction
719      */
720     private int m_matchedIndex_; 
721     /**
722      * Current pattern to search for
723      */
724     private Pattern m_pattern_;
725     /**
726      * Collator whose rules are used to perform the search
727      */
728     private RuleBasedCollator m_collator_;
729     /** 
730      * The collation element iterator for the text source.
731      */
732     private CollationElementIterator m_colEIter_;
733     /** 
734      * Utility collation element, used throughout program for temporary 
735      * iteration.
736      */
737     private CollationElementIterator m_utilColEIter_;
738     /**
739      * The mask used on the collation elements to retrieve the valid strength
740      * weight 
741      */
742     private int m_ceMask_;
743     /**
744      * Buffer storing accents during a canonical search
745      */
746     private StringBuffer   m_canonicalPrefixAccents_;
747     /**
748      * Buffer storing accents during a canonical search
749      */
750     private StringBuffer   m_canonicalSuffixAccents_;
751     /**
752      * Flag to indicate if canonical search is to be done.
753      * E.g looking for "a?" in "a??" will yield the match at 0.
754      */
755     private boolean m_isCanonicalMatch_;
756     /**
757      * Size of the shift tables
758      */
759     private static final int MAX_TABLE_SIZE_ = 257; 
760     /**
761      * Initial array size
762      */
763     private static final int INITIAL_ARRAY_SIZE_ = 256;
764     /**
765      * Utility mask
766      */
767     private static final int SECOND_LAST_BYTE_SHIFT_ = 8;
768     /**
769      * Utility mask
770      */
771     private static final int LAST_BYTE_MASK_ = 0xff;
772     /**
773      * Utility buffer for return values and temporary storage
774      */
775     private int m_utilBuffer_[] = new int[2];
776 
777     // private methods -------------------------------------------------------
778 
779     /**
780      * Hash a collation element from its full size (32 bits) down into a
781      * value that can be used as an index into the shift tables.  Right
782      * now we do a modulus by the size of the hash table.
783      * @param ce collation element
784      * @return collapsed version of the collation element
785      */
786     private static final int hash(int ce) 
787     {
788         // the old value UCOL_PRIMARYORDER(ce) % MAX_TABLE_SIZE_ does not work
789         // well with the new collation where most of the latin 1 characters
790         // are of the value xx000xxx. their hashes will most of the time be 0
791         // to be discussed on the hash algo.
792         return CollationElementIterator.primaryOrder(ce) % MAX_TABLE_SIZE_;
793     }
794     
795     /**
796      * Gets the fcd value for a character at the argument index.
797      * This method takes into accounts of the supplementary characters.
798      * Note this method changes the offset in the character iterator.
799      * @param str UTF16 string where character for fcd retrieval resides
800      * @param offset position of the character whose fcd is to be retrieved
801      * @return fcd value
802      */
803     private static final char getFCD(CharacterIterator   str, int offset)
804     {
805         str.setIndex(offset);
806         char ch = str.current();
807         char result = NormalizerImpl.getFCD16(ch);
808         
809         if ((result != 0) && (str.getEndIndex() != offset + 1) && 
810             UTF16.isLeadSurrogate(ch)) {
811             ch = str.next();
812             if (UTF16.isTrailSurrogate(ch)) {
813                 result = NormalizerImpl.getFCD16FromSurrogatePair(result, ch);
814             } else {
815                 result = 0;
816             }
817         }
818         return result;
819     }
820     
821     /**
822      * Gets the fcd value for a character at the argument index.
823      * This method takes into accounts of the supplementary characters.
824      * @param str UTF16 string where character for fcd retrieval resides
825      * @param offset position of the character whose fcd is to be retrieved
826      * @return fcd value
827      */
828     private static final char getFCD(String   str, int offset)
829     {
830         char ch = str.charAt(offset);
831         char result = NormalizerImpl.getFCD16(ch);
832         
833         if ((result != 0) && (str.length() != offset + 1) && 
834             UTF16.isLeadSurrogate(ch)) {
835             ch = str.charAt(offset + 1);
836             if (UTF16.isTrailSurrogate(ch)) {
837                 result = NormalizerImpl.getFCD16FromSurrogatePair(result, ch);
838             } else {
839                 result = 0;
840             }
841         }
842         return result;
843     }
844     
845     /**
846     * Getting the modified collation elements taking into account the collation 
847     * attributes
848     * @param ce 
849     * @return the modified collation element
850     */
851     private final int getCE(int ce)
852     {
853         // note for tertiary we can't use the collator->tertiaryMask, that
854         // is a preprocessed mask that takes into account case options. since
855         // we are only concerned with exact matches, we don't need that.
856         ce &= m_ceMask_;
857         
858         if (m_collator_.isAlternateHandlingShifted()) {
859             // alternate handling here, since only the 16 most significant 
860             // digits is only used, we can safely do a compare without masking
861             // if the ce is a variable, we mask and get only the primary values
862             // no shifting to quartenary is required since all primary values
863             // less than variabletop will need to be masked off anyway.
864             if ((m_collator_.m_variableTopValue_  << 16) > ce) {
865                 if (m_collator_.getStrength() == Collator.QUATERNARY) {
866                     ce = CollationElementIterator.primaryOrder(ce);
867                 }
868                 else { 
869                     ce = CollationElementIterator.IGNORABLE;
870                 }
871             }
872         }
873     
874         return ce;
875     }
876     
877     /**
878      * Appends a int to a int array, increasing the size of the array when 
879      * we are out of space.
880      * @param offset in array to append to
881      * @param value to append
882      * @param array to append to
883      * @return the array appended to, this could be a new and bigger array
884      */
885     private static final int[] append(int offset, int value, int array[])
886     {
887         if (offset >= array.length) {
888             int temp[] = new int[offset + INITIAL_ARRAY_SIZE_];
889             System.arraycopy(array, 0, temp, 0, array.length);
890             array = temp;
891         }
892         array[offset] = value;
893         return array;
894     }
895     
896     /**
897      * Initializing the ce table for a pattern. Stores non-ignorable collation 
898      * keys. Table size will be estimated by the size of the pattern text. 
899      * Table expansion will be perform as we go along. Adding 1 to ensure that 
900      * the table size definitely increases.
901      * Internal method, status assumed to be a success.
902      * @return total number of expansions 
903      */
904     private final int initializePatternCETable()
905     {
906         m_utilColEIter_.setText(m_pattern_.targetText);
907         
908         int offset = 0;
909         int result = 0;
910         int ce = m_utilColEIter_.next();
911     
912         while (ce != CollationElementIterator.NULLORDER) {
913             int newce = getCE(ce);
914             if (newce != CollationElementIterator.IGNORABLE) {
915                 m_pattern_.m_CE_ = append(offset, newce, m_pattern_.m_CE_);
916                 offset ++;            
917             }
918             result += m_utilColEIter_.getMaxExpansion(ce) - 1;
919             ce = m_utilColEIter_.next();
920         }
921     
922         m_pattern_.m_CE_ = append(offset, 0, m_pattern_.m_CE_);
923         m_pattern_.m_CELength_ = offset;
924     
925         return result;
926     }
927     
928     /**
929      * Initializes the pattern struct.
930      * Internal method, status assumed to be success.
931      * @return expansionsize the total expansion size of the pattern
932      */ 
933     private final int initializePattern()
934     {
935         m_pattern_.m_hasPrefixAccents_ = (getFCD(m_pattern_.targetText, 0) 
936                                              >> SECOND_LAST_BYTE_SHIFT_) != 0;
937         m_pattern_.m_hasSuffixAccents_ = (getFCD(m_pattern_.targetText, 
938                                                  m_pattern_.targetText.length() 
939                                                  - 1) 
940                                             & LAST_BYTE_MASK_) != 0;
941         // since intializePattern is an internal method status is a success.
942         return initializePatternCETable();   
943     }
944     
945     /**
946      * Initializing shift tables, with the default values.
947      * If a corresponding default value is 0, the shift table is not set.
948      * @param shift table for forwards shift 
949      * @param backshift table for backwards shift
950      * @param cetable table containing pattern ce
951      * @param cesize size of the pattern ces
952      * @param expansionsize total size of the expansions
953      * @param defaultforward the default forward value
954      * @param defaultbackward the default backward value
955      */
956      private final void setShiftTable(char shift[], 
957                                                     char backshift[], 
958                                                     int cetable[], int cesize, 
959                                                       int expansionsize,
960                                                     char defaultforward,
961                                                       char defaultbackward)
962     {
963         // estimate the value to shift. to do that we estimate the smallest 
964         // number of characters to give the relevant ces, ie approximately
965         // the number of ces minus their expansion, since expansions can come 
966         // from a character.
967         for (int count = 0; count < MAX_TABLE_SIZE_; count ++) {
968             shift[count] = defaultforward;
969         }
970         cesize --; // down to the last index
971         for (int count = 0; count < cesize; count ++) {
972             // number of ces from right of array to the count
973             int temp = defaultforward - count - 1;
974             shift[hash(cetable[count])] = temp > 1 ? ((char)temp) : 1;
975         }
976         shift[hash(cetable[cesize])] = 1;
977         // for ignorables we just shift by one. see test examples.
978         shift[hash(0)] = 1;
979         
980         for (int count = 0; count < MAX_TABLE_SIZE_; count ++) {
981             backshift[count] = defaultbackward;
982         }
983         for (int count = cesize; count > 0; count --) {
984             // the original value count does not seem to work
985             backshift[hash(cetable[count])] = (char)(count > expansionsize ? 
986                                                       count - expansionsize : 1);
987         }
988         backshift[hash(cetable[0])] = 1;
989         backshift[hash(0)] = 1;
990     }
991     
992     /**
993      * <p>Building of the pattern collation element list and the Boyer Moore 
994      * StringSearch table.</p>
995      * <p>The canonical match will only be performed after the default match 
996      * fails.</p>
997      * <p>For both cases we need to remember the size of the composed and 
998      * decomposed versions of the string. Since the Boyer-Moore shift 
999      * calculations shifts by a number of characters in the text and tries to 
1000     * match the pattern from that offset, the shift value can not be too large 
1001     * in case we miss some characters. To choose a right shift size, we 
1002     * estimate the NFC form of the and use its size as a shift guide. The NFC 
1003     * form should be the small possible representation of the pattern. Anyways, 
1004     * we'll err on the smaller shift size. Hence the calculation for 
1005     * minlength. Canonical match will be performed slightly differently. We'll 
1006     * split the pattern into 3 parts, the prefix accents (PA), the middle 
1007     * string bounded by the first and last base character (MS), the ending 
1008     * accents (EA). Matches will be done on MS first, and only when we match 
1009     * MS then some processing will be required for the prefix and end accents 
1010     * in order to determine if they match PA and EA. Hence the default shift 
1011     * values for the canonical match will take the size of either end's accent 
1012     * into consideration. Forwards search will take the end accents into 
1013     * consideration for the default shift values and the backwards search will 
1014     * take the prefix accents into consideration.</p>
1015     * <p>If pattern has no non-ignorable ce, we return a illegal argument 
1016     * error.</p>
1017     */ 
1018    private final void initialize()
1019    {
1020        int expandlength  = initializePattern();   
1021        if (m_pattern_.m_CELength_ > 0) {
1022            char minlength = (char)(m_pattern_.m_CELength_ > expandlength 
1023                                ? m_pattern_.m_CELength_ - expandlength : 1);
1024            m_pattern_.m_defaultShiftSize_ = minlength;
1025            setShiftTable(m_pattern_.m_shift_, m_pattern_.m_backShift_, 
1026                          m_pattern_.m_CE_, m_pattern_.m_CELength_, 
1027                          expandlength, minlength, minlength);
1028        }
1029        else {
1030            m_pattern_.m_defaultShiftSize_ = 0;
1031        }
1032    }
1033    
1034    /**
1035     * Determine whether the search text bounded by the offset start and end is 
1036     * one or more whole units of text as determined by the breakiterator in 
1037     * StringSearch.
1038     * @param start target text start offset
1039     * @param end target text end offset
1040     */
1041    private final boolean isBreakUnit(int start, int end) 
1042    {
1043        if (breakIterator != null) {
1044            int startindex = breakIterator.first();
1045            int endindex   = breakIterator.last();
1046            
1047            // out-of-range indexes are never boundary positions
1048            if (start < startindex || start > endindex || end < startindex 
1049                || end > endindex) {
1050                return false;
1051            }
1052            // otherwise, we can use following() on the position before the 
1053            // specified one and return true of the position we get back is the 
1054            // one the user specified
1055            boolean result = (start == startindex 
1056                              || breakIterator.following(start - 1) == start) 
1057                             && (end == endindex 
1058                                  || breakIterator.following(end - 1) == end);
1059            if (result) {
1060                // iterates the individual ces
1061                m_utilColEIter_.setText(
1062                    new CharacterIteratorWrapper(targetText), start);
1063                for (int count = 0; count < m_pattern_.m_CELength_;
1064                     count ++) {
1065                    int ce = getCE(m_utilColEIter_.next());
1066                    if (ce == CollationElementIterator.IGNORABLE) {
1067                        count --;
1068                        continue;
1069                    }
1070                    if (ce != m_pattern_.m_CE_[count]) {
1071                        return false;
1072                    }
1073                }
1074                int nextce = m_utilColEIter_.next();
1075                while (m_utilColEIter_.getOffset() == end 
1076                       && getCE(nextce) == CollationElementIterator.IGNORABLE) {
1077                    nextce = m_utilColEIter_.next();       
1078                }
1079                if (nextce != CollationElementIterator.NULLORDER 
1080                    && m_utilColEIter_.getOffset() == end) {
1081                    // extra collation elements at the end of the match
1082                    return false;
1083                }
1084            }
1085            return result;
1086        }
1087        return true;
1088    }
1089    
1090    /**
1091     * Getting the next base character offset if current offset is an accent, 
1092     * or the current offset if the current character contains a base character. 
1093     * accents the following base character will be returned
1094     * @param text string
1095     * @param textoffset current offset
1096     * @param textlength length of text string
1097     * @return the next base character or the current offset
1098     *         if the current character is contains a base character.
1099     */
1100    private final int getNextBaseOffset(CharacterIterator   text, 
1101                                                        int textoffset)
1102    {
1103        if (textoffset < text.getEndIndex()) {
1104            while (text.getIndex() < text.getEndIndex()) { 
1105                int result = textoffset;
1106                if ((getFCD(text, textoffset ++) 
1107                            >> SECOND_LAST_BYTE_SHIFT_) == 0) {
1108                     return result;
1109                }
1110            }
1111            return text.getEndIndex();
1112        }
1113        return textoffset;
1114    }
1115    
1116    /**
1117     * Gets the next base character offset depending on the string search 
1118     * pattern data
1119     * @param textoffset one offset away from the last character
1120     *                   to search for.
1121     * @return start index of the next base character or the current offset
1122     *         if the current character is contains a base character.
1123     */
1124    private final int getNextBaseOffset(int textoffset)
1125    {
1126        if (m_pattern_.m_hasSuffixAccents_ 
1127            && textoffset < m_textLimitOffset_) {
1128            targetText.setIndex(textoffset);
1129            targetText.previous();
1130            if ((getFCD(targetText, targetText.getIndex()) & LAST_BYTE_MASK_) != 0) {
1131                return getNextBaseOffset(targetText, textoffset);
1132            }
1133        }
1134        return textoffset;
1135    }
1136    
1137    /**
1138     * Shifting the collation element iterator position forward to prepare for
1139     * a following match. If the last character is a unsafe character, we'll 
1140     * only shift by 1 to capture contractions, normalization etc.
1141     * Internal method, status assumed to be success.
1142     * @param textoffset start text position to do search
1143     * @param ce the text ce which failed the match.
1144     * @param patternceindex index of the ce within the pattern ce buffer which
1145     *        failed the match
1146     * @return final offset
1147     */
1148    private int shiftForward(int textoffset, int ce, int patternceindex)
1149                                    
1150    {
1151        if (ce != CollationElementIterator.NULLORDER) {
1152            int shift = m_pattern_.m_shift_[hash(ce)];
1153            // this is to adjust for characters in the middle of the 
1154            // substring for matching that failed.
1155            int adjust = m_pattern_.m_CELength_ - patternceindex;
1156            if (adjust > 1 && shift >= adjust) {
1157                shift -= adjust - 1;
1158            }
1159            textoffset += shift;
1160        }
1161        else {
1162            textoffset += m_pattern_.m_defaultShiftSize_;
1163        }
1164         
1165        textoffset = getNextBaseOffset(textoffset);
1166        // check for unsafe characters
1167        // * if it is the start or middle of a contraction: to be done after 
1168        //   a initial match is found
1169        // * thai or lao base consonant character: similar to contraction
1170        // * high surrogate character: similar to contraction
1171        // * next character is a accent: shift to the next base character
1172        return textoffset;
1173    }
1174    
1175    /**
1176     * Gets the offset to the next safe point in text.
1177     * ie. not the middle of a contraction, swappable characters or 
1178     * supplementary characters.
1179     * @param textoffset offset in string
1180     * @param end offset in string
1181     * @return offset to the next safe character
1182     */
1183    private final int getNextSafeOffset(int textoffset, int end)
1184    {
1185        int result = textoffset; // first contraction character
1186        targetText.setIndex(result);
1187        while (result != end && 
1188            m_collator_.isUnsafe(targetText.current())) {
1189               result ++;
1190               targetText.setIndex(result);
1191        }
1192        return result; 
1193    }
1194    
1195    /** 
1196     * This checks for accents in the potential match started with a composite 
1197     * character.
1198     * This is really painful... we have to check that composite character do 
1199     * not have any extra accents. We have to normalize the potential match and 
1200     * find the immediate decomposed character before the match.
1201     * The first composite character would have been taken care of by the fcd 
1202     * checks in checkForwardExactMatch.
1203     * This is the slow path after the fcd of the first character and 
1204     * the last character has been checked by checkForwardExactMatch and we 
1205     * determine that the potential match has extra non-ignorable preceding
1206     * ces.
1207     * E.g. looking for ? acute in ? A ring above and acute, 
1208     * checkExtraMatchAccent should fail since there is a middle ring in 
1209     * ? Note here that accents checking are slow and cautioned in the API 
1210     * docs.
1211     * Internal method, status assumed to be a success, caller should check 
1212     * status before calling this method
1213     * @param start index of the potential unfriendly composite character
1214     * @param end index of the potential unfriendly composite character
1215     * @return true if there is non-ignorable accents before at the beginning
1216     *              of the match, false otherwise.
1217     */
1218    private final boolean checkExtraMatchAccents(int start, int end)
1219    {
1220        boolean result = false;
1221        if (m_pattern_.m_hasPrefixAccents_) {
1222            targetText.setIndex(start);
1223            
1224            if (UTF16.isLeadSurrogate(targetText.next())) {
1225                if (!UTF16.isTrailSurrogate(targetText.next())) {
1226                    targetText.previous();
1227                }
1228            }
1229            // we are only concerned with the first composite character
1230            String   str = getString(targetText, start, end);
1231            if (Normalizer.quickCheck(str, Normalizer.NFD,0) 
1232                                                    == Normalizer.NO) {
1233                int safeoffset = getNextSafeOffset(start, end);
1234                if (safeoffset != end) {
1235                    safeoffset ++;
1236                }
1237                String   decomp = Normalizer.decompose(
1238                                str.substring(0, safeoffset - start), false);
1239                m_utilColEIter_.setText(decomp);
1240                int firstce = m_pattern_.m_CE_[0];
1241                boolean ignorable = true;
1242                int ce = CollationElementIterator.IGNORABLE;
1243                int offset = 0;
1244                while (ce != firstce) {
1245                    offset = m_utilColEIter_.getOffset();
1246                    if (ce != firstce 
1247                        && ce != CollationElementIterator.IGNORABLE) {
1248                        ignorable = false;
1249                    }
1250                    ce = m_utilColEIter_.next();
1251                }
1252                m_utilColEIter_.setExactOffset(offset); // back up 1 to the 
1253                m_utilColEIter_.previous();             // right offset
1254                offset = m_utilColEIter_.getOffset();
1255                result = !ignorable && (UCharacter.getCombiningClass(
1256                                            UTF16.charAt(decomp, offset)) != 0);
1257            }
1258        }
1259    
1260        return result;
1261    }
1262    
1263    /**
1264    * Used by exact matches, checks if there are accents before the match. 
1265    * This is really painful... we have to check that composite characters at
1266    * the start of the matches have to not have any extra accents. 
1267    * We check the FCD of the character first, if it starts with an accent and 
1268    * the first pattern ce does not match the first ce of the character, we 
1269    * bail.
1270    * Otherwise we try normalizing the first composite 
1271    * character and find the immediate decomposed character before the match to 
1272    * see if it is an non-ignorable accent.
1273    * Now normalizing the first composite character is enough because we ensure 
1274    * that when the match is passed in here with extra beginning ces, the 
1275    * first or last ce that match has to occur within the first character.
1276    * E.g. looking for ? acute in ? A ring above and acute, 
1277    * checkExtraMatchAccent should fail since there is a middle ring in ?
1278    * Note here that accents checking are slow and cautioned in the API docs.
1279    * @param start offset 
1280    * @param end offset
1281    * @return true if there are accents on either side of the match, 
1282    *         false otherwise
1283    */
1284    private final boolean hasAccentsBeforeMatch(int start, int end) 
1285    {
1286        if (m_pattern_.m_hasPrefixAccents_) {
1287            // we have been iterating forwards previously
1288            boolean ignorable = true;
1289            int firstce = m_pattern_.m_CE_[0];
1290            m_colEIter_.setExactOffset(start);
1291            int ce  = getCE(m_colEIter_.next());
1292            while (ce != firstce) {
1293                if (ce != CollationElementIterator.IGNORABLE) {
1294                    ignorable = false;
1295                }
1296                ce = getCE(m_colEIter_.next());
1297            }
1298            if (!ignorable && m_colEIter_.isInBuffer()) {
1299                // within normalization buffer, discontiguous handled here
1300                return true;
1301            }
1302    
1303            // within text
1304            boolean accent = (getFCD(targetText, start) >> SECOND_LAST_BYTE_SHIFT_)
1305                                                        != 0; 
1306            if (!accent) {
1307                return checkExtraMatchAccents(start, end);
1308            }
1309            if (!ignorable) {
1310                return true;
1311            }
1312            if (start > m_textBeginOffset_) {
1313                targetText.setIndex(start);
1314                targetText.previous();
1315                if ((getFCD(targetText, targetText.getIndex()) & LAST_BYTE_MASK_) 
1316                                                                        != 0) {
1317                    m_colEIter_.setExactOffset(start);
1318                    ce = m_colEIter_.previous();
1319                    if (ce != CollationElementIterator.NULLORDER 
1320                        && ce != CollationElementIterator.IGNORABLE) {
1321                        return true;
1322                    }
1323                }
1324            }
1325        }
1326      
1327        return false;
1328    }
1329    
1330    /**
1331     * Used by exact matches, checks if there are accents bounding the match.
1332     * Note this is the initial boundary check. If the potential match
1333     * starts or ends with composite characters, the accents in those
1334     * characters will be determined later.
1335     * Not doing backwards iteration here, since discontiguos contraction for 
1336     * backwards collation element iterator, use up too many characters.
1337     * E.g. looking for ? ring in ? A ring above and acute, 
1338     * should fail since there is a acute at the end of ?
1339     * Note here that accents checking are slow and cautioned in the API docs.
1340     * @param start offset of match
1341     * @param end end offset of the match
1342     * @return true if there are accents on either side of the match, 
1343     *         false otherwise
1344     */
1345    private final boolean hasAccentsAfterMatch(int start, int end) 
1346    {
1347        if (m_pattern_.m_hasSuffixAccents_) {
1348            targetText.setIndex(end);
1349            if (end > m_textBeginOffset_ 
1350                && UTF16.isTrailSurrogate(targetText.previous())) {
1351                if (targetText.getIndex() > m_textBeginOffset_ &&
1352                    !UTF16.isLeadSurrogate(targetText.previous())) {
1353                    targetText.next();
1354                }
1355            }
1356            if ((getFCD(targetText, targetText.getIndex()) & LAST_BYTE_MASK_) != 0) {
1357                int firstce  = m_pattern_.m_CE_[0];
1358                m_colEIter_.setExactOffset(start);
1359                while (getCE(m_colEIter_.next()) != firstce) {
1360                }
1361                int count = 1;
1362                while (count < m_pattern_.m_CELength_) {
1363                    if (getCE(m_colEIter_.next()) 
1364                        == CollationElementIterator.IGNORABLE) {
1365                        count --;
1366                    }
1367                    count ++;
1368                }
1369                int ce = getCE(m_colEIter_.next());
1370                if (ce != CollationElementIterator.NULLORDER 
1371                            && ce != CollationElementIterator.IGNORABLE) {
1372                    if (m_colEIter_.getOffset() <= end) {
1373                        return true;
1374                    }
1375                    if ((getFCD(targetText, end) >> SECOND_LAST_BYTE_SHIFT_) 
1376                        != 0) {
1377                        return true;
1378                    }
1379                }
1380            }
1381        }
1382        return false;
1383    }
1384    
1385    /**
1386    * Checks if the offset runs out of the text string range
1387    * @param textstart offset of the first character in the range
1388    * @param textlimit limit offset of the text string range
1389    * @param offset to test
1390    * @return true if offset is out of bounds, false otherwise
1391    */
1392    private static final boolean isOutOfBounds(int textstart, int textlimit, 
1393                                                int offset)
1394    {
1395        return offset < textstart || offset > textlimit;
1396    }
1397    
1398    /**
1399     * Checks for identical match
1400     * @param strsrch string search data
1401     * @param start offset of possible match
1402     * @param end offset of possible match
1403     * @return true if identical match is found
1404     */
1405    private final boolean checkIdentical(int start, int end) 
1406    {
1407        if (m_collator_.getStrength() != Collator.IDENTICAL) {
1408            return true;
1409        }
1410    
1411        String   textstr = getString(targetText, start, end - start);
1412        if (Normalizer.quickCheck(textstr, Normalizer.NFD,0) 
1413                                                    == Normalizer.NO) {
1414            textstr = Normalizer.decompose(textstr, false);
1415        }
1416        String   patternstr = m_pattern_.targetText;
1417        if (Normalizer.quickCheck(patternstr, Normalizer.NFD,0) 
1418                                                    == Normalizer.NO) {
1419            patternstr = Normalizer.decompose(patternstr, false);
1420        }
1421        return textstr.equals(patternstr);
1422    }
1423    
1424    /**
1425     * Checks to see if the match is repeated
1426     * @param start new match start index
1427     * @param limit new match limit index
1428     * @return true if the the match is repeated, false otherwise
1429     */
1430    private final boolean checkRepeatedMatch(int start, int limit)
1431    {
1432        if (m_matchedIndex_ == DONE) {
1433            return false;
1434        }
1435        int end = limit - 1; // last character in the match
1436        int lastmatchend = m_matchedIndex_ + matchLength - 1; 
1437        if (!isOverlapping()) {
1438            return (start >= m_matchedIndex_ && start <= lastmatchend) 
1439                    || (end >= m_matchedIndex_ && end <= lastmatchend)
1440                    || (start <= m_matchedIndex_ && end >= lastmatchend);
1441                      
1442        }
1443        return start <= m_matchedIndex_ && end >= lastmatchend;
1444    }
1445    
1446    /**
1447     * Checks match for contraction. 
1448     * If the match ends with a partial contraction we fail.
1449     * If the match starts too far off (because of backwards iteration) we try 
1450     * to chip off the extra characters depending on whether a breakiterator 
1451     * has been used.
1452     * Temporary utility buffer used to return modified start and end.
1453     * @param start offset of potential match, to be modified if necessary
1454     * @param end offset of potential match, to be modified if necessary
1455     * @return true if match passes the contraction test, false otherwise.
1456     */
1457    private final boolean checkNextExactContractionMatch(int start, int end) 
1458    {
1459        // This part checks if either ends of the match contains potential 
1460        // contraction. If so we'll have to iterate through them
1461        char endchar = 0;
1462        if (end < m_textLimitOffset_) {
1463            targetText.setIndex(end);
1464            endchar = targetText.current();
1465        }
1466        char poststartchar = 0;
1467        if (start + 1 < m_textLimitOffset_) {
1468            targetText.setIndex(start + 1);
1469            poststartchar = targetText.current();
1470        }
1471        if (m_collator_.isUnsafe(endchar) 
1472            || m_collator_.isUnsafe(poststartchar)) {
1473            // expansion prefix, what's left to iterate
1474            int bufferedCEOffset = m_colEIter_.m_CEBufferOffset_;
1475            boolean hasBufferedCE = bufferedCEOffset > 0;
1476            m_colEIter_.setExactOffset(start);
1477            int temp = start;
1478            while (bufferedCEOffset > 0) {
1479                // getting rid of the redundant ce, caused by setOffset.
1480                // since backward contraction/expansion may have extra ces if 
1481                // we are in the normalization buffer, hasAccentsBeforeMatch 
1482                // would have taken care of it.
1483                // E.g. the character \u01FA will have an expansion of 3, but 
1484                // if we are only looking for acute and ring \u030A and \u0301, 
1485                // we'll have to skip the first ce in the expansion buffer.
1486                m_colEIter_.next();
1487                if (m_colEIter_.getOffset() != temp) {
1488                    start = temp;
1489                    temp  = m_colEIter_.getOffset();
1490                }
1491                bufferedCEOffset --;
1492            }
1493    
1494            int count = 0;
1495            while (count < m_pattern_.m_CELength_) {
1496                int ce = getCE(m_colEIter_.next());
1497                if (ce == CollationElementIterator.IGNORABLE) {
1498                    continue;
1499                }
1500                if (hasBufferedCE && count == 0 
1501                    && m_colEIter_.getOffset() != temp) {
1502                    start = temp;
1503                    temp   = m_colEIter_.getOffset();
1504                }
1505                if (ce != m_pattern_.m_CE_[count]) {
1506                    end ++;
1507                    end = getNextBaseOffset(end);  
1508                    m_utilBuffer_[0] = start;
1509                    m_utilBuffer_[1] = end;
1510                    return false;
1511                }
1512                count ++;
1513            }
1514        } 
1515        m_utilBuffer_[0] = start;
1516        m_utilBuffer_[1] = end;
1517        return true;
1518    }
1519    
1520    
1521    /**
1522     * Checks and sets the match information if found.
1523     * Checks 
1524     * <ul>
1525     * <li> the potential match does not repeat the previous match
1526     * <li> boundaries are correct
1527     * <li> exact matches has no extra accents
1528     * <li> identical matchesb
1529     * <li> potential match does not end in the middle of a contraction
1530     * </ul>
1531     * Otherwise the offset will be shifted to the next character.
1532     * The result m_matchIndex_ and m_matchLength_ will be set to the truncated
1533     * more fitting result value.
1534     * Uses the temporary utility buffer for storing the modified textoffset.
1535     * @param textoffset offset in the collation element text.
1536     * @return true if the match is valid, false otherwise
1537     */
1538    private final boolean checkNextExactMatch(int textoffset)
1539    {
1540        int start = m_colEIter_.getOffset();        
1541        if (!checkNextExactContractionMatch(start, textoffset)) {
1542            // returns the modified textoffset
1543            m_utilBuffer_[0] = m_utilBuffer_[1];
1544            return false;
1545        }
1546    
1547        start = m_utilBuffer_[0];
1548        textoffset = m_utilBuffer_[1];
1549        // this totally matches, however we need to check if it is repeating
1550        if (!isBreakUnit(start, textoffset) 
1551            || checkRepeatedMatch(start, textoffset) 
1552            || hasAccentsBeforeMatch(start, textoffset) 
1553            || !checkIdentical(start, textoffset) 
1554            || hasAccentsAfterMatch(start, textoffset)) {
1555            textoffset ++;
1556            textoffset = getNextBaseOffset(textoffset);  
1557            m_utilBuffer_[0] = textoffset;
1558            return false;
1559        }
1560            
1561        // totally match, we will get rid of the ending ignorables.
1562        m_matchedIndex_  = start;
1563        matchLength = textoffset - start;
1564        return true;
1565    }
1566    
1567    /**
1568    * Getting the previous base character offset, or the current offset if the 
1569    * current character is a base character
1570    * @param text the source text to work on
1571    * @param textoffset one offset after the current character
1572    * @return the offset of the next character after the base character or the 
1573    *             first composed character with accents
1574    */
1575    private final int getPreviousBaseOffset(CharacterIterator   text, 
1576                                            int textoffset)
1577    {
1578        if (textoffset > m_textBeginOffset_) {
1579            while (true) {
1580                int result = textoffset;
1581                text.setIndex(result);
1582                if (UTF16.isTrailSurrogate(text.previous())) {
1583                    if (text.getIndex() != text.getBeginIndex() &&
1584                        !UTF16.isLeadSurrogate(text.previous())) {
1585                        text.next();
1586                    }
1587                }
1588                textoffset = text.getIndex();
1589                char fcd = getFCD(text, textoffset);
1590                if ((fcd >> SECOND_LAST_BYTE_SHIFT_) == 0) {
1591                    if ((fcd & LAST_BYTE_MASK_) != 0) {
1592                        return textoffset;
1593                    }
1594                    return result;
1595                }
1596                if (textoffset == m_textBeginOffset_) {
1597                    return m_textBeginOffset_;
1598                }
1599            }
1600        }
1601        return textoffset;
1602    }
1603    
1604    /**
1605    * Getting the indexes of the accents that are not blocked in the argument
1606    * accent array
1607    * @param accents accents in nfd.
1608    * @param accentsindex array to store the indexes of accents in accents that 
1609    *         are not blocked
1610    * @return the length of populated accentsindex
1611    */
1612    private int getUnblockedAccentIndex(StringBuffer   accents, 
1613                                        int accentsindex[])
1614    {
1615        int index = 0;
1616        int length = accents.length();
1617        int cclass = 0;
1618        int result = 0;
1619        while (index < length) {
1620            int codepoint = UTF16.charAt(accents, index);
1621            int tempclass = UCharacter.getCombiningClass(codepoint);
1622            if (tempclass != cclass) {
1623                cclass = tempclass;
1624                accentsindex[result] = index;
1625                result ++;
1626            }
1627            if (UCharacter.isSupplementary(codepoint)) {
1628                index += 2;
1629            }
1630            else {
1631                index ++;
1632            }
1633        }
1634        accentsindex[result] = length;
1635        return result;
1636    }
1637
1638    /**
1639     * Appends 3 StringBuffer/CharacterIterator together into a destination 
1640     * string buffer.
1641     * @param source1 string buffer
1642     * @param source2 character iterator
1643     * @param start2 start of the character iterator to merge
1644     * @param end2 end of the character iterator to merge
1645     * @param source3 string buffer
1646     * @return appended string buffer
1647     */
1648    private static final StringBuffer   merge(StringBuffer   source1, 
1649                                             CharacterIterator   source2,
1650                                             int start2, int end2,
1651                                             StringBuffer   source3) 
1652    {
1653        StringBuffer   result = new StringBuffer  ();    
1654        if (source1 != null && source1.length() != 0) {
1655            // jdk 1.3.1 does not have append(StringBuffer) yet
1656            if(com.ibm.icu.impl.ICUDebug.isJDK14OrHigher){
1657                result.append(source1);
1658            }else{
1659                result.append(source1.toString());
1660            }
1661        }
1662        source2.setIndex(start2);
1663        while (source2.getIndex() < end2) {
1664            result.append(source2.current());
1665            source2.next();
1666        }
1667        if (source3 != null && source3.length() != 0) {
1668            // jdk 1.3.1 does not have append(StringBuffer) yet
1669            if(com.ibm.icu.impl.ICUDebug.isJDK14OrHigher){
1670                result.append(source3);
1671            }else{
1672                result.append(source3.toString());
1673            }
1674        }
1675        return result;
1676    }
1677    
1678    /**
1679    * Running through a collation element iterator to see if the contents 
1680    * matches pattern in string search data
1681    * @param coleiter collation element iterator to test
1682    * @return true if a match if found, false otherwise
1683    */
1684    private final boolean checkCollationMatch(CollationElementIterator coleiter)
1685    {
1686        int patternceindex = m_pattern_.m_CELength_;
1687        int offset = 0;
1688        while (patternceindex > 0) {
1689            int ce = getCE(coleiter.next());
1690            if (ce == CollationElementIterator.IGNORABLE) {
1691                continue;
1692            }
1693            if (ce != m_pattern_.m_CE_[offset]) {
1694                return false;
1695            }
1696            offset ++;
1697            patternceindex --;
1698        }
1699        return true;
1700    }
1701    
1702    /**
1703     * Rearranges the front accents to try matching.
1704     * Prefix accents in the text will be grouped according to their combining 
1705     * class and the groups will be mixed and matched to try find the perfect 
1706     * match with the pattern.
1707     * So for instance looking for "?" in "???"
1708     * step 1: split "??" into 6 other type of potential accent 
1709     *            substrings "?", "?", "?", "??", 
1710     *            "??", "??".
1711     * step 2: check if any of the generated substrings matches the pattern.
1712     * Internal method, status is assumed to be success, caller has to check 
1713     * status before calling this method.
1714     * @param start first offset of the accents to start searching
1715     * @param end start of the last accent set
1716     * @return DONE if a match is not found, otherwise return the starting
1717     *         offset of the match. Note this start includes all preceding 
1718     *            accents.
1719     */
1720    private int doNextCanonicalPrefixMatch(int start, int end)
1721    {
1722        if ((getFCD(targetText, start) & LAST_BYTE_MASK_) == 0) {
1723            // die... failed at a base character
1724            return DONE;
1725        }
1726    
1727        start = targetText.getIndex(); // index changed by fcd
1728        int offset = getNextBaseOffset(targetText, start);
1729        start = getPreviousBaseOffset(start);
1730    
1731        StringBuffer   accents = new StringBuffer  ();
1732        String   accentstr = getString(targetText, start, offset - start);
1733        // normalizing the offensive string
1734        if (Normalizer.quickCheck(accentstr, Normalizer.NFD,0) 
1735                                                    == Normalizer.NO) {
1736            accentstr = Normalizer.decompose(accentstr, false);
1737        }
1738        accents.append(accentstr);
1739            
1740        int accentsindex[] = new int[INITIAL_ARRAY_SIZE_];      
1741        int accentsize = getUnblockedAccentIndex(accents, accentsindex);
1742        int count = (2 << (accentsize - 1)) - 1;  
1743        while (count > 0) {
1744            // copy the base characters
1745            m_canonicalPrefixAccents_.delete(0, 
1746                                        m_canonicalPrefixAccents_.length());
1747            int k = 0;
1748            for (; k < accentsindex[0]; k ++) {
1749                m_canonicalPrefixAccents_.append(accents.charAt(k));
1750            }
1751            // forming all possible canonical rearrangement by dropping
1752            // sets of accents
1753            for (int i = 0; i <= accentsize - 1; i ++) {
1754                int mask = 1 << (accentsize - i - 1);
1755                if ((count & mask) != 0) {
1756                    for (int j = accentsindex[i]; j < accentsindex[i + 1]; 
1757                                                                        j ++) {
1758                        m_canonicalPrefixAccents_.append(accents.charAt(j));
1759                    }
1760                }
1761            }
1762            StringBuffer   match = merge(m_canonicalPrefixAccents_,
1763                                       targetText, offset, end,
1764                                       m_canonicalSuffixAccents_);
1765                
1766            // if status is a failure, ucol_setText does nothing.
1767            // run the collator iterator through this match
1768            m_utilColEIter_.setText(match.toString());
1769            if (checkCollationMatch(m_utilColEIter_)) {
1770                 return start;
1771            }
1772            count --;
1773        }
1774        return DONE;
1775    }
1776
1777    /**
1778    * Gets the offset to the safe point in text before textoffset.
1779    * ie. not the middle of a contraction, swappable characters or 
1780    * supplementary characters.
1781    * @param start offset in string
1782    * @param textoffset offset in string
1783    * @return offset to the previous safe character
1784    */
1785    private final int getPreviousSafeOffset(int start, int textoffset)
1786    {
1787        int result = textoffset; // first contraction character
1788        targetText.setIndex(textoffset);
1789        while (result >= start && m_collator_.isUnsafe(targetText.previous())) {
1790            result = targetText.getIndex();
1791        }
1792        if (result != start) {
1793            // the first contraction character is consider unsafe here
1794            result = targetText.getIndex(); // originally result --;
1795        }
1796        return result; 
1797    }
1798
1799    /**
1800     * Take the rearranged end accents and tries matching. If match failed at
1801     * a seperate preceding set of accents (seperated from the rearranged on by
1802     * at least a base character) then we rearrange the preceding accents and 
1803     * tries matching again.
1804     * We allow skipping of the ends of the accent set if the ces do not match. 
1805     * However if the failure is found before the accent set, it fails.
1806     * Internal method, status assumed to be success, caller has to check 
1807     * status before calling this method.
1808     * @param textoffset of the start of the rearranged accent
1809     * @return DONE if a match is not found, otherwise return the starting
1810     *         offset of the match. Note this start includes all preceding 
1811     *         accents.
1812     */
1813    private int doNextCanonicalSuffixMatch(int textoffset)
1814    {
1815        int safelength = 0;
1816        StringBuffer   safetext;
1817        int safeoffset = m_textBeginOffset_; 
1818        
1819        if (textoffset != m_textBeginOffset_ 
1820            && m_canonicalSuffixAccents_.length() > 0
1821            && m_collator_.isUnsafe(m_canonicalSuffixAccents_.charAt(0))) {
1822            safeoffset     = getPreviousSafeOffset(m_textBeginOffset_, 
1823                                                    textoffset);
1824            safelength     = textoffset - safeoffset;
1825            safetext       = merge(null, targetText, safeoffset, textoffset, 
1826                                   m_canonicalSuffixAccents_);
1827        }
1828        else {
1829            safetext = m_canonicalSuffixAccents_;
1830        }
1831    
1832        // if status is a failure, ucol_setText does nothing
1833        CollationElementIterator coleiter = m_utilColEIter_;
1834        coleiter.setText(safetext.toString());
1835        // status checked in loop below
1836    
1837        int ceindex = m_pattern_.m_CELength_ - 1;
1838        boolean isSafe = true; // indication flag for position in safe zone
1839        
1840        while (ceindex >= 0) {
1841            int textce = coleiter.previous();
1842            if (textce == CollationElementIterator.NULLORDER) {
1843                // check if we have passed the safe buffer
1844                if (coleiter == m_colEIter_) {
1845                    return DONE;
1846                }
1847                coleiter = m_colEIter_;
1848                if (safetext != m_canonicalSuffixAccents_) {
1849                    safetext.delete(0, safetext.length());
1850                }
1851                coleiter.setExactOffset(safeoffset);
1852                // status checked at the start of the loop
1853                isSafe = false;
1854                continue;
1855            }
1856            textce = getCE(textce);
1857            if (textce != CollationElementIterator.IGNORABLE 
1858                && textce != m_pattern_.m_CE_[ceindex]) {
1859                // do the beginning stuff
1860                int failedoffset = coleiter.getOffset();
1861                if (isSafe && failedoffset >= safelength) {
1862                    // alas... no hope. failed at rearranged accent set
1863                    return DONE;
1864                }
1865                else {
1866                    if (isSafe) {
1867                        failedoffset += safeoffset;
1868                    }
1869                    
1870                    // try rearranging the front accents
1871                    int result = doNextCanonicalPrefixMatch(failedoffset, 
1872                                                            textoffset);
1873                    if (result != DONE) {
1874                        // if status is a failure, ucol_setOffset does nothing
1875                        m_colEIter_.setExactOffset(result);
1876                    }
1877                    return result;
1878                }
1879            }
1880            if (textce == m_pattern_.m_CE_[ceindex]) {
1881                ceindex --;
1882            }
1883        }
1884        // set offset here
1885        if (isSafe) {
1886            int result = coleiter.getOffset();
1887            // sets the text iterator with the correct expansion and offset
1888            int leftoverces = coleiter.m_CEBufferOffset_;
1889            if (result >= safelength) { 
1890                result = textoffset;
1891            }
1892            else {
1893                result += safeoffset;
1894            }
1895            m_colEIter_.setExactOffset(result);
1896            m_colEIter_.m_CEBufferOffset_ = leftoverces;
1897            return result;
1898        }
1899        
1900        return coleiter.getOffset();              
1901    }
1902    
1903    /**
1904     * Trying out the substring and sees if it can be a canonical match.
1905     * This will try normalizing the end accents and arranging them into 
1906     * canonical equivalents and check their corresponding ces with the pattern 
1907     * ce.
1908     * Suffix accents in the text will be grouped according to their combining 
1909     * class and the groups will be mixed and matched to try find the perfect 
1910     * match with the pattern.
1911     * So for instance looking for "?" in "???"
1912     * step 1: split "??" into 6 other type of potential accent 
1913     *         substrings
1914     *         "?", "?", "?", "??", "??", 
1915     *         "??".
1916     * step 2: check if any of the generated substrings matches the pattern.
1917     * @param textoffset end offset in the collation element text that ends with 
1918     *                   the accents to be rearranged
1919     * @return true if the match is valid, false otherwise
1920     */
1921    private boolean doNextCanonicalMatch(int textoffset)
1922    {
1923        int offset = m_colEIter_.getOffset();
1924        targetText.setIndex(textoffset);
1925        if (UTF16.isTrailSurrogate(targetText.previous()) 
1926            && targetText.getIndex() > m_textBeginOffset_) { 
1927            if (!UTF16.isLeadSurrogate(targetText.previous())) {
1928                targetText.next();
1929            }
1930        }
1931        if ((getFCD(targetText, targetText.getIndex()) & LAST_BYTE_MASK_) == 0) {
1932            if (m_pattern_.m_hasPrefixAccents_) {
1933                offset = doNextCanonicalPrefixMatch(offset, textoffset);
1934                if (offset != DONE) {
1935                    m_colEIter_.setExactOffset(offset);
1936                    return true;
1937                }
1938            }
1939            return false;
1940        }
1941    
1942        if (!m_pattern_.m_hasSuffixAccents_) {
1943            return false;
1944        }
1945    
1946        StringBuffer   accents = new StringBuffer  ();
1947        // offset to the last base character in substring to search
1948        int baseoffset = getPreviousBaseOffset(targetText, textoffset);
1949        // normalizing the offensive string
1950        String   accentstr = getString(targetText, baseoffset, 
1951                                     textoffset - baseoffset);
1952        if (Normalizer.quickCheck(accentstr, Normalizer.NFD,0) 
1953                                                    == Normalizer.NO) {
1954            accentstr = Normalizer.decompose(accentstr, false);
1955        }
1956        accents.append(accentstr);
1957        // status checked in loop below
1958            
1959        int accentsindex[] = new int[INITIAL_ARRAY_SIZE_];
1960        int size = getUnblockedAccentIndex(accents, accentsindex);
1961    
1962        // 2 power n - 1 plus the full set of accents
1963        int  count = (2 << (size - 1)) - 1;  
1964        while (count > 0) {
1965            m_canonicalSuffixAccents_.delete(0, 
1966                                           m_canonicalSuffixAccents_.length());
1967            // copy the base characters
1968            for (int k = 0; k < accentsindex[0]; k ++) {
1969                m_canonicalSuffixAccents_.append(accents.charAt(k));
1970            }
1971            // forming all possible canonical rearrangement by dropping
1972            // sets of accents
1973            for (int i = 0; i <= size - 1; i ++) {
1974                int mask = 1 << (size - i - 1);
1975                if ((count & mask) != 0) {
1976                    for (int j = accentsindex[i]; j < accentsindex[i + 1]; 
1977                        j ++) {
1978                        m_canonicalSuffixAccents_.append(accents.charAt(j));
1979                    }
1980                }
1981            }
1982            offset = doNextCanonicalSuffixMatch(baseoffset);
1983            if (offset != DONE) {
1984                return true; // match found
1985            }
1986            count --;
1987        }
1988        return false;
1989    }
1990    
1991    /**
1992     * Gets the previous base character offset depending on the string search 
1993     * pattern data
1994     * @param strsrch string search data
1995     * @param textoffset current offset, current character
1996     * @return the offset of the next character after this base character or 
1997     *             itself if it is a composed character with accents
1998     */
1999    private final int getPreviousBaseOffset(int textoffset)
2000    {
2001        if (m_pattern_.m_hasPrefixAccents_ && textoffset > m_textBeginOffset_) {
2002            int offset = textoffset;
2003            if ((getFCD(targetText, offset) >> SECOND_LAST_BYTE_SHIFT_) != 0) {
2004                return getPreviousBaseOffset(targetText, textoffset);
2005            }
2006        }
2007        return textoffset;
2008    }
2009    
2010    /**
2011     * Checks match for contraction. 
2012     * If the match ends with a partial contraction we fail.
2013     * If the match starts too far off (because of backwards iteration) we try 
2014     * to chip off the extra characters.
2015     * Uses the temporary util buffer for return values of the modified start
2016     * and end.
2017     * @param start offset of potential match, to be modified if necessary
2018     * @param end offset of potential match, to be modified if necessary
2019     * @return true if match passes the contraction test, false otherwise. 
2020     */
2021    private boolean checkNextCanonicalContractionMatch(int start, int end) 
2022    {
2023        // This part checks if either ends of the match contains potential 
2024        // contraction. If so we'll have to iterate through them
2025        char schar = 0;
2026        char echar = 0;
2027        if (end < m_textLimitOffset_) {
2028            targetText.setIndex(end);
2029            echar = targetText.current();
2030        }
2031        if (start < m_textLimitOffset_) {
2032            targetText.setIndex(start + 1);
2033            schar = targetText.current();
2034        }
2035        if (m_collator_.isUnsafe(echar) || m_collator_.isUnsafe(schar)) {
2036            int expansion  = m_colEIter_.m_CEBufferOffset_;
2037            boolean hasExpansion = expansion > 0;
2038            m_colEIter_.setExactOffset(start);
2039            int temp = start;
2040            while (expansion > 0) {
2041                // getting rid of the redundant ce, caused by setOffset.
2042                // since backward contraction/expansion may have extra ces if 
2043                // we are in the normalization buffer, hasAccentsBeforeMatch 
2044                // would have taken care of it.
2045                // E.g. the character \u01FA will have an expansion of 3, but 
2046                // if we are only looking for acute and ring \u030A and \u0301, 
2047                // we'll have to skip the first ce in the expansion buffer.
2048                m_colEIter_.next();
2049                if (m_colEIter_.getOffset() != temp) {
2050                    start = temp;
2051                    temp  = m_colEIter_.getOffset();
2052                }
2053                expansion --;
2054            }
2055    
2056            int count = 0;
2057            while (count < m_pattern_.m_CELength_) {
2058                int ce = getCE(m_colEIter_.next());
2059                // status checked below, note that if status is a failure
2060                // ucol_next returns UCOL_NULLORDER
2061                if (ce == CollationElementIterator.IGNORABLE) {
2062                    continue;
2063                }
2064                if (hasExpansion && count == 0 
2065                    && m_colEIter_.getOffset() != temp) {
2066                    start = temp;
2067                    temp = m_colEIter_.getOffset();
2068                }
2069    
2070                if (count == 0 && ce != m_pattern_.m_CE_[0]) {
2071                    // accents may have extra starting ces, this occurs when a 
2072                    // pure accent pattern is matched without rearrangement
2073                    // text \u0325\u0300 and looking for \u0300
2074                    int expected = m_pattern_.m_CE_[0]; 
2075                    if ((getFCD(targetText, start) & LAST_BYTE_MASK_) != 0) {
2076                        ce = getCE(m_colEIter_.next());
2077                        while (ce != expected 
2078                               && ce != CollationElementIterator.NULLORDER 
2079                               && m_colEIter_.getOffset() <= end) {
2080                            ce = getCE(m_colEIter_.next());
2081                        }
2082                    }
2083                }
2084                if (ce != m_pattern_.m_CE_[count]) {
2085                    end ++;
2086                    end = getNextBaseOffset(end);  
2087                    m_utilBuffer_[0] = start;
2088                    m_utilBuffer_[1] = end;
2089                    return false;
2090                }
2091                count ++;
2092            }
2093        } 
2094        m_utilBuffer_[0] = start;
2095        m_utilBuffer_[1] = end;
2096        return true;
2097    }
2098
2099    /**
2100     * Checks and sets the match information if found.
2101     * Checks 
2102     * <ul>
2103     * <li> the potential match does not repeat the previous match
2104     * <li> boundaries are correct
2105     * <li> potential match does not end in the middle of a contraction
2106     * <li> identical matches
2107     * </ul>
2108     * Otherwise the offset will be shifted to the next character.
2109     * The result m_matchIndex_ and m_matchLength_ will be set to the truncated
2110     * more fitting result value.
2111     * Uses the temporary utility buffer for storing the modified textoffset.
2112     * @param textoffset offset in the collation element text.
2113     * @return true if the match is valid, false otherwise
2114     */
2115    private boolean checkNextCanonicalMatch(int textoffset)
2116    {
2117        // to ensure that the start and ends are not composite characters
2118        // if we have a canonical accent match
2119        if ((m_pattern_.m_hasSuffixAccents_ 
2120                && m_canonicalSuffixAccents_.length() != 0) || 
2121            (m_pattern_.m_hasPrefixAccents_ 
2122                && m_canonicalPrefixAccents_.length() != 0)) {
2123            m_matchedIndex_ = getPreviousBaseOffset(m_colEIter_.getOffset());
2124            matchLength = textoffset - m_matchedIndex_;
2125            return true;
2126        }
2127    
2128        int start = m_colEIter_.getOffset();
2129        if (!checkNextCanonicalContractionMatch(start, textoffset)) {
2130            // return the modified textoffset
2131            m_utilBuffer_[0] = m_utilBuffer_[1]; 
2132            return false;
2133        }
2134        start = m_utilBuffer_[0];
2135        textoffset = m_utilBuffer_[1];
2136        start = getPreviousBaseOffset(start);
2137        // this totally matches, however we need to check if it is repeating
2138        if (checkRepeatedMatch(start, textoffset) 
2139            || !isBreakUnit(start, textoffset) 
2140            || !checkIdentical(start, textoffset)) {
2141            textoffset ++;
2142            textoffset = getNextBaseOffset(targetText, textoffset);
2143            m_utilBuffer_[0] = textoffset;
2144            return false;
2145        }
2146        
2147        m_matchedIndex_  = start;
2148        matchLength = textoffset - start;
2149        return true;
2150    }
2151    
2152    /**
2153     * Shifting the collation element iterator position forward to prepare for
2154     * a preceding match. If the first character is a unsafe character, we'll 
2155     * only shift by 1 to capture contractions, normalization etc.
2156     * @param textoffset start text position to do search
2157     * @param ce the text ce which failed the match.
2158     * @param patternceindex index of the ce within the pattern ce buffer which
2159     *        failed the match
2160     * @return final offset
2161     */
2162    private int reverseShift(int textoffset, int ce, int patternceindex)
2163    {         
2164        if (isOverlapping()) {
2165            if (textoffset != m_textLimitOffset_) {
2166                textoffset --;
2167            }
2168            else {
2169                textoffset -= m_pattern_.m_defaultShiftSize_;
2170            }
2171        }
2172        else {
2173            if (ce != CollationElementIterator.NULLORDER) {
2174                int shift = m_pattern_.m_backShift_[hash(ce)];
2175                
2176                // this is to adjust for characters in the middle of the substring 
2177                // for matching that failed.
2178                int adjust = patternceindex;
2179                if (adjust > 1 && shift > adjust) {
2180                    shift -= adjust - 1;
2181                }
2182                textoffset -= shift;
2183            }
2184            else {
2185                textoffset -= m_pattern_.m_defaultShiftSize_;
2186            }
2187        }    
2188        
2189        textoffset = getPreviousBaseOffset(textoffset);
2190        return textoffset;
2191    }
2192
2193    /**
2194     * Checks match for contraction. 
2195     * If the match starts with a partial contraction we fail.
2196     * Uses the temporary utility buffer to return the modified start and end.
2197     * @param start offset of potential match, to be modified if necessary
2198     * @param end offset of potential match, to be modified if necessary
2199     * @return true if match passes the contraction test, false otherwise.
2200     */
2201    private boolean checkPreviousExactContractionMatch(int start, int end) 
2202    {
2203        // This part checks if either ends of the match contains potential 
2204        // contraction. If so we'll have to iterate through them
2205        char echar = 0;
2206        if (end < m_textLimitOffset_) {
2207            targetText.setIndex(end);
2208            echar = targetText.current();
2209        }
2210        char schar = 0;
2211        if (start + 1 < m_textLimitOffset_) {
2212            targetText.setIndex(start + 1);
2213            schar = targetText.current();
2214        }
2215        if (m_collator_.isUnsafe(echar) || m_collator_.isUnsafe(schar)) {
2216            // expansion suffix, what's left to iterate
2217            int expansion = m_colEIter_.m_CEBufferSize_ 
2218                                            - m_colEIter_.m_CEBufferOffset_;
2219            boolean hasExpansion = expansion > 0;
2220            m_colEIter_.setExactOffset(end);
2221            int temp = end;
2222            while (expansion > 0) {
2223                // getting rid of the redundant ce
2224                // since forward contraction/expansion may have extra ces
2225                // if we are in the normalization buffer, hasAccentsBeforeMatch
2226                // would have taken care of it.
2227                // E.g. the character \u01FA will have an expansion of 3, but if
2228                // we are only looking for A ring A\u030A, we'll have to skip the 
2229                // last ce in the expansion buffer
2230                m_colEIter_.previous();
2231                if (m_colEIter_.getOffset() != temp) {
2232                    end = temp;
2233                    temp = m_colEIter_.getOffset();
2234                }
2235                expansion --;
2236            }
2237    
2238            int count = m_pattern_.m_CELength_;
2239            while (count > 0) {
2240                int ce = getCE(m_colEIter_.previous());
2241                // status checked below, note that if status is a failure
2242                // ucol_previous returns UCOL_NULLORDER
2243                if (ce == CollationElementIterator.IGNORABLE) {
2244                    continue;
2245                }
2246                if (hasExpansion && count == 0 
2247                    && m_colEIter_.getOffset() != temp) {
2248                    end = temp;
2249                    temp = m_colEIter_.getOffset();
2250                }
2251                if (ce != m_pattern_.m_CE_[count - 1]) {
2252                    start --;
2253                    start = getPreviousBaseOffset(targetText, start);
2254                    m_utilBuffer_[0] = start;
2255                    m_utilBuffer_[1] = end;
2256                    return false;
2257                }
2258                count --;
2259            }
2260        } 
2261        m_utilBuffer_[0] = start;
2262        m_utilBuffer_[1] = end;
2263        return true;
2264    }
2265    
2266    /**
2267     * Checks and sets the match information if found.
2268     * Checks 
2269     * <ul>
2270     * <li> the current match does not repeat the last match
2271     * <li> boundaries are correct
2272     * <li> exact matches has no extra accents
2273     * <li> identical matches
2274     * </ul>
2275     * Otherwise the offset will be shifted to the preceding character.
2276     * Uses the temporary utility buffer to store the modified textoffset.
2277     * @param textoffset offset in the collation element text. the returned value
2278     *        will be the truncated start offset of the match or the new start 
2279     *        search offset.
2280     * @return true if the match is valid, false otherwise
2281     */
2282    private final boolean checkPreviousExactMatch(int textoffset)
2283    {
2284        // to ensure that the start and ends are not composite characters
2285        int end = m_colEIter_.getOffset();        
2286        if (!checkPreviousExactContractionMatch(textoffset, end)) {
2287            return false;
2288        }
2289        textoffset = m_utilBuffer_[0];
2290        end = m_utilBuffer_[1];
2291            
2292        // this totally matches, however we need to check if it is repeating
2293        // the old match
2294        if (checkRepeatedMatch(textoffset, end) 
2295            || !isBreakUnit(textoffset, end) 
2296            || hasAccentsBeforeMatch(textoffset, end) 
2297            || !checkIdentical(textoffset, end) 
2298            || hasAccentsAfterMatch(textoffset, end)) {
2299            textoffset --;
2300            textoffset = getPreviousBaseOffset(targetText, textoffset);
2301            m_utilBuffer_[0] = textoffset;
2302            return false;
2303        }
2304        m_matchedIndex_ = textoffset;
2305        matchLength = end - textoffset;
2306        return true;
2307    }
2308
2309    /**
2310     * Rearranges the end accents to try matching.
2311     * Suffix accents in the text will be grouped according to their combining 
2312     * class and the groups will be mixed and matched to try find the perfect 
2313     * match with the pattern.
2314     * So for instance looking for "?" in "???"
2315     * step 1: split "??" into 6 other type of potential accent 
2316     *             substrings
2317     *         "?", "?", "?", "??", "??", 
2318     *         "??".
2319     * step 2: check if any of the generated substrings matches the pattern.
2320     * @param start offset of the first base character
2321     * @param end start of the last accent set
2322     * @return DONE if a match is not found, otherwise return the ending
2323     *         offset of the match. Note this start includes all following 
2324     *         accents.
2325     */
2326    private int doPreviousCanonicalSuffixMatch(int start, int end)
2327    {
2328        targetText.setIndex(end);
2329        if (UTF16.isTrailSurrogate(targetText.previous()) 
2330            && targetText.getIndex() > m_textBeginOffset_) {
2331            if (!UTF16.isLeadSurrogate(targetText.previous())) {
2332                targetText.next();
2333            } 
2334        }
2335        if ((getFCD(targetText, targetText.getIndex()) & LAST_BYTE_MASK_) == 0) {
2336            // die... failed at a base character
2337            return DONE;
2338        }
2339        end = getNextBaseOffset(targetText, end);
2340    
2341        StringBuffer   accents = new StringBuffer  ();
2342        int offset = getPreviousBaseOffset(targetText, end);
2343        // normalizing the offensive string
2344        String   accentstr = getString(targetText, offset, end - offset);
2345        if (Normalizer.quickCheck(accentstr, Normalizer.NFD,0) 
2346                                                    == Normalizer.NO) {
2347            accentstr = Normalizer.decompose(accentstr, false);
2348        }
2349        accents.append(accentstr);    
2350            
2351        int accentsindex[] = new int[INITIAL_ARRAY_SIZE_];      
2352        int accentsize = getUnblockedAccentIndex(accents, accentsindex);
2353        int count = (2 << (accentsize - 1)) - 1;  
2354        while (count > 0) {
2355            m_canonicalSuffixAccents_.delete(0, 
2356                                           m_canonicalSuffixAccents_.length());
2357            // copy the base characters
2358            for (int k = 0; k < accentsindex[0]; k ++) {
2359                 m_canonicalSuffixAccents_.append(accents.charAt(k));
2360            }
2361            // forming all possible canonical rearrangement by dropping
2362            // sets of accents
2363            for (int i = 0; i <= accentsize - 1; i ++) {
2364                int mask = 1 << (accentsize - i - 1);
2365                if ((count & mask) != 0) {
2366                    for (int j = accentsindex[i]; j < accentsindex[i + 1]; 
2367                                                                        j ++) {
2368                        m_canonicalSuffixAccents_.append(accents.charAt(j));
2369                    }
2370                }
2371            }
2372            StringBuffer   match = merge(m_canonicalPrefixAccents_, targetText,
2373                                        start, offset, 
2374                                        m_canonicalSuffixAccents_);
2375            // run the collator iterator through this match
2376            // if status is a failure ucol_setText does nothing
2377            m_utilColEIter_.setText(match.toString());
2378            if (checkCollationMatch(m_utilColEIter_)) {
2379                return end;
2380            }
2381            count --;
2382        }
2383        return DONE;
2384    }
2385    
2386    /**
2387     * Take the rearranged start accents and tries matching. If match failed at
2388     * a seperate following set of accents (seperated from the rearranged on by
2389     * at least a base character) then we rearrange the preceding accents and 
2390     * tries matching again.
2391     * We allow skipping of the ends of the accent set if the ces do not match. 
2392     * However if the failure is found before the accent set, it fails.
2393     * Internal method, status assumed to be success, caller has to check 
2394     * status before calling this method.
2395     * @param textoffset of the ends of the rearranged accent
2396     * @return DONE if a match is not found, otherwise return the ending offset 
2397     *             of the match. Note this start includes all following accents.
2398     */
2399    private int doPreviousCanonicalPrefixMatch(int textoffset)
2400    {
2401       // int safelength = 0;
2402        StringBuffer   safetext;
2403        int safeoffset = textoffset;
2404    
2405        if (textoffset > m_textBeginOffset_
2406            && m_collator_.isUnsafe(m_canonicalPrefixAccents_.charAt(
2407                                    m_canonicalPrefixAccents_.length() - 1))) {
2408            safeoffset = getNextSafeOffset(textoffset, m_textLimitOffset_);
2409            //safelength = safeoffset - textoffset;
2410            safetext = merge(m_canonicalPrefixAccents_, targetText, textoffset, 
2411                             safeoffset, null);
2412        }
2413        else {
2414            safetext = m_canonicalPrefixAccents_;
2415        }
2416    
2417        // if status is a failure, ucol_setText does nothing
2418        CollationElementIterator coleiter = m_utilColEIter_;
2419        coleiter.setText(safetext.toString());
2420        // status checked in loop below
2421        
2422        int ceindex = 0;
2423        boolean isSafe = true; // safe zone indication flag for position
2424        int prefixlength = m_canonicalPrefixAccents_.length();
2425        
2426        while (ceindex < m_pattern_.m_CELength_) {
2427            int textce = coleiter.next();
2428            if (textce == CollationElementIterator.NULLORDER) {
2429                // check if we have passed the safe buffer
2430                if (coleiter == m_colEIter_) {
2431                    return DONE;
2432                }
2433                if (safetext != m_canonicalPrefixAccents_) {
2434                    safetext.delete(0, safetext.length());
2435                }
2436                coleiter = m_colEIter_;
2437                coleiter.setExactOffset(safeoffset);
2438                // status checked at the start of the loop
2439                isSafe = false;
2440                continue;
2441            }
2442            textce = getCE(textce);
2443            if (textce != CollationElementIterator.IGNORABLE 
2444                && textce != m_pattern_.m_CE_[ceindex]) {
2445                // do the beginning stuff
2446                int failedoffset = coleiter.getOffset();
2447                if (isSafe && failedoffset <= prefixlength) {
2448                    // alas... no hope. failed at rearranged accent set
2449                    return DONE;
2450                }
2451                else {
2452                    if (isSafe) {
2453                        failedoffset = safeoffset - failedoffset;
2454                        if (safetext != m_canonicalPrefixAccents_) {
2455                            safetext.delete(0, safetext.length());
2456                        }
2457                    }
2458                    
2459                    // try rearranging the end accents
2460                    int result = doPreviousCanonicalSuffixMatch(textoffset, 
2461                                                                failedoffset);
2462                    if (result != DONE) {
2463                        // if status is a failure, ucol_setOffset does nothing
2464                        m_colEIter_.setExactOffset(result);
2465                    }
2466                    return result;
2467                }
2468            }
2469            if (textce == m_pattern_.m_CE_[ceindex]) {
2470                ceindex ++;
2471            }
2472        }
2473        // set offset here
2474        if (isSafe) {
2475            int result = coleiter.getOffset();
2476            // sets the text iterator here with the correct expansion and offset
2477            int leftoverces = coleiter.m_CEBufferSize_ 
2478                                                - coleiter.m_CEBufferOffset_;
2479            if (result <= prefixlength) { 
2480                result = textoffset;
2481            }
2482            else {
2483                result = textoffset + (safeoffset - result);
2484            }
2485            m_colEIter_.setExactOffset(result);
2486            m_colEIter_.m_CEBufferOffset_ = m_colEIter_.m_CEBufferSize_ 
2487                                                                - leftoverces;
2488            return result;
2489        }
2490        
2491        return coleiter.getOffset();              
2492    }
2493    
2494    /**
2495     * Trying out the substring and sees if it can be a canonical match.
2496     * This will try normalizing the starting accents and arranging them into 
2497     * canonical equivalents and check their corresponding ces with the pattern 
2498     * ce.
2499     * Prefix accents in the text will be grouped according to their combining 
2500     * class and the groups will be mixed and matched to try find the perfect 
2501     * match with the pattern.
2502     * So for instance looking for "?" in "???"
2503     * step 1: split "??" into 6 other type of potential accent 
2504     *            substrings
2505     *         "?", "?", "?", "??", "??", 
2506     *         "??".
2507     * step 2: check if any of the generated substrings matches the pattern.
2508     * @param textoffset start offset in the collation element text that starts 
2509     *                   with the accents to be rearranged
2510     * @return true if the match is valid, false otherwise
2511     */
2512    private boolean doPreviousCanonicalMatch(int textoffset)
2513    {
2514        int offset = m_colEIter_.getOffset();
2515        if ((getFCD(targetText, textoffset) >> SECOND_LAST_BYTE_SHIFT_) == 0) {
2516            if (m_pattern_.m_hasSuffixAccents_) {
2517                offset = doPreviousCanonicalSuffixMatch(textoffset, offset);
2518                if (offset != DONE) {
2519                    m_colEIter_.setExactOffset(offset);
2520                    return true;
2521                }
2522            }
2523            return false;
2524        }
2525    
2526        if (!m_pattern_.m_hasPrefixAccents_) {
2527            return false;
2528        }
2529    
2530        StringBuffer   accents = new StringBuffer  ();
2531        // offset to the last base character in substring to search
2532        int baseoffset = getNextBaseOffset(targetText, textoffset);
2533        // normalizing the offensive string
2534        String   textstr = getString(targetText, textoffset, 
2535                                                    baseoffset - textoffset);
2536        if (Normalizer.quickCheck(textstr, Normalizer.NFD,0) 
2537                                                    == Normalizer.NO) {
2538            textstr = Normalizer.decompose(textstr, false);
2539        }
2540        accents.append(textstr);
2541        // status checked in loop
2542            
2543        int accentsindex[] = new int[INITIAL_ARRAY_SIZE_];
2544        int size = getUnblockedAccentIndex(accents, accentsindex);
2545    
2546        // 2 power n - 1 plus the full set of accents
2547        int count = (2 << (size - 1)) - 1;  
2548        while (count > 0) {
2549            m_canonicalPrefixAccents_.delete(0, 
2550                                        m_canonicalPrefixAccents_.length());
2551            // copy the base characters
2552            for (int k = 0; k < accentsindex[0]; k ++) {
2553                m_canonicalPrefixAccents_.append(accents.charAt(k));
2554            }
2555            // forming all possible canonical rearrangement by dropping
2556            // sets of accents
2557            for (int i = 0; i <= size - 1; i ++) {
2558                int mask = 1 << (size - i - 1);
2559                if ((count & mask) != 0) {
2560                    for (int j = accentsindex[i]; j < accentsindex[i + 1]; 
2561                         j ++) {
2562                        m_canonicalPrefixAccents_.append(accents.charAt(j));
2563                    }
2564                }
2565            }
2566            offset = doPreviousCanonicalPrefixMatch(baseoffset);
2567            if (offset != DONE) {
2568                return true; // match found
2569            }
2570            count --;
2571        }
2572        return false;
2573    }
2574    
2575    /**
2576     * Checks match for contraction. 
2577     * If the match starts with a partial contraction we fail.
2578     * Uses the temporary utility buffer to return the modified start and end.
2579     * @param start offset of potential match, to be modified if necessary
2580     * @param end offset of potential match, to be modified if necessary
2581     * @return true if match passes the contraction test, false otherwise.
2582     */
2583    private boolean checkPreviousCanonicalContractionMatch(int start, int end) 
2584    {
2585        int temp = end;
2586        // This part checks if either ends of the match contains potential 
2587        // contraction. If so we'll have to iterate through them
2588        char echar = 0;
2589        char schar = 0;
2590        if (end < m_textLimitOffset_) {
2591            targetText.setIndex(end);
2592            echar = targetText.current();
2593        }
2594        if (start + 1 < m_textLimitOffset_) {
2595            targetText.setIndex(start + 1);
2596            schar = targetText.current();
2597        }
2598        if (m_collator_.isUnsafe(echar) || m_collator_.isUnsafe(schar)) {
2599            int expansion = m_colEIter_.m_CEBufferSize_ 
2600                                            - m_colEIter_.m_CEBufferOffset_;
2601            boolean hasExpansion = expansion > 0;
2602            m_colEIter_.setExactOffset(end);
2603            while (expansion > 0) {
2604                // getting rid of the redundant ce
2605                // since forward contraction/expansion may have extra ces
2606                // if we are in the normalization buffer, hasAccentsBeforeMatch
2607                // would have taken care of it.
2608                // E.g. the character \u01FA will have an expansion of 3, but 
2609                // if we are only looking for A ring A\u030A, we'll have to 
2610                // skip the last ce in the expansion buffer
2611                m_colEIter_.previous();
2612                if (m_colEIter_.getOffset() != temp) {
2613                    end = temp;
2614                    temp = m_colEIter_.getOffset();
2615                }
2616                expansion --;
2617            }
2618    
2619            int count = m_pattern_.m_CELength_;
2620            while (count > 0) {
2621                int ce = getCE(m_colEIter_.previous());
2622                // status checked below, note that if status is a failure
2623                // previous() returns NULLORDER
2624                if (ce == CollationElementIterator.IGNORABLE) {
2625                    continue;
2626                }
2627                if (hasExpansion && count == 0 
2628                    && m_colEIter_.getOffset() != temp) {
2629                    end = temp;
2630                    temp = m_colEIter_.getOffset();
2631                }
2632                if (count == m_pattern_.m_CELength_ 
2633                    && ce != m_pattern_.m_CE_[m_pattern_.m_CELength_ - 1]) {
2634                    // accents may have extra starting ces, this occurs when a 
2635                    // pure accent pattern is matched without rearrangement
2636                    int expected = m_pattern_.m_CE_[m_pattern_.m_CELength_ - 1];
2637                    targetText.setIndex(end);
2638                    if (UTF16.isTrailSurrogate(targetText.previous())) {
2639                        if (targetText.getIndex() > m_textBeginOffset_ &&
2640                            !UTF16.isLeadSurrogate(targetText.previous())) {
2641                            targetText.next();
2642                        }
2643                    }
2644                    end = targetText.getIndex();
2645                    if ((getFCD(targetText, end) & LAST_BYTE_MASK_) != 0) {
2646                        ce = getCE(m_colEIter_.previous());
2647                        while (ce != expected 
2648                                && ce != CollationElementIterator.NULLORDER 
2649                                && m_colEIter_.getOffset() <= start) {
2650                            ce = getCE(m_colEIter_.previous());
2651                        }
2652                    }
2653                }
2654                if (ce != m_pattern_.m_CE_[count - 1]) {
2655                    start --;
2656                    start = getPreviousBaseOffset(start);
2657                    m_utilBuffer_[0] = start;
2658                    m_utilBuffer_[1] = end;
2659                    return false;
2660                }
2661                count --;
2662            }
2663        } 
2664        m_utilBuffer_[0] = start;
2665        m_utilBuffer_[1] = end;
2666        return true;
2667    }
2668    
2669    /**
2670     * Checks and sets the match information if found.
2671     * Checks 
2672     * <ul>
2673     * <li> the potential match does not repeat the previous match
2674     * <li> boundaries are correct
2675     * <li> potential match does not end in the middle of a contraction
2676     * <li> identical matches
2677     * </ul>
2678     * Otherwise the offset will be shifted to the next character.
2679     * Uses the temporary utility buffer for storing the modified textoffset.
2680     * @param textoffset offset in the collation element text. the returned 
2681     *             value will be the truncated start offset of the match or the 
2682     *             new start search offset.
2683     * @return true if the match is valid, false otherwise
2684     */
2685    private boolean checkPreviousCanonicalMatch(int textoffset)
2686    {
2687        // to ensure that the start and ends are not composite characters
2688        // if we have a canonical accent match
2689        if (m_pattern_.m_hasSuffixAccents_ 
2690            && m_canonicalSuffixAccents_.length() != 0 
2691            || m_pattern_.m_hasPrefixAccents_ 
2692            && m_canonicalPrefixAccents_.length() != 0) {
2693            m_matchedIndex_ = textoffset;
2694            matchLength = getNextBaseOffset(m_colEIter_.getOffset()) 
2695                                                                - textoffset;
2696            return true;
2697        }
2698    
2699        int end = m_colEIter_.getOffset();
2700        if (!checkPreviousCanonicalContractionMatch(textoffset, end)) {
2701            // storing the modified textoffset
2702            return false;
2703        }
2704        textoffset = m_utilBuffer_[0];
2705        end = m_utilBuffer_[1];
2706        end = getNextBaseOffset(end);
2707        // this totally matches, however we need to check if it is repeating
2708        if (checkRepeatedMatch(textoffset, end) 
2709            || !isBreakUnit(textoffset, end) 
2710            || !checkIdentical(textoffset, end)) {
2711            textoffset --;
2712            textoffset = getPreviousBaseOffset(textoffset);
2713            m_utilBuffer_[0] = textoffset;
2714            return false;
2715        }
2716        
2717        m_matchedIndex_ = textoffset;
2718        matchLength = end - textoffset;
2719        return true;
2720    }
2721    
2722    /**
2723     * Method that does the next exact match
2724     * @param start the offset to start shifting from and performing the 
2725     *        next exact match
2726     */
2727    private void handleNextExact(int start)
2728    {
2729        int textoffset = shiftForward(start, 
2730                                         CollationElementIterator.NULLORDER,
2731                                         m_pattern_.m_CELength_);
2732        int targetce = CollationElementIterator.IGNORABLE;
2733        while (textoffset <= m_textLimitOffset_) {
2734            m_colEIter_.setExactOffset(textoffset);
2735            int patternceindex = m_pattern_.m_CELength_ - 1;
2736            boolean found = false;
2737            int lastce = CollationElementIterator.NULLORDER;
2738            
2739            while (true) {
2740                // finding the last pattern ce match, imagine composite 
2741                // characters. for example: search for pattern A in text \u00C0
2742                // we'll have to skip \u0300 the grave first before we get to A
2743                targetce = m_colEIter_.previous();
2744                if (targetce == CollationElementIterator.NULLORDER) {
2745                    found = false;
2746                    break;
2747                }
2748                targetce = getCE(targetce);
2749                if (targetce == CollationElementIterator.IGNORABLE && 
2750                    m_colEIter_.isInBuffer()) { 
2751                    // this is for the text \u0315\u0300 that requires 
2752                    // normalization and pattern \u0300, where \u0315 is ignorable
2753                    continue;
2754                }
2755                if (lastce == CollationElementIterator.NULLORDER 
2756                    || lastce == CollationElementIterator.IGNORABLE) {
2757                    lastce = targetce;
2758                }
2759                if (targetce == m_pattern_.m_CE_[patternceindex]) {
2760                    // the first ce can be a contraction
2761                    found = true;
2762                    break;
2763                }
2764                if (m_colEIter_.m_CEBufferOffset_ <= 0) {
2765                    found = false;
2766                    break;
2767                }
2768            }
2769    
2770            while (found && patternceindex > 0) {
2771                targetce = m_colEIter_.previous();
2772                if (targetce == CollationElementIterator.NULLORDER) {
2773                    found = false;
2774                    break;
2775                }
2776                targetce = getCE(targetce);
2777                if (targetce == CollationElementIterator.IGNORABLE) {
2778                    continue;
2779                }
2780    
2781                patternceindex --;
2782                found = found && targetce == m_pattern_.m_CE_[patternceindex]; 
2783            }
2784    
2785            if (!found) {
2786                textoffset = shiftForward(textoffset, lastce, patternceindex);
2787                // status checked at loop.
2788                patternceindex = m_pattern_.m_CELength_;
2789                continue;
2790            }
2791            
2792            if (checkNextExactMatch(textoffset)) {
2793                // status checked in ucol_setOffset
2794                return;
2795            }
2796            textoffset = m_utilBuffer_[0];
2797        }
2798        setMatchNotFound();
2799    }
2800
2801    /**
2802     * Method that does the next canonical match
2803     * @param start the offset to start shifting from and performing the 
2804     *        next canonical match
2805     */
2806    private void handleNextCanonical(int start)
2807    {
2808        boolean hasPatternAccents = 
2809           m_pattern_.m_hasSuffixAccents_ || m_pattern_.m_hasPrefixAccents_;
2810              
2811        // shifting it check for setting offset
2812        // if setOffset is called previously or there was no previous match, we
2813        // leave the offset as it is.
2814        int textoffset = shiftForward(start, CollationElementIterator.NULLORDER, 
2815                                        m_pattern_.m_CELength_);
2816        m_canonicalPrefixAccents_.delete(0, m_canonicalPrefixAccents_.length());
2817        m_canonicalSuffixAccents_.delete(0, m_canonicalSuffixAccents_.length());
2818        int targetce = CollationElementIterator.IGNORABLE;
2819        
2820        while (textoffset <= m_textLimitOffset_)
2821        {
2822            m_colEIter_.setExactOffset(textoffset);
2823            int patternceindex = m_pattern_.m_CELength_ - 1;
2824            boolean found = false;
2825            int lastce = CollationElementIterator.NULLORDER;
2826            
2827            while (true) {
2828                // finding the last pattern ce match, imagine composite characters
2829                // for example: search for pattern A in text \u00C0
2830                // we'll have to skip \u0300 the grave first before we get to A
2831                targetce = m_colEIter_.previous();
2832                if (targetce == CollationElementIterator.NULLORDER) {
2833                    found = false;
2834                    break;
2835                }
2836                targetce = getCE(targetce);
2837                if (lastce == CollationElementIterator.NULLORDER 
2838                            || lastce == CollationElementIterator.IGNORABLE) {
2839                    lastce = targetce;
2840                }
2841                if (targetce == m_pattern_.m_CE_[patternceindex]) {
2842                    // the first ce can be a contraction
2843                    found = true;
2844                    break;
2845                }
2846                if (m_colEIter_.m_CEBufferOffset_ <= 0) {
2847                    found = false;
2848                    break;
2849                }
2850            }
2851            
2852            while (found && patternceindex > 0) {
2853                targetce    = m_colEIter_.previous();
2854                if (targetce == CollationElementIterator.NULLORDER) {
2855                    found = false;
2856                    break;
2857                }
2858                targetce    = getCE(targetce);
2859                if (targetce == CollationElementIterator.IGNORABLE) {
2860                    continue;
2861                }
2862    
2863                patternceindex --;
2864                found = found && targetce == m_pattern_.m_CE_[patternceindex]; 
2865            }
2866    
2867            // initializing the rearranged accent array
2868            if (hasPatternAccents && !found) {
2869                found = doNextCanonicalMatch(textoffset);
2870            }
2871    
2872            if (!found) {
2873                textoffset = shiftForward(textoffset, lastce, patternceindex);
2874                // status checked at loop
2875                patternceindex = m_pattern_.m_CELength_;
2876                continue;
2877            }
2878            
2879            if (checkNextCanonicalMatch(textoffset)) {
2880                return;
2881            }
2882            textoffset = m_utilBuffer_[0];
2883        }
2884        setMatchNotFound();
2885    }
2886    
2887    /**
2888     * Method that does the previous exact match
2889     * @param start the offset to start shifting from and performing the 
2890     *        previous exact match
2891     */
2892    private void handlePreviousExact(int start)
2893    {
2894        int textoffset = reverseShift(start, CollationElementIterator.NULLORDER, 
2895                                      m_pattern_.m_CELength_);
2896        while (textoffset >= m_textBeginOffset_)
2897        {
2898            m_colEIter_.setExactOffset(textoffset);
2899            int patternceindex = 1;
2900            int targetce = CollationElementIterator.IGNORABLE;
2901            boolean found = false;
2902            int firstce = CollationElementIterator.NULLORDER;
2903            
2904            while (true) {
2905                // finding the first pattern ce match, imagine composite 
2906                // characters. for example: search for pattern \u0300 in text 
2907                // \u00C0, we'll have to skip A first before we get to 
2908                // \u0300 the grave accent
2909                targetce = m_colEIter_.next();
2910                if (targetce == CollationElementIterator.NULLORDER) {
2911                    found = false;
2912                    break;
2913                }
2914                targetce = getCE(targetce);
2915                if (firstce == CollationElementIterator.NULLORDER 
2916                    || firstce == CollationElementIterator.IGNORABLE) {
2917                    firstce = targetce;
2918                }
2919                if (targetce == CollationElementIterator.IGNORABLE) {
2920                    continue;
2921                }         
2922                if (targetce == m_pattern_.m_CE_[0]) {
2923                    found = true;
2924                    break;
2925                }
2926                if (m_colEIter_.m_CEBufferOffset_ == -1 
2927                    || m_colEIter_.m_CEBufferOffset_ 
2928                                            == m_colEIter_.m_CEBufferSize_) {
2929                    // checking for accents in composite character
2930                    found = false;
2931                    break;
2932                }
2933            }
2934    
2935            targetce = firstce;
2936            
2937            while (found && patternceindex < m_pattern_.m_CELength_) {
2938                targetce = m_colEIter_.next();
2939                if (targetce == CollationElementIterator.NULLORDER) {
2940                    found = false;
2941                    break;
2942                }
2943                targetce = getCE(targetce);
2944                if (targetce == CollationElementIterator.IGNORABLE) {
2945                    continue;
2946                }
2947    
2948                found = found && targetce == m_pattern_.m_CE_[patternceindex]; 
2949                patternceindex ++;
2950            }
2951    
2952            if (!found) {
2953                textoffset = reverseShift(textoffset, targetce, patternceindex);
2954                patternceindex = 0;
2955                continue;
2956            }
2957            
2958            if (checkPreviousExactMatch(textoffset)) {
2959                return;
2960            }
2961            textoffset = m_utilBuffer_[0];
2962        }
2963        setMatchNotFound();
2964    }
2965    
2966    /**
2967     * Method that does the previous canonical match
2968     * @param start the offset to start shifting from and performing the 
2969     *        previous canonical match
2970     */
2971    private void handlePreviousCanonical(int start)
2972    {
2973        boolean hasPatternAccents = 
2974           m_pattern_.m_hasSuffixAccents_ || m_pattern_.m_hasPrefixAccents_;
2975              
2976        // shifting it check for setting offset
2977        // if setOffset is called previously or there was no previous match, we
2978        // leave the offset as it is.
2979        int textoffset = reverseShift(start, CollationElementIterator.NULLORDER, 
2980                                          m_pattern_.m_CELength_);
2981        m_canonicalPrefixAccents_.delete(0, m_canonicalPrefixAccents_.length());
2982        m_canonicalSuffixAccents_.delete(0, m_canonicalSuffixAccents_.length());
2983        
2984        while (textoffset >= m_textBeginOffset_)
2985        {
2986            m_colEIter_.setExactOffset(textoffset);
2987            int patternceindex = 1;
2988            int targetce = CollationElementIterator.IGNORABLE;
2989            boolean found = false;
2990            int firstce = CollationElementIterator.NULLORDER;
2991            
2992            while (true) {
2993                // finding the first pattern ce match, imagine composite 
2994                // characters. for example: search for pattern \u0300 in text 
2995                // \u00C0, we'll have to skip A first before we get to 
2996                // \u0300 the grave accent
2997                targetce = m_colEIter_.next();
2998                if (targetce == CollationElementIterator.NULLORDER) {
2999                    found = false;
3000                    break;
3001                }
3002                targetce = getCE(targetce);
3003                if (firstce == CollationElementIterator.NULLORDER 
3004                    || firstce == CollationElementIterator.IGNORABLE) {
3005                    firstce = targetce;
3006                }
3007                
3008                if (targetce == m_pattern_.m_CE_[0]) {
3009                    // the first ce can be a contraction
3010                    found = true;
3011                    break;
3012                }
3013                if (m_colEIter_.m_CEBufferOffset_ == -1 
3014                    || m_colEIter_.m_CEBufferOffset_ 
3015                                            == m_colEIter_.m_CEBufferSize_) {
3016                    // checking for accents in composite character
3017                    found = false;
3018                    break;
3019                }
3020            }
3021    
3022            targetce = firstce;
3023            
3024            while (found && patternceindex < m_pattern_.m_CELength_) {
3025                targetce = m_colEIter_.next();
3026                if (targetce == CollationElementIterator.NULLORDER) {
3027                    found = false;
3028                    break;
3029                }
3030                targetce = getCE(targetce);
3031                if (targetce == CollationElementIterator.IGNORABLE) {
3032                    continue;
3033                }
3034    
3035                found = found && targetce == m_pattern_.m_CE_[patternceindex]; 
3036                patternceindex ++;
3037            }
3038    
3039            // initializing the rearranged accent array
3040            if (hasPatternAccents && !found) {
3041                found = doPreviousCanonicalMatch(textoffset);
3042            }
3043    
3044            if (!found) {
3045                textoffset = reverseShift(textoffset, targetce, patternceindex);
3046                patternceindex = 0;
3047                continue;
3048            }
3049    
3050            if (checkPreviousCanonicalMatch(textoffset)) {
3051                return;
3052            }
3053            textoffset = m_utilBuffer_[0];
3054        }
3055        setMatchNotFound();
3056    }
3057    
3058    /**
3059     * Gets a substring out of a CharacterIterator
3060     * @param text CharacterIterator
3061     * @param start start offset
3062     * @param length of substring
3063     * @return substring from text starting at start and length length
3064     */
3065    private static final String   getString(CharacterIterator   text, int start,
3066                                            int length)
3067    {
3068        StringBuffer   result = new StringBuffer  (length);
3069        int offset = text.getIndex();
3070        text.setIndex(start);
3071        for (int i = 0; i < length; i ++) {
3072            result.append(text.current());
3073            text.next();
3074        }
3075        text.setIndex(offset);
3076        return result.toString();
3077    }
3078    
3079    /**
3080     * Getting the mask for collation strength
3081     * @param strength collation strength
3082      * @return collation element mask
3083     */
3084    private static final int getMask(int strength) 
3085    {
3086        switch (strength) 
3087        {
3088            case Collator.PRIMARY:
3089                return RuleBasedCollator.CE_PRIMARY_MASK_;
3090            case Collator.SECONDARY:
3091                return RuleBasedCollator.CE_SECONDARY_MASK_ 
3092                       | RuleBasedCollator.CE_PRIMARY_MASK_;
3093            default:
3094                return RuleBasedCollator.CE_TERTIARY_MASK_ 
3095                       | RuleBasedCollator.CE_SECONDARY_MASK_ 
3096                       | RuleBasedCollator.CE_PRIMARY_MASK_;
3097        }
3098    }
3099    
3100    /**
3101     * Sets match not found 
3102     */
3103    private void setMatchNotFound() 
3104    {
3105        // this method resets the match result regardless of the error status.
3106        m_matchedIndex_ = DONE;
3107        setMatchLength(0);
3108    }
3109}
3110
A to Z: JavaDoc & Examples Daily Java News & Articles Open Source Projects Open Source Codes Free Computer Books Remove Frame
Popular Tags