ContainerScorer


1   /*
2    * Copyright (C) 2005 Alfresco, Inc.
3    *
4    * Licensed under the Mozilla Public License version 1.1 
5    * with a permitted attribution clause. You may obtain a
6    * copy of the License at
7    *
8    *   http://www.alfresco.org/legal/license.txt
9    *
10   * Unless required by applicable law or agreed to in writing,
11   * software distributed under the License is distributed on an
12   * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
13   * either express or implied. See the License for the specific
14   * language governing permissions and limitations under the
15   * License.
16   */
17  package org.alfresco.repo.search.impl.lucene.query;
18  
19  import java.io.IOException  ;
20  
21  import org.apache.lucene.index.TermPositions;
22  import org.apache.lucene.search.Explanation;
23  import org.apache.lucene.search.Scorer;
24  import org.apache.lucene.search.Similarity;
25  import org.apache.lucene.search.Weight;
26  
27  /**
28   * The scorer for structured field queries.
29   * 
30   * A document either matches or it does not, there for the frequency is reported
31   * as 0.0f or 1.0.
32   * 
33   * 
34   * 
35   * @author andyh
36   */
37  public class ContainerScorer extends Scorer
38  {
39      // Unused
40      Weight weight;
41  
42      // Positions of documents with multiple structure elements
43      // e.g have mutiple paths, multiple categories or multiples entries in the
44      // same category
45      TermPositions root;
46  
47      // The Field positions that describe the structure we are trying to match
48      StructuredFieldPosition[] positions;
49  
50      // Unused at the moment
51      byte[] norms;
52  
53      // The minium document found so far
54      int min = 0;
55  
56      // The max document found so far
57      int max = 0;
58  
59      // The next root doc
60      // -1 and it has gone off the end
61      int rootDoc = 0;
62  
63      // Are there potentially more documents
64      boolean more = true;
65  
66      // The frequency of the terms in the doc (0.0f or 1.0f)
67      float freq = 0.0f;
68  
69      // A term position to find all container entries (there is no better way of finding the set of rquired containers)
70      private TermPositions containers;
71  
72      /**
73       * The arguments here follow the same pattern as used by the PhraseQuery.
74       * (It has the same unused arguments)
75       * 
76       * @param weight -
77       *            curently unsued
78       * @param tps -
79       *            the term positions for the terms we are trying to find
80       * @param root -
81       *            the term positions for documents with multiple entries - this
82       *            may be null, or contain no matches - it specifies those things
83       *            that appear under multiple categories etc.
84       * @param positions -
85       *            the structured field positions - where terms should appear
86       * @param similarity -
87       *            used in the abstract scorer implementation
88       * @param norms -
89       *            unused
90       */
91      public ContainerScorer(Weight weight, TermPositions root, StructuredFieldPosition[] positions, TermPositions containers, Similarity similarity, byte[] norms)
92      {
93          super(similarity);
94          this.weight = weight;
95          this.positions = positions;
96          this.norms = norms;
97          this.root = root;
98          this.containers = containers;
99      }
100 
101     /*
102      * (non-Javadoc)
103      * 
104      * @see org.apache.lucene.search.Scorer#next()
105      */
106     public boolean next() throws IOException  
107     {
108         // If there is no filtering
109         if (allContainers())
110         {
111             // containers and roots must be in sync or the index is broken
112             while (more)
113             {
114                 if (containers.next() && root.next())
115                 {
116                     if (check(0, root.nextPosition()))
117                     {
118                         return true;
119                     }
120                 }
121                 else
122                 {
123                     more = false;
124                     return false;
125                 }
126             }
127         }
128 
129         if (!more)
130         {
131             // One of the search terms has no more docuements
132             return false;
133         }
134 
135         if (max == 0)
136         {
137             // We need to initialise
138             // Just do a next on all terms and check if the first doc matches
139             doNextOnAll();
140             if (found())
141             {
142                 return true;
143             }
144             // drop through to the normal find sequence
145         }
146 
147         return findNext();
148     }
149 
150     /**
151      * Are we looking for all containers?
152      * If there are no positions we must have a better filter
153      *  
154      * @return
155      */
156     private boolean allContainers()
157     {
158         if (positions.length == 0)
159         {
160             return true;
161         }
162         for (StructuredFieldPosition sfp : positions)
163         {
164             if (sfp.getCachingTermPositions() != null)
165             {
166                 return false;
167             }
168         }
169         return true;
170     }
171 
172     /**
173      * @return
174      * @throws IOException
175      */
176     private boolean findNext() throws IOException  
177     {
178         // Move to the next document
179 
180         while (more)
181         {
182             move(); // may set more to false
183             if (found())
184             {
185                 return true;
186             }
187         }
188 
189         // If we get here we must have no more documents
190         return false;
191     }
192 
193     /**
194      * Check if we have found a match
195      * 
196      * @return
197      * @throws IOException
198      */
199 
200     private boolean found() throws IOException  
201     {
202         // No predicate test if there are no positions
203         if (positions.length == 0)
204         {
205             return true;
206         }
207 
208         // no more documents - no match
209         if (!more)
210         {
211             return false;
212         }
213 
214         // min and max must point to the same document
215         if (min != max)
216         {
217             return false;
218         }
219 
220         if (rootDoc != max)
221         {
222             return false;
223         }
224 
225         // We have duplicate entries - suport should be improved but it is not used at the moment
226         // This shuld work akin to the leaf scorer 
227         // It would compact the index
228         // The match must be in a known term range
229         int count = root.freq();
230         int start = 0;
231         int end = -1;
232         for (int i = 0; i < count; i++)
233         {
234             if (i == 0)
235             {
236                 // First starts at zero
237                 start = 0;
238                 end = root.nextPosition() ;
239             }
240             else
241             {
242                 start = end + 1;
243                 end = root.nextPosition() ;
244             }
245 
246             if (check(start, end))
247             {
248                 return true;
249             }
250         }
251 
252         // We had checks to do and they all failed.
253         return false;
254     }
255 
256     /*
257      * We have all documents at the same state. Now we check the positions of
258      * the terms.
259      */
260 
261     private boolean check(int start, int end) throws IOException  
262     {
263         int offset = checkTail(start, end, 0, 0);
264         // Last match may fail
265         if (offset == -1)
266         {
267             return false;
268         }
269         else
270         {
271             // Check non // ending patterns end at the end of the available pattern
272             if (positions[positions.length - 1].isTerminal())
273             {
274                 return ((offset+1) == end);
275             }
276             else
277             {
278                 return true;
279             }
280         }
281     }
282 
283     /**
284      * For // type pattern matches we need to test patterns of variable greedyness.
285      *
286      * 
287      * @param start
288      * @param end
289      * @param currentPosition
290      * @param currentOffset
291      * @return
292      * @throws IOException
293      */
294     private int checkTail(int start, int end, int currentPosition, int currentOffset) throws IOException  
295     {
296         int offset = currentOffset;
297         for (int i = currentPosition, l = positions.length; i < l; i++)
298         {
299             offset = positions[i].matches(start, end, offset);
300             if (offset == -1)
301             {
302                 return -1;
303             }
304             if (positions[i].isDescendant())
305             {
306                 for (int j = offset; j < end; j++)
307                 {
308                     int newOffset = checkTail(start, end, i + 1, j);
309                     if (newOffset != -1)
310                     {
311                         return newOffset;
312                     }
313                 }
314                 return -1;
315             }
316         }
317         return offset;
318     }
319 
320     /*
321      * Move to the next position to consider for a match test
322      */
323 
324     private void move() throws IOException  
325     {
326         if (min == max)
327         {
328             // If we were at a match just do next on all terms
329             // They all must move on
330             doNextOnAll();
331         }
332         else
333         {
334             // We are in a range - try and skip to the max position on all terms
335             // Only some need to move on - some may move past the current max and set a new target
336             skipToMax();
337         }
338     }
339 
340     /*
341      * Go through all the term positions and try and move to next document. Any
342      * failure measn we have no more.
343      * 
344      * This can be used at initialisation and when moving away from an existing
345      * match.
346      * 
347      * This will set min, max, more and rootDoc
348      * 
349      */
350     private void doNextOnAll() throws IOException  
351     {
352         // Do the terms
353         int current;
354         boolean first = true;
355         for (int i = 0, l = positions.length; i < l; i++)
356         {
357             if (positions[i].getCachingTermPositions() != null)
358             {
359                 if (positions[i].getCachingTermPositions().next())
360 
361                 {
362                     current = positions[i].getCachingTermPositions().doc();
363                     adjustMinMax(current, first);
364                     first = false;
365                 }
366                 else
367                 {
368                     more = false;
369                     return;
370                 }
371             }
372         }
373 
374         // Do the root term - it must always exists as the path could well have mutiple entries
375         // If an entry in the index does not have a root terminal it is broken
376         if (root.next())
377         {
378             rootDoc = root.doc();
379         }
380         else
381         {
382             more = false;
383             return;
384         }
385         if (root.doc() < max)
386         {
387             if (root.skipTo(max))
388             {
389                 rootDoc = root.doc();
390             }
391             else
392             {
393                 more = false;
394                 return;
395             }
396         }
397     }
398 
399     /*
400      * Try and skip all those term positions at documents less than the current
401      * max up to value. This is quite likely to fail and leave us with (min !=
402      * max) but that is OK, we try again.
403      * 
404      * It is possible that max increases as we process terms, this is OK. We
405      * just failed to skip to a given value of max and start doing the next.
406      */
407     private void skipToMax() throws IOException  
408     {
409         // Do the terms
410         int current;
411         for (int i = 0, l = positions.length; i < l; i++)
412         {
413             if (i == 0)
414             {
415                 min = max;
416             }
417             if (positions[i].getCachingTermPositions() != null)
418             {
419                 if (positions[i].getCachingTermPositions().doc() < max)
420                 {
421                     if (positions[i].getCachingTermPositions().skipTo(max))
422                     {
423                         current = positions[i].getCachingTermPositions().doc();
424                         adjustMinMax(current, false);
425                     }
426                     else
427                     {
428                         more = false;
429                         return;
430                     }
431                 }
432             }
433         }
434 
435         // Do the root
436         if (root.doc() < max)
437         {
438             if (root.skipTo(max))
439             {
440                 rootDoc = root.doc();
441             }
442             else
443             {
444                 more = false;
445                 return;
446             }
447         }
448     }
449 
450     /*
451      * Adjust the min and max values Convenience boolean to set or adjust the
452      * minimum.
453      */
454     private void adjustMinMax(int doc, boolean setMin)
455     {
456 
457         if (max < doc)
458         {
459             max = doc;
460         }
461 
462         if (setMin)
463         {
464             min = doc;
465         }
466         else if (min > doc)
467         {
468             min = doc;
469         }
470     }
471 
472     /*
473      * (non-Javadoc)
474      * 
475      * @see org.apache.lucene.search.Scorer#doc()
476      */
477     public int doc()
478     {
479         if (allContainers())
480         {
481             return containers.doc();
482         }
483         return max;
484     }
485 
486     /*
487      * (non-Javadoc)
488      * 
489      * @see org.apache.lucene.search.Scorer#score()
490      */
491     public float score() throws IOException  
492     {
493         return 1.0f;
494     }
495 
496     /*
497      * (non-Javadoc)
498      * 
499      * @see org.apache.lucene.search.Scorer#skipTo(int)
500      */
501     public boolean skipTo(int target) throws IOException  
502     {
503         if (allContainers())
504         {
505             containers.skipTo(target);
506             root.skipTo(containers.doc()); // must match
507             if (check(0, root.nextPosition()))
508             {
509                 return true;
510             }
511             while (more)
512             {
513                 if (containers.next() && root.next())
514                 {
515                     if (check(0, root.nextPosition()))
516                     {
517                         return true;
518                     }
519                 }
520                 else
521                 {
522                     more = false;
523                     return false;
524                 }
525             }
526         }
527 
528         max = target;
529         return findNext();
530     }
531 
532     /*
533      * (non-Javadoc)
534      * 
535      * @see org.apache.lucene.search.Scorer#explain(int)
536      */
537     public Explanation explain(int doc) throws IOException  
538     {
539         // TODO: Work out what a proper explanation would be here? 
540         Explanation tfExplanation = new Explanation();
541 
542         while (next() && doc() < doc)
543         {
544         }
545 
546         float phraseFreq = (doc() == doc) ? freq : 0.0f;
547         tfExplanation.setValue(getSimilarity().tf(phraseFreq));
548         tfExplanation.setDescription("tf(phraseFreq=" + phraseFreq + ")");
549 
550         return tfExplanation;
551     }
552 
553 }
A to Z: JavaDoc & Examples Daily Java News & Articles Open Source Projects Open Source Codes Free Computer Books Remove Frame
Popular Tags