DFAContentModel


1   /*
2    * The Apache Software License, Version 1.1
3    *
4    *
5    * Copyright (c) 1999-2002 The Apache Software Foundation.  All rights 
6    * reserved.
7    *
8    * Redistribution and use in source and binary forms, with or without
9    * modification, are permitted provided that the following conditions
10   * are met:
11   *
12   * 1. Redistributions of source code must retain the above copyright
13   *    notice, this list of conditions and the following disclaimer. 
14   *
15   * 2. Redistributions in binary form must reproduce the above copyright
16   *    notice, this list of conditions and the following disclaimer in
17   *    the documentation and/or other materials provided with the
18   *    distribution.
19   *
20   * 3. The end-user documentation included with the redistribution,
21   *    if any, must include the following acknowledgment:  
22   *       "This product includes software developed by the
23   *        Apache Software Foundation (http://www.apache.org/)."
24   *    Alternately, this acknowledgment may appear in the software itself,
25   *    if and wherever such third-party acknowledgments normally appear.
26   *
27   * 4. The names "Xerces" and "Apache Software Foundation" must
28   *    not be used to endorse or promote products derived from this
29   *    software without prior written permission. For written 
30   *    permission, please contact apache@apache.org.
31   *
32   * 5. Products derived from this software may not be called "Apache",
33   *    nor may "Apache" appear in their name, without prior written
34   *    permission of the Apache Software Foundation.
35   *
36   * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
37   * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
38   * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
39   * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
40   * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
41   * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
42   * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
43   * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
44   * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
45   * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
46   * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
47   * SUCH DAMAGE.
48   * ====================================================================
49   *
50   * This software consists of voluntary contributions made by many
51   * individuals on behalf of the Apache Software Foundation and was
52   * originally based on software copyright (c) 1999, International
53   * Business Machines, Inc., http://www.apache.org.  For more
54   * information on the Apache Software Foundation, please see
55   * <http://www.apache.org/>.
56   */
57  
58  package com.sun.org.apache.xerces.internal.impl.dtd.models;
59  
60  import com.sun.org.apache.xerces.internal.xni.QName;
61  import com.sun.org.apache.xerces.internal.impl.dtd.XMLContentSpec;
62  
63  /**
64   * @version $Id: DFAContentModel.java,v 1.4 2002/05/29 17:59:37 neilg Exp $
65   *
66   * DFAContentModel is the derivative of ContentModel that does
67   * all of the non-trivial element content validation. This class does 
68   * the conversion from the regular expression to the DFA that 
69   * it then uses in its validation algorithm.
70   * <p>
71   * <b>Note:</b> Upstream work insures that this class will never see
72   * a content model with PCDATA in it. Any model with PCDATA is 'mixed' 
73   * and is handled via the MixedContentModel class since mixed models 
74   * are very constrained in form and easily handled via a special case. 
75   * This also makes implementation of this class much easier.
76   *
77   */
78  public class DFAContentModel
79      implements ContentModelValidator {
80  
81      //
82      // Constants
83      //
84      // special strings
85  
86      /** Epsilon string. */
87      private static String   fEpsilonString = "<<CMNODE_EPSILON>>";
88  
89      /** End-of-content string. */
90      private static String   fEOCString = "<<CMNODE_EOC>>";
91  
92      /** initializing static members **/
93      static {
94          fEpsilonString = fEpsilonString.intern();
95          fEOCString = fEOCString.intern();
96      }
97  
98      // debugging
99  
100     /** Set to true to debug content model validation. */
101     private static final boolean DEBUG_VALIDATE_CONTENT = false;
102 
103     //
104     // Data
105     //
106 
107     /* this is the EquivClassComparator object */
108     //private EquivClassComparator comparator = null;
109 
110     /**
111      * This is the map of unique input symbol elements to indices into
112      * each state's per-input symbol transition table entry. This is part
113      * of the built DFA information that must be kept around to do the
114      * actual validation.
115      */
116     private QName fElemMap[] = null;
117 
118     /**
119      * This is a map of whether the element map contains information 
120      * related to ANY models.
121      */
122     private int fElemMapType[] = null;
123 
124     /** The element map size. */
125     private int fElemMapSize = 0;
126 
127     /** Boolean to distinguish Schema Mixed Content */
128     private boolean fMixed;
129 
130     /**
131      * The NFA position of the special EOC (end of content) node. This
132      * is saved away since it's used during the DFA build.
133      */
134     private int fEOCPos = 0;
135 
136 
137     /**
138      * This is an array of booleans, one per state (there are
139      * fTransTableSize states in the DFA) that indicates whether that
140      * state is a final state.
141      */
142     private boolean fFinalStateFlags[] = null;
143 
144     /**
145      * The list of follow positions for each NFA position (i.e. for each
146      * non-epsilon leaf node.) This is only used during the building of
147      * the DFA, and is let go afterwards.
148      */
149     private CMStateSet fFollowList[] = null;
150 
151     /**
152      * This is the head node of our intermediate representation. It is
153      * only non-null during the building of the DFA (just so that it
154      * does not have to be passed all around.) Once the DFA is built,
155      * this is no longer required so its nulled out.
156      */
157     private CMNode fHeadNode = null;
158 
159     /**
160      * The count of leaf nodes. This is an important number that set some
161      * limits on the sizes of data structures in the DFA process.
162      */
163     private int fLeafCount = 0;
164 
165     /**
166      * An array of non-epsilon leaf nodes, which is used during the DFA
167      * build operation, then dropped.
168      */
169     private CMLeaf fLeafList[] = null;
170 
171     /** Array mapping ANY types to the leaf list. */
172     private int fLeafListType[] = null;
173 
174     //private ContentLeafNameTypeVector fLeafNameTypeVector = null;
175 
176     /**
177      * The string pool of our parser session. This is set during construction
178      * and kept around.
179      */
180     //private StringPool fStringPool = null;
181 
182     /**
183      * This is the transition table that is the main by product of all
184      * of the effort here. It is an array of arrays of ints. The first
185      * dimension is the number of states we end up with in the DFA. The
186      * second dimensions is the number of unique elements in the content
187      * model (fElemMapSize). Each entry in the second dimension indicates
188      * the new state given that input for the first dimension's start
189      * state.
190      * <p>
191      * The fElemMap array handles mapping from element indexes to
192      * positions in the second dimension of the transition table.
193      */
194     private int fTransTable[][] = null;
195 
196     /**
197      * The number of valid entries in the transition table, and in the other
198      * related tables such as fFinalStateFlags.
199      */
200     private int fTransTableSize = 0;
201 
202     /**
203      * Flag that indicates that even though we have a "complicated"
204      * content model, it is valid to have no content. In other words,
205      * all parts of the content model are optional. For example:
206      * <pre>
207      *      &lt;!ELEMENT AllOptional (Optional*,NotRequired?)&gt;
208      * </pre>
209      */
210     private boolean fEmptyContentIsValid = false;
211 
212     // temp variables
213 
214     /** Temporary qualified name. */
215     private QName fQName = new QName();
216 
217     //
218     // Constructors
219     //
220 
221 
222     //
223     // Constructors
224     //
225 
226     /**
227      * Constructs a DFA content model.
228      *
229      * @param syntaxTree    The syntax tree of the content model.
230      * @param leafCount     The number of leaves.
231      * @param dtd           if it is created for a DTDGrammar.
232      *
233      */
234 
235     public DFAContentModel(CMNode syntaxTree, int leafCount, boolean mixed) {
236         // Store away our index and pools in members
237         //fStringPool = stringPool;
238         fLeafCount = leafCount;
239 
240 
241         // this is for Schema Mixed Content
242         fMixed = mixed;
243 
244         //
245         //  Ok, so lets grind through the building of the DFA. This method
246         //  handles the high level logic of the algorithm, but it uses a
247         //  number of helper classes to do its thing.
248         //
249         //  In order to avoid having hundreds of references to the error and
250         //  string handlers around, this guy and all of his helper classes
251         //  just throw a simple exception and we then pass it along.
252         //
253         buildDFA(syntaxTree);
254     }
255 
256     //
257     // ContentModelValidator methods
258     //
259 
260     /**
261      * Check that the specified content is valid according to this
262      * content model. This method can also be called to do 'what if' 
263      * testing of content models just to see if they would be valid.
264      * <p>
265      * A value of -1 in the children array indicates a PCDATA node. All other 
266      * indexes will be positive and represent child elements. The count can be
267      * zero, since some elements have the EMPTY content model and that must be 
268      * confirmed.
269      *
270      * @param children The children of this element.  Each integer is an index within
271      *                 the <code>StringPool</code> of the child element name.  An index
272      *                 of -1 is used to indicate an occurrence of non-whitespace character
273      *                 data.
274      * @param offset Offset into the array where the children starts.
275      * @param length The number of entries in the <code>children</code> array.
276      *
277      * @return The value -1 if fully valid, else the 0 based index of the child
278      *         that first failed. If the value returned is equal to the number
279      *         of children, then the specified children are valid but additional
280      *         content is required to reach a valid ending state.
281      *
282      */
283     public int validate(QName[] children, int offset, int length) {
284 
285         if (DEBUG_VALIDATE_CONTENT) 
286             System.out.println("DFAContentModel#validateContent");
287 
288         //
289         // A DFA content model must *always* have at least 1 child
290         // so a failure is given if no children present.
291         // 
292         // Defect 782: This is an incorrect statement because a DFA
293         // content model is also used for constructions such as:
294         //
295         //     (Optional*,NotRequired?)
296         //
297         // where a perfectly valid content would be NO CHILDREN.
298         // Therefore, if there are no children, we must check to
299         // see if the CMNODE_EOC marker is a valid start state! -Ac
300         //
301         if (length == 0) {
302             if (DEBUG_VALIDATE_CONTENT) {
303                 System.out.println("!!! no children");
304                 System.out.println("elemMap="+fElemMap);
305                 for (int i = 0; i < fElemMap.length; i++) {
306                     String   uri = fElemMap[i].uri;
307                     String   localpart = fElemMap[i].localpart;
308                     
309                     System.out.println("fElemMap["+i+"]="+uri+","+
310                                        localpart+" ("+
311                                        uri+", "+
312                                        localpart+
313                                        ')');
314                                        
315                 }
316                 System.out.println("EOCIndex="+fEOCString);
317             }
318 
319             return fEmptyContentIsValid ? -1 : 0;
320 
321         } // if child count == 0
322 
323         //
324         //  Lets loop through the children in the array and move our way
325         //  through the states. Note that we use the fElemMap array to map
326         //  an element index to a state index.
327         //
328         int curState = 0;
329         for (int childIndex = 0; childIndex < length; childIndex++)
330         {
331             // Get the current element index out
332             final QName curElem = children[offset + childIndex];
333             // ignore mixed text
334             if (fMixed && curElem.localpart == null) {
335                 continue;
336             }
337 
338             // Look up this child in our element map
339             int elemIndex = 0;
340             for (; elemIndex < fElemMapSize; elemIndex++)
341             {
342                 int type = fElemMapType[elemIndex] & 0x0f ;
343                 if (type == XMLContentSpec.CONTENTSPECNODE_LEAF) {
344                     //System.out.println("fElemMap["+elemIndex+"]: "+fElemMap[elemIndex]);
345                     if (fElemMap[elemIndex].rawname == curElem.rawname) {
346                         break;
347                     }
348                 }
349                 else if (type == XMLContentSpec.CONTENTSPECNODE_ANY) {
350                     String   uri = fElemMap[elemIndex].uri;
351                     if (uri == null || uri == curElem.uri) {
352                         break;
353                     }
354                 }
355                 else if (type == XMLContentSpec.CONTENTSPECNODE_ANY_LOCAL) {
356                     if (curElem.uri == null) {
357                         break;
358                     }
359                 }
360                 else if (type == XMLContentSpec.CONTENTSPECNODE_ANY_OTHER) {
361                     if (fElemMap[elemIndex].uri != curElem.uri) {
362                         break;
363                     }
364                 }
365             }
366 
367             // If we didn't find it, then obviously not valid
368             if (elemIndex == fElemMapSize) {
369                 if (DEBUG_VALIDATE_CONTENT) {
370                     System.out.println("!!! didn't find it");
371 
372                     System.out.println("curElem : " +curElem );
373                     for (int i=0; i<fElemMapSize; i++) {
374                         System.out.println("fElemMap["+i+"] = " +fElemMap[i] );
375                         System.out.println("fElemMapType["+i+"] = " +fElemMapType[i] );
376                     }
377                 }
378 
379                 return childIndex;
380             }
381 
382             //
383             //  Look up the next state for this input symbol when in the
384             //  current state.
385             //
386             curState = fTransTable[curState][elemIndex];
387 
388             // If its not a legal transition, then invalid
389             if (curState == -1) {
390                 if (DEBUG_VALIDATE_CONTENT) 
391                     System.out.println("!!! not a legal transition");
392                 return childIndex;
393             }
394         }
395 
396         //
397         //  We transitioned all the way through the input list. However, that
398         //  does not mean that we ended in a final state. So check whether
399         //  our ending state is a final state.
400         //
401         if (DEBUG_VALIDATE_CONTENT) 
402             System.out.println("curState="+curState+", childCount="+length);
403         if (!fFinalStateFlags[curState])
404             return length;
405 
406         // success!
407         return -1;
408     } // validate
409 
410 
411     //
412     // Private methods
413     //
414 
415     /** 
416      * Builds the internal DFA transition table from the given syntax tree.
417      *
418      * @param syntaxTree The syntax tree.
419      *
420      * @exception CMException Thrown if DFA cannot be built.
421      */
422     private void buildDFA(CMNode syntaxTree) 
423     {
424         //
425         //  The first step we need to take is to rewrite the content model
426         //  using our CMNode objects, and in the process get rid of any
427         //  repetition short cuts, converting them into '*' style repetitions
428         //  or getting rid of repetitions altogether.
429         //
430         //  The conversions done are:
431         //
432         //  x+ -> (x|x*)
433         //  x? -> (x|epsilon)
434         //
435         //  This is a relatively complex scenario. What is happening is that
436         //  we create a top level binary node of which the special EOC value
437         //  is set as the right side node. The the left side is set to the
438         //  rewritten syntax tree. The source is the original content model
439         //  info from the decl pool. The rewrite is done by buildSyntaxTree()
440         //  which recurses the decl pool's content of the element and builds
441         //  a new tree in the process.
442         //
443         //  Note that, during this operation, we set each non-epsilon leaf
444         //  node's DFA state position and count the number of such leafs, which
445         //  is left in the fLeafCount member.
446         //
447         //  The nodeTmp object is passed in just as a temp node to use during
448         //  the recursion. Otherwise, we'd have to create a new node on every
449         //  level of recursion, which would be piggy in Java (as is everything
450         //  for that matter.)
451         //
452 
453     /* MODIFIED (Jan, 2001) 
454      *
455      * Use following rules.
456      *   nullable(x+) := nullable(x), first(x+) := first(x),  last(x+) := last(x)
457      *   nullable(x?) := true, first(x?) := first(x),  last(x?) := last(x)
458      *
459      * The same computation of follow as x* is applied to x+
460      *
461      * The modification drastically reduces computation time of
462      * "(a, (b, a+, (c, (b, a+)+, a+, (d,  (c, (b, a+)+, a+)+, (b, a+)+, a+)+)+)+)+"
463      */
464 
465         fQName.setValues(null, fEOCString, fEOCString, null);
466         CMLeaf nodeEOC = new CMLeaf(fQName);
467         fHeadNode = new CMBinOp
468         (
469             XMLContentSpec.CONTENTSPECNODE_SEQ
470             , syntaxTree
471             , nodeEOC
472         );
473 
474         //
475         //  And handle specially the EOC node, which also must be numbered
476         //  and counted as a non-epsilon leaf node. It could not be handled
477         //  in the above tree build because it was created before all that
478         //  started. We save the EOC position since its used during the DFA
479         //  building loop.
480         //
481         fEOCPos = fLeafCount;
482         nodeEOC.setPosition(fLeafCount++);
483 
484         //
485         //  Ok, so now we have to iterate the new tree and do a little more
486         //  work now that we know the leaf count. One thing we need to do is
487         //  to calculate the first and last position sets of each node. This
488         //  is cached away in each of the nodes.
489         //
490         //  Along the way we also set the leaf count in each node as the
491         //  maximum state count. They must know this in order to create their
492         //  first/last pos sets.
493         //
494         //  We also need to build an array of references to the non-epsilon
495         //  leaf nodes. Since we iterate it in the same way as before, this
496         //  will put them in the array according to their position values.
497         //
498         fLeafList = new CMLeaf[fLeafCount];
499         fLeafListType = new int[fLeafCount];
500         postTreeBuildInit(fHeadNode, 0);
501 
502         //
503         //  And, moving onward... We now need to build the follow position
504         //  sets for all the nodes. So we allocate an array of state sets,
505         //  one for each leaf node (i.e. each DFA position.)
506         //
507         fFollowList = new CMStateSet[fLeafCount];
508         for (int index = 0; index < fLeafCount; index++)
509             fFollowList[index] = new CMStateSet(fLeafCount);
510         calcFollowList(fHeadNode);
511         //
512         //  And finally the big push... Now we build the DFA using all the
513         //  states and the tree we've built up. First we set up the various
514         //  data structures we are going to use while we do this.
515         //
516         //  First of all we need an array of unique element names in our
517         //  content model. For each transition table entry, we need a set of
518         //  contiguous indices to represent the transitions for a particular
519         //  input element. So we need to a zero based range of indexes that
520         //  map to element types. This element map provides that mapping.
521         //
522         fElemMap = new QName[fLeafCount];
523         fElemMapType = new int[fLeafCount];
524         fElemMapSize = 0;
525         for (int outIndex = 0; outIndex < fLeafCount; outIndex++)
526         {
527             fElemMap[outIndex] = new QName();
528 
529             /****
530             if ( (fLeafListType[outIndex] & 0x0f) != 0 ) {
531                 if (fLeafNameTypeVector == null) {
532                     fLeafNameTypeVector = new ContentLeafNameTypeVector();
533                 }
534             }
535             /****/
536 
537             // Get the current leaf's element index
538             final QName element = fLeafList[outIndex].getElement();
539 
540             // See if the current leaf node's element index is in the list
541             int inIndex = 0;
542             for (; inIndex < fElemMapSize; inIndex++)
543             {
544                 if (fElemMap[inIndex].rawname == element.rawname) {
545                     break;
546                 }
547             }
548 
549             // If it was not in the list, then add it, if not the EOC node
550             if (inIndex == fElemMapSize) {
551                 fElemMap[fElemMapSize].setValues(element);
552                 fElemMapType[fElemMapSize] = fLeafListType[outIndex];
553                 fElemMapSize++;
554             }
555         }
556         // set up the fLeafNameTypeVector object if there is one.
557         /*****
558         if (fLeafNameTypeVector != null) {
559             fLeafNameTypeVector.setValues(fElemMap, fElemMapType, fElemMapSize);
560         }
561 
562     /*** 
563     * Optimization(Jan, 2001); We sort fLeafList according to 
564     * elemIndex which is *uniquely* associated to each leaf.  
565     * We are *assuming* that each element appears in at least one leaf.
566     **/
567 
568     int[] fLeafSorter = new int[fLeafCount + fElemMapSize];
569     int fSortCount = 0;
570 
571     for (int elemIndex = 0; elemIndex < fElemMapSize; elemIndex++) {
572         for (int leafIndex = 0; leafIndex < fLeafCount; leafIndex++) {
573             final QName leaf = fLeafList[leafIndex].getElement();
574             final QName element = fElemMap[elemIndex];
575             if (leaf.rawname == element.rawname) {
576                 fLeafSorter[fSortCount++] = leafIndex;
577             }
578         }
579         fLeafSorter[fSortCount++] = -1;
580     }
581 
582     /* Optimization(Jan, 2001) */
583 
584         //
585         //  Next lets create some arrays, some that that hold transient
586         //  information during the DFA build and some that are permament.
587         //  These are kind of sticky since we cannot know how big they will
588         //  get, but we don't want to use any Java collections because of
589         //  performance.
590         //
591         //  Basically they will probably be about fLeafCount*2 on average,
592         //  but can be as large as 2^(fLeafCount*2), worst case. So we start
593         //  with fLeafCount*4 as a middle ground. This will be very unlikely
594         //  to ever have to expand, though it if does, the overhead will be
595         //  somewhat ugly.
596         //
597         int curArraySize = fLeafCount * 4;
598         CMStateSet[] statesToDo = new CMStateSet[curArraySize];
599         fFinalStateFlags = new boolean[curArraySize];
600         fTransTable = new int[curArraySize][];
601 
602         //
603         //  Ok we start with the initial set as the first pos set of the
604         //  head node (which is the seq node that holds the content model
605         //  and the EOC node.)
606         //
607         CMStateSet setT = fHeadNode.firstPos();
608 
609         //
610         //  Init our two state flags. Basically the unmarked state counter
611         //  is always chasing the current state counter. When it catches up,
612         //  that means we made a pass through that did not add any new states
613         //  to the lists, at which time we are done. We could have used a
614         //  expanding array of flags which we used to mark off states as we
615         //  complete them, but this is easier though less readable maybe.
616         //
617         int unmarkedState = 0;
618         int curState = 0;
619 
620         //
621         //  Init the first transition table entry, and put the initial state
622         //  into the states to do list, then bump the current state.
623         //
624         fTransTable[curState] = makeDefStateList();
625         statesToDo[curState] = setT;
626         curState++;
627 
628         /* Optimization(Jan, 2001); This is faster for
629          * a large content model such as, "(t001+|t002+|.... |t500+)".
630          */
631 
632     java.util.Hashtable   stateTable = new java.util.Hashtable  ();
633 
634         /* Optimization(Jan, 2001) */
635 
636         //
637         //  Ok, almost done with the algorithm... We now enter the
638         //  loop where we go until the states done counter catches up with
639         //  the states to do counter.
640         //
641         while (unmarkedState < curState)
642         {
643             //
644             //  Get the first unmarked state out of the list of states to do.
645             //  And get the associated transition table entry.
646             //
647             setT = statesToDo[unmarkedState];
648             int[] transEntry = fTransTable[unmarkedState];
649 
650             // Mark this one final if it contains the EOC state
651             fFinalStateFlags[unmarkedState] = setT.getBit(fEOCPos);
652 
653             // Bump up the unmarked state count, marking this state done
654             unmarkedState++;
655 
656             // Loop through each possible input symbol in the element map
657             CMStateSet newSet = null;
658         /* Optimization(Jan, 2001) */
659             int sorterIndex = 0;
660         /* Optimization(Jan, 2001) */
661             for (int elemIndex = 0; elemIndex < fElemMapSize; elemIndex++)
662             {
663                 //
664                 //  Build up a set of states which is the union of all of
665                 //  the follow sets of DFA positions that are in the current
666                 //  state. If we gave away the new set last time through then
667                 //  create a new one. Otherwise, zero out the existing one.
668                 //
669                 if (newSet == null)
670                     newSet = new CMStateSet(fLeafCount);
671                 else
672                     newSet.zeroBits();
673 
674         /* Optimization(Jan, 2001) */
675                 int leafIndex = fLeafSorter[sorterIndex++];
676 
677                 while (leafIndex != -1) {
678             // If this leaf index (DFA position) is in the current set...
679                     if (setT.getBit(leafIndex))
680                     {
681                         //
682                         //  If this leaf is the current input symbol, then we
683                         //  want to add its follow list to the set of states to
684                         //  transition to from the current state.
685                         //
686                                 newSet.union(fFollowList[leafIndex]);
687                             }
688 
689                    leafIndex = fLeafSorter[sorterIndex++];
690     }
691         /* Optimization(Jan, 2001) */
692 
693                 //
694                 //  If this new set is not empty, then see if its in the list
695                 //  of states to do. If not, then add it.
696                 //
697                 if (!newSet.isEmpty())
698                 {
699                     //
700                     //  Search the 'states to do' list to see if this new
701                     //  state set is already in there.
702                     //
703 
704         /* Optimization(Jan, 2001) */
705         Integer   stateObj = (Integer  )stateTable.get(newSet);
706         int stateIndex = (stateObj == null ? curState : stateObj.intValue());
707         /* Optimization(Jan, 2001) */
708 
709                     // If we did not find it, then add it
710                     if (stateIndex == curState)
711                     {
712                         //
713                         //  Put this new state into the states to do and init
714                         //  a new entry at the same index in the transition
715                         //  table.
716                         //
717                         statesToDo[curState] = newSet;
718                         fTransTable[curState] = makeDefStateList();
719 
720         /* Optimization(Jan, 2001) */
721                         stateTable.put(newSet, new Integer  (curState));
722         /* Optimization(Jan, 2001) */
723 
724                         // We now have a new state to do so bump the count
725                         curState++;
726 
727                         //
728                         //  Null out the new set to indicate we adopted it.
729                         //  This will cause the creation of a new set on the
730                         //  next time around the loop.
731                         //
732                         newSet = null;
733                     }
734 
735                     //
736                     //  Now set this state in the transition table's entry
737                     //  for this element (using its index), with the DFA
738                     //  state we will move to from the current state when we
739                     //  see this input element.
740                     //
741                     transEntry[elemIndex] = stateIndex;
742 
743                     // Expand the arrays if we're full
744                     if (curState == curArraySize)
745                     {
746                         //
747                         //  Yikes, we overflowed the initial array size, so
748                         //  we've got to expand all of these arrays. So adjust
749                         //  up the size by 50% and allocate new arrays.
750                         //
751                         final int newSize = (int)(curArraySize * 1.5);
752                         CMStateSet[] newToDo = new CMStateSet[newSize];
753                         boolean[] newFinalFlags = new boolean[newSize];
754                         int[][] newTransTable = new int[newSize][];
755 
756                         // Copy over all of the existing content
757                         for (int expIndex = 0; expIndex < curArraySize; expIndex++)
758                         {
759                             newToDo[expIndex] = statesToDo[expIndex];
760                             newFinalFlags[expIndex] = fFinalStateFlags[expIndex];
761                             newTransTable[expIndex] = fTransTable[expIndex];
762                         }
763 
764                         // Store the new array size
765                         curArraySize = newSize;
766                         statesToDo = newToDo;
767                         fFinalStateFlags = newFinalFlags;
768                         fTransTable = newTransTable;
769                     }
770                 }
771             }
772         }
773 
774         // Check to see if we can set the fEmptyContentIsValid flag.
775         fEmptyContentIsValid = ((CMBinOp)fHeadNode).getLeft().isNullable();
776 
777         //
778         //  And now we can say bye bye to the temp representation since we've
779         //  built the DFA.
780         //
781         if (DEBUG_VALIDATE_CONTENT) 
782             dumpTree(fHeadNode, 0);
783         fHeadNode = null;
784         fLeafList = null;
785         fFollowList = null;
786 
787     }
788 
789     /**
790      * Calculates the follow list of the current node.
791      *
792      * @param nodeCur The curent node.
793      *
794      * @exception CMException Thrown if follow list cannot be calculated.
795      */
796     private void calcFollowList(CMNode nodeCur) 
797     {
798         // Recurse as required
799         if (nodeCur.type() == XMLContentSpec.CONTENTSPECNODE_CHOICE)
800         {
801             // Recurse only
802             calcFollowList(((CMBinOp)nodeCur).getLeft());
803             calcFollowList(((CMBinOp)nodeCur).getRight());
804         }
805          else if (nodeCur.type() == XMLContentSpec.CONTENTSPECNODE_SEQ)
806         {
807             // Recurse first
808             calcFollowList(((CMBinOp)nodeCur).getLeft());
809             calcFollowList(((CMBinOp)nodeCur).getRight());
810 
811             //
812             //  Now handle our level. We use our left child's last pos
813             //  set and our right child's first pos set, so go ahead and
814             //  get them ahead of time.
815             //
816             final CMStateSet last  = ((CMBinOp)nodeCur).getLeft().lastPos();
817             final CMStateSet first = ((CMBinOp)nodeCur).getRight().firstPos();
818 
819             //
820             //  Now, for every position which is in our left child's last set
821             //  add all of the states in our right child's first set to the
822             //  follow set for that position.
823             //
824             for (int index = 0; index < fLeafCount; index++)
825             {
826                 if (last.getBit(index))
827                     fFollowList[index].union(first);
828             }
829         }
830          /***
831          else if (nodeCur.type() == XMLContentSpec.CONTENTSPECNODE_ZERO_OR_MORE)
832         {
833             // Recurse first
834             calcFollowList(((CMUniOp)nodeCur).getChild());
835 
836             //
837             //  Now handle our level. We use our own first and last position
838             //  sets, so get them up front.
839             //
840             final CMStateSet first = nodeCur.firstPos();
841             final CMStateSet last  = nodeCur.lastPos();
842 
843             //
844             //  For every position which is in our last position set, add all
845             //  of our first position states to the follow set for that
846             //  position.
847             //
848             for (int index = 0; index < fLeafCount; index++)
849             {
850                 if (last.getBit(index))
851                     fFollowList[index].union(first);
852             }
853         }
854          else if ((nodeCur.type() == XMLContentSpec.CONTENTSPECNODE_ONE_OR_MORE)
855               ||  (nodeCur.type() == XMLContentSpec.CONTENTSPECNODE_ZERO_OR_ONE))
856         {
857             throw new RuntimeException("ImplementationMessages.VAL_NIICM");
858         }
859         /***/
860          else if (nodeCur.type() == XMLContentSpec.CONTENTSPECNODE_ZERO_OR_MORE
861         || nodeCur.type() == XMLContentSpec.CONTENTSPECNODE_ONE_OR_MORE)
862         {
863             // Recurse first
864             calcFollowList(((CMUniOp)nodeCur).getChild());
865 
866             //
867             //  Now handle our level. We use our own first and last position
868             //  sets, so get them up front.
869             //
870             final CMStateSet first = nodeCur.firstPos();
871             final CMStateSet last  = nodeCur.lastPos();
872 
873             //
874             //  For every position which is in our last position set, add all
875             //  of our first position states to the follow set for that
876             //  position.
877             //
878             for (int index = 0; index < fLeafCount; index++)
879             {
880                 if (last.getBit(index))
881                     fFollowList[index].union(first);
882             }
883         }
884        
885         else if (nodeCur.type() == XMLContentSpec.CONTENTSPECNODE_ZERO_OR_ONE) {
886             // Recurse only
887             calcFollowList(((CMUniOp)nodeCur).getChild());
888         }
889          /***/
890     }
891 
892     /**
893      * Dumps the tree of the current node to standard output.
894      *
895      * @param nodeCur The current node.
896      * @param level   The maximum levels to output.
897      *
898      * @exception CMException Thrown on error.
899      */
900     private void dumpTree(CMNode nodeCur, int level) 
901     {
902         for (int index = 0; index < level; index++)
903             System.out.print("   ");
904 
905         int type = nodeCur.type();
906         if ((type == XMLContentSpec.CONTENTSPECNODE_CHOICE)
907         ||  (type == XMLContentSpec.CONTENTSPECNODE_SEQ))
908         {
909             if (type == XMLContentSpec.CONTENTSPECNODE_CHOICE)
910                 System.out.print("Choice Node ");
911             else
912                 System.out.print("Seq Node ");
913 
914             if (nodeCur.isNullable())
915                 System.out.print("Nullable ");
916 
917             System.out.print("firstPos=");
918             System.out.print(nodeCur.firstPos().toString());
919             System.out.print(" lastPos=");
920             System.out.println(nodeCur.lastPos().toString());
921 
922             dumpTree(((CMBinOp)nodeCur).getLeft(), level+1);
923             dumpTree(((CMBinOp)nodeCur).getRight(), level+1);
924         }
925          else if (nodeCur.type() == XMLContentSpec.CONTENTSPECNODE_ZERO_OR_MORE)
926         {
927             System.out.print("Rep Node ");
928 
929             if (nodeCur.isNullable())
930                 System.out.print("Nullable ");
931 
932             System.out.print("firstPos=");
933             System.out.print(nodeCur.firstPos().toString());
934             System.out.print(" lastPos=");
935             System.out.println(nodeCur.lastPos().toString());
936 
937             dumpTree(((CMUniOp)nodeCur).getChild(), level+1);
938         }
939          else if (nodeCur.type() == XMLContentSpec.CONTENTSPECNODE_LEAF)
940         {
941             System.out.print
942             (
943                 "Leaf: (pos="
944                 + ((CMLeaf)nodeCur).getPosition()
945                 + "), "
946                 + ((CMLeaf)nodeCur).getElement()
947                 + "(elemIndex="
948                 + ((CMLeaf)nodeCur).getElement()
949                 + ") "
950             );
951 
952             if (nodeCur.isNullable())
953                 System.out.print(" Nullable ");
954 
955             System.out.print("firstPos=");
956             System.out.print(nodeCur.firstPos().toString());
957             System.out.print(" lastPos=");
958             System.out.println(nodeCur.lastPos().toString());
959         }
960          else
961         {
962             throw new RuntimeException  ("ImplementationMessages.VAL_NIICM");
963         }
964     }
965 
966 
967     /**
968      * -1 is used to represent bad transitions in the transition table
969      * entry for each state. So each entry is initialized to an all -1
970      * array. This method creates a new entry and initializes it.
971      */
972     private int[] makeDefStateList()
973     {
974         int[] retArray = new int[fElemMapSize];
975         for (int index = 0; index < fElemMapSize; index++)
976             retArray[index] = -1;
977         return retArray;
978     }
979 
980     /** Post tree build initialization. */
981     private int postTreeBuildInit(CMNode nodeCur, int curIndex) 
982     {
983         // Set the maximum states on this node
984         nodeCur.setMaxStates(fLeafCount);
985 
986         // Recurse as required
987         if ((nodeCur.type() & 0x0f) == XMLContentSpec.CONTENTSPECNODE_ANY ||
988             (nodeCur.type() & 0x0f) == XMLContentSpec.CONTENTSPECNODE_ANY_LOCAL ||
989             (nodeCur.type() & 0x0f) == XMLContentSpec.CONTENTSPECNODE_ANY_OTHER) {
990             // REVISIT: Don't waste these structures.
991             QName qname = new QName(null, null, null, ((CMAny)nodeCur).getURI());
992             fLeafList[curIndex] = new CMLeaf(qname, ((CMAny)nodeCur).getPosition());
993             fLeafListType[curIndex] = nodeCur.type();
994             curIndex++;
995         }
996         else if ((nodeCur.type() == XMLContentSpec.CONTENTSPECNODE_CHOICE)
997         ||  (nodeCur.type() == XMLContentSpec.CONTENTSPECNODE_SEQ))
998         {
999             curIndex = postTreeBuildInit(((CMBinOp)nodeCur).getLeft(), curIndex);
1000            curIndex = postTreeBuildInit(((CMBinOp)nodeCur).getRight(), curIndex);
1001        }
1002         else if (nodeCur.type() == XMLContentSpec.CONTENTSPECNODE_ZERO_OR_MORE
1003         || nodeCur.type() == XMLContentSpec.CONTENTSPECNODE_ONE_OR_MORE
1004         || nodeCur.type() == XMLContentSpec.CONTENTSPECNODE_ZERO_OR_ONE)
1005        {
1006            curIndex = postTreeBuildInit(((CMUniOp)nodeCur).getChild(), curIndex);
1007        }
1008         else if (nodeCur.type() == XMLContentSpec.CONTENTSPECNODE_LEAF)
1009        {
1010            //
1011            //  Put this node in the leaf list at the current index if its
1012            //  a non-epsilon leaf.
1013            //
1014             final QName node = ((CMLeaf)nodeCur).getElement();
1015            if (node.localpart != fEpsilonString) {
1016                fLeafList[curIndex] = (CMLeaf)nodeCur;
1017                fLeafListType[curIndex] = XMLContentSpec.CONTENTSPECNODE_LEAF;
1018                curIndex++;
1019            }
1020        }
1021         else
1022        {
1023            throw new RuntimeException  ("ImplementationMessages.VAL_NIICM: type="+nodeCur.type());
1024        }
1025        return curIndex;
1026    }
1027
1028} // class DFAContentModel
1029
A to Z: JavaDoc & Examples Daily Java News & Articles Open Source Projects Open Source Codes Free Computer Books Remove Frame
Popular Tags