FastPageParser


1   /*
2    * Title:        FastPageParser
3    * Description:
4    *
5    * This software is published under the terms of the OpenSymphony Software
6    * License version 1.1, of which a copy has been included with this
7    * distribution in the LICENSE.txt file.
8    */
9   
10  package com.opensymphony.module.sitemesh.parser;
11  
12  import com.opensymphony.module.sitemesh.Page;
13  import com.opensymphony.module.sitemesh.PageParser;
14  import com.opensymphony.module.sitemesh.html.util.CharArray;
15  import com.opensymphony.module.sitemesh.util.CharArrayReader;
16  
17  import java.io.IOException  ;
18  import java.io.Reader  ;
19  import java.util.Collections  ;
20  import java.util.HashMap  ;
21  import java.util.Map  ;
22  
23  /**
24   * Very fast PageParser implementation for parsing HTML.
25   *
26   * <p>Produces FastPage.</p>
27   *
28   * @author <a HREF="mailto:salaman@qoretech.com">Victor Salaman</a>
29   * @version $Revision: 1.12 $
30   */
31  public final class FastPageParser implements PageParser
32  {
   // Token types: the values taken by _tokenType in internalParse() to describe
   // the token that the tokenizer loop has just finished reading into the buffer.
   // (Note that -0 is simply 0.)
   private static final int TOKEN_NONE = -0;
   private static final int TOKEN_EOF = -1;
   private static final int TOKEN_TEXT = -2;
   private static final int TOKEN_TAG = -3;
   private static final int TOKEN_COMMENT = -4;
   private static final int TOKEN_CDATA = -5;
   private static final int TOKEN_SCRIPT = -6;
   private static final int TOKEN_DOCTYPE = -7;
   private static final int TOKEN_EMPTYTAG = -8;

   // Tokenizer states: the values taken by _state in internalParse() while
   // characters are being read one at a time.
   private static final int STATE_EOF = -1;
   private static final int STATE_TEXT = -2;
   private static final int STATE_TAG = -3;
   private static final int STATE_COMMENT = -4;
   private static final int STATE_TAG_QUOTE = -5;
   private static final int STATE_CDATA = -6;
   private static final int STATE_SCRIPT = -7;
   private static final int STATE_DOCTYPE = -8;

   // Tag states: the values taken by state/laststate in internalParse() to record
   // which element the parser is currently inside. Among other things they decide
   // whether output goes to the head or the body buffer (see shouldWriteToHead()).
   private static final int TAG_STATE_NONE = 0;
   private static final int TAG_STATE_HTML = -1;
   private static final int TAG_STATE_HEAD = -2;
   private static final int TAG_STATE_TITLE = -3;
   private static final int TAG_STATE_BODY = -4;
   private static final int TAG_STATE_XML = -6;
   private static final int TAG_STATE_XMP = -7;

   // These hashcodes are hardcoded because switch statements can only
   // switch on compile-time constants.
   // In theory it is possible for there to be a hashcode collision with
   // other HTML tags, however in practice it is *very* unlikely because
   // tags are generally only a few characters long and hence are likely
   // to produce unique values. Every case in internalParse() nevertheless
   // re-checks the tag name with compareLowerSubstr() before acting on it.

   private static final int SLASH_XML_HASH = 1518984; // "/xml".hashCode();
   private static final int XML_HASH = 118807; // "xml".hashCode();
   private static final int SLASH_XMP_HASH = 1518988; // "/xmp".hashCode();
   private static final int XMP_HASH = 118811; // "xmp".hashCode();
   private static final int HTML_HASH = 3213227; // "html".hashCode();
   private static final int SLASH_HTML_HASH = 46618714; // "/html".hashCode();
   private static final int HEAD_HASH = 3198432; // "head".hashCode();
   private static final int TITLE_HASH = 110371416; // "title".hashCode();
   private static final int SLASH_TITLE_HASH = 1455941513; // "/title".hashCode();
   private static final int PARAMETER_HASH = 1954460585; // "parameter".hashCode();
   private static final int META_HASH = 3347973; // "meta".hashCode();
   private static final int SLASH_HEAD_HASH = 46603919; // "/head".hashCode();
   private static final int FRAMESET_HASH = -1644953643; // "frameset".hashCode();
   private static final int FRAME_HASH = 97692013; // "frame".hashCode();
   private static final int BODY_HASH = 3029410; // "body".hashCode();
   private static final int SLASH_BODY_HASH = 46434897; // "/body".hashCode();
   private static final int CONTENT_HASH = 951530617; // "content".hashCode();
84  
85     public Page parse(char[] data) throws IOException  
86     {
87        FastPage page = internalParse(new CharArrayReader(data));
88        page.setVerbatimPage(data);
89        return page;
90     }
91  
92     public Page parse(Reader reader)
93     {
94        return internalParse(reader);
95     }
96  
97     private FastPage internalParse(Reader reader)
98     {
99        CharArray _buffer    = new CharArray(4096);
100       CharArray _body      = new CharArray(4096);
101       CharArray _head      = new CharArray(512);
102       CharArray _title     = new CharArray(128);
103       Map   _htmlProperties     = null;
104       Map   _metaProperties     = new HashMap  (6);
105       Map   _sitemeshProperties = new HashMap  (6);
106       Map   _bodyProperties     = null;
107 
108       CharArray _currentTaggedContent = new CharArray(1024);
109       String   _contentTagId = null;
110       boolean tagged = false;
111 
112       boolean _frameSet = false;
113 
114       int _state = STATE_TEXT;
115       int _tokenType = TOKEN_NONE;
116       int _pushBack = 0;
117       int _comment = 0;
118       int _quote = 0;
119       boolean hide = false;
120 
121       int state = TAG_STATE_NONE;
122       int laststate = TAG_STATE_NONE;
123       boolean doneTitle = false;
124 
125       // This tag object gets reused each iteration.
126       Tag tagObject = new Tag();
127 
128       while (_tokenType != TOKEN_EOF)
129       {
130          if(tagged)
131          {
132             if(_tokenType == TOKEN_TAG || _tokenType == TOKEN_EMPTYTAG)
133             {
134                if(_buffer==null || _buffer.length()==0)
135                {
136                   _tokenType=TOKEN_NONE;
137                   continue;
138                }
139 
140                if (parseTag(tagObject, _buffer) == null) continue;
141 
142                if (_buffer.compareLowerSubstr("/content"))   // Note that the '/' survives the | 32 operation
143                {
144                   tagged = false;
145                   if(_contentTagId != null)
146                   {
147                      state = TAG_STATE_NONE;
148                      _sitemeshProperties.put(_contentTagId, _currentTaggedContent.toString());
149                      _currentTaggedContent.setLength(0);
150                      _contentTagId = null;
151                   }
152                }
153                else
154                {
155                   _currentTaggedContent.append('<').append(_buffer).append('>');
156                }
157             }
158             else
159             {
160                if(_buffer.length() > 0) _currentTaggedContent.append(_buffer);
161             }
162          }
163          else
164          {
165             if(_tokenType == TOKEN_TAG || _tokenType == TOKEN_EMPTYTAG)
166             {
167                if(_buffer==null || _buffer.length()==0)
168                {
169                   _tokenType=TOKEN_NONE;
170                   continue;
171                }
172 
173                if(parseTag(tagObject, _buffer) == null) {
174                   _tokenType=TOKEN_TEXT;
175                   continue;
176                }
177 
178                int tagHash = _buffer.substrHashCode();
179 
180                if(state == TAG_STATE_XML || state == TAG_STATE_XMP)
181                {
182                   writeTag(state, laststate, hide, _head, _buffer, _body);
183                   if( (state == TAG_STATE_XML && tagHash == SLASH_XML_HASH)
184                     ||(state == TAG_STATE_XMP && tagHash == SLASH_XMP_HASH) )
185                   {
186                      state = laststate;
187                   }
188                }
189                else
190                {
191                   boolean doDefault = false;
192                   switch (tagHash) {
193                      case HTML_HASH:
194                         if (!_buffer.compareLowerSubstr("html")) { // skip any accidental hash collisions
195                            doDefault = true;
196                            break;
197                         }
198                         state = TAG_STATE_HTML;
199                         _htmlProperties = parseProperties(tagObject, _buffer).properties;
200                         break;
201                      case HEAD_HASH:
202                         if (!_buffer.compareLowerSubstr("head")) { // skip any accidental hash collisions
203                            doDefault = true;
204                               break;
205                         }
206                         state = TAG_STATE_HEAD;
207                         break;
208                      case XML_HASH:
209                         if (!_buffer.compareLowerSubstr("xml")) { // skip any accidental hash collisions
210                            doDefault = true;
211                            break;
212                         }
213                         laststate = state;
214                         writeTag(state, laststate, hide, _head, _buffer, _body);
215                         state = TAG_STATE_XML;
216                         break;
217                      case XMP_HASH:
218                         if (!_buffer.compareLowerSubstr("xmp")) { // skip any accidental hash collisions
219                            doDefault = true;
220                            break;
221                         }
222                         laststate = state;
223                         writeTag(state, laststate, hide, _head, _buffer, _body);
224                         state = TAG_STATE_XMP;
225                         break;
226                      case TITLE_HASH:
227                         if (!_buffer.compareLowerSubstr("title")) { // skip any accidental hash collisions
228                            doDefault = true;
229                            break;
230                         }
231                         if (doneTitle)
232                         {
233                            hide = true;
234                         }
235                         else
236                         {
237                            laststate = state;
238                            state = TAG_STATE_TITLE;
239                         }
240                         break;
241                      case SLASH_TITLE_HASH:
242                         if (!_buffer.compareLowerSubstr("/title")) { // skip any accidental hash collisions
243                            doDefault = true;
244                            break;
245                         }
246                         if (doneTitle)
247                         {
248                            hide = false;
249                         }
250                         else
251                         {
252                            doneTitle = true;
253                            state = laststate;
254                         }
255                         break;
256                      case PARAMETER_HASH:
257                         if (!_buffer.compareLowerSubstr("parameter")) { // skip any accidental hash collisions
258                            doDefault = true;
259                            break;
260                         }
261                         parseProperties(tagObject, _buffer);
262                         String   name = (String  ) tagObject.properties.get("name");
263                         String   value = (String  ) tagObject.properties.get("value");
264 
265                         if (name != null && value != null)
266                         {
267                            _sitemeshProperties.put(name, value);
268                         }
269                         break;
270                      case META_HASH:
271                         if (!_buffer.compareLowerSubstr("meta")) { // skip any accidental hash collisions
272                            doDefault = true;
273                            break;
274                         }
275                         CharArray metaDestination = state == TAG_STATE_HEAD ? _head : _body;
276                         metaDestination.append('<');
277                         metaDestination.append(_buffer);
278                         metaDestination.append('>');
279                         parseProperties(tagObject, _buffer);
280                         name = (String  ) tagObject.properties.get("name");
281                         value = (String  ) tagObject.properties.get("content");
282 
283                         if (name == null)
284                         {
285                            String   httpEquiv = (String  ) tagObject.properties.get("http-equiv");
286 
287                            if (httpEquiv != null)
288                            {
289                               name = "http-equiv." + httpEquiv;
290                            }
291                         }
292 
293                         if (name != null && value != null)
294                         {
295                            _metaProperties.put(name, value);
296                         }
297                         break;
298                      case SLASH_HEAD_HASH:
299                         if (!_buffer.compareLowerSubstr("/head")) { // skip any accidental hash collisions
300                            doDefault = true;
301                            break;
302                         }
303                         state = TAG_STATE_HTML;
304                         break;
305                      case FRAME_HASH:
306                         if (!_buffer.compareLowerSubstr("frame")) { // skip any accidental hash collisions
307                            doDefault = true;
308                            break;
309                         }
310                         _frameSet = true;
311                         break;
312                      case FRAMESET_HASH:
313                         if (!_buffer.compareLowerSubstr("frameset")) { // skip any accidental hash collisions
314                            doDefault = true;
315                            break;
316                         }
317                         _frameSet = true;
318                         break;
319                      case BODY_HASH:
320                         if (!_buffer.compareLowerSubstr("body")) { // skip any accidental hash collisions
321                            doDefault = true;
322                            break;
323                         }
324                         if (_tokenType == TOKEN_EMPTYTAG)
325                         {
326                            state = TAG_STATE_BODY;
327                         }
328                         _bodyProperties = parseProperties(tagObject, _buffer).properties;
329                         break;
330                      case CONTENT_HASH:
331                         if (!_buffer.compareLowerSubstr("content")) { // skip any accidental hash collisions
332                            doDefault = true;
333                            break;
334                         }
335                         state = TAG_STATE_NONE;
336                         Map   props = parseProperties(tagObject, _buffer).properties;
337                         if (props != null)
338                         {
339                            tagged = true;
340                            _contentTagId = (String  ) props.get("tag");
341                         }
342                         break;
343                      case SLASH_XMP_HASH:
344                         if (!_buffer.compareLowerSubstr("/xmp")) { // skip any accidental hash collisions
345                            doDefault = true;
346                            break;
347                         }
348                         hide = false;
349                         break;
350                      case SLASH_BODY_HASH:
351                         if (!_buffer.compareLowerSubstr("/body")) { // skip any accidental hash collisions
352                            doDefault = true;
353                            break;
354                         }
355                         state = TAG_STATE_NONE;
356                         hide = true;
357                         break;
358                      case SLASH_HTML_HASH:
359                         if (!_buffer.compareLowerSubstr("/html")) { // skip any accidental hash collisions
360                            doDefault = true;
361                            break;
362                         }
363                         state = TAG_STATE_NONE;
364                         hide = true;
365                         break;
366                      default:
367                         doDefault = true;
368                   }
369                   if (doDefault)
370                      writeTag(state, laststate, hide, _head, _buffer, _body);
371                }
372             }
373             else if (!hide)
374             {
375                if (_tokenType == TOKEN_TEXT)
376                {
377                   if (state == TAG_STATE_TITLE)
378                   {
379                      _title.append(_buffer);
380                   }
381                   else if (shouldWriteToHead(state, laststate))
382                   {
383                      _head.append(_buffer);
384                   }
385                   else
386                   {
387                      _body.append(_buffer);
388                   }
389                }
390                else if (_tokenType == TOKEN_COMMENT)
391                {
392                   final CharArray commentDestination = shouldWriteToHead(state, laststate) ? _head : _body;
393                   commentDestination.append("<!--");
394                   commentDestination.append(_buffer);
395                   commentDestination.append("-->");
396                }
397                else if (_tokenType == TOKEN_CDATA)
398                {
399                   final CharArray commentDestination = state == TAG_STATE_HEAD ? _head : _body;
400                   commentDestination.append("<![CDATA[");
401                   commentDestination.append(_buffer);
402                   commentDestination.append("]]>");
403                }
404                else if (_tokenType == TOKEN_SCRIPT)
405                {
406                   final CharArray commentDestination = state == TAG_STATE_HEAD ? _head : _body;
407                   commentDestination.append('<');
408                   commentDestination.append(_buffer);
409                }
410             }
411          }
412          _buffer.setLength(0);
413 
414          start:
415          while (true)
416          {
417             int c;
418 
419             if(_pushBack != 0)
420             {
421                c = _pushBack;
422                _pushBack = 0;
423             }
424             else
425             {
426                try
427                {
428                   c = reader.read();
429                }
430                catch(IOException   e)
431                {
432                   _tokenType = TOKEN_EOF;
433                   break start;
434                }
435             }
436 
437             if(c < 0)
438             {
439                int tmpstate = _state;
440                _state = STATE_EOF;
441 
442                if(_buffer.length() > 0 && tmpstate == STATE_TEXT)
443                {
444                   _tokenType = TOKEN_TEXT;
445                   break start;
446                }
447                else
448                {
449                   _tokenType = TOKEN_EOF;
450                   break start;
451                }
452             }
453 
454             switch(_state)
455             {
456                case STATE_TAG:
457                {
458                   int buflen = _buffer.length();
459 
460                   if(c == '>')
461                   {
462                      if (_buffer.length() > 1 && _buffer.charAt(_buffer.length() - 1) == '/')
463                      {
464                         _tokenType = TOKEN_EMPTYTAG;
465                      }
466                      else
467                      {
468                         _tokenType = TOKEN_TAG;
469                      }
470                      _state = STATE_TEXT;
471                      break start;
472                   }
473                   else if(c == '/')
474                   {
475                      _buffer.append('/');
476                   }
477                   else if(c == '<' && buflen == 0)
478                   {
479                      _buffer.append("<<");
480                      _state = STATE_TEXT;
481                   }
482                   else if(c == '-' && buflen == 2 && _buffer.charAt(1) == '-' && _buffer.charAt(0) == '!')
483                   {
484                      _buffer.setLength(0);
485                      _state = STATE_COMMENT;
486                   }
487                   else if(c == '[' && buflen == 7 && _buffer.charAt(0) == '!' && _buffer.charAt(1) == '[' &&  _buffer.compareLower("cdata", 2))
488                   {
489                      _buffer.setLength(0);
490                      _state = STATE_CDATA;
491                   }
492                   else if((c == 'e' || c == 'E') && buflen == 7 && _buffer.charAt(0) == '!' && _buffer.compareLower("doctyp", 1))
493                   {
494                      _buffer.append((char)c);
495                      _state = STATE_DOCTYPE;
496                   }
497                   else if((c == 'T' || c == 't') && buflen == 5 && _buffer.compareLower("scrip", 0))
498                   {
499                      _buffer.append((char)c);
500                      _state = STATE_SCRIPT;
501                   }
502 
503                   else if(c == '"' || c == '\'')
504                   {
505                      _quote = c;
506                      _buffer.append(( char ) c);
507                      _state = STATE_TAG_QUOTE;
508                   }
509                   else
510                   {
511                      _buffer.append(( char ) c);
512                   }
513                }
514                break;
515 
516                case STATE_TEXT:
517                {
518                   if(c == '<')
519                   {
520                      _state = STATE_TAG;
521                      if(_buffer.length() > 0)
522                      {
523                         _tokenType = TOKEN_TEXT;
524                         break start;
525                      }
526                   }
527                   else
528                   {
529                      _buffer.append(( char ) c);
530                   }
531                }
532                break;
533 
534                case STATE_TAG_QUOTE:
535                {
536                   if(c == '>')
537                   {
538                      _pushBack = c;
539                      _state = STATE_TAG;
540                   }
541                   else
542                   {
543                      _buffer.append(( char ) c);
544                      if(c == _quote)
545                      {
546                         _state = STATE_TAG;
547                      }
548                   }
549                }
550                break;
551 
552                case STATE_COMMENT:
553                {
554                   if(c == '>' && _comment >= 2)
555                   {
556                      _buffer.setLength(_buffer.length() - 2);
557                      _comment = 0;
558                      _state = STATE_TEXT;
559                      _tokenType = TOKEN_COMMENT;
560                      break start;
561                   }
562                   else if(c == '-')
563                   {
564                      _comment++;
565                   }
566                   else
567                   {
568                      _comment = 0;
569                   }
570 
571                   _buffer.append(( char ) c);
572                }
573                break;
574 
575                case STATE_CDATA:
576                {
577                   if(c == '>' && _comment >= 2)
578                   {
579                      _buffer.setLength(_buffer.length() - 2);
580                      _comment = 0;
581                      _state = STATE_TEXT;
582                      _tokenType = TOKEN_CDATA;
583                      break start;
584                   }
585                   else if(c == ']')
586                   {
587                      _comment++;
588                   }
589                   else
590                   {
591                      _comment = 0;
592                   }
593 
594                   _buffer.append(( char ) c);
595                }
596                break;
597 
598                case STATE_SCRIPT:
599                {
600                   _buffer.append((char) c);
601                   if (c == '<')
602                   {
603                      _comment = 0;
604                   }
605                   else if ((c == '/' && _comment == 0)
606                      ||((c == 's' || c == 'S' ) && _comment == 1)
607                      ||((c == 'c' || c == 'C' ) && _comment == 2)
608                      ||((c == 'r' || c == 'R' ) && _comment == 3)
609                      ||((c == 'i' || c == 'I' ) && _comment == 4)
610                      ||((c == 'p' || c == 'P' ) && _comment == 5)
611                      ||((c == 't' || c == 'T' ) && _comment == 6)
612                   )
613                   {
614                      _comment++;
615                   }
616                   else if(c == '>' && _comment >= 7)
617                   {
618                      _comment = 0;
619                      _state = STATE_TEXT;
620                      _tokenType = TOKEN_SCRIPT;
621                      break start;
622                   }
623                }
624                break;
625 
626                case STATE_DOCTYPE:
627                {
628                   _buffer.append((char) c);
629                   if (c == '>')
630                   {
631                      _state = STATE_TEXT;
632                      _tokenType = TOKEN_DOCTYPE;
633                      break start;
634                   }
635                   else {
636                     _comment = 0;
637                   }
638                }
639                break;
640             }
641          }
642       }
643 
644       // Help the GC
645       _currentTaggedContent = null;
646       _buffer = null;
647 
648       return new FastPage(_sitemeshProperties,
649                           _htmlProperties,
650                           _metaProperties,
651                           _bodyProperties,
652                           _title.toString().trim(),
653                           _head.toString().trim(),
654                           _body.toString().trim(),
655                           _frameSet);
656    }
657 
658    private static void writeTag(int state, int laststate, boolean hide, CharArray _head, CharArray _buffer, CharArray _body) {
659       if (!hide)
660       {
661          if (shouldWriteToHead(state, laststate))
662          {
663             _head.append('<').append(_buffer).append('>');
664          }
665          else
666          {
667             _body.append('<').append(_buffer).append('>');
668          }
669       }
670    }
671 
672    private static boolean shouldWriteToHead(int state, int laststate)
673    {
674       return state == TAG_STATE_HEAD
675              ||(laststate == TAG_STATE_HEAD && (state == TAG_STATE_XML || state == TAG_STATE_XMP));
676    }
677 
678    /**
679     * Populates a {@link Tag} object using data from the supplied {@link CharArray}.
680     *
681     * The supplied tag parameter is reset and reused - this avoids excess object
682     * creation which hwlps performance.
683     *
684     * @return the same tag instance that was passed in, except it will be populated
685     * with a new <tt>name</tt> value (and the corresponding <tt>nameEndIdx</tt> value).
686     * However if the tag contained nathing but whitespace, this method will return
687     * <tt>null</tt>.
688     */
689    private Tag parseTag(Tag tag, CharArray buf)
690    {
691       int len = buf.length();
692       int idx = 0;
693       int begin;
694 
695       // Skip over any leading whitespace in the tag
696       while (idx < len && Character.isWhitespace(buf.charAt(idx))) idx++;
697 
698       if(idx == len) return null;
699 
700       // Find out where the non-whitespace characters end. This will give us the tag name.
701       begin = idx;
702       while (idx < len && !Character.isWhitespace(buf.charAt(idx))) idx++;
703 
704       // Mark the tag name as a substring within the buffer. This allows us to perform
705       // a substring comparison against it at a later date
706       buf.setSubstr(begin, buf.charAt(idx - 1) == '/' ? idx - 1 : idx);
707 
708       // Remember where the name finishes so we can pull out the properties later if need be
709       tag.nameEndIdx = idx;
710 
711       return tag;
712    }
713 
714    /**
715     * This is called when we need to extract the properties for the tag from the tag's HTML.
716     * We only call this when necessary since it has quite a lot of overhead.
717     *
718     * @param tag the tag that is currently being processed. This should be the
719     * tag that was returned as a result of a call to {@link #parseTag(FastPageParser.Tag, CharArray)}
720     * (ie, it has the <tt>name</tt> and <tt>nameEndIdx</tt> fields set correctly for the
721     * tag in question. The <tt>properties</tt> field can be in an undefined state - it
722     * will get replaced regardless).
723     * @param buffer a <tt>CharArray</tt> containing the entire tag that is being parsed.
724     * @return the same tag instance that was passed in, only it will now be populated
725     * with any properties that were specified in the tag's HTML.
726     */
727    private static Tag parseProperties(Tag tag, CharArray buffer)
728    {
729       int len = buffer.length();
730       int idx = tag.nameEndIdx;
731 
732       // Start with an empty hashmap. A new HashMap is lazy-created if we happen to find any properties
733       tag.properties = Collections.EMPTY_MAP;
734       int begin;
735       while (idx < len)
736       {
737          // Skip forward to the next non-whitespace character
738          while (idx < len && Character.isWhitespace(buffer.charAt(idx))) idx++;
739 
740          if(idx == len) continue;
741 
742          begin = idx;
743          if(buffer.charAt(idx) == '"')
744          {
745             idx++;
746             while (idx < len && buffer.charAt(idx) != '"') idx++;
747             if(idx == len) continue;
748             idx++;
749          }
750          else if(buffer.charAt(idx) == '\'')
751          {
752             idx++;
753             while (idx < len && buffer.charAt(idx) != '\'') idx++;
754             if(idx == len) continue;
755             idx++;
756          }
757          else
758          {
759             while (idx < len && !Character.isWhitespace(buffer.charAt(idx)) && buffer.charAt(idx) != '=') idx++;
760          }
761 
762          // Mark the substring. This is the attribute name
763          buffer.setSubstr(begin, idx);
764 
765          if(idx < len && Character.isWhitespace(buffer.charAt(idx)))
766          {
767             while (idx < len && Character.isWhitespace(buffer.charAt(idx))) idx++;
768          }
769 
770          if(idx == len || buffer.charAt(idx) != '=') continue;
771 
772          idx++;
773 
774          if(idx == len) continue;
775 
776          while(idx < len && (buffer.charAt(idx) == '\n' || buffer.charAt(idx) == '\r')) idx++;
777 
778          if(buffer.charAt(idx) == ' ')
779          {
780             while (idx < len && Character.isWhitespace(buffer.charAt(idx))) idx++;
781             if(idx == len || (buffer.charAt(idx) != '"' && buffer.charAt(idx) != '"')) continue;
782          }
783 
784          begin = idx;
785          int end;
786          if(buffer.charAt(idx) == '"')
787          {
788             idx++;
789             begin = idx;
790             while (idx < len && buffer.charAt(idx) != '"') idx++;
791             if(idx == len) continue;
792             end = idx;
793             idx++;
794          }
795          else if(buffer.charAt(idx) == '\'')
796          {
797             idx++;
798             begin = idx;
799             while (idx < len && buffer.charAt(idx) != '\'') idx++;
800             if(idx == len) continue;
801             end = idx;
802             idx++;
803          }
804          else
805          {
806             while (idx < len && !Character.isWhitespace(buffer.charAt(idx))) idx++;
807             end = idx;
808          }
809          // Extract the name and value as String objects and add them to the property map
810          String   name = buffer.getLowerSubstr();
811          String   value = buffer.substring(begin, end);
812 
813          tag.addProperty(name, value);
814       }
815       return tag;
816    }
817 
818    private class Tag
819    {
820       // The index where the name string ends. This is used as the starting
821       // offet if we need to continue processing to find the tag's properties
822       public int nameEndIdx = 0;
823 
824       // This holds a map of the various properties for a particular tag.
825       // This map is only populated when required - normally it will remain empty
826       public Map   properties = Collections.EMPTY_MAP;
827 
828       /**
829        * Adds a name/value property pair to this tag. Each property that is
830        * added represents a property that was parsed from the tag's HTML.
831        */
832       public void addProperty(String   name, String   value)
833       {
834         if(properties==Collections.EMPTY_MAP)
835         {
836           properties = new HashMap  (8);
837         }
838         properties.put(name, value);
839       }
840    }
841 }
842
A to Z: JavaDoc & Examples Daily Java News & Articles Open Source Projects Open Source Codes Free Computer Books Remove Frame
Popular Tags