WebLoader


1   /*
2    * WebLoader.java
3    *
4    * Copyright (C) 1998-2002 Peter Graves
5    * $Id: WebLoader.java,v 1.1.1.1 2002/09/24 16:09:09 piso Exp $
6    *
7    * This program is free software; you can redistribute it and/or
8    * modify it under the terms of the GNU General Public License
9    * as published by the Free Software Foundation; either version 2
10   * of the License, or (at your option) any later version.
11   *
12   * This program is distributed in the hope that it will be useful,
13   * but WITHOUT ANY WARRANTY; without even the implied warranty of
14   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15   * GNU General Public License for more details.
16   *
17   * You should have received a copy of the GNU General Public License
18   * along with this program; if not, write to the Free Software
19   * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
20   */
21  
22  package org.armedbear.j;
23  
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.PushbackReader;
import java.io.Reader;
import java.util.ArrayList;
import java.util.Hashtable;
import java.util.List;
import java.util.Locale;
import java.util.Stack;
34  
35  public final class WebLoader implements WebConstants
36  {
37      private PushbackReader   reader;
38      private final FastStringBuffer textBuffer = new FastStringBuffer();
39      private final Stack   indentStack = new Stack  ();
40      private final Stack   tableStack = new Stack  ();
41      private Table currentTable;
42      private int sourceOffset;
43      private int offset;
44      private final int maxChars = 80;
45      private LineSegmentList segments;
46      private LineSequence lines;
47      private final Hashtable   refs = new Hashtable  ();
48      private int indentLevel;
49      private File file;
50  
51      public WebLoader(File file)
52      {
53          this.file = file;
54          if (file.getEncoding() == null)
55              file.setEncoding("iso-8859-1");
56          Debug.assertTrue(file.isLocal());
57      }
58  
59      public WebLoader(Reader   reader)
60      {
61          this.reader = new PushbackReader  (new BufferedReader  (reader));
62      }
63  
64      public final Hashtable   getRefs()
65      {
66          return refs;
67      }
68  
69      public LineSequence load()
70      {
71          try {
72              loadInternal();
73          }
74          catch (EncodingChangeException e) {
75              Log.debug("encoding change!");
76              Log.debug("new encoding = |" + e.getNewEncoding() + "|");
77              file.setEncoding(e.getNewEncoding());
78              reader = null;
79              try {
80                  loadInternal();
81              }
82              catch (EncodingChangeException ex) {
83                  Log.error(ex);
84              }
85          }
86          // Handle zero length files.
87          if (lines.getFirstLine() == null)
88              lines.appendLine(new WebLine(sourceOffset));
89          return lines;
90      }
91  
92      private void loadInternal() throws EncodingChangeException
93      {
94          if (reader == null) {
95              Debug.assertTrue(file != null);
96              String   encoding = file.getEncoding();
97              if (encoding == null)
98                  encoding = Editor.preferences().getStringProperty(Property.DEFAULT_ENCODING);
99              try {
100                 InputStream   inputStream = file.getInputStream();
101                 reader = new PushbackReader  (new BufferedReader  (new InputStreamReader  (inputStream, encoding)));
102             }
103             catch (IOException   e) {
104                 Log.error(e);
105                 return;
106             }
107         }
108         lines = new LineSequence();
109         sourceOffset = 0;
110         try {
111             int c;
112             while ((c = reader.read()) >= 0) {
113                 // Line separator always counts as 1 char, so count '\n' but
114                 // not '\r'.
115                 if (c != '\r')
116                     ++sourceOffset;
117                 switch (c) {
118                     case '<':
119                         processMarkup();
120                         break;
121                     case '&':
122                         processEntity();
123                         break;
124                     default:
125                         doChar((char)c);
126                         break;
127                 }
128             }
129             flushLine();
130         }
131         catch (IOException   e) {
132             Log.error(e);
133         }
134     }
135 
136     private boolean bold;
137     private boolean strong;
138     private boolean italic;
139     private boolean emphasis;
140     private boolean heading;
141     private boolean h1;
142     private boolean center;
143     private boolean preformatted;
144     private boolean whitespace;
145     private Link link;
146 
147     private final boolean centered()
148     {
149         return center || h1;
150     }
151 
152     private void processMarkup() throws EncodingChangeException
153     {
154         final String   tag = gatherTag();
155         if (tag.length() < 3) {
156             doText(tag);
157             return;
158         }
159         char c = tag.charAt(1);
160         if (c == '/') {
161             if (!Character.isLetter(tag.charAt(2))) {
162                 doText(tag);
163                 return;
164             }
165         } else {
166             if (c == '!') {
167                 // We only care about comments.
168                 if (tag.equals("<!--"))
169                     skipComment();
170                 return;
171             }
172             if (c == '?') {
173                 // Ignore XML declaration, processing instructions.
174                 return;
175             }
176             if (!Character.isLetter(c)) {
177                 doText(tag);
178                 return;
179             }
180         }
181         final String   tagName = Utilities.getTagName(tag).toLowerCase().intern();
182 
183         // Unsupported tags.
184         if (tagName == "applet") {
185             skipTag("/applet");
186             return;
187         }
188         if (tagName == "form") {
189             flushLine();
190             textBuffer.append("[Form]");
191             flushSegment(null, FORMAT_DISABLED);
192             flushLine();
193             return;
194         }
195         if (tagName == "/form") {
196             flushLine();
197             textBuffer.append("[End Form]");
198             flushSegment(null, FORMAT_DISABLED);
199             newLine();
200             return;
201         }
202         if (tagName == "input") {
203             List   attributes = getAttributes(tag);
204             String   type = getAttribute(attributes, "type");
205             if (type != null) {
206                 if (type.equalsIgnoreCase("submit")) {
207                     flushSegment();
208                     String   value = getAttribute(attributes, "value");
209                     if (value == null)
210                         value = "Submit"; // Default label.
211                     textBuffer.append('[');
212                     textBuffer.append(value);
213                     textBuffer.append(']');
214                     flushSegment(null, FORMAT_DISABLED);
215                 } else if (type.equalsIgnoreCase("image")) {
216                     flushSegment();
217                     textBuffer.append("[Image]");
218                     flushSegment(null, FORMAT_DISABLED);
219                 }
220             }
221             return;
222         }
223         if (tagName == "object") {
224             skipTag("/object");
225             return;
226         }
227         if (tagName == "xml") {
228             skipTag("/xml");
229             return;
230         }
231         if (tagName == "script") {
232             skipScript();
233             return;
234         }
235 
236         if (tagName == "title") {
237             processTitle();
238             return;
239         }
240         if (tagName == "b") {
241             flushSegment();
242             if (bold) {
243                 // Two <b>'s in a row. This one is probably a typo for </b>.
244                 bold = false;
245             } else {
246                 bold = true;
247             }
248             return;
249         }
250         if (tagName == "/b") {
251             flushSegment();
252             bold = false;
253             return;
254         }
255         if (tagName == "strong") {
256             flushSegment();
257             strong = true;
258             return;
259         }
260         if (tagName == "/strong") {
261             flushSegment();
262             strong = false;
263             return;
264         }
265         if (tagName == "i") {
266             flushSegment();
267             italic = true;
268             return;
269         }
270         if (tagName == "/i") {
271             flushSegment();
272             italic = false;
273             return;
274         }
275         if (tagName == "em") {
276             flushSegment();
277             emphasis = true;
278             return;
279         }
280         if (tagName == "/em") {
281             flushSegment();
282             emphasis = false;
283             return;
284         }
285         if (tagName == "q" || tagName == "/q") {
286             // Indent if we're at the beginning of the line.
287             maybeIndent();
288             textBuffer.append('"');
289             return;
290         }
291         if (tagName == "a") {
292             if (link != null)
293                 // The last <a> tag was never terminated. This is probably a typo for </a>.
294                 processEndAnchor();
295             else
296                 processAnchor(tag);
297             return;
298         }
299         if (tagName == "/a") {
300             processEndAnchor();
301             return;
302         }
303         if (tagName == "h1") {
304             newLine();
305             heading = true;
306             h1 = true;
307             return;
308         }
309         if (tagName == "/h1") {
310             newLine();
311             heading = false;
312             h1 = false;
313             return;
314         }
315         if (tagName == "h2" ||
316             tagName == "h3" ||
317             tagName == "h4" ||
318             tagName == "h5" ||
319             tagName == "h6") {
320             newLine();
321             heading = true;
322             return;
323         }
324         if (tagName == "/h2" ||
325             tagName == "/h3" ||
326             tagName == "/h4" ||
327             tagName == "/h5" ||
328             tagName == "/h6") {
329             newLine();
330             heading = false;
331             return;
332         }
333         if (tagName == "br") {
334             // Forced line break. If there's no text to flush, append a blank
335             // line.
336             if (!flushLine()) {
337                 lines.appendLine(new WebLine(sourceOffset));
338                 ++offset;
339             }
340             return;
341         }
342         if (tagName == "div") {
343             flushLine();
344             return;
345         }
346         if (tagName == "/div") {
347             flushLine();
348             return;
349         }
350         if (tagName == "p") {
351             newLine();
352             return;
353         }
354         if (tagName == "pre") {
355             flushLine();
356             preformatted = true;
357             return;
358         }
359         if (tagName == "/pre") {
360             newLine();
361             preformatted = false;
362             return;
363         }
364         if (tagName == "blockquote") {
365             newLine();
366             indentStack.push("blockquote");
367             ++indentLevel;
368             return;
369         }
370         if (tagName == "/blockquote") {
371             newLine();
372             if (!indentStack.empty()) {
373                 String   s = (String  ) indentStack.pop();
374                 --indentLevel;
375                 if (!s.equals("blockquote"))
376                     Log.error("**** /blockquote: stack imbalance");
377             }
378             return;
379         }
380         // Definition list.
381         if (tagName == "dl") {
382             newLine();
383             indentStack.push("dl");
384             return;
385         }
386         // Never omitted.
387         if (tagName == "/dl") {
388             newLine();
389             // Handle unbalanced <dt> and/or <dd> tags.
390             while (!indentStack.empty()) {
391                 String   s = (String  ) indentStack.peek();
392                 if (s.equals("dd")) {
393                     indentStack.pop();
394                     --indentLevel;
395                 } else if (s.equals("dl")) {
396                     indentStack.pop();
397                     break;
398                 } else {
399                     // Shouldn't happen.
400                     break;
401                 }
402             }
403             return;
404         }
405         // Definition.
406         if (tagName == "dd") {
407             flushLine();
408             if (!indentStack.empty()) {
409                 String   s = (String  ) indentStack.peek();
410                 if (s.equals("dl"))
411                     ;
412                 else if (s.equals("dd")) {
413                     // Keep same indentation.
414                     return;
415                 } else
416                     Log.error("**** dd: top of stack is " + s);
417             } else
418                 Log.error("**** dd: indentStack unexpectedly empty");
419             indentStack.push("dd");
420             ++indentLevel;
421             return;
422         }
423         // Term to be defined.
424         if (tagName == "dt") {
425             flushLine();
426             if (!indentStack.empty()) {
427                 String   s = (String  ) indentStack.peek();
428                 if (s.equals("dd")) {
429                     indentStack.pop(); // <dt> terminating <dd> (javadoc)
430                     --indentLevel;
431                 } else if (s.equals("dl"))
432                     ;
433                 else
434                     Log.error("**** dt: top of stack is " + s);
435             } else
436                 Log.error("**** dt: indentStack unexpectedly empty");
437             return;
438         }
439         if (tagName == "img") {
440             processImg(tag);
441             return;
442         }
443         if (tagName == "center") {
444             flushLine();
445             center = true;
446             return;
447         }
448         if (tagName == "/center") {
449             flushLine();
450             center = false;
451             return;
452         }
453         if (tagName == "hr") {
454             flushLine();
455             link = null;
456             for (int i = 0; i < maxChars(); i++)
457                 textBuffer.append('-');
458             flushLine();
459             return;
460         }
461         if (tagName == "ul") {
462             newLine();
463             indentStack.push("ul");
464             ++indentLevel;
465         }
466         // Never omitted.
467         if (tagName == "/ul") {
468             newLine();
469             if (!indentStack.empty()) {
470                 indentStack.pop();
471                 --indentLevel;
472             }
473         }
474         // End tag is usually omitted.
475         if (tagName == "li") {
476             flushLine();
477             if (indentStack.size() > 0) {
478                 textBuffer.append(Utilities.spaces(getIndent()));
479             } else {
480                 textBuffer.append(Utilities.spaces(4));
481             }
482             if (textBuffer.length() >= 2)
483                 textBuffer.setCharAt(textBuffer.length() - 2, '\u2022');
484             flushSegment(null, 0);
485             return;
486         }
487         if (tagName == "style") {
488             skipTag("/style");
489             return;
490         }
491         if (tagName == "table") {
492             newLine();
493             tableStack.push(currentTable);
494             currentTable = new Table();
495             return;
496         }
497         if (tagName == "/table") {
498             flushLine();
499             if (!tableStack.empty())
500                 currentTable = (Table) tableStack.pop();
501             else
502                 Log.error("**** /table: table stack imbalance source offset = " + sourceOffset);
503             return;
504         }
505         // </tr> tag may be omittted.
506         if (tagName == "tr") {
507             flushLine();
508             if (currentTable != null)
509                 currentTable.nextRow();
510             else
511                 Log.error("**** tr: currentTable is null source offset = " + sourceOffset);
512             return;
513         }
514         // </td> tag may be omitted.
515         if (tagName == "td" || tagName == "th") {
516             flushSegment();
517             if (currentTable != null) {
518                 currentTable.nextColumn();
519                 int currentOffset = getCurrentOffset();
520                 // Leave at least one space between columns (but no space
521                 // before the first column).
522                 int numSpaces = 1;
523                 if (currentTable.getColumnIndex() == 0 || currentOffset == 0)
524                     numSpaces = 0;
525                 int minimumOffset = currentTable.getMinimumOffset();
526                 if (minimumOffset > 0) {
527                     if (currentOffset < minimumOffset)
528                         numSpaces = minimumOffset - currentOffset;
529                 }
530                 textBuffer.append(Utilities.spaces(numSpaces));
531                 flushSegment(null, FORMAT_WHITESPACE);
532                 String   s = getAttribute(tag, "width");
533                 if (s != null) {
534                     if (s.endsWith("%")) {
535                         s = s.substring(0, s.length()-1).trim();
536                         if (s.length() > 0) {
537                             try {
538                                 int percent = Integer.parseInt(s);
539                                 int width = maxChars() * percent / 100;
540                                 currentTable.setColumnWidth(width);
541                             }
542                             catch (NumberFormatException   e) {
543                                 Log.error(e);
544                             }
545                         }
546                     } else
547                         ; // Ignore widths specified in pixels.
548                 }
549             } else
550                 Log.error("**** td: currentTable is null");
551             return;
552         }
553         if (tagName == "meta") {
554             // Ignore change of encoding if we're not loading a file. This can
555             // happen when load() is called from MessageBuffer.setText() to
556             // process an HTML message.
557             if (file == null)
558                 return;
559             String   encoding = file.getEncoding();
560             // Ignore the specified encoding if we have already determined the
561             // encoding from the byte order mark.
562             if (encoding != null) {
563                 if (encoding.equals("UnicodeBig") || encoding.equals("UnicodeLittle"))
564                     return;
565             }
566             List   attributes = getAttributes(tag);
567             String   httpEquiv = getAttribute(attributes, "http-equiv");
568             if (httpEquiv != null) {
569                 if (httpEquiv.toLowerCase().equals("content-type")) {
570                     String   contentType = getAttribute(attributes, "content");
571                     if (contentType != null) {
572                         String   charset =
573                             Utilities.getCharsetFromContentType(contentType);
574                         Log.debug("charset = |" + charset + "|");
575                         if (charset != null && charset.length() > 0) {
576                             String   newEncoding =
577                                 Utilities.getEncodingFromCharset(charset);
578                             Log.debug("new encoding = " + newEncoding);
579                             if (!newEncoding.equalsIgnoreCase(encoding))
580                                 throw new EncodingChangeException(newEncoding);
581                             Log.debug("no encoding change");
582                         }
583                     }
584                 }
585             }
586             return;
587         }
588     }
589 
590     private void processTitle()
591     {
592         FastStringBuffer sb = new FastStringBuffer();
593         try {
594             int c;
595             while ((c = reader.read()) >= 0) {
596                 if (c != '\r')
597                     ++sourceOffset;
598                 if (c == '<') {
599                     String   tag = gatherTag();
600                     if (!isTag(tag, "/title"))
601                         Log.error("processTitle unexpected tag " + tag);
602                     break;
603                 } else if (c == '&') {
604                     String   entity = gatherEntity();
605                     sb.append(substituteEntity(entity));
606                 } else
607                     sb.append((char)c);
608             }
609         }
610         catch (IOException   e) {
611             Log.error(e);
612         }
613         String   title = sb.toString().trim();
614         if (lines.getFirstLine() == null) {
615             if (textBuffer.length() == 0) {
616                 if (title.length() < maxChars())
617                     textBuffer.append(Utilities.spaces(maxChars() - title.length()));
618                 textBuffer.append(title);
619                 flushLine();
620             }
621         }
622     }
623 
624     private void processAnchor(String   tag)
625     {
626         flushSegment();
627         List   attributes = getAttributes(tag);
628         if (attributes != null) {
629             for (int i = 0; i < attributes.size(); i++) {
630                 StringPair pair = (StringPair) attributes.get(i);
631                 if (pair.first.equals("href"))
632                     link = new Link(pair.second.trim());
633                 else if (pair.first.equals("name"))
634                     addRef(pair.second, offset);
635             }
636         }
637     }
638 
639     private void processEndAnchor()
640     {
641         boolean appendSpace = false;
642         while (textBuffer.toString().endsWith(" ")) {
643             appendSpace = true;
644             textBuffer.setLength(textBuffer.length() - 1);
645         }
646         flushSegment();
647         link = null;
648         if (appendSpace) {
649             textBuffer.append(' ');
650             flushSegment();
651         }
652     }
653 
654     private void processImg(String   tag)
655     {
656         flushSegment();
657         List   attributes = getAttributes(tag);
658         String   alt = getAttribute(attributes, "alt");
659         String   src = getAttribute(attributes, "src");
660         String   width = getAttribute(attributes, "width");
661         String   height = getAttribute(attributes, "height");
662         int w = 0;
663         int h = 0;
664         if (width != null) {
665             try {
666                 w = Integer.parseInt(width);
667             }
668             catch (NumberFormatException   e) {}
669         }
670         if (height != null) {
671             try {
672                 h = Integer.parseInt(height);
673             }
674             catch (NumberFormatException   e) {}
675         }
676         // Create image link if appropriate.
677         ImageLink imageLink = null;
678         if (src != null && src.length() > 0) {
679             String   lower = src.toLowerCase();
680             if (lower.endsWith(".jpg") || lower.endsWith(".gif") || lower.endsWith(".png")) {
681                 // Only provide image link if image is big enough.
682                 if (w >= 100 && h >= 100)
683                     imageLink = new ImageLink(src);
684             }
685         }
686         if (imageLink != null) {
687             FastStringBuffer sb = new FastStringBuffer("[IMAGE");
688             if (width != null && height != null) {
689                 sb.append(' ');
690                 sb.append(width);
691                 sb.append('x');
692                 sb.append(height);
693             }
694             sb.append(']');
695             if (alt != null && (alt = alt.trim()).length() > 0) {
696                 sb.append(' ');
697                 sb.append(alt);
698             }
699             imageLink.setText(sb.toString());
700             textBuffer.append(imageLink.getText());
701             flushSegment(imageLink, FORMAT_LINK);
702         }
703         // Add a space if the last character on the line so far is not
704         // already a space.
705         if (segments == null || segments.size() == 0) {
706             // We don't need to add a space at the beginning of the line.
707             return;
708         }
709         FastStringBuffer sb = new FastStringBuffer();
710         for (int i = 0; i < segments.size(); i++) {
711             HtmlLineSegment segment = (HtmlLineSegment) segments.getSegment(i);
712             sb.append(segment.getText());
713         }
714         if (sb.length() == 0 || sb.charAt(sb.length()-1) == ' ')
715             return;
716         // The last character is not a space, so we need to add one.
717         textBuffer.append(' ');
718         flushSegment(null, FORMAT_WHITESPACE);
719     }
720 
721     private final void addRef(String   ref, int offset)
722     {
723         refs.put(ref, new Integer  (offset));
724     }
725 
726     private static final String   getAttribute(String   tag, String   attributeName)
727     {
728         return getAttribute(getAttributes(tag), attributeName);
729     }
730 
731     private static String   getAttribute(List   attributes, String   attributeName)
732     {
733         if (attributes != null) {
734             for (int i = attributes.size()-1; i >= 0; i--) {
735                 StringPair pair = (StringPair) attributes.get(i);
736                 if (pair.first.equals(attributeName))
737                     return pair.second;
738             }
739         }
740         return null;
741     }
742 
743     private static List   getAttributes(String   tag)
744     {
745         final int NEUTRAL         = 0;
746         final int ATTRIBUTE_NAME  = 1;
747         final int SPACE_BEFORE_EQ = 2;
748         final int SPACE_AFTER_EQ  = 3;
749         final int ATTRIBUTE_VALUE = 4;
750 
751         int state = NEUTRAL;
752         FastStringBuffer sb = new FastStringBuffer();
753         String   name = null;
754         String   value = null;
755         ArrayList   attributes = null;
756         char delim = 0;
757 
758         final int limit = tag.length();
759         int i;
760         // Skip past tag name.
761         for (i = 0; i < limit; i++) {
762             char c = tag.charAt(i);
763             if (c == '>')
764                 return null;
765             if (Character.isWhitespace(c)) {
766                 ++i;
767                 break;
768             }
769         }
770 
771         for (; i < limit; i++) {
772             char c = tag.charAt(i);
773             switch (state) {
774                 case NEUTRAL:
775                     if (Character.isWhitespace(c))
776                         ;
777                     else {
778                         sb.setLength(0);
779                         sb.append(c);
780                         state = ATTRIBUTE_NAME;
781                     }
782                     break;
783                 case ATTRIBUTE_NAME:
784                     if (c == '=') {
785                         name = sb.toString().toLowerCase();
786                         sb.setLength(0);
787                         state = SPACE_AFTER_EQ;
788                     } else if (Character.isWhitespace(c)) {
789                         name = sb.toString().toLowerCase();
790                         sb.setLength(0);
791                         state = SPACE_BEFORE_EQ;
792                     } else
793                         sb.append(c);
794                     break;
795                 case SPACE_BEFORE_EQ:
796                     if (Character.isWhitespace(c))
797                         ;
798                     else if (c == '=')
799                         state = SPACE_AFTER_EQ;
800                     else {
801                         // An attribute with no value.
802                         sb.setLength(0);
803                         state = NEUTRAL;
804                         if (attributes == null)
805                             attributes = new ArrayList  ();
806                         attributes.add(new StringPair(name, ""));
807                         name = value = null;
808                     }
809                     break;
810                 case SPACE_AFTER_EQ:
811                     if (Character.isWhitespace(c))
812                         ;
813                     else if ( c == '"' || c == '\'') {
814                         delim = c;
815                         sb.setLength(0);
816                         state = ATTRIBUTE_VALUE;
817                     } else {
818                         delim = 0;
819                         sb.setLength(0);
820                         sb.append(c);
821                         state = ATTRIBUTE_VALUE;
822                     }
823                     break;
824                 case ATTRIBUTE_VALUE:
825                     if (delim != 0) {
826                         if (c == delim) {
827                             value = sb.toString();
828                             sb.setLength(0);
829                             state = NEUTRAL;
830                             if (attributes == null)
831                                 attributes = new ArrayList  ();
832                             attributes.add(new StringPair(name, value));
833                             name = value = null;
834                         } else if (c == '&') {
835                             FastStringBuffer sbEntity = new FastStringBuffer();
836                             sbEntity.append('&');
837                             for (++i; i < limit; i++) {
838                                 c = tag.charAt(i);
839                                 if (c == delim) {
840                                     // Not really an entity.
841                                     sb.append(sbEntity.toString());
842                                     // Let outer loop handle the delimiter.
843                                     --i;
844                                     break;
845                                 }
846                                 sbEntity.append(c);
847                                 if (c == ';') {
848                                     sb.append(substituteEntity(sbEntity.toString()));
849                                     break;
850                                 }
851                             }
852                         } else
853                             sb.append(c);
854                     } else {
855                         // Attribute value is not enclosed in quotes.
856                         if (c == '>' || Character.isWhitespace(c)) {
857                             value = sb.toString();
858                             sb.setLength(0);
859                             state = NEUTRAL;
860                             if (attributes == null)
861                                 attributes = new ArrayList  ();
862                             attributes.add(new StringPair(name, value));
863                             name = value = null;
864                         } else if (c == '&') {
865                             FastStringBuffer sbEntity = new FastStringBuffer();
866                             sbEntity.append('&');
867                             for (++i; i < limit; i++) {
868                                 c = tag.charAt(i);
869                                 if (c == ' ' || c == '>') {
870                                     // Reached end of attribute. Back up one char.
871                                     --i;
872                                     // We've already got the whole entity (if it is one).
873                                     break;
874                                 }
875                                 sbEntity.append(c);
876                                 if (c == ';')
877                                     break;
878                             }
879                             sb.append(substituteEntity(sbEntity.toString()));
880                         } else
881                             sb.append(c);
882                     }
883                     break;
884             }
885         }
886 
887         return attributes;
888     }
889 
890     // tagName can be e.g. "table" or "/table".
891     private static boolean isTag(String   s, String   tagName)
892     {
893         Debug.assertTrue(tagName.indexOf('<') < 0);
894         Debug.assertTrue(tagName.indexOf('>') < 0);
895         Debug.assertTrue(tagName.indexOf(' ') < 0);
896 
897         // Shortest possible tag is "<a>".
898         if (s == null || s.length() < 3)
899             return false;
900         if (s.charAt(0) != '<')
901             return false;
902         int length = tagName.length();
903         if (s.length() < length + 2)
904             return false;
905         if (!s.regionMatches(true, 1, tagName, 0, length))
906             return false;
907         // Char after tag name must be whitespace or '>'.
908         char c = s.charAt(length + 1);
909         return c == '>' || Character.isWhitespace(c);
910     }
911 
912     private String   gatherTag()
913     {
914         final int TAG_NAME        = 0;
915         final int NEUTRAL         = 1;
916         final int ATTRIBUTE_NAME  = 2;
917         final int SPACE_BEFORE_EQ = 3;
918         final int SPACE_AFTER_EQ  = 4;
919         final int ATTRIBUTE_VALUE = 5;
920         final int MARKED_SECTION  = 6;
921         final int BANG            = 7;
922         final int INVALID         = 8;
923 
924         FastStringBuffer sb = new FastStringBuffer(256);
925         sb.append('<');
926         int length = 1;
927         int state = TAG_NAME;
928         char delim = 0;
929 
930         int ch;
931 
932         try {
933             while ((ch = reader.read()) >= 0) {
934                 char c = (char) ch;
935                 if (c == '<') {
936                     // We only expect to see a '<' inside a quoted attribute value.
937                     // An actual example from msnbc.com: <a HREF="<!--none-->">
938                     if (state != ATTRIBUTE_VALUE || delim == 0) {
939                         Log.error("unexpected '<' sourceOffset = " + sourceOffset);
940                         reader.unread(c);
941                         return sb.toString();
942                     }
943                 }
944                 if (c != '\r')
945                     ++sourceOffset;
946                 // Ignore whitespace after initial "<" or "</".
947                 if (c <= ' ') {
948                     if (length == 1)
949                         continue;
950                     if (length == 2 && sb.charAt(1) == '/')
951                         continue;
952                 }
953                 sb.append(c);
954                 ++length;
955                 switch (state) {
956                     case TAG_NAME:
957                         if (c == '>') {
958                             // End of tag, no attributes.
959                             return sb.toString();
960                         } else if (Character.isWhitespace(c)) {
961                             // Reached end of tag name.
962                             state = NEUTRAL;
963                         } else if (length == 2 && c == '!') {
964                             state = BANG;
965                         } else if ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c == '_' || c == ':') {
966                             ; // OK at any time
967                         } else if (length == 2 && (c == '/' || c == '!')) {
968                             ; // OK as second char only
969                         } else if (length > 2 && ((c >= '0' && c <= '9') || c == '-' || c == '.')) {
970                             ; // OK second char or later
971                         } else {
972                             // Not really a tag.
973                             Log.error("invalid tag sourceOffset = " + sourceOffset);
974                             state = INVALID;
975                         }
976                         break;
977                     case BANG:
978                         if (c == '>') {
979                             return sb.toString();
980                         } else if (length == 4 && sb.toString().equals("<!--")) {
981                             // Start of comment.
982                             return sb.toString();
983                         } else if (length == 3 && sb.toString().equals("<![")) {
984                             state = MARKED_SECTION;
985                         }
986                         break;
987                     case NEUTRAL:
988                         if (c == '>')
989                             return sb.toString();
990                         else if (!Character.isWhitespace(c))
991                             state = ATTRIBUTE_NAME;
992                         break;
993                     case ATTRIBUTE_NAME:
994                         if (c == '>')
995                             return sb.toString();
996                         else if (c == '=')
997                             state = SPACE_AFTER_EQ;
998                         else if (Character.isWhitespace(c))
999                             state = SPACE_BEFORE_EQ;
1000                        break;
1001                    case SPACE_BEFORE_EQ:
1002                        if (c == '>')
1003                            return sb.toString();
1004                        else if (Character.isWhitespace(c))
1005                            ;
1006                        else if (c == '=')
1007                            state = SPACE_AFTER_EQ;
1008                        else {
1009                            // An attribute with no value.
1010                            state = NEUTRAL;
1011                        }
1012                        break;
1013                    case SPACE_AFTER_EQ:
1014                        if (c == '>')
1015                            return sb.toString();
1016                        else if (Character.isWhitespace(c))
1017                            ;
1018                        else if ( c == '"' || c == '\'') {
1019                            delim = c;
1020                            state = ATTRIBUTE_VALUE;
1021                        } else {
1022                            delim = 0;
1023                            state = ATTRIBUTE_VALUE;
1024                        }
1025                        break;
1026                    case ATTRIBUTE_VALUE:
1027                        if (delim != 0) {
1028                            if (c == delim)
1029                                state = NEUTRAL;
1030                        } else {
1031                            // Attribute value is not enclosed in quotes.
1032                            if (c == '>')
1033                                return sb.toString();
1034                            else if (Character.isWhitespace(c))
1035                                state = NEUTRAL;
1036                        }
1037                        break;
1038                    case MARKED_SECTION:
1039                        if (c == '>') {
1040                            if (sb.toString().endsWith("]>"))
1041                                return sb.toString();
1042                        }
1043                        break;
1044                    case INVALID:
1045                        if (c == '>') {
1046                            Log.error("invalid tag |" + sb.toString() +
1047                                "| sourceOffset = " + sourceOffset);
1048                            return sb.toString();
1049                        }
1050                        break;
1051                }
1052            }
1053        }
1054        catch (IOException   e) {
1055            Log.error(e);
1056        }
1057
1058        return sb.toString();
1059    }
1060
1061    private void processEntity()
1062    {
1063        String   entity = gatherEntity();
1064        doText(substituteEntity(entity));
1065    }
1066
1067    private String   gatherEntity()
1068    {
1069        FastStringBuffer sb = new FastStringBuffer('&');
1070        try {
1071            int c;
1072            while ((c = reader.read()) >= 0) {
1073                if (c == '<' || c == '&') {
1074                    reader.unread(c);
1075                    break;
1076                }
1077                if (c != '\r')
1078                    ++sourceOffset;
1079                sb.append((char) c);
1080                if (c == ';')
1081                    break;
1082                if (c == ' ')
1083                    break;
1084            }
1085        }
1086        catch (IOException   e) {
1087            Log.error(e);
1088        }
1089        return sb.toString();
1090    }
1091
1092    private static String   substituteEntity(String   entity)
1093    {
1094        final int length = entity.length();
1095        if (length < 2)
1096            return entity;
1097        if (entity.equals("& "))
1098            return entity; // Not really an entity.
1099        if (entity.charAt(1) == '#') {
1100            // Remove leading "&#" and trailing ';' if present.
1101            String   s;
1102            if (entity.charAt(length - 1) == ';')
1103                s = entity.substring(2, length - 1);
1104            else
1105                s = entity.substring(2);
1106
1107            int n = -1;
1108            try {
1109                n = Integer.parseInt(s);
1110            }
1111            catch (NumberFormatException   e) {}
1112
1113            if (n >= 0) {
1114                switch (n) {
1115                    case 145: // Left single quote.
1116                    case 146: // Right single quote.
1117                        return "'";
1118                    case 147: // Left double quote.
1119                    case 148: // Right double quote.
1120                        return "\"";
1121                    case 149: // Bullet.
1122                        return String.valueOf((char)8226);
1123                    case 150: // En dash.
1124                        return "-";
1125                    case 151: // Em dash.
1126                        return "--";
1127                    case 153:
1128                        return "(TM)";
1129                    case 174:
1130                        return "(R)";
1131                    default:
1132                        return String.valueOf((char)n);
1133                }
1134            }
1135        }
1136
1137        // Remove leading '&' and trailing ';' if present.
1138        String   s;
1139        if (entity.charAt(length - 1) == ';')
1140            s = entity.substring(1, length-1).intern();
1141        else
1142            s = entity.substring(1).intern();
1143
1144        if (s == "quot")
1145            return "\"";
1146        else if (s == "trade") // 153
1147            return "(TM)";
1148        else if (s == "nbsp")
1149            return String.valueOf((char)160);
1150        else if (s == "copy")
1151            return String.valueOf((char)169);
1152        else if (s == "laquo")
1153            return String.valueOf((char)171);
1154        else if (s == "reg") // 174
1155            return "(R)";
1156        else if (s == "acute")
1157            return String.valueOf((char)180);
1158        else if (s == "auml")
1159            return String.valueOf((char)228);
1160        else if (s == "middot")
1161            return String.valueOf((char)183);
1162        else if (s == "raquo")
1163            return String.valueOf((char)187);
1164        else if (s == "eacute")
1165            return String.valueOf((char)233);
1166        else if (s == "iuml")
1167            return String.valueOf((char)239);
1168        else if (s == "bull")
1169            return String.valueOf((char)8226);
1170        else if (s == "AElig")
1171            return "AE";
1172        else if (s == "amp")
1173            return "&";
1174        else if (s == "lt")
1175            return "<";
1176        else if (s == "gt")
1177            return ">";
1178        else
1179            return entity;
1180    }
1181
1182    private void skipComment()
1183    {
1184        FastStringBuffer sb = new FastStringBuffer();
1185        try {
1186            int c;
1187            while ((c = reader.read()) >= 0) {
1188                if (c != '\r')
1189                    ++sourceOffset;
1190                sb.append((char) c);
1191                if (c == '>' && sb.toString().endsWith("-->"))
1192                    return;
1193            }
1194        }
1195        catch (IOException   e){
1196            Log.error(e);
1197        }
1198    }
1199
1200    private void skipTag(String   tagName)
1201    {
1202        try {
1203            int c;
1204            while ((c = reader.read()) >= 0) {
1205                if (c != '\r')
1206                    ++sourceOffset;
1207                if (c == '<') {
1208                    String   tag = gatherTag();
1209                    if (isTag(tag, tagName))
1210                        return;
1211                }
1212            }
1213        }
1214        catch (IOException   e) {
1215            Log.error(e);
1216        }
1217    }
1218
1219    private void skipScript()
1220    {
1221        try {
1222            int c;
1223            while ((c = reader.read()) >= 0) {
1224                if (c != '\r')
1225                    ++sourceOffset;
1226                if (c == '<') {
1227                    if (readEndScriptTag())
1228                        return;
1229                }
1230            }
1231        }
1232        catch (IOException   e) {
1233            Log.error(e);
1234        }
1235    }
1236
1237    private boolean readEndScriptTag()
1238    {
1239        final String   s = "</script>";
1240        final int length = s.length();
1241        FastStringBuffer sb = new FastStringBuffer('<');
1242        try {
1243            int c;
1244            while ((c = reader.read()) >= 0) {
1245                if (c != '\r')
1246                    ++sourceOffset;
1247                sb.append(Character.toLowerCase((char)c));
1248                if (sb.length() < length) {
1249                    if (!s.startsWith(sb.toString()))
1250                        return false;
1251                } else
1252                    return s.equals(sb.toString());
1253            }
1254        }
1255        catch (IOException   e) {
1256            Log.error(e);
1257        }
1258        return false;
1259    }
1260
1261    private void doText(String   s)
1262    {
1263        final int length = s.length();
1264        for (int i = 0; i < length; i++)
1265            doChar(s.charAt(i));
1266    }
1267
1268    private void doChar(char c)
1269    {
1270        if (preformatted) {
1271            switch (c) {
1272                case '\t':
1273                    final int spaces = 8 - getCurrentOffset() % 8;
1274                    for (int i = spaces-1; i >= 0; i--)
1275                        textBuffer.append(' ');
1276                    break;
1277                case '\r':
1278                    break;
1279                case '\n':
1280                    flushSegment();
1281                    if (segments != null) {
1282                        lines.appendLine(new WebLine(segments, sourceOffset));
1283                        segments = null;
1284                    } else
1285                        lines.appendLine(new WebLine(sourceOffset));
1286                    ++offset; // Line separator always counts as 1.
1287                    break;
1288                default:
1289                    textBuffer.append(c);
1290                    break;
1291            }
1292            return;
1293        }
1294
1295        switch (c) {
1296            case 133: // Ellipsis.
1297                textBuffer.append("...");
1298                break;
1299            case 145: // Left single quote.
1300            case 146: // Right single quote.
1301                textBuffer.append('\'');
1302                break;
1303            case 147: // Left double quote.
1304            case 148: // Right double quote.
1305                textBuffer.append('"');
1306                break;
1307            case 149: // Bullet.
1308                textBuffer.append((char)8226);
1309                break;
1310            case 150:
1311                // En dash.
1312                textBuffer.append('-');
1313                break;
1314            case 151:
1315                // Em dash.
1316                textBuffer.append("--");
1317                break;
1318            case 153:
1319                textBuffer.append("(TM)");
1320                break;
1321            case '\n':
1322            case '\t':
1323            case ' ':
1324                // Append a space unless the preceding character was a space
1325                // or non-breaking space.
1326                if (textBuffer.length() > 0) {
1327                    char preceding = textBuffer.charAt(textBuffer.length() - 1);
1328                    if (preceding != ' ' && preceding != 160)
1329                        textBuffer.append(' ');
1330                } else if (segments != null && segments.size() > 0) {
1331                    // Check the last character in the previous segment.
1332                    HtmlLineSegment seg = (HtmlLineSegment) segments.getLastSegment();
1333                    String   s = seg.getText();
1334                    if (s.length() == 0)
1335                        textBuffer.append(' ');
1336                    else {
1337                        char preceding = s.charAt(s.length() - 1);
1338                        if (preceding != ' ' && preceding != 160)
1339                            textBuffer.append(' ');
1340                    }
1341                }
1342                break;
1343            case '\r':
1344                break;
1345            default:
1346                // A non-whitespace character.
1347                // Indent if we're at the beginning of the line.
1348                maybeIndent();
1349                textBuffer.append(c);
1350                break;
1351        }
1352
1353        if (Character.isWhitespace(c))
1354            maybeWrap();
1355    }
1356
1357    private void maybeIndent()
1358    {
1359        if (indentLevel > 0) {
1360            if (segments == null && textBuffer.length() == 0) {
1361                textBuffer.append(Utilities.spaces(getIndent()));
1362                flushSegment(null, FORMAT_WHITESPACE);
1363            }
1364        }
1365    }
1366
1367    private final int getIndent()
1368    {
1369        return indentLevel * 4;
1370    }
1371
1372    private int getCurrentOffset()
1373    {
1374        int currentOffset = 0;
1375        if (segments != null) {
1376            for (int i = segments.size()-1; i >= 0; i--)
1377                currentOffset += segments.getSegment(i).length();
1378        }
1379        currentOffset += textBuffer.length();
1380        return currentOffset;
1381    }
1382
1383    private final void flushSegment()
1384    {
1385        flushSegment(true);
1386    }
1387
1388    private void flushSegment(boolean wrap)
1389    {
1390        if (textBuffer.length() > 0) {
1391            if (wrap)
1392                maybeWrap();
1393            int format = 0;
1394            if (link != null)
1395                format |= FORMAT_LINK;
1396            if (bold || strong || heading)
1397                format |= FORMAT_BOLD;
1398            if (italic || emphasis)
1399                format |= FORMAT_ITALIC;
1400            if (whitespace)
1401                format |= FORMAT_WHITESPACE;
1402            if (segments == null)
1403                segments = new LineSegmentList();
1404            segments.addSegment(new HtmlLineSegment(textBuffer.toString(), format, link));
1405            offset += textBuffer.length();
1406            textBuffer.setLength(0);
1407        }
1408    }
1409
1410    private void flushSegment(Link link, int format)
1411    {
1412        if (textBuffer.length() > 0) {
1413            if (segments == null)
1414                segments = new LineSegmentList();
1415            segments.addSegment(new HtmlLineSegment(textBuffer.toString(), format, link));
1416            offset += textBuffer.length();
1417            textBuffer.setLength(0);
1418        }
1419    }
1420
1421    private void maybeWrap()
1422    {
1423        if (preformatted)
1424            return;
1425        int currentOffset = getCurrentOffset();
1426        if (currentOffset > maxChars()) {
1427            int length = textBuffer.length();
1428
1429            // Cumulative length of preceding segments.
1430            int preceding = currentOffset - length;
1431
1432            final String   text = textBuffer.toString();
1433            int index = text.lastIndexOf(' ');
1434            while (index >= 0 && preceding + index > maxChars())
1435                index = text.lastIndexOf(' ', index - 1);
1436
1437            if (index >= 0) {
1438                // Found a suitable break.
1439                String   remainder = text.substring(index + 1);
1440                textBuffer.setLength(index); // Trims trailing space.
1441                flushSegment(false); // No wrap!
1442                if (segments != null) {
1443                    lines.appendLine(new WebLine(segments, sourceOffset));
1444                    ++offset; // Line separator always counts as 1.
1445                    segments = null;
1446                }
1447                maybeIndent();
1448                textBuffer.append(remainder);
1449            } else {
1450                // No suitable break in text buffer.
1451                textBuffer.setLength(0);
1452                if (segments != null) {
1453                    final int last = segments.size() - 1;
1454                    if (last >= 0) {
1455                        final HtmlLineSegment lastSegment = (HtmlLineSegment) segments.getSegment(last);
1456                        final String   segmentText = lastSegment.getText();
1457                        index = segmentText.lastIndexOf(' ');
1458                        if (index >= 0) {
1459                            // Found a break.
1460                            final String   head = segmentText.substring(0, index);
1461                            final String   tail = segmentText.substring(index + 1);
1462
1463                            // We're removing a trailing space. Adjust offset
1464                            // accordingly.
1465                            --offset;
1466
1467                            final int format = lastSegment.getFormat();
1468                            final Link link = lastSegment.getLink();
1469
1470                            segments.setSegment(last, new HtmlLineSegment(head, format, link));
1471                            lines.appendLine(new WebLine(segments, sourceOffset));
1472
1473                            // Line separator always counts as 1.
1474                            ++offset;
1475
1476                            segments = null;
1477                            if (tail.length() > 0) {
1478                                maybeIndent();
1479                                if (segments == null)
1480                                    segments = new LineSegmentList();
1481                                segments.addSegment(new HtmlLineSegment(tail, format, link));
1482                            }
1483                        } else {
1484                            // No break. Move last segment to current line.
1485                            segments.removeSegment(lastSegment);
1486                            lines.appendLine(new WebLine(segments, sourceOffset));
1487
1488                            // Line separator always counts as 1.
1489                            ++offset;
1490
1491                            segments = null;
1492                            maybeIndent();
1493                            if (segments == null)
1494                                segments = new LineSegmentList();
1495                            segments.addSegment(lastSegment);
1496                        }
1497                    }
1498                }
1499
1500                maybeIndent();
1501                textBuffer.append(text);
1502                flushSegment(false); // No wrap!
1503            }
1504        }
1505    }
1506
1507    // Returns true if it does anything.
1508    private boolean flushLine()
1509    {
1510        flushSegment();
1511        if (centered() && currentTable == null && segments != null) {
1512            int length = getCurrentOffset();
1513            if (maxChars() > length) {
1514                int numSpaces = (maxChars() - length) / 2;
1515                if (numSpaces > 0) {
1516                    segments.addSegment(0, new HtmlLineSegment(Utilities.spaces(numSpaces),
1517                                                               FORMAT_WHITESPACE, null));
1518                    offset += numSpaces;
1519                }
1520            }
1521        }
1522        if (segments != null) {
1523            lines.appendLine(new WebLine(segments, sourceOffset));
1524            ++offset; // Line separator always counts as 1.
1525            segments = null;
1526            return true;
1527        } else
1528            return false;
1529    }
1530
1531    private void newLine()
1532    {
1533        flushLine();
1534        Line lastLine = lines.getLastLine();
1535        if (lastLine != null && lastLine.length() > 0 && !lastLine.isBlank()) {
1536            lines.appendLine(new WebLine(sourceOffset));
1537            ++offset;
1538        }
1539    }
1540
1541    private final int maxChars()
1542    {
1543//         if (maxChars == 0) {
1544//             // We have to be careful here because this might get called before
1545//             // the display is initialized if we're opening a file on the
1546//             // command line.
1547//             Display display = Editor.currentEditor().getDisplay();
1548//             int displayWidth = display.getWidth();
1549//             if (displayWidth > 0) {
1550//                 int charWidth = display.getCharWidth();
1551//                 if (charWidth > 0)
1552//                     maxChars = display.getWidth() / charWidth - 2;
1553//             }
1554//             if (maxChars <= 0)
1555//                 maxChars = 80;
1556//         }
1557        Debug.assertTrue(maxChars == 80);
1558        return maxChars;
1559    }
1560
1561    private static class EncodingChangeException extends Exception  
1562    {
1563        private String   newEncoding;
1564
1565        EncodingChangeException(String   newEncoding)
1566        {
1567            this.newEncoding = newEncoding;
1568        }
1569
1570        String   getNewEncoding()
1571        {
1572            return newEncoding;
1573        }
1574    }
1575}
1576
A to Z: JavaDoc & Examples Daily Java News & Articles Open Source Projects Open Source Codes Free Computer Books Remove Frame
Popular Tags