KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > de > java2html > javasource > JavaSourceParser


1 package de.java2html.javasource;
2
3 import java.io.BufferedReader JavaDoc;
4 import java.io.File JavaDoc;
5 import java.io.FileReader JavaDoc;
6 import java.io.IOException JavaDoc;
7 import java.io.InputStream JavaDoc;
8 import java.io.InputStreamReader JavaDoc;
9 import java.io.Reader JavaDoc;
10 import java.io.StringReader JavaDoc;
11 import java.net.URL JavaDoc;
12 import java.util.Hashtable JavaDoc;
13 import java.util.StringTokenizer JavaDoc;
14
15 import de.java2html.options.JavaSourceConversionOptions;
16 import de.java2html.util.IoUtilities;
17
18 /**
19  * Parses raw text to a {@link de.java2html.javasource.JavaSource} object. The
20  * parser can not only handle grammatically correct Java source files but also
21  * code snippets.
22  *
23  * <p>
24  * (Parsing is done in multiple steps starting with raw text where every
25  * character is classified as UNDEFINED and trying to find out more about it
26  * step by step. There are some state machines used for parsing. They are hand
27  * coded and quite complicated. The parser seems to be very stable, as I have
28  * not been reported a single bug now for about two years.)
29  *
30  * <p>
31  * For questions, suggestions, bug-reports, enhancement-requests etc. I may be
32  * contacted at: <a HREF="mailto:markus@jave.de">markus@jave.de</a>
33  *
34  * The Java2html home page is located at: <a HREF="http://www.java2html.de">
35  * http://www.java2html.de</a>
36  *
37  * @author <a HREF="mailto:markus@jave.de">Markus Gebhard</a>
38  *
39  * <code>Copyright (C) Markus Gebhard 2000-2003
40  *
41  * This program is free software; you can redistribute it and/or
42  * * modify it under the terms of the GNU General Public License
43  * * as published by the Free Software Foundation; either version 2
44  * * of the License, or (at your option) any later version.
45  *
46  * This program is distributed in the hope that it will be useful,
47  * * but WITHOUT ANY WARRANTY; without even the implied warranty of
48  * * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
49  * * GNU General Public License for more details.
50  *
51  * You should have received a copy of the GNU General Public License
52  * * along with this program; if not, write to the Free Software
53  * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.</code>
54  */

55 public class JavaSourceParser {
56   /** The source code being converted */
57   private JavaSource source;
58
59   /** For faster access to source.getCode() */
60   private String JavaDoc sourceCode;
61
62   /** For faster access to source.getClassification() */
63   private JavaSourceType[] sourceTypes;
64
65   private JavaSourceConversionOptions options;
66
67   /** Delimiters for numeric values. */
68   private final static String JavaDoc NUM_DELIMITERS = " \t\n\r()[]{};:+-/\\*!?#%&|<>=^,";
69
70   /** Delimiters for finding data types and keywords. */
71   private final static String JavaDoc DELIMITERS = " \t\n\r()[]{};:.+-/\\*!?#%&|<>=^";
72
73   /** Characters automatically classified as being empty (type==BACKGROUND) */
74   private final static String JavaDoc EMPTY_STR = " \t\n\r\f";
75
76   private final static String JavaDoc[] PRIMITIVE_DATATYPES = {
77       "boolean",
78       "byte",
79       "char",
80       "double",
81       "float",
82       "int",
83       "long",
84       "short",
85       "void" };
86
87   /*
88    * As defined by Java Language Specification SE §3,
89    */

90   private final static String JavaDoc[] JAVA_KEYWORDS = {
91       "assert",
92       "abstract",
93       "default",
94       "if",
95       "private",
96       "this",
97       "do",
98       "implements",
99       "protected",
100       "throw",
101       "break",
102       "import",
103       "public",
104       "throws",
105       "else",
106       "instanceof",
107       "return",
108       "transient",
109       "case",
110       "extends",
111       "try",
112       "catch",
113       "final",
114       "interface",
115       "static",
116       "finally",
117       "strictfp",
118       "volatile",
119       "class",
120       "native",
121       "super",
122       "while",
123       "const",
124       "for",
125       "new",
126       "strictfp",
127       "switch",
128       "continue",
129       "goto",
130       "package",
131       "synchronized",
132       "threadsafe",
133       "null",
134       "true",
135       "false",
136       //Enum keyword from JDK1.5 (TypeSafe Enums)
137
"enum",
138       "@interface" };
139
140   private final static String JavaDoc[] JAVADOC_KEYWORDS = {
141       "@author",
142       "@beaninfo",
143       "@docRoot",
144       "@deprecated",
145       "@exception",
146       "@link",
147       "@param",
148       "@return",
149       "@see",
150       "@serial",
151       "@serialData",
152       "@serialField",
153       "@since",
154       "@throws",
155       "@version",
156       //new in JDK1.4
157
"@linkplain",
158       "@inheritDoc",
159       "@value",
160       //from iDoclet
161
"@pre",
162       "@post",
163       "@inv",
164       //from disy
165
"@published" };
166
167   /** Hashtables for fast access to JavaDoc keywords (tags) */
168   private static Hashtable JavaDoc tableJavaDocKeywords;
169
170   /** Hashtables for fast access to keywords */
171   private static Hashtable JavaDoc tableJavaKeywords;
172
173   /* States for the first state machine */
174   private final static short PARSESTATE_FINISHED = -1;
175   private final static short COD = 0; //CODE
176
private final static short CAC = 1; //CODE AWAIT COMMENT
177
private final static short CL = 2; //COMMENT LINE
178
private final static short CBJ1 = 3; //COMMENT BLOCK or COMMENT JAVADOC 1
179
private final static short CBJ2 = 4; //COMMENT BLOCK or COMMENT JAVADOC 1
180
private final static short CB = 5; //COMMENT BLOCK
181
private final static short CBA = 6; //COMMENT BLOCK AWAIT END
182
private final static short CJ = 7; //COMMENT JAVADOC
183
private final static short CJA = 8; //COMMENT JAVADOC AWAIT END
184
private final static short QU = 9; //QUOTE
185
private final static short QUA = 10; //QUOTE AWAIT \"
186
private final static short CH1 = 11; //
187
private final static short CH2 = 12; //
188
private final static short CH3 = 13; //
189
private final static short CH4 = 14; //
190
private final static short CH5 = 15; //
191
private final static short CH6 = 16; //
192

193   /* Additional states for the second state machine */
194   private final static short PARSESTATE_START = 0;
195   private final static short PARSESTATE_NEUTRAL = 1;
196   private final static short PARSESTATE_DA = 2;
197   private final static short PARSESTATE_NA = 3;
198   private final static short PARSESTATE_EXP = 4;
199   private final static short PARSESTATE_HEX = 5;
200   private final static short PARSESTATE_HIA = 6;
201
202   /** Counter for this and that (parseThree()?) */
203   private int counter;
204
205   /** EOT=End of text */
206   private final static char EOT = (char) -1;
207
208   /* State informations for state machine one */
209   private short parseState;
210   private int parseSourcePos;
211   private int parseTypePos;
212
/** Creates a parser that uses the default conversion options. */
public JavaSourceParser() {
  this(JavaSourceConversionOptions.getDefault());
}

/**
 * Creates a parser that uses the given conversion options and makes sure
 * the shared keyword tables exist.
 *
 * @param options the conversion options to use while parsing
 */
public JavaSourceParser(JavaSourceConversionOptions options) {
  this.options = options;
  buildTables();
}
221
222   /**
223    * Baut aus den statischen String-Arrays die Hashtabellen auf, mit denen die
224    * Keywords im Quelltext gesucht werden.
225    */

226   private synchronized void buildTables() {
227     if (tableJavaDocKeywords != null && tableJavaKeywords != null) {
228       return;
229     }
230
231     tableJavaDocKeywords = new Hashtable JavaDoc((int) (JAVADOC_KEYWORDS.length * 1.5));
232     for (int i = 0; i < JAVADOC_KEYWORDS.length; ++i) {
233       tableJavaDocKeywords.put(JAVADOC_KEYWORDS[i], JAVADOC_KEYWORDS[i]);
234     }
235
236     tableJavaKeywords = new Hashtable JavaDoc((int) (JAVA_KEYWORDS.length * 1.5));
237     for (int i = 0; i < JAVA_KEYWORDS.length; ++i) {
238       tableJavaKeywords.put(JAVA_KEYWORDS[i], JAVA_KEYWORDS[i]);
239     }
240   }
241
242   private final static boolean isEmpty(char ch) {
243     return (EMPTY_STR.indexOf(ch) != -1);
244   }
245
246   private boolean isNumberDelimiter(char ch) {
247     return (NUM_DELIMITERS.indexOf(ch) != -1);
248   }
249
250   private final static int indexOf(char ch, String JavaDoc s, int start, int end) {
251     if (end < start)
252       return -1;
253
254     for (int i = start; i <= end; ++i) {
255       if (s.charAt(i) == ch)
256         return i;
257     }
258
259     return -1;
260   }
261
262   //public void parse(){
263
// sourceCode=source.getCode();
264
// sourceTypes=new JavaSourceType[sourceCode.length()];
265
// parseOne();
266
// parseTwo();
267
// parseThree();
268
// source.setClassification(sourceTypes);
269
//}
270

271   public JavaSource parse(File JavaDoc file) throws IOException JavaDoc {
272     source = parse(new FileReader JavaDoc(file));
273     source.setFileName(file.getName());
274     return source;
275   }
276
277   public JavaSource parse(String JavaDoc rawText) {
278     if (rawText == null) {
279       throw new NullPointerException JavaDoc();
280     }
281     try {
282       return parse(new StringReader JavaDoc(rawText));
283     }
284     catch (IOException JavaDoc e) {
285       System.err.println("Unexpected exception while parsing raw text: " + e);
286       return new JavaSource("");
287     }
288   }
289
290   public JavaSource parse(URL JavaDoc url) throws IOException JavaDoc {
291     InputStream JavaDoc inputStream = null;
292     try {
293       inputStream = url.openStream();
294       return parse(inputStream);
295     }
296     finally {
297       IoUtilities.close(inputStream);
298     }
299   }
300
301   public JavaSource parse(InputStream JavaDoc stream) throws IOException JavaDoc {
302     return parse(new InputStreamReader JavaDoc(stream));
303   }
304
305   public JavaSource parse(Reader JavaDoc reader) throws IOException JavaDoc {
306     if (reader == null) {
307       throw new IllegalArgumentException JavaDoc("reader may not be null");
308     }
309     try {
310       sourceCode = readPlainSource(reader);
311     }
312     finally {
313       IoUtilities.close(reader);
314     }
315     replaceTabs();
316
317     sourceTypes = new JavaSourceType[sourceCode.length()];
318     source = new JavaSource(sourceCode);
319     source.setClassification(sourceTypes);
320
321     parseOne();
322     parseTwo();
323     parseThree();
324     parseFour();
325
326     doStatistics();
327
328     return source;
329   }
330
331   private void parseFour() {
332     boolean isInsideAnnotation = false;
333     for (int i = 0; i < sourceTypes.length; ++i) {
334       if (!isInsideAnnotation && sourceTypes[i] == JavaSourceType.CODE && sourceCode.charAt(i) == '@') {
335         isInsideAnnotation = true;
336         sourceTypes[i] = JavaSourceType.ANNOTATION;
337       }
338       else if (isInsideAnnotation
339           && sourceTypes[i] == JavaSourceType.CODE
340           && (Character.isJavaIdentifierPart(sourceCode.charAt(i)) || sourceCode.charAt(i) == '.')) {
341         sourceTypes[i] = JavaSourceType.ANNOTATION;
342       }
343       else {
344         isInsideAnnotation = false;
345       }
346     }
347   }
348
349   /**
350    * Gathers statistical information from the source code. After parsing this
351    * is quite easy and maybe it is useful for others. lineCount is needed for
352    * the html converter.
353    */

354   private void doStatistics() {
355     int index = 0;
356     source.getStatistic().clear();
357     source.getStatistic().setCharacterCount(sourceCode.length());
358     int linesContainingAnything = 0;
359
360     if (sourceCode.length() == 0) {
361       source.getStatistic().setLineCount(0);
362     }
363     else {
364       StringTokenizer JavaDoc st = new StringTokenizer JavaDoc(sourceCode, "\n\r", true);
365       while (st.hasMoreTokens()) {
366         String JavaDoc line = st.nextToken();
367
368         if (line.charAt(0) == '\r') {
369           ++index;
370         }
371         else if (line.charAt(0) == '\n') {
372           ++index;
373           source.getStatistic().setLineCount(source.getStatistic().getLineCount() + 1);
374         }
375         else {
376           ++linesContainingAnything;
377           statistics(line, index);
378           index += line.length();
379         }
380       }
381       source.getStatistic().setLineCount(source.getStatistic().getLineCount() + 1);
382     }
383
384     //some empty lines without any were not counted
385
source.getStatistic().setEmptyLineCount(source.getStatistic().getLineCount() - linesContainingAnything);
386   }
387
388   private void statistics(String JavaDoc line, int start) {
389     if (line.length() > source.getStatistic().getMaxLineLength()) {
390       source.getStatistic().setMaxLineLength(line.length());
391     }
392
393     int end = start + line.length();
394
395     boolean containsCode = false;
396     boolean containsComment = false;
397
398     for (int i = start; i < end; ++i) {
399       if (sourceTypes[i] == JavaSourceType.CODE
400           || sourceTypes[i] == JavaSourceType.KEYWORD
401           || sourceTypes[i] == JavaSourceType.CODE_TYPE
402           || sourceTypes[i] == JavaSourceType.CHAR_CONSTANT
403           || sourceTypes[i] == JavaSourceType.NUM_CONSTANT) {
404         containsCode = true;
405         if (containsComment)
406           break;
407       }
408       else if (sourceTypes[i] == JavaSourceType.COMMENT_BLOCK
409           || sourceTypes[i] == JavaSourceType.COMMENT_LINE
410           || sourceTypes[i] == JavaSourceType.JAVADOC
411           || sourceTypes[i] == JavaSourceType.JAVADOC_KEYWORD) {
412         containsComment = true;
413         if (containsCode)
414           break;
415       }
416     }
417
418     if (containsCode)
419       source.getStatistic().setCodeLineCount(source.getStatistic().getCodeLineCount() + 1);
420     if (containsComment)
421       source.getStatistic().setCommentLineCount(source.getStatistic().getCommentLineCount() + 1);
422     if (!containsCode && !containsComment)
423       source.getStatistic().setEmptyLineCount(source.getStatistic().getEmptyLineCount() + 1);
424   }
425
426   private String JavaDoc readPlainSource(Reader JavaDoc reader) throws IOException JavaDoc {
427     return readPlainSource(new BufferedReader JavaDoc(reader));
428   }
429
430   private String JavaDoc readPlainSource(BufferedReader JavaDoc reader) throws IOException JavaDoc {
431
432     StringBuffer JavaDoc sb = new StringBuffer JavaDoc();
433     String JavaDoc line;
434     while ((line = reader.readLine()) != null) {
435       sb.append(line);
436       sb.append("\r\n");
437     }
438     if (sb.length() > 0) {
439       sb.setLength(sb.length() - 2);
440     }
441     return sb.toString();
442     // while (true){
443
// char[] buffer = new char[256];
444
// int length = reader.read(buffer, 0 , 256);
445
// if (length<=0){
446
// break;
447
// }
448
// sb.append(buffer, 0, length);
449
// }
450
// //Newlines are converted to "\r\n" for compatibility with eclipse
451
// styledtext!!!
452
// return NewLineNormalizer.normalize(sb.toString(), "\r\n");
453
}
454
455   /**
456    * Preprocessing: Replaces all tabs (\t) by 'tabs' space characters.
457    */

458   private void replaceTabs() {
459     char[] t = new char[options.getTabSize()];
460     for (int i = 0; i < options.getTabSize(); ++i) {
461       t[i] = ' ';
462     }
463
464     StringBuffer JavaDoc sb = new StringBuffer JavaDoc((int) (sourceCode.length() * 1.3));
465     for (int i = 0; i < sourceCode.length(); ++i) {
466       char ch = sourceCode.charAt(i);
467       if (ch == '\t') {
468         sb.append(t);
469       }
470       else {
471         sb.append(ch);
472       }
473     }
474
475     sourceCode = sb.toString();
476   }
477
478   /**
479    * First step of parsing. All characters are classified 'UNDEFINED' and we
480    * try to divide this into: CODE, CHAR_CONSTANT, COMMENT_LINE, COMMENT_BLOCK,
481    * COMMENT_JAVADOC, BACKGROUND and QUOTE This is done by a quite complicate
482    * state machine.
483    */

484   private void parseOne() {
485     parseState = COD;
486     parseSourcePos = 0;
487     parseTypePos = 0;
488
489     while (parseState != PARSESTATE_FINISHED) {
490       parseOneDo();
491     }
492   }
493
494   /**
495    * State-machine for classifying the code to: CODE, CHAR_CONSTANT,
496    * COMMENT_LINE, COMMENT_BLOCK, COMMENT_JAVADOC, BACKGROUND and QUOTE
497    *
498    * Note: It works - don't ask me how! If you want to know more about it all
499    * you can do is taking a sheet of paper (or more) and a pencil and try to
500    * draw the state machine :-)
501    */

502   private void parseOneDo() {
503     char ch = EOT;
504     if (sourceCode.length() > parseSourcePos) {
505       ch = sourceCode.charAt(parseSourcePos++);
506     }
507
508     switch (parseState) {
509       case COD:
510         if (ch == EOT) {
511           parseState = PARSESTATE_FINISHED;
512           return;
513         }
514         if (ch == '/') {
515           parseState = CAC;
516           return;
517         }
518         if (ch == '"') {
519           sourceTypes[parseTypePos++] = JavaSourceType.STRING;
520           parseState = QU;
521           return;
522         }
523         if (ch == '\'') {
524           parseState = CH1;
525           return;
526         }
527         if (isEmpty(ch)) {
528           sourceTypes[parseTypePos++] = JavaSourceType.BACKGROUND;
529           return;
530         }
531
532         sourceTypes[parseTypePos++] = JavaSourceType.CODE;
533         return;
534       case CAC:
535         if (ch == EOT) {
536           parseState = PARSESTATE_FINISHED;
537           sourceTypes[parseTypePos++] = JavaSourceType.CODE;
538           return;
539         }
540         if (ch == '/') {
541           sourceTypes[parseTypePos++] = JavaSourceType.COMMENT_LINE;
542           sourceTypes[parseTypePos++] = JavaSourceType.COMMENT_LINE;
543           parseState = CL;
544           return;
545         }
546         if (ch == '*') {
547           parseState = CBJ1;
548           return;
549         }
550         if (isEmpty(ch)) {
551           sourceTypes[parseTypePos++] = JavaSourceType.CODE;
552           sourceTypes[parseTypePos++] = JavaSourceType.BACKGROUND;
553           parseState = COD;
554           return;
555         }
556
557         sourceTypes[parseTypePos++] = JavaSourceType.CODE;
558         sourceTypes[parseTypePos++] = JavaSourceType.CODE;
559         parseState = COD;
560         return;
561       case CL:
562         if (ch == EOT) {
563           parseState = PARSESTATE_FINISHED;
564           return;
565         }
566         if (ch == '\n' || ch == '\r') {
567           sourceTypes[parseTypePos++] = JavaSourceType.BACKGROUND;
568           //ggf. durch COMMENT_LINE ersetzen
569
parseState = COD;
570           return;
571         }
572         if (isEmpty(ch)) {
573           sourceTypes[parseTypePos++] = JavaSourceType.BACKGROUND;
574           return;
575         }
576         sourceTypes[parseTypePos++] = JavaSourceType.COMMENT_LINE;
577         return;
578       case CB:
579         if (ch == EOT) {
580           parseState = PARSESTATE_FINISHED;
581           return;
582         }
583         if (ch == '*') {
584           sourceTypes[parseTypePos++] = JavaSourceType.COMMENT_BLOCK;
585           parseState = CBA;
586           return;
587         }
588         if (isEmpty(ch)) {
589           sourceTypes[parseTypePos++] = JavaSourceType.BACKGROUND;
590           return;
591         }
592         sourceTypes[parseTypePos++] = JavaSourceType.COMMENT_BLOCK;
593         return;
594       case CBA:
595         if (ch == EOT) {
596           parseState = PARSESTATE_FINISHED;
597           return;
598         }
599         if (ch == '/') {
600           sourceTypes[parseTypePos++] = JavaSourceType.COMMENT_BLOCK;
601           parseState = COD;
602           return;
603         }
604         if (ch == '*') {
605           sourceTypes[parseTypePos++] = JavaSourceType.COMMENT_BLOCK;
606           parseState = CBA;
607           return;
608         }
609         if (isEmpty(ch)) {
610           sourceTypes[parseTypePos++] = JavaSourceType.BACKGROUND;
611           parseState = CB;
612           return;
613         }
614         sourceTypes[parseTypePos++] = JavaSourceType.COMMENT_BLOCK;
615         parseState = CB;
616         return;
617       case CJ:
618         if (ch == EOT) {
619           parseState = PARSESTATE_FINISHED;
620           return;
621         }
622         if (ch == '*') {
623           sourceTypes[parseTypePos++] = JavaSourceType.JAVADOC;
624           parseState = CJA;
625           return;
626         }
627         if (isEmpty(ch)) {
628           sourceTypes[parseTypePos++] = JavaSourceType.BACKGROUND;
629           return;
630         }
631         sourceTypes[parseTypePos++] = JavaSourceType.JAVADOC;
632         return;
633       case CJA:
634         if (ch == EOT) {
635           parseState = PARSESTATE_FINISHED;
636           return;
637         }
638         if (ch == '/') {
639           sourceTypes[parseTypePos++] = JavaSourceType.JAVADOC;
640           parseState = COD;
641           return;
642         }
643         if (ch == '*') {
644           sourceTypes[parseTypePos++] = JavaSourceType.JAVADOC;
645           parseState = CJA;
646           return;
647         }
648         if (isEmpty(ch)) {
649           sourceTypes[parseTypePos++] = JavaSourceType.BACKGROUND;
650           parseState = CJ;
651           return;
652         }
653         sourceTypes[parseTypePos++] = JavaSourceType.JAVADOC;
654         parseState = CJ;
655         return;
656       case QU:
657         if (ch == EOT) {
658           parseState = PARSESTATE_FINISHED;
659           return;
660         }
661         if (ch == '"') {
662           sourceTypes[parseTypePos++] = JavaSourceType.STRING;
663           parseState = COD;
664           return;
665         }
666         if (ch == '\\') {
667           parseState = QUA;
668           return;
669         }
670         if (isEmpty(ch)) {
671           sourceTypes[parseTypePos++] = JavaSourceType.BACKGROUND;
672           return;
673         }
674
675         sourceTypes[parseTypePos++] = JavaSourceType.STRING;
676         return;
677       case QUA:
678         if (ch == EOT) {
679           sourceTypes[parseTypePos++] = JavaSourceType.STRING;
680           parseState = PARSESTATE_FINISHED;
681           return;
682         }
683         if (ch == '\\') {
684           sourceTypes[parseTypePos++] = JavaSourceType.STRING;
685           sourceTypes[parseTypePos++] = JavaSourceType.STRING;
686           parseState = QU; //This one has been changed from QUA to QU in 2.0
687
return;
688         }
689         if (isEmpty(ch)) {
690           sourceTypes[parseTypePos++] = JavaSourceType.STRING;
691           sourceTypes[parseTypePos++] = JavaSourceType.BACKGROUND;
692           parseState = QU;
693           return;
694         }
695         sourceTypes[parseTypePos++] = JavaSourceType.STRING;
696         sourceTypes[parseTypePos++] = JavaSourceType.STRING;
697         parseState = QU;
698         return;
699       case CBJ1:
700         if (ch == EOT) {
701           parseState = PARSESTATE_FINISHED;
702           sourceTypes[parseTypePos++] = JavaSourceType.UNDEFINED;
703           sourceTypes[parseTypePos++] = JavaSourceType.UNDEFINED;
704           return;
705         }
706         if (ch == '*') {
707           parseState = CBJ2;
708           return;
709         }
710         if (isEmpty(ch)) {
711           sourceTypes[parseTypePos++] = JavaSourceType.COMMENT_BLOCK;
712           sourceTypes[parseTypePos++] = JavaSourceType.COMMENT_BLOCK;
713           sourceTypes[parseTypePos++] = JavaSourceType.BACKGROUND;
714           parseState = CB;
715           return;
716         }
717         sourceTypes[parseTypePos++] = JavaSourceType.COMMENT_BLOCK;
718         sourceTypes[parseTypePos++] = JavaSourceType.COMMENT_BLOCK;
719         sourceTypes[parseTypePos++] = JavaSourceType.COMMENT_BLOCK;
720         parseState = CB;
721         return;
722       case CBJ2:
723         if (ch == EOT) {
724           parseState = PARSESTATE_FINISHED;
725           sourceTypes[parseTypePos++] = JavaSourceType.UNDEFINED;
726           sourceTypes[parseTypePos++] = JavaSourceType.UNDEFINED;
727           sourceTypes[parseTypePos++] = JavaSourceType.UNDEFINED;
728           return;
729         }
730         if (ch == '/') {
731           parseState = COD;
732           sourceTypes[parseTypePos++] = JavaSourceType.COMMENT_BLOCK;
733           sourceTypes[parseTypePos++] = JavaSourceType.COMMENT_BLOCK;
734           sourceTypes[parseTypePos++] = JavaSourceType.COMMENT_BLOCK;
735           sourceTypes[parseTypePos++] = JavaSourceType.COMMENT_BLOCK;
736           return;
737         }
738         if (isEmpty(ch)) {
739           sourceTypes[parseTypePos++] = JavaSourceType.JAVADOC;
740           sourceTypes[parseTypePos++] = JavaSourceType.JAVADOC;
741           sourceTypes[parseTypePos++] = JavaSourceType.JAVADOC;
742           sourceTypes[parseTypePos++] = JavaSourceType.BACKGROUND;
743           parseState = CJ;
744           return;
745         }
746         sourceTypes[parseTypePos++] = JavaSourceType.JAVADOC;
747         sourceTypes[parseTypePos++] = JavaSourceType.JAVADOC;
748         sourceTypes[parseTypePos++] = JavaSourceType.JAVADOC;
749         sourceTypes[parseTypePos++] = JavaSourceType.JAVADOC;
750         parseState = CJ;
751         return;
752       case CH1:
753         if (ch == EOT) {
754           sourceTypes[parseTypePos++] = JavaSourceType.CODE;
755           parseState = PARSESTATE_FINISHED;
756           return;
757         }
758         if (ch == '\\') {
759           parseState = CH3;
760           return;
761         }
762         parseState = CH2;
763         return;
764       case CH2:
765         if (ch == EOT) {
766           sourceTypes[parseTypePos++] = JavaSourceType.CODE;
767           sourceTypes[parseTypePos++] = JavaSourceType.CODE;
768           parseState = PARSESTATE_FINISHED;
769           return;
770         }
771         if (ch == '\'') {
772           sourceTypes[parseTypePos++] = JavaSourceType.CHAR_CONSTANT;
773           sourceTypes[parseTypePos++] = JavaSourceType.CHAR_CONSTANT;
774           sourceTypes[parseTypePos++] = JavaSourceType.CHAR_CONSTANT;
775           parseState = COD;
776           return;
777         }
778         sourceTypes[parseTypePos++] = JavaSourceType.UNDEFINED;
779         sourceTypes[parseTypePos++] = JavaSourceType.UNDEFINED;
780         sourceTypes[parseTypePos++] = JavaSourceType.UNDEFINED;
781         parseState = COD;
782         return;
783       case CH3:
784         if (ch == EOT) {
785           sourceTypes[parseTypePos++] = JavaSourceType.CODE;
786           sourceTypes[parseTypePos++] = JavaSourceType.CODE;
787           parseState = PARSESTATE_FINISHED;
788           return;
789         }
790         if (ch == 'u') {
791           sourceTypes[parseTypePos++] = JavaSourceType.CHAR_CONSTANT;
792           sourceTypes[parseTypePos++] = JavaSourceType.CHAR_CONSTANT;
793           sourceTypes[parseTypePos++] = JavaSourceType.CHAR_CONSTANT;
794           parseState = CH5;
795           return;
796         }
797         if (ch >= '1' && ch <= '9') {
798           sourceTypes[parseTypePos++] = JavaSourceType.CHAR_CONSTANT;
799           sourceTypes[parseTypePos++] = JavaSourceType.CHAR_CONSTANT;
800           sourceTypes[parseTypePos++] = JavaSourceType.CHAR_CONSTANT;
801           parseState = CH6;
802           return;
803         }
804         parseState = CH4;
805         return;
806       case CH4:
807         if (ch == EOT) {
808           sourceTypes[parseTypePos++] = JavaSourceType.CODE;
809           sourceTypes[parseTypePos++] = JavaSourceType.CODE;
810           sourceTypes[parseTypePos++] = JavaSourceType.CODE;
811           parseState = PARSESTATE_FINISHED;
812           return;
813         }
814         if (ch == '\'') {
815           sourceTypes[parseTypePos++] = JavaSourceType.CHAR_CONSTANT;
816           sourceTypes[parseTypePos++] = JavaSourceType.CHAR_CONSTANT;
817           sourceTypes[parseTypePos++] = JavaSourceType.CHAR_CONSTANT;
818           sourceTypes[parseTypePos++] = JavaSourceType.CHAR_CONSTANT;
819           parseState = COD;
820           return;
821         }
822         sourceTypes[parseTypePos++] = JavaSourceType.CODE;
823         sourceTypes[parseTypePos++] = JavaSourceType.CODE;
824         sourceTypes[parseTypePos++] = JavaSourceType.CODE;
825         sourceTypes[parseTypePos++] = JavaSourceType.CODE;
826         parseState = COD;
827         return;
828       case CH6:
829         if (ch == EOT) {
830           parseState = PARSESTATE_FINISHED;
831           return;
832         }
833         if (ch == '\'') {
834           sourceTypes[parseTypePos++] = JavaSourceType.CHAR_CONSTANT;
835           parseState = COD;
836           return;
837         }
838         if (ch >= '0' && ch <= '9') {
839           sourceTypes[parseTypePos++] = JavaSourceType.CHAR_CONSTANT;
840           return;
841         }
842         sourceTypes[parseTypePos++] = JavaSourceType.UNDEFINED;
843         parseState = COD;
844         return;
845       case CH5:
846         if (ch == EOT) {
847           parseState = PARSESTATE_FINISHED;
848           return;
849         }
850         if (ch == '\'') {
851           sourceTypes[parseTypePos++] = JavaSourceType.CHAR_CONSTANT;
852           parseState = COD;
853           return;
854         }
855         if ((ch >= '0' && ch <= '9') || (ch >= 'a' && ch <= 'f') || (ch >= 'A' && ch <= 'F')) {
856           sourceTypes[parseTypePos++] = JavaSourceType.CHAR_CONSTANT;
857           return;
858         }
859         sourceTypes[parseTypePos++] = JavaSourceType.UNDEFINED;
860         parseState = COD;
861         return;
862     }
863   }
864
865   /**
866    * Second step for parsing. The categories from the first step are further
867    * divided: COMMENT_JAVADOC to COMMENT_JAVADOC and COMMENT_KEYWORD CODE to
868    * CODE, CODE_TYPE and CODE_KEYWORD
869    */

870   private void parseTwo() {
871     for (int index = 0; index < sourceTypes.length; ++index) {
872       if (sourceTypes[index] == JavaSourceType.CODE) {
873         if (isParenthesis(sourceCode.charAt(index))) {
874           mark(index, JavaSourceType.PARENTHESIS);
875         }
876       }
877     }
878
879     int start = 0;
880     int end = 0;
881
882     while (end < sourceTypes.length - 1) {
883       while (end < sourceTypes.length - 1 && sourceTypes[end + 1] == sourceTypes[start])
884         ++end;
885
886       parseTwo(start, end);
887
888       start = end + 1;
889       end = start;
890     }
891   }
892
893   private boolean isParenthesis(char ch) {
894     return ch == '{' || ch == '}' || ch == '[' || ch == ']' || ch == '(' || ch == ')';
895   }
896
897   private void parseTwo(int start, int end) {
898     if (sourceTypes[start] == JavaSourceType.JAVADOC) {
899       parseTwoCommentBlock(start, end);
900       return;
901     }
902     else if (sourceTypes[start] == JavaSourceType.CODE) {
903       parseTwoCode(start, end);
904       return;
905     }
906
907     //Keine weitere Unterteilung möglich
908
return;
909   }
910
911   /**
912    * Looks for primitive datatyes and keywords in the given region.
913    */

914   private void parseTwoCode(int start, int end) {
915     String JavaDoc code = sourceCode.substring(start, end + 1);
916
917     int index = start;
918     StringTokenizer JavaDoc st = new StringTokenizer JavaDoc(code, DELIMITERS, true);
919     while (st.hasMoreTokens()) {
920       String JavaDoc s = st.nextToken();
921       // if (s.length()==1){
922
// char ch=s.charAt(0);
923
// if (ch=='{' || ch=='}' ||
924
// ch=='[' || ch==']' ||
925
// ch=='(' || ch==')'){
926
// mark(index, JavaSourceType.PARENTHESIS);
927
// }
928
// ++index;
929
// }else{
930
//Keyword?
931
if (tableJavaKeywords.containsKey(s)) {
932         mark(index, index + s.length(), JavaSourceType.KEYWORD);
933         if (s.equals("package")) {
934           int i1 = sourceCode.indexOf(';', index + 1);
935           if (i1 != -1) {
936             source.getStatistic().setPackageName(sourceCode.substring(index + s.length(), i1).trim());
937           }
938         }
939       }
940       else {
941         //Datatype?
942
for (int i = 0; i < PRIMITIVE_DATATYPES.length; ++i) {
943           if (s.equals(PRIMITIVE_DATATYPES[i])) {
944             mark(index, index + s.length(), JavaSourceType.CODE_TYPE);
945             break;
946           }
947         }
948       }
949       index += s.length();
950       // }
951
}
952   }
953
954   /**
955    * Tries to find JavaDoc comment keywords and html tags @l
956    */

957   private void parseTwoCommentBlock(int start, int end) {
958     int i1 = indexOf('@', sourceCode, start, end);
959
960     while (i1 != -1 && i1 + 1 < end) {
961       int i2 = i1 + 1;
962
963       char ch = sourceCode.charAt(i2 + 1);
964       while (i2 < end && ((ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z'))) {
965         ch = sourceCode.charAt(++i2 + 1);
966       }
967
968       String JavaDoc s = sourceCode.substring(i1, i2 + 1);
969       //s is likely to be a valid JavaDoc-Tag
970

971       // if ((s.equals("@link") || s.equals("@linkplain"))
972
// && sourceCode.charAt(i1 - 1) == '{'
973
// && start > 0) {
974
// mark(i1 - 1, i1 + 5, JavaSourceType.JAVADOC_LINKS);
975
// }
976
// else
977
if (tableJavaDocKeywords.containsKey(s)) {
978         mark(i1, i2 + 1, JavaSourceType.JAVADOC_KEYWORD);
979       }
980
981       i1 = indexOf('@', sourceCode, i2, end);
982     }
983
984     //find html tags
985
i1 = indexOf('<', sourceCode, start, end);
986     while (i1 != -1 && i1 + 1 < end) {
987       int i2 = sourceCode.indexOf('>', i1 + 1);
988
989       // char ch=sourceCode.charAt(i2+1);
990
// while(i2<end && ch!='>'){
991
// ch=sourceCode.charAt(++i2+1);
992
// }
993
if (i2 == -1) {
994         i1 = -1;
995         break;
996       }
997       if (hasTypeOrEmpty(sourceTypes, i1, i2 + 1, JavaSourceType.JAVADOC)) {
998         mark(i1, i2 + 1, JavaSourceType.JAVADOC_HTML_TAG);
999       }
1000      i1 = indexOf('<', sourceCode, i2, end);
1001    }
1002  }
1003
1004  private static boolean hasTypeOrEmpty(
1005      JavaSourceType[] sourceTypes,
1006      int startIndex,
1007      int endIndex,
1008      JavaSourceType javaSourceType) {
1009
1010    for (int i = startIndex; i <= endIndex; ++i) {
1011      if (!sourceTypes[i].equals(javaSourceType) && !sourceTypes[i].equals(JavaSourceType.BACKGROUND)) {
1012        return false;
1013      }
1014    }
1015    return true;
1016  }
1017
1018  /**
1019   * Third step for parsing: Finding number constants. CODE is further divided
1020   * to CODE and NUM_CONSTANT
1021   */

1022  private void parseThree() {
1023    int start = 0;
1024    int end = 0;
1025
1026    while (end < sourceTypes.length - 1) {
1027      while (end < sourceTypes.length - 1 && sourceTypes[end + 1] == sourceTypes[start]) {
1028        ++end;
1029      }
1030
1031      if (sourceTypes[start] == JavaSourceType.CODE) {
1032        parseThree(start, end);
1033      }
1034
1035      start = end + 1;
1036      end = start;
1037    }
1038
1039    expandJavaDocLinks();
1040  }
1041
1042  private void expandJavaDocLinks() {
1043    expandEmbracedJavaDocTag("@link", JavaSourceType.JAVADOC_LINKS);
1044    expandEmbracedJavaDocTag("@linkplain", JavaSourceType.JAVADOC_LINKS);
1045  }
1046
1047  private void expandEmbracedJavaDocTag(String JavaDoc tag, JavaSourceType type) {
1048    String JavaDoc pattern = "{" + tag;
1049
1050    for (int index = 0; index < sourceTypes.length; ++index) {
1051      int start = sourceCode.indexOf(pattern, index);
1052      if (start == -1) {
1053        break;
1054      }
1055
1056      char ch = sourceCode.charAt(start + pattern.length());
1057      if (Character.isLetterOrDigit(ch)) {
1058        break;
1059      }
1060
1061      if (!checkRegion(start + 1, start + 1 + tag.length() - 1, new IJavaSourceTypeChecker() {
1062        public boolean isValid(JavaSourceType type) {
1063          return type.equals(JavaSourceType.JAVADOC_KEYWORD);
1064        }
1065      })) {
1066        break;
1067      }
1068
1069      int end = sourceCode.indexOf('}', start + pattern.length());
1070      if (end == -1) {
1071        break;
1072      }
1073
1074      //Check region, can only be JavaDoc and Background
1075
if (checkRegion(start + 1 + tag.length(), end, new IJavaSourceTypeChecker() {
1076        public boolean isValid(JavaSourceType type) {
1077          return type.equals(JavaSourceType.BACKGROUND) || type.equals(JavaSourceType.JAVADOC);
1078        }
1079      })) {
1080        markWithoutBackground(start, end, type);
1081      }
1082      index = end;
1083    }
1084
1085  }
1086
1087  private boolean checkRegion(int start, int end, IJavaSourceTypeChecker checker) {
1088    for (int i = start; i <= end; ++i) {
1089      if (!checker.isValid(sourceTypes[i])) {
1090        return false;
1091      }
1092    }
1093    return true;
1094  }
1095
1096  private void markWithoutBackground(int start, int end, JavaSourceType type) {
1097    for (int i = start; i <= end; ++i) {
1098      if (!sourceTypes[i].equals(JavaSourceType.BACKGROUND)) {
1099        sourceTypes[i] = type;
1100      }
1101    }
1102  }
1103
1104  /**
1105   * Looks for number constants (NUM_CONSTANT) in the selected region.
1106   */

1107  private void parseThree(int start, int end) {
1108    parseState = PARSESTATE_START;
1109    parseSourcePos = start;
1110    parseTypePos = start - 1;
1111    counter = 0;
1112
1113    while (parseState != PARSESTATE_FINISHED) {
1114      parseThreeDo(end);
1115    }
1116  }
1117
1118  /**
1119   * State-machine for NUM_CONSTANTs
1120   */

1121  private void parseThreeDo(int end) {
1122    char ch = EOT;
1123
1124    if (parseSourcePos <= end)
1125      ch = sourceCode.charAt(parseSourcePos);
1126
1127    ++parseSourcePos;
1128    ++parseTypePos;
1129
1130    switch (parseState) {
1131      case PARSESTATE_START:
1132        if (ch == EOT) {
1133          parseState = PARSESTATE_FINISHED;
1134          return;
1135        }
1136        if (ch == '.') {
1137          ++counter;
1138          parseState = PARSESTATE_DA;
1139          return;
1140        }
1141        if (ch == '0') {
1142          ++counter;
1143          parseState = PARSESTATE_HIA;
1144          return;
1145        }
1146        if (ch >= '1' && ch <= '9') {
1147          ++counter;
1148          parseState = PARSESTATE_NA;
1149          return;
1150        }
1151        if (isNumberDelimiter(ch)) {
1152          //stay in this parse state
1153
return;
1154        }
1155        parseState = PARSESTATE_NEUTRAL;
1156        return;
1157      case PARSESTATE_NEUTRAL:
1158        if (ch == EOT) {
1159          parseState = PARSESTATE_FINISHED;
1160          return;
1161        }
1162        if (isNumberDelimiter(ch)) {
1163          parseState = PARSESTATE_START;
1164          return;
1165        }
1166        return;
1167      case PARSESTATE_DA:
1168        if (ch == EOT) {
1169          parseState = PARSESTATE_FINISHED;
1170          return;
1171        }
1172        if (ch >= '0' && ch <= '9') {
1173          ++counter;
1174          parseState = PARSESTATE_NA;
1175          return;
1176        }
1177        if (isNumberDelimiter(ch)) {
1178          parseState = PARSESTATE_START;
1179          counter = 0;
1180          return;
1181        }
1182        parseState = PARSESTATE_NEUTRAL;
1183        counter = 0;
1184        return;
1185      case PARSESTATE_NA:
1186        if (ch == EOT) {
1187          parseState = PARSESTATE_FINISHED;
1188          mark(parseTypePos - counter, parseTypePos, JavaSourceType.NUM_CONSTANT);
1189          return;
1190        }
1191        if (ch == '.' || (ch >= '0' && ch <= '9')) {
1192          ++counter;
1193          return;
1194        }
1195        if (ch == 'e') {
1196          parseState = PARSESTATE_EXP;
1197          ++counter;
1198          return;
1199        }
1200        if (ch == 'f' || ch == 'F' || ch == 'd' || ch == 'D' || ch == 'l' || ch == 'L') {
1201          ++counter;
1202          mark(parseTypePos - counter + 1, parseTypePos + 1, JavaSourceType.NUM_CONSTANT);
1203          parseState = PARSESTATE_NEUTRAL;
1204          counter = 0;
1205          return;
1206        }
1207        if (isNumberDelimiter(ch)) {
1208          parseState = PARSESTATE_START;
1209          mark(parseTypePos - counter, parseTypePos, JavaSourceType.NUM_CONSTANT);
1210          counter = 0;
1211          return;
1212        }
1213        mark(parseTypePos - counter, parseTypePos, JavaSourceType.NUM_CONSTANT);
1214        parseState = PARSESTATE_NEUTRAL;
1215        counter = 0;
1216        return;
1217      case PARSESTATE_HIA:
1218        if (ch == EOT) {
1219          parseState = PARSESTATE_FINISHED;
1220          mark(parseTypePos - counter, parseTypePos, JavaSourceType.NUM_CONSTANT);
1221          return;
1222        }
1223        if (ch == 'x' || ch == 'X') {
1224          parseState = PARSESTATE_HEX;
1225          ++counter;
1226          return;
1227        }
1228        if (ch == '.' || (ch >= '0' && ch <= '9')) {
1229          ++counter;
1230          parseState = PARSESTATE_NA;
1231          return;
1232        }
1233        if (ch == 'f' || ch == 'F' || ch == 'd' || ch == 'D' || ch == 'l' || ch == 'L') {
1234          ++counter;
1235          mark(parseTypePos - counter + 1, parseTypePos + 1, JavaSourceType.NUM_CONSTANT);
1236          parseState = PARSESTATE_NEUTRAL;
1237          counter = 0;
1238          return;
1239        }
1240        if (isNumberDelimiter(ch)) {
1241          parseState = PARSESTATE_START;
1242          mark(parseTypePos - counter, parseTypePos, JavaSourceType.NUM_CONSTANT);
1243          counter = 0;
1244          return;
1245        }
1246        mark(parseTypePos - counter, parseTypePos, JavaSourceType.NUM_CONSTANT);
1247        parseState = PARSESTATE_NEUTRAL;
1248        counter = 0;
1249        return;
1250      case PARSESTATE_HEX:
1251        if (ch == EOT) {
1252          parseState = PARSESTATE_FINISHED;
1253          mark(parseTypePos - counter, parseTypePos, JavaSourceType.NUM_CONSTANT);
1254          return;
1255        }
1256        if ((ch >= '0' && ch <= '9') || (ch >= 'a' && ch <= 'f') || (ch >= 'A' && ch <= 'F')) {
1257          ++counter;
1258          parseState = PARSESTATE_HEX;
1259          return;
1260        }
1261        if (ch == 'l' || ch == 'L') {
1262          ++counter;
1263          mark(parseTypePos - counter + 1, parseTypePos + 1, JavaSourceType.NUM_CONSTANT);
1264          parseState = PARSESTATE_NEUTRAL;
1265          counter = 0;
1266          return;
1267        }
1268        if (isNumberDelimiter(ch)) {
1269          parseState = PARSESTATE_START;
1270          mark(parseTypePos - counter, parseTypePos, JavaSourceType.NUM_CONSTANT);
1271          counter = 0;
1272          return;
1273        }
1274        mark(parseTypePos - counter, parseTypePos, JavaSourceType.NUM_CONSTANT);
1275        parseState = PARSESTATE_NEUTRAL;
1276        counter = 0;
1277        return;
1278      case PARSESTATE_EXP:
1279        if (ch == EOT) {
1280          parseState = PARSESTATE_FINISHED;
1281          mark(parseTypePos - counter, parseTypePos - 1, JavaSourceType.NUM_CONSTANT);
1282          return;
1283        }
1284        if ((ch >= '0' && ch <= '9') || ch == '+' || ch == '-') {
1285          ++counter;
1286          parseState = PARSESTATE_NA;
1287          return;
1288        }
1289        if (isNumberDelimiter(ch)) {
1290          parseState = PARSESTATE_START;
1291          mark(parseTypePos - counter, parseTypePos - 1, JavaSourceType.NUM_CONSTANT);
1292          counter = 0;
1293          return;
1294        }
1295        mark(parseTypePos - counter, parseTypePos - 1, JavaSourceType.NUM_CONSTANT);
1296        parseState = PARSESTATE_NEUTRAL;
1297        counter = 0;
1298        return;
1299    }
1300  }
1301
1302  /**
1303   * Marks the specified region int the source code to the given type.
1304   */

1305  private void mark(int start, int endPlusOne, JavaSourceType type) {
1306    for (int i = start; i < endPlusOne; ++i) {
1307      sourceTypes[i] = type;
1308    }
1309  }
1310
1311  /**
1312   * Marks the character at the specified index to the given type.
1313   */

1314  private void mark(int index, JavaSourceType type) {
1315    sourceTypes[index] = type;
1316  }
1317
1318  //public static void main(String args[]){
1319
// JavaSource j=new JavaSource(new java.io.File("JavaSourceParser.java"));
1320
//
1321
// JavaSourceParser parser=new JavaSourceParser(j);
1322
//
1323
// parser.sourceCode=parser.source.getCode();
1324
// parser.sourceTypes=new byte[parser.sourceCode.length()];
1325
//
1326
// long time0=System.currentTimeMillis();
1327
// parser.parseOne();
1328
// long time1=System.currentTimeMillis();
1329
// parser.parseTwo();
1330
// long time2=System.currentTimeMillis();
1331
// parser.parseThree();
1332
// long time3=System.currentTimeMillis();
1333
//
1334
// System.out.println("Parse1: "+(time1-time0)+"ms");
1335
// System.out.println(" 2: "+(time2-time1)+"ms");
1336
// System.out.println(" 3: "+(time3-time2)+"ms");
1337
//}
1338

1339}
Popular Tags