KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > com > puppycrawl > tools > checkstyle > checks > duplicates > StrictDuplicateCodeCheck


////////////////////////////////////////////////////////////////////////////////
// checkstyle: Checks Java source code for adherence to a set of rules.
// Copyright (C) 2001-2005  Oliver Burn
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
// Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this library; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
////////////////////////////////////////////////////////////////////////////////
19
package com.puppycrawl.tools.checkstyle.checks.duplicates;

import java.io.File;
import java.io.IOException;
import java.util.Arrays;

import com.puppycrawl.tools.checkstyle.api.AbstractFileSetCheck;
import com.puppycrawl.tools.checkstyle.api.MessageDispatcher;
import com.puppycrawl.tools.checkstyle.api.Utils;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
30
31 /**
32  * Performs a line-by-line comparison of all code lines and reports
33  * duplicate code if a sequence of lines differs only in
34  * indentation. All import statements in Java code are ignored, any
35  * other line - including javadoc, whitespace lines between methods,
36  * etc. - is considered (which is why the check is called
37  * <em>strict</em>).
38  *
39  * @author Lars K&uuml;hne
40  */

41 public final class StrictDuplicateCodeCheck extends AbstractFileSetCheck
42 {
43     /**
44      * Converts each of the original source lines
45      * to a checksum that is checked against to find duplicates.
46      */

47     private interface ChecksumGenerator
48     {
49         /**
50          * Convert each of the original source lines
51          * to a checksum that is checked against to find duplicates
52          * Typically this involves stripping whitespace.
53          * @param aOriginalLines the original lines as they appear in the source
54          * @return the converted line or null if the line should be ignored
55          */

56         long[] convertLines(String JavaDoc[] aOriginalLines);
57     }
58
59
60     /**
61      * Calculates checksums for text files.
62      * Removes leading and trainling whitespace.
63      */

64     private class TextfileChecksumGenerator implements ChecksumGenerator
65     {
66         /** {@inheritDoc} */
67         public long[] convertLines(String JavaDoc[] aOriginalLines)
68         {
69             final long[] checkSums = new long[aOriginalLines.length];
70             for (int i = 0; i < aOriginalLines.length; i++) {
71                 final String JavaDoc line = aOriginalLines[i].trim();
72                 checkSums[i] = calcChecksum(line);
73             }
74             return checkSums;
75         }
76
77         /**
78          * Computes a checksum for a a single line of text.
79          * @param aLine the aLine
80          * @return checksum
81          */

82         protected long calcChecksum(String JavaDoc aLine)
83         {
84             // important that it's larger than the length of most lines
85
// see http://www.utm.edu/research/primes/lists/small/1000.txt
86
final int bigPrime = 317;
87
88             // TODO: Not sure that this algorithm makes it
89
// sufficiently improbable to get false alarms
90
long result = 0;
91             for (int i = 0; i < aLine.length(); i++) {
92                 final long c = aLine.charAt(i);
93                 result += bigPrime * i + c;
94             }
95             return result;
96         }
97     }
98
99     /**
100      * A TextfileChecksumGenerator that also ignores imports.
101      */

102     private class JavaChecksumGenerator extends TextfileChecksumGenerator
103     {
104         // TODO: return IGNORE for lines in the header comment?
105
// That would require some simple parsing...
106

107         // we could also parse the java code using the TreeWalker
108
// and then ignore everything before the CLASS_DEF...
109

110
111         /**
112          * computes a checksum for a aLine. to avoid false alarms it is
113          * important that different lines result in different checksums.
114          * @param aLine the aLine
115          * @return checksum
116          */

117         protected long calcChecksum(String JavaDoc aLine)
118         {
119             if (aLine.startsWith("import ")) {
120                 return IGNORE;
121             }
122             return super.calcChecksum(aLine);
123         }
124     }
125
126     /** a jakarta commons log */
127     private static final Log LOG =
128             LogFactory.getLog(StrictDuplicateCodeCheck.class);
129
130     /** the checksum value to use for lines that should be ignored */
131     private static final long IGNORE = Long.MIN_VALUE;
132
133     /** default value for mMin */
134     private static final int DEFAULT_MIN_DUPLICATE_LINES = 12;
135
136     /** number of lines that have to be idential for reporting duplicates */
137     private int mMin = DEFAULT_MIN_DUPLICATE_LINES;
138
139     /** the basedir to strip off in filenames */
140     private String JavaDoc mBasedir;
141
142     /** the checksums of all files that are currently checked */
143     private long[][] mLineChecksums;
144
145     /** helper to speed up searching algorithm */
146     private long[][] mSortedRelevantChecksums;
147
148     /** files that are currently checked */
149     private File JavaDoc[] mFiles;
150
151     // fields required only for statistics
152

153     /** total number of duplicates found */
154     private int mDuplicates;
155
156     /** lines of code that have been checked */
157     private int mLoc;
158
159     /** number of chache misses */
160     private long mCacheMisses;
161
162     /** number of cache hits */
163     private long mCacheHits;
164
165     /** Creates a new instance of this class. */
166     public StrictDuplicateCodeCheck()
167     {
168     }
169
170     /**
171      * Sets the minimum number of lines that must be equivalent
172      * before the check complains.
173      *
174      * @param aMin the number of lines that must be equal before
175      * triggering a 'duplicate code' message.
176      */

177     public void setMin(int aMin)
178     {
179         mMin = aMin;
180     }
181
182     /** @param aBasedir the base directory to strip off in filenames */
183     public void setBasedir(String JavaDoc aBasedir)
184     {
185         mBasedir = aBasedir;
186     }
187
188     /**
189      * {@inheritDoc}
190      */

191     public synchronized void process(File JavaDoc[] aFiles)
192     {
193         final long start = System.currentTimeMillis();
194         mLoc = 0;
195         mDuplicates = 0;
196         mFiles = filter(aFiles);
197         mLineChecksums = new long[mFiles.length][];
198         mSortedRelevantChecksums = new long[mFiles.length][];
199
200         if (LOG.isDebugEnabled()) {
201             LOG.debug("Reading input files");
202         }
203
204         for (int i = 0; i < mFiles.length; i++) {
205             try {
206                 final File JavaDoc file = mFiles[i];
207                 final String JavaDoc[] lines =
208                     Utils.getLines(file.getPath(), getCharset());
209                 final ChecksumGenerator transformer =
210                     findChecksumGenerator(file);
211                 mLineChecksums[i] = transformer.convertLines(lines);
212             }
213             catch (final IOException JavaDoc ex) {
214                 LOG.error("Cannot access files to check, giving up: "
215                         + ex.getMessage(), ex);
216                 // TODO: better to throw an exception here?
217
// it would be best to throw IOException from process(),
218
// but interface definition doesn't allow that...
219
mLineChecksums = new long[0][0];
220             }
221         }
222         fillSortedRelevantChecksums();
223
224         final long endReading = System.currentTimeMillis();
225         findDuplicates();
226         final long endSearching = System.currentTimeMillis();
227
228         dumpStats(start, endReading, endSearching);
229
230         mLineChecksums = null;
231         mSortedRelevantChecksums = null;
232     }
233
234     /**
235      * Finds the Checksum generator for a given file.
236      *
237      * @param aFile the file to check for duplicates
238      * @return a generator to use for aFile
239      */

240     private ChecksumGenerator findChecksumGenerator(File JavaDoc aFile)
241     {
242         if (aFile.getName().endsWith(".java")) {
243             return new JavaChecksumGenerator();
244         }
245         // TODO: Not sure what to do with binary files such as gifs
246
return new TextfileChecksumGenerator();
247     }
248
249     /**
250      * Dump out statistics data on stderr.
251      * @param aStart start time
252      * @param aEndReading end time of reading phsical files
253      * @param aEndSearching end time duplicate analysis
254      */

255     private void dumpStats(long aStart, long aEndReading, long aEndSearching)
256     {
257         if (LOG.isDebugEnabled()) {
258             final long cacheLookups = mCacheHits + mCacheMisses;
259             final long initTime = aEndReading - aStart;
260             final long workTime = aEndSearching - aEndReading;
261             LOG.debug("cache hits = " + mCacheHits + "/" + cacheLookups);
262             LOG.debug("files = " + mFiles.length);
263             LOG.debug("loc = " + mLoc);
264             LOG.debug("duplicates = " + mDuplicates);
265             LOG.debug("Runtime = " + initTime + " + " + workTime);
266         }
267     }
268
269     /**
270      * filters and sorts the relevant lines and stores the result
271      * in sortedRelevantChecksums during the setup phase.
272      * That data is later used in a binary search to find out
273      * if it is worth investigating a file for duplicates of a block.
274      * If one of the lines in the block does not occur in the other file
275      * at all, we can skip that file quickly.
276      */

277     private void fillSortedRelevantChecksums()
278     {
279         for (int i = 0; i < mLineChecksums.length; i++) {
280             int count = 0;
281             final long[] checksums = mLineChecksums[i];
282             final long[] relevant = new long[checksums.length];
283             for (int j = 0; j < checksums.length; j++) {
284                 final long checksum = checksums[j];
285                 if (checksum != IGNORE) {
286                     relevant[count++] = checksum;
287                 }
288             }
289             Arrays.sort(relevant, 0, count);
290             final long[] result = new long[count];
291             System.arraycopy(relevant, 0, result, 0, count);
292             mSortedRelevantChecksums[i] = result;
293         }
294     }
295
296     /**
297      * finds duplicate lines in mFiles,
298      * using a textsearch algorithm to find reoccuring
299      * patters in the lineChecksums.
300      */

301     private void findDuplicates()
302     {
303         if (LOG.isDebugEnabled()) {
304             LOG.debug("Analysis phase");
305         }
306
307         // It's been a while since my CS degree, but I think this is
308
// somewhere near O(mMax * LOC^2).
309

310         // It may be possible to do this *much* smarter,
311
// but I don't have the Knuth bible at hand right now :-)
312

313         // OK, prepare for some nested loops... :-(
314

315         for (int i = 0; i < mFiles.length; i++) {
316
317             final String JavaDoc path = mFiles[i].getPath();
318
319             getMessageCollector().reset();
320             final MessageDispatcher dispatcher = getMessageDispatcher();
321             dispatcher.fireFileStarted(path);
322
323             mLoc += mLineChecksums[i].length;
324             for (int j = 0; j <= i; j++) {
325                 findDuplicatesInFiles(i, j);
326             }
327
328             fireErrors(path);
329             dispatcher.fireFileFinished(path);
330         }
331     }
332
333     /**
334      * Compare two files and search for duplicates.
335      * @param aI mLineChecksums index of the first file to compare
336      * @param aJ mLineChecksums index of the seconds file to compare
337      */

338     private void findDuplicatesInFiles(int aI, int aJ)
339     {
340         final int iFileLength = mLineChecksums[aI].length;
341
342         // build up some supporting data structures
343
final boolean[] iLineOccurInJ = new boolean[iFileLength];
344         for (int iLine = 0; iLine < iFileLength; iLine++) {
345             iLineOccurInJ[iLine] = (Arrays.binarySearch(
346                 mSortedRelevantChecksums[aJ], mLineChecksums[aI][iLine]) >= 0);
347         }
348
349         // go through all the lines in iFile and check if the following
350
// mMin lines occur in jFile
351
for (int iLine = 0; iLine < iFileLength - mMin; iLine++) {
352
353             // fast exit if one of the lines does not occur in jFile at all
354
boolean fastExit = false;
355             final int kLimit = iFileLength - iLine;
356             for (int k = 0; k < Math.min(mMin, kLimit); k++) {
357                 if (!iLineOccurInJ[iLine + k]) {
358                     fastExit = true;
359                     break;
360                 }
361             }
362
363             if (!fastExit) {
364                 // all lines do occur -> brute force searching
365
mCacheMisses += 1;
366                 iLine = findDuplicateFromLine(aI, aJ, iLine);
367             }
368             else {
369                 mCacheHits += 1;
370             }
371         }
372     }
373
374     /**
375      * Find and report a duplicate of the code starting from line aILine
376      * in file aI in the file aJ
377      * @param aI index of file that contains the candidate code
378      * @param aJ index of file that is searched for a dup of the candidate
379      * @param aILine starting line of the candidate in aI
380      * @return the next line in file i where
381      * starting to search will make sense
382      */

383     private int findDuplicateFromLine(int aI, int aJ, int aILine)
384     {
385         // Using something more advanced like Boyer-Moore might be a
386
// good idea...
387

388         final int iFileLength = mLineChecksums[aI].length;
389         final int jFileLength = mLineChecksums[aJ].length;
390
391         for (int jLine = 0; jLine < jFileLength - mMin; jLine++) {
392
393             if ((aI == aJ) && (aILine == jLine)) {
394                 continue;
395             }
396
397             int equivalent = 0;
398             while ((aILine + equivalent < iFileLength)
399                     && (jLine + equivalent < jFileLength)
400                     && (mLineChecksums[aI][aILine + equivalent] != IGNORE)
401                     && (mLineChecksums[aI][aILine + equivalent]
402                        == mLineChecksums[aJ][jLine + equivalent]))
403             {
404                 equivalent += 1;
405             }
406
407             if (((aI != aJ) || (aILine < jLine)) && (equivalent >= mMin)) {
408                 reportDuplicate(equivalent, aILine, mFiles[aJ], jLine);
409                 aILine += equivalent; // skip to end of equivalent section
410
}
411         }
412         return aILine;
413     }
414
415     /**
416      * Dumps out a duplicate report.
417      * @param aEquivalent number of equivalent lines
418      * @param aILine location of duplicate code
419      * within file that is currently checked
420      * @param aJFile the other file that contains the duplicate
421      * @param aJLine location of duplicate code within aJFile
422      */

423     private void reportDuplicate(
424             int aEquivalent, int aILine, File JavaDoc aJFile, int aJLine)
425     {
426         final Integer JavaDoc dupLines = new Integer JavaDoc(aEquivalent);
427         final Integer JavaDoc startLine = new Integer JavaDoc(aJLine + 1);
428         final String JavaDoc fileName =
429                 Utils.getStrippedFileName(mBasedir, aJFile.getPath());
430         log(aILine + 1, "duplicates.lines",
431                 new Object JavaDoc[]{dupLines, fileName, startLine});
432         mDuplicates += 1;
433     }
434
435 }
436
Popular Tags