KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > htmlparser > lexerapplications > tabby > Tabby


// HTMLParser Library $Name: v1_5_20050313 $ - A java-based parser for HTML
// http://sourceforge.org/projects/htmlparser
// Copyright (C) 2003 Derrick Oswald
//
// Revision Control Information
//
// $Source: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/lexerapplications/tabby/Tabby.java,v $
// $Author: derrickoswald $
// $Date: 2005/03/13 14:51:44 $
// $Revision: 1.3 $
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
// Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this library; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
//

27 package org.htmlparser.lexerapplications.tabby;
28
29 import java.io.File JavaDoc;
30 import java.io.FileFilter JavaDoc;
31 import java.io.FileInputStream JavaDoc;
32 import java.io.FileOutputStream JavaDoc;
33
34 import java.util.regex.Matcher JavaDoc;
35 import java.util.regex.Pattern JavaDoc;
36 import java.util.regex.PatternSyntaxException JavaDoc;
37
38 import org.htmlparser.lexer.Cursor;
39 import org.htmlparser.lexer.Page;
40
41 /**
42  * Replace tabs with spaces.
43  * Convert tabs to the correct number of spaces according to a tabstop,
44  * change DOS \r\n line endings to Unix \n form, and remove trailing whitespace
45  */

46 public class Tabby
47 {
48     /**
49      * The default tab stop spacing.
50      */

51     private static final int DEFAULT_TABSTOP = 4;
52
53     /**
54      * The file filter to apply.
55      */

56     protected Filter JavaDoc mFilter;
57
58     /**
59      * The replacement tab stop size.
60      */

61     protected int mTabsize;
62
63     /**
64      * Creates a new instance of Tabby with no file filter and a tab stop of 4.
65      */

66     public Tabby ()
67     {
68         mFilter = null;
69         mTabsize = DEFAULT_TABSTOP;
70     }
71
72     /**
73      * Creates a new instance of Tabby using the given regular expression and
74      * a tab stop of 4.
75      * @param filter The regular expression to apply to the files searched.
76      */

77     public Tabby (final String JavaDoc filter)
78     {
79         this ();
80         mFilter = new Filter JavaDoc (filter);
81     }
82
83     /** Creates a new instance of Tabby.
84      * @param filter The regular expression to apply to the files searched.
85      * @param tabsize The tab stop setting.
86      * @exception IllegalArgumentException If tabsize is not a positive number.
87      */

88     public Tabby (final String JavaDoc filter, final int tabsize)
89         throws
90             IllegalArgumentException JavaDoc
91     {
92         this (filter);
93         if (0 >= tabsize)
94             throw new IllegalArgumentException JavaDoc ("tab size cannot be negative");
95         mTabsize = tabsize;
96     }
97
98     /**
99      * Process the file or directory.
100      * @param file The file to process.
101      */

102     protected void process (final File JavaDoc file)
103     {
104         File JavaDoc[] files;
105
106         if (file.isDirectory ())
107         {
108             files = file.listFiles (mFilter);
109             for (int i = 0; i < files.length; i++)
110                 process (files[i]);
111         }
112         else
113             edit (file);
114     }
115
116     /**
117      * Process the file or directory.
118      * @param file The file to edit.
119      */

120     protected void edit (final File JavaDoc file)
121     {
122         FileInputStream JavaDoc in;
123         Page page;
124         Cursor cursor;
125         int position;
126         int expected;
127         boolean modified;
128         char ch;
129         int last;
130         StringBuffer JavaDoc buffer;
131         FileOutputStream JavaDoc out;
132
133         try
134         {
135             in = new FileInputStream JavaDoc (file);
136             buffer = new StringBuffer JavaDoc (in.available ());
137             try
138             {
139                 page = new Page (in, null);
140                 cursor = new Cursor (page, 0);
141                 position = 0;
142                 modified = false;
143                 expected = 0;
144                 last = -1;
145                 while (Page.EOF != (ch = page.getCharacter (cursor)))
146                 {
147                     if (++expected != cursor.getPosition ())
148                     {
149                         modified = true;
150                         expected = cursor.getPosition ();
151                     }
152                     if ('\t' == ch)
153                     {
154                         do
155                         {
156                             buffer.append (' ');
157                             position++;
158                         }
159                         while (0 != (position % mTabsize));
160                         modified = true;
161                     }
162                     else if ('\n' == ch)
163                     {
164                         // check for whitespace on the end of the line
165
if (last + 1 != position)
166                         {
167                             // remove trailing whitespace
168
last = buffer.length () - (position - last - 1);
169                             buffer.setLength (last);
170                             modified = true;
171                         }
172                         buffer.append (ch);
173                         position = 0;
174                         last = -1;
175                     }
176                     else
177                     {
178                         buffer.append (ch);
179                         if (!Character.isWhitespace (ch))
180                             last = position;
181                         position++;
182                     }
183                 }
184             }
185             finally
186             {
187                 in.close ();
188             }
189             if (modified)
190             {
191                 System.out.println (file.getAbsolutePath ());
192                 out = new FileOutputStream JavaDoc (file);
193                 out.write (buffer.toString ().getBytes (Page.DEFAULT_CHARSET));
194                 out.close ();
195             }
196         }
197         catch (Exception JavaDoc e)
198         {
199             System.out.println (e);
200         }
201     }
202
203     /**
204      * Implement a file filter.
205      */

206     class Filter implements FileFilter JavaDoc
207     {
208         /**
209          * The compiled expression.
210          */

211         protected Pattern JavaDoc mExpression;
212
213         /**
214          * Create a file filter from the regular expression.
215          * @param expression The <a HREF="http://java.sun.com/j2se/1.4.2/docs/api/java/util/regex/Pattern.html#sum">regular expression</a>.
216          * A useful regular expression is ".*\.java" which accepts all
217          * .java files.
218          * @exception IllegalArgumentException If the expression is
219          * <code>null</code>.
220          * @exception PatternSyntaxException If the expression is not a valid
221          * regular expression.
222          */

223         public Filter (final String JavaDoc expression)
224             throws
225                 PatternSyntaxException JavaDoc
226         {
227             if (null == expression)
228                 throw new IllegalArgumentException JavaDoc (
229                     "filter expression cannot be null");
230             mExpression = Pattern.compile (expression);
231         }
232
233         //
234
// FileFilter interface
235
//
236

237         /**
238          * Tests whether or not the file should be included in a pathname list.
239          * @param pathname The abstract pathname to be tested.
240          * @return <code>true</code> if and only if <code>pathname</code>
241          * should be included.
242          */

243         public boolean accept (final File JavaDoc pathname)
244         {
245             Matcher JavaDoc matcher;
246             boolean ret;
247
248             // match directories
249
if (pathname.isDirectory ())
250                 ret = true;
251             else
252             {
253                 matcher = mExpression.matcher (pathname.getAbsolutePath ());
254                 ret = matcher.matches ();
255             }
256
257             return (ret);
258         }
259     }
260
261     /**
262      * Run Tabby on a file or directory.
263      * @param args The command line arguments.
264      * <PRE>
265      * args[0] The file or directory to work on.
266      * args[1] Optional, the regular expression to use as a file filter
267      * args[2] Optional, the tab stop setting (integer).
268      * </PRE>
269      */

270     public static void main (final String JavaDoc[] args)
271     {
272         Tabby tabby;
273         File JavaDoc file;
274
275         if (0 == args.length)
276             System.out.println (
277                   "usage: Tabby (<directory>|<file>)"
278                 + " [file-match regexp] [tabsize]");
279         else
280         {
281             if (2 < args.length)
282                 tabby = new Tabby (args[1], Integer.parseInt (args[2]));
283             else
284                 if (1 < args.length)
285                     tabby = new Tabby (args[1]);
286                 else
287                     tabby = new Tabby ();
288             file = new File JavaDoc (args[0]);
289             tabby.process (file);
290         }
291     }
292 }
293
/*
 * Revision Control Modification History
 *
 * $Log: Tabby.java,v $
 * Revision 1.3 2005/03/13 14:51:44 derrickoswald
 * Bug #1121401 No Parsing with yahoo!
 * By default nio.charset.CharsetDecoder replaces characters it cannot
 * represent in the current encoding with zero, which was the value
 * returned by the page when the Stream reached EOF.
 * This changes the Page return value to (char)Source.EOF (-1) when
 * the end of stream is encountered.
 *
 * Revision 1.2 2004/07/31 16:42:34 derrickoswald
 * Remove unused variables and other fixes exposed by turning on compiler warnings.
 *
 * Revision 1.1 2003/09/10 03:38:26 derrickoswald
 * Add style checking target to ant build script:
 * ant checkstyle
 * It uses a jar from http://checkstyle.sourceforge.net which is dropped in the lib directory.
 * The rules are in the file htmlparser_checks.xml in the src directory.
 *
 * Added lexerapplications package with Tabby as the first app. It performs whitespace manipulation
 * on source files to follow the style rules. This reduced the number of style violations to roughly 14,000.
 *
 * There are a few issues with the style checker that need to be resolved before it should be taken too seriously.
 * For example:
 * It thinks all method arguments should be final, even if they are modified by the code (which the compiler frowns on).
 * It complains about long lines, even when there is no possibility of wrapping the line, i.e. a URL in a comment
 * that's more than 80 characters long.
 * It considers all naked integers as 'magic numbers', even when they are obvious, i.e. the 4 corners of a box.
 * It complains about whitespace following braces, even in array initializers, i.e. X[][] = { {a, b} { } }
 *
 * But it points out some really interesting things, even if you don't agree with the style guidelines,
 * so it's worth a look.
 *
 *
 */

331
Popular Tags