KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > archive > util > iterator > RegexpLineIterator


/* LineReadingIterator
 *
 * $Id: RegexpLineIterator.java,v 1.2.14.1 2007/01/13 01:31:40 stack-sf Exp $
 *
 * Created on Jul 27, 2004
 *
 * Copyright (C) 2004 Internet Archive.
 *
 * This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 * Heritrix is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser Public License as published by
 * the Free Software Foundation; either version 2.1 of the License, or
 * any later version.
 *
 * Heritrix is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser Public License for more details.
 *
 * You should have received a copy of the GNU Lesser Public License
 * along with Heritrix; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 */

package org.archive.util.iterator;

import java.util.Iterator;
import java.util.logging.Logger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

32 /**
33  * Utility class providing an Iterator interface over line-oriented
34  * text input. By providing regexps indicating lines to ignore
35  * (such as pure whitespace or comments), lines to consider input, and
36  * what to return from the input lines (such as a whitespace-trimmed
37  * non-whitespace token with optional trailing comment), this can
38  * be configured to handle a number of formats.
39  *
40  * The public static members provide pattern configurations that will
41  * be helpful in a wide variety of contexts.
42  *
43  * @author gojomo
44  */

45 public class RegexpLineIterator
46 extends TransformingIteratorWrapper<String JavaDoc,String JavaDoc> {
47     private static final Logger JavaDoc logger =
48         Logger.getLogger(RegexpLineIterator.class.getName());
49
50     public static final String JavaDoc COMMENT_LINE = "\\s*(#.*)?";
51     public static final String JavaDoc NONWHITESPACE_ENTRY_TRAILING_COMMENT =
52         "^\\s*(\\S+)\\s*(#.*)?$";
53     public static final String JavaDoc TRIMMED_ENTRY_TRAILING_COMMENT =
54         "^\\s*([^#]+?)\\s*(#.*)?$";
55
56     public static final String JavaDoc ENTRY = "$1";
57
58     protected Matcher JavaDoc ignoreLine = null;
59     protected Matcher JavaDoc extractLine = null;
60     protected String JavaDoc outputTemplate = null;
61
62
63     public RegexpLineIterator(Iterator JavaDoc<String JavaDoc> inner, String JavaDoc ignore,
64             String JavaDoc extract, String JavaDoc replace) {
65         this.inner = inner;
66         ignoreLine = Pattern.compile(ignore).matcher("");
67         extractLine = Pattern.compile(extract).matcher("");
68         outputTemplate = replace;
69     }
70
71     /**
72      * Loads next item into lookahead spot, if available. Skips
73      * lines matching ignoreLine; extracts desired portion of
74      * lines matching extractLine; informationally reports any
75      * lines matching neither.
76      *
77      * @return whether any item was loaded into next field
78      */

79     protected String JavaDoc transform(String JavaDoc line) {
80         ignoreLine.reset(line);
81         if(ignoreLine.matches()) {
82             return null;
83         }
84         extractLine.reset(line);
85         if(extractLine.matches()) {
86             StringBuffer JavaDoc output = new StringBuffer JavaDoc();
87             // TODO: consider if a loop that find()s all is more
88
// generally useful here
89
extractLine.appendReplacement(output,outputTemplate);
90             return output.toString();
91         }
92         // no match; possibly error
93
logger.info("nonsense line: "+line);
94         return null;
95     }
96 }
97
Popular Tags