KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > archive > util > anvl > ANVLRecord


1 /* ANVLRecord
2 *
3 * $Id: ANVLRecord.java,v 1.10 2006/08/26 00:33:37 stack-sf Exp $
4 *
5 * Created on July 26, 2006.
6 *
7 * Copyright (C) 2006 Internet Archive.
8 *
9 * This file is part of the Heritrix web crawler (crawler.archive.org).
10 *
11 * Heritrix is free software; you can redistribute it and/or modify
12 * it under the terms of the GNU Lesser Public License as published by
13 * the Free Software Foundation; either version 2.1 of the License, or
14 * any later version.
15 *
16 * Heritrix is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 * GNU Lesser Public License for more details.
20 *
21 * You should have received a copy of the GNU Lesser Public License
22 * along with Heritrix; if not, write to the Free Software
23 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
24 */

25 package org.archive.util.anvl;
26
27 import java.io.ByteArrayOutputStream JavaDoc;
28 import java.io.IOException JavaDoc;
29 import java.io.InputStream JavaDoc;
30 import java.io.UnsupportedEncodingException JavaDoc;
31 import java.util.ArrayList JavaDoc;
32 import java.util.Collection JavaDoc;
33 import java.util.HashMap JavaDoc;
34 import java.util.Iterator JavaDoc;
35 import java.util.List JavaDoc;
36 import java.util.Map JavaDoc;
37
38 import org.archive.io.UTF8Bytes;
39
40 /**
41  * An ordered {@link List} with 'data' {@link Element} values.
42  * ANVLRecords end with a blank line.
43  *
44  * @see <a
45  * HREF="http://www.cdlib.org/inside/diglib/ark/anvlspec.pdf">A Name-Value
46  * Language (ANVL)</a>
47  * @author stack
48  */

49 public class ANVLRecord extends ArrayList JavaDoc<Element> implements UTF8Bytes {
50     private static final long serialVersionUID = -4610638888453052958L;
51     
52     public static final String JavaDoc MIMETYPE = "text/anvl";
53     
54     public static final ANVLRecord EMPTY_ANVL_RECORD = new ANVLRecord();
55     
56     /**
57      * Arbitrary upper bound on maximum size of ANVL Record.
58      * Will throw an IOException if exceed this size.
59      */

60     public static final long MAXIMUM_SIZE = 1024 * 10;
61     
62     /**
63      * An ANVL 'newline'.
64      * @see <a HREF="http://en.wikipedia.org/wiki/CRLF">http://en.wikipedia.org/wiki/CRLF</a>
65      */

66     static final String JavaDoc CRLF = "\r\n";
67     
68     static final String JavaDoc FOLD_PREFIX = CRLF + ' ';
69     
70     public ANVLRecord() {
71         super();
72     }
73
74     public ANVLRecord(Collection JavaDoc<? extends Element> c) {
75         super(c);
76     }
77
78     public ANVLRecord(int initialCapacity) {
79         super(initialCapacity);
80     }
81     
82     public boolean addLabel(final String JavaDoc l) {
83         return super.add(new Element(new Label(l)));
84     }
85
86     public boolean addLabelValue(final String JavaDoc l, final String JavaDoc v) {
87         return super.add(new Element(new Label(l), new Value(v)));
88     }
89     
90     @Override JavaDoc
91     public String JavaDoc toString() {
92         // TODO: What to emit for empty ANVLRecord?
93
StringBuilder JavaDoc sb = new StringBuilder JavaDoc();
94         for (final Iterator JavaDoc<Element> i = iterator(); i.hasNext();) {
95             sb.append(i.next());
96             sb.append(CRLF);
97         }
98         // 'ANVL Records end in a blank line'.
99
sb.append(CRLF);
100         return sb.toString();
101     }
102     
103     public Map JavaDoc<String JavaDoc, String JavaDoc> asMap() {
104         Map JavaDoc<String JavaDoc, String JavaDoc> m = new HashMap JavaDoc<String JavaDoc, String JavaDoc>(size());
105         for (final Iterator JavaDoc<Element> i = iterator(); i.hasNext();) {
106             Element e = i.next();
107             m.put(e.getLabel().toString(),
108                 e.isValue()? e.getValue().toString(): (String JavaDoc)null);
109         }
110         return m;
111     }
112     
113     @Override JavaDoc
114     public ANVLRecord clone() {
115         return new ANVLRecord(this);
116     }
117     
118     /**
119      * @return This ANVLRecord as UTF8 bytes.
120      */

121     public byte [] getUTF8Bytes()
122     throws UnsupportedEncodingException JavaDoc {
123         return toString().getBytes(UTF8);
124     }
125     
126     /**
127      * Parses a single ANVLRecord from passed InputStream.
128      * Read as a single-byte stream until we get to a CRLFCRLF which
129      * signifies End-of-ANVLRecord. Then parse all read as a UTF-8 Stream.
130      * Doing it this way, while requiring a double-scan, it makes it so do not
131      * need to be passed a RepositionableStream or a Stream that supports
132      * marking. Also no danger of over-reading which can happen when we
133      * wrap passed Stream with an InputStreamReader for doing UTF-8
134      * character conversion (See the ISR class comment).
135      * @param is InputStream
136      * @return An ANVLRecord instance.
137      * @throws IOException
138      */

139     public static ANVLRecord load(final InputStream JavaDoc is)
140     throws IOException JavaDoc {
141         // It doesn't look like a CRLF sequence is possible in UTF-8 without
142
// it signifying CRLF: The top bits are set in multibyte characters.
143
// Was thinking of recording CRLF as I was running through this first
144
// parse but the offsets would then be incorrect if any multibyte
145
// characters in the intervening gaps between CRLF.
146
boolean isCRLF = false;
147         boolean recordStart = false;
148         ByteArrayOutputStream JavaDoc baos = new ByteArrayOutputStream JavaDoc(1024);
149         boolean done = false;
150         int read = 0;
151         for (int c = -1, previousCharacter; !done;) {
152             if (read++ >= MAXIMUM_SIZE) {
153                 throw new IOException JavaDoc("Read " + MAXIMUM_SIZE +
154                     " bytes without finding \\r\\n\\r\\n " +
155                     "End-Of-ANVLRecord");
156             }
157             previousCharacter = c;
158             c = is.read();
159             if (c == -1) {
160                 throw new IOException JavaDoc("End-Of-Stream before \\r\\n\\r\\n " +
161                     "End-Of-ANVLRecord:\n" +
162                     new String JavaDoc(baos.toByteArray(), UTF8));
163             }
164             if (isLF((char)c) && isCR((char)previousCharacter)) {
165                 if (isCRLF) {
166                     // If we just had a CRLF, then its two CRLFs and its end of
167
// record. We're done.
168
done = true;
169                 } else {
170                     isCRLF = true;
171                 }
172             } else if (!recordStart && Character.isWhitespace(c)) {
173                 // Skip any whitespace at start of ANVLRecord.
174
continue;
175             } else {
176                 // Clear isCRLF flag if this character is NOT a '\r'.
177
if (isCRLF && !isCR((char)c)) {
178                     isCRLF = false;
179                 }
180                 // Not whitespace so start record if we haven't already.
181
if (!recordStart) {
182                     recordStart = true;
183                 }
184             }
185             baos.write(c);
186         }
187         return load(new String JavaDoc(baos.toByteArray(), UTF8));
188     }
189     
190     /**
191      * Parse passed String for an ANVL Record.
192      * Looked at writing javacc grammer but preprocessing is required to
193      * handle folding: See
194      * https://javacc.dev.java.net/servlets/BrowseList?list=users&by=thread&from=56173.
195      * Looked at Terence Parr's ANTLR. More capable. Can set lookahead count.
196      * A value of 3 would help with folding. But its a pain defining UNICODE
197      * grammers -- needed by ANVL -- and support seems incomplete
198      * anyways: http://www.doc.ic.ac.uk/lab/secondyear/Antlr/lexer.html#unicode.
199      * For now, go with the below hand-rolled parser.
200      * @param s String with an ANVLRecord.
201      * @return ANVLRecord parsed from passed String.
202      * @throws IOException
203      */

204     public static ANVLRecord load(final String JavaDoc s)
205     throws IOException JavaDoc {
206         ANVLRecord record = new ANVLRecord();
207         boolean inValue = false, inLabel = false, inComment = false,
208             inNewLine = false;
209         String JavaDoc label = null;
210         StringBuilder JavaDoc sb = new StringBuilder JavaDoc(s.length());
211         for (int i = 0; i < s.length(); i++) {
212             char c = s.charAt(i);
213            
214             // Assert I can do look-ahead.
215
if ((i + 1) > s.length()) {
216                 throw new IOException JavaDoc("Premature End-of-ANVLRecord:\n" +
217                     s.substring(i));
218             }
219             
220             // If at LF of a CRLF, just go around again. Eat up the LF.
221
if (inNewLine && isLF(c)) {
222                 continue;
223             }
224             
225             // If we're at a CRLF and we were just on one, exit. Found Record.
226
if (inNewLine && isCR(c) && isLF(s.charAt(i + 1))) {
227                 break;
228             }
229             
230             // Check if we're on a fold inside a Value. Skip multiple white
231
// space after CRLF.
232
if (inNewLine && inValue && Character.isWhitespace(c)) {
233                 continue;
234             }
235             
236             // Else set flag if we're at a CRLF.
237
inNewLine = isCR(c) && isLF(s.charAt(i + 1));
238             
239             if (inNewLine) {
240                 if (inComment) {
241                     inComment = false;
242                 } else if (label != null && !inValue) {
243                     // Label only 'data element'.
244
record.addLabel(label);
245                     label = null;
246                     sb.setLength(0);
247                 } else if (inValue) {
248                     // Assert I can do look-ahead past current CRLF.
249
if ((i + 3) > s.length()) {
250                         throw new IOException JavaDoc("Premature End-of-ANVLRecord "
251                             + "(2):\n" + s.substring(i));
252                     }
253                     if (!isCR(s.charAt(i + 2)) && !isLF(s.charAt(i + 3))
254                             && Character.isWhitespace(s.charAt(i + 2))) {
255                         // Its a fold. Let it go around. But add in a CRLF and
256
// space and do it here. We don't let CRLF fall through
257
// to the sb.append on the end of this loop.
258
sb.append(CRLF);
259                         sb.append(' ');
260                     } else {
261                         // Next line is a new SubElement, a new Comment or
262
// Label.
263
record.addLabelValue(label, sb.toString());
264                         sb.setLength(0);
265                         label = null;
266                         inValue = false;
267                     }
268                 } else {
269                     // We're whitespace between label and value or whitespace
270
// before we've figured whether label or comment.
271
}
272                 // Don't let the '\r' or CRLF through.
273
continue;
274             }
275             
276             if (inComment) {
277                 continue;
278             } else if (inLabel) {
279                 if (c == Label.COLON) {
280                     label = sb.toString();
281                     sb.setLength(0);
282                     inLabel = false;
283                     continue;
284                 }
285             } else {
286                 if (!inLabel && !inValue && !inComment) {
287                     // We have no state. Figure one.
288
if (Character.isWhitespace(c)) {
289                         // If no state, and whitespace, skip. Don't record.
290
continue;
291                     } else if (label == null && c == '#') {
292                         inComment = true;
293                         // Don't record comments.
294
continue;
295                     } else if (label == null) {
296                         inLabel = true;
297                     } else {
298                         inValue = true;
299                     }
300                 }
301             }
302             sb.append(c);
303         }
304         return record;
305     }
306     
307     /**
308      * @return Count of ANVLRecord bytes. Be careful, an empty ANVLRecord is
309      * CRLFCRLF so is of size 4. Also, expensive, since it makes String of
310      * the record so it can count bytes.
311      */

312     public synchronized int getLength() {
313         int length = -1;
314         try {
315             length = getUTF8Bytes().length;
316         } catch (UnsupportedEncodingException JavaDoc e) {
317             throw new RuntimeException JavaDoc(e);
318         }
319         return length;
320     }
321     
322     public static boolean isCROrLF(final char c) {
323         return isCR(c) || isLF(c);
324     }
325     
326     public static boolean isCR(final char c) {
327         return c == ANVLRecord.CRLF.charAt(0);
328     }
329     
330     public static boolean isLF(final char c) {
331         return c == ANVLRecord.CRLF.charAt(1);
332     }
333 }
Popular Tags