KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > au > id > jericho > lib > html > OutputDocument


1 // Jericho HTML Parser - Java based library for analysing and manipulating HTML
2
// Version 2.2
3
// Copyright (C) 2006 Martin Jericho
4
// http://sourceforge.net/projects/jerichohtml/
5
//
6
// This library is free software; you can redistribute it and/or
7
// modify it under the terms of the GNU Lesser General Public
8
// License as published by the Free Software Foundation; either
9
// version 2.1 of the License, or (at your option) any later version.
10
// http://www.gnu.org/copyleft/lesser.html
11
//
12
// This library is distributed in the hope that it will be useful,
13
// but WITHOUT ANY WARRANTY; without even the implied warranty of
14
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15
// Lesser General Public License for more details.
16
//
17
// You should have received a copy of the GNU Lesser General Public
18
// License along with this library; if not, write to the Free Software
19
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20

21 package au.id.jericho.lib.html;
22
23 import java.io.*;
24 import java.util.*;
25
26 /**
27  * Represents a modified version of an original {@link Source} document.
28  * <p>
29  * An <code>OutputDocument</code> represents an original source document that
30  * has been modified by substituting segments of it with other text.
31  * Each of these substitutions must be registered in the output document,
32  * which is most commonly done using the various <code>replace</code>, <code>remove</code> or <code>insert</code> methods in this class.
33  * These methods internally {@linkplain #register(OutputSegment) register} one or more {@link OutputSegment} objects to define each substitution.
34  *
35  * After all of the substitutions have been registered, the modified text can be retrieved using the
36  * {@link #writeTo(Writer)} or {@link #toString()} methods.
37  * <p>
38  * The registered {@linkplain OutputSegment output segments} must not overlap each other, but may be adjacent.
39  * Multiple output segments may be added at the same {@linkplain OutputSegment#getBegin() begin} position provided that they are all
40  * zero-length, with the exception of one segment which may {@linkplain OutputSegment#getEnd() end} at a different position.
41  * <p>
42  * For efficiency reasons, violations of the above rules on overlapping segments do not throw an exception when the segment is registered,
43  * but an {@link OverlappingOutputSegmentsException} is thrown when the {@linkplain #writeTo(Writer) output is generated}.
44  * <p>
45  * The following example converts all externally referenced style sheets to internal style sheets:
46  * <p>
47  * <pre>
48  * URL sourceUrl=new URL(sourceUrlString);
49  * String htmlText=Util.getString(new InputStreamReader(sourceUrl.openStream()));
50  * Source source=new Source(htmlText);
51  * OutputDocument outputDocument=new OutputDocument(source);
52  * StringBuffer sb=new StringBuffer();
53  * List linkStartTags=source.findAllStartTags(Tag.LINK);
54  * for (Iterator i=linkStartTags.iterator(); i.hasNext();) {
55  * StartTag startTag=(StartTag)i.next();
56  * Attributes attributes=startTag.getAttributes();
57  * String rel=attributes.getValue("rel");
58  * if (!"stylesheet".equalsIgnoreCase(rel)) continue;
59  * String href=attributes.getValue("href");
60  * if (href==null) continue;
61  * String styleSheetContent;
62  * try {
63  * styleSheetContent=Util.getString(new InputStreamReader(new URL(sourceUrl,href).openStream()));
64  * } catch (Exception ex) {
65  * continue; // don't convert if URL is invalid
66  * }
67  * sb.setLength(0);
68  * sb.append("&lt;style");
69  * Attribute typeAttribute=attributes.get("type");
70  * if (typeAttribute!=null) sb.append(' ').append(typeAttribute);
71  * sb.append("&gt;\n").append(styleSheetContent).append("\n&lt;/style&gt;");
72  * outputDocument.replace(startTag,sb);
73  * }
74  * String convertedHtmlText=outputDocument.toString();
75  * </pre>
76  *
77  * @see OutputSegment
78  * @see StringOutputSegment
79  */

80 public final class OutputDocument implements CharStreamSource {
81     private CharSequence JavaDoc sourceText;
82     private ArrayList outputSegments=new ArrayList();
83
84     /**
85      * Constructs a new output document based on the specified source document.
86      * @param source the source document.
87      */

88     public OutputDocument(final Source source) {
89       if (source==null) throw new IllegalArgumentException JavaDoc("source argument must not be null");
90         this.sourceText=source;
91     }
92
93     OutputDocument(final ParseText parseText) {
94         this.sourceText=parseText;
95     }
96
97     /**
98      * Returns the original source text upon which this output document is based.
99      * @return the original source text upon which this output document is based.
100      */

101     public CharSequence JavaDoc getSourceText() {
102         return sourceText;
103     }
104
105     /**
106      * Removes the specified {@linkplain Segment segment} from this output document.
107      * <p>
108      * This is equivalent to {@link #replace(Segment,CharSequence) replace}<code>(segment,null)</code>.
109      *
110      * @param segment the segment to remove.
111      */

112     public void remove(final Segment segment) {
113         replace(segment,(CharSequence JavaDoc)null);
114     }
115
116     /**
117      * Removes all the segments from this output document represented by the specified source {@linkplain Segment} objects.
118      * <p>
119      * This is equivalent to the following code:<pre>
120      * for (Iterator i=segments.iterator(); i.hasNext();)
121      * {@link #remove(Segment) remove}((Segment)i.next());</pre>
122      *
123      * @param segments a collection of segments to remove, represented by source {@link Segment} objects.
124      */

125     public void remove(final Collection segments) {
126         for (Iterator i=segments.iterator(); i.hasNext();) remove((Segment)i.next());
127     }
128
129     /**
130      * Inserts the specified text at the specified character position in this output document.
131      * @param pos the character position at which to insert the text.
132      * @param text the replacement text.
133      */

134     public void insert(final int pos, final CharSequence JavaDoc text) {
135         register(new StringOutputSegment(pos,pos,text));
136     }
137
138     /**
139      * Replaces the specified {@linkplain Segment segment} in this output document with the specified text.
140      * <p>
141      * Specifying a <code>null</code> argument to the <code>text</code> parameter is exactly equivalent to specifying an empty string,
142      * and results in the segment being completely removed from the output document.
143      *
144      * @param segment the segment to replace.
145      * @param text the replacement text, or <code>null</code> to remove the segment.
146      */

147     public void replace(final Segment segment, final CharSequence JavaDoc text) {
148         replace(segment.getBegin(),segment.getEnd(),text);
149     }
150
151     /**
152      * Replaces the specified segment of this output document with the specified text.
153      * <p>
154      * Specifying a <code>null</code> argument to the <code>text</code> parameter is exactly equivalent to specifying an empty string,
155      * and results in the segment being completely removed from the output document.
156      *
157      * @param begin the character position at which to begin the replacement.
158      * @param end the character position at which to end the replacement.
159      * @param text the replacement text, or <code>null</code> to remove the segment.
160      */

161     public void replace(final int begin, final int end, final CharSequence JavaDoc text) {
162         register(new StringOutputSegment(begin,end,text));
163     }
164
165     /**
166      * Replaces the specified segment of this output document with the specified character.
167      *
168      * @param begin the character position at which to begin the replacement.
169      * @param end the character position at which to end the replacement.
170      * @param ch the replacement character.
171      */

172     public void replace(final int begin, final int end, final char ch) {
173         register(new CharOutputSegment(begin,end,ch));
174     }
175
176     /**
177      * Replaces the specified {@link FormControl} in this output document.
178      * <p>
179      * The effect of this method is to {@linkplain #register(OutputSegment) register} zero or more
180      * {@linkplain OutputSegment output segments} in the output document as required to reflect
181      * previous modifications to the control's state.
182      * The state of a control includes its <a href="FormControl.html#SubmissionValue">submission value</a>,
183      * {@linkplain FormControl#setOutputStyle(FormControlOutputStyle) output style}, and whether it has been
184      * {@linkplain FormControl#setDisabled(boolean) disabled}.
185      * <p>
186      * The state of the form control should not be modified after this method is called, as there is no guarantee that
187      * subsequent changes either will or will not be reflected in the final output.
188      * A second call to this method with the same parameter is not allowed.
189      * It is therefore recommended to call this method as the last action before the output is generated.
190      * <p>
191      * Although the specifics of the number and nature of the output segments added in any particular circumstance
192      * is not defined in the specification, it can generally be assumed that only the minimum changes necessary
193      * are made to the original document. If the state of the control has not been modified, calling this method
194      * has no effect at all.
195      *
196      * @param formControl the form control to replace.
197      * @see #replace(FormFields)
198      */

199     public void replace(final FormControl formControl) {
200         formControl.replaceInOutputDocument(this);
201     }
202
203     /**
204      * {@linkplain #replace(FormControl) Replaces} all the constituent {@linkplain FormControl form controls}
205      * from the specified {@link FormFields} in this output document.
206      * <p>
207      * This is equivalent to the following code:
208      * <pre>for (Iterator i=formFields.{@link FormFields#getFormControls() getFormControls()}.iterator(); i.hasNext();)
209      * {@link #replace(FormControl) replace}((FormControl)i.next());</pre>
210      * <p>
211      * The state of any of the form controls in the specified form fields should not be modified after this method is called,
212      * as there is no guarantee that subsequent changes either will or will not be reflected in the final output.
213      * A second call to this method with the same parameter is not allowed.
214      * It is therefore recommended to call this method as the last action before the output is generated.
215      *
216      * @param formFields the form fields to replace.
217      * @see #replace(FormControl)
218      */

219     public void replace(final FormFields formFields) {
220         formFields.replaceInOutputDocument(this);
221     }
222
223     /**
224      * Replaces the specified {@link Attributes} segment in this output document with the name/value entries
225      * in the returned <code>Map</code>.
226      * The returned map initially contains entries representing the attributes from the source document,
227      * which can be modified before output.
228      * <p>
229      * The documentation of the {@link #replace(Attributes,Map)} method contains more information about the requirements
230      * of the map entries.
231      * <p>
232      * Specifying a value of <code>true</code> as an argument to the <code>convertNamesToLowerCase</code> parameter
233      * causes all original attribute names to be converted to lower case in the map.
234      * This simplifies the process of finding/updating specific attributes since map keys are case sensitive.
235      * <p>
236      * Attribute values are automatically {@linkplain CharacterReference#decode(CharSequence) decoded} before
237      * being loaded into the map.
238      * <p>
239      * This method is logically equivalent to:<br />
240      * {@link #replace(Attributes,Map) replace}<code>(attributes, attributes.</code>{@link Attributes#populateMap(Map,boolean) populateMap(new LinkedHashMap(),convertNamesToLowerCase)}<code>)</code>
241      * <p>
242      * The use of <code>LinkedHashMap</code> to implement the map ensures (probably unnecessarily) that
243      * existing attributes are output in the same order as they appear in the source document, and new
244      * attributes are output in the same order as they are added.
245      * <p>
246      * <dl>
247      * <dt>Example:</dt>
248      * <dd><pre>
249      * Source source=new Source(htmlDocument);
250      * Attributes bodyAttributes
251      * =source.findNextStartTag(0,Tag.BODY).getAttributes();
252      * OutputDocument outputDocument=new OutputDocument(source);
253      * Map attributesMap=outputDocument.replace(bodyAttributes,true);
254      * attributesMap.put("bgcolor","green");
255      * String htmlDocumentWithGreenBackground=outputDocument.toString();</pre></dl>
256      *
257      * @param attributes the <code>Attributes</code> segment defining the span of the segment and initial name/value entries of the returned map.
258      * @param convertNamesToLowerCase specifies whether all attribute names are converted to lower case in the map.
259      * @return a <code>Map</code> containing the name/value entries to be output.
260      * @see #replace(Attributes,Map)
261      */

262     public Map replace(final Attributes attributes, boolean convertNamesToLowerCase) {
263         AttributesOutputSegment attributesOutputSegment=new AttributesOutputSegment(attributes,convertNamesToLowerCase);
264         register(attributesOutputSegment);
265         return attributesOutputSegment.getMap();
266     }
267
268     /**
269      * Replaces the specified attributes segment in this source document with the name/value entries in the specified <code>Map</code>.
270      * <p>
271      * This method might be used if the <code>Map</code> containing the new attribute values
272      * should not be preloaded with the same entries as the source attributes, or a map implementation
273      * other than <code>LinkedHashMap</code> is required.
274      * Otherwise, the {@link #replace(Attributes, boolean convertNamesToLowerCase)} method is generally more useful.
275      * <p>
276      * Keys in the map must be <code>String</code> objects, and values must implement the <code>CharSequence</code> interface.
277      * <p>
278      * An attribute with no value is represented by a map entry with a <code>null</code> value.
279      * <p>
280      * Attribute values are stored unencoded in the map, and are automatically
281      * {@linkplain CharacterReference#encode(CharSequence) encoded} if necessary during output.
282      * <p>
283      * The use of invalid characters in attribute names results in unspecified behaviour.
284      * <p>
285      * Note that methods in the <code>Attributes</code> class treat attribute names as case insensitive,
286      * whereas the <code>Map</code> treats them as case sensitive.
287      *
288      * @param attributes the <code>Attributes</code> object defining the span of the segment to replace.
289      * @param map the <code>Map</code> containing the name/value entries.
290      * @see #replace(Attributes, boolean convertNamesToLowerCase)
291      */

292     public void replace(final Attributes attributes, final Map map) {
293         register(new AttributesOutputSegment(attributes,map));
294     }
295
296     /**
297      * Replaces the specified segment of this output document with a string of spaces of the same length.
298      * <p>
299      * This method is used internally to implement the functionality available through the
300      * {@link Segment#ignoreWhenParsing()} method.
301      * It is included in the public API in the unlikely event it has other practical uses
302      * for the developer.
303      * To remove a segment from the output document completely, use the {@link #remove(Segment)} method instead.
304      *
305      * @param begin the character position at which to begin the replacement.
306      * @param end the character position at which to end the replacement.
307      */

308     public void replaceWithSpaces(final int begin, final int end) {
309         register(new BlankOutputSegment(begin,end));
310     }
311
312     /**
313      * Registers the specified {@linkplain OutputSegment output segment} in this output document.
314      * <p>
315      * Use this method if you want to use a customised {@link OutputSegment} class.
316      *
317      * @param outputSegment the output segment to register.
318      */

319     public void register(final OutputSegment outputSegment) {
320         outputSegments.add(outputSegment);
321     }
322
323     /**
324      * Writes the final content of this output document to the specified <code>Writer</code>.
325      * <p>
326      * An {@link OverlappingOutputSegmentsException} is thrown if any of the output segments overlap.
327      * For efficiency reasons this condition is not caught when the offending output segment is {@linkplain #register(OutputSegment) registered}.
328      * <p>
329      * If the output is required in the form of a <code>Reader</code>, use {@link CharStreamSourceUtil#getReader(CharStreamSource) CharStreamSourceUtil.getReader(this)} instead.
330      *
331      * @param writer the destination <code>java.io.Writer</code> for the output.
332      * @throws IOException if an I/O exception occurs.
333      * @throws OverlappingOutputSegmentsException if any of the output segments overlap.
334      * @see #toString()
335      */

336     public void writeTo(final Writer writer) throws IOException {
337         if (outputSegments.isEmpty()) {
338             Util.appendTo(writer,sourceText);
339             return;
340         }
341         int pos=0;
342         Collections.sort(outputSegments,OutputSegment.COMPARATOR);
343         OutputSegment lastOutputSegment=null;
344         for (final Iterator i=outputSegments.iterator(); i.hasNext();) {
345             final OutputSegment outputSegment=(OutputSegment)i.next();
346             if (outputSegment==lastOutputSegment) continue; // silently ignore duplicate output segment
347
if (outputSegment.getBegin()<pos) throw new OverlappingOutputSegmentsException(lastOutputSegment,outputSegment);
348             if (outputSegment.getBegin()>pos) Util.appendTo(writer,sourceText,pos,outputSegment.getBegin());
349             outputSegment.writeTo(writer);
350             lastOutputSegment=outputSegment;
351             pos=outputSegment.getEnd();
352         }
353         if (pos<sourceText.length()) Util.appendTo(writer,sourceText,pos,sourceText.length());
354         writer.close();
355     }
356
357     public long getEstimatedMaximumOutputLength() {
358         long estimatedMaximumOutputLength=sourceText.length();
359         for (final Iterator i=outputSegments.iterator(); i.hasNext();) {
360             final OutputSegment outputSegment=(OutputSegment)i.next();
361             final int outputSegmentOriginalLength=outputSegment.getEnd()-outputSegment.getBegin();
362             estimatedMaximumOutputLength+=(outputSegment.getEstimatedMaximumOutputLength()-outputSegmentOriginalLength);
363         }
364         return estimatedMaximumOutputLength;
365     }
366
367     /**
368      * Returns the final content of this output document as a <code>String</code>.
369      * @return the final content of this output document as a <code>String</code>.
370      * @throws OverlappingOutputSegmentsException if any of the output segments overlap.
371      * @see #writeTo(Writer)
372      */

373     public String JavaDoc toString() {
374         return CharStreamSourceUtil.toString(this);
375     }
376
377     /**
378      * Constructs a new output document based on the specified source text.
379      * <p>
380      * This constructor has been deprecated as of version 2.2 in favour of the {@link #OutputDocument(Source)} method
381      * as most of the methods in this class assume that the argument supplied to this constructor is the entire source document.
382      *
383      * @param sourceText the source text.
384      * @deprecated Use the {@link #OutputDocument(Source)} constructor instead.
385      */

386     public OutputDocument(final CharSequence JavaDoc sourceText) {
387       if (sourceText==null) throw new IllegalArgumentException JavaDoc("sourceText argument must not be null");
388         this.sourceText=sourceText;
389     }
390
391     /**
392      * Registers the specified {@linkplain OutputSegment output segment} in this output document.
393      * <p>
394      * This method has been deprecated as of version 2.2 in favour of the identical {@link #register(OutputSegment)} method
395      * in an effort to make this class and its methods more intuitive.
396      *
397      * @param outputSegment the output segment to register.
398      * @deprecated Use the {@link #register(OutputSegment)} method instead.
399      */

400     public void add(final OutputSegment outputSegment) {
401         register(outputSegment);
402     }
403
404     /**
405      * Replaces the specified {@link FormControl} in this output document.
406      * <p>
407      * This method has been deprecated as of version 2.2 in favour of the identical {@link #replace(FormControl)} method
408      * in an effort to make this class and its methods more intuitive.
409      *
410      * @param formControl the form control to replace.
411      * @deprecated Use the {@link #replace(FormControl)} method instead.
412      */

413     public void add(final FormControl formControl) {
414         replace(formControl);
415     }
416
417     /**
418      * {@linkplain #replace(FormControl) Replaces} all the constituent {@linkplain FormControl form controls}
419      * from the specified {@link FormFields} in this output document.
420      * <p>
421      * This method has been deprecated as of version 2.2 in favour of the identical {@link #replace(FormFields)} method
422      * in an effort to make this class and its methods more intuitive.
423      *
424      * @param formFields the form fields to replace.
425      * @deprecated Use the {@link #replace(FormFields)} method instead.
426      */

427     public void add(final FormFields formFields) {
428         formFields.replaceInOutputDocument(this);
429     }
430
431     /**
432      * Outputs the final content of this output document to the specified <code>Writer</code>.
433      * <p>
434      * This method has been deprecated as of version 2.2 in favour of the identical {@link #writeTo(Writer)} method in order for this class to implement {@link CharStreamSource}.
435      *
436      * @param writer the destination <code>java.io.Writer</code> for the output.
437      * @throws IOException if an I/O exception occurs.
438      * @throws OverlappingOutputSegmentsException if any of the output segments overlap.
439      * @deprecated Use the {@link #writeTo(Writer)} method instead.
440      */

441     public void output(final Writer writer) throws IOException {
442         writeTo(writer);
443     }
444     
445     /**
446      * Returns a <code>Reader</code> that reads the final content of this output document.
447      * <p>
448      * This method has been deprecated as of version 2.2 in favour of calling the {@link CharStreamSourceUtil#getReader(CharStreamSource)} method,
449      * passing this object as the argument.
450      *
451      * @return a <code>Reader</code> that reads the final content of this output document.
452      * @throws OverlappingOutputSegmentsException if any of the output segments overlap.
453      * @deprecated Use {@link CharStreamSourceUtil#getReader(CharStreamSource) CharStreamSourceUtil.getReader(this)} instead.
454      */

455     public Reader getReader() {
456         return CharStreamSourceUtil.getReader(this);
457     }
458 }
459
Popular Tags