HTMLPageParser


1   package com.opensymphony.module.sitemesh.parser;
2   
3   import com.opensymphony.module.sitemesh.HTMLPage;
4   import com.opensymphony.module.sitemesh.Page;
5   import com.opensymphony.module.sitemesh.PageParser;
6   import com.opensymphony.module.sitemesh.html.HTMLProcessor;
7   import com.opensymphony.module.sitemesh.html.State;
8   import com.opensymphony.module.sitemesh.html.StateTransitionRule;
9   import com.opensymphony.module.sitemesh.html.tokenizer.TagTokenizer;
10  import com.opensymphony.module.sitemesh.html.util.CharArray;
11  import com.opensymphony.module.sitemesh.parser.rules.BodyTagRule;
12  import com.opensymphony.module.sitemesh.parser.rules.ContentBlockExtractingRule;
13  import com.opensymphony.module.sitemesh.parser.rules.FramesetRule;
14  import com.opensymphony.module.sitemesh.parser.rules.HeadExtractingRule;
15  import com.opensymphony.module.sitemesh.parser.rules.HtmlAttributesRule;
16  import com.opensymphony.module.sitemesh.parser.rules.MetaTagRule;
17  import com.opensymphony.module.sitemesh.parser.rules.ParameterExtractingRule;
18  import com.opensymphony.module.sitemesh.parser.rules.TitleExtractingRule;
19  import com.opensymphony.module.sitemesh.parser.rules.MSOfficeDocumentPropertiesRule;
20  
21  import java.io.IOException  ;
22  
23  /**
24   * <b>WARNING - This is experimental - use at own risk!</b> Builds an HTMLPage object from an HTML document. This behaves
25   * similarly to the FastPageParser, however it's a complete rewrite that is simpler to add custom features to such as
26   * extraction and transformation of elements.
27   *
28   * @see TagTokenizer
29   *
30   * @author Joe Walnes
31   */
32  public class HTMLPageParser implements PageParser {
33  
34      public Page parse(char[] data) throws IOException   {
35          CharArray head = new CharArray(64);
36          CharArray body = new CharArray(4096);
37          HTMLPage page = new TokenizedHTMLPage(data, body, head);
38  
39          HTMLProcessor htmlProcessor = new HTMLProcessor(data, body);
40          State defaultState = htmlProcessor.defaultState();
41          State xmlState = new State();
42  
43          defaultState.addRule(new HtmlAttributesRule(page));
44          defaultState.addRule(new HeadExtractingRule(head));
45          defaultState.addRule(new MetaTagRule(page));
46          defaultState.addRule(new TitleExtractingRule(page));
47          defaultState.addRule(new BodyTagRule(page, body));
48          defaultState.addRule(new ParameterExtractingRule(page));
49          defaultState.addRule(new ContentBlockExtractingRule(page));
50          defaultState.addRule(new FramesetRule(page));
51          defaultState.addRule(new StateTransitionRule("xml", xmlState, true));
52  
53          xmlState.addRule(new MSOfficeDocumentPropertiesRule(page));
54  
55          htmlProcessor.process();
56  
57          return page;
58      }
59  
60  }
61
A to Z: JavaDoc & Examples Daily Java News & Articles Open Source Projects Open Source Codes Free Computer Books Remove Frame
Popular Tags