1 package com.opensymphony.module.sitemesh.parser;2 3 import com.opensymphony.module.sitemesh.HTMLPage;4 import com.opensymphony.module.sitemesh.Page;5 import com.opensymphony.module.sitemesh.PageParser;6 import com.opensymphony.module.sitemesh.html.HTMLProcessor;7 import com.opensymphony.module.sitemesh.html.State;8 import com.opensymphony.module.sitemesh.html.StateTransitionRule;9 import com.opensymphony.module.sitemesh.html.tokenizer.TagTokenizer;10 import com.opensymphony.module.sitemesh.html.util.CharArray;11 import com.opensymphony.module.sitemesh.parser.rules.BodyTagRule;12 import com.opensymphony.module.sitemesh.parser.rules.ContentBlockExtractingRule;13 import com.opensymphony.module.sitemesh.parser.rules.FramesetRule;14 import com.opensymphony.module.sitemesh.parser.rules.HeadExtractingRule;15 import com.opensymphony.module.sitemesh.parser.rules.HtmlAttributesRule;16 import com.opensymphony.module.sitemesh.parser.rules.MetaTagRule;17 import com.opensymphony.module.sitemesh.parser.rules.ParameterExtractingRule;18 import com.opensymphony.module.sitemesh.parser.rules.TitleExtractingRule;19 import com.opensymphony.module.sitemesh.parser.rules.MSOfficeDocumentPropertiesRule;20 21 import java.io.IOException ;22 23 /**24 * <b>WARNING - This is experimental - use at own risk!</b> Builds an HTMLPage object from an HTML document. This behaves25 * similarly to the FastPageParser, however it's a complete rewrite that is simpler to add custom features to such as26 * extraction and transformation of elements.27 *28 * @see TagTokenizer29 *30 * @author Joe Walnes31 */32 public class HTMLPageParser implements PageParser {33 34 public Page parse(char[] data) throws IOException {35 CharArray head = new CharArray(64);36 CharArray body = new CharArray(4096);37 HTMLPage page = new TokenizedHTMLPage(data, body, head);38 39 HTMLProcessor htmlProcessor = new HTMLProcessor(data, body);40 State defaultState = htmlProcessor.defaultState();41 State xmlState = new State();42 43 defaultState.addRule(new HtmlAttributesRule(page));44 defaultState.addRule(new HeadExtractingRule(head));45 defaultState.addRule(new MetaTagRule(page));46 defaultState.addRule(new TitleExtractingRule(page));47 defaultState.addRule(new BodyTagRule(page, body));48 defaultState.addRule(new ParameterExtractingRule(page));49 defaultState.addRule(new ContentBlockExtractingRule(page));50 defaultState.addRule(new FramesetRule(page));51 defaultState.addRule(new StateTransitionRule("xml", xmlState, true));52 53 xmlState.addRule(new MSOfficeDocumentPropertiesRule(page));54 55 htmlProcessor.process();56 57 return page;58 }59 60 }61