package org.alfresco.repo.content.metadata;

import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.Serializable;
import java.nio.charset.Charset;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import javax.swing.text.ChangedCharSetException;
import javax.swing.text.MutableAttributeSet;
import javax.swing.text.html.HTML;
import javax.swing.text.html.HTMLEditorKit;
import javax.swing.text.html.parser.ParserDelegator;

import org.alfresco.model.ContentModel;
import org.alfresco.repo.content.MimetypeMap;
import org.alfresco.service.cmr.repository.ContentReader;
import org.alfresco.service.namespace.QName;

/**
 * Metadata extracter for HTML and XHTML content.
 * <p>
 * The document is parsed with the Swing HTML parser and the following
 * properties are collected:
 * <ul>
 *   <li>{@link ContentModel#PROP_TITLE} from a <code>&lt;title&gt;</code>
 *       element found inside <code>&lt;head&gt;</code>;</li>
 *   <li>{@link ContentModel#PROP_AUTHOR} from a <code>&lt;meta&gt;</code> tag
 *       named <code>creator</code>, <code>author</code> or
 *       <code>dc.creator</code>;</li>
 *   <li>{@link ContentModel#PROP_DESCRIPTION} from a <code>&lt;meta&gt;</code>
 *       tag named <code>description</code> or <code>dc.description</code>.</li>
 * </ul>
 * Meta tag names are compared case-insensitively.
 */
public class HtmlMetadataExtracter extends AbstractMetadataExtracter
{
    /** Charset used for the first parse attempt, before the document declares one. */
    private static final String DEFAULT_CHARSET = "UTF-8";

    /** Maximum number of parse attempts (the first one plus charset-change retries). */
    private static final int MAX_TRIES = 3;

    /**
     * Picks the charset token out of a content-type style specification such as
     * <code>text/html; charset=ISO-8859-1</code>.  Optional quotes around the
     * value and any trailing parameters are excluded from the captured group.
     */
    private static final Pattern CHARSET_PATTERN =
            Pattern.compile("charset\\s*=\\s*[\"']?([^\\s;\"']+)", Pattern.CASE_INSENSITIVE);

    /** Mimetypes handled by this extracter. */
    private static final Set<String> MIMETYPES = new HashSet<String>(5);
    static
    {
        MIMETYPES.add(MimetypeMap.MIMETYPE_HTML);
        MIMETYPES.add(MimetypeMap.MIMETYPE_XHTML);
    }

    /**
     * Registers this extracter for the HTML and XHTML mimetypes.
     */
    public HtmlMetadataExtracter()
    {
        // NOTE(review): 1.0 and 1000 are passed straight to the superclass; their meaning
        // is defined by AbstractMetadataExtracter, which is not visible here - confirm there.
        super(MIMETYPES, 1.0, 1000);
    }

    /**
     * Parses the HTML content and copies the extracted properties into
     * <code>destination</code>.
     * <p>
     * The content is first decoded as UTF-8.  If the parser reports, through a
     * {@link ChangedCharSetException}, that the document declares a different
     * charset, the content is re-read using the declared charset (when the JVM
     * supports it) with charset declarations ignored on the retry.  At most
     * three attempts are made.  Properties are written to
     * <code>destination</code> only when an attempt runs to completion, so a
     * partially parsed document never leaks values into it.
     *
     * @param reader      the reader providing the HTML content
     * @param destination the map that receives the extracted properties
     * @throws Throwable  on any failure to read or parse the content
     */
    public void extractInternal(ContentReader reader, Map<QName, Serializable> destination) throws Throwable
    {
        final Map<QName, Serializable> tempDestination = new HashMap<QName, Serializable>();

        String charsetGuess = DEFAULT_CHARSET;
        int tries = 0;
        while (tries < MAX_TRIES)
        {
            // Drop anything collected by an aborted attempt.
            tempDestination.clear();
            Reader r = null;
            InputStream cis = null;
            try
            {
                cis = reader.getContentInputStream();
                // Decode with the current guess rather than the platform default charset.
                r = new InputStreamReader(cis, charsetGuess);
                HTMLEditorKit.Parser parser = new ParserDelegator();
                // A fresh callback per attempt so no title/head state survives an aborted parse.
                // After the first attempt, charset declarations are ignored to avoid looping.
                parser.parse(r, createCallback(tempDestination), tries > 0);
                destination.putAll(tempDestination);
                break;
            }
            catch (ChangedCharSetException ccse)
            {
                tries++;
                charsetGuess = resolveCharset(ccse.getCharSetSpec(), charsetGuess);
                // The content stream has been consumed; obtain a new reader for the retry.
                reader = reader.getReader();
            }
            finally
            {
                try
                {
                    if (r != null)
                    {
                        r.close();
                    }
                }
                finally
                {
                    // Close the raw stream even if closing the reader failed.
                    if (cis != null)
                    {
                        cis.close();
                    }
                }
            }
        }
    }

    /**
     * Derives a usable charset name from the specification carried by a
     * {@link ChangedCharSetException}.
     *
     * @param spec     the raw specification: either a content-type value containing
     *                 <code>charset=...</code> or a bare charset name; may be <code>null</code>
     * @param fallback the charset name to return when <code>spec</code> yields
     *                 nothing the JVM supports
     * @return a charset name supported by the JVM, or <code>fallback</code>
     */
    private static String resolveCharset(String spec, String fallback)
    {
        if (spec == null)
        {
            return fallback;
        }
        Matcher matcher = CHARSET_PATTERN.matcher(spec);
        String candidate = matcher.find() ? matcher.group(1) : spec.trim();
        try
        {
            if (candidate.length() > 0 && Charset.isSupported(candidate))
            {
                return candidate;
            }
        }
        catch (IllegalArgumentException e)
        {
            // Syntactically illegal charset name declared by the document: keep the fallback.
        }
        return fallback;
    }

    /**
     * Creates a parser callback that records the title, author and description
     * it encounters into <code>target</code>.
     *
     * @param target the map that receives the extracted values
     * @return a new callback with no parse state
     */
    private HTMLEditorKit.ParserCallback createCallback(final Map<QName, Serializable> target)
    {
        return new HTMLEditorKit.ParserCallback()
        {
            /** Accumulates title text; non-null only between the title start and end tags. */
            private StringBuilder title = null;
            /** True while the parser is between the head start and end tags. */
            private boolean inHead = false;

            @Override
            public void handleText(char[] data, int pos)
            {
                if (title != null)
                {
                    title.append(data);
                }
            }

            @Override
            public void handleComment(char[] data, int pos)
            {
                // Comments carry no metadata of interest.
            }

            @Override
            public void handleStartTag(HTML.Tag t, MutableAttributeSet a, int pos)
            {
                if (HTML.Tag.HEAD.equals(t))
                {
                    inHead = true;
                }
                else if (HTML.Tag.TITLE.equals(t) && inHead)
                {
                    title = new StringBuilder();
                }
                else
                {
                    // Treat any other start tag like a simple tag so that a
                    // <meta> reported as a start tag is still examined.
                    handleSimpleTag(t, a, pos);
                }
            }

            @Override
            public void handleEndTag(HTML.Tag t, int pos)
            {
                if (HTML.Tag.HEAD.equals(t))
                {
                    inHead = false;
                }
                else if (HTML.Tag.TITLE.equals(t) && title != null)
                {
                    trimPut(ContentModel.PROP_TITLE, title.toString(), target);
                    title = null;
                }
            }

            @Override
            public void handleSimpleTag(HTML.Tag t, MutableAttributeSet a, int pos)
            {
                if (!HTML.Tag.META.equals(t))
                {
                    return;
                }
                Object nameO = a.getAttribute(HTML.Attribute.NAME);
                Object valueO = a.getAttribute(HTML.Attribute.CONTENT);
                if (nameO == null || valueO == null)
                {
                    return;
                }

                String name = nameO.toString();

                if (name.equalsIgnoreCase("creator") || name.equalsIgnoreCase("author")
                        || name.equalsIgnoreCase("dc.creator"))
                {
                    trimPut(ContentModel.PROP_AUTHOR, valueO, target);
                }
                if (name.equalsIgnoreCase("description") || name.equalsIgnoreCase("dc.description"))
                {
                    trimPut(ContentModel.PROP_DESCRIPTION, valueO, target);
                }
            }

            @Override
            public void handleError(String errorMsg, int pos)
            {
                // Parse errors are deliberately ignored: real-world HTML is
                // frequently malformed and extraction is best-effort.
            }
        };
    }
}