package net.nutch.parse.html;

import junit.framework.TestCase;

import net.nutch.parse.html.RobotsMetaProcessor.*;

import java.io.ByteArrayInputStream;
import java.net.URL;

import org.cyberneko.html.parsers.*;
import org.xml.sax.*;
import org.w3c.dom.*;
import org.apache.html.dom.*;

/**
 * Unit test for {@link RobotsMetaProcessor}.
 *
 * <p>Each HTML snippet in {@link #tests} is parsed into a DOM fragment with
 * the CyberNeko <code>DOMFragmentParser</code>, handed to
 * <code>RobotsMetaProcessor.getRobotsMetaDirectives</code>, and the resulting
 * <code>RobotsMetaIndicator</code> is compared against the matching row of
 * {@link #answers} and against the expected base href.</p>
 */
public class TestRobotsMetaProcessor extends TestCase {

  public TestRobotsMetaProcessor(String name) {
    super(name);
  }

  /**
   * HTML documents under test. Entry <code>i</code> is paired with
   * <code>answers[i]</code> and with row <code>i</code> of the URL table
   * built in {@link #testRobotsMetaProcessor()}.
   */
  public static String[] tests =
  {
    "<html><head><title>test page</title>"
    + "<META NAME=\"ROBOTS\" CONTENT=\"NONE\"> "
    + "<META HTTP-EQUIV=\"PRAGMA\" CONTENT=\"NO-CACHE\"> "
    + "</head><body>"
    + " some text"
    + "</body></html>",

    "<html><head><title>test page</title>"
    + "<meta name=\"robots\" content=\"all\"> "
    + "<meta http-equiv=\"pragma\" content=\"no-cache\"> "
    + "</head><body>"
    + " some text"
    + "</body></html>",

    "<html><head><title>test page</title>"
    + "<MeTa NaMe=\"RoBoTs\" CoNtEnT=\"nOnE\"> "
    + "<MeTa HtTp-EqUiV=\"pRaGmA\" cOnTeNt=\"No-CaChE\"> "
    + "</head><body>"
    + " some text"
    + "</body></html>",

    "<html><head><title>test page</title>"
    + "<meta name=\"robots\" content=\"none\"> "
    + "</head><body>"
    + " some text"
    + "</body></html>",

    "<html><head><title>test page</title>"
    + "<meta name=\"robots\" content=\"noindex,nofollow\"> "
    + "</head><body>"
    + " some text"
    + "</body></html>",

    "<html><head><title>test page</title>"
    + "<meta name=\"robots\" content=\"noindex,follow\"> "
    + "</head><body>"
    + " some text"
    + "</body></html>",

    "<html><head><title>test page</title>"
    + "<meta name=\"robots\" content=\"index,nofollow\"> "
    + "</head><body>"
    + " some text"
    + "</body></html>",

    "<html><head><title>test page</title>"
    + "<meta name=\"robots\" content=\"index,follow\"> "
    + "<base HREF=\"http://www.nutch.org/\">"
    + "</head><body>"
    + " some text"
    + "</body></html>",

    "<html><head><title>test page</title>"
    + "<meta name=\"robots\"> "
    + "<base HREF=\"http://www.nutch.org/base/\">"
    + "</head><body>"
    + " some text"
    + "</body></html>",

  };

  /**
   * Expected flags for each entry of {@link #tests}, in the order
   * <code>{noIndex, noFollow, noCache}</code>.
   */
  public static final boolean[][] answers = {
    {true, true, true},     // robots NONE, pragma NO-CACHE
    {false, false, true},   // robots all, pragma no-cache
    {true, true, true},     // robots nOnE, pragma No-CaChE (mixed case)
    {true, true, false},    // robots none
    {true, true, false},    // robots noindex,nofollow
    {true, false, false},   // robots noindex,follow
    {false, true, false},   // robots index,nofollow
    {false, false, false},  // robots index,follow (with base href)
    {false, false, false},  // robots meta without content (with base href)
  };

  /**
   * Per test: <code>[i][0]</code> is the page URL passed to the processor,
   * <code>[i][1]</code> is the expected base href (<code>null</code> when
   * none is expected).
   */
  private URL[][] currURLsAndAnswers;

  /**
   * Runs every snippet in {@link #tests} through the robots meta processor
   * and checks the noindex / nofollow / nocache flags and the base href.
   */
  public void testRobotsMetaProcessor() {
    DOMFragmentParser parser = new DOMFragmentParser();

    try {
      currURLsAndAnswers = new URL[][] {
        {new URL("http://www.nutch.org"), null},
        {new URL("http://www.nutch.org"), null},
        {new URL("http://www.nutch.org"), null},
        {new URL("http://www.nutch.org"), null},
        {new URL("http://www.nutch.org"), null},
        {new URL("http://www.nutch.org"), null},
        {new URL("http://www.nutch.org"), null},
        {new URL("http://www.nutch.org/foo/"),
         new URL("http://www.nutch.org/")},
        {new URL("http://www.nutch.org"),
         new URL("http://www.nutch.org/base/")}
      };
    } catch (java.net.MalformedURLException e) {
      fail("couldn't make test URLs! (" + e + ")");
    }

    // The three tables are indexed in parallel; catch a size mismatch up
    // front instead of failing with an ArrayIndexOutOfBoundsException.
    assertEquals("answers table size does not match tests",
                 tests.length, answers.length);
    assertEquals("URL table size does not match tests",
                 tests.length, currURLsAndAnswers.length);

    for (int i = 0; i < tests.length; i++) {
      byte[] bytes = tests[i].getBytes();

      DocumentFragment node = new HTMLDocumentImpl().createDocumentFragment();

      try {
        parser.parse(new InputSource(new ByteArrayInputStream(bytes)), node);
      } catch (Exception e) {
        // Do not continue with an empty fragment: that would surface later
        // as a misleading "got ... wrong" failure.
        fail("couldn't parse test " + i + ": " + e);
      }

      RobotsMetaIndicator robotsMeta = new RobotsMetaIndicator();
      RobotsMetaProcessor.getRobotsMetaDirectives(robotsMeta, node,
                                                  currURLsAndAnswers[i][0]);

      assertEquals("got index wrong on test " + i,
                   answers[i][0], robotsMeta.getNoIndex());
      assertEquals("got follow wrong on test " + i,
                   answers[i][1], robotsMeta.getNoFollow());
      assertEquals("got cache wrong on test " + i,
                   answers[i][2], robotsMeta.getNoCache());
      // assertEquals(Object, Object) accepts two nulls and otherwise uses
      // equals(), which is the comparison the test needs.
      assertEquals("got base href wrong on test " + i,
                   currURLsAndAnswers[i][1], robotsMeta.getBaseHref());
    }
  }

}