1 package net.javacoding.jspider.core.model; 2 3 4 import net.javacoding.jspider.api.model.*; 5 import net.javacoding.jspider.core.storage.exception.InvalidStateForActionException; 6 import net.javacoding.jspider.core.storage.exception.InvalidStateTransitionException; 7 import net.javacoding.jspider.core.storage.spi.StorageSPI; 8 import net.javacoding.jspider.core.logging.LogFactory; 9 import net.javacoding.jspider.core.util.URLUtil; 10 11 import java.io.InputStream ; 12 import java.net.URL ; 13 import java.util.*; 14 15 16 22 public class ResourceInternal implements ParsedResource, ParseErrorResource, ParseIgnoredResource, ForbiddenResource, FetchIgnoredResource, FetchErrorResource { 23 24 protected StorageSPI storage; 25 26 protected int site; 27 28 protected URL url; 29 protected Date discoveryTime; 30 protected FolderInternal folder; 31 protected int state; 32 protected int id; 33 34 protected int httpStatus; 35 protected int size; 36 protected int timeMs; 37 protected String mimeType; 38 protected Date fetchTime; 39 protected HTTPHeader[] headers; 40 41 protected Decision spiderDecision; 42 protected Decision parseDecision; 43 44 45 public ResourceInternal(StorageSPI storage, int id, int siteId, URL url, Date discoveryTime, FolderInternal folder) { 46 this.site = siteId; 47 this.storage = storage; 48 this.id = id; 49 this.url = url; 50 this.discoveryTime = discoveryTime; 51 this.folder = folder; 52 53 this.state = Resource.STATE_DISCOVERED; 54 } 55 56 public ResourceInternal(StorageSPI storage, int id, Site site, URL url, Date discoveryTime, FolderInternal folder) { 57 this(storage, id, ((SiteInternal) site).getId(), url, discoveryTime, folder); 58 } 59 60 public ResourceInternal(StorageSPI storage, Site site, URL url, Date discoveryTime, FolderInternal folder) { 61 this(storage, 0, site, url, discoveryTime, folder); 62 } 63 64 public void setFetched(int httpStatus, int size, int timeMs, String mimeType, Date fetchTime, HTTPHeader[] headers) { 65 if (state != Resource.STATE_DISCOVERED) { 66 LogFactory.getLog(Resource.class).error("error in state transition for resource " + url + ":\n" + this); 67 throw new InvalidStateTransitionException("cannot set resource fetched - it's not in the discovered state - was " + state); 68 } 69 this.httpStatus = httpStatus; 70 this.size = size; 71 this.timeMs = timeMs; 72 this.mimeType = mimeType; 73 this.fetchTime = fetchTime; 74 this.headers = headers; 75 state = Resource.STATE_FETCHED; 76 } 77 78 public void setFetchError(int httpStatus, HTTPHeader[] headers) { 79 if (state != Resource.STATE_DISCOVERED && state != Resource.STATE_FETCH_ERROR) { 80 LogFactory.getLog(Resource.class).error("error in state transition for resource " + url + ":\n" + this); 81 throw new InvalidStateTransitionException("cannot set resource fetch error - it's not in the discovered state - was" + state); 82 } 83 this.httpStatus = httpStatus; 84 this.headers = headers; 85 state = Resource.STATE_FETCH_ERROR; 86 } 87 88 public void setParseError() { 89 if (state != Resource.STATE_FETCHED && state != Resource.STATE_PARSE_ERROR) { 90 LogFactory.getLog(Resource.class).error("error in state transition for resource " + url + ":\n" + this); 91 throw new InvalidStateTransitionException("cannot set resource parse error - it's not in the fetched state - was " + state); 92 } 93 state = Resource.STATE_PARSE_ERROR; 94 } 95 96 public void setParsed() { 97 if (state != Resource.STATE_FETCHED && state != Resource.STATE_PARSED) { 98 LogFactory.getLog(Resource.class).error("error in state transition for resource " + url + ":\n" + this); 99 throw new InvalidStateTransitionException("cannot set resource parsed - it's not in the fetched state - was " + state); 100 } 101 state = Resource.STATE_PARSED; 102 } 103 104 public void setFetchIgnored() { 105 if (state != Resource.STATE_DISCOVERED && state != Resource.STATE_FETCH_IGNORED) { 106 LogFactory.getLog(Resource.class).error("error in state transition for resource " + url + ":\n" + this); 107 throw new InvalidStateTransitionException("cannot set resource fetch_ignored - it's not in the discovered state - was " + state); 108 } 109 state = Resource.STATE_FETCH_IGNORED; 110 } 111 112 public void setParseIgnored() { 113 if (state != Resource.STATE_FETCHED && state != Resource.STATE_PARSE_IGNORED) { 114 LogFactory.getLog(Resource.class).error("error in state transition for resource " + url + ":\n" + this); 115 throw new InvalidStateTransitionException("cannot set resource parse_ignored - it's not in the fetched state - was " + state); 116 } 117 state = Resource.STATE_PARSE_IGNORED; 118 } 119 120 public void setForbidden() { 121 if (state != Resource.STATE_DISCOVERED && state != Resource.STATE_FETCH_FORBIDDEN) { 122 LogFactory.getLog(Resource.class).error("error in state transition for resource " + url + ":\n" + this); 123 throw new InvalidStateTransitionException("cannot set resource forbidden - it's not in the discovered state - was " + state); 124 } 125 state = Resource.STATE_FETCH_FORBIDDEN; 126 } 127 128 public int getId() { 129 return id; 130 } 131 132 public void setInt(int id) { 133 this.id = id; 134 } 135 136 public int getState() { 137 return state; 138 } 139 140 public String getFileName() { 141 return URLUtil.getFileName(url); 142 } 143 144 public URL getURL() { 145 return url; 146 } 147 148 public Site getSite() { 149 return folder.getSite(); 150 } 151 152 public Folder getFolder() { 153 return folder; 154 } 155 156 public String getName() { 157 return url.getFile(); 158 } 159 160 public Date getDiscoveryTime() { 161 return discoveryTime; 162 } 163 164 public Resource[] getReferers() { 165 return storage.getResourceDAO().getRefereringResources(this); 166 } 167 168 public Resource[] getReferencedResources() { 169 if (state != Resource.STATE_PARSED) { 170 throw new InvalidStateForActionException("cannot get referenced resources if not parsed"); 171 } 172 return storage.getResourceDAO().getReferencedResources(this); 173 } 174 175 public int getHttpStatus() { 176 if (state == Resource.STATE_DISCOVERED) { 177 throw new InvalidStateForActionException("cannot get http status for a resource that's not fetched"); 178 } 179 return httpStatus; 180 } 181 182 public int getHttpStatusInternal() { 183 return httpStatus; 184 } 185 186 public void setHttpStatus(int status) { 187 this.httpStatus = status; 188 } 189 190 public HTTPHeader[] getHeaders() { 191 return headers; 192 } 193 194 public int getTimeMs() { 195 if (state < Resource.STATE_FETCHED) { 196 throw new InvalidStateForActionException("cannot get timing for non-fetched resource"); 197 } 198 return timeMs; 199 } 200 201 public int getTimeMsInternal() { 202 return timeMs; 203 } 204 205 public int getSize() { 206 if (state < Resource.STATE_FETCHED) { 207 throw new InvalidStateForActionException("cannot get size for non-fetched resource"); 208 } 209 return size; 210 } 211 212 public int getSizeInternal() { 213 return size; 214 } 215 216 public String getMime() { 217 if (state < Resource.STATE_FETCHED) { 218 throw new InvalidStateForActionException("cannot get mime type for non-fetched resource"); 219 } 220 return mimeType; 221 } 222 223 public String getMimeInternal() { 224 return mimeType; 225 } 226 227 public Date getFetchTime() { 228 if (state < Resource.STATE_FETCHED) { 229 throw new InvalidStateForActionException("cannot get fetch time for non-fetched resource"); 230 } 231 return fetchTime; 232 } 233 234 public Date getFetchTimeInternal() { 235 return fetchTime; 236 } 237 238 public String toString() { 239 StringBuffer sb = new StringBuffer (); 240 sb.append(url.toString()); 241 sb.append("\n STATUS : "); 242 sb.append(translateState(state)); 243 sb.append("\n "); 244 sb.append("\n SPIDER DECISION : "); 245 Decision sd = getSpiderDecision(); 246 if (sd == null) { 247 sb.append("\n "); 248 sb.append("[Not yet taken]"); 249 } else { 250 DecisionStep[] steps = sd.getSteps(); 251 for (int i = 0; i < steps.length; i++) { 252 DecisionStep step = steps[i]; 253 sb.append("\n "); 254 sb.append(step.toString()); 255 } 256 } 257 sb.append("\n "); 258 sb.append("\n PARSE DECISION : "); 259 Decision pd = getParseDecision(); 260 if (pd == null) { 261 sb.append("\n "); 262 sb.append("[Not yet taken]"); 263 } else { 264 DecisionStep[] steps = pd.getSteps(); 265 for (int i = 0; i < steps.length; i++) { 266 DecisionStep step = steps[i]; 267 sb.append("\n "); 268 sb.append(step.toString()); 269 } 270 } 271 sb.append("\n"); 272 273 switch (state) { 274 case STATE_DISCOVERED: 275 break; 276 case STATE_FETCH_ERROR: 277 sb.append(" HTTP Status: "); 278 sb.append(this.getHttpStatus()); 279 Resource[] referers = this.getReferers(); 280 sb.append("\n REFERERS: " + referers.length); 281 for (int i = 0; i < referers.length; i++) { 282 Resource referer = referers[i]; 283 sb.append("\n "); 284 sb.append(referer.getURL()); 285 } 286 break; 287 case STATE_FETCH_IGNORED: 288 break; 289 case STATE_FETCH_FORBIDDEN: 290 break; 291 case STATE_FETCHED: 292 sb.append(" HTTP Status: "); 293 sb.append(this.getHttpStatus()); 294 sb.append(", Content size: "); 295 sb.append(this.getSize()); 296 sb.append(", Mime Type: "); 297 sb.append(this.getMime()); 298 sb.append(", Fetch time: "); 299 sb.append(this.getTimeMs()); 300 break; 301 case STATE_PARSE_ERROR: 302 break; 303 case STATE_PARSE_IGNORED: 304 break; 305 case STATE_PARSED: 306 sb.append(" HTTP Status: "); 307 sb.append(this.getHttpStatus()); 308 sb.append(", Content size: "); 309 sb.append(this.getSize()); 310 sb.append(", Mime Type: "); 311 sb.append(this.getMime()); 312 sb.append(", Fetch time: "); 313 sb.append(this.getTimeMs()); 314 315 referers = this.getReferers(); 316 sb.append("\n REFERERS: " + referers.length); 317 for (int i = 0; i < referers.length; i++) { 318 Resource referer = referers[i]; 319 sb.append("\n "); 320 sb.append(referer.getURL()); 321 } 322 323 if (state == STATE_PARSED) { 324 325 Resource[] references = this.getReferencedResources(); 326 sb.append("\n REFERENCES: " + references.length); 327 for (int i = 0; i < references.length; i++) { 328 Resource reference = references[i]; 329 sb.append("\n "); 330 sb.append(reference.getURL()); 331 } 332 333 EMailAddress[] emails = this.getEmailAddresses(); 334 sb.append("\n E-MAIL ADDRESSES: " + emails.length); 335 for (int i = 0; i < emails.length; i++) { 336 EMailAddress email = emails[i]; 337 sb.append("\n "); 338 sb.append(email.getAddress()); 339 } 340 341 } else { 342 sb.append("\n EMAIL ADDRESSES and REFERENCES not known [Resource not parsed]"); 343 } 344 break; 345 } 346 347 sb.append("\n"); 348 349 return sb.toString(); 350 } 351 352 protected String translateState(int state) { 353 switch (state) { 354 case Resource.STATE_DISCOVERED: 355 return "DISCOVERED"; 356 case Resource.STATE_FETCH_ERROR: 357 return "FETCH_ERROR"; 358 case Resource.STATE_PARSE_ERROR: 359 return "PARSE_ERROR"; 360 case Resource.STATE_FETCHED: 361 return "FETCHED"; 362 case Resource.STATE_FETCH_FORBIDDEN: 363 return "FETCH_FORBIDDEN"; 364 case Resource.STATE_FETCH_IGNORED: 365 return "FETCH_IGNORED"; 366 case Resource.STATE_PARSE_IGNORED: 367 return "PARSE_IGNORED"; 368 case Resource.STATE_PARSED: 369 return "PARSED"; 370 default: 371 return "?!? UNKNOWN STATE ?!?"; 372 373 } 374 } 375 376 public InputStream getInputStream() { 377 return storage.getContentDAO().getInputStream(id); 378 } 379 380 public void setBytes(byte[] bytes) { 381 storage.getContentDAO().setBytes(id, bytes); 382 } 383 384 public Date getFetchTimeStamp() { 385 return fetchTime; 386 } 387 388 public String getStateName() { 389 return translateState(state); 390 } 391 392 public Decision getSpiderDecision() { 393 return storage.getDecisionDAO().findSpiderDecision(this); 394 } 395 396 public Decision getParseDecision() { 397 return storage.getDecisionDAO().findParseDecision(this); 398 } 399 400 public void setState(int state) { 401 this.state = state; 402 } 403 404 public void setMime(String mime) { 405 this.mimeType = mime; 406 } 407 408 public void setTime(int ms) { 409 this.timeMs = ms; 410 } 411 412 public void setSize(int size) { 413 this.size = size; 414 } 415 416 public int getSiteId() { 417 return site; 418 } 419 420 public ResourceReference[] getOutgoingReferences() { 421 return storage.getResourceDAO().getOutgoingReferences(this) ; 422 } 423 424 public ResourceReference[] getIncomingReferences() { 425 return storage.getResourceDAO().getIncomingReferences(this) ; 426 } 427 428 public EMailAddress[] getEmailAddresses() { 429 return storage.getEMailAddressDAO().findByResource(this); 430 } 431 432 public EMailAddressReference[] getEmailAddressReferences() { 433 return storage.getEMailAddressDAO().findReferencesByResource(this); 434 } 435 } 436 | Popular Tags |