package com.etymon.pj;

import java.io.*;
import java.util.*;
import com.etymon.pj.exception.*;
import com.etymon.pj.object.*;

/**
 * Static routines that read the cross-reference table, the trailer and the
 * objects of a PDF file from a {@link RandomAccessFile} and turn them into
 * <code>Pj*</code> objects.
 *
 * <p>All parsing works on byte arrays that were previously read from the
 * file with {@link #readUntil}; the tokenizer state for one such array is
 * kept in a <code>PdfParserState</code>.</p>
 */
public class PdfParser {

    /**
     * Number of bytes subtracted from the remaining data when a stream's
     * declared Length does not fit into the data that was read.
     * NOTE(review): presumably the length of "endstream" + EOL + "endobj" +
     * EOL (9 + 1 + 6 + 1) - confirm.
     */
    private static final int STREAM_TRAILER_LENGTH = 17;

    /**
     * Reads the cross-reference table of the file and then parses every
     * object that is marked as in use, storing each one in
     * <code>pdf._objects</code> under its object number.
     *
     * @param pdf the document that receives the trailer and the objects.
     * @param raf the PDF file to read from.
     * @throws IOException if the file cannot be read.
     * @throws PjException if the file contents cannot be parsed.
     */
    public static void getObjects(Pdf pdf, RandomAccessFile raf)
            throws IOException, PjException {
        long[][] xref = getXref(pdf, raf);
        for (int num = 0; num < xref.length; num++) {
            if (xref[num][2] != 1) {
                continue; // entry is not in use
            }
            raf.seek(xref[num][0]);
            byte[] data = readUntil(raf, "endobj");
            PjObject object = parse(pdf, raf, xref, data, 0);
            pdf._objects.setObjectAt(object, num);
        }
    }

    /**
     * Returns object number <code>num</code>, parsing it from the file (and
     * caching it in <code>pdf._objects</code>) if it has not been loaded yet.
     * Note that this moves the file pointer of <code>raf</code>.
     */
    private static PjObject getObject(Pdf pdf, RandomAccessFile raf, long[][] xref, int num)
            throws IOException, PjException {
        PjObject cached = pdf._objects.objectAt(num);
        if (cached != null) {
            return cached;
        }
        raf.seek(xref[num][0]);
        byte[] data = readUntil(raf, "endobj");
        PjObject parsed = parse(pdf, raf, xref, data, 0);
        pdf._objects.setObjectAt(parsed, num);
        return parsed;
    }

    /**
     * Locates the last cross-reference section via the startxref value and
     * reads it together with all sections it links to through Prev.
     */
    private static long[][] getXref(Pdf pdf, RandomAccessFile raf)
            throws IOException, PjException {
        return getNextXref(pdf, raf, getStartXref(raf), null);
    }

    /**
     * Reads the cross-reference section and trailer found at file offset
     * <code>start</code>.
     *
     * <p>On the first call (<code>xref == null</code>) the table is
     * allocated from the trailer's Size entry and the trailer hashtable is
     * stored in <code>pdf._trailer</code>. If the trailer has a Prev entry,
     * that older section is processed first, so the entries of this section
     * are written last and overwrite older ones.</p>
     *
     * @return the table; each row is {offset, generation, in-use flag}.
     */
    private static long[][] getNextXref(Pdf pdf, RandomAccessFile raf, long start,
                                        long[][] xref)
            throws IOException, PjException {
        raf.seek(start);
        byte[] xrefData = readUntil(raf, "trailer");
        byte[] trailerData = readUntil(raf, "startxref");
        PjDictionary trailer = (PjDictionary) parse(pdf, raf, xref, trailerData, 0);
        Hashtable h = trailer.getHashtable();

        long[][] table;
        if (xref == null) {
            Object size = h.get(new PjName("Size"));
            if (!(size instanceof PjNumber)) {
                // Previously a NullPointerException / ClassCastException.
                throw new XrefFormatException("Trailer has no numeric Size entry (xref).");
            }
            table = new long[((PjNumber) size).getInt()][3];
            pdf._trailer = h;
        } else {
            table = xref;
        }

        PjNumber prev = (PjNumber) h.get(new PjName("Prev"));
        if (prev != null) {
            table = getNextXref(pdf, raf, prev.getLong(), table);
        }
        parseXref(xrefData, table, 0);
        return table;
    }

    /**
     * Scans the end of the file for the <code>startxref</code> keyword and
     * returns the decimal offset that follows it.
     *
     * <p>The scan window grows by <code>PjConst.SCAN_STARTXREF</code> bytes
     * on each of <code>PjConst.SCAN_STARTXREF_RETRY</code> attempts. The
     * window start is clamped to the beginning of the file so that files
     * shorter than the window do not cause a seek to a negative offset.</p>
     *
     * @throws StartxrefFormatException if no usable offset is found.
     */
    private static long getStartXref(RandomAccessFile raf)
            throws IOException, PjException {
        long fileSize = raf.length();
        int scan = 0;
        for (int retry = PjConst.SCAN_STARTXREF_RETRY; retry > 0; retry--) {
            scan = scan + PjConst.SCAN_STARTXREF;
            raf.seek(Math.max(0L, fileSize - scan));
            readUntil(raf, "startxref"); // discard everything up to the keyword
            byte[] buffer = readUntil(raf, "%%EOF");

            StringBuffer digits = new StringBuffer();
            int i = 0;
            while ((i < buffer.length) && (buffer[i] >= '0') && (buffer[i] <= '9')) {
                digits.append((char) buffer[i]);
                i++;
            }
            // Accept the number only if it was terminated by a non-digit
            // (i.e. we did not run off the end of the buffer) and is not empty.
            if ((i < buffer.length) && (digits.length() > 0)) {
                try {
                    return Long.parseLong(digits.toString());
                } catch (NumberFormatException e) {
                    throw new StartxrefFormatException("Invalid startxref offset (startxref).");
                }
            }
        }
        throw new StartxrefFormatException("Unexpected end of file (startxref).");
    }

    /**
     * Reads bytes from the current file position until <code>endstr</code>
     * has been seen immediately followed by an end-of-line, or until end of
     * file.
     *
     * <p>The returned data includes <code>endstr</code> and its end-of-line
     * (CR, LF or CR LF). The file pointer is left on the first byte after
     * that end-of-line. If end of file is reached first, everything read so
     * far is returned.</p>
     *
     * @param raf the file to read from.
     * @param endstr the terminating keyword.
     * @return the bytes read.
     * @throws IOException if the file cannot be read.
     */
    public static byte[] readUntil(RandomAccessFile raf, String endstr) throws IOException {
        StringBuffer sb = new StringBuffer();
        char pendingEol = '\0'; // EOL char that followed endstr, once matched
        boolean finished = false;
        boolean eof = false;
        while (!eof && !finished) {
            try {
                char c = (char) raf.readUnsignedByte();
                if (pendingEol == '\0') {
                    if (((c == '\r') || (c == '\n')) && endsWith(sb, endstr)) {
                        pendingEol = c;
                    }
                    sb.append(c);
                } else if (pendingEol == '\n') {
                    // The byte after LF belongs to the next item; un-read it.
                    raf.seek(raf.getFilePointer() - 1);
                    finished = true;
                } else if (pendingEol == '\r') {
                    if (c == '\n') {
                        sb.append(c); // CR LF pair
                    } else {
                        raf.seek(raf.getFilePointer() - 1);
                    }
                    finished = true;
                }
            } catch (EOFException e) {
                eof = true;
            }
        }
        int length = sb.length();
        byte[] buffer = new byte[length];
        for (int i = 0; i < length; i++) {
            buffer[i] = (byte) sb.charAt(i);
        }
        return buffer;
    }

    /** Returns true if the text accumulated in <code>sb</code> ends with <code>suffix</code>. */
    private static boolean endsWith(StringBuffer sb, String suffix) {
        int offset = sb.length() - suffix.length();
        if (offset < 0) {
            return false;
        }
        for (int i = 0; i < suffix.length(); i++) {
            if (sb.charAt(offset + i) != suffix.charAt(i)) {
                return false;
            }
        }
        return true;
    }

    /**
     * Reads one line from the file, without its end-of-line characters.
     *
     * <p>All consecutive CR / LF bytes after the line are consumed, so the
     * file pointer is left on the first byte that is not an end-of-line
     * character (blank lines are therefore skipped).</p>
     *
     * @return the line, or <code>null</code> if end of file was reached
     *         before any byte could be read.
     */
    protected static String readLine(RandomAccessFile raf) throws IOException {
        StringBuffer line = new StringBuffer();
        boolean sawEol = false;
        boolean readAny = false;
        boolean hitEof = false;
        boolean overshoot = false; // read the first byte of the next line
        while (!hitEof && !overshoot) {
            try {
                char c = (char) raf.readUnsignedByte();
                if ((c == '\r') || (c == '\n')) {
                    sawEol = true;
                } else if (sawEol) {
                    overshoot = true;
                } else {
                    line.append(c);
                }
                readAny = true;
            } catch (EOFException e) {
                hitEof = true;
            }
        }
        if (overshoot) {
            raf.seek(raf.getFilePointer() - 1);
        }
        if (hitEof && !readAny) {
            return null;
        }
        return line.toString();
    }

    /**
     * Parses one cross-reference section (starting with the line
     * <code>xref</code>) and stores its entries in <code>xref</code>.
     *
     * <p>Each row receives {byte offset, generation number, 1 if the entry
     * type is <code>n</code> else 0}. Offsets are parsed as
     * <code>long</code> so that 10-digit offsets beyond the
     * <code>int</code> range are accepted. Entries whose object number is
     * not smaller than <code>xref.length</code> are ignored. Parsing stops
     * at the <code>trailer</code> line or at the end of the data.</p>
     *
     * @param data the raw bytes of the section.
     * @param xref the table to fill.
     * @param start index in <code>data</code> at which the section begins.
     * @throws XrefFormatException if the section is malformed.
     */
    public static void parseXref(byte[] data, long[][] xref, int start) throws XrefFormatException {
        PdfParserState state = new PdfParserState();
        state._data = data;
        state._pos = start;
        if (!getLine(state) || !"xref".equals(state._token)) {
            throw new XrefFormatException("Start of xref not found (xref).");
        }
        while (state._pos < state._data.length) {
            if (!getLine(state)) {
                return; // trailing text without an end-of-line
            }
            if (state._token.equals("trailer")) {
                return;
            }
            StringTokenizer header = new StringTokenizer(state._token);
            if (!header.hasMoreTokens()) {
                continue; // blank line
            }
            try {
                int index = Integer.parseInt(header.nextToken());
                int count = Integer.parseInt(header.nextToken());
                if (index < 0) {
                    throw new XrefFormatException("Negative object number (xref).");
                }
                for (int i = 0; i < count; i++, index++) {
                    if (!getLine(state)) {
                        throw new XrefFormatException("Unexpected end of xref section (xref).");
                    }
                    if (index >= xref.length) {
                        continue; // beyond the declared table size: ignore
                    }
                    StringTokenizer entry = new StringTokenizer(state._token);
                    xref[index][0] = Long.parseLong(entry.nextToken());
                    xref[index][1] = Long.parseLong(entry.nextToken());
                    xref[index][2] = entry.nextToken().equals("n") ? 1 : 0;
                }
            } catch (NumberFormatException e) {
                throw new XrefFormatException("Invalid number in xref section (xref).");
            } catch (NoSuchElementException e) {
                throw new XrefFormatException("Incomplete line in xref section (xref).");
            }
        }
    }

    /**
     * Parses the PDF object found in <code>data</code> beginning at
     * <code>start</code>.
     *
     * <p>Tokens are pushed on a stack and combined when a closing token
     * (<code>&gt;&gt;</code>, <code>]</code>, <code>R</code>,
     * <code>endstream</code>) is seen. Parsing stops at
     * <code>endobj</code>, <code>startxref</code>, <code>%%EOF</code> or at
     * the end of the data, and whatever is on top of the stack is
     * returned.</p>
     *
     * <p>NOTE(review): numbers are converted through <code>float</code>, so
     * integers above 2^24 (for example large stream lengths or Prev
     * offsets) may lose precision. Fixing that needs a wider
     * <code>PjNumber</code> constructor - confirm one exists.</p>
     *
     * @param pdf the document, used to resolve an indirect stream Length.
     * @param raf the file, used to resolve an indirect stream Length.
     * @param xref the cross-reference table (may be null while the trailer
     *             is being parsed).
     * @param data the bytes to parse.
     * @param start index in <code>data</code> at which parsing begins.
     * @return the parsed object.
     * @throws IOException if an indirect Length cannot be read.
     * @throws PjException if the data cannot be parsed.
     */
    public static PjObject parse(Pdf pdf, RandomAccessFile raf, long[][] xref, byte[] data, int start)
            throws IOException, PjException {
        PdfParserState state = new PdfParserState();
        state._data = data;
        state._pos = start;
        state._stream = -1;
        Stack stack = new Stack();
        boolean endFlag = false;
        while (!endFlag && getToken(state)) {
            if (state._stream != -1) {
                // getToken delivered raw stream bytes rather than a token.
                stack.push(state._streamToken);
                state._stream = -1;
                continue;
            }
            String token = state._token;
            if (token.length() == 0) {
                continue;
            }
            char first = token.charAt(0);
            if (token.equals("startxref") || token.equals("endobj") || token.equals("%%EOF")) {
                endFlag = true;
            } else if (token.equals("endstream")) {
                byte[] stream = (byte[]) stack.pop();
                PjStreamDictionary pjsd = new PjStreamDictionary(
                        ((PjDictionary) stack.pop()).getHashtable());
                stack.push(new PjStream(pjsd, stream));
            } else if (token.equals("stream")) {
                PjObject length = (PjObject) ((PjDictionary) stack.peek())
                        .getHashtable().get(new PjName("Length"));
                if (length instanceof PjReference) {
                    length = getObject(pdf, raf, xref,
                            ((PjReference) length).getObjNumber().getInt());
                }
                int declared = ((PjNumber) length).getInt();
                skipStreamEol(state);
                int remaining = state._data.length - state._pos;
                if ((declared < 0) || (declared > remaining)) {
                    // Declared length cannot be right; assume the stream
                    // fills the data up to the closing keywords. Never go
                    // negative: -1 is the "no stream pending" sentinel.
                    declared = Math.max(0, remaining - STREAM_TRAILER_LENGTH);
                }
                state._stream = declared;
            } else if (token.equals("null")) {
                stack.push(new PjNull());
            } else if (token.equals("true")) {
                stack.push(new PjBoolean(true));
            } else if (token.equals("false")) {
                stack.push(new PjBoolean(false));
            } else if (token.equals("R")) {
                stack.pop(); // generation number is discarded
                PjNumber objNumber = (PjNumber) stack.pop();
                stack.push(new PjReference(objNumber, PjNumber.ZERO));
            } else if ((first == '<') && !token.startsWith("<<")) {
                stack.push(new PjString(PjString.decodePdf(token))); // hex string
            } else if (Character.isDigit(first) || (first == '-') || (first == '+')
                    || (first == '.')) {
                stack.push(new PjNumber(Float.parseFloat(token)));
            } else if (first == '(') {
                stack.push(new PjString(PjString.decodePdf(token)));
            } else if (first == '/') {
                stack.push(new PjName(token.substring(1)));
            } else if (token.equals(">>")) {
                stack.push(closeDictionary(stack));
            } else if (token.equals("]")) {
                stack.push(closeArray(stack));
            } else if (token.startsWith("%")) {
                // comment: ignored
            } else {
                // "<<", "[", "obj" and any unknown keyword stay as markers.
                stack.push(token);
            }
        }
        return (PjObject) stack.pop();
    }

    /**
     * Skips the end-of-line that separates the <code>stream</code> keyword
     * from the stream data. The tokenizer has already consumed the single
     * character that terminated the keyword, so what remains depends on
     * that character: nothing after LF, an optional LF after CR, and an
     * optional CR and/or LF after anything else.
     */
    private static void skipStreamEol(PdfParserState state) {
        byte[] data = state._data;
        if (state._pos >= data.length) {
            return;
        }
        char terminator = (state._pos > 0) ? (char) (data[state._pos - 1] & 0xFF) : ' ';
        if (terminator == '\n') {
            return; // the data starts right here
        }
        if ((terminator != '\r') && (data[state._pos] == '\r')) {
            state._pos++;
        }
        if ((state._pos < data.length) && (data[state._pos] == '\n')) {
            state._pos++;
        }
    }

    /**
     * Pops name/value pairs down to the "&lt;&lt;" marker and returns the
     * resulting dictionary, as the most specific known dictionary type.
     */
    private static Object closeDictionary(Stack stack) throws IOException, PjException {
        Hashtable h = new Hashtable();
        while (true) {
            Object value = stack.pop();
            if ((value instanceof String) && "<<".equals(value)) {
                break;
            }
            h.put((PjName) stack.pop(), (PjObject) value);
        }
        PjDictionary dictionary = new PjDictionary(h);
        if (PjPage.isLike(dictionary)) {
            return new PjPage(h);
        }
        if (PjPages.isLike(dictionary)) {
            return new PjPages(h);
        }
        if (PjFontType1.isLike(dictionary)) {
            return new PjFontType1(h);
        }
        if (PjFontDescriptor.isLike(dictionary)) {
            return new PjFontDescriptor(h);
        }
        if (PjResources.isLike(dictionary)) {
            return new PjResources(h);
        }
        if (PjCatalog.isLike(dictionary)) {
            return new PjCatalog(h);
        }
        if (PjInfo.isLike(dictionary)) {
            return new PjInfo(h);
        }
        if (PjEncoding.isLike(dictionary)) {
            return new PjEncoding(h);
        }
        return dictionary;
    }

    /**
     * Pops elements down to the "[" marker and returns the resulting array
     * (elements in their original order), as the most specific known array
     * type.
     */
    private static Object closeArray(Stack stack) throws IOException, PjException {
        Vector v = new Vector();
        while (true) {
            Object element = stack.pop();
            if ((element instanceof String) && "[".equals(element)) {
                break;
            }
            v.insertElementAt((PjObject) element, 0);
        }
        PjArray array = new PjArray(v);
        if (PjRectangle.isLike(array)) {
            return new PjRectangle(v);
        }
        if (PjProcSet.isLike(array)) {
            return new PjProcSet(v);
        }
        return array;
    }

    /**
     * Reads the next line (terminated by CR, LF or CR LF) from the state's
     * data into <code>state._token</code>, without the terminator.
     *
     * @return false if the data ended before a line terminator was found;
     *         <code>state._token</code> is left unchanged in that case.
     */
    private static boolean getLine(PdfParserState state) {
        StringBuffer sb = new StringBuffer();
        while (state._pos < state._data.length) {
            char c = (char) (state._data[state._pos]);
            state._pos++;
            if ((c != '\r') && (c != '\n')) {
                sb.append(c);
                continue;
            }
            if ((c == '\r') && peekIs(state, '\n')) {
                state._pos++; // swallow the LF of a CR LF pair
            }
            state._token = sb.toString();
            return true;
        }
        return false;
    }

    /** Returns true if the byte at the current position equals <code>expected</code>. */
    private static boolean peekIs(PdfParserState state, char expected) {
        return (state._pos < state._data.length)
                && ((char) (state._data[state._pos] & 0xFF) == expected);
    }

    /**
     * Reads the next token from the state's data.
     *
     * <p>If <code>state._stream</code> is not -1, that many raw bytes (at
     * most the bytes that are left) are copied to
     * <code>state._streamToken</code> instead. Otherwise leading white
     * space is skipped and one token is stored in
     * <code>state._token</code>: a delimiter (<code>[ ] &lt;&lt;
     * &gt;&gt;</code>), a literal string including its parentheses, a hex
     * string including its angle brackets, a comment up to the end of the
     * line, or a run of regular characters.</p>
     *
     * <p>Inside literal strings a backslash escapes the following
     * character, so <code>\\</code> followed by <code>)</code> ends the
     * string while <code>\)</code> does not. Bytes are mapped to the
     * characters 0-255.</p>
     *
     * @return false if the data ended before a token could be completed.
     */
    private static boolean getToken(PdfParserState state) {
        byte[] data = state._data;
        if (state._stream != -1) {
            int available = Math.max(0, data.length - state._pos);
            int count = Math.max(0, Math.min(state._stream, available));
            state._streamToken = new byte[count];
            System.arraycopy(data, state._pos, state._streamToken, 0, count);
            state._pos = state._pos + count;
            return true;
        }
        skipWhitespace(state);
        StringBuffer sb = new StringBuffer();
        boolean firstChar = true;
        boolean string = false;   // inside ( ... )
        boolean hstring = false;  // inside < ... >
        boolean escaped = false;  // previous string char was an unescaped backslash
        int depth = 0;            // nesting level of unescaped parentheses
        while (state._pos < data.length) {
            char c = (char) (data[state._pos] & 0xFF);
            state._pos++;
            if (firstChar) {
                switch (c) {
                case '(':
                    string = true;
                    break;
                case ']':
                    state._token = "]";
                    return true;
                case '>':
                    if (peekIs(state, '>')) {
                        state._pos++;
                        state._token = ">>";
                        return true;
                    }
                    break;
                case '%':
                    sb.append('%');
                    while (state._pos < data.length) {
                        char n = (char) (data[state._pos] & 0xFF);
                        if ((n == '\n') || (n == '\r')) {
                            break;
                        }
                        sb.append(n);
                        state._pos++;
                    }
                    state._token = sb.toString();
                    return true;
                default:
                    break;
                }
            }
            if (string) {
                if (escaped) {
                    escaped = false; // this character is taken literally
                } else if (c == '\\') {
                    escaped = true;
                } else if (c == '(') {
                    depth++;
                } else if (c == ')') {
                    if (depth == 1) {
                        sb.append(c);
                        state._token = sb.toString();
                        return true;
                    }
                    depth--;
                }
                sb.append(c);
            } else if (hstring) {
                sb.append(c);
                if (c == '>') {
                    state._token = sb.toString();
                    return true;
                }
            } else if (isWhitespace(c)) {
                state._token = sb.toString();
                return true;
            } else {
                switch (c) {
                case '[':
                    if (firstChar) {
                        state._token = "[";
                        return true;
                    }
                    state._pos--; // delimiter belongs to the next token
                    state._token = sb.toString();
                    return true;
                case '<':
                    if (!firstChar) {
                        state._pos--;
                        state._token = sb.toString();
                        return true;
                    }
                    if (peekIs(state, '<')) {
                        state._pos++;
                        state._token = "<<";
                        return true;
                    }
                    hstring = true;
                    sb.append(c);
                    break;
                case ']':
                case '/':
                case '(':
                    if (!firstChar) {
                        state._pos--;
                        state._token = sb.toString();
                        return true;
                    }
                    sb.append(c);
                    break;
                case '>':
                    if (peekIs(state, '>')) {
                        state._pos--; // leave ">>" for the next call
                        state._token = sb.toString();
                        return true;
                    }
                    sb.append(c);
                    break;
                default:
                    sb.append(c);
                    break;
                }
            }
            firstChar = false;
        }
        // End of data: a regular token that simply was not followed by
        // white space is still a token; an unterminated string is not.
        if (!string && !hstring && (sb.length() > 0)) {
            state._token = sb.toString();
            return true;
        }
        return false;
    }

    /** Advances <code>state._pos</code> past any white-space bytes. */
    private static void skipWhitespace(PdfParserState state) {
        while ((state._pos < state._data.length)
                && isWhitespace((char) (state._data[state._pos] & 0xFF))) {
            state._pos++;
        }
    }

    /**
     * Returns true for the PDF white-space characters: NUL, tab, line feed,
     * form feed, carriage return and space.
     */
    private static boolean isWhitespace(char c) {
        switch (c) {
        case '\0':
        case '\t':
        case '\n':
        case '\f':
        case '\r':
        case ' ':
            return true;
        default:
            return false;
        }
    }

}