KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > dspace > content > packager > PDFPackager


1 /*
2  * PDFPackager.java
3  *
4  * Version: $Revision: 1.1 $
5  *
6  * Date: $Date: 2006/03/17 00:04:38 $
7  *
8  * Copyright (c) 2002-2005, Hewlett-Packard Company and Massachusetts
9  * Institute of Technology. All rights reserved.
10  *
11  * Redistribution and use in source and binary forms, with or without
12  * modification, are permitted provided that the following conditions are
13  * met:
14  *
15  * - Redistributions of source code must retain the above copyright
16  * notice, this list of conditions and the following disclaimer.
17  *
18  * - Redistributions in binary form must reproduce the above copyright
19  * notice, this list of conditions and the following disclaimer in the
20  * documentation and/or other materials provided with the distribution.
21  *
22  * - Neither the name of the Hewlett-Packard Company nor the name of the
23  * Massachusetts Institute of Technology nor the names of their
24  * contributors may be used to endorse or promote products derived from
25  * this software without specific prior written permission.
26  *
27  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
28  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
29  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
30  * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
31  * HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
32  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
33  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
34  * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
35  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR
36  * TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
37  * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
38  * DAMAGE.
39  */

40
41 package org.dspace.content.packager;
42
43 import java.io.IOException JavaDoc;
44 import java.io.InputStream JavaDoc;
45 import java.io.OutputStream JavaDoc;
46 import java.sql.SQLException JavaDoc;
47 import java.util.Calendar JavaDoc;
48
49 import org.apache.log4j.Logger;
50 import org.dspace.authorize.AuthorizeException;
51 import org.dspace.content.Bitstream;
52 import org.dspace.content.BitstreamFormat;
53 import org.dspace.content.Bundle;
54 import org.dspace.content.Collection;
55 import org.dspace.content.DCDate;
56 import org.dspace.content.DSpaceObject;
57 import org.dspace.content.Item;
58 import org.dspace.content.WorkspaceItem;
59 import org.dspace.content.crosswalk.CrosswalkException;
60 import org.dspace.content.crosswalk.MetadataValidationException;
61 import org.dspace.core.Constants;
62 import org.dspace.core.Context;
63 import org.dspace.core.LogManager;
64 import org.dspace.core.SelfNamedPlugin;
65 import org.dspace.core.Utils;
66 import org.pdfbox.cos.COSDocument;
67 import org.pdfbox.pdfparser.PDFParser;
68 import org.pdfbox.pdmodel.PDDocument;
69 import org.pdfbox.pdmodel.PDDocumentInformation;
70                                     
71 /**
72  * Accept a PDF file by itself as a SIP.
73  * <p>
74  * This is mainly a proof-of-concept to demonstrate the flexibility
75  * of the packager and crosswalk plugins.
76  * <p>
77  * To import, open up the PDF and try to extract sufficient metadata
78  * from its InfoDict.
79  * <p>
80  * Export is a crude hack: if the item has a bitstream containing PDF,
81  * send that, otherwise it fails. Do not attempt to insert metadata.
82  *
83  * @author Larry Stone
84  * @version $Revision: 1.1 $
85  */

86 public class PDFPackager
87        extends SelfNamedPlugin
88        implements PackageIngester, PackageDisseminator
89 {
90     /** log4j category */
91     private static Logger log = Logger.getLogger(PDFPackager.class);
92
93     private final static String JavaDoc BITSTREAM_FORMAT_NAME = "Adobe PDF";
94
95     private static String JavaDoc aliases[] = { "PDF", "Adobe PDF", "pdf", "application/pdf" };
96
97     public static String JavaDoc[] getPluginNames()
98     {
99         return aliases;
100     }
101
102     // utility to grovel bitstream formats..
103
private static void setFormatToMIMEType(Context context, Bitstream bs, String JavaDoc mimeType)
104         throws SQLException JavaDoc
105     {
106         BitstreamFormat bf[] = BitstreamFormat.findNonInternal(context);
107         for (int i = 0; i < bf.length; ++i)
108         {
109             if (bf[i].getMIMEType().equalsIgnoreCase(mimeType))
110             {
111                 bs.setFormat(bf[i]);
112                 break;
113             }
114         }
115     }
116
117     /**
118      * Create new Item out of the ingested package, in the indicated
119      * collection. It creates a workspace item, which the application
120      * can then install if it chooses to bypass Workflow.
121      * <p>
122      * This is a VERY crude import of a single Adobe PDF (Portable
123      * Document Format) file, using the document's embedded metadata
124      * for package metadata. If the PDF file hasn't got the minimal
125      * metadata available, it is rejected.
126      * <p>
127      * @param context DSpace context.
128      * @param collection collection under which to create new item.
129      * @param pkg input stream containing package to ingest.
130      * @param params package parameters (none recognized)
131      * @param license may be null, which takes default license.
132      * @return workspace item created by ingest.
133      * @throws PackageException if package is unacceptable or there is
134      * a fatal error turning it into an Item.
135      */

136     public WorkspaceItem ingest(Context context, Collection collection,
137                                 InputStream JavaDoc pkg, PackageParameters params,
138                                 String JavaDoc license)
139         throws PackageValidationException, CrosswalkException,
140                AuthorizeException, SQLException JavaDoc, IOException JavaDoc
141     {
142         InputStream JavaDoc bis = null;
143         COSDocument cos = null;
144         boolean success = false;
145         Bundle original = null;
146         Bitstream bs = null;
147         WorkspaceItem wi = null;
148
149         /** XXX comment out for now
150           // XXX for debugging of parameter handling
151           if (params != null)
152           {
153               Enumeration pe = params.propertyNames();
154               while (pe.hasMoreElements())
155               {
156                   String name = (String)pe.nextElement();
157                   String v[] = params.getProperties(name);
158                   StringBuffer msg = new StringBuffer("PackageParam: ");
159                   msg.append(name).append(" = ");
160                   for (int i = 0; i < v.length; ++i)
161                   {
162                       if (i > 0)
163                           msg.append(", ");
164                       msg.append(v[i]);
165                   }
166                   log.debug(msg);
167               }
168           }
169         **/

170            
171         try
172         {
173             // Save the PDF in a bitstream first, since the parser
174
// has to read it as well, and we cannot "rewind" it after that.
175
wi = WorkspaceItem.create(context, collection, false);
176             Item myitem = wi.getItem();
177             original = myitem.createBundle("ORIGINAL");
178             bs = original.createBitstream(pkg);
179             pkg.close();
180             bs.setName("package.pdf");
181             setFormatToMIMEType(context, bs, "application/pdf");
182             bs.update();
183             log.debug("Created bitstream ID="+String.valueOf(bs.getID())+", parsing...");
184
185             crosswalkPDF(context, myitem, bs.retrieve());
186
187             wi.update();
188             context.commit();
189             success = true;
190             log.info(LogManager.getHeader(context, "ingest",
191                 "Created new Item, db ID="+String.valueOf(myitem.getID())+
192                 ", WorkspaceItem ID="+String.valueOf(wi.getID())));
193             return wi;
194         }
195         finally
196         {
197             try
198             {
199                 // Close bitstream input stream and PDF file.
200
if (bis != null)
201                     bis.close();
202                 if (cos != null)
203                     cos.close();
204             }
205             catch (IOException JavaDoc ie)
206             { }
207
208             // get rid of bitstream and item if ingest fails
209
if (!success)
210             {
211                 if (original != null && bs != null)
212                     original.removeBitstream(bs);
213                 if (wi != null)
214                     wi.deleteAll();
215             }
216             context.commit();
217         }
218     }
219
220     /**
221      * Replace is not implemented.
222      */

223     public Item replace(Context ctx, Item item, InputStream JavaDoc pckage, PackageParameters params)
224         throws PackageValidationException, CrosswalkException,
225                AuthorizeException, SQLException JavaDoc, IOException JavaDoc,
226                UnsupportedOperationException JavaDoc
227     {
228         throw new UnsupportedOperationException JavaDoc("The replace operation is not implemented.");
229     }
230
231     /**
232      * VERY crude dissemination: just look for the first
233      * bitstream with the PDF package type, and toss it out.
234      * Works on packages importer with this packager, and maybe some others.
235      */

236     public void disseminate(Context context, DSpaceObject dso,
237                             PackageParameters params, OutputStream JavaDoc out)
238         throws PackageValidationException, CrosswalkException,
239                AuthorizeException, SQLException JavaDoc, IOException JavaDoc
240     {
241         if (dso.getType() != Constants.ITEM)
242             throw new PackageValidationException("This disseminator can only handle objects of type ITEM.");
243
244         Item item = (Item)dso;
245         try
246         {
247             BitstreamFormat pdff = BitstreamFormat.findByShortDescription(context,
248                                     BITSTREAM_FORMAT_NAME);
249             if (pdff == null)
250                 throw new PackageValidationException("Cannot find BitstreamFormat \""+BITSTREAM_FORMAT_NAME+"\"");
251             Bitstream pkgBs = PackageUtils.getBitstreamByFormat(item, pdff, Constants.DEFAULT_BUNDLE_NAME);
252             if (pkgBs == null)
253                 throw new PackageValidationException("Cannot find Bitstream with format \""+BITSTREAM_FORMAT_NAME+"\"");
254             Utils.copy(pkgBs.retrieve(), out);
255         }
256             finally {}
257     }
258
259     /**
260      * Identifies the MIME-type of this package, i.e. "application/pdf".
261      *
262      * @return the MIME type (content-type header) of the package to be returned
263      */

264     public String JavaDoc getMIMEType(PackageParameters params)
265     {
266         return "application/pdf";
267     }
268
269     private void crosswalkPDF(Context context, Item item, InputStream JavaDoc metadata)
270         throws CrosswalkException, IOException JavaDoc, SQLException JavaDoc, AuthorizeException
271     {
272         COSDocument cos = null;
273
274         try
275         {
276             PDFParser parser = new PDFParser(metadata);
277             parser.parse();
278             cos = parser.getDocument();
279
280             // sanity check: PDFBox breaks on encrypted documents, so give up.
281
if(cos.getEncryptionDictionary() != null)
282                 throw new MetadataValidationException("This packager cannot accept an encrypted PDF document.");
283
284             /* PDF to DC "crosswalk":
285              *
286              * NOTE: This is not in a crosswalk plugin because (a) it isn't
287              * useful anywhere else, and more importantly, (b) the source
288              * data is not XML so it doesn't fit the plugin's interface.
289              *
290              * pattern of crosswalk -- PDF dict entries to DC:
291              * Title -> title.null
292              * Author -> contributor.author
293              * CreationDate -> date.created
294              * ModDate -> date.created
295              * Creator -> description.provenance (application that created orig)
296              * Producer -> description.provenance (convertor to pdf)
297              * Subject -> description.abstract
298              * Keywords -> subject.other
299              * date is java.util.Calendar
300              */

301             PDDocument pd = new PDDocument(cos);
302             PDDocumentInformation docinfo = pd.getDocumentInformation();
303             String JavaDoc title = docinfo.getTitle();
304
305             // sanity check: item must have a title.
306
if (title == null)
307                 throw new MetadataValidationException("This PDF file is unacceptable, it does not have a value for \"Title\" in its Info dictionary.");
308             log.debug("PDF Info dict title=\""+title+"\"");
309             item.addDC("title", null, "en", title);
310             String JavaDoc value;
311             Calendar JavaDoc date;
312             if ((value = docinfo.getAuthor()) != null)
313             {
314                 item.addDC("contributor", "author", null, value);
315                 log.debug("PDF Info dict author=\""+value+"\"");
316             }
317             if ((value = docinfo.getCreator()) != null)
318                 item.addDC("description", "provenance", "en",
319                               "Application that created the original document: "+value);
320             if ((value = docinfo.getProducer()) != null)
321                 item.addDC("description", "provenance", "en",
322                               "Original document converted to PDF by: "+value);
323             if ((value = docinfo.getSubject()) != null)
324                 item.addDC("description", "abstract", null, value);
325             if ((value = docinfo.getKeywords()) != null)
326                 item.addDC("subject", "other", null, value);
327
328             // Take either CreationDate or ModDate as "date.created",
329
// Too bad there's no place to put "last modified" in the DC.
330
Calendar JavaDoc calValue;
331             if ((calValue = docinfo.getCreationDate()) == null)
332                 calValue = docinfo.getModificationDate();
333             if (calValue != null)
334                 item.addDC("date", "created", null,
335                              (new DCDate(calValue.getTime())).toString());
336             item.update();
337         }
338         finally
339         {
340             if (cos != null)
341                 cos.close();
342         }
343     }
344 }
345
Popular Tags