KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > dspace > search > Harvest


1 /*
2  * Harvest.java
3  *
4  * Version: $Revision: 1.18 $
5  *
6  * Date: $Date: 2006/11/24 00:44:02 $
7  *
8  * Copyright (c) 2002-2005, Hewlett-Packard Company and Massachusetts
9  * Institute of Technology. All rights reserved.
10  *
11  * Redistribution and use in source and binary forms, with or without
12  * modification, are permitted provided that the following conditions are
13  * met:
14  *
15  * - Redistributions of source code must retain the above copyright
16  * notice, this list of conditions and the following disclaimer.
17  *
18  * - Redistributions in binary form must reproduce the above copyright
19  * notice, this list of conditions and the following disclaimer in the
20  * documentation and/or other materials provided with the distribution.
21  *
22  * - Neither the name of the Hewlett-Packard Company nor the name of the
23  * Massachusetts Institute of Technology nor the names of their
24  * contributors may be used to endorse or promote products derived from
25  * this software without specific prior written permission.
26  *
27  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
28  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
29  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
30  * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
31  * HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
32  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
33  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
34  * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
35  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR
36  * TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
37  * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
38  * DAMAGE.
39  */

40 package org.dspace.search;
41
42 import java.sql.Date JavaDoc;
43 import java.sql.SQLException JavaDoc;
44 import java.sql.Timestamp JavaDoc;
45 import java.text.ParseException JavaDoc;
46 import java.text.SimpleDateFormat JavaDoc;
47 import java.util.ArrayList JavaDoc;
48 import java.util.Calendar JavaDoc;
49 import java.util.LinkedList JavaDoc;
50 import java.util.List JavaDoc;
51 import java.util.TimeZone JavaDoc;
52
53 import org.apache.log4j.Logger;
54 import org.dspace.content.DSpaceObject;
55 import org.dspace.content.Item;
56 import org.dspace.core.ConfigurationManager;
57 import org.dspace.core.Constants;
58 import org.dspace.core.Context;
59 import org.dspace.core.LogManager;
60 import org.dspace.handle.HandleManager;
61 import org.dspace.storage.rdbms.DatabaseManager;
62 import org.dspace.storage.rdbms.TableRow;
63 import org.dspace.storage.rdbms.TableRowIterator;
64
65 /**
66  * Utility class for extracting information about items, possibly just within a
67  * certain community or collection, that have been created, modified or
68  * withdrawn within a particular range of dates.
69  *
70  * @author Robert Tansley
71  * @version $Revision: 1.18 $
72  */

73 public class Harvest
74 {
75     /** log4j logger */
76     private static Logger log = Logger.getLogger(Harvest.class);
77     
78     /**
79      * Obtain information about items that have been created, modified or
80      * withdrawn within a given date range. You can also specify 'offset' and
81      * 'limit' so that a big harvest can be split up into smaller sections.
82      * <P>
83      * Note that dates are passed in the standard ISO8601 format used by DSpace
84      * (and OAI-PMH).
85      * <P>
86      * FIXME: Assumes all in_archive items have public metadata
87      *
88      * @param context
89      * DSpace context
90      * @param scope
91      * a Collection, Community, or <code>null</code> indicating the scope is
92      * all of DSpace
93      * @param startDate
94      * start of date range, or <code>null</code>
95      * @param endDate
96      * end of date range, or <code>null</code>
97      * @param offset
98      * for a partial harvest, the point in the overall list of
99      * matching items to start at. 0 means just start at the
100      * beginning.
101      * @param limit
102      * the number of matching items to return in a partial harvest.
103      * Specify 0 to return the whole list (or the rest of the list if
104      * an offset was specified.)
105      * @param items
106      * if <code>true</code> the <code>item</code> field of each
107      * <code>HarvestedItemInfo</code> object is filled out
108      * @param collections
109      * if <code>true</code> the <code>collectionHandles</code>
110      * field of each <code>HarvestedItemInfo</code> object is
111      * filled out
112      * @param withdrawn
113      * If <code>true</code>, information about withdrawn items is
114      * included
115      * @return List of <code>HarvestedItemInfo</code> objects
116      * @throws SQLException
117      * @throws ParseException If the date is not in a supported format
118      */

119     public static List JavaDoc harvest(Context context, DSpaceObject scope,
120             String JavaDoc startDate, String JavaDoc endDate, int offset, int limit,
121             boolean items, boolean collections, boolean withdrawn)
122             throws SQLException JavaDoc, ParseException JavaDoc
123     {
124
125         // Put together our query. Note there is no need for an
126
// "in_archive=true" condition, we are using the existence of
127
// Handles as our 'existence criterion'.
128
String JavaDoc query = "SELECT handle.handle, handle.resource_id, item.withdrawn, item.last_modified FROM handle, item";
129         
130         
131         // We are building a complex query that may contain a variable
132
// about of input data points. To accomidate this while still
133
// providing type safty we build a list of parameters to be
134
// plugged into the query at the database level.
135
List JavaDoc parameters = new ArrayList JavaDoc();
136         
137         if (scope != null)
138         {
139             if (scope.getType() == Constants.COLLECTION)
140             {
141                 query += ", collection2item";
142             }
143             else if (scope.getType() == Constants.COMMUNITY)
144             {
145                 query += ", community2item";
146             }
147         }
148         
149
150         query += " WHERE handle.resource_type_id=" + Constants.ITEM + " AND handle.resource_id=item.item_id ";
151
152         if (scope != null)
153         {
154             if (scope.getType() == Constants.COLLECTION)
155             {
156                 query += " AND collection2item.collection_id= ? " +
157                          " AND collection2item.item_id=handle.resource_id ";
158                 parameters.add(new Integer JavaDoc(scope.getID()));
159             }
160             else if (scope.getType() == Constants.COMMUNITY)
161             {
162                 query += " AND community2item.community_id= ? " +
163                          " AND community2item.item_id=handle.resource_id";
164                 parameters.add(new Integer JavaDoc(scope.getID()));
165             }
166         }
167                 
168         if (startDate != null)
169         {
170             if ("oracle".equals(ConfigurationManager.getProperty("db.name")))
171             {
172                 
173                 startDate = oracleTimeStampFormat(startDate);
174                 query += " AND item.last_modified >= " +
175                         oracleTimeStampFunction(startDate);
176                 parameters.add(startDate);
177             }
178             else //postgres
179
{
180                 query = query + " AND item.last_modified >= ? ";
181                 parameters.add(toTimestamp(startDate, false));
182             }
183         }
184
185         if (endDate != null)
186         {
187             /*
188              * If the end date has seconds precision, e.g.:
189              *
190              * 2004-04-29T13:45:43Z
191              *
192              * we need to add 999 milliseconds to this. This is because SQL
193              * TIMESTAMPs have millisecond precision, and so might have a value:
194              *
195              * 2004-04-29T13:45:43.952Z
196              *
197              * and so <= '2004-04-29T13:45:43Z' would not pick this up. Reading
198              * things out of the database, TIMESTAMPs are rounded down, so the
199              * above value would be read as '2004-04-29T13:45:43Z', and
200              * therefore a caller would expect <= '2004-04-29T13:45:43Z' to
201              * include that value.
202              *
203              * Got that? ;-)
204              */

205             boolean selfGenerated = false;
206             if (endDate.length() == 20)
207             {
208                 endDate = endDate.substring(0, 19) + ".999Z";
209                 selfGenerated = true;
210             }
211
212             if ("oracle".equals(ConfigurationManager.getProperty("db.name")))
213             {
214                 endDate = oracleTimeStampFormat(endDate);
215                 query += " AND item.last_modified <= " +
216                         oracleTimeStampFunction(endDate);
217                 parameters.add(endDate);
218             }
219             else //postgres
220
{
221                 query += " AND item.last_modified <= ? ";
222                 parameters.add(toTimestamp(endDate, selfGenerated));
223             }
224         }
225         
226         if (withdrawn == false)
227         {
228             // Exclude withdrawn items
229
if ("oracle".equals(ConfigurationManager.getProperty("db.name")))
230             {
231                 query += " AND withdrawn=0 ";
232             }
233             else
234             {
235                 // postgres uses booleans
236
query += " AND withdrawn=false ";
237             }
238         }
239
240         // Order by item ID, so that for a given harvest the order will be
241
// consistent. This is so that big harvests can be broken up into
242
// several smaller operations (e.g. for OAI resumption tokens.)
243
query += " ORDER BY handle.resource_id";
244
245         log.debug(LogManager.getHeader(context, "harvest SQL", query));
246         
247         // Execute
248
Object JavaDoc[] parametersArray = parameters.toArray();
249         TableRowIterator tri = DatabaseManager.query(context, query, parametersArray);
250         List JavaDoc infoObjects = new LinkedList JavaDoc();
251         int index = 0;
252
253         // Process results of query into HarvestedItemInfo objects
254
while (tri.hasNext())
255         {
256             TableRow row = tri.next();
257
258             /*
259              * This conditional ensures that we only process items within any
260              * constraints specified by 'offset' and 'limit' parameters.
261              */

262             if ((index >= offset)
263                     && ((limit == 0) || (index < (offset + limit))))
264             {
265                 HarvestedItemInfo itemInfo = new HarvestedItemInfo();
266                 
267                 itemInfo.context = context;
268                 itemInfo.handle = row.getStringColumn("handle");
269                 itemInfo.itemID = row.getIntColumn("resource_id");
270                 itemInfo.datestamp = row.getDateColumn("last_modified");
271                 itemInfo.withdrawn = row.getBooleanColumn("withdrawn");
272
273                 if (collections)
274                 {
275                     fillCollections(context, itemInfo);
276                 }
277
278                 if (items)
279                 {
280                     // Get the item
281
itemInfo.item = Item.find(context, itemInfo.itemID);
282                 }
283
284                 infoObjects.add(itemInfo);
285             }
286
287             index++;
288         }
289         tri.close();
290
291         return infoObjects;
292     }
293
294     /**
295      * Get harvested item info for a single item. <code>item</code> field in
296      * returned <code>HarvestedItemInfo</code> object is always filled out.
297      *
298      * @param context
299      * DSpace context
300      * @param handle
301      * Prefix-less Handle of item
302      * @param collections
303      * if <code>true</code> the <code>collectionHandles</code>
304      * field of the <code>HarvestedItemInfo</code> object is filled
305      * out
306      *
307      * @return <code>HarvestedItemInfo</code> object for the single item, or
308      * <code>null</code>
309      * @throws SQLException
310      */

311     public static HarvestedItemInfo getSingle(Context context, String JavaDoc handle,
312             boolean collections) throws SQLException JavaDoc
313     {
314         // FIXME: Assume Handle is item
315
Item i = (Item) HandleManager.resolveToObject(context, handle);
316
317         if (i == null)
318         {
319             return null;
320         }
321
322         // Fill out OAI info item object
323
HarvestedItemInfo itemInfo = new HarvestedItemInfo();
324
325         itemInfo.context = context;
326         itemInfo.item = i;
327         itemInfo.handle = handle;
328         itemInfo.withdrawn = i.isWithdrawn();
329         itemInfo.datestamp = i.getLastModified();
330         itemInfo.itemID = i.getID();
331
332         // Get the sets
333
if (collections)
334         {
335             fillCollections(context, itemInfo);
336         }
337
338         return itemInfo;
339     }
340
341     /**
342      * Fill out the containers field of the HarvestedItemInfo object
343      *
344      * @param context
345      * DSpace context
346      * @param itemInfo
347      * HarvestedItemInfo object to fill out
348      * @throws SQLException
349      */

350     private static void fillCollections(Context context,
351             HarvestedItemInfo itemInfo) throws SQLException JavaDoc
352     {
353         // Get the collection Handles from DB
354
TableRowIterator colRows = DatabaseManager.query(context,
355                         "SELECT handle.handle FROM handle, collection2item WHERE handle.resource_type_id= ? " +
356                         "AND collection2item.collection_id=handle.resource_id AND collection2item.item_id = ? ",
357                         Constants.COLLECTION, itemInfo.itemID);
358
359         // Chuck 'em in the itemInfo object
360
itemInfo.collectionHandles = new LinkedList JavaDoc();
361
362         while (colRows.hasNext())
363         {
364             TableRow r = colRows.next();
365             itemInfo.collectionHandles.add(r.getStringColumn("handle"));
366         }
367     }
368
369     
370     /**
371      * Convert a String to a java.sql.Timestamp object
372      *
373      * @param t The timestamp String
374      * @param selfGenerated Is this a self generated timestamp (e.g. it has .999 on the end)
375      * @return The converted Timestamp
376      * @throws ParseException
377      */

378     private static Timestamp JavaDoc toTimestamp(String JavaDoc t, boolean selfGenerated) throws ParseException JavaDoc
379     {
380         SimpleDateFormat JavaDoc df;
381         
382         // Choose the correct date format based on string length
383
if (t.length() == 10)
384         {
385             df = new SimpleDateFormat JavaDoc("yyyy-MM-dd");
386         }
387         else if (t.length() == 20)
388         {
389             df = new SimpleDateFormat JavaDoc("yyyy-MM-dd'T'HH:mm:ss'Z'");
390         }
391         else if (selfGenerated)
392         {
393             df = new SimpleDateFormat JavaDoc("yyyy-MM-dd'T'HH:mm:ss.SSS'Z'");
394         }
395         else {
396             // Not self generated, and not in a guessable format
397
throw new ParseException JavaDoc("", 0);
398         }
399         
400         // Parse the date
401
df.setCalendar(Calendar.getInstance(TimeZone.getTimeZone("UTC")));
402         return new Timestamp JavaDoc(df.parse(t).getTime());
403     }
404
405     
406     /**
407      * Create an oracle to_timestamp function for the given iso date. It must be
408      * an ISO 8601-stlye string.
409      *
410      * Since the date could be a possible sql injection attack vector instead
411      * of placing the value inside the query a place holder will be used. The
412      * caller must ensure that the isoDateString parameter is bound to the query
413      * for the approprate substitution.
414      *
415      * @param isoDateString
416      * @return The oracle to_timestamp function.
417      */

418     private static String JavaDoc oracleTimeStampFunction(String JavaDoc isoDateString)
419     {
420         if (isoDateString.length() == 19 )
421         {
422             return "TO_TIMESTAMP( ? ,'YYYY-MM-DD\"T\"HH24:MI:SS')";
423         } else if (isoDateString.length() > 19)
424         {
425             return "TO_TIMESTAMP( ? ,'YYYY-MM-DD\"T\"HH24:MI:SS.FF\"Z\"')";
426         } else
427         {
428             throw new IllegalArgumentException JavaDoc("argument does not seem to be in the expected ISO 8601 format");
429         }
430     }
431     
432     /**
433      * Format the isoDateString according to oracles needs. The input should be ISO-85601 style.
434      *
435      * @param isoDateString
436      * @return a datastring format better suited to oracles needs.
437      */

438     private static String JavaDoc oracleTimeStampFormat(String JavaDoc isoDateString)
439     {
440         if (isoDateString.length() == 10)
441         {
442             return isoDateString + "T00:00:00";
443         }
444         else
445         {
446             return isoDateString;
447         }
448     }
449
450 }
451
Popular Tags