KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > dspace > app > statistics > LogAnalyser


1 /*
2  * LogAnalyser.java
3  *
4  * Version: $Revision: 1.7 $
5  *
6  * Date: $Date: 2006/11/24 04:07:47 $
7  *
8  * Copyright (c) 2002-2005, Hewlett-Packard Company and Massachusetts
9  * Institute of Technology. All rights reserved.
10  *
11  * Redistribution and use in source and binary forms, with or without
12  * modification, are permitted provided that the following conditions are
13  * met:
14  *
15  * - Redistributions of source code must retain the above copyright
16  * notice, this list of conditions and the following disclaimer.
17  *
18  * - Redistributions in binary form must reproduce the above copyright
19  * notice, this list of conditions and the following disclaimer in the
20  * documentation and/or other materials provided with the distribution.
21  *
22  * - Neither the name of the Hewlett-Packard Company nor the name of the
23  * Massachusetts Institute of Technology nor the names of their
24  * contributors may be used to endorse or promote products derived from
25  * this software without specific prior written permission.
26  *
27  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
28  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
29  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
30  * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
31  * HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
32  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
33  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
34  * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
35  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR
36  * TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
37  * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
38  * DAMAGE.
39  */

40
41 package org.dspace.app.statistics;
42
43 import org.dspace.app.statistics.LogLine;
44
45 import org.dspace.core.ConfigurationManager;
46 import org.dspace.core.Context;
47 import org.dspace.storage.rdbms.DatabaseManager;
48 import org.dspace.storage.rdbms.TableRow;
49
50 import java.sql.SQLException JavaDoc;
51
52 import java.lang.Long JavaDoc;
53 import java.lang.StringBuffer JavaDoc;
54
55 import java.net.InetAddress JavaDoc;
56 import java.net.UnknownHostException JavaDoc;
57
58 import java.text.ParseException JavaDoc;
59 import java.text.SimpleDateFormat JavaDoc;
60
61 import java.util.ArrayList JavaDoc;
62 import java.util.Calendar JavaDoc;
63 import java.util.Date JavaDoc;
64 import java.util.GregorianCalendar JavaDoc;
65 import java.util.HashMap JavaDoc;
66 import java.util.Iterator JavaDoc;
67 import java.util.List JavaDoc;
68 import java.util.Map JavaDoc;
69 import java.util.regex.Matcher JavaDoc;
70 import java.util.regex.Pattern JavaDoc;
71 import java.util.StringTokenizer JavaDoc;
72
73 import java.io.BufferedReader JavaDoc;
74 import java.io.BufferedWriter JavaDoc;
75 import java.io.File JavaDoc;
76 import java.io.FileReader JavaDoc;
77 import java.io.FileWriter JavaDoc;
78 import java.io.IOException JavaDoc;
79
80 /**
81  * This class performs all the actual analysis of a given set of DSpace log
82  * files. Most input can be configured; use the -help flag for a full list
83  * of usage information.
84  *
85  * The output of this file is plain text and forms an "aggregation" file which
86  * can then be used for display purposes using the related ReportGenerator
87  * class.
88  *
89  * @author Richard Jones
90  */

91 public class LogAnalyser
92 {
93     
94     // set up our class globals
95
// FIXME: there are so many of these perhaps they should exist in a static
96
// object of their own
97

98     /////////////////
99
// aggregators
100
/////////////////
101

102     /** aggregator for all actions performed in the system */
103     private static Map JavaDoc actionAggregator = new HashMap JavaDoc();
104     
105     /** aggregator for all searches performed */
106     private static Map JavaDoc searchAggregator = new HashMap JavaDoc();
107     
108     /** aggregator for user logins */
109     private static Map JavaDoc userAggregator = new HashMap JavaDoc();
110     
111     /** aggregator for item views */
112     private static Map JavaDoc itemAggregator = new HashMap JavaDoc();
113     
114     /** aggregator for current archive state statistics */
115     private static Map JavaDoc archiveStats = new HashMap JavaDoc();
116     
117     /** warning counter */
118     private static int warnCount = 0;
119     
120     /** log line counter */
121     private static int lineCount = 0;
122         
123     //////////////////
124
// config data
125
//////////////////
126

127     /** list of actions to be included in the general summary */
128     private static List JavaDoc generalSummary = new ArrayList JavaDoc();
129     
130     /** list of words not to be aggregated */
131     private static List JavaDoc excludeWords = new ArrayList JavaDoc();
132     
133     /** list of search types to be ignored, such as "author:" */
134     private static List JavaDoc excludeTypes = new ArrayList JavaDoc();
135     
136     /** list of characters to be excluded */
137     private static List JavaDoc excludeChars = new ArrayList JavaDoc();
138     
139     /** list of item types to be reported on in the current state */
140     private static List JavaDoc itemTypes = new ArrayList JavaDoc();
141     
142     /** bottom limit to output for search word analysis */
143     private static int searchFloor;
144     
145     /** bottom limit to output for item view analysis */
146     private static int itemFloor;
147     
148     /** number of items from most popular to be looked up in the database */
149     private static int itemLookup;
150     
151     /** mode to use for user email display */
152     private static String JavaDoc userEmail;
153     
154     /** URL of the service being analysed */
155     private static String JavaDoc url;
156     
157     /** Name of the service being analysed */
158     private static String JavaDoc name;
159    
160     /** Host name of the machine on which the analysis is run */
161     private static String JavaDoc hostName;
162     
163     /** the average number of views per item */
164     private static int views = 0;
165     
166     ///////////////////////
167
// regular expressions
168
///////////////////////
169

170    /** Exclude characters regular expression pattern */
171    private static Pattern JavaDoc excludeCharRX = null;
172    
173    /** handle indicator string regular expression pattern */
174    private static Pattern JavaDoc handleRX = null;
175    
176    /** item id indicator string regular expression pattern */
177    private static Pattern JavaDoc itemRX = null;
178   
179    /** query string indicator regular expression pattern */
180    private static Pattern JavaDoc queryRX = null;
181    
182    /** collection indicator regular expression pattern */
183    private static Pattern JavaDoc collectionRX = null;
184    
185    /** community indicator regular expression pattern */
186    private static Pattern JavaDoc communityRX = null;
187    
188    /** results indicator regular expression pattern */
189    private static Pattern JavaDoc resultsRX = null;
190    
191    /** single character regular expression pattern */
192    private static Pattern JavaDoc singleRX = null;
193    
194    /** a pattern to match a valid version 1.3 log file line */
195    private static Pattern JavaDoc valid13 = null;
196    
197    /** a pattern to match a valid version 1.4 log file line */
198    private static Pattern JavaDoc valid14 = null;
199    
200    /** pattern to match valid log file names */
201    private static Pattern JavaDoc logRegex = null;
202    
203    /** pattern to match commented out lines from the config file */
204    private static Pattern JavaDoc comment = Pattern.compile("^#");
205         
206    /** pattern to match genuine lines from the config file */
207    private static Pattern JavaDoc real = Pattern.compile("^(.+)=(.+)");
208    
209    /** pattern to match all search types */
210    private static Pattern JavaDoc typeRX = null;
211    
212    /** pattern to match all search types */
213    private static Pattern JavaDoc wordRX = null;
214    
215    //////////////////////////
216
// Miscellaneous variables
217
//////////////////////////
218

219    /** process timing clock */
220    private static Calendar JavaDoc startTime = null;
221    
222    /////////////////////////
223
// command line options
224
////////////////////////
225

226    /** the log directory to be analysed */
227    private static String JavaDoc logDir = ConfigurationManager.getProperty("dspace.dir") +
228                         File.separator + "log";
229         
230    /** the regex to describe the file name format */
231    private static String JavaDoc fileTemplate = "dspace\\.log.*";
232         
233    /** the config file from which to configure the analyser */
234    private static String JavaDoc configFile = ConfigurationManager.getProperty("dspace.dir") +
235                             File.separator + "config" + File.separator +
236                             "dstat.cfg";
237    
238    /** the output file to which to write aggregation data */
239    private static String JavaDoc outFile = ConfigurationManager.getProperty("dspace.dir") +
240                             File.separator + "log" + File.separator + "dstat.dat";
241    
242    /** the starting date of the report */
243    private static Date JavaDoc startDate = null;
244         
245    /** the end date of the report */
246    private static Date JavaDoc endDate = null;
247         
248    /** the starting date of the report as obtained from the log files */
249    private static Date JavaDoc logStartDate = null;
250         
251    /** the end date of the report as obtained from the log files */
252    private static Date JavaDoc logEndDate = null;
253    
254    /** are we looking stuff up in the database */
255    private static boolean lookUp = false;
256         
257    
258     /**
259      * main method to be run from command line. See usage information for
260      * details as to how to use the command line flags (-help)
261      */

262     public static void main(String JavaDoc [] argv)
263         throws Exception JavaDoc, SQLException JavaDoc
264     {
265         // first, start the processing clock
266
startTime = new GregorianCalendar JavaDoc();
267         
268         // create context as super user
269
Context context = new Context();
270         context.setIgnoreAuthorization(true);
271         
272         // set up our command line variables
273
String JavaDoc myLogDir = null;
274         String JavaDoc myFileTemplate = null;
275         String JavaDoc myConfigFile = null;
276         String JavaDoc myOutFile = null;
277         Date JavaDoc myStartDate = null;
278         Date JavaDoc myEndDate = null;
279         boolean myLookUp = false;
280         
281         // read in our command line options
282
for (int i = 0; i < argv.length; i++)
283         {
284             if (argv[i].equals("-log"))
285             {
286                 myLogDir = argv[i+1];
287             }
288             
289             if (argv[i].equals("-file"))
290             {
291                 myFileTemplate = argv[i+1];
292             }
293             
294             if (argv[i].equals("-cfg"))
295             {
296                 myConfigFile = argv[i+1];
297             }
298             
299             if (argv[i].equals("-out"))
300             {
301                 myOutFile = argv[i+1];
302             }
303             
304             if (argv[i].equals("-help"))
305             {
306                 LogAnalyser.usage();
307                 System.exit(0);
308             }
309             
310             if (argv[i].equals("-start"))
311             {
312                 myStartDate = parseDate(argv[i+1]);
313             }
314             
315             if (argv[i].equals("-end"))
316             {
317                 myEndDate = parseDate(argv[i+1]);
318             }
319             
320             if (argv[i].equals("-lookup"))
321             {
322                 myLookUp = true;
323             }
324         }
325         
326         // now call the method which actually processes the logs
327
processLogs(context, myLogDir, myFileTemplate, myConfigFile, myOutFile, myStartDate, myEndDate, myLookUp);
328     }
329     
330     /**
331      * using the pre-configuration information passed here, analyse the logs
332      * and produce the aggregation file
333      *
334      * @param context the DSpace context object this occurs under
335      * @param myLogDir the passed log directory. Uses default if null
336      * @param myFileTemplate the passed file name regex. Uses default if null
337      * @param myConfigFile the DStat config file. Uses default if null
338      * @param myOutFile the file to which to output aggregation data. Uses default if null
339      * @param myStartDate the desired start of the analysis. Starts from the beginning otherwise
340      * @param myEndDate the desired end of the analysis. Goes to the end otherwise
341      * @param myLookUp force a lookup of the database
342      */

343     public static void processLogs(Context context, String JavaDoc myLogDir,
344                                     String JavaDoc myFileTemplate, String JavaDoc myConfigFile,
345                                     String JavaDoc myOutFile, Date JavaDoc myStartDate,
346                                     Date JavaDoc myEndDate, boolean myLookUp)
347         throws IOException JavaDoc, SQLException JavaDoc
348     {
349         // FIXME: perhaps we should have all parameters and aggregators put
350
// together in a single aggregating object
351

352         // if the timer has not yet been started, then start it
353
if (startTime != null)
354         {
355             startTime = new GregorianCalendar JavaDoc();
356         }
357         
358         // set the parameters for this analysis
359
setParameters(myLogDir, myFileTemplate, myConfigFile, myOutFile, myStartDate, myEndDate, myLookUp);
360         
361         // pre prepare our standard file readers and buffered readers
362
FileReader JavaDoc fr = null;
363         BufferedReader JavaDoc br = null;
364         
365         // read in the config information, throwing an error if we fail to open
366
// the given config file
367
readConfig(configFile);
368         
369         // assemble the regular expressions for later use (requires the file
370
// template to build the regex to match it
371
setRegex(fileTemplate);
372         
373         // get the log files
374
File JavaDoc[] logFiles = getLogFiles(logDir);
375         
376         // standard loop counter
377
int i = 0;
378         
379         // for every log file do analysis
380
// FIXME: it is easy to implement not processing log files after the
381
// dates exceed the end boundary, but is there an easy way to do it
382
// for the start of the file? Note that we can assume that the contents
383
// of the log file are sequential, but can we assume the files are
384
// provided in a data sequence?
385
for (i = 0; i < logFiles.length; i++)
386         {
387             // check to see if this file is a log file agains the global regex
388
Matcher JavaDoc matchRegex = logRegex.matcher(logFiles[i].getName());
389             if (matchRegex.matches())
390             {
391                 // if it is a log file, open it up and lets have a look at the
392
// contents.
393
try
394                 {
395                     fr = new FileReader JavaDoc(logFiles[i].toString());
396                     br = new BufferedReader JavaDoc(fr);
397                 }
398                 catch (IOException JavaDoc e)
399                 {
400                     System.out.println("Failed to read log file " + logFiles[i].toString());
401                     System.exit(0);
402                 }
403
404                 // for each line in the file do the analysis
405
// FIXME: perhaps each section needs to be dolled out to an
406
// analysing class to allow pluggability of other methods of
407
// analysis, and ease of code reading too - Pending further thought
408
String JavaDoc line = null;
409                 while ((line = br.readLine()) != null)
410                 {
411                     // get the log line object
412
LogLine logLine = getLogLine(line);
413                     
414                     // if there are line segments get on with the analysis
415
if (logLine != null)
416                     {
417                         // first find out if we are constraining by date and
418
// if so apply the restrictions
419
if (logLine.beforeDate(startDate))
420                         {
421                             continue;
422                         }
423                         
424                         if (logLine.afterDate(endDate))
425                         {
426                             break;
427                         }
428                         
429                         // count the number of lines parsed
430
lineCount++;
431                         
432                         // if we are not constrained by date, register the date
433
// as the start/end date if it is the earliest/latest so far
434
// FIXME: this should probably have a method of its own
435
if (startDate == null)
436                         {
437                             if (logStartDate != null)
438                             {
439                                 if (logLine.beforeDate(logStartDate))
440                                 {
441                                     logStartDate = logLine.getDate();
442                                 }
443                             }
444                             else
445                             {
446                                 logStartDate = logLine.getDate();
447                             }
448                         }
449                         
450                         if (endDate == null)
451                         {
452                             if (logEndDate != null)
453                             {
454                                 if (logLine.afterDate(logEndDate))
455                                 {
456                                     logEndDate = logLine.getDate();
457                                 }
458                             }
459                             else
460                             {
461                                 logEndDate = logLine.getDate();
462                             }
463                         }
464                         
465                         // count the warnings
466
if (logLine.isLevel("WARN"))
467                         {
468                             // FIXME: really, this ought to be some kind of level
469
// aggregator
470
warnCount++;
471                         }
472
473                         // is the action a search?
474
if (logLine.isAction("search"))
475                         {
476                             // get back all the valid search words from the query
477
String JavaDoc[] words = analyseQuery(logLine.getParams());
478                             
479                             // for each search word add to the aggregator or
480
// increment the aggregator's counter
481
for (int j = 0; j < words.length; j++)
482                             {
483                                 // FIXME: perhaps aggregators ought to be objects
484
// themselves
485
searchAggregator.put(words[j], increment(searchAggregator, words[j]));
486                             }
487                         }
488
489                         // is the action a login, and are we counting user logins?
490
if (logLine.isAction("login") && !userEmail.equals("off"))
491                         {
492                             userAggregator.put(logLine.getUser(), increment(userAggregator, logLine.getUser()));
493                         }
494
495                         // is the action an item view?
496
if (logLine.isAction("view_item"))
497                         {
498                             String JavaDoc handle = logLine.getParams();
499
500                             // strip the handle string
501
Matcher JavaDoc matchHandle = handleRX.matcher(handle);
502                             handle = matchHandle.replaceAll("");
503                             
504                             // strip the item id string
505
Matcher JavaDoc matchItem = itemRX.matcher(handle);
506                             handle = matchItem.replaceAll("");
507
508                             handle.trim();
509
510                             // either add the handle to the aggregator or
511
// increment its counter
512
itemAggregator.put(handle, increment(itemAggregator, handle));
513                         }
514
515                         // log all the activity
516
actionAggregator.put(logLine.getAction(), increment(actionAggregator, logLine.getAction()));
517                     }
518                 }
519
520                 // close the file reading buffers
521
br.close();
522                 fr.close();
523
524             }
525         }
526         
527         // do we want to do a database lookup? Do so only if the start and
528
// end dates are null or lookUp is true
529
// FIXME: this is a kind of separate section. Would it be worth building
530
// the summary string separately and then inserting it into the real
531
// summary later? Especially if we make the archive analysis more complex
532
archiveStats.put("All Items", getNumItems(context));
533         for (i = 0; i < itemTypes.size(); i++)
534         {
535             archiveStats.put(itemTypes.get(i), getNumItems(context, (String JavaDoc) itemTypes.get(i)));
536         }
537         
538         // now do the host lookup
539
try
540         {
541             InetAddress JavaDoc addr = InetAddress.getLocalHost();
542             hostName = addr.getHostName();
543         }
544         catch (UnknownHostException JavaDoc e)
545         {
546             hostName = "unknown host";
547         }
548         
549         // do the average views analysis
550
if (((Integer JavaDoc) archiveStats.get("All Items")).intValue() != 0)
551         {
552             // FIXME: this is dependent on their being a query on the db, which
553
// there might not always be if it becomes configurable
554
Double JavaDoc avg = new Double JavaDoc(
555                         Math.ceil(
556                             ((Integer JavaDoc) actionAggregator.get("view_item")).intValue() /
557                             ((Integer JavaDoc) archiveStats.get("All Items")).intValue()));
558             views = avg.intValue();
559         }
560         
561         // finally, write the output
562
createOutput();
563
564         return;
565     }
566    
567     
568     /**
569      * set the passed parameters up as global class variables. This has to
570      * be done in a separate method because the API permits for running from
571      * the command line with args or calling the processLogs method statically
572      * from elsewhere
573      *
574      * @param myLogDir the log file directory to be analysed
575      * @param myFileTemplate regex for log file names
576      * @param myConfigFile config file to use for dstat
577      * @param myOutFile file to write the aggregation into
578      * @param myStartDate requested log reporting start date
579      * @param myEndDate requested log reporting end date
580      * @param myLookUp requested look up force flag
581      */

582     public static void setParameters(String JavaDoc myLogDir, String JavaDoc myFileTemplate,
583                                     String JavaDoc myConfigFile, String JavaDoc myOutFile,
584                                     Date JavaDoc myStartDate, Date JavaDoc myEndDate,
585                                     boolean myLookUp)
586     {
587         if (myLogDir != null)
588         {
589             logDir = myLogDir;
590         }
591         
592         if (myFileTemplate != null)
593         {
594             fileTemplate = myFileTemplate;
595         }
596         
597         if (myConfigFile != null)
598         {
599             configFile = myConfigFile;
600         }
601         
602         if (myStartDate != null)
603         {
604             startDate = myStartDate;
605         }
606         
607         if (myEndDate != null)
608         {
609             endDate = myEndDate;
610         }
611         
612         if (myLogDir != null)
613         {
614             lookUp = myLookUp;
615         }
616         
617         if (myOutFile != null)
618         {
619             outFile = myOutFile;
620         }
621         
622         return;
623     }
624     
625     
626     /**
627      * generate the analyser's output to the specified out file
628      */

629     public static void createOutput()
630     {
631         // start a string buffer to hold the final output
632
StringBuffer JavaDoc summary = new StringBuffer JavaDoc();
633         
634         // define an iterator that will be used to go over the hashmap keys
635
Iterator JavaDoc keys = null;
636         
637         // output the number of lines parsed
638
summary.append("log_lines=" + Integer.toString(lineCount) + "\n");
639         
640         // output the number of warnings encountered
641
summary.append("warnings=" + Integer.toString(warnCount) + "\n");
642         
643         // set the general summary config up in the aggregator file
644
for (int i = 0; i < generalSummary.size(); i++)
645         {
646             summary.append("general_summary=" + generalSummary.get(i) + "\n");
647         }
648         
649         // output the host name
650
summary.append("server_name=" + hostName + "\n");
651         
652         // output the service name
653
summary.append("service_name=" + name + "\n");
654         
655         // output the date information if necessary
656
SimpleDateFormat JavaDoc sdf = new SimpleDateFormat JavaDoc("dd'/'MM'/'yyyy");
657         
658         if (startDate != null)
659         {
660             summary.append("start_date=" + sdf.format(startDate) + "\n");
661         }
662         else if (logStartDate != null)
663         {
664             summary.append("start_date=" + sdf.format(logStartDate) + "\n");
665         }
666         
667         if (endDate != null)
668         {
669             summary.append("end_date=" + sdf.format(endDate) + "\n");
670         }
671         else if (logEndDate != null)
672         {
673             summary.append("end_date=" + sdf.format(logEndDate) + "\n");
674         }
675         
676         // write out the archive stats
677
keys = archiveStats.keySet().iterator();
678         while (keys.hasNext())
679         {
680             String JavaDoc key = (String JavaDoc) keys.next();
681             summary.append("archive." + key + "=" + archiveStats.get(key) + "\n");
682         }
683         
684         // write out the action aggregation results
685
keys = actionAggregator.keySet().iterator();
686         while (keys.hasNext())
687         {
688             String JavaDoc key = (String JavaDoc) keys.next();
689             summary.append("action." + key + "=" + actionAggregator.get(key) + "\n");
690         }
691         
692         // depending on the config settings for reporting on emails output the
693
// login information
694
summary.append("user_email=" + userEmail + "\n");
695         int address = 1;
696         keys = userAggregator.keySet().iterator();
697
698         // for each email address either write out the address and the count
699
// or alias it with an "Address X" label, to keep the data confidential
700
// FIXME: the users reporting should also have a floor value
701
while (keys.hasNext())
702         {
703             String JavaDoc key = (String JavaDoc) keys.next();
704             summary.append("user.");
705             if (userEmail.equals("on"))
706             {
707                 summary.append(key + "=" + userAggregator.get(key) + "\n");
708             }
709             else if (userEmail.equals("alias"))
710             {
711                 summary.append("Address " + Integer.toString(address++) + "=" + userAggregator.get(key) + "\n");
712             }
713         }
714         
715         // FIXME: all values which have floors set should provide an "other"
716
// record which counts how many other things which didn't make it into
717
// the listing there are
718

719         // output the search word information
720
summary.append("search_floor=" + searchFloor + "\n");
721         keys = searchAggregator.keySet().iterator();
722         while (keys.hasNext())
723         {
724             String JavaDoc key = (String JavaDoc) keys.next();
725             if (((Integer JavaDoc) searchAggregator.get(key)).intValue() >= searchFloor)
726             {
727                 summary.append("search." + key + "=" + searchAggregator.get(key) + "\n");
728             }
729         }
730         
731         // FIXME: we should do a lot more with the search aggregator
732
// Possible feature list:
733
// - constrain by collection/community perhaps?
734
// - we should consider building our own aggregator class which can
735
// be full of rich data. Perhaps this and the Stats class should
736
// be the same thing.
737

738         // item viewing information
739
summary.append("item_floor=" + itemFloor + "\n");
740         summary.append("host_url=" + url + "\n");
741         summary.append("item_lookup=" + itemLookup + "\n");
742         
743         // write out the item access information
744
keys = itemAggregator.keySet().iterator();
745         while (keys.hasNext())
746         {
747             String JavaDoc key = (String JavaDoc) keys.next();
748             if (((Integer JavaDoc) itemAggregator.get(key)).intValue() >= itemFloor)
749             {
750                 summary.append("item." + key + "=" + itemAggregator.get(key) + "\n");
751             }
752         }
753         
754         // output the average views per item
755
if (views > 0)
756         {
757             summary.append("avg_item_views=" + views + "\n");
758         }
759         
760         // insert the analysis processing time information
761
Calendar JavaDoc endTime = new GregorianCalendar JavaDoc();
762         long timeInMillis = (endTime.getTimeInMillis() - startTime.getTimeInMillis());
763         summary.append("analysis_process_time=" + Long.toString(timeInMillis / 1000) + "\n");
764         
765         // finally write the string into the output file
766
try
767         {
768             BufferedWriter JavaDoc out = new BufferedWriter JavaDoc(new FileWriter JavaDoc(outFile));
769             out.write(summary.toString());
770             out.close();
771         }
772         catch (IOException JavaDoc e)
773         {
774             System.out.println("Unable to write to output file " + outFile);
775             System.exit(0);
776         }
777         
778         return;
779     }
780     
781     
782     /**
783      * get an array of file objects representing the passed log directory
784      *
785      * @param logDir the log directory in which to pick up files
786      *
787      * @return an array of file objects representing the given logDir
788      */

789     public static File JavaDoc[] getLogFiles(String JavaDoc logDir)
790     {
791         // open the log files directory, read in the files, check that they
792
// match the passed regular expression then analyse the content
793
File JavaDoc logs = new File JavaDoc(logDir);
794         
795         // if log dir is not a directory throw and error and exit
796
if (!logs.isDirectory())
797         {
798             System.out.println("Passed log directory is not a directory");
799             System.exit(0);
800         }
801         
802         // get the files in the directory
803
return logs.listFiles();
804     }
805     
806     
807     /**
808      * set up the regular expressions to be used by this analyser. Mostly this
809      * exists to provide a degree of segregation and readability to the code
810      * and to ensure that you only need to set up the regular expressions to
811      * be used once
812      *
813      * @param fileTemplate the regex to be used to identify dspace log files
814      */

815     public static void setRegex(String JavaDoc fileTemplate)
816     {
817         // build the exclude characters regular expression
818
StringBuffer JavaDoc charRegEx = new StringBuffer JavaDoc();
819         charRegEx.append("[");
820         for (int i = 0; i < excludeChars.size(); i++)
821         {
822             charRegEx.append("\\" + (String JavaDoc) excludeChars.get(i));
823         }
824         charRegEx.append("]");
825         excludeCharRX = Pattern.compile(charRegEx.toString());
826         
827         // regular expression to find handle indicators in strings
828
handleRX = Pattern.compile("handle=");
829         
830         // regular expression to find item_id indicators in strings
831
itemRX = Pattern.compile(",item_id=.*$");
832         
833         // regular expression to find query indicators in strings
834
queryRX = Pattern.compile("query=");
835         
836         // regular expression to find collections in strings
837
collectionRX = Pattern.compile("collection_id=[0-9]*,");
838         
839         // regular expression to find communities in strings
840
communityRX = Pattern.compile("community_id=[0-9]*,");
841         
842         // regular expression to find search result sets
843
resultsRX = Pattern.compile(",results=(.*)");
844         
845         // regular expressions to find single characters anywhere in the string
846
singleRX = Pattern.compile("( . |^. | .$)");
847         
848         // set up the standard log file line regular expression
849
String JavaDoc logLine13 = "^(\\d\\d\\d\\d-\\d\\d\\-\\d\\d) \\d\\d:\\d\\d:\\d\\d,\\d\\d\\d (\\w+)\\s+\\S+ @ ([^:]+):[^:]+:([^:]+):(.*)";
850         String JavaDoc logLine14 = "^(\\d\\d\\d\\d-\\d\\d\\-\\d\\d) \\d\\d:\\d\\d:\\d\\d,\\d\\d\\d (\\w+)\\s+\\S+ @ ([^:]+):[^:]+:[^:]+:([^:]+):(.*)";
851         valid13 = Pattern.compile(logLine13);
852         valid14 = Pattern.compile(logLine14);
853         
854         // set up the pattern for validating log file names
855
logRegex = Pattern.compile(fileTemplate);
856         
857         // set up the pattern for matching any of the query types
858
StringBuffer JavaDoc typeRXString = new StringBuffer JavaDoc();
859         typeRXString.append("(");
860         for (int i = 0; i < excludeTypes.size(); i++)
861         {
862             if (i > 0)
863             {
864                 typeRXString.append("|");
865             }
866             typeRXString.append((String JavaDoc) excludeTypes.get(i));
867         }
868         typeRXString.append(")");
869         typeRX = Pattern.compile(typeRXString.toString());
870         
871         // set up the pattern for matching any of the words to exclude
872
StringBuffer JavaDoc wordRXString = new StringBuffer JavaDoc();
873         wordRXString.append("(");
874         for (int i = 0; i < excludeWords.size(); i++)
875         {
876             if (i > 0)
877             {
878                 wordRXString.append("|");
879             }
880             wordRXString.append(" " + (String JavaDoc) excludeWords.get(i) + " ");
881             wordRXString.append("|");
882             wordRXString.append("^" + (String JavaDoc) excludeWords.get(i) + " ");
883             wordRXString.append("|");
884             wordRXString.append(" " + (String JavaDoc) excludeWords.get(i) + "$");
885         }
886         wordRXString.append(")");
887         wordRX = Pattern.compile(wordRXString.toString());
888         
889         return;
890     }
891     
892     
893     /**
894      * read in the given config file and populate the class globals
895      *
896      * @param configFile the config file to read in
897      */

898     public static void readConfig(String JavaDoc configFile)
899         throws IOException JavaDoc
900     {
901         // prepare our standard file readers and buffered readers
902
FileReader JavaDoc fr = null;
903         BufferedReader JavaDoc br = null;
904         
905         String JavaDoc record = null;
906         try
907         {
908             fr = new FileReader JavaDoc(configFile);
909             br = new BufferedReader JavaDoc(fr);
910         }
911         catch (IOException JavaDoc e)
912         {
913             System.out.println("Failed to read config file");
914             System.exit(0);
915         }
916         
917         // read in the config file and set up our instance variables
918
while ((record = br.readLine()) != null)
919         {
920             // check to see what kind of line we have
921
Matcher JavaDoc matchComment = comment.matcher(record);
922             Matcher JavaDoc matchReal = real.matcher(record);
923
924             // if the line is not a comment and is real, read it in
925
if (!matchComment.matches() && matchReal.matches())
926             {
927                 // lift the values out of the matcher's result groups
928
String JavaDoc key = matchReal.group(1).trim();
929                 String JavaDoc value = matchReal.group(2).trim();
930                 
931                 // read the config values into our instance variables (see
932
// documentation for more info on config params)
933
if (key.equals("general.summary"))
934                 {
935                     actionAggregator.put(value, new Integer JavaDoc(0));
936                     generalSummary.add(value);
937                 }
938                 
939                 if (key.equals("exclude.word"))
940                 {
941                     excludeWords.add(value);
942                 }
943
944                 if (key.equals("exclude.type"))
945                 {
946                     excludeTypes.add(value);
947                 }
948
949                 if (key.equals("exclude.character"))
950                 {
951                     excludeChars.add(value);
952                 }
953
954                 if (key.equals("item.type"))
955                 {
956                     itemTypes.add(value);
957                 }
958
959                 if (key.equals("item.floor"))
960                 {
961                     itemFloor = Integer.parseInt(value);
962                 }
963
964                 if (key.equals("search.floor"))
965                 {
966                     searchFloor = Integer.parseInt(value);
967                 }
968
969                 if (key.equals("item.lookup"))
970                 {
971                     itemLookup = Integer.parseInt(value);
972                 }
973
974                 if (key.equals("user.email"))
975                 {
976                     userEmail = value;
977                 }
978
979                 if (key.equals("host.url"))
980                 {
981                     url = value;
982                 }
983
984                 if (key.equals("host.name"))
985                 {
986                     name = value;
987                 }
988             }
989         }
990
991         // close the inputs
992
br.close();
993         fr.close();
994         
995         return;
996     }
997     
998     /**
999      * increment the value of the given map at the given key by one.
1000     *
1001     * @param map the map whose value we want to increase
1002     * @param key the key of the map whose value to increase
1003     *
1004     * @return an integer object containing the new value
1005     */

1006    public static Integer JavaDoc increment(Map JavaDoc map, String JavaDoc key)
1007    {
1008        Integer JavaDoc newValue = null;
1009        if (map.containsKey(key))
1010        {
1011            // FIXME: this seems like a ridiculous way to add Integers
1012
newValue = new Integer JavaDoc(((Integer JavaDoc) map.get(key)).intValue() + 1);
1013        }
1014        else
1015        {
1016            newValue = new Integer JavaDoc(1);
1017        }
1018        return newValue;
1019    }
1020    
1021    /**
1022     * Take the standard date string requested at the command line and convert
1023     * it into a Date object. Throws and error and exits if the date does
1024     * not parse
1025     *
1026     * @param date the string representation of the date
1027     *
1028     * @return a date object containing the date, with the time set to
1029     * 00:00:00
1030     */

1031    public static Date JavaDoc parseDate(String JavaDoc date)
1032    {
1033        SimpleDateFormat JavaDoc sdf = new SimpleDateFormat JavaDoc("yyyy'-'MM'-'dd");
1034        Date JavaDoc parsedDate = null;
1035        
1036        try
1037        {
1038             parsedDate = sdf.parse(date);
1039        }
1040        catch (ParseException JavaDoc e)
1041        {
1042            System.out.println("The date is not in the correct format");
1043            System.exit(0);
1044        }
1045        return parsedDate;
1046    }
1047    
1048    
1049    /**
1050     * Take the date object and convert it into a string of the form YYYY-MM-DD
1051     *
1052     * @param date the date to be converted
1053     *
1054     * @return A string of the form YYYY-MM-DD
1055     */

1056    public static String JavaDoc unParseDate(Date JavaDoc date)
1057    {
1058        // Use SimpleDateFormat
1059
SimpleDateFormat JavaDoc sdf = new SimpleDateFormat JavaDoc("yyyy'-'MM'-'dd");
1060        return sdf.format(date);
1061    }
1062    
1063    
1064    /**
1065     * Take a search query string and pull out all of the meaningful information
1066     * from it, giving the results in the form of a String array, a single word
1067     * to each element
1068     *
1069     * @param query the search query to be analysed
1070     *
1071     * @return the string array containing meaningful search terms
1072     */

1073    public static String JavaDoc[] analyseQuery(String JavaDoc query)
1074    {
1075        // register our standard loop counter
1076
int i = 0;
1077        
1078        // make the query string totally lower case, to ensure we don't miss out
1079
// on matches due to capitalisation
1080
query = query.toLowerCase();
1081        
1082        // now perform successive find and replace operations using pre-defined
1083
// global regular expressions
1084
Matcher JavaDoc matchQuery = queryRX.matcher(query);
1085        query = matchQuery.replaceAll(" ");
1086        
1087        Matcher JavaDoc matchCollection = collectionRX.matcher(query);
1088        query = matchCollection.replaceAll(" ");
1089        
1090        Matcher JavaDoc matchCommunity = communityRX.matcher(query);
1091        query = matchCommunity.replaceAll(" ");
1092        
1093        Matcher JavaDoc matchResults = resultsRX.matcher(query);
1094        query = matchResults.replaceAll(" ");
1095
1096        Matcher JavaDoc matchTypes = typeRX.matcher(query);
1097        query = matchTypes.replaceAll(" ");
1098        
1099        Matcher JavaDoc matchChars = excludeCharRX.matcher(query);
1100        query = matchChars.replaceAll(" ");
1101       
1102        Matcher JavaDoc matchWords = wordRX.matcher(query);
1103        query = matchWords.replaceAll(" ");
1104        
1105        Matcher JavaDoc single = singleRX.matcher(query);
1106        query = single.replaceAll(" ");
1107        
1108        // split the remaining string by whitespace, trim and stuff into an
1109
// array to be returned
1110
StringTokenizer JavaDoc st = new StringTokenizer JavaDoc(query);
1111        String JavaDoc[] words = new String JavaDoc[st.countTokens()];
1112        for (i = 0; i < words.length; i++)
1113        {
1114            words[i] = st.nextToken().trim();
1115        }
1116
1117        // FIXME: some single characters are still slipping through the net;
1118
// why? and how do we fix it?
1119
return words;
1120    }
1121    
1122    
1123    /**
1124     * split the given line into it's relevant segments if applicable (i.e. the
1125     * line matches the required regular expression.
1126     *
1127     * @param line the line to be segmented
1128     * @return a Log Line object for the given line
1129     */

1130    public static LogLine getLogLine(String JavaDoc line)
1131    {
1132        // FIXME: consider moving this code into the LogLine class. To do this
1133
// we need to much more carefully define the structure and behaviour
1134
// of the LogLine class
1135
Matcher JavaDoc match;
1136        
1137        if (line.indexOf(":ip_addr") > 0)
1138        {
1139            match = valid14.matcher(line);
1140        }
1141        else
1142        {
1143            match = valid13.matcher(line);
1144        }
1145        
1146        if (match.matches())
1147        {
1148            // set up a new log line object
1149
LogLine logLine = new LogLine(parseDate(match.group(1).trim()),
1150                                          match.group(2).trim(),
1151                                          match.group(3).trim(),
1152                                          match.group(4).trim(),
1153                                          match.group(5).trim());
1154            
1155            return logLine;
1156        }
1157        else
1158        {
1159            return null;
1160        }
1161    }
1162 
1163    
1164    /**
1165     * get the number of items in the archive which were accessioned between
1166     * the provided start and end dates, with the given value for the DC field
1167     * 'type' (unqualified)
1168     *
1169     * @param context the DSpace context for the action
1170     * @param type value for DC field 'type' (unqualified)
1171     *
1172     * @return an integer containing the relevant count
1173     */

1174    public static Integer JavaDoc getNumItems(Context context, String JavaDoc type)
1175        throws SQLException JavaDoc
1176    {
1177        // FIXME: this method is clearly not optimised
1178

1179        // FIXME: we don't yet collect total statistics, such as number of items
1180
// withdrawn, number in process of submission etc. We should probably do
1181
// that
1182

1183        // start the type constraint
1184
String JavaDoc typeQuery = null;
1185        
1186        if (type != null)
1187        {
1188            typeQuery = "SELECT item_id " +
1189                        "FROM metadatavalue " +
1190                        "WHERE text_value LIKE '%" + type + "%' " +
1191                        "AND metadata_field_id = (" +
1192                        " SELECT metadata_field_id " +
1193                        " FROM metadatafieldregistry " +
1194                        " WHERE element = 'type' " +
1195                        " AND qualifier IS NULL) ";
1196        }
1197        
1198        // start the date constraint query buffer
1199
StringBuffer JavaDoc dateQuery = new StringBuffer JavaDoc();
1200        dateQuery.append("SELECT item_id " +
1201                          "FROM metadatavalue " +
1202                          "WHERE metadata_field_id = (" +
1203                          " SELECT metadata_field_id " +
1204                          " FROM metadatafieldregistry " +
1205                          " WHERE element = 'date' " +
1206                          " AND qualifier = 'accessioned') ");
1207      
1208        if (startDate != null)
1209        {
1210           dateQuery.append(" AND text_value::timestamp > '" +
1211                          unParseDate(startDate) + "'::timestamp ");
1212        }
1213        
1214        if (endDate != null)
1215        {
1216            dateQuery.append(" AND text_value::timestamp < ' " +
1217                          unParseDate(endDate) + "'::timestamp ");
1218        }
1219        
1220        // build the final query
1221
StringBuffer JavaDoc query = new StringBuffer JavaDoc();
1222        
1223        query.append("SELECT COUNT(*) AS number " +
1224                  "FROM item " +
1225                  "WHERE in_archive = true " +
1226                  "AND withdrawn = false ");
1227        
1228        if (startDate != null || endDate != null)
1229        {
1230            query.append(" AND item_id IN ( " +
1231                         dateQuery.toString() + ") ");
1232        }
1233
1234        if (type != null)
1235        {
1236            query.append(" AND item_id IN ( " +
1237                         typeQuery + ") ");
1238        }
1239        
1240        TableRow row = DatabaseManager.querySingle(context, query.toString());
1241
1242        // for some reason the number column is of "long" data type!
1243
Long JavaDoc count = new Long JavaDoc(row.getLongColumn("number"));
1244        return new Integer JavaDoc(count.intValue());
1245    }
1246    
1247    
1248    /**
1249     * get the total number of items in the archive at time of execution,
1250     * ignoring all other constraints
1251     *
1252     * @param context the DSpace context the action is being performed in
1253     *
1254     * @return an Integer containing the number of items in the
1255     * archive
1256     */

1257    public static Integer JavaDoc getNumItems(Context context)
1258        throws SQLException JavaDoc
1259    {
1260        return getNumItems(context, null);
1261    }
1262    
1263    
1264    /**
1265     * print out the usage information for this class to the standard out
1266     */

1267    public static void usage()
1268    {
1269        String JavaDoc usage = "Usage Information:\n" +
1270                        "LogAnalyser [options [parameters]]\n" +
1271                        "-log [log directory]\n" +
1272                            "\tOptional\n" +
1273                            "\tSpecify a directory containing log files\n" +
1274                            "\tDefault uses [dspace.dir]/log from dspace.cfg\n" +
1275                        "-file [file name regex]\n" +
1276                            "\tOptional\n" +
1277                            "\tSpecify a regular expression as the file name template.\n" +
1278                            "\tCurrently this needs to be correctly escaped for Java string handling (FIXME)\n" +
1279                            "\tDefault uses dspace.log*\n" +
1280                        "-cfg [config file path]\n" +
1281                            "\tOptional\n" +
1282                            "\tSpecify a config file to be used\n" +
1283                            "\tDefault uses dstat.cfg in dspace config directory\n" +
1284                        "-out [output file path]\n" +
1285                            "\tOptional\n" +
1286                            "\tSpecify an output file to write results into\n" +
1287                            "\tDefault uses dstat.dat in dspace log directory\n" +
1288                        "-start [YYYY-MM-DD]\n" +
1289                            "\tOptional\n" +
1290                            "\tSpecify the start date of the analysis\n" +
1291                            "\tIf a start date is specified then no attempt to gather \n" +
1292                            "\tcurrent database statistics will be made unless -lookup is\n" +
1293                            "\talso passed\n" +
1294                            "\tDefault is to start from the earliest date records exist for\n" +
1295                        "-end [YYYY-MM-DD]\n" +
1296                            "\tOptional\n" +
1297                            "\tSpecify the end date of the analysis\n" +
1298                            "\tIf an end date is specified then no attempt to gather \n" +
1299                            "\tcurrent database statistics will be made unless -lookup is\n" +
1300                            "\talso passed\n" +
1301                            "\tDefault is to work up to the last date records exist for\n" +
1302                        "-lookup\n" +
1303                            "\tOptional\n" +
1304                            "\tForce a lookup of the current database statistics\n" +
1305                            "\tOnly needs to be used if date constraints are also in place\n" +
1306                        "-help\n" +
1307                            "\tdisplay this usage information\n";
1308        
1309        System.out.println(usage);
1310    }
1311}
1312
Popular Tags