[Koha-patches] [PATCH] Bug 2959:Enhancements for link checker

PlanoISD dschust1 at gmail.com
Thu Feb 11 15:51:41 CET 2010


From: root <root at koha1.pisd.edu>

Goes through all the URL's "856" marc tag in a database and can output results for everything or if using the verbose mode -v only those with errors and the error message.  It creates a web page with links to the bib record for easy access to fix incorrect URLS.  Designed as a Cron job assumes the person setting up the cron knows where the document directory is that it can write the file out to.  See --help for more information.  This was modified by a staff programmer at Plano ISD - Robin Luzi, submitted by David Schuster.
---
 misc/cronjobs/check-url.pl |  409 ++++++++++++++++++++++++++++++++++++-------
 1 files changed, 342 insertions(+), 67 deletions(-)

diff --git a/misc/cronjobs/check-url.pl b/misc/cronjobs/check-url.pl
index 381c122..110fe93 100755
--- a/misc/cronjobs/check-url.pl
+++ b/misc/cronjobs/check-url.pl
@@ -1,13 +1,16 @@
 #!/usr/bin/perl
 
-#
-# Copyright 2009 Tamil s.a.r.l.
-#
-# This software is placed under the gnu General Public License, v2 
-# (http://www.gnu.org/licenses/gpl.html)
-#
-
-
+########################################################################
+# Copyright 2009 Tamil s.a.r.l.                                        #
+#                                                                      #
+# This software is placed under the gnu General Public License, v2     #
+# (http://www.gnu.org/licenses/gpl.html)                               #
+#                                                                      #
+########################################################################
+
+################################################################ 
+#          C 4   U R L   C h e c k e r   P a c k a g e         #
+################################################################
 
 package C4::URL::Checker;
 
@@ -69,7 +72,60 @@ use LWP::UserAgent;
 use HTTP::Request;
 use C4::Biblio;
 
+my @arrbadurls = ();
+
+my $cdoubleforward  = '//';
+my $csingleforward  = '/';
 
+###################################################################
+#                       A d d   B a d   U R L                     #
+###################################################################
+
+sub addbadurl
+ {
+  my $strURL = shift;
+  my $strURLBase;
+  my $intArraySize;
+
+  my @arrURLElements = split(/$cdoubleforward/, $strURL);
+  my @arrURL = split(/$csingleforward/,$arrURLElements[1]);
+  $strURLBase = $arrURL[0];
+
+  $intArraySize = @arrbadurls;
+  $arrbadurls[$intArraySize] = $strURLBase;
+ }
+
+###################################################################
+#                     C h e c k   B a d   U R L                   #
+###################################################################
+
+sub checkbadurl
+ {
+  my $strURL = shift;
+  my $strURLBase;
+  my $intReturnCode = 0;
+  my $intloopcount = 0;
+  my $intArraySize = 0;
+
+  my @arrURLElements = split(/$cdoubleforward/, $strURL);
+  my @arrURL = split(/$csingleforward/,$arrURLElements[1]);
+  $strURLBase = $arrURL[0];
+
+  foreach my $badurl (@arrbadurls)
+   {
+    if ($badurl =~ /$strURLBase/)
+     {
+      $intReturnCode = 1;
+      last;
+     }
+   }
+
+  return $intReturnCode;
+ }
+
+################################################################ 
+#                             N e w                            #
+################################################################
 
 sub new {
 
@@ -77,43 +133,75 @@ sub new {
     my $class = shift;
     
     $self->{ user_agent } = new LWP::UserAgent;
+
+
     
     bless $self, $class;
     return $self;
 }
 
+################################################################ 
+#                     C h e c k   B i b l i o                  #
+################################################################
+
+sub check_biblio
+ {
+  my $self            = shift;
+  my $biblionumber    = shift;
+  my $uagent          = $self->{ user_agent   };
+  my $host            = $self->{ host_default };
+  my $wcallstatus     = '';
+  my $creturn500      = '500';
+
+#If you are running a proxy server on the network this may need to be filled in otherwise all you will get from the server in response is 500 errors unable to connect
+# $uagent->proxy(['http', 'ftp'] => 'http://username:password@proxy.server.address');
+# $uagent->proxy(http  => 'http://proxy.server.address');
+
+  my $record = GetMarcBiblio( $biblionumber ); 
+  return unless $record->field('856');
+
+  my @urls = ();
+  foreach my $field ( $record->field('856') )
+   {
+    my $url = $field->subfield('u');
+    next unless $url; 
+    $url = "$host/$url" unless $url =~ /^http/;
+    my $check = { url => $url };
+
+    if (checkbadurl($url))
+     {
+      $check->{ is_success } = 0;
+      $check->{ status } = '500: Site already checked.';
+     }
+    else
+     {
+      my $req = HTTP::Request->new( GET => $url );
+      my $res = $uagent->request( $req, sub { die }, 1 );
+      if ( $res->is_success )
+       {
+        $check->{ is_success } = 1;
+        $check->{ status     } = 'ok';
+       }
+      else
+       {
+        $wcallstatus = $res->status_line;
+        if ($wcallstatus =~ /$creturn500/)
+         {
+          addbadurl($url);
+         }
+        $check->{ is_success } = 0;
+        $check->{ status     } = $wcallstatus;
+       }
+     }
+
+    push( @urls, $check );       
+   }
+  return \@urls;
+ }
 
-sub check_biblio {
-    my $self            = shift;
-    my $biblionumber    = shift;
-    my $uagent          = $self->{ user_agent   };
-    my $host            = $self->{ host_default };
-
-    my $record = GetMarcBiblio( $biblionumber ); 
-    return unless $record->field('856');
-
-    my @urls = ();
-    foreach my $field ( $record->field('856') ) {
-        my $url = $field->subfield('u');
-        next unless $url; 
-        $url = "$host/$url" unless $url =~ /^http/;
-        my $check = { url => $url };
-        my $req = HTTP::Request->new( GET => $url );
-        my $res = $uagent->request( $req, sub { die }, 1 );
-        if ( $res->is_success ) {
-            $check->{ is_success } = 1;
-            $check->{ status     } = 'ok';
-        }
-        else {
-            $check->{ is_success } = 0;
-            $check->{ status     } = $res->status_line;
-        }
-        push( @urls, $check );       
-    }
-    return \@urls;
-}
-
-
+################################################################
+#                    M a i n   P a c k a g e                   #
+################################################################
 
 package Main;
 
@@ -126,7 +214,32 @@ use Pod::Usage;
 use Getopt::Long;
 use C4::Context;
 
+my $htmldir     = '';
+my $htmlfile    = '/check-url.htm';
+my $OutFileHTML = '';
+my $wiorec      = '';
+
+my $wdtsecond = '00';
+my $wdtminute = '00';
+my $wdthour = '00';
+
+my $wdtday = '00';
+my $wdtmonth = '00';
+my $wdtyear = '0000';
+
+my $wdt0day = '00';
+my $wdt0month = '00';
+my $wdt4year = '0000';
+
+my $cOpenRow1 = '<tr bgcolor="#f0f0f0">';
+my $cOpenRow2 = '<tr bgcolor="#ffffff">';
+
+my $cOpenCell1 = '<td align="center" style="font-weight:bold;font-size:12pt;color:#000000">';
+my $cOpenCell2 = '<td align="center" style="font-size:10pt;color:#0000ff">';
+my $cOpenCell3 = '<td style="font-size:10pt;color:#0000ff">';
+my $cCloseCell = '</td>';
 
+my $intRemainder = 0;
 
 my $verbose     = 0;
 my $help        = 0;
@@ -140,6 +253,7 @@ GetOptions(
     'help'          => \$help,
     'host=s'        => \$host,
     'host-pro=s'    => \$host_pro,
+    'htmldir=s'     => \$htmldir,
 );
 
 
@@ -155,39 +269,190 @@ sub bibediturl {
     return $html;
 }
 
+################################################################
+#       Check all URLs from all current Koha biblio records    #
+################################################################
 
-# 
-# Check all URLs from all current Koha biblio records
-#
-sub check_all_url {
-    my $checker = C4::URL::Checker->new();
-    $checker->{ host_default }  = $host;
+sub check_all_url
+ {
+  my $checker = C4::URL::Checker->new();
+  $checker->{ host_default }  = $host;
     
-    my $context = new C4::Context(  );  
-    my $dbh = $context->dbh;
-    my $sth = $dbh->prepare( 
+  my $context = new C4::Context(  );  
+  my $dbh = $context->dbh;
+  my $sth = $dbh->prepare( 
         "SELECT biblionumber FROM biblioitems WHERE url <> ''" );
-    $sth->execute;
-    print "<html>\n<body>\n<table>\n" if $html;
-    while ( my ($biblionumber) = $sth->fetchrow ) {
-        my $result = $checker->check_biblio( $biblionumber );  
-        next unless $result;  # No URL
-        foreach my $url ( @$result ) {
-            if ( ! $url->{ is_success } || $verbose ) {
-                print $html
-                      ? "<tr>\n<td>" . bibediturl( $biblionumber ) . 
-                        "</td>\n<td>" . $url->{url} . "</td>\n<td>" . 
-                        $url->{status} . "</td>\n</tr>\n\n"
-                      : "$biblionumber\t" . $url->{ url } . "\t" .
-                        $url->{ status } . "\n";
-            }
-        }
-    }
-    print "</table>\n</body>\n</html>\n" if $html;
+  $sth->execute;
+
+  if ($html)
+   {
+    $wiorec = "<html>\n<body>\n";
+    $wiorec .= '<h2 align="center">' . 'Check URL Start ' . $wdt0month . '/' . $wdt0day . '/' . $wdt4year . ' ' . $wdthour .
+               ':' . $wdtminute . '</h2>' . "\n";
+
+    $wiorec .= "<table border=\"1\" link=\"#ff0000\" alink=\"#0000ff\" vlink=\"#0000ff\"> \n";
+    $wiorec .= "<tr>\n";
+    $wiorec .= $cOpenCell1 . "Biblio" . $cCloseCell . "\n";
+    $wiorec .= $cOpenCell1 . "URL" . $cCloseCell . "\n";
+    $wiorec .= $cOpenCell1 . "Status" . $cCloseCell . "\n";
+    $wiorec .= "</tr>\n";
+
+    if ($htmldir ne '')
+     {
+      WriteHTMLOutput();
+     }
+    else
+     {
+      print $wiorec;
+     }
+   }
+
+  my $intLoopCount = 0;
+
+  while ( my ($biblionumber) = $sth->fetchrow )
+   {
+    my $result = $checker->check_biblio( $biblionumber );  
+    next unless $result;  # No URL
+
+    foreach my $url ( @$result )
+     {
+      if ( ! $url->{ is_success } || $verbose )
+       {
+        if ($html)
+         {
+          $intRemainder = $intLoopCount % 2;
+
+          if ($intRemainder > 0)
+           {
+            $wiorec = $cOpenRow1 . "\n";
+           }
+          else
+           {
+            $wiorec = $cOpenRow2 . "\n";
+           }
+
+          $wiorec .= $cOpenCell2 . bibediturl( $biblionumber ) . $cCloseCell . "\n";
+          $wiorec .= $cOpenCell3 . $url->{url} . $cCloseCell . "\n";
+          $wiorec .= $cOpenCell2 . $url->{status} . $cCloseCell . "\n";
+          $wiorec .= "</tr>\n\n";
+
+          if ($htmldir ne '')
+           {
+            WriteHTMLOutput();
+           }
+          else
+           {
+            print $wiorec;
+           }
+         }
+        else
+         {
+          print "$biblionumber\t" . $url->{url} . "\t" . $url->{status} . "\n";
+         }
+       }
+     }
+
+    ++$intLoopCount;
+
+#    if ($intLoopCount > 40)
+#     {
+#      last;
+#     }
+
+   }
+
+  if ($html)
+   {
+    setdatetime();
+    $wiorec = "</table>\n";
+    $wiorec .= '<h2 align="center">' . 'Check URL End ' . $wdt0month . '/' . $wdt0day . '/' . $wdt4year . '  ' . $wdthour .
+              ':' . $wdtminute . '</h2>' . "\n";
+    $wiorec .= "</body>\n</html>\n";
+
+    if ($htmldir ne '')
+     {
+      WriteHTMLOutput();
+     }
+    else
+     {
+      print $wiorec;
+     }
+   }
 }
 
+####################################################################
+#              O p e n   H T M L   O u p u t   F i l e             #
+####################################################################
+
+sub OpenHTMLOutput
+ {
+  if (open(HTMLOutput, ">$OutFileHTML"))
+   {
+   }
+  else
+   { 
+    print "Error: Unable to open output file: $OutFileHTML\n";
+    exit;
+   } 
+ }
+
+###################################################################
+#                W r i t e   H T M L    R e c o r d               #
+###################################################################
+
+sub WriteHTMLOutput
+ {
+  print HTMLOutput ($wiorec);
+ }
+
+###################################################################
+#                    C l o s e   H T M L    F i l e               #
+###################################################################
+
+sub CloseHTMLOutput
+ {
+  close (HTMLOutput);
+ }
+
+##################################################################
+#                     S e t   D a t e - T i m e                  #
+##################################################################
+
+sub setdatetime
+ {
+  my @tbldttim = localtime(time);
 
-# BEGIN
+  $wdtsecond = $tbldttim[0];
+  $wdtminute = $tbldttim[1];
+  $wdthour = $tbldttim[2];
+
+  $wdtday = $tbldttim[3];                    # Day 1, 2, 3.....
+  $wdtmonth = $tbldttim[4] + 1;              # Month 1, 2, 3....
+  $wdtyear = $tbldttim[5] + 1900;            # Year 2000, 2001, 2003......
+
+  $wdt0day = $tbldttim[3];                   # Day 01, 02, 03, ....
+  $wdt0month = $tbldttim[4] + 1;             # Month 01, 02, 03....
+  $wdt4year = $tbldttim[5] + 1900;           # Year 2000, 2001, 2003......
+
+  if ($wdtminute < 10)
+   {
+    $wdtminute = '0' . $tbldttim[1];
+   }
+
+  if ($wdtday < 10)
+   {
+    $wdt0day = '0' . $wdtday;
+   }
+
+  if ($wdtmonth < 10)
+   {
+   $wdt0month = '0' . $wdtmonth;
+   }
+ }
+
+################################################################ 
+#                           B e g i n                          #
+################################################################
 
 usage() if $help;          
 
@@ -201,9 +466,19 @@ if ( $html && !$host_pro ) {
     }
 }
 
-check_all_url(); 
+if ($html && $htmldir ne '')
+ {
+  $OutFileHTML = $htmldir . $htmlfile;
+  OpenHTMLOutput();
+ }
 
+setdatetime();
+check_all_url(); 
 
+if ($html && $htmldir ne '')
+ {
+  CloseHTMLOutput();
+ }
 
 =head1 NAME
 
-- 
1.5.6.5




More information about the Koha-patches mailing list