[Koha-patches] [PATCH] Bug 2959:Enhancements for link checker
PlanoISD
dschust1 at gmail.com
Thu Feb 11 15:51:41 CET 2010
From: root <root at koha1.pisd.edu>
Goes through all the URL's "856" marc tag in a database and can output results for everything or if using the verbose mode -v only those with errors and the error message. It creates a web page with links to the bib record for easy access to fix incorrect URLS. Designed as a Cron job assumes the person setting up the cron knows where the document directory is that it can write the file out to. See --help for more information. This was modified by a staff programmer at Plano ISD - Robin Luzi, submitted by David Schuster.
---
misc/cronjobs/check-url.pl | 409 ++++++++++++++++++++++++++++++++++++-------
1 files changed, 342 insertions(+), 67 deletions(-)
diff --git a/misc/cronjobs/check-url.pl b/misc/cronjobs/check-url.pl
index 381c122..110fe93 100755
--- a/misc/cronjobs/check-url.pl
+++ b/misc/cronjobs/check-url.pl
@@ -1,13 +1,16 @@
#!/usr/bin/perl
-#
-# Copyright 2009 Tamil s.a.r.l.
-#
-# This software is placed under the gnu General Public License, v2
-# (http://www.gnu.org/licenses/gpl.html)
-#
-
-
+########################################################################
+# Copyright 2009 Tamil s.a.r.l. #
+# #
+# This software is placed under the gnu General Public License, v2 #
+# (http://www.gnu.org/licenses/gpl.html) #
+# #
+########################################################################
+
+################################################################
+# C 4 U R L C h e c k e r P a c k a g e #
+################################################################
package C4::URL::Checker;
@@ -69,7 +72,60 @@ use LWP::UserAgent;
use HTTP::Request;
use C4::Biblio;
+my @arrbadurls = ();
+
+my $cdoubleforward = '//';
+my $csingleforward = '/';
+###################################################################
+# A d d B a d U R L #
+###################################################################
+
+sub addbadurl
+ {
+ my $strURL = shift;
+ my $strURLBase;
+ my $intArraySize;
+
+ my @arrURLElements = split(/$cdoubleforward/, $strURL);
+ my @arrURL = split(/$csingleforward/,$arrURLElements[1]);
+ $strURLBase = $arrURL[0];
+
+ $intArraySize = @arrbadurls;
+ $arrbadurls[$intArraySize] = $strURLBase;
+ }
+
+###################################################################
+# C h e c k B a d U R L #
+###################################################################
+
+sub checkbadurl
+ {
+ my $strURL = shift;
+ my $strURLBase;
+ my $intReturnCode = 0;
+ my $intloopcount = 0;
+ my $intArraySize = 0;
+
+ my @arrURLElements = split(/$cdoubleforward/, $strURL);
+ my @arrURL = split(/$csingleforward/,$arrURLElements[1]);
+ $strURLBase = $arrURL[0];
+
+ foreach my $badurl (@arrbadurls)
+ {
+ if ($badurl =~ /$strURLBase/)
+ {
+ $intReturnCode = 1;
+ last;
+ }
+ }
+
+ return $intReturnCode;
+ }
+
+################################################################
+# N e w #
+################################################################
sub new {
@@ -77,43 +133,75 @@ sub new {
my $class = shift;
$self->{ user_agent } = new LWP::UserAgent;
+
+
bless $self, $class;
return $self;
}
+################################################################
+# C h e c k B i b l i o #
+################################################################
+
+sub check_biblio
+ {
+ my $self = shift;
+ my $biblionumber = shift;
+ my $uagent = $self->{ user_agent };
+ my $host = $self->{ host_default };
+ my $wcallstatus = '';
+ my $creturn500 = '500';
+
+#If you are running a proxy server on the network this may need to be filled in otherwise all you will get from the server in response is 500 errors unable to connect
+# $uagent->proxy(['http', 'ftp'] => 'http://username:password@proxy.server.address');
+# $uagent->proxy(http => 'http://proxy.server.address');
+
+ my $record = GetMarcBiblio( $biblionumber );
+ return unless $record->field('856');
+
+ my @urls = ();
+ foreach my $field ( $record->field('856') )
+ {
+ my $url = $field->subfield('u');
+ next unless $url;
+ $url = "$host/$url" unless $url =~ /^http/;
+ my $check = { url => $url };
+
+ if (checkbadurl($url))
+ {
+ $check->{ is_success } = 0;
+ $check->{ status } = '500: Site already checked.';
+ }
+ else
+ {
+ my $req = HTTP::Request->new( GET => $url );
+ my $res = $uagent->request( $req, sub { die }, 1 );
+ if ( $res->is_success )
+ {
+ $check->{ is_success } = 1;
+ $check->{ status } = 'ok';
+ }
+ else
+ {
+ $wcallstatus = $res->status_line;
+ if ($wcallstatus =~ /$creturn500/)
+ {
+ addbadurl($url);
+ }
+ $check->{ is_success } = 0;
+ $check->{ status } = $wcallstatus;
+ }
+ }
+
+ push( @urls, $check );
+ }
+ return \@urls;
+ }
-sub check_biblio {
- my $self = shift;
- my $biblionumber = shift;
- my $uagent = $self->{ user_agent };
- my $host = $self->{ host_default };
-
- my $record = GetMarcBiblio( $biblionumber );
- return unless $record->field('856');
-
- my @urls = ();
- foreach my $field ( $record->field('856') ) {
- my $url = $field->subfield('u');
- next unless $url;
- $url = "$host/$url" unless $url =~ /^http/;
- my $check = { url => $url };
- my $req = HTTP::Request->new( GET => $url );
- my $res = $uagent->request( $req, sub { die }, 1 );
- if ( $res->is_success ) {
- $check->{ is_success } = 1;
- $check->{ status } = 'ok';
- }
- else {
- $check->{ is_success } = 0;
- $check->{ status } = $res->status_line;
- }
- push( @urls, $check );
- }
- return \@urls;
-}
-
-
+################################################################
+# M a i n P a c k a g e #
+################################################################
package Main;
@@ -126,7 +214,32 @@ use Pod::Usage;
use Getopt::Long;
use C4::Context;
+my $htmldir = '';
+my $htmlfile = '/check-url.htm';
+my $OutFileHTML = '';
+my $wiorec = '';
+
+my $wdtsecond = '00';
+my $wdtminute = '00';
+my $wdthour = '00';
+
+my $wdtday = '00';
+my $wdtmonth = '00';
+my $wdtyear = '0000';
+
+my $wdt0day = '00';
+my $wdt0month = '00';
+my $wdt4year = '0000';
+
+my $cOpenRow1 = '<tr bgcolor="#f0f0f0">';
+my $cOpenRow2 = '<tr bgcolor="#ffffff">';
+
+my $cOpenCell1 = '<td align="center" style="font-weight:bold;font-size:12pt;color:#000000">';
+my $cOpenCell2 = '<td align="center" style="font-size:10pt;color:#0000ff">';
+my $cOpenCell3 = '<td style="font-size:10pt;color:#0000ff">';
+my $cCloseCell = '</td>';
+my $intRemainder = 0;
my $verbose = 0;
my $help = 0;
@@ -140,6 +253,7 @@ GetOptions(
'help' => \$help,
'host=s' => \$host,
'host-pro=s' => \$host_pro,
+ 'htmldir=s' => \$htmldir,
);
@@ -155,39 +269,190 @@ sub bibediturl {
return $html;
}
+################################################################
+# Check all URLs from all current Koha biblio records #
+################################################################
-#
-# Check all URLs from all current Koha biblio records
-#
-sub check_all_url {
- my $checker = C4::URL::Checker->new();
- $checker->{ host_default } = $host;
+sub check_all_url
+ {
+ my $checker = C4::URL::Checker->new();
+ $checker->{ host_default } = $host;
- my $context = new C4::Context( );
- my $dbh = $context->dbh;
- my $sth = $dbh->prepare(
+ my $context = new C4::Context( );
+ my $dbh = $context->dbh;
+ my $sth = $dbh->prepare(
"SELECT biblionumber FROM biblioitems WHERE url <> ''" );
- $sth->execute;
- print "<html>\n<body>\n<table>\n" if $html;
- while ( my ($biblionumber) = $sth->fetchrow ) {
- my $result = $checker->check_biblio( $biblionumber );
- next unless $result; # No URL
- foreach my $url ( @$result ) {
- if ( ! $url->{ is_success } || $verbose ) {
- print $html
- ? "<tr>\n<td>" . bibediturl( $biblionumber ) .
- "</td>\n<td>" . $url->{url} . "</td>\n<td>" .
- $url->{status} . "</td>\n</tr>\n\n"
- : "$biblionumber\t" . $url->{ url } . "\t" .
- $url->{ status } . "\n";
- }
- }
- }
- print "</table>\n</body>\n</html>\n" if $html;
+ $sth->execute;
+
+ if ($html)
+ {
+ $wiorec = "<html>\n<body>\n";
+ $wiorec .= '<h2 align="center">' . 'Check URL Start ' . $wdt0month . '/' . $wdt0day . '/' . $wdt4year . ' ' . $wdthour .
+ ':' . $wdtminute . '</h2>' . "\n";
+
+ $wiorec .= "<table border=\"1\" link=\"#ff0000\" alink=\"#0000ff\" vlink=\"#0000ff\"> \n";
+ $wiorec .= "<tr>\n";
+ $wiorec .= $cOpenCell1 . "Biblio" . $cCloseCell . "\n";
+ $wiorec .= $cOpenCell1 . "URL" . $cCloseCell . "\n";
+ $wiorec .= $cOpenCell1 . "Status" . $cCloseCell . "\n";
+ $wiorec .= "</tr>\n";
+
+ if ($htmldir ne '')
+ {
+ WriteHTMLOutput();
+ }
+ else
+ {
+ print $wiorec;
+ }
+ }
+
+ my $intLoopCount = 0;
+
+ while ( my ($biblionumber) = $sth->fetchrow )
+ {
+ my $result = $checker->check_biblio( $biblionumber );
+ next unless $result; # No URL
+
+ foreach my $url ( @$result )
+ {
+ if ( ! $url->{ is_success } || $verbose )
+ {
+ if ($html)
+ {
+ $intRemainder = $intLoopCount % 2;
+
+ if ($intRemainder > 0)
+ {
+ $wiorec = $cOpenRow1 . "\n";
+ }
+ else
+ {
+ $wiorec = $cOpenRow2 . "\n";
+ }
+
+ $wiorec .= $cOpenCell2 . bibediturl( $biblionumber ) . $cCloseCell . "\n";
+ $wiorec .= $cOpenCell3 . $url->{url} . $cCloseCell . "\n";
+ $wiorec .= $cOpenCell2 . $url->{status} . $cCloseCell . "\n";
+ $wiorec .= "</tr>\n\n";
+
+ if ($htmldir ne '')
+ {
+ WriteHTMLOutput();
+ }
+ else
+ {
+ print $wiorec;
+ }
+ }
+ else
+ {
+ print "$biblionumber\t" . $url->{url} . "\t" . $url->{status} . "\n";
+ }
+ }
+ }
+
+ ++$intLoopCount;
+
+# if ($intLoopCount > 40)
+# {
+# last;
+# }
+
+ }
+
+ if ($html)
+ {
+ setdatetime();
+ $wiorec = "</table>\n";
+ $wiorec .= '<h2 align="center">' . 'Check URL End ' . $wdt0month . '/' . $wdt0day . '/' . $wdt4year . ' ' . $wdthour .
+ ':' . $wdtminute . '</h2>' . "\n";
+ $wiorec .= "</body>\n</html>\n";
+
+ if ($htmldir ne '')
+ {
+ WriteHTMLOutput();
+ }
+ else
+ {
+ print $wiorec;
+ }
+ }
}
+####################################################################
+# O p e n H T M L O u p u t F i l e #
+####################################################################
+
+sub OpenHTMLOutput
+ {
+ if (open(HTMLOutput, ">$OutFileHTML"))
+ {
+ }
+ else
+ {
+ print "Error: Unable to open output file: $OutFileHTML\n";
+ exit;
+ }
+ }
+
+###################################################################
+# W r i t e H T M L R e c o r d #
+###################################################################
+
+sub WriteHTMLOutput
+ {
+ print HTMLOutput ($wiorec);
+ }
+
+###################################################################
+# C l o s e H T M L F i l e #
+###################################################################
+
+sub CloseHTMLOutput
+ {
+ close (HTMLOutput);
+ }
+
+##################################################################
+# S e t D a t e - T i m e #
+##################################################################
+
+sub setdatetime
+ {
+ my @tbldttim = localtime(time);
-# BEGIN
+ $wdtsecond = $tbldttim[0];
+ $wdtminute = $tbldttim[1];
+ $wdthour = $tbldttim[2];
+
+ $wdtday = $tbldttim[3]; # Day 1, 2, 3.....
+ $wdtmonth = $tbldttim[4] + 1; # Month 1, 2, 3....
+ $wdtyear = $tbldttim[5] + 1900; # Year 2000, 2001, 2003......
+
+ $wdt0day = $tbldttim[3]; # Day 01, 02, 03, ....
+ $wdt0month = $tbldttim[4] + 1; # Month 01, 02, 03....
+ $wdt4year = $tbldttim[5] + 1900; # Year 2000, 2001, 2003......
+
+ if ($wdtminute < 10)
+ {
+ $wdtminute = '0' . $tbldttim[1];
+ }
+
+ if ($wdtday < 10)
+ {
+ $wdt0day = '0' . $wdtday;
+ }
+
+ if ($wdtmonth < 10)
+ {
+ $wdt0month = '0' . $wdtmonth;
+ }
+ }
+
+################################################################
+# B e g i n #
+################################################################
usage() if $help;
@@ -201,9 +466,19 @@ if ( $html && !$host_pro ) {
}
}
-check_all_url();
+if ($html && $htmldir ne '')
+ {
+ $OutFileHTML = $htmldir . $htmlfile;
+ OpenHTMLOutput();
+ }
+setdatetime();
+check_all_url();
+if ($html && $htmldir ne '')
+ {
+ CloseHTMLOutput();
+ }
=head1 NAME
--
1.5.6.5
More information about the Koha-patches
mailing list