[Koha-patches] [PATCH] Bug 7963 Parallel HTTP requests when checking URLs
Frédéric Demians
f.demians at tamil.fr
Sat Apr 14 15:06:29 CEST 2012
Current script check-url.pl checks URLs found in 856$u by sending HTTP
requests, one by one. I propose a new script which sends multiple
requests simultaneously.
This script is based on AnyEvent and AnyEvent::HTTP CPAN modules.
See doc: perldoc checkurl
---
misc/cronjobs/checkurl | 190 ++++++++++++++++++++++++++++++++++++++++++++++++
1 files changed, 190 insertions(+), 0 deletions(-)
create mode 100755 misc/cronjobs/checkurl
diff --git a/misc/cronjobs/checkurl b/misc/cronjobs/checkurl
new file mode 100755
index 0000000..639d6ee
--- /dev/null
+++ b/misc/cronjobs/checkurl
@@ -0,0 +1,190 @@
+#!/usr/bin/perl
+
+# Copyright 2012 Tamil s.a.r.l.
+#
+# This file is part of Koha.
+#
+# Koha is free software; you can redistribute it and/or modify it under the
+# terms of the GNU General Public License as published by the Free Software
+# Foundation; either version 2 of the License, or (at your option) any later
+# version.
+#
+# Koha is distributed in the hope that it will be useful, but WITHOUT ANY
+# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
+# A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License along
+# with Koha; if not, write to the Free Software Foundation, Inc.,
+# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+use 5.010;
+use utf8;
+use strict;
+use warnings;
+use Pod::Usage;
+use Getopt::Long;
+use C4::Context;
+use C4::Biblio;
+use AnyEvent;
+use AnyEvent::HTTP;
+
+# Command-line options and their default values.
+my ($verbose, $help, $html) = (0, 0, 0);
+my ($host, $host_pro) = ('', '');
+my ($timeout, $maxconn) = (10, 200);
+# Path of the biblio record editing page; the biblionumber is appended to it.
+my $uriedit = "/cgi-bin/koha/cataloguing/addbiblio.pl?biblionumber=";
+# The short forms -v and -h are declared explicitly: the documentation
+# advertises them, and without an explicit alias "-h" is an ambiguous
+# abbreviation of help/host/host-pro/html and is rejected by Getopt::Long.
+# On a parsing error (unknown option, bad value) print the manual and stop
+# instead of silently running the scan with unexpected settings.
+GetOptions(
+    'verbose|v'  => \$verbose,
+    'html'       => \$html,
+    'help|h'     => \$help,
+    'host=s'     => \$host,
+    'host-pro=s' => \$host_pro,
+    'timeout=i'  => \$timeout,
+    'maxconn=i'  => \$maxconn,
+) or usage();
+
+
+# Print the complete manual page built from this script's POD, then stop.
+sub usage {
+    my %pod_options = ( -verbose => 2 );
+    pod2usage( \%pod_options );
+    exit;
+}
+
+
+# Build the HTML anchor that links to the editing page of a biblio record.
+# The link target is $host_pro followed by $uriedit and the biblionumber;
+# the link text is the biblionumber itself.
+sub bibediturl {
+    my ($biblionumber) = @_;
+    my $target = $host_pro . $uriedit . $biblionumber;
+    return '<a href="' . $target . '">' . $biblionumber . '</a>';
+}
+
+
+# Check all URLs from all current Koha biblio records.
+#
+# Selects every biblionumber whose biblioitems.url is not empty, loads the
+# MARC record and sends an HTTP HEAD request for the 856$u of each 856 field.
+# Requests run in parallel through AnyEvent::HTTP. A line (or an HTML table
+# row when $html is set) is printed for every URL whose status is not 2xx,
+# or for every URL when $verbose is set.
+#
+# Reads the file-level settings $html, $verbose, $host, $timeout and
+# $maxconn. Takes no argument and returns nothing useful.
+sub check_all_url {
+    my $sth = C4::Context->dbh->prepare(
+        "SELECT biblionumber FROM biblioitems WHERE url <> ''" );
+    $sth->execute;
+    if ( $html ) {
+        print <<EOS;
+<html>
+<body>
+<table>
+EOS
+    }
+
+    # Minimal HTML escaping for values coming from catalog records or from
+    # remote servers, so that they cannot break the report markup.
+    my $escape = sub {
+        my ($text) = @_;
+        $text //= '';
+        $text =~ s/&/&amp;/g;
+        $text =~ s/</&lt;/g;
+        $text =~ s/>/&gt;/g;
+        $text =~ s/"/&quot;/g;
+        return $text;
+    };
+
+    # A limit below 1 would prevent any request from ever being started.
+    my $limit = $maxconn > 0 ? $maxconn : 1;
+
+    my $countconn = 0;
+    my $cv = AnyEvent->condvar;
+    # The condvar counts outstanding work: one unit for the record feeder
+    # below plus one per pending HTTP request. recv() only returns once the
+    # feeder is finished AND every request has delivered its result.
+    $cv->begin;
+    my $idle;
+    $idle = AnyEvent->idle(
+        cb => sub {
+            # The limit is checked once per record, so a record carrying
+            # several 856 fields may temporarily exceed it.
+            return if $countconn >= $limit;
+            while ( my ($biblionumber) = $sth->fetchrow ) {
+                my $record = GetMarcBiblio( $biblionumber );
+                # Skip records that cannot be loaded rather than dying on
+                # a method call on an undefined value.
+                next unless $record;
+                my @fields = $record->field('856');
+                next unless @fields;
+                foreach my $field ( @fields ) {
+                    my $url = $field->subfield('u');
+                    next unless $url;
+                    $url = "$host/$url" unless $url =~ /^http/;
+                    $countconn++;
+                    $cv->begin;
+                    http_request(
+                        HEAD    => $url,
+                        headers => { 'user-agent' => 'Mozilla/5.0 (compatible; U; Koha checkurl)' },
+                        timeout => $timeout,
+                        sub {
+                            my ($body, $hdr) = @_;
+                            $countconn--;
+                            if ( $verbose || $hdr->{Status} !~ /^2/ ) {
+                                my $status = "$hdr->{Status} $hdr->{Reason}";
+                                my $line = $html
+                                    ? "<tr>\n<td>" . bibediturl($biblionumber) .
+                                      "</td>\n<td>" . $escape->($url) . "</td>\n<td>" .
+                                      $escape->($status) . "</td>\n</tr>\n\n"
+                                    : "$biblionumber\t" . $url . "\t" . "$status \n";
+                                print $line;
+                            }
+                            $cv->end;
+                        }
+                    );
+                }
+                # Hand control back to the event loop after each record so
+                # that pending responses get processed.
+                return;
+            }
+            # No more records: stop the feeder and release its unit of work.
+            # Requests still in flight keep the condvar pending.
+            undef $idle;
+            $cv->end;
+        }
+    );
+    $cv->recv;
+
+    print "</table>\n</body>\n</html>\n" if $html;
+}
+
+
+# BEGIN
+
+usage() if $help;
+
+# In HTML mode each row links to the biblio editing page, which needs a base
+# host: use --host-pro, falling back to --host. Without either, report the
+# problem on STDERR (STDOUT is the report itself and is usually redirected
+# to a file) and exit with a non-zero status so the failure is detectable.
+if ( $html && !$host_pro ) {
+    if ( $host ) {
+        $host_pro = $host;
+    }
+    else {
+        print STDERR "Error: host-pro parameter or host must be provided in html mode\n";
+        exit 1;
+    }
+}
+
+check_all_url();
+
+
+=head1 NAME
+
+checkurl - Check URLs from 856$u field.
+
+=head1 USAGE
+
+=over
+
+=item checkurl [--verbose|--help] [--host=http://default.tld]
+
+Scan all URLs found in 856$u of bib records and display if resources are
+available or not. HTTP requests are sent in parallel for efficiency.
+
+=back
+
+=head1 PARAMETERS
+
+=over
+
+=item B<--host=http://default.tld>
+
+Server host used when URL doesn't have one, i.e. doesn't begin with 'http'.
+For example, if --host=http://www.mylib.com, then when 856$u contains
+'img/image.jpg', the url checked is: 'http://www.mylib.com/img/image.jpg'.
+
+=item B<--verbose|-v>
+
+Outputs both successful and failed URLs.
+
+=item B<--html>
+
+Formats output in HTML. The result can be redirected to a file
+accessible by HTTP. This way, it's possible to link directly to the biblio
+record in edit mode. With this parameter, B<--host-pro> is required; if it
+is omitted, the value of B<--host> is used instead.
+
+=item B<--host-pro=http://koha-pro.tld>
+
+Server host used to link to biblio record editing page.
+
+=item B<--timeout=10>
+
+Timeout for fetching URLs. By default 10 seconds.
+
+=item B<--maxconn=200>
+
+Number of simultaneous HTTP requests. By default 200 connections.
+
+=item B<--help|-h>
+
+Print this help page.
+
+=back
+
+=cut
+
+
--
1.7.8
More information about the Koha-patches
mailing list