[Koha-patches] [PATCH] Bug 7963 Parallel HTTP requests when checking URLs

Frédéric Demians f.demians at tamil.fr
Sat Apr 14 15:06:29 CEST 2012


The current script check-url.pl checks the URLs found in 856$u by sending
HTTP requests one by one. I propose a new script which sends multiple
requests simultaneously.

This script is based on AnyEvent and AnyEvent::HTTP CPAN modules.

See doc: perldoc checkurl
---
 misc/cronjobs/checkurl |  190 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 files changed, 190 insertions(+), 0 deletions(-)
 create mode 100755 misc/cronjobs/checkurl

diff --git a/misc/cronjobs/checkurl b/misc/cronjobs/checkurl
new file mode 100755
index 0000000..639d6ee
--- /dev/null
+++ b/misc/cronjobs/checkurl
@@ -0,0 +1,190 @@
+#!/usr/bin/perl
+
+# Copyright 2012 Tamil s.a.r.l.
+#
+# This file is part of Koha.
+#
+# Koha is free software; you can redistribute it and/or modify it under the
+# terms of the GNU General Public License as published by the Free Software
+# Foundation; either version 2 of the License, or (at your option) any later
+# version.
+#
+# Koha is distributed in the hope that it will be useful, but WITHOUT ANY
+# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
+# A PARTICULAR PURPOSE.  See the GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License along
+# with Koha; if not, write to the Free Software Foundation, Inc.,
+# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+use 5.010;
+use utf8;
+use strict;
+use warnings;
+use Pod::Usage;
+use Getopt::Long;
+use C4::Context;
+use C4::Biblio;
+use AnyEvent;
+use AnyEvent::HTTP;
+
+# Command-line options and their default values. The POD at the end of the
+# file gives the user-facing description of each option.
+my ($verbose, $help, $html) = (0, 0, 0);
+my ($host, $host_pro)       = ('', '');
+my ($timeout, $maxconn)     = (10, 200);
+# Path of the biblio edit page; a biblionumber is appended to it to build links.
+my $uriedit                 = "/cgi-bin/koha/cataloguing/addbiblio.pl?biblionumber=";
+
+# The short aliases -v and -h are declared explicitly because the
+# documentation advertises them: without an alias, "-h" is an ambiguous
+# abbreviation of --help, --html, --host and --host-pro and is rejected.
+# Stop on invalid options instead of silently running with the defaults
+# (GetOptions has already printed the reason on STDERR).
+GetOptions(
+    'verbose|v'     => \$verbose,
+    'html'          => \$html,
+    'help|h'        => \$help,
+    'host=s'        => \$host,
+    'host-pro=s'    => \$host_pro,
+    'timeout=i'     => \$timeout,
+    'maxconn=i'     => \$maxconn,
+) or die "Invalid options; run with --help for usage.\n";
+
+
+# Print the full manual page (the POD of this file) and terminate.
+# pod2usage exits the program itself, so nothing runs after it. The exit
+# status is forced to 0 because help was explicitly requested; when only
+# -verbose is given, pod2usage defaults to a non-zero status.
+sub usage {
+    pod2usage( -verbose => 2, -exitval => 0 );
+}
+
+
+# Build the HTML anchor that links to the edit page of a biblio record.
+# Takes a biblionumber and returns the <a> element as a string, with the
+# biblionumber as link text. Reads the file-level $host_pro and $uriedit.
+sub bibediturl {
+    my ($biblionumber) = @_;
+    return qq{<a href="$host_pro$uriedit$biblionumber">$biblionumber</a>};
+}
+
+
+# Escape the characters that are special in HTML text and attribute values.
+# An undefined value is treated as an empty string.
+sub _escape_html {
+    my $text = shift // '';
+    my %entity_for = (
+        '&' => '&amp;',
+        '<' => '&lt;',
+        '>' => '&gt;',
+        '"' => '&quot;',
+    );
+    $text =~ s/([&<>"])/$entity_for{$1}/g;
+    return $text;
+}
+
+
+# Check the URLs found in the 856$u subfields of every biblio record whose
+# biblioitems.url is not empty, sending HTTP HEAD requests in parallel.
+#
+# Prints one line per URL (one table row in HTML mode) when the response
+# status is not 2xx, or for every URL when $verbose is set. Takes no
+# argument; reads the file-level option variables ($html, $verbose, $host,
+# $host_pro, $timeout, $maxconn).
+sub check_all_url {
+    my $sth = C4::Context->dbh->prepare(
+        "SELECT biblionumber FROM biblioitems WHERE url <> ''" );
+    $sth->execute;
+
+    print "<html>\n<body>\n<table>\n" if $html;
+
+    my $countconn = 0;    # requests currently in flight
+    my $cv = AnyEvent->condvar;
+
+    # The condvar counts outstanding work: one unit for "rows are still being
+    # read" (released below once the statement is exhausted) plus one unit
+    # per HTTP request. recv() therefore returns only when every response
+    # has been handled, not as soon as the last row has been read.
+    $cv->begin;
+
+    my $idle;
+    $idle = AnyEvent->idle(
+        cb => sub {
+            # Throttle: do not start new requests while the limit is reached.
+            # A record holding several 856$u may still overshoot the limit
+            # briefly, since all its URLs are started together.
+            return if $countconn >= $maxconn;
+
+            my ($biblionumber) = $sth->fetchrow;
+            unless ( defined $biblionumber ) {
+                # No more records: stop feeding and release the reader unit.
+                undef $idle;
+                $cv->end;
+                return;
+            }
+
+            # Skip records that cannot be loaded rather than dying on undef.
+            my $record = GetMarcBiblio( $biblionumber );
+            return unless $record;
+
+            foreach my $field ( $record->field('856') ) {
+                my $url = $field->subfield('u');
+                next unless $url;
+                $url = "$host/$url" unless $url =~ /^http/;
+                $countconn++;
+                $cv->begin;
+                http_request(
+                    HEAD    => $url,
+                    headers => { 'user-agent' => 'Mozilla/5.0 (compatible; U; Koha checkurl)' },
+                    timeout => $timeout,
+                    sub {
+                        my ($body, $hdr) = @_;
+                        $countconn--;
+                        # Report failures only, unless --verbose asks for all.
+                        if ( $verbose || $hdr->{Status} !~ /^2/ ) {
+                            my $status = "$hdr->{Status} $hdr->{Reason}";
+                            my $line = $html
+                                ? "<tr>\n<td>" . bibediturl($biblionumber) .
+                                  "</td>\n<td>" . _escape_html($url) .
+                                  "</td>\n<td>" . _escape_html($status) .
+                                  "</td>\n</tr>\n\n"
+                                : "$biblionumber\t$url\t$status \n";
+                            print $line;
+                        }
+                        $cv->end;
+                    }
+                );
+            }
+            return;
+        }
+    );
+    $cv->recv;
+
+    print "</table>\n</body>\n</html>\n" if $html;
+}
+
+
+# Main program
+
+usage() if $help;
+
+# HTML output links each row to the biblio edit page, so a base URL is
+# needed: fall back to --host when --host-pro is not given.
+if ( $html && !$host_pro ) {
+    if ( $host ) {
+        $host_pro = $host;
+    }
+    else {
+        # Report on STDERR with a failure status, so the message neither ends
+        # up in a redirected HTML report nor goes unnoticed by cron.
+        print STDERR "Error: host-pro parameter or host must be provided in html mode\n";
+        exit 1;
+    }
+}
+
+check_all_url();
+
+
+=head1 NAME
+
+checkurl - Check URLs from 856$u field.
+
+=head1 USAGE
+
+=over
+
+=item checkurl [--verbose|--help] [--host=http://default.tld] 
+
+Scan all URLs found in 856$u of bib records and display if resources are
+available or not. HTTP requests are sent in parallel for efficiency.
+
+=back
+
+=head1 PARAMETERS
+
+=over
+
+=item B<--host=http://default.tld>
+
+Server host used when a URL doesn't have one, i.e. doesn't begin with 'http'.
+For example, if --host=http://www.mylib.com, then when 856$u contains
+'img/image.jpg', the URL checked is 'http://www.mylib.com/img/image.jpg'.
+
+=item B<--verbose|-v>
+
+Outputs both successful and failed URLs.
+
+=item B<--html>
+
+Formats output in HTML. The result can be redirected to a file
+accessible by HTTP. This way, it's possible to link directly to the biblio
+record in edit mode. With this parameter, B<--host-pro> is required unless
+B<--host> is given, in which case B<--host> is used for the links.
+
+=item B<--host-pro=http://koha-pro.tld>
+
+Server host used to link to biblio record editing page.
+
+=item B<--timeout=10>
+
+Timeout for fetching URLs. By default 10 seconds.
+
+=item B<--maxconn=200>
+
+Maximum number of simultaneous HTTP requests. By default 200 connections.
+
+=item B<--help|-h>
+
+Print this help page.
+
+=back
+
+=cut
+
+
-- 
1.7.8



More information about the Koha-patches mailing list