Koha/misc/cronjobs/check-url.pl
Lars Wirzenius 160e01daa3 Update copyright license to GPL version 2 or later for two cronjobs.
See http://lists.koha.org/pipermail/koha-devel/2010-March/011067.html
for confirmation that this is OK.

Signed-off-by: Galen Charlton <gmcharlt@gmail.com>
2010-04-07 08:43:31 -04:00

289 lines
6.6 KiB
Perl
Executable file
Raw Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/perl
#
# Copyright 2009 Tamil s.a.r.l.
#
# This file is part of Koha.
#
# Koha is free software; you can redistribute it and/or modify it under the
# terms of the GNU General Public License as published by the Free Software
# Foundation; either version 2 of the License, or (at your option) any later
# version.
#
# Koha is distributed in the hope that it will be useful, but WITHOUT ANY
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
# A PARTICULAR PURPOSE. See the GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License along
# with Koha; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
package C4::URL::Checker;
=head1 NAME
C4::URL::Checker - base object for checking URL stored in Koha DB
=head1 SYNOPSIS
use C4::URL::Checker;
my $checker = C4::URL::Checker->new( );
$checker->{ host_default } = 'http://mylib.kohalibrary.com';
my $checked_urls = $checker->check_biblio( 123 );
foreach my $url ( @$checked_urls ) {
print "url: ", $url->{ url  }, "\n",
"is_success: ", $url->{ is_success }, "\n",
"status: ", $url->{ status }, "\n";
}
=head1 FUNCTIONS
=head2 new
Create a URL Checker. The returned object can be used to set
default host variable :
my $checker = C4::URL::Checker->new( );
$checker->{ host_default } = 'http://mylib.kohalibrary.com';
=head2 check_biblio
Check all URL from a biblio record. Returns a pointer to an array
containing all URLs with checking for each of them.
my $checked_urls = $checker->check_biblio( 123 );
With 2 URLs, the returned array will look like that:
[
{
'url' => 'http://mylib.tamil.fr/img/62265_0055B.JPG',
'is_success' => 1,
'status' => 'ok'
},
{
'url' => 'http://mylib.tamil.fr//img/62265_0055C.JPG',
'is_success' => 0,
'status' => '404 - Page not found'
}
],
=cut
use strict;
use warnings;
use LWP::UserAgent;
use HTTP::Request;
use C4::Biblio;
sub new {
my $self = {};
my ($class, $timeout) = @_;
my $uagent = new LWP::UserAgent;
$uagent->timeout( $timeout) if $timeout;
$self->{ user_agent } = $uagent;
$self->{ bad_url } = { };
bless $self, $class;
return $self;
}
sub check_biblio {
my $self = shift;
my $biblionumber = shift;
my $uagent = $self->{ user_agent };
my $host = $self->{ host_default };
my $bad_url = $self->{ bad_url };
my $record = GetMarcBiblio( $biblionumber );
return unless $record->field('856');
my @urls = ();
foreach my $field ( $record->field('856') ) {
my $url = $field->subfield('u');
next unless $url;
$url = "$host/$url" unless $url =~ /^http/;
my $check = { url => $url };
if ( $bad_url->{ $url } ) {
$check->{ is_success } = 1;
$check->{ status } = '500 Site already checked';
}
else {
my $req = HTTP::Request->new( GET => $url );
my $res = $uagent->request( $req, sub { die }, 1 );
if ( $res->is_success ) {
$check->{ is_success } = 1;
$check->{ status } = 'ok';
}
else {
$check->{ is_success } = 0;
$check->{ status } = $res->status_line;
$bad_url->{ $url } = 1;
}
}
push @urls, $check;
}
return \@urls;
}
package Main;
use strict;
use warnings;
use diagnostics;
use Carp;
use Pod::Usage;
use Getopt::Long;
use C4::Context;
my $verbose = 0;
my $help = 0;
my $host = '';
my $host_pro = '';
my $html = 0;
my $uriedit = "/cgi-bin/koha/cataloguing/addbiblio.pl?biblionumber=";
my $timeout = 15;
GetOptions(
'verbose' => \$verbose,
'html' => \$html,
'help' => \$help,
'host=s' => \$host,
'host-pro=s' => \$host_pro,
'timeout=i', => \$timeout,
);
sub usage {
pod2usage( -verbose => 2 );
exit;
}
sub bibediturl {
my $biblionumber = shift;
my $html = "<a href=\"$host_pro$uriedit$biblionumber\">$biblionumber</a>";
return $html;
}
#
# Check all URLs from all current Koha biblio records
#
sub check_all_url {
my $checker = C4::URL::Checker->new($timeout);
$checker->{ host_default } = $host;
my $context = new C4::Context( );
my $dbh = $context->dbh;
my $sth = $dbh->prepare(
"SELECT biblionumber FROM biblioitems WHERE url <> ''" );
$sth->execute;
if ( $html ) {
print <<EOS;
<html>
<body>
<table>
EOS
}
while ( my ($biblionumber) = $sth->fetchrow ) {
my $result = $checker->check_biblio( $biblionumber );
next unless $result; # No URL
foreach my $url ( @$result ) {
if ( ! $url->{ is_success } || $verbose ) {
print $html
? "<tr>\n<td>" . bibediturl( $biblionumber ) .
"</td>\n<td>" . $url->{url} . "</td>\n<td>" .
$url->{status} . "</td>\n</tr>\n\n"
: "$biblionumber\t" . $url->{ url } . "\t" .
$url->{ status } . "\n";
}
}
}
print "</table>\n</body>\n</html>\n" if $html;
}
# BEGIN
usage() if $help;
if ( $html && !$host_pro ) {
if ( $host ) {
$host_pro = $host;
}
else {
print "Error: host-pro parameter or host must be provided in html mode\n";
exit;
}
}
check_all_url();
=head1 NAME
check-url.pl - Check URLs from 856$u field.
=head1 USAGE
=over
=item check-url.pl [--verbose|--help] [--host=http://default.tld]
Scan all URLs found in 856$u of bib records
and display if resources are available or not.
=back
=head1 PARAMETERS
=over
=item B<--host=http://default.tld>
Server host used when URL doesn't have one, ie doesn't begin with 'http:'.
For example, if --host=http://www.mylib.com, then when 856$u contains
'img/image.jpg', the url checked is: http://www.mylib.com/image.jpg'.
=item B<--verbose|-v>
Outputs both successful and failed URLs.
=item B<--html>
Formats output in HTML. The result can be redirected to a file
accessible by http. This way, it's possible to link directly to biblio
record in edit mode. With this parameter B<--host-pro> is required.
=item B<--host-pro=http://koha-pro.tld>
Server host used to link to biblio record editing page.
=item B<--timeout=15>
Timeout for fetching URLs. By default 15 seconds.
=item B<--help|-h>
Print this help page.
=back
=cut