From 64505d71188847dc465cb367ba2a95db439392d4 Mon Sep 17 00:00:00 2001 From: Frederic Demians Date: Tue, 17 Feb 2009 10:23:13 +0100 Subject: [PATCH] URLs checker enhancement (bug #2959) Improve URLs checker script in the way (half way) pointed out by Galen: - A C4::URL::Checker class handles URL checking. This class is not yet in a separate file in the C4 directory. This class could easily be extended to accommodate authority URL checking. - Script output can now be formatted in CSV or HTML. The HTML version links directly to the MARC biblio record editor. Signed-off-by: Galen Charlton --- misc/cronjobs/check-url.pl | 209 ++++++++++++++++++++++++++++++++----- 1 file changed, 184 insertions(+), 25 deletions(-) diff --git a/misc/cronjobs/check-url.pl b/misc/cronjobs/check-url.pl index cd6dd22e91..64c16c94fc 100755 --- a/misc/cronjobs/check-url.pl +++ b/misc/cronjobs/check-url.pl @@ -7,54 +7,203 @@ # (http://www.gnu.org/licenses/gpl.html) # + + +package C4::URL::Checker; + +=head1 NAME + +C4::URL::Checker - base object for checking URLs stored in Koha DB + +=head1 SYNOPSIS + + use C4::URL::Checker; + + my $checker = C4::URL::Checker->new( ); + $checker->{ host_default } = 'http://mylib.kohalibrary.com'; + my $checked_urls = $checker->check_biblio( 123 ); + foreach my $url ( @$checked_urls ) { + print "url: ", $url->{ url  }, "\n", + "is_success: ", $url->{ is_success }, "\n", + "status: ", $url->{ status }, "\n"; + } + +=head1 FUNCTIONS + +=head2 new + +Create a URL Checker. The returned object can be used to set the +default host variable: + + my $checker = C4::URL::Checker->new( ); + $checker->{ host_default } = 'http://mylib.kohalibrary.com'; + +=head2 check_biblio + +Check all URLs from a biblio record. Returns a reference to an array +containing every URL together with its check result. 
+ + my $checked_urls = $checker->check_biblio( 123 ); + +With 2 URLs, the returned array will look like that: + + [ + { + 'url' => 'http://mylib.tamil.fr/img/62265_0055B.JPG', + 'is_success' => 1, + 'status' => 'ok' + }, + { + 'url' => 'http://mylib.tamil.fr//img/62265_0055C.JPG', + 'is_success' => 0, + 'status' => '404 - Page not found' + } + ], + + +=cut + +use LWP::UserAgent; +use HTTP::Request; +use C4::Biblio; + + + +sub new { + + my $self = {}; + my $class = shift; + + $self->{ user_agent } = new LWP::UserAgent; + + bless $self, $class; + return $self; +} + + +sub check_biblio { + my $self = shift; + my $biblionumber = shift; + my $uagent = $self->{ user_agent }; + my $host = $self->{ host_default }; + + my $record = GetMarcBiblio( $biblionumber ); + return undef unless $record->field('856'); + + my @urls = (); + foreach my $field ( $record->field('856') ) { + my $url = $field->subfield('u'); + next unless $url; + $url = "$host/$url" unless $url =~ /^http/; + my $check = { url => $url }; + my $req = HTTP::Request->new( GET => $url ); + my $res = $uagent->request( $req, sub { die }, 1 ); + if ( $res->is_success ) { + $check->{ is_success } = 1; + $check->{ status } = 'ok'; + } + else { + $check->{ is_success } = 0; + $check->{ status } = $res->status_line; + } + push( @urls, $check ); + } + return \@urls; +} + + + +package Main; + use strict; use warnings; use diagnostics; use Carp; -use LWP::Simple; + +use YAML::XS; + use Pod::Usage; use Getopt::Long; use C4::Context; -use C4::Biblio; + my $verbose = 0; my $help = 0; my $host = ''; +my $host_pro = ''; +my $html = 0; +my $uriedit = "/cgi-bin/koha/cataloguing/addbiblio.pl?biblionumber="; GetOptions( - 'verbose' => \$verbose, - 'help' => \$help, - 'host=s' => \$host, + 'verbose' => \$verbose, + 'html' => \$html, + 'help' => \$help, + 'host=s' => \$host, + 'host-pro=s' => \$host_pro, ); + sub usage { pod2usage( -verbose => 2 ); exit; } -usage() if $help; -my $context = new C4::Context( ); -my $dbh = 
$context->dbh; -my $sth = $dbh->prepare( - "SELECT biblionumber FROM biblioitems WHERE url <> ''" ); -$sth->execute; -while ( my ($biblionumber) = $sth->fetchrow ) { - my $record = GetMarcBiblio( $biblionumber ); - next unless $record->field('856'); - foreach my $field ( $record->field('856') ) { - my $url = $field->subfield('u'); - next unless $url; - $url = "$host/$url" unless $url =~ /^http/; - if ( head( $url ) ) { - print "$biblionumber\t$url\tsucceed\n" if $verbose; - } - else { - print "$biblionumber\t$url\tfailed\n"; +sub bibediturl { + my $biblionumber = shift; + my $html = "$biblionumber"; + return $html; +} + + +# +# Check all URLs from all current Koha biblio records +# +sub check_all_url { + my $checker = C4::URL::Checker->new(); + $checker->{ host_default } = $host; + + my $context = new C4::Context( ); + my $dbh = $context->dbh; + my $sth = $dbh->prepare( + "SELECT biblionumber FROM biblioitems WHERE url <> ''" ); + $sth->execute; + print "\n\n\n" if $html; + while ( my ($biblionumber) = $sth->fetchrow ) { + my $result = $checker->check_biblio( $biblionumber ); + next unless $result; # No URL + foreach my $url ( @$result ) { + if ( ! $url->{ is_success } || $verbose ) { + print $html + ? "\n\n\n\n\n\n" + : "$biblionumber\t" . $url->{ url } . "\t" . + $url->{ status } . "\n"; + } } } + print "
" . bibediturl( $biblionumber ) . + "" . $url->{url} . "" . + $url->{status} . "
\n\n\n" if $html; } -exit; + + +# BEGIN + +usage() if $help; + +if ( $html && !$host_pro ) { + if ( $host ) { + $host_pro = $host; + } + else { + print "Error: host_pro parameter or host must be provided in html mode\n"; + exit; + } +} + +check_all_url(); + + =head1 NAME @@ -82,7 +231,17 @@ For example, if --host=http://www.mylib.com, then when 856$u contains =item B<--verbose|-v> -Output succeed URL checks with failed ones. +Outputs succeed URL checks with failed ones. + +=item B<--html> + +Formats output in HTML. The result can be redirected to a file +accessible by http. This way, it's possible to link directly to biblio +record in edit mode. With this parameter B<--host-pro> is required. + +=item B<--host-pro=http://koha-pro.tld> + +Server host used to link to biblio record editing page. =item B<--help|-h> -- 2.39.5