From ca341f6840ad7eb9170ce49f1ed6869b3e468297 Mon Sep 17 00:00:00 2001
From: =?utf8?q?Fr=C3=A9d=C3=A9ric=20Demians?=
Date: Sun, 3 Nov 2013 17:05:38 +0100
Subject: [PATCH] Bug 11190: sitemap.pl -- Generate a Catalog sitemap

Add a script sitemap.pl to process all biblio records from a Koha instance and
generate Sitemap files complying with the protocol described on
http://sitemaps.org. The goal of this script is to give search engines direct
access to biblio records. It avoids leaving search engines to browse the Koha
OPAC, which generates a lot of traffic and workload for a poor result.

Thanks Magnus for testing, and helping to improve the script design.

[2015.04.16] Switch from Moose to Moo.
[2015.08.20] Add more complete unit tests (UT).

Signed-off-by: Magnus Enger

All options to the script work as expected and the output looks good.
Nice enhancement!

Signed-off-by: Frederic Demians

I signed off my own patch after fixing various QA errors.

Signed-off-by: Martin Renvoize
Signed-off-by: Jonathan Druart

Amended patch: replace tabs with spaces.
Signed-off-by: Tomas Cohen Arazi --- C4/Installer/PerlDependencies.pm | 10 ++ Koha/Sitemapper.pm | 89 ++++++++++++++ Koha/Sitemapper/Writer.pm | 125 ++++++++++++++++++++ misc/cronjobs/sitemap.pl | 127 ++++++++++++++++++++ t/db_dependent/Sitemapper.t | 193 +++++++++++++++++++++++++++++++ 5 files changed, 544 insertions(+) create mode 100644 Koha/Sitemapper.pm create mode 100644 Koha/Sitemapper/Writer.pm create mode 100755 misc/cronjobs/sitemap.pl create mode 100755 t/db_dependent/Sitemapper.t diff --git a/C4/Installer/PerlDependencies.pm b/C4/Installer/PerlDependencies.pm index 0b845e48d7..dba7b52818 100644 --- a/C4/Installer/PerlDependencies.pm +++ b/C4/Installer/PerlDependencies.pm @@ -607,6 +607,11 @@ our $PERL_DEPS = { 'required' => '0', 'min_ver' => '2.13', }, + 'Moo' => { + 'usage' => 'Core', + 'required' => '0', + 'min_ver' => '1', + }, 'String::Random' => { 'usage' => 'OpacSelfRegistration', 'required' => '1', @@ -752,6 +757,11 @@ our $PERL_DEPS = { 'required' => '0', 'min_ver' => '0.0.3', }, + 'XML::Writer' => { + 'usage' => 'Command line scripts', + 'required' => '0', + 'min_ver' => '0.614', + }, }; 1; diff --git a/Koha/Sitemapper.pm b/Koha/Sitemapper.pm new file mode 100644 index 0000000000..bbcbad3be3 --- /dev/null +++ b/Koha/Sitemapper.pm @@ -0,0 +1,89 @@ +package Koha::Sitemapper; + +# +# Copyright 2015 Tamil s.a.r.l. +# +# This file is part of Koha. +# +# Koha is free software; you can redistribute it and/or modify it under the +# terms of the GNU General Public License as published by the Free Software +# Foundation; either version 3 of the License, or (at your option) any later +# version. +# +# Koha is distributed in the hope that it will be useful, but WITHOUT ANY +# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR +# A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+#
+# You should have received a copy of the GNU General Public License along
+# with Koha; if not, write to the Free Software Foundation, Inc.,
+# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+use Moo;
+use Modern::Perl;
+use Koha::Sitemapper::Writer;
+use C4::Context;
+
+
+# Base URL prepended by the writer to every URL stored in the Sitemap files.
+has url => ( is => 'rw', );
+
+# Directory in which the Sitemap files are created. Setting it to something
+# that is not an existing directory terminates the program.
+has dir => (
+    is      => 'rw',
+    default => '.',
+    trigger => sub {
+        my ($self, $dir) = @_;
+        return if -d $dir;
+        # Report on STDERR and exit with a non-zero status, so that a failed
+        # cron run is not mistaken for a successful one.
+        say STDERR "This is not a valid directory: $dir";
+        exit 1;
+    }
+);
+
+# True: URLs end with /bib/<biblionumber>. False: URLs end with
+# /cgi-bin/koha/opac-detail.pl?biblionumber=<biblionumber>.
+has short => ( is => 'rw', default => 1 );
+
+# True to print progress messages on STDOUT.
+has verbose => ( is => 'rw', default => 0 );
+
+# Statement handle iterating over the biblio records (set by run()).
+has sth => ( is => 'rw' );
+
+# Koha::Sitemapper::Writer producing the files (set by run()).
+has writer => ( is => 'rw', );
+
+# Number of biblio records processed so far.
+has count => ( is => 'rw', default => 0);
+
+
+# Generate all the Sitemap files: select every biblio record (biblionumber and
+# timestamp) and hand the records, one by one, to the writer.
+sub run {
+    my $self = shift;
+
+    say "Creation of Sitemap files in '" . $self->dir . "' directory"
+        if $self->verbose;
+
+    $self->writer( Koha::Sitemapper::Writer->new( sitemapper => $self ) );
+    my $sth = C4::Context->dbh->prepare(
+         "SELECT biblionumber, timestamp FROM biblio" );
+    $sth->execute();
+    $self->sth($sth);
+
+    # process() returns false once the result set is exhausted.
+    while ( $self->process() ) {
+        say "..... ", $self->count
+            if $self->verbose && $self->count % 10000 == 0;
+    }
+}
+
+
+# Process the next biblio record of the statement handle.
+#
+# Returns the number of records processed so far (always true) when a record
+# has been written. When there is no record left, closes the Sitemap files,
+# writes the index file, and returns nothing.
+sub process {
+    my $self = shift;
+
+    # fetchrow_array is the documented DBI method; fetchrow is only an old
+    # alias of it.
+    my ($biblionumber, $timestamp) = $self->sth->fetchrow_array;
+    unless ( defined $biblionumber ) {
+        $self->writer->end();
+        say "Number of biblio records processed: ", $self->count, "\n" .
+            "Number of Sitemap files:            ", $self->writer->count
+            if $self->verbose;
+        return;
+    }
+
+    $self->writer->write($biblionumber, $timestamp);
+    $self->count( $self->count + 1 );
+    return $self->count;
+}
+
+
+1;
diff --git a/Koha/Sitemapper/Writer.pm b/Koha/Sitemapper/Writer.pm
new file mode 100644
index 0000000000..7e33eb04ba
--- /dev/null
+++ b/Koha/Sitemapper/Writer.pm
@@ -0,0 +1,125 @@
+package Koha::Sitemapper::Writer;
+
+#
+# Copyright 2015 Tamil s.a.r.l.
+#
+# This file is part of Koha.
+#
+# Koha is free software; you can redistribute it and/or modify it under the
+# terms of the GNU General Public License as published by the Free Software
+# Foundation; either version 3 of the License, or (at your option) any later
+# version.
+#
+# Koha is distributed in the hope that it will be useful, but WITHOUT ANY
+# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
+# A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License along
+# with Koha; if not, write to the Free Software Foundation, Inc.,
+# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+
+use Moo;
+use Modern::Perl;
+use XML::Writer;
+use IO::File;
+use DateTime;
+
+
+# Maximum number of URLs written to a single sitemapNNNN.xml file.
+my $MAX = 50000;
+
+
+# Object providing the dir, url and short settings used by this writer.
+has sitemapper => (is => 'rw', );
+
+# Number of URLs written to the current file. It starts at $MAX so that the
+# very first call to write() opens sitemap0001.xml.
+has current => ( is => 'rw', default => $MAX );
+
+# Number of sitemapNNNN.xml files created so far.
+has count => ( is => 'rw', default => 0 );
+
+# XML::Writer of the sitemapNNNN.xml file currently being written.
+has writer => ( is => 'rw', );
+
+
+
+# Create the file $name in the sitemapper directory and return an XML::Writer
+# bound to it, with the XML declaration already written. Terminates the
+# program when the file cannot be created.
+sub _writer_create {
+    my ($self, $name) = @_;
+    $name = $self->sitemapper->dir . "/$name";
+    # The mode is given as a separate argument, so that the file name is
+    # never parsed for mode characters.
+    my $fh = IO::File->new($name, '>');
+    unless ($fh) {
+        say STDERR "Impossible to create file: $name ($!)";
+        exit 1;
+    }
+    my $writer = XML::Writer->new(
+        OUTPUT => $fh,
+        DATA_MODE => 1,
+        DATA_INDENT => 2,
+    );
+    $writer->xmlDecl("UTF-8");
+    return $writer;
+}
+
+
+# Close the root element of the current sitemapNNNN.xml file, if any, then
+# finish the document and close its file handle.
+sub _writer_end {
+    my $self = shift;
+    return unless $self->writer;
+    $self->writer->endTag();
+    $self->writer->end();
+    $self->writer->getOutput()->close();
+}
+
+
+# Append the URL of a biblio record to the current sitemapNNNN.xml file. A new
+# file is started each time the current one already contains $MAX URLs.
+#
+#   $biblionumber - identifier appended to the record URL
+#   $timestamp    - its first 10 characters are written as <lastmod>
+sub write {
+    my ($self, $biblionumber, $timestamp) = @_;
+
+    if ( $self->current == $MAX ) {
+        $self->_writer_end();
+        $self->count( $self->count + 1 );
+        my $w = $self->_writer_create( sprintf("sitemap%04d.xml", $self->count) );
+        $w->startTag(
+            'urlset',
+            'xmlns' => 'http://www.sitemaps.org/schemas/sitemap/0.9',
+            'xmlns:xsi' => 'http://www.w3.org/2001/XMLSchema-instance',
+            'xsi:schemaLocation' => 'http://www.sitemaps.org/schemas/sitemap/0.9 http://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd');
+        $self->writer($w);
+        $self->current(0);
+    }
+
+    $self->current( $self->current + 1 );
+    my $writer = $self->writer;
+    my $url = $self->sitemapper->url .
+              ($self->sitemapper->short ? '/bib/' : '/cgi-bin/koha/opac-detail.pl?biblionumber=') .
+              $biblionumber;
+    $writer->startTag('url');
+        $writer->startTag('loc');
+            $writer->characters($url);
+        $writer->endTag();
+        $writer->startTag('lastmod');
+            $timestamp = substr($timestamp, 0, 10);
+            $writer->characters($timestamp);
+        $writer->endTag();
+    $writer->endTag();
+}
+
+
+# Close the last sitemapNNNN.xml file, then write sitemapindex.xml, which
+# references every sitemapNNNN.xml file created, dated with the current day.
+sub end {
+    my $self = shift;
+
+    $self->_writer_end();
+
+    my $w = $self->_writer_create("sitemapindex.xml");
+    $w->startTag('sitemapindex', 'xmlns' => 'http://www.sitemaps.org/schemas/sitemap/0.9');
+    my $now = DateTime->now()->ymd;
+    for my $i ( 1..$self->count ) {
+        $w->startTag('sitemap');
+            $w->startTag('loc');
+                my $name = sprintf("sitemap%04d.xml", $i);
+                $w->characters($self->sitemapper->url . "/$name");
+            $w->endTag();
+            $w->startTag('lastmod');
+                $w->characters($now);
+            $w->endTag();
+        $w->endTag();
+    }
+    $w->endTag();
+    # Close the handle explicitly rather than relying on its destruction.
+    # XML::Writer's end() is deliberately not called here: it would add
+    # output to the file, whose exact content is relied upon.
+    $w->getOutput()->close();
+}
+
+
+1;
\ No newline at end of file
diff --git a/misc/cronjobs/sitemap.pl b/misc/cronjobs/sitemap.pl
new file mode 100755
index 0000000000..0d615310d0
--- /dev/null
+++ b/misc/cronjobs/sitemap.pl
@@ -0,0 +1,134 @@
+#!/usr/bin/perl
+
+# Copyright 2015 Tamil s.a.r.l.
+#
+# This file is part of Koha.
+#
+# Koha is free software; you can redistribute it and/or modify it under the
+# terms of the GNU General Public License as published by the Free Software
+# Foundation; either version 3 of the License, or (at your option) any later
+# version.
+#
+# Koha is distributed in the hope that it will be useful, but WITHOUT ANY
+# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
+# A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License along
+# with Koha; if not, write to the Free Software Foundation, Inc.,
+# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+package Main;
+
+use Modern::Perl;
+use utf8;
+use Pod::Usage;
+use Getopt::Long;
+use C4::Biblio;
+use C4::Context;
+use Koha::Sitemapper;
+
+
+# Command line options, with their default values.
+my ($verbose, $help, $url, $dir, $short) = (0, 0, '', '.', 1);
+GetOptions(
+    'verbose'   => \$verbose,
+    'help'      => \$help,
+    'url=s'     => \$url,
+    'dir=s'     => \$dir,
+    'short!'    => \$short,
+);
+
+# Display the whole documentation of this script, then quit.
+sub usage {
+    pod2usage( -verbose => 2 );
+    exit;
+}
+
+usage() if $help;
+
+# Without --url, fall back on the OPACBaseURL system preference.
+unless ($url) {
+    $url = C4::Context->preference("OPACBaseURL");
+    unless ($url) {
+        say STDERR "OPACBaseURL syspref isn't defined. You can use --url parameter.";
+        exit 1;
+    }
+    # Add a scheme only when the syspref has none, otherwise the URL would
+    # become 'http://http://...'.
+    $url = 'http://' . $url unless $url =~ m{\Ahttps?://}i;
+}
+# Drop trailing slashes: the writer adds its own '/' separator.
+$url =~ s{/+$}{};
+
+my $sitemapper = Koha::Sitemapper->new(
+    verbose => $verbose,
+    url     => $url,
+    dir     => $dir,
+    short   => $short,
+);
+$sitemapper->run();
+
+
+=head1 USAGE
+
+=over
+
+=item sitemap.pl [--verbose|--help|--short|--noshort|--url|--dir]
+
+=back
+
+=head1 SYNOPSIS
+
+  sitemap.pl --verbose
+  sitemap.pl --noshort --dir /home/koha/mylibrary/www
+
+=head1 DESCRIPTION
+
+Process all biblio records from a Koha instance and generate Sitemap files
+complying with the protocol described on L<http://sitemaps.org>. The goal of
+this script is to be able to provide to search engines direct access to biblio
+records. It avoids leaving search engines to browse the Koha OPAC, which
+generates a lot of traffic and workload for a poor result.
+
+A file named F<sitemapindex.xml> is generated. It contains references to
+multiple Sitemap files. Each file contains at most 50,000 urls, and is named
+F<sitemapXXXX.xml>.
+
+The files must be stored in the Koha OPAC root directory, i.e.
+F<< <koha-root>/koha-tmpl/ >>. Place also in this directory a F<robots.txt> file
+like this one:
+
+ Sitemap: sitemapindex.xml
+ User-agent: *
+ Disallow: /cgi-bin/
+
+=head1 PARAMETERS
+
+=over
+
+=item B<--url=Koha OPAC base URL>
+
+If omitted, OPACBaseURL syspref is used.
+
+=item B<--short|noshort>
+
+By default, --short. With --short, URL to bib record ends with
+/bib/biblionumber. With --noshort, URL ends with
+/cgi-bin/koha/opac-detail.pl?biblionumber=bibnum
+
+=item B<--dir>
+
+Directory where to write sitemap files. By default, the current directory.
+
+=item B<--verbose|-v>
+
+Enable script verbose mode: a message is displayed for each 10,000 biblio
+records processed.
+
+=item B<--help|-h>
+
+Print this help page.
+
+=back
+
+=cut
diff --git a/t/db_dependent/Sitemapper.t b/t/db_dependent/Sitemapper.t
new file mode 100755
index 0000000000..293edad632
--- /dev/null
+++ b/t/db_dependent/Sitemapper.t
@@ -0,0 +1,193 @@
+#!/usr/bin/perl
+
+# Copyright 2015 Tamil s.a.r.l.
+#
+# This file is part of Koha.
+#
+# Koha is free software; you can redistribute it and/or modify it under the
+# terms of the GNU General Public License as published by the Free Software
+# Foundation; either version 3 of the License, or (at your option) any later
+# version.
+#
+# Koha is distributed in the hope that it will be useful, but WITHOUT ANY
+# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
+# A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License along
+# with Koha; if not, write to the Free Software Foundation, Inc.,
+# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+use Modern::Perl;
+use Test::MockModule;
+use File::Basename;
+use File::Path;
+use File::Spec;
+use DateTime;
+use Test::More tests => 14;
+
+
+BEGIN {
+    use_ok('Koha::Sitemapper');
+    use_ok('Koha::Sitemapper::Writer');
+}
+
+
+# Return the whole content of a file as a single string.
+sub slurp {
+    my $file = shift;
+    open my $fh, '<', $file or die "Cannot open $file: $!";
+    local $/ = undef;
+    my $cont = <$fh>;
+    close $fh;
+    return $cont;
+}
+
+
+# Return the number of lines of a file containing a <loc> element, i.e. the
+# number of URLs of a Sitemap file.
+sub count_urls {
+    my $file = shift;
+    open my $fh, '<', $file or die "Cannot open $file: $!";
+    my $count = 0;
+    while ( my $line = <$fh> ) {
+        $count++ if $line =~ /<loc>/;
+    }
+    close $fh;
+    return $count;
+}
+
+
+# Create 3 mocked datasets to be used by Koha::Sitemapper in place of DB
+# content: one is consumed by each run() below, in this order.
+my $module_context = Test::MockModule->new('C4::Context');
+$module_context->mock('_new_dbh', sub {
+    my $dbh = DBI->connect( 'DBI:Mock:', '', '' )
+    || die "Cannot create handle: $DBI::errstr\n";
+    return $dbh
+});
+my $dbh = C4::Context->dbh();
+my $two_bibs = [
+	[ qw/ biblionumber timestamp  / ],
+	[ qw/ 1234         2013-11-15 / ],
+	[ qw/ 9875         2015-08-31 / ],
+];
+my $lotof_bibs = [ [ qw/ biblionumber timestamp / ] ];
+push @$lotof_bibs, [ $_, '2015-08-31' ] for 1..75000;
+$dbh->{mock_add_resultset} = $two_bibs;
+$dbh->{mock_add_resultset} = $two_bibs;
+$dbh->{mock_add_resultset} = $lotof_bibs;
+
+my $dir = File::Spec->rel2abs( dirname(__FILE__) );
+
+# Create a sitemap for a catalog containing 2 biblios, with option 'long url'
+my $sitemapper = Koha::Sitemapper->new(
+    verbose => 0,
+    url     => 'http://www.mylibrary.org',
+    dir     => $dir,
+    short   => 0,
+);
+$sitemapper->run();
+
+my $file = "$dir/sitemapindex.xml";
+ok( -e "$dir/sitemapindex.xml", "File sitemapindex.xml created");
+my $file_content = slurp($file);
+my $now = DateTime->now->ymd;
+my $expected_content = <<EOS;
+<?xml version="1.0" encoding="UTF-8"?>
+
+<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
+  <sitemap>
+    <loc>http://www.mylibrary.org/sitemap0001.xml</loc>
+    <lastmod>$now</lastmod>
+  </sitemap>
+</sitemapindex>
+EOS
+# The index file has no trailing newline, unlike the heredoc.
+chop $expected_content;
+is( $file_content, $expected_content, "Its content is valid" );
+
+$file = "$dir/sitemap0001.xml";
+ok( -e $file, "File sitemap0001.xml created");
+$file_content = slurp($file);
+$expected_content = <<EOS;
+<?xml version="1.0" encoding="UTF-8"?>
+
+<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.sitemaps.org/schemas/sitemap/0.9 http://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd">
+  <url>
+    <loc>http://www.mylibrary.org/cgi-bin/koha/opac-detail.pl?biblionumber=1234</loc>
+    <lastmod>2013-11-15</lastmod>
+  </url>
+  <url>
+    <loc>http://www.mylibrary.org/cgi-bin/koha/opac-detail.pl?biblionumber=9875</loc>
+    <lastmod>2015-08-31</lastmod>
+  </url>
+</urlset>
+EOS
+is( $file_content, $expected_content, "Its content is valid" );
+
+
+# Create a sitemap for a catalog containing 2 biblios, with option 'short url'.
+# Test that 2 files are created.
+$sitemapper = Koha::Sitemapper->new(
+    verbose => 0,
+    url     => 'http://www.mylibrary.org',
+    dir     => $dir,
+    short   => 1,
+);
+$sitemapper->run();
+
+$file = "$dir/sitemap0001.xml";
+ok( -e $file, "File sitemap0001.xml with short URLs created");
+$file_content = slurp($file);
+$expected_content = <<EOS;
+<?xml version="1.0" encoding="UTF-8"?>
+
+<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.sitemaps.org/schemas/sitemap/0.9 http://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd">
+  <url>
+    <loc>http://www.mylibrary.org/bib/1234</loc>
+    <lastmod>2013-11-15</lastmod>
+  </url>
+  <url>
+    <loc>http://www.mylibrary.org/bib/9875</loc>
+    <lastmod>2015-08-31</lastmod>
+  </url>
+</urlset>
+EOS
+is( $file_content, $expected_content, "Its content is valid" );
+
+
+# Create a sitemap for a catalog containing 75000 biblios, with option 'short
+# url'. Test that 3 files are created: index file + 2 urls file with
+# respectively 50000 and 25000 urls.
+$sitemapper = Koha::Sitemapper->new(
+    verbose => 0,
+    url     => 'http://www.mylibrary.org',
+    dir     => $dir,
+    short   => 1,
+);
+$sitemapper->run();
+
+$file = "$dir/sitemapindex.xml";
+ok( -e "$dir/sitemapindex.xml", "File sitemapindex.xml for 75000 bibs created");
+$file_content = slurp($file);
+$expected_content = <<EOS;
+<?xml version="1.0" encoding="UTF-8"?>
+
+<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
+  <sitemap>
+    <loc>http://www.mylibrary.org/sitemap0001.xml</loc>
+    <lastmod>$now</lastmod>
+  </sitemap>
+  <sitemap>
+    <loc>http://www.mylibrary.org/sitemap0002.xml</loc>
+    <lastmod>$now</lastmod>
+  </sitemap>
+</sitemapindex>
+EOS
+# The index file has no trailing newline, unlike the heredoc.
+chop $expected_content;
+is( $file_content, $expected_content, "Its content is valid" );
+
+$file = "$dir/sitemap0001.xml";
+ok( -e $file, "File sitemap0001.xml created");
+is( count_urls($file), 50000, "It contains 50000 URLs");
+
+$file = "$dir/sitemap0002.xml";
+ok( -e $file, "File sitemap0002.xml created");
+is( count_urls($file), 25000, "It contains 25000 URLs");
+
+# Cleanup
+unlink "$dir/$_" for qw / sitemapindex.xml sitemap0001.xml sitemap0002.xml /;
-- 
2.39.5