From 6d44b0a91ad0245b45b367e543f097ba64856e5f Mon Sep 17 00:00:00 2001 From: =?utf8?q?Fr=C3=A9d=C3=A9ric=20Demians?= Date: Sun, 20 Oct 2013 08:44:55 +0200 Subject: [PATCH] Bug 11081: Port Koha::Contrib::Tamil indexer into Koha code base Add two scripts for indexing: koha-index and koha-index-daemon. Documentation: perldoc koha-index perldoc koha-index-daemon New dependencies: MooseX::Getopt MooseX::RW AnyEvent::Processor Available as a Debian package: libmoosex-getopt-perl Test plan: - Apply the patch - Install dependencies, if necessary - Reindex a whole biblio catalog: koha-index --verbose - Reindex a whole authorities catalog: koha-index --source authority --verbose - Modify 1 biblio record, and index it: koha-index --select queue - Modifiy 1 authority record, and index it: koha-index --select queue --source authority - Run indexing daemon, with a 15s timeout: koha-index-daemon --timeout 15 - Modify a biblio record: wait 15s, and see - Modify an authority record: wait 15s, and see Signed-off-by: Bernardo Gonzalez Kriegel Works well, followed test plan without problems One comment: perhaps an option for koha-index-daemon to work quietly, without printing information. No koha-qa errors, but new files don't have license, fixed in followup Signed-off-by: Kyle M Hall Signed-off-by: Brendan A Gallagher --- C4/Installer/PerlDependencies.pm | 40 +++++ Koha/Indexer/Daemon.pm | 116 ++++++++++++++ Koha/Indexer/Indexing.pm | 195 +++++++++++++++++++++++ Koha/Indexer/RecordReader.pm | 263 +++++++++++++++++++++++++++++++ Koha/Indexer/RecordWriter.pm | 59 +++++++ misc/bin/koha-index | 79 ++++++++++ misc/bin/koha-index-daemon | 49 ++++++ 7 files changed, 801 insertions(+) create mode 100644 Koha/Indexer/Daemon.pm create mode 100644 Koha/Indexer/Indexing.pm create mode 100644 Koha/Indexer/RecordReader.pm create mode 100644 Koha/Indexer/RecordWriter.pm create mode 100755 misc/bin/koha-index create mode 100755 misc/bin/koha-index-daemon diff --git a/C4/Installer/PerlDependencies.pm b/C4/Installer/PerlDependencies.pm index 1c650f01eb..d621f3377d 100644 --- a/C4/Installer/PerlDependencies.pm +++ b/C4/Installer/PerlDependencies.pm @@ -552,6 +552,36 @@ our $PERL_DEPS = { 'required' => '1', 'min_ver' => '0.02', }, + 'MooseX::Storage' => { + 'usage' => 'Core', + 'required' => '0', + 'min_ver' => '0.30', + }, + 'MooseX::Types' => { + 'usage' => 'Core', + 'required' => '0', + 'min_ver' => '0.30', + }, + 'MooseX::Getopt' => { + 'usage' => 'Command line scripts', + 'required' => '0', + 'min_ver' => '0.46', + }, + 'MooseX::RW' => { + 'usage' => 'Command line scripts', + 'required' => '0', + 'min_ver' => '0.003', + }, + 'String::RewritePrefix' => { + 'usage' => 'Core', + 'required' => '0', + 'min_ver' => '0.006', + }, + 'Time::Progress' => { + 'usage' => 'Core', + 'required' => '0', + 'min_ver' => '1.7', + }, 'DBD::Mock' => { 'usage' => 'Core', 'required' => '1', @@ -612,6 +642,16 @@ our $PERL_DEPS = { 'required' => '0', 'min_ver' => '1', }, + 'AnyEvent::Processor' => { + 'usage' => 'Command line scripts', + 'required' => '0', + 'min_ver' => '0.003', + }, + 'Moose' => { + 'usage' => 'Core', + 'required' => '0', + 'min_ver' => '1.09', + }, 'String::Random' => { 'usage' => 'OpacSelfRegistration', 'required' => '1', diff --git a/Koha/Indexer/Daemon.pm b/Koha/Indexer/Daemon.pm new file mode 100644 index 0000000000..18519695da --- /dev/null +++ b/Koha/Indexer/Daemon.pm @@ -0,0 +1,116 @@ +package Koha::Indexer::Daemon; + +use Moose; + +use Modern::Perl; +use utf8; +use AnyEvent; +use Koha::Indexer::Indexing; +use C4::Context; + +with 'MooseX::Getopt'; + + +has name => ( is => 'rw', isa => 'Str' ); + +has directory => ( is => 'rw', isa => 'Str' ); + +has timeout => ( + is => 'rw', + isa => 'Int', + default => 60, +); + +has verbose => ( is => 'rw', isa => 'Bool', default => 0 ); + + +sub BUILD { + my $self = shift; + + say "Starting Koha Indexer Daemon"; + + $self->name( C4::Context->config('database') ); + + my $idle = AnyEvent->timer( + after => $self->timeout, + interval => $self->timeout, + cb => sub { $self->index_zebraqueue(); } + ); + AnyEvent->condvar->recv; +} + + +sub index_zebraqueue { + my $self = shift; + + my $dbh = C4::Context->dbh(); + my $sql = " SELECT COUNT(*), server + FROM zebraqueue + WHERE done = 0 + GROUP BY server "; + my $sth = $dbh->prepare($sql); + $sth->execute(); + my %count = ( biblio => 0, authority => 0 ); + while ( my ($count, $server) = $sth->fetchrow ) { + $server =~ s/server//g; + $count{$server} = $count; + } + + say "[", $self->name, "] Index biblio (", $count{biblio}, ") authority (", + $count{authority}, ")"; + + for my $source (qw/biblio authority/) { + next unless $count{$source}; + my $indexer = Koha::Indexer::Indexing->new( + source => $source, + select => 'queue', + blocking => 1, + keep => 1, + verbose => $self->verbose, + ); + $indexer->directory($self->directory) if $self->directory; + $indexer->run(); + } +} + +no Moose; +__PACKAGE__->meta->make_immutable; +1; + +__END__ +=pod + +=head1 SYNOPSIS + + # Index Koha queued biblio/authority records every minute. + # KOHA_CONF environment variable is used to find which Koha + # instance to use. + # Records are exported from Koha DB into files located in + # the current directory + my $daemon = Koha::Indexer::Daemon->new(); + + my $daemon = Koha::Indexer::Daemon->new( + timeout => 20, + directory => '/home/koha/mylib/tmp', + verbose => 1 ); + +=head1 Attributes + +=over + +=item directory($directory_name) + +Location of the directory where to export biblio/authority records before +sending them to Zebra indexer. + +=item timeout($seconds) + +Number of seconds between indexing. + +=item verbose(0|1) + +Task verbosity. + +=back + +=cut diff --git a/Koha/Indexer/Indexing.pm b/Koha/Indexer/Indexing.pm new file mode 100644 index 0000000000..61f8fab56d --- /dev/null +++ b/Koha/Indexer/Indexing.pm @@ -0,0 +1,195 @@ +package Koha::Indexer::Indexing; + +use Moose; + +use Modern::Perl; +use utf8; +use Carp; +use Koha::Indexer::RecordReader; +use Koha::Indexer::RecordWriter; +use AnyEvent::Processor::Conversion; +use File::Path; +use IO::File; +use C4::Context; + + +with 'MooseX::Getopt'; + + +has source => ( + is => 'rw', + isa => 'Koha::RecordType', + default => 'biblio' +); + +has select => ( + is => 'rw', + isa => 'Koha::RecordSelect', + required => 1, + default => 'all', +); + +has directory => ( + is => 'rw', + isa => 'Str', + default => './koha-index', +); + +has keep => ( is => 'rw', isa => 'Bool', default => 0 ); + +has verbose => ( is => 'rw', isa => 'Bool', default => 0 ); + +has help => ( + is => 'rw', + isa => 'Bool', + default => 0, + traits => [ 'NoGetopt' ], +); + +has blocking => ( + is => 'rw', + isa => 'Bool', + default => 0, + traits => [ 'NoGetopt' ], +); + + +sub run { + my $self = shift; + + # Is it a full indexing of all Koha DB records? + my $is_full_indexing = $self->select =~ /all/i; + + # Is it biblio indexing (if not it's authority) + my $is_biblio_indexing = $self->source =~ /biblio/i; + + # STEP 1: All biblio records are exported in a directory + + unless ( -d $self->directory ) { + mkdir $self->directory + or die "Unable to create directory: " . $self->directory; + } + my $from_dir = $self->directory . "/" . $self->source; + mkdir $from_dir; + for my $dir ( ( "$from_dir/update", "$from_dir/delete") ) { + rmtree( $dir ) if -d $dir; + mkdir $dir; + } + + # DOM indexing? otherwise GRS-1 + my $is_dom = $self->source eq 'biblio' + ? 'zebra_bib_index_mode' + : 'zebra_auth_index_mode'; + $is_dom = C4::Context->config($is_dom) || ''; + $is_dom = $is_dom =~ /dom/i ? 1 : 0; + + # STEP 1.1: Records to update + say "Exporting records to update" if $self->verbose; + my $exporter = AnyEvent::Processor::Conversion->new( + reader => Koha::Indexer::RecordReader->new( + source => $self->source, + select => $is_full_indexing ? 'all' : 'queue_update', + xml => '1' + ), + writer => Koha::Indexer::RecordWriter->new( + fh => IO::File->new( "$from_dir/update/records", '>:encoding(utf8)' ), + valid => $is_dom ), + blocking => $self->blocking, + verbose => $self->verbose, + ); + $exporter->run(); + + # STEP 1.2: Record to delete, if zebraqueue + if ( ! $is_full_indexing ) { + say "Exporting records to delete" if $self->verbose; + $exporter = AnyEvent::Processor::Conversion->new( + reader => Koha::Indexer::RecordReader->new( + source => $self->source, + select => 'queue_delete', + xml => '1' + ), + writer => Koha::Indexer::RecordWriter->new( + fh => IO::File->new( "$from_dir/delete/records", '>:encoding(utf8)' ), + valid => $is_dom ), + blocking => $self->blocking, + verbose => $self->verbose, + ); + $exporter->run(); + } + + # STEP 2: Run zebraidx + + my $cmd; + my $zconfig = C4::Context->zebraconfig( + $is_biblio_indexing ? 'biblioserver' : 'authorityserver')->{config}; + my $db_name = $is_biblio_indexing ? 'biblios' : 'authorities'; + my $cmd_base = "zebraidx -c " . $zconfig; + $cmd_base .= " -n" if $is_full_indexing; # No shadow: no indexing daemon + $cmd_base .= $self->verbose ? " -v warning,log" : " -v none"; + $cmd_base .= " -g marcxml"; + $cmd_base .= " -d $db_name"; + + if ( $is_full_indexing ) { + $cmd = "$cmd_base init"; + say $cmd if $self->verbose; + system( $cmd ); + } + + $cmd = "$cmd_base update $from_dir/update"; + say $cmd if $self->verbose; + system( $cmd ); + + if ( ! $is_full_indexing ) { + $cmd = "$cmd_base adelete $from_dir/delete"; + say $cmd if $self->verbose; + system( $cmd ); + my $cmd = "$cmd_base commit"; + say $cmd if $self->verbose; + system( $cmd ); + } + + rmtree( $self->directory ) unless $self->keep; +} + + +no Moose; +__PACKAGE__->meta->make_immutable; + +__END__ +=pod + +=head1 SYNOPSIS + + my $indexer = Koha::Indexer->new( + source => 'biblio', + select => 'queue' + ); + $indexer->run(); + + my $indexer = Koha::Indexer->new( + source => 'authority', + select => 'all', + directory => '/tmp', + verbose => 1, + ); + $indexer->run(); + +=head1 DESCRIPTION + +Indexes Koha biblio/authority records, full indexing or queued record indexing. + + +=head1 Methods + +=over + +=item run + +Runs the indexing task. + +=back + + +=cut + +1; diff --git a/Koha/Indexer/RecordReader.pm b/Koha/Indexer/RecordReader.pm new file mode 100644 index 0000000000..4a423113f3 --- /dev/null +++ b/Koha/Indexer/RecordReader.pm @@ -0,0 +1,263 @@ +package Koha::Indexer::RecordReader; + +use Moose; + +with 'MooseX::RW::Reader'; + + +use Modern::Perl; +use utf8; +use Moose::Util::TypeConstraints; +use MARC::Record; +use MARC::File::XML; +use C4::Context; +use C4::Biblio; +use C4::Items; + + +subtype 'Koha::RecordType' + => as 'Str', + => where { /biblio|authority/i }, + => message { "$_ is not a valid Koha::RecordType (biblio or authority" }; + +subtype 'Koha::RecordSelect' + => as 'Str', + => where { /all|queue|queue_update|queue_delete/ }, + => message { + "$_ is not a valide Koha::RecordSelect " . + "(all or queue or queue_update or queue_delete)" + }; + + +has source => ( + is => 'rw', + isa => 'Koha::RecordType', + required => 1, + default => 'biblio', +); + +has select => ( + is => 'rw', + isa => 'Koha::RecordSelect', + required => 1, + default => 'all', +); + +has xml => ( is => 'rw', isa => 'Bool', default => '0' ); + +has sth => ( is => 'rw' ); + +# Last returned record biblionumber; +has id => ( is => 'rw' ); + +# Biblio records normalizer, if necessary +has normalizer => ( is => 'rw' ); + +# Read all records? (or queued records) +has allrecords => ( is => 'rw', isa => 'Bool', default => 1 ); + +# Mark as done an entry is Zebra queue +has sth_queue_done => ( is => 'rw' ); + +# Items tag +has itemtag => ( is => 'rw' ); + +# Las returned record frameworkcode +# FIXME: a KohaRecord class should contain this information +has frameworkcode => ( is => 'rw', isa => 'Str' ); + + +sub BUILD { + my $self = shift; + my $dbh = C4::Context->dbh(); + + # Tag containing items + my ( $itemtag, $itemsubfield ) = GetMarcFromKohaField("items.itemnumber",''); + $self->itemtag($itemtag); + + if ( $self->source =~ /biblio/i && + C4::Context->preference('IncludeSeeFromInSearches') ) + { + require Koha::RecordProcessor; + my $normalizer = Koha::RecordProcessor->new( { filters => 'EmbedSeeFromHeadings' } ); + $self->normalizer($normalizer); + # Necessary for as_xml method + MARC::File::XML->default_record_format( C4::Context->preference('marcflavour') ); + } + + my $operation = $self->select =~ /update/i + ? 'specialUpdate' + : 'recordDelete'; + $self->allrecords( $self->select =~ /all/i ? 1 : 0 ); + my $sql = + $self->source =~ /biblio/i + ? $self->allrecords + ? "SELECT NULL, biblionumber FROM biblio" + : "SELECT id, biblio_auth_number FROM zebraqueue + WHERE server = 'biblioserver' + AND operation = '$operation' AND done = 0" + : $self->allrecords + ? "SELECT NULL, authid FROM auth_header" + : "SELECT id, biblio_auth_number FROM zebraqueue + WHERE server = 'authorityserver' + AND operation = '$operation' AND done = 0"; + my $sth = $dbh->prepare( $sql ); + $sth->execute(); + $self->sth( $sth ); + + unless ( $self->allrecords ) { + $self->sth_queue_done( $dbh->prepare( + "UPDATE zebraqueue SET done=1 WHERE id=?" ) ); + } + + __PACKAGE__->meta->add_method( 'get' => + $self->source =~ /biblio/i + ? $self->xml && !$self->normalizer + ? \&get_biblio_xml + : \&get_biblio_marc + : $self->xml + ? \&get_auth_xml + : \&get_auth_marc + ); +} + + + +sub read { + my $self = shift; + while ( my ($queue_id, $id) = $self->sth->fetchrow ) { + # Suppress entry in zebraqueue table + $self->sth_queue_done->execute($queue_id) if $queue_id; + if ( my $record = $self->get( $id ) ) { + $record = $self->normalizer->process($record) if $self->normalizer; + $self->count($self->count+1); + $self->id( $id ); + return $record; + } + } + return 0; +} + + + +sub get_biblio_xml { + my ( $self, $id ) = @_; + my$dbh = C4::Context->dbh(); + my $sth = $dbh->prepare( + "SELECT marcxml FROM biblioitems WHERE biblionumber=? "); + $sth->execute( $id ); + my ($marcxml) = $sth->fetchrow; + + # If biblio isn't found in biblioitems, it is searched in + # deletedbilioitems. Usefull for delete Zebra requests + unless ( $marcxml ) { + $sth = $dbh->prepare( + "SELECT marcxml FROM deletedbiblioitems WHERE biblionumber=? "); + $sth->execute( $id ); + ($marcxml) = $sth->fetchrow; + } + + # Items extraction + # FIXME: It slows down drastically biblio records export + { + my @items = @{ $dbh->selectall_arrayref( + "SELECT * FROM items WHERE biblionumber=$id", + {Slice => {} } ) }; + if (@items){ + my $record = MARC::Record->new; + $record->encoding('UTF-8'); + my @itemsrecord; + foreach my $item (@items) { + my $record = Item2Marc($item, $id); + push @itemsrecord, $record->field($self->itemtag); + } + $record->insert_fields_ordered(@itemsrecord); + my $itemsxml = $record->as_xml_record(); + $marcxml = + substr($marcxml, 0, length($marcxml)-10) . + substr($itemsxml, index($itemsxml, "\n", 0) + 10); + } + } + return $marcxml; +} + + +# Get biblio record, if the record doesn't exist in biblioitems, it is searched +# in deletedbiblioitems. +sub get_biblio_marc { + my ( $self, $id ) = @_; + + my $dbh = C4::Context->dbh(); + my $sth = $dbh->prepare( + "SELECT marcxml FROM biblioitems WHERE biblionumber=? "); + $sth->execute( $id ); + my ($marcxml) = $sth->fetchrow; + + unless ( $marcxml ) { + $sth = $dbh->prepare( + "SELECT marcxml FROM deletedbiblioitems WHERE biblionumber=? "); + $sth->execute( $id ); + ($marcxml) = $sth->fetchrow; + } + + $marcxml =~ s/[^\x09\x0A\x0D\x{0020}-\x{D7FF}\x{E000}-\x{FFFD}\x{10000}-\x{10FFFF}]//g; + my $record = MARC::Record->new(); + if ($marcxml) { + $record = eval { + MARC::Record::new_from_xml( $marcxml, "utf8" ) }; + if ($@) { warn " problem with: $id : $@ \n$marcxml"; } + + # Items extraction if Koha v3.4 and above + # FIXME: It slows down drastically biblio records export + if ( $self->itemsextraction ) { + my @items = @{ $dbh->selectall_arrayref( + "SELECT * FROM items WHERE biblionumber=$id", + {Slice => {} } ) }; + if (@items){ + my @itemsrecord; + foreach my $item (@items) { + my $record = Item2Marc($item, $id); + push @itemsrecord, $record->field($self->itemtag); + } + $record->insert_fields_ordered(@itemsrecord); + } + } + return $record; + } + return; +} + + +sub get_auth_xml { + my ( $self, $id ) = @_; + + my $dbh = C4::Context->dbh(); + my $sth = $dbh->prepare( + "select marcxml from auth_header where authid=? " ); + $sth->execute( $id ); + my ($xml) = $sth->fetchrow; + + # If authority isn't found we build a mimimalist record + # Usefull for delete Zebra requests + unless ( $xml ) { + return + " + + $id + \n"; + } + + my $new_xml = ''; + foreach ( split /\n/, $xml ) { + next if /^ no +has valid => ( + is => 'rw', + isa => 'Bool', + default => 0, +); + + +sub begin { + my $self = shift; + if ( $self->valid ) { + my $fh = $self->fh; + print $fh '', "\n", '', "\n"; + } +} + + +sub end { + my $self = shift; + my $fh = $self->fh; + if ( $self->valid ) { + print $fh '', "\n"; + } + $fh->flush(); +} + + + +# +# Sent record is rather a MARC::Record object or an marcxml string +# +sub write { + my ($self, $record) = @_; + + $self->count( $self->count + 1 ); + + my $fh = $self->fh; + my $xml = ref($record) eq 'MARC::Record' + ? $record->as_xml_record() : $record; + $xml =~ s/<\?xml version="1.0" encoding="UTF-8"\?>\n//g if $self->valid; + print $fh $xml; +} + +__PACKAGE__->meta->make_immutable; + +1; diff --git a/misc/bin/koha-index b/misc/bin/koha-index new file mode 100755 index 0000000000..d578ab3a6d --- /dev/null +++ b/misc/bin/koha-index @@ -0,0 +1,79 @@ +#!/usr/bin/perl + +package Main; + +use Modern::Perl; +use utf8; +use Koha::Indexer::Indexing; +use Pod::Usage; + +my $indexer = Koha::Indexer::Indexing->new_with_options(); +if ( $indexer->help ) { + pod2usage( -verbose => 99 ); + exit; +} +$indexer->run(); + + +__END__ + +=pod + +=head1 SYNOPSIS + + koha-index + + koha-index --verbose + + koha-index --source biblio --select queue + + koha-index --source authority --select all + + koha-index --select queue --directory /tmp/koha-index-mylib --keep + +=head1 DESCRIPTION + +Index queued biblio/autority record, or reindex the whole DB. Koha standard +environment variables must ne set appropriately: KOHA_CONF and PERL5LIB. + +=head1 OPTIONS + +=over + +=item --source + +Select records to be indexed: C or C. If not specified, +biblio by default. + +=item --select + +Select record to be indexed: C or C. If not specified, C is +selected. If C is selected, zebra database is reset before indexing. + +=item --directory + +Directory where records to be indexed by Zebra are exported. If not specified, +a direcory named C is used, and if necessary created, in the +current directory. In this directory, sub-directories are created containing +records to be updated or deleted by Zebra. If those subdirectories already +exist, they are first emptied. The export directory tree is kept after zebra +indexing. + +=item --keep + +Keep the directory, and its content, where biblio/authority records have been +exported. + +=item --verbose + +Increase the amount of logging. Normally only warnings and errors from the +indexing are shown. + +=back + +=head1 SEE ALSO + +=for :list +* L + +=cut diff --git a/misc/bin/koha-index-daemon b/misc/bin/koha-index-daemon new file mode 100755 index 0000000000..5ff19bc179 --- /dev/null +++ b/misc/bin/koha-index-daemon @@ -0,0 +1,49 @@ +#!/usr/bin/perl + +package Main; + +use Modern::Perl; +use utf8; +use Koha::Indexer::Daemon; +use Pod::Usage; + +Koha::Indexer::Daemon->new_with_options(); + +__END__ + +=pod + +=head1 SYNOPSIS + + koha-index-daemon + + koha-index-daemon --timeout 60 + + koha-index-daemon --timeout 60 --directory /home/mylib/tmp + +=head1 DESCRIPTION + +Examine periodicaly zebraqueue table from a Koha instance and index +bilbio/authority records. + +=head1 OPTIONS + +=over + +=item --timeout + +Specify the daemon timeout in seconds. + +=item --directory + +Directory where to write record exported from Koha DB before sending them to +Zebra. Subdirectories are created. + +=back + +=head1 SEE ALSO + +=for :list +* L + +=cut -- 2.39.5