Koha/misc/workers/es_indexer_daemon.pl
Marcel de Rooy 17032f89a3 Bug 35819: (QA follow-up) Prevent warning on uninitialized retries count
Signed-off-by: Marcel de Rooy <m.de.rooy@rijksmuseum.nl>
Signed-off-by: Katrin Fischer <katrin.fischer@bsz-bw.de>
(cherry picked from commit c86e6ee82d)
Signed-off-by: Fridolin Somers <fridolin.somers@biblibre.com>
2024-03-11 10:52:34 +01:00

240 lines
6.9 KiB
Perl
Executable file

#!/usr/bin/perl
# This file is part of Koha.
#
# Koha is free software; you can redistribute it and/or modify it
# under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
#
# Koha is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Koha; if not, see <http://www.gnu.org/licenses>.
=head1 NAME
es_indexer_daemon.pl - Worker script that will process background Elasticsearch jobs
=head1 SYNOPSIS
./es_indexer_daemon.pl --batch_size=X
Options:
-b --batch_size how many jobs to commit (default: 10)
--help brief help message
=head1 OPTIONS
=over 8
=item B<--help>
Print a brief help message and exits.
=item B<--batch_size>
How many jobs to commit per batch. Defaults to 10, will commit after .1 seconds if no more jobs incoming.
=back
=head1 DESCRIPTION
This script will connect to the Stomp server (RabbitMQ) and subscribe to the Elasticsearch queue, processing batches every second.
If a Stomp server is not active it will poll the database every 10s for new jobs in the Elasticsearch queue
and process them in batches every second.
=cut
use Modern::Perl;
use JSON qw( decode_json );
use Try::Tiny;
use Pod::Usage;
use Getopt::Long;
use List::MoreUtils qw( natatime );
use Time::HiRes;
use C4::Context;
use Koha::Logger;
use Koha::BackgroundJobs;
use Koha::SearchEngine;
use Koha::SearchEngine::Indexer;
my ( $help, $batch_size );
my $not_found_retries = {};
my $max_retries = $ENV{MAX_RETRIES} || 10;
GetOptions(
'h|help' => \$help,
'b|batch_size=s' => \$batch_size
) || pod2usage(1);
pod2usage(0) if $help;
$batch_size //= 10;
warn "Not using Elasticsearch" unless C4::Context->preference('SearchEngine') eq 'Elasticsearch';
my $logger = Koha::Logger->get({ interface => 'worker' });
my $conn;
try {
$conn = Koha::BackgroundJob->connect;
} catch {
warn sprintf "Cannot connect to the message broker, the jobs will be processed anyway (%s)", $_;
};
if ( $conn ) {
# FIXME cf note in Koha::BackgroundJob about $namespace
my $namespace = C4::Context->config('memcached_namespace');
$conn->subscribe(
{
destination => sprintf( "/queue/%s-%s", $namespace, 'elastic_index' ),
ack => 'client',
'prefetch-count' => 1,
}
);
}
my $biblio_indexer = Koha::SearchEngine::Indexer->new( { index => $Koha::SearchEngine::BIBLIOS_INDEX } );
my $auth_indexer = Koha::SearchEngine::Indexer->new( { index => $Koha::SearchEngine::AUTHORITIES_INDEX } );
my $config = $biblio_indexer->get_elasticsearch_params;
my $at_a_time = $config->{chunk_size} // 5000;
my @jobs = ();
while (1) {
if ( $conn ) {
my $frame = $conn->receive_frame;
if ( !defined $frame ) {
# maybe log connection problems
next; # will reconnect automatically
}
my $args = try {
my $body = $frame->body;
decode_json($body); # TODO Should this be from_json? Check utf8 flag.
} catch {
Koha::Logger->get({ interface => 'worker' })->warn(sprintf "Frame not processed - %s", $_);
return;
};
unless ( $args ) {
Koha::Logger->get({ interface => 'worker' })->warn(sprintf "Frame does not have correct args, ignoring it");
$conn->nack( { frame => $frame, requeue => 'false' } );
next;
}
my $job = Koha::BackgroundJobs->find( $args->{job_id} );
if ( $job && $job->status ne 'new' ) {
Koha::Logger->get( { interface => 'worker' } )
->warn( sprintf "Job %s has wrong status %s", $args->{job_id}, $job->status );
# nack without requeue, we do not want to process this frame again
$conn->nack( { frame => $frame, requeue => 'false' } );
next;
}
unless ($job) {
$not_found_retries->{ $args->{job_id} } //= 0;
if ( ++$not_found_retries->{ $args->{job_id} } >= $max_retries ) {
Koha::Logger->get( { interface => 'worker' } )
->warn( sprintf "Job %s not found, no more retry", $args->{job_id} );
# nack without requeue, we do not want to process this frame again
$conn->nack( { frame => $frame, requeue => 'false' } );
next;
}
Koha::Logger->get( { interface => 'worker' } )
->debug( sprintf "Job %s not found, will retry later", $args->{job_id} );
# nack to force requeue
$conn->nack( { frame => $frame, requeue => 'true' } );
Time::HiRes::sleep(0.5);
next;
}
$conn->ack( { frame => $frame } );
push @jobs, $job;
if ( @jobs >= $batch_size || !$conn->can_read( { timeout => '0.1' } ) ) {
commit(@jobs);
@jobs = ();
}
} else {
@jobs = Koha::BackgroundJobs->search(
{ status => 'new', queue => 'elastic_index' } )->as_list;
commit(@jobs);
@jobs = ();
sleep 10;
}
}
$conn->disconnect;
sub commit {
my (@jobs) = @_;
my @bib_records;
my @auth_records;
my $jobs = Koha::BackgroundJobs->search( { id => [ map { $_->id } @jobs ] });
# Start
$jobs->update({
progress => 0,
status => 'started',
started_on => \'NOW()',
});
for my $job (@jobs) {
my $args = try {
$job->json->decode( $job->data );
} catch {
$logger->warn( sprintf "Cannot decode data for job id=%s", $job->id );
$job->status('failed')->store;
return;
};
next unless $args;
if ( $args->{record_server} eq 'biblioserver' ) {
push @bib_records, @{ $args->{record_ids} };
} else {
push @auth_records, @{ $args->{record_ids} };
}
}
if (@auth_records) {
my $auth_chunks = natatime $at_a_time, @auth_records;
while ( ( my @auth_chunk = $auth_chunks->() ) ) {
try {
$auth_indexer->update_index( \@auth_chunk );
} catch {
$logger->warn( sprintf "Update of elastic index failed with: %s", $_ );
};
}
}
if (@bib_records) {
my $biblio_chunks = natatime $at_a_time, @bib_records;
while ( ( my @bib_chunk = $biblio_chunks->() ) ) {
try {
$biblio_indexer->update_index( \@bib_chunk );
} catch {
$logger->warn( sprintf "Update of elastic index failed with: %s", $_ );
};
}
}
# Finish
$jobs->update({
progress => 1,
status => 'finished',
ended_on => \'NOW()',
});
}