Koha/misc/migration_tools/rebuild_zebra.pl

#!/usr/bin/perl

# This file is part of Koha.
#
# Koha is free software; you can redistribute it and/or modify it
# under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
#
# Koha is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Koha; if not, see <http://www.gnu.org/licenses>.

use Modern::Perl;

use Koha::Script;
use C4::Context;
use Getopt::Long qw( GetOptions );
use Fcntl qw( LOCK_EX LOCK_NB LOCK_UN );
use File::Temp qw( tempdir );
use File::Path qw( mkpath rmtree );
use C4::Biblio qw( GetXmlBiblio );
use C4::AuthoritiesMarc qw( GetAuthority GetAuthorityXML );
use C4::Items qw( Item2Marc );
use Koha::RecordProcessor;
use Koha::Caches;
use XML::LibXML;

use constant LOCK_FILENAME => 'rebuild..LCK';

# script that checks zebradir structure & create directories & mandatory files if needed
#
#

$|=1; # flushes output
# If the cron job starts us in an unreadable dir, we will break without
# this.
chdir $ENV{HOME} if (!(-r '.'));
my $daemon_mode;
my $daemon_sleep = 5;
my $directory;
my $nosanitize;
my $skip_export;
my $keep_export;
my $skip_index;
my $reset;
my $biblios;
my $authorities;
my $as_xml;
my $noshadow;
my $want_help;
my $process_zebraqueue;
my $process_zebraqueue_skip_deletes;
my $do_not_clear_zebraqueue;
my $length;
my $where;
my $offset;
my $run_as_root;
my $run_user = (getpwuid($<))[0];
my $wait_for_lock = 0;
my $use_flock;
my $table = 'biblioitems';
my $is_memcached = Koha::Caches->get_instance->memcached_cache;

my $verbose_logging = 0;
my $zebraidx_log_opt = " -v none,fatal,warn ";
my $result = GetOptions(
    'daemon'        => \$daemon_mode,
    'sleep:i'       => \$daemon_sleep,
    'd:s'           => \$directory,
    'r|reset'       => \$reset,
    's'             => \$skip_export,
    'k'             => \$keep_export,
    'I|skip-index'  => \$skip_index,
    'nosanitize'    => \$nosanitize,
    'b'             => \$biblios,
    'w'             => \$noshadow,
    'a'             => \$authorities,
    'h|help'        => \$want_help,
    'x'             => \$as_xml,
    'y'             => \$do_not_clear_zebraqueue,
    'z'             => \$process_zebraqueue,
    'skip-deletes'  => \$process_zebraqueue_skip_deletes,
    'where:s'       => \$where,
    'length:i'      => \$length,
    'offset:i'      => \$offset,
    'v+'            => \$verbose_logging,
    'run-as-root'   => \$run_as_root,
    'wait-for-lock' => \$wait_for_lock,
    't|table:s'     => \$table,
);

if (not $result or $want_help) {
    print_usage();
    exit 0;
}

if ( $as_xml ) {
    warn "Warning: You passed -x which is already the default and is now deprecated\n";
    undef $as_xml; # Should not be used later
}

if( not defined $run_as_root and $run_user eq 'root') {
    my $msg = "Warning: You are running this script as the user 'root'.\n";
    $msg   .= "If this is intentional you must explicitly specify this using the -run-as-root switch\n";
    $msg   .= "Please do '$0 --help' to see usage.\n";
    die $msg;
}

if ($process_zebraqueue and ($skip_export or $reset)) {
    my $msg = "Cannot specify -r or -s if -z is specified\n";
    $msg   .= "Please do '$0 --help' to see usage.\n";
    die $msg;
}

if ($process_zebraqueue and $do_not_clear_zebraqueue) {
    my $msg = "Cannot specify both -y and -z\n";
    $msg   .= "Please do '$0 --help' to see usage.\n";
    die $msg;
}

if ($daemon_mode) {
    # incompatible flags handled above: help, reset, and do_not_clear_zebraqueue
    if ($skip_export or $keep_export or $skip_index or
          $where or $length or $offset) {
        my $msg = "Cannot specify -s, -k, -I, -where, -length, or -offset with -daemon.\n";
        $msg   .= "Please do '$0 --help' to see usage.\n";
        die $msg;
    }
    unless ($is_memcached) {
        warn "Warning: script running in daemon mode, without recommended caching system (memcached).\n";
    }
    $authorities = 1;
    $biblios = 1;
    $process_zebraqueue = 1;
}

if (not $biblios and not $authorities) {
    my $msg = "Must specify -b or -a to reindex bibs or authorities\n";
    $msg   .= "Please do '$0 --help' to see usage.\n";
    die $msg;
}

our @tables_allowed_for_select = ( 'biblioitems', 'items', 'biblio', 'biblio_metadata' );
unless ( grep { $_ eq $table } @tables_allowed_for_select ) {
    die "Cannot specify -t|--table with value '$table'. Only "
      . ( join ', ', @tables_allowed_for_select )
      . " are allowed.";
}


#  -v is for verbose, which seems backwards here because of how logging is set
#    on the CLI of zebraidx.  It works this way.  The default is to not log much
if ($verbose_logging >= 2) {
    $zebraidx_log_opt = '-v none,fatal,warn,all';
}

my $use_tempdir = 0;
unless ($directory) {
    $use_tempdir = 1;
    $directory = tempdir(CLEANUP => ($keep_export ? 0 : 1));
}


my $biblioserverdir = C4::Context->zebraconfig('biblioserver')->{directory};
my $authorityserverdir = C4::Context->zebraconfig('authorityserver')->{directory};

my $kohadir = C4::Context->config('intranetdir');

my ($biblionumbertagfield,$biblionumbertagsubfield) = C4::Biblio::GetMarcFromKohaField( "biblio.biblionumber" );
my ($biblioitemnumbertagfield,$biblioitemnumbertagsubfield) = C4::Biblio::GetMarcFromKohaField( "biblioitems.biblioitemnumber" );

my $marcxml_open = q{<?xml version="1.0" encoding="UTF-8"?>
<collection xmlns="http://www.loc.gov/MARC21/slim">
};

my $marcxml_close = q{
</collection>
};

# Protect again simultaneous update of the zebra index by using a lock file.
# Create our own lock directory if it is missing. This should be created
# by koha-zebra-ctl.sh or at system installation. If the desired directory
# does not exist and cannot be created, we fall back on /tmp - which will
# always work.

my ($lockfile, $LockFH);
foreach (
    C4::Context->config("zebra_lockdir"),
    '/var/lock/zebra_' . C4::Context->config('database'),
    '/tmp/zebra_' . C4::Context->config('database')
) {
    #we try three possibilities (we really want to lock :)
    next if !$_;
    ($LockFH, $lockfile) = _create_lockfile($_.'/rebuild');
    last if defined $LockFH;
}
if( !defined $LockFH ) {
    print "WARNING: Could not create lock file $lockfile: $!\n";
    print "Please check your koha-conf.xml for ZEBRA_LOCKDIR.\n";
    print "Verify file permissions for it too.\n";
    $use_flock = 0; # we disable file locking now and will continue
                    # without it
                    # note that this mimics old behavior (before we used
                    # the lockfile)
};

my $start_time = time();
if ( $verbose_logging ) {
    my $pretty_time = POSIX::strftime("%H:%M:%S",localtime($start_time));
    print "Zebra configuration information\n";
    print "================================\n";
    print "Zebra biblio directory      = $biblioserverdir\n";
    print "Zebra authorities directory = $authorityserverdir\n";
    print "Koha directory              = $kohadir\n";
    print "Lockfile                    = $lockfile\n" if $lockfile;
    print "BIBLIONUMBER in :     $biblionumbertagfield\$$biblionumbertagsubfield\n";
    print "BIBLIOITEMNUMBER in : $biblioitemnumbertagfield\$$biblioitemnumbertagsubfield\n";
    print "================================\n";
    print "Job started: $pretty_time\n";
}

my $tester = XML::LibXML->new();
my $dbh;

# The main work is done here by calling do_one_pass().  We have added locking
# avoid race conditions between full rebuilds and incremental updates either from
# daemon mode or periodic invocation from cron.  The race can lead to an updated
# record being overwritten by a rebuild if the update is applied after the export
# by the rebuild and before the rebuild finishes (more likely to affect large
# catalogs).
#
# We have chosen to exit immediately by default if we cannot obtain the lock
# to prevent the potential for a infinite backlog from cron invocations, but an
# option (wait-for-lock) is provided to let the program wait for the lock.
# See http://bugs.koha-community.org/bugzilla3/show_bug.cgi?id=11078 for details.
if ($daemon_mode) {
    while (1) {
        # For incremental updates, skip the update if the updates are locked
        if (_flock($LockFH, LOCK_EX|LOCK_NB)) {
            eval {
                $dbh = C4::Context->dbh;
                if( zebraqueue_not_empty() ) {
                    Koha::Caches->flush_L1_caches() if $is_memcached;
                    do_one_pass();
                }
            };
            if ($@ && $verbose_logging) {
                warn "Warning : $@\n";
            }
            _flock($LockFH, LOCK_UN);
        }
        sleep $daemon_sleep;
    }
} else {
    # all one-off invocations
    my $lock_mode = ($wait_for_lock) ? LOCK_EX : LOCK_EX|LOCK_NB;
    if (_flock($LockFH, $lock_mode)) {
        $dbh = C4::Context->dbh;
        do_one_pass();
        _flock($LockFH, LOCK_UN);
    } else {
        print "Skipping rebuild/update because flock failed on $lockfile: $!\n";
    }
}


if ( $verbose_logging ) {
    print "====================\n";
    print "Indexing complete: ". pretty_time() . "\n";
    print "====================\n";
    print "CLEANING\n";
    print "====================\n";
}
if ($keep_export) {
    print "NOTHING cleaned : the export $directory has been kept.\n";
    print "You can re-run this script with the -s ";
    if ($use_tempdir) {
        print " and -d $directory parameters";
    } else {
        print "parameter";
    }
    print "\n";
    print "if you just want to rebuild zebra after changing zebra config files\n";
} else {
    unless ($use_tempdir) {
        # if we're using a temporary directory
        # created by File::Temp, it will be removed
        # automatically.
        rmtree($directory, 0, 1);
        print "directory $directory deleted\n";
    }
}

sub do_one_pass {
    if ($authorities) {
        index_records('authority', $directory, $skip_export, $skip_index, $process_zebraqueue, $nosanitize, $do_not_clear_zebraqueue, $verbose_logging, $zebraidx_log_opt, $authorityserverdir);
    } else {
        print "skipping authorities\n" if ( $verbose_logging );
    }

    if ($biblios) {
        index_records('biblio', $directory, $skip_export, $skip_index, $process_zebraqueue, $nosanitize, $do_not_clear_zebraqueue, $verbose_logging, $zebraidx_log_opt, $biblioserverdir);
    } else {
        print "skipping biblios\n" if ( $verbose_logging );
    }
}

# Check the zebra update queue and return true if there are records to process
# This routine will handle each of -ab, -a, or -b, but in practice we force
# -ab when in daemon mode.
sub zebraqueue_not_empty {
    my $where_str;

    if ($authorities && $biblios) {
        $where_str = 'done = 0;';
    } elsif ($biblios) {
        $where_str = 'server = "biblioserver" AND done = 0;';
    } else {
        $where_str = 'server = "authorityserver" AND done = 0;';
    }
    my $query =
        $dbh->prepare('SELECT COUNT(*) FROM zebraqueue WHERE ' . $where_str );

    $query->execute;
    my $count = $query->fetchrow_arrayref->[0];
    print "queued records: $count\n" if $verbose_logging > 0;
    return $count > 0;
}

# This checks to see if the zebra directories exist under the provided path.
# If they don't, then zebra is likely to spit the dummy. This returns true
# if the directories had to be created, false otherwise.
sub check_zebra_dirs {
    my ($base) = shift() . '/';
    my $needed_repairing = 0;
    my @dirs = ( '', 'key', 'register', 'shadow', 'tmp' );
    foreach my $dir (@dirs) {
        my $bdir = $base . $dir;
        if (! -d $bdir) {
            $needed_repairing = 1;
            mkdir $bdir || die "Unable to create '$bdir': $!\n";
            print "$0: needed to create '$bdir'\n";
        }
    }
    return $needed_repairing;
}   # ----------  end of subroutine check_zebra_dirs  ----------

sub index_records {
    my ($record_type, $directory, $skip_export, $skip_index, $process_zebraqueue, $nosanitize, $do_not_clear_zebraqueue, $verbose_logging, $zebraidx_log_opt, $server_dir) = @_;

    my $num_records_exported = 0;
    my $records_deleted = {};
    my $need_reset = check_zebra_dirs($server_dir);
    if ($need_reset) {
        print "$0: found broken zebra server directories: forcing a rebuild\n";
        $reset = 1;
    }
    if ($skip_export && $verbose_logging) {
        print "====================\n";
        print "SKIPPING $record_type export\n";
        print "====================\n";
    } else {
        if ( $verbose_logging ) {
            print "====================\n";
            print "exporting $record_type " . pretty_time() . "\n";
            print "====================\n";
        }
        mkdir "$directory" unless (-d $directory);
        mkdir "$directory/$record_type" unless (-d "$directory/$record_type");
        if ($process_zebraqueue) {
            my $entries;

            unless ( $process_zebraqueue_skip_deletes ) {
                $entries = select_zebraqueue_records($record_type, 'deleted');
                mkdir "$directory/del_$record_type" unless (-d "$directory/del_$record_type");
                $records_deleted = generate_deleted_marc_records($record_type, $entries, "$directory/del_$record_type");
                mark_zebraqueue_batch_done($entries);
            }

            $entries = select_zebraqueue_records($record_type, 'updated');
            mkdir "$directory/upd_$record_type" unless (-d "$directory/upd_$record_type");
            $num_records_exported = export_marc_records_from_list($record_type,$entries, "$directory/upd_$record_type", $records_deleted);
            mark_zebraqueue_batch_done($entries);

        } else {
            my $sth = select_all_records($record_type);
            $num_records_exported = export_marc_records_from_sth($record_type, $sth, "$directory/$record_type", $nosanitize);
            unless ($do_not_clear_zebraqueue) {
                mark_all_zebraqueue_done($record_type);
            }
        }
    }

    #
    # and reindexing everything
    #
    if ($skip_index) {
        if ($verbose_logging) {
            print "====================\n";
            print "SKIPPING $record_type indexing\n";
            print "====================\n";
        }
    } else {
        if ( $verbose_logging ) {
            print "====================\n";
            print "REINDEXING zebra " . pretty_time() . "\n";
            print "====================\n";
        }
        my $record_fmt = 'marcxml';
        if ($process_zebraqueue) {
            do_indexing($record_type, 'adelete', "$directory/del_$record_type", $reset, $noshadow, $record_fmt, $zebraidx_log_opt)
                if %$records_deleted;
            do_indexing($record_type, 'update', "$directory/upd_$record_type", $reset, $noshadow, $record_fmt, $zebraidx_log_opt)
                if $num_records_exported;
        } else {
            do_indexing($record_type, 'update', "$directory/$record_type", $reset, $noshadow, $record_fmt, $zebraidx_log_opt)
                if ($num_records_exported or $skip_export);
        }
    }
}


sub select_zebraqueue_records {
    my ($record_type, $update_type) = @_;

    my $server = ($record_type eq 'biblio') ? 'biblioserver' : 'authorityserver';
    my $op = ($update_type eq 'deleted') ? 'recordDelete' : 'specialUpdate';

    my $sth = $dbh->prepare("SELECT id, biblio_auth_number
                             FROM zebraqueue
                             WHERE server = ?
                             AND   operation = ?
                             AND   done = 0
                             ORDER BY id DESC");
    $sth->execute($server, $op);
    my $entries = $sth->fetchall_arrayref({});
}

sub mark_all_zebraqueue_done {
    my ($record_type) = @_;

    my $server = ($record_type eq 'biblio') ? 'biblioserver' : 'authorityserver';

    my $sth = $dbh->prepare("UPDATE zebraqueue SET done = 1
                             WHERE server = ?
                             AND done = 0");
    $sth->execute($server);
}

sub mark_zebraqueue_batch_done {
    my ($entries) = @_;

    $dbh->{AutoCommit} = 0;
    my $sth = $dbh->prepare("UPDATE zebraqueue SET done = 1 WHERE id = ?");
    $dbh->commit();
    foreach my $id (map { $_->{id} } @$entries) {
        $sth->execute($id);
    }
    $dbh->{AutoCommit} = 1;
}

sub select_all_records {
    my $record_type = shift;
    return ($record_type eq 'biblio') ? select_all_biblios() : select_all_authorities();
}

sub select_all_authorities {
    my $strsth=qq{SELECT authid FROM auth_header};
    $strsth.=qq{ WHERE $where } if ($where);
    $strsth.=qq{ LIMIT $length } if ($length && !$offset);
    $strsth.=qq{ LIMIT $offset,$length } if ($length && $offset);
    my $sth = $dbh->prepare($strsth);
    $sth->execute();
    return $sth;
}

sub select_all_biblios {
    $table = 'biblioitems'
      unless grep { $_ eq $table } @tables_allowed_for_select;
    my $strsth = qq{ SELECT DISTINCT biblionumber FROM $table };
    $strsth.=qq{ WHERE $where } if ($where);
    $strsth.=qq{ LIMIT $length } if ($length && !$offset);
    $strsth.=qq{ LIMIT $offset,$length } if ($offset);
    my $sth = $dbh->prepare($strsth);
    $sth->execute();
    return $sth;
}

sub export_marc_records_from_sth {
    my ($record_type, $sth, $directory, $nosanitize) = @_;

    my $num_exported = 0;
    open my $fh, '>:encoding(UTF-8) ', "$directory/exported_records" or die $!;

    print {$fh} $marcxml_open;

    my $i = 0;
    my ( $itemtag, $itemsubfield ) = C4::Biblio::GetMarcFromKohaField( "items.itemnumber" );
    while (my ($record_number) = $sth->fetchrow_array) {
        print "." if ( $verbose_logging );
        print "\r$i" unless ($i++ %100 or !$verbose_logging);
        if ( $nosanitize ) {
            my $marcxml = $record_type eq 'biblio'
                          ? GetXmlBiblio( $record_number )
                          : GetAuthorityXML( $record_number );
            if ($record_type eq 'biblio'){
                my $biblio = Koha::Biblios->find($record_number);
                next unless $biblio;
                my $items = $biblio->items;
                if ($items->count){
                    my $record = MARC::Record->new;
                    $record->encoding('UTF-8');
                    my @itemsrecord;
                    for my $item ( @{$items->unblessed} ) {
                        my $record = Item2Marc($item, $record_number);
                        push @itemsrecord, $record->field($itemtag);
                    }
                    $record->insert_fields_ordered(@itemsrecord);
                    my $itemsxml = $record->as_xml_record();
                    $marcxml =
                        substr($marcxml, 0, length($marcxml)-10) .
                        substr($itemsxml, index($itemsxml, "</leader>\n", 0) + 10);
                }
            }
            # extra test to ensure that result is valid XML; otherwise
            # Zebra won't parse it in DOM mode
            eval {
                my $doc = $tester->parse_string($marcxml);
            };
            if ($@) {
                warn "Error exporting record $record_number ($record_type): $@\n";
                next;
            }
            if ( $marcxml ) {
                $marcxml =~ s!<\?xml version="1.0" encoding="UTF-8"\?>\n!!;
                print {$fh} $marcxml;
                $num_exported++;
            }
            next;
        }
        my ($marc) = get_corrected_marc_record($record_type, $record_number);
        if (defined $marc) {
            eval {
                my $rec = $marc->as_xml_record(C4::Context->preference('marcflavour'));
                eval {
                    my $doc = $tester->parse_string($rec);
                };
                if ($@) {
                    die "invalid XML: $@";
                }
                $rec =~ s!<\?xml version="1.0" encoding="UTF-8"\?>\n!!;
                print {$fh} $rec;
                $num_exported++;
            };
            if ($@) {
                warn "Error exporting record $record_number ($record_type) XML";
                warn "... specific error is $@" if $verbose_logging;
            }
        }
    }
    print "\nRecords exported: $num_exported " . pretty_time() . "\n" if ( $verbose_logging );
    print {$fh} $marcxml_close;

    close $fh;
    return $num_exported;
}

sub export_marc_records_from_list {
    my ($record_type, $entries, $directory, $records_deleted) = @_;

    my $num_exported = 0;
    open my $fh, '>:encoding(UTF-8)', "$directory/exported_records" or die $!;

    print {$fh} $marcxml_open;

    my $i = 0;

    # Skip any deleted records. We check for this anyway, but this reduces error spam
    my %found = %$records_deleted;
    foreach my $record_number ( map { $_->{biblio_auth_number} }
                                grep { !$found{ $_->{biblio_auth_number} }++ }
                                @$entries ) {
        print "." if ( $verbose_logging );
        print "\r$i" unless ($i++ %100 or !$verbose_logging);
        my ($marc) = get_corrected_marc_record($record_type, $record_number);
        if (defined $marc) {
            eval {
                my $rec = $marc->as_xml_record(C4::Context->preference('marcflavour'));
                $rec =~ s!<\?xml version="1.0" encoding="UTF-8"\?>\n!!;
                print {$fh} $rec;
                $num_exported++;
            };
            if ($@) {
              warn "Error exporting record $record_number ($record_type) XML";
            }
        }
    }
    print "\nRecords exported: $num_exported " . pretty_time() . "\n" if ( $verbose_logging );

    print {$fh} $marcxml_close;

    close $fh;
    return $num_exported;
}

sub generate_deleted_marc_records {

    my ($record_type, $entries, $directory) = @_;

    my $records_deleted = {};
    open my $fh, '>:encoding(UTF-8)', "$directory/exported_records" or die $!;

    print {$fh} $marcxml_open;

    my $i = 0;
    foreach my $record_number (map { $_->{biblio_auth_number} } @$entries ) {
        print "\r$i" unless ($i++ %100 or !$verbose_logging);
        print "." if ( $verbose_logging );

        my $marc = MARC::Record->new();
        if ($record_type eq 'biblio') {
            fix_biblio_ids($marc, $record_number, $record_number);
        } else {
            fix_authority_id($marc, $record_number);
        }
        if (C4::Context->preference("marcflavour") eq "UNIMARC") {
            fix_unimarc_100($marc);
        }

        my $rec = $marc->as_xml_record(C4::Context->preference('marcflavour'));
        # Remove the record's XML header
        $rec =~ s!<\?xml version="1.0" encoding="UTF-8"\?>\n!!;
        print {$fh} $rec;

        $records_deleted->{$record_number} = 1;
    }
    print "\nRecords exported: $i " . pretty_time() . "\n" if ( $verbose_logging );

    print {$fh} $marcxml_close;

    close $fh;
    return $records_deleted;
}

sub get_corrected_marc_record {
    my ( $record_type, $record_number ) = @_;

    my $marc = get_raw_marc_record( $record_type, $record_number );

    if ( defined $marc ) {
        fix_leader($marc);
        if ( $record_type eq 'authority' ) {
            fix_authority_id( $marc, $record_number );
        }
        elsif ( $record_type eq 'biblio' ) {

            my @filters;
            my @other_headings;
            push @filters, 'EmbedItemsAvailability';
            if (   C4::Context->preference('IncludeSeeFromInSearches')
                || C4::Context->preference('IncludeSeeAlsoFromInSearches') )
            {
                push @filters, 'EmbedSeeFromHeadings';
                if ( C4::Context->preference('IncludeSeeFromInSearches') ) {
                    push @other_headings, 'see_from';
                }
                if ( C4::Context->preference('IncludeSeeAlsoFromInSearches') ) {
                    push @other_headings, 'see_also_from';
                }
            }

            my $normalizer = Koha::RecordProcessor->new(
                {
                    filters => \@filters,
                    options => { other_headings => \@other_headings }
                }
            );
            $marc = $normalizer->process($marc);
        }
        if ( C4::Context->preference("marcflavour") eq "UNIMARC" ) {
            fix_unimarc_100($marc);
        }
    }

    return $marc;
}

sub get_raw_marc_record {
    my ($record_type, $record_number) = @_;

    my $marc;
    if ($record_type eq 'biblio') {
        eval {
            my $biblio = Koha::Biblios->find($record_number);
            $marc = $biblio->metadata->record({ embed_items => 1 });
        };
        if ($@ || !$marc) {
            # here we do warn since catching an exception
            # means that the bib was found but failed
            # to be parsed
            warn "error retrieving biblio $record_number";
            return;
        }
    } else {
        eval { $marc = GetAuthority($record_number); };
        if ($@) {
            warn "error retrieving authority $record_number";
            return;
        }
    }
    return $marc;
}

sub fix_leader {
    # FIXME - this routine is suspect
    # It blanks the Leader/00-05 and Leader/12-16 to
    # force them to be recalculated correct when
    # the $marc->as_usmarc() or $marc->as_xml() is called.
    # But why is this necessary?  It would be a serious bug
    # in MARC::Record (definitely) and MARC::File::XML (arguably)
    # if they are emitting incorrect leader values.
    my $marc = shift;

    my $leader = $marc->leader;
    substr($leader,  0, 5) = '     ';
    substr($leader, 10, 7) = '22     ';
    $marc->leader(substr($leader, 0, 24));
}

sub fix_biblio_ids {
    # FIXME - it is essential to ensure that the biblionumber is present,
    #         otherwise, Zebra will choke on the record.  However, this
    #         logic belongs in the relevant C4::Biblio APIs.
    my $marc = shift;
    my $biblionumber = shift;
    my $biblioitemnumber;
    if (@_) {
        $biblioitemnumber = shift;
    } else {
        my $sth = $dbh->prepare(
            "SELECT biblioitemnumber FROM biblioitems WHERE biblionumber=?");
        $sth->execute($biblionumber);
        ($biblioitemnumber) = $sth->fetchrow_array;
        $sth->finish;
        unless ($biblioitemnumber) {
            warn "failed to get biblioitemnumber for biblio $biblionumber";
            return 0;
        }
    }

    # FIXME - this is cheating on two levels
    # 1. C4::Biblio::_koha_marc_update_bib_ids is meant to be an internal function
    # 2. Making sure that the biblionumber and biblioitemnumber are correct and
    #    present in the MARC::Record object ought to be part of GetMarcBiblio.
    #
    # On the other hand, this better for now than what rebuild_zebra.pl used to
    # do, which was duplicate the code for inserting the biblionumber
    # and biblioitemnumber
    C4::Biblio::_koha_marc_update_bib_ids($marc, '', $biblionumber, $biblioitemnumber);

    return 1;
}

sub fix_authority_id {
    # FIXME - as with fix_biblio_ids, the authid must be present
    #         for Zebra's sake.  However, this really belongs
    #         in C4::AuthoritiesMarc.
    my ($marc, $authid) = @_;
    unless ($marc->field('001') and $marc->field('001')->data() eq $authid){
        $marc->delete_field($marc->field('001'));
        $marc->insert_fields_ordered(MARC::Field->new('001',$authid));
    }
}

sub fix_unimarc_100 {
    # FIXME - again, if this is necessary, it belongs in C4::AuthoritiesMarc.
    my $marc = shift;

    my $string;
    my $length_100a = length($marc->subfield( 100, "a" ));
    if (  $length_100a and $length_100a == 36 ) {
        $string = $marc->subfield( 100, "a" );
        my $f100 = $marc->field(100);
        $marc->delete_field($f100);
    }
    else {
        $string = POSIX::strftime( "%Y%m%d", localtime );
        $string =~ s/\-//g;
        $string = sprintf( "%-*s", 35, $string );
    }
    substr( $string, 22, 6, "frey50" );
    $length_100a = length($marc->subfield( 100, "a" ));
    unless ( $length_100a and $length_100a == 36 ) {
        $marc->delete_field($marc->field(100));
        $marc->insert_grouped_field(MARC::Field->new( 100, "", "", "a" => $string ));
    }
}

sub do_indexing {
    my ($record_type, $op, $record_dir, $reset_index, $noshadow, $record_format, $zebraidx_log_opt) = @_;

    my $zebra_server  = ($record_type eq 'biblio') ? 'biblioserver' : 'authorityserver';
    my $zebra_db_name = ($record_type eq 'biblio') ? 'biblios' : 'authorities';
    my $zebra_config  = C4::Context->zebraconfig($zebra_server)->{'config'};
    my $zebra_db_dir  = C4::Context->zebraconfig($zebra_server)->{'directory'};

    $noshadow //= '';

    if ($noshadow or $reset_index) {
        $noshadow = '-n';
    }

    system("zebraidx -c $zebra_config $zebraidx_log_opt -g $record_format -d $zebra_db_name init") if $reset_index;
    system("zebraidx -c $zebra_config $zebraidx_log_opt $noshadow -g $record_format -d $zebra_db_name $op $record_dir");
    system("zebraidx -c $zebra_config $zebraidx_log_opt -g $record_format -d $zebra_db_name commit") unless $noshadow;
}

sub _flock {
    # test if flock is present; if so, use it; if not, return true
    # op refers to the official flock operations including LOCK_EX,
    # LOCK_UN, etc.
    # combining LOCK_EX with LOCK_NB returns immediately
    my ($fh, $op)= @_;
    if( !defined($use_flock) ) {
        #check if flock is present; if not, you will have a fatal error
        my $lock_acquired = eval { flock($fh, $op) };
        # assuming that $fh and $op are fine(..), an undef $lock_acquired
        # means no flock
        $use_flock = defined($lock_acquired) ? 1 : 0;
        print "Warning: flock could not be used!\n" if $verbose_logging && !$use_flock;
        return 1 if !$use_flock;
        return $lock_acquired;
    } else {
        return 1 if !$use_flock;
        return flock($fh, $op);
    }
}

sub _create_lockfile { #returns undef on failure
    my $dir= shift;
    unless (-d $dir) {
        eval { mkpath($dir, 0, oct(755)) };
        return if $@;
    }
    return if !open my $fh, q{>}, $dir.'/'.LOCK_FILENAME;
    return ( $fh, $dir.'/'.LOCK_FILENAME );
}

sub pretty_time {
    use integer;
    my $now = time;
    my $elapsed = $now - $start_time;
    local $_ = $elapsed;
    my ( $h, $m, $s );
    $s = $_ % 60;
    $_ /= 60;
    $m = $_ % 60;
    $_ /= 60;
    $h = $_ % 24;

    my $now_pretty = POSIX::strftime("%H:%M:%S",localtime($now));
    my $elapsed_pretty = sprintf "[%02d:%02d:%02d]",$h,$m,$s;

    return "$now_pretty $elapsed_pretty";
}

sub print_usage {
    print <<_USAGE_;
$0: reindex MARC bibs and/or authorities in Zebra.

Use this batch job to reindex all biblio or authority
records in your Koha database.

Parameters:

    -b                      index bibliographic records

    -a                      index authority records

    -daemon                 Run in daemon mode.  The program will loop checking
                            for entries on the zebraqueue table, processing
                            them incrementally if present, and then sleep
                            for a few seconds before repeating the process
                            Checking the zebraqueue table is done with a cheap
                            SQL query.  This allows for near realtime update of
                            the zebra search index with low system overhead.
                            Use -sleep to control the checking interval.

                            Daemon mode implies -z, -a, -b.  The program will
                            refuse to start if options are present that do not
                            make sense while running as an incremental update
                            daemon (e.g. -r or -offset).

    -sleep 10               Seconds to sleep between checks of the zebraqueue
                            table in daemon mode.  The default is 5 seconds.

    -z                      select only updated and deleted
                            records marked in the zebraqueue
                            table.  Cannot be used with -r
                            or -s.

    --skip-deletes          only select record updates, not record
                            deletions, to avoid potential excessive
                            I/O when zebraidx processes deletions.
                            If this option is used for normal indexing,
                            a cronjob should be set up to run
                            rebuild_zebra.pl -z without --skip-deletes
                            during off hours.
                            Only effective with -z.

    -r                      clear Zebra index before
                            adding records to index. Implies -w.

    -d                      Temporary directory for indexing.
                            If not specified, one is automatically
                            created.  The export directory
                            is automatically deleted unless
                            you supply the -k switch.

    -k                      Do not delete export directory.

    -s                      Skip export.  Used if you have
                            already exported the records
                            in a previous run.

    -nosanitize             export biblio/authority records directly from DB marcxml
                            field without sanitizing records. It speed up
                            dump process but could fail if DB contains badly
                            encoded records. Works only with -x,

    -w                      skip shadow indexing for this batch

    -y                      do NOT clear zebraqueue after indexing; normally,
                            after doing batch indexing, zebraqueue should be
                            marked done for the affected record type(s) so that
                            a running zebraqueue_daemon doesn't try to reindex
                            the same records - specify -y to override this.
                            Cannot be used with -z.

    -v                      increase the amount of logging.  Normally only
                            warnings and errors from the indexing are shown.
                            Use log level 2 (-v -v) to include all Zebra logs.

    --length   1234         how many biblio you want to export
    --offset 1243           offset you want to start to
                                example: --offset 500 --length=500 will result in a LIMIT 500,1000 (exporting 1000 records, starting by the 500th one)
                                note that the numbers are NOT related to biblionumber, that's the intended behaviour.
    --where                 let you specify a WHERE query, like itemtype='BOOK'
                            or something like that

    --run-as-root           explicitily allow script to run as 'root' user

    --wait-for-lock         when not running in daemon mode, the default
                            behavior is to abort a rebuild if the rebuild
                            lock is busy.  This option will cause the program
                            to wait for the lock to free and then continue
                            processing the rebuild request,

    --table                 specify a table (can be items, biblioitems, biblio, biblio_metadata) to retrieve biblionumber to index.
                            biblioitems is the default value.

    --help or -h            show this message.
_USAGE_
}