From cad16316677d1c17a96a81629dc9296021f2da29 Mon Sep 17 00:00:00 2001 From: kados Date: Wed, 8 Mar 2006 23:39:11 +0000 Subject: [PATCH] An example for building marc21 authorities from existing bib data. --- .../build_marc21_authorities.pl | 274 ++++++++++++++++++ 1 file changed, 274 insertions(+) create mode 100644 misc/migration_tools/build_marc21_authorities.pl diff --git a/misc/migration_tools/build_marc21_authorities.pl b/misc/migration_tools/build_marc21_authorities.pl new file mode 100644 index 0000000000..ba525c3bcc --- /dev/null +++ b/misc/migration_tools/build_marc21_authorities.pl @@ -0,0 +1,274 @@ +#!/usr/bin/perl +# script that rebuild thesaurus from biblio table. + +use strict; + +# Koha modules used +use MARC::File::USMARC; +use MARC::Record; +use MARC::Batch; +use C4::Context; +use C4::Biblio; +use C4::AuthoritiesMarc; +use Time::HiRes qw(gettimeofday); + +use Getopt::Long; +my ( $input_marc_file, $number) = ('',0); +my ($version, $verbose, $delete, $confirm, $howmany); +GetOptions( + 'h' => \$version, + 'd' => \$delete, + 'v' => \$verbose, + 'c' => \$confirm, +# this $howmany parameter & other commented code was here to enable incremental building of the authorities, but it does not work well. +# 'n:s' => \$howmany, +); + +if ($version || (!$confirm)) { + print <> parameters >> thesaurus structure >> add). It can be whatever you want. NP/CO/NG/TI/NC in CVS refers to UNIMARC french RAMEAU category codes. +* in values a sub-hash with the following values : +\ttaglist : the list of MARC tags using this authority +\tkey : the list of MARC subfields used as key for authority. 2 entries in biblio having the same key will be considered as the same. +\tother : the list of MARC subfields not used as key, but to be copied in authority. +\tauthtag : the field in authority that will be reported in biblio. Remember that all subfields in tag "authtag" will be reported in the same subfield of the biblio (in MARC tags that are in "taglist") + +don't forget to define the itemfield. In UNIMARC, it should be 995, in MARC21, probably 852 + +Any warning will be stored in the warnings.log file. +EOF +;#' +die; +} + +my $dbh = C4::Context->dbh; + +my $itemfield = '852'; # enter the TAG number where your items are stored. + +my %whattodo = ( + # authority code (the one you've choosen (or will choose) in Koha >> parameters >> thesaurus structure >> add) + PNAME => { taglist => "100|600|700|800|900", + key => "a|b|c|q|d", + other => "1|2|4|5|6|7|8|0|e|f|g|h|i|j|k|l|m|n|o|p|r|s|t|u|v|w|x|y|z", + authtag => "100", + }, + CNAME => { taglist => "110|610|710|810|910", + key => "a|b|c|d", + other => "1|2|4|5|6|7|8|0|e|f|g|h|i|j|k|l|m|n|o|p|r|s|t|u|v|w|x|y|z", + authtag => "110", + }, + MNAME => { taglist => "111|611|711|811|911", + key => "a|b|c|q|d", + other => "1|2|4|5|6|7|8|0|e|f|g|h|i|j|k|l|m|n|o|p|r|s|t|u|v|w|x|y|z", + authtag => "111", + }, + GNAME => { taglist => "151|651|751|851", + key => "a|b|c|q|d", + other => "1|2|4|5|6|7|8|0|e|f|g|h|i|j|k|l|m|n|o|p|r|s|t|u|v|w|x|y|z", + authtag => "151", + }, + #TITLE => { taglist => "130|210|211|212|222|240|243|400|410|411|440|490|630|730|830|930|940|980|981|982|983", + TITLE => { taglist => "130|210|211|212|222|240|243|440|490|630|730|830|930|940|980|981|982|983", + key => "a", + other => "0|1|2|4|5|6|7|8|b|c|d|e|f|g|h|i|j|k|l|m|n|o|p|q|r|s|t|u|v|w|x|y|z", + authtag => "130", + }, + GENRE => { taglist => "655|656|657|658", + key => "a|z|x|y|v", + other => "0|1|2|4|5|7|8|b|c|d|e|f|g|h|i|j|k|l|m|n|o|p|q|r|s|t|u|w", + authtag => "155", + }, + TTERM => { taglist => "650", + key => "a|z|x|y|v", + other => "0|1|2|4|5|6|7|8|b|c|d|e|f|g|h|i|j|k|l|m|n|o|p|q|r|s|t|u|w", + authtag => "150", + }, + CTERM => { taglist => "648", + key => "a|z|x|y|v", + other => "0|1|2|4|5|6|7|8|b|c|d|e|f|g|h|i|j|k|l|m|n|o|p|q|r|s|t|u|w", + authtag => "148", + }, +); +my %authorities; + +open WARNING_FILE,">","warnings.log"; + +my $field_list; +my $category_list; +foreach (keys %whattodo) { + $field_list .= $whattodo{$_}->{taglist}.'|'; + $category_list.= "'".$_."'," +} +chop $field_list; + +if ($delete) { + print "deleting thesaurus step 1\n"; + chop $category_list; + my $del1 = $dbh->prepare("delete from auth_subfield_table where authid=?"); + my $del2 = $dbh->prepare("delete from auth_word where authid=?"); + my $sth = $dbh->prepare("select authid from auth_header where authtypecode in ($category_list)"); + $sth->execute; + while (my ($authid) = $sth->fetchrow) { + $del1->execute($authid); + $del2->execute($authid); + } + print "deleting thesaurus step 2\n"; + $dbh->do("delete from auth_header where authtypecode in ($category_list)"); + $dbh->do("delete from marc_subfield_table where tag in ('".join("','",split('\|',$field_list))."') and subfieldcode='9'"); + $dbh->do("delete from marc_word where tagsubfield in ('".join("9','",split('\|',$field_list))."9')"); +# die; +} + +$|=1; # flushes output +my $starttime = gettimeofday; +my $sth = $dbh->prepare("select bibid from marc_biblio"); +$sth->execute; +my $i=1; + +my $modified; +my $alreadydone; +my $totalskipped; +while (my ($bibid) = $sth->fetchrow) { + my $record = MARCgetbiblio($dbh,$bibid); + $modified=0; + $i++; +# print "i : $i / howmany : $howmany\n"; +# exit if $i>$howmany; + + # skip what has already been done... +# $alreadydone=0; +# foreach my $field ($record->fields) { +# if ($field->tag() =~ /$field_list/) { +# # print "F : $field_list ".$field->tag()." => ".$field->as_formatted."\n"; +# if ($field->subfield('9')) { +# $alreadydone++; +# } else { +# if ($alreadydone) { +# print "ERROR : biblio partially done, some \$9 (authority link) missing : ".$record->as_formatted."\n======= You should run the script again using -d to delete everything"; +# die; +# } +# } +# } +# } +# $totalskipped++ if $alreadydone; +# next if $alreadydone; +# my $timeneeded = gettimeofday - $starttime; + print " $i in ".(gettimeofday-$starttime)." s\n" unless ($i % 100); + # be careful, as the last entry may have been + # delete ITEM field, we only deal with BIBLIOS + foreach my $field ($record->field($itemfield)) { + $record->delete_field($field); + } + my $totdone=0; +# my $authid; + # on passe tous les champs + foreach my $field ($record->fields) { + foreach my $DOauthtype (keys %whattodo) { + my $DOtaglist = $whattodo{$DOauthtype}->{taglist}; + my $DOkey = $whattodo{$DOauthtype}->{key}; + my $DOother = $whattodo{$DOauthtype}->{other}; + my $DOauthtag = $whattodo{$DOauthtype}->{authtag}; + if ($field->tag() =~ /$DOtaglist/) { + # try to find the authority in %NP ... + # build the "key" + my $authPrimaryKey; + foreach (split '\|',$DOkey) { + $authPrimaryKey .= join('|',$field->subfield($_))."|" if $field->subfield($_); + } + # if authority exist, check it can't be completed by subfields not previously seen. + # otherwise, create if with whatever available. + if ($authorities{$DOauthtype}->{$authPrimaryKey}) { + # check that the existing authority has all the datas. Otherwise, add them, but don't modify already parsed biblios. + # at the end of the script, all authorities will be updated. So, the "merge_authority.pl" tool can be used to update all biblios. + foreach my $subfieldtotest (split '\|',$DOother) { + if ($field->subfield($subfieldtotest)) { + if ($authorities{$DOauthtype}->{$authPrimaryKey}->{record}->field($DOauthtag)->subfield($subfieldtotest) + ne + $field->subfield($subfieldtotest)) { + print WARNING_FILE "========\nERROR ON $i $subfieldtotest authorities seems to differ, can't choose between : \n".$authorities{$DOauthtype}->{$authPrimaryKey}->{record}->field($DOauthtag)->as_formatted()." \n====== AND ======\n ".$field->as_formatted()."\n=======\n"; + print "W"; + } + # $c was not here, add it... + unless ($authorities{$DOauthtype}->{$authPrimaryKey}->{record}->field($DOauthtag)->subfield($subfieldtotest)) { + my $fieldA= $authorities{$DOauthtype}->{$authPrimaryKey}->{record}->field($DOauthtag)->clone(); + $fieldA->add_subfields($subfieldtotest => $field->subfield($subfieldtotest)); + $authorities{$DOauthtype}->{$authPrimaryKey}->{record}->field($DOauthtag)->replace_with($fieldA); + $authorities{$DOauthtype}->{$authPrimaryKey}->{modified} = 1; + } + } + } + } else { + my $authrecord = MARC::Record->new(); + my $authfield; + foreach (split '\|',$DOkey) { + if ($authfield) { + $authfield->add_subfields($_ => join ('|',$field->subfield($_))) if $field->subfield($_); + } else { + $authfield = MARC::Field->new( $DOauthtag,'','',$_ => join ('|',$field->subfield($_))); + } + } + foreach (split '\|',$DOother) { + if ($authfield) { + $authfield->add_subfields($_ => join ('|',$field->subfield($_))) if $field->subfield($_); + } else { + $authfield = MARC::Field->new( $DOauthtag,'','',$_ => join ('|',$field->subfield($_))); + } + } + $authrecord->insert_fields_ordered($authfield); + my $authid = AUTHaddauthority($dbh,$authrecord,'',$DOauthtype); + $authorities{$DOauthtype}->{$authPrimaryKey}->{authid} = $authid; + $authorities{$DOauthtype}->{$authPrimaryKey}->{record} = $authrecord->clone; + $authorities{$DOauthtype}->{$authPrimaryKey}->{modified} = 0; + } + print "ERROR !!!! \$9 already exists in $authPrimaryKey / ".$field->as_formatted."\n" if $field->subfield('9'); + my $fieldC = $field->clone(); + $fieldC->add_subfields('9' => $authorities{$DOauthtype}->{$authPrimaryKey}->{authid}); + $field->replace_with($fieldC); + # print $NP{$keyNP}->{authid}." => ".$record->as_formatted."\n"; + $modified++; + } + } + } +# +# NC +# +# OK, done, now store modified biblio if it has been modified + if ($modified) { + my $frameworkcode=MARCfind_frameworkcode($dbh,$bibid); + NEWmodbiblio($dbh,$record,$bibid,$frameworkcode); +# print "skipped $totalskipped biblios in ".(gettimeofday-$starttime)." s\n" if $totalskipped; +# $totalskipped=0; + print "$modified"; + } else { + # if $totalskipped is not null, we are in a biblio that has no authorities entry, but inside an already done part of the job + # ++ totalskipped & don't show a useless * +# if ($totalskipped) { +# $totalskipped++; +# } else { + print "*"; +# } + } +} + +# +# now, parse authorities & modify them if they have been modified/completed by a subfield not existing on the 1st biblio using this authority. +# +foreach my $authtype (keys %whattodo) { + foreach my $authentry (keys %{$authorities{$authtype}}) { +# print "AUTH : $authentry\n" if $authorities{$authtype}->{$authentry}->{modified}; + AUTHmodauthority($dbh,$authorities{$authtype}->{$authentry}->{authid},$authorities{$authtype}->{$authentry}->{record}) if $authorities{$authtype}->{$authentry}->{modified}; + } +} +# +my $timeneeded = gettimeofday - $starttime; +print "$i entries done in $timeneeded seconds (".($i/$timeneeded)." per second)\n"; +close WARNING_FILE; -- 2.39.5