464 lines
14 KiB
Perl
Executable file
464 lines
14 KiB
Perl
Executable file
#!/usr/bin/perl
|
|
|
|
# $Id$
|
|
|
|
package C4::SimpleMarc;
|
|
|
|
# Routines for handling import of MARC data into Koha db
|
|
|
|
# Koha library project www.koha.org
|
|
|
|
# Licensed under the GPL
|
|
|
|
use strict;
|
|
|
|
# standard or CPAN modules used
|
|
use DBI;
|
|
|
|
# Koha modules used
|
|
use C4::Database;
|
|
|
|
require Exporter;
|
|
|
|
use vars qw($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
|
|
|
|
# set the version for version checking
|
|
$VERSION = 0.01;
|
|
|
|
@ISA = qw(Exporter);
|
|
@EXPORT = qw(
|
|
&extractmarcfields
|
|
&parsemarcfileformat
|
|
&taglabel
|
|
%tagtext
|
|
%tagmap
|
|
);
|
|
%EXPORT_TAGS = ( ); # eg: TAG => [ qw!name1 name2! ],
|
|
|
|
# your exported package globals go here,
|
|
# as well as any optionally exported functions
|
|
|
|
@EXPORT_OK = qw(
|
|
%tagtext
|
|
%tagmap
|
|
);
|
|
|
|
# non-exported package globals go here
|
|
use vars qw(@more $stuff);
|
|
|
|
# initalize package globals, first exported ones
|
|
|
|
my $Var1 = '';
|
|
my %Hashit = ();
|
|
|
|
# then the others (which are still accessible as $Some::Module::stuff)
|
|
my $stuff = '';
|
|
my @more = ();
|
|
|
|
# all file-scoped lexicals must be created before
|
|
# the functions below that use them.
|
|
|
|
# file-private lexicals go here
|
|
my $priv_var = '';
|
|
my %secret_hash = ();
|
|
|
|
# here's a file-private function as a closure,
|
|
# callable as &$priv_func; it cannot be prototyped.
|
|
my $priv_func = sub {
|
|
# stuff goes here.
|
|
};
|
|
|
|
# make all your functions, whether exported or not;
|
|
#------------------------------------------------
|
|
|
|
#------------------
|
|
# Constants
|
|
|
|
my %tagtext = (
|
|
'LDR' => 'Leader',
|
|
'001' => 'Control number',
|
|
'003' => 'Control number identifier',
|
|
'005' => 'Date and time of latest transaction',
|
|
'006' => 'Fixed-length data elements -- additional material characteristics',
|
|
'007' => 'Physical description fixed field',
|
|
'008' => 'Fixed length data elements',
|
|
'010' => 'LCCN',
|
|
'015' => 'National library CN',
|
|
'020' => 'ISBN',
|
|
'022' => 'ISSN',
|
|
'024' => 'Other standard ID',
|
|
'035' => 'System control number',
|
|
'037' => 'Source of acquisition',
|
|
'040' => 'Cataloging source',
|
|
'041' => 'Language code',
|
|
'043' => 'Geographic area code',
|
|
'043' => 'Publishing country code',
|
|
'050' => 'Library of Congress call number',
|
|
'055' => 'Canadian classification number',
|
|
'060' => 'National Library of Medicine call number',
|
|
'082' => 'Dewey decimal call number',
|
|
'100' => 'Main entry -- Personal name',
|
|
'110' => 'Main entry -- Corporate name',
|
|
'130' => 'Main entry -- Uniform title',
|
|
'240' => 'Uniform title',
|
|
'245' => 'Title statement',
|
|
'246' => 'Varying form of title',
|
|
'250' => 'Edition statement',
|
|
'256' => 'Computer file characteristics',
|
|
'260' => 'Publication, distribution, etc.',
|
|
'263' => 'Projected publication date',
|
|
'300' => 'Physical description',
|
|
'306' => 'Playing time',
|
|
'440' => 'Series statement / Added entry -- Title',
|
|
'490' => 'Series statement',
|
|
'500' => 'General note',
|
|
'504' => 'Bibliography, etc. note',
|
|
'505' => 'Formatted contents note',
|
|
'508' => 'Creation/production credits note',
|
|
'510' => 'Citation/references note',
|
|
'511' => 'Participant or performer note',
|
|
'520' => 'Summary, etc. note',
|
|
'521' => 'Target audience note (ie age)',
|
|
'530' => 'Additional physical form available note',
|
|
'538' => 'System details note',
|
|
'586' => 'Awards note',
|
|
'600' => 'Subject added entry -- Personal name',
|
|
'610' => 'Subject added entry -- Corporate name',
|
|
'650' => 'Subject added entry -- Topical term',
|
|
'651' => 'Subject added entry -- Geographic name',
|
|
'656' => 'Index term -- Occupation',
|
|
'700' => 'Added entry -- Personal name',
|
|
'710' => 'Added entry -- Corporate name',
|
|
'730' => 'Added entry -- Uniform title',
|
|
'740' => 'Added entry -- Uncontrolled related/analytical title',
|
|
'800' => 'Series added entry -- Personal name',
|
|
'830' => 'Series added entry -- Uniform title',
|
|
'852' => 'Location',
|
|
'856' => 'Electronic location and access',
|
|
);
|
|
|
|
# tag, subfield, field name, repeats, striptrailingchars
|
|
my %tagmap=(
|
|
'010'=>{'a'=>{name=> 'lccn', rpt=>0, striptrail=>' ' }},
|
|
'015'=>{'a'=>{name=> 'lccn', rpt=>0 }},
|
|
'020'=>{'a'=>{name=> 'isbn', rpt=>0 }},
|
|
'022'=>{'a'=>{name=> 'issn', rpt=>0 }},
|
|
'082'=>{'a'=>{name=> 'dewey', rpt=>0 }},
|
|
'100'=>{'a'=>{name=> 'author', rpt=>0, striptrail=>',:;/-' }},
|
|
'245'=>{'a'=>{name=> 'title', rpt=>0, striptrail=>',:;/' },
|
|
'b'=>{name=> 'subtitle', rpt=>0, striptrail=>',:;/' }},
|
|
'260'=>{'a'=>{name=> 'place', rpt=>0, striptrail=>',:;/-' },
|
|
'b'=>{name=> 'publisher', rpt=>0, striptrail=>',:;/-' },
|
|
'c'=>{name=> 'year' , rpt=>0, striptrail=>'.,:;/-' }},
|
|
'300'=>{'a'=>{name=> 'pages', rpt=>0, striptrail=>',:;/-' },
|
|
'c'=>{name=> 'size', rpt=>0, striptrail=>',:;/-' }},
|
|
'362'=>{'a'=>{name=> 'volume-number', rpt=>0 }},
|
|
'440'=>{'a'=>{name=> 'seriestitle', rpt=>0, striptrail=>',:;/' },
|
|
'v'=>{name=> 'volume-number',rpt=>0 }},
|
|
'490'=>{'a'=>{name=> 'seriestitle', rpt=>0, striptrail=>',:;/' },
|
|
'v'=>{name=> 'volume-number',rpt=>0 }},
|
|
'700'=>{'a'=>{name=> 'addtional-author-illus',rpt=>1, striptrail=>',:;/' }},
|
|
'5xx'=>{'a'=>{name=> 'notes', rpt=>1 }},
|
|
'65x'=>{'a'=>{name=> 'subject', rpt=>1, striptrail=>'.,:;/-' }},
|
|
);
|
|
|
|
|
|
#------------------
|
|
sub extractmarcfields {
|
|
use strict;
|
|
# input
|
|
my (
|
|
$record, # pointer to list of MARC field hashes.
|
|
# Example: $record->[0]->{'tag'} = '100' # Author
|
|
# $record->[0]->{'subfields'}->{'a'} = subfieldvalue
|
|
)=@_;
|
|
|
|
# return
|
|
my $bib; # pointer to hash of named output fields
|
|
# Example: $bib->{'author'} = "Twain, Mark";
|
|
|
|
my $debug=0;
|
|
|
|
my (
|
|
$field, # hash ref
|
|
$value,
|
|
$subfield, # Marc subfield [a-z]
|
|
$fieldname, # name of field "author", "title", etc.
|
|
$strip, # chars to remove from end of field
|
|
$stripregex, # reg exp pattern
|
|
);
|
|
my ($lccn, $isbn, $issn,
|
|
$publicationyear, @subjects, $subject,
|
|
$controlnumber,
|
|
$notes, $additionalauthors, $illustrator, $copyrightdate,
|
|
$s, $subdivision, $subjectsubfield,
|
|
);
|
|
|
|
print "<PRE>\n" if $debug;
|
|
|
|
if ( ref($record) eq "ARRAY" ) {
|
|
foreach $field (@$record) {
|
|
|
|
# Check each subfield in field
|
|
foreach $subfield ( keys %{$field->{subfields}} ) {
|
|
# see if it is defined in our Marc to koha mapping table
|
|
if ( $fieldname=$tagmap{ $field->{'tag'} }->{$subfield}->{name} ) {
|
|
# Yes, so keep the value
|
|
if ( ref($field->{'subfields'}->{$subfield} ) eq 'ARRAY' ) {
|
|
# if it was an array, just keep first element.
|
|
$bib->{$fieldname}=$field->{'subfields'}->{$subfield}[0];
|
|
} else {
|
|
$bib->{$fieldname}=$field->{'subfields'}->{$subfield};
|
|
} # if array
|
|
print "$field->{'tag'} $subfield $fieldname=$bib->{$fieldname}\n" if $debug;
|
|
# see if this field should have trailing chars dropped
|
|
if ($strip=$tagmap{ $field->{'tag'} }->{$subfield}->{striptrail} ) {
|
|
$strip=~s//\\/; # backquote each char
|
|
$stripregex='[ ' . $strip . ']+$'; # remove trailing spaces also
|
|
$bib->{$fieldname}=~s/$stripregex//;
|
|
# also strip leading spaces
|
|
$bib->{$fieldname}=~s/^ +//;
|
|
} # if strip
|
|
print "Found subfield $field->{'tag'} $subfield " .
|
|
"$fieldname = $bib->{$fieldname}\n" if $debug;
|
|
} # if tagmap exists
|
|
|
|
} # foreach subfield
|
|
|
|
|
|
if ($field->{'tag'} eq '001') {
|
|
$bib->{controlnumber}=$field->{'indicator'};
|
|
}
|
|
if ($field->{'tag'} eq '015') {
|
|
$bib->{lccn}=$field->{'subfields'}->{'a'};
|
|
$bib->{lccn}=~s/^\s*//;
|
|
$bib->{lccn}=~s/^C//;
|
|
($bib->{lccn}) = (split(/\s+/, $bib->{lccn}))[0];
|
|
}
|
|
|
|
|
|
if ($field->{'tag'} eq '260') {
|
|
|
|
$publicationyear=$field->{'subfields'}->{'c'};
|
|
if ($publicationyear=~/c(\d\d\d\d)/) {
|
|
$copyrightdate=$1;
|
|
}
|
|
if ($publicationyear=~/[^c](\d\d\d\d)/) {
|
|
$publicationyear=$1;
|
|
} elsif ($copyrightdate) {
|
|
$publicationyear=$copyrightdate;
|
|
} else {
|
|
$publicationyear=~/(\d\d\d\d)/;
|
|
$publicationyear=$1;
|
|
}
|
|
}
|
|
if ($field->{'tag'} eq '700') {
|
|
my $name=$field->{'subfields'}->{'a'};
|
|
if ( defined($field->{'subfields'}->{'e'})
|
|
and $field->{'subfields'}->{'e'}=~/ill/) {
|
|
$illustrator=$name;
|
|
} else {
|
|
$additionalauthors.="$name\n";
|
|
}
|
|
}
|
|
if ($field->{'tag'} =~/^5/) {
|
|
$notes.="$field->{'subfields'}->{'a'}\n";
|
|
}
|
|
if ($field->{'tag'} =~/65\d/) {
|
|
my $sub;
|
|
my $subject=$field->{'subfields'}->{'a'};
|
|
$subject=~s/\.$//;
|
|
print "Subject=$subject\n" if $debug;
|
|
foreach $subjectsubfield ( 'x','y','z' ) {
|
|
if ($subdivision=$field->{'subfields'}->{$subjectsubfield}) {
|
|
if ( ref($subdivision) eq 'ARRAY' ) {
|
|
foreach $s (@$subdivision) {
|
|
$s=~s/\.$//;
|
|
$subject.=" -- $s";
|
|
} # foreach subdivision
|
|
} else {
|
|
$subdivision=~s/\.$//;
|
|
$subject.=" -- $subdivision";
|
|
} # if array
|
|
} # if subfield exists
|
|
} # foreach subfield
|
|
print "Subject=$subject\n" if $debug;
|
|
push @subjects, $subject;
|
|
} # if tag 65x
|
|
|
|
|
|
} # foreach field
|
|
($publicationyear ) && ($bib->{publicationyear}=$publicationyear );
|
|
($copyrightdate ) && ($bib->{copyrightdate}=$copyrightdate );
|
|
($additionalauthors ) && ($bib->{additionalauthors}=$additionalauthors );
|
|
($illustrator ) && ($bib->{illustrator}=$illustrator );
|
|
($notes ) && ($bib->{notes}=$notes );
|
|
($#subjects ) && ($bib->{subject}=\@subjects );
|
|
|
|
# Misc cleanup
|
|
if ($bib->{dewey}) {
|
|
$bib->{dewey}=~s/\///g; # drop any slashes
|
|
}
|
|
|
|
if ($bib->{lccn}) {
|
|
($bib->{lccn}) = (split(/\s+/, $bib->{lccn}))[0]; # only keep first word
|
|
}
|
|
|
|
if ( $bib->{isbn} ) {
|
|
$bib->{isbn}=~s/[^\d]*//g; # drop non-digits
|
|
};
|
|
|
|
if ( $bib->{issn} ) {
|
|
$bib->{issn}=~s/^\s*//;
|
|
($bib->{issn}) = (split(/\s+/, $bib->{issn}))[0];
|
|
};
|
|
|
|
if ( $bib->{'volume-number'} ) {
|
|
if ($bib->{'volume-number'}=~/(\d+).*(\d+)/ ) {
|
|
$bib->{'volume'}=$1;
|
|
$bib->{'number'}=$2;
|
|
} else {
|
|
$bib->{volume}=$bib->{'volume-number'};
|
|
}
|
|
delete $bib->{'volume-number'};
|
|
} # if volume-number
|
|
|
|
} else {
|
|
print "Error: extractmarcfields: input ref $record is " .
|
|
ref($record) . " not ARRAY. Contact sysadmin.\n";
|
|
}
|
|
print "</PRE>\n" if $debug;
|
|
|
|
return $bib;
|
|
|
|
} # sub extractmarcfields
|
|
#---------------------------------
|
|
|
|
#--------------------------
|
|
# Parse MARC data in file format with control-character separators
|
|
# May be multiple records.
|
|
sub parsemarcfileformat {
|
|
use strict;
|
|
# Input is one big text string
|
|
my $data=shift;
|
|
# Output is list of records. Each record is list of field hashes
|
|
my @records;
|
|
|
|
my $splitchar=chr(29);
|
|
my $splitchar2=chr(30);
|
|
my $splitchar3=chr(31);
|
|
my $debug=0;
|
|
my $record;
|
|
foreach $record (split(/$splitchar/, $data)) {
|
|
my @record;
|
|
my $directory=0;
|
|
my $tagcounter=0;
|
|
my %tag;
|
|
my $field;
|
|
|
|
my $leader=substr($record,0,24);
|
|
print "<pre>parse Leader:$leader</pre>\n" if $debug;
|
|
push (@record, {
|
|
'tag' => 'LDR',
|
|
'indicator' => $leader ,
|
|
} );
|
|
|
|
$record=substr($record,24);
|
|
foreach $field (split(/$splitchar2/, $record)) {
|
|
my %field;
|
|
my $tag;
|
|
my $indicator;
|
|
unless ($directory) {
|
|
# If we didn't already find a directory, extract one.
|
|
$directory=$field;
|
|
my $itemcounter=1;
|
|
my $counter2=0;
|
|
my $item;
|
|
my $length;
|
|
my $start;
|
|
while ($item=substr($directory,0,12)) {
|
|
# Pull out location of first field
|
|
$tag=substr($directory,0,3);
|
|
$length=substr($directory,3,4);
|
|
$start=substr($directory,7,6);
|
|
|
|
# Bump to next directory entry
|
|
$directory=substr($directory,12);
|
|
$tag{$counter2}=$tag;
|
|
$counter2++;
|
|
}
|
|
$directory=1;
|
|
next;
|
|
}
|
|
$tag=$tag{$tagcounter};
|
|
$tagcounter++;
|
|
$field{'tag'}=$tag;
|
|
my @subfields=split(/$splitchar3/, $field);
|
|
$indicator=$subfields[0];
|
|
$field{'indicator'}=$indicator;
|
|
print "<pre>parse indicator:$indicator</pre>\n" if $debug;
|
|
my $firstline=1;
|
|
unless ($#subfields==0) {
|
|
my %subfields;
|
|
my @subfieldlist;
|
|
my $i;
|
|
for ($i=1; $i<=$#subfields; $i++) {
|
|
my $text=$subfields[$i];
|
|
my $subfieldcode=substr($text,0,1);
|
|
my $subfield=substr($text,1);
|
|
# if this subfield already exists, do array
|
|
if ($subfields{$subfieldcode}) {
|
|
my $subfieldlist=$subfields{$subfieldcode};
|
|
if ( ref($subfieldlist) eq 'ARRAY' ) {
|
|
# Already an array, add on to it
|
|
print "$tag Adding to array $subfieldcode -- $subfield<br>\n" if $debug;
|
|
@subfieldlist=@$subfieldlist;
|
|
push (@subfieldlist, $subfield);
|
|
} else {
|
|
# Change simple value to array
|
|
print "$tag Arraying $subfieldcode -- $subfield<br>\n" if $debug;
|
|
@subfieldlist=($subfields{$subfieldcode}, $subfield);
|
|
}
|
|
# keep new array
|
|
$subfields{$subfieldcode}=\@subfieldlist;
|
|
} else {
|
|
# subfield doesn't exist yet, keep simple value
|
|
$subfields{$subfieldcode}=$subfield;
|
|
}
|
|
}
|
|
$field{'subfields'}=\%subfields;
|
|
}
|
|
push (@record, \%field);
|
|
} # foreach field in record
|
|
push (@records, \@record);
|
|
# $counter++;
|
|
}
|
|
print "</pre>" if $debug;
|
|
return @records;
|
|
} # sub parsemarcfileformat
|
|
|
|
#----------------------------------------------
|
|
sub taglabel {
|
|
my ($tag)=@_;
|
|
|
|
return $tagtext{$tag};
|
|
|
|
} # sub taglabel
|
|
|
|
#---------------------------------------------
|
|
# $Log$
|
|
# Revision 1.2 2002/07/02 20:30:15 tonnesen
|
|
# Merged SimpleMarc.pm over from rel-1-2
|
|
#
|
|
# Revision 1.1.2.4 2002/06/28 14:36:47 amillar
|
|
# Fix broken logic on illustrator vs. add'l author
|
|
#
|
|
# Revision 1.1.2.3 2002/06/26 20:54:32 tonnesen
|
|
# use warnings breaks on perl 5.005...
|
|
#
|
|
# Revision 1.1.2.2 2002/06/26 15:52:55 amillar
|
|
# Fix display of marc tag labels and indicators
|
|
#
|
|
# Revision 1.1.2.1 2002/06/26 07:27:35 amillar
|
|
# Moved acqui.simple MARC handling to new module SimpleMarc.pm
|
|
#
|