Koha/C4/SimpleMarc.pm
2002-07-02 20:30:15 +00:00

464 lines
14 KiB
Perl
Executable file

#!/usr/bin/perl
# $Id$
package C4::SimpleMarc;
# Routines for handling import of MARC data into Koha db
# Koha library project www.koha.org
# Licensed under the GPL
use strict;
# standard or CPAN modules used
use DBI;
# Koha modules used
use C4::Database;
require Exporter;
use vars qw($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
# set the version for version checking
$VERSION = 0.01;
@ISA = qw(Exporter);
@EXPORT = qw(
&extractmarcfields
&parsemarcfileformat
&taglabel
%tagtext
%tagmap
);
%EXPORT_TAGS = ( ); # eg: TAG => [ qw!name1 name2! ],
# your exported package globals go here,
# as well as any optionally exported functions
@EXPORT_OK = qw(
%tagtext
%tagmap
);
# non-exported package globals go here
use vars qw(@more $stuff);
# initalize package globals, first exported ones
my $Var1 = '';
my %Hashit = ();
# then the others (which are still accessible as $Some::Module::stuff)
my $stuff = '';
my @more = ();
# all file-scoped lexicals must be created before
# the functions below that use them.
# file-private lexicals go here
my $priv_var = '';
my %secret_hash = ();
# here's a file-private function as a closure,
# callable as &$priv_func; it cannot be prototyped.
my $priv_func = sub {
# stuff goes here.
};
# make all your functions, whether exported or not;
#------------------------------------------------
#------------------
# Constants
my %tagtext = (
'LDR' => 'Leader',
'001' => 'Control number',
'003' => 'Control number identifier',
'005' => 'Date and time of latest transaction',
'006' => 'Fixed-length data elements -- additional material characteristics',
'007' => 'Physical description fixed field',
'008' => 'Fixed length data elements',
'010' => 'LCCN',
'015' => 'National library CN',
'020' => 'ISBN',
'022' => 'ISSN',
'024' => 'Other standard ID',
'035' => 'System control number',
'037' => 'Source of acquisition',
'040' => 'Cataloging source',
'041' => 'Language code',
'043' => 'Geographic area code',
'043' => 'Publishing country code',
'050' => 'Library of Congress call number',
'055' => 'Canadian classification number',
'060' => 'National Library of Medicine call number',
'082' => 'Dewey decimal call number',
'100' => 'Main entry -- Personal name',
'110' => 'Main entry -- Corporate name',
'130' => 'Main entry -- Uniform title',
'240' => 'Uniform title',
'245' => 'Title statement',
'246' => 'Varying form of title',
'250' => 'Edition statement',
'256' => 'Computer file characteristics',
'260' => 'Publication, distribution, etc.',
'263' => 'Projected publication date',
'300' => 'Physical description',
'306' => 'Playing time',
'440' => 'Series statement / Added entry -- Title',
'490' => 'Series statement',
'500' => 'General note',
'504' => 'Bibliography, etc. note',
'505' => 'Formatted contents note',
'508' => 'Creation/production credits note',
'510' => 'Citation/references note',
'511' => 'Participant or performer note',
'520' => 'Summary, etc. note',
'521' => 'Target audience note (ie age)',
'530' => 'Additional physical form available note',
'538' => 'System details note',
'586' => 'Awards note',
'600' => 'Subject added entry -- Personal name',
'610' => 'Subject added entry -- Corporate name',
'650' => 'Subject added entry -- Topical term',
'651' => 'Subject added entry -- Geographic name',
'656' => 'Index term -- Occupation',
'700' => 'Added entry -- Personal name',
'710' => 'Added entry -- Corporate name',
'730' => 'Added entry -- Uniform title',
'740' => 'Added entry -- Uncontrolled related/analytical title',
'800' => 'Series added entry -- Personal name',
'830' => 'Series added entry -- Uniform title',
'852' => 'Location',
'856' => 'Electronic location and access',
);
# tag, subfield, field name, repeats, striptrailingchars
my %tagmap=(
'010'=>{'a'=>{name=> 'lccn', rpt=>0, striptrail=>' ' }},
'015'=>{'a'=>{name=> 'lccn', rpt=>0 }},
'020'=>{'a'=>{name=> 'isbn', rpt=>0 }},
'022'=>{'a'=>{name=> 'issn', rpt=>0 }},
'082'=>{'a'=>{name=> 'dewey', rpt=>0 }},
'100'=>{'a'=>{name=> 'author', rpt=>0, striptrail=>',:;/-' }},
'245'=>{'a'=>{name=> 'title', rpt=>0, striptrail=>',:;/' },
'b'=>{name=> 'subtitle', rpt=>0, striptrail=>',:;/' }},
'260'=>{'a'=>{name=> 'place', rpt=>0, striptrail=>',:;/-' },
'b'=>{name=> 'publisher', rpt=>0, striptrail=>',:;/-' },
'c'=>{name=> 'year' , rpt=>0, striptrail=>'.,:;/-' }},
'300'=>{'a'=>{name=> 'pages', rpt=>0, striptrail=>',:;/-' },
'c'=>{name=> 'size', rpt=>0, striptrail=>',:;/-' }},
'362'=>{'a'=>{name=> 'volume-number', rpt=>0 }},
'440'=>{'a'=>{name=> 'seriestitle', rpt=>0, striptrail=>',:;/' },
'v'=>{name=> 'volume-number',rpt=>0 }},
'490'=>{'a'=>{name=> 'seriestitle', rpt=>0, striptrail=>',:;/' },
'v'=>{name=> 'volume-number',rpt=>0 }},
'700'=>{'a'=>{name=> 'addtional-author-illus',rpt=>1, striptrail=>',:;/' }},
'5xx'=>{'a'=>{name=> 'notes', rpt=>1 }},
'65x'=>{'a'=>{name=> 'subject', rpt=>1, striptrail=>'.,:;/-' }},
);
#------------------
sub extractmarcfields {
use strict;
# input
my (
$record, # pointer to list of MARC field hashes.
# Example: $record->[0]->{'tag'} = '100' # Author
# $record->[0]->{'subfields'}->{'a'} = subfieldvalue
)=@_;
# return
my $bib; # pointer to hash of named output fields
# Example: $bib->{'author'} = "Twain, Mark";
my $debug=0;
my (
$field, # hash ref
$value,
$subfield, # Marc subfield [a-z]
$fieldname, # name of field "author", "title", etc.
$strip, # chars to remove from end of field
$stripregex, # reg exp pattern
);
my ($lccn, $isbn, $issn,
$publicationyear, @subjects, $subject,
$controlnumber,
$notes, $additionalauthors, $illustrator, $copyrightdate,
$s, $subdivision, $subjectsubfield,
);
print "<PRE>\n" if $debug;
if ( ref($record) eq "ARRAY" ) {
foreach $field (@$record) {
# Check each subfield in field
foreach $subfield ( keys %{$field->{subfields}} ) {
# see if it is defined in our Marc to koha mapping table
if ( $fieldname=$tagmap{ $field->{'tag'} }->{$subfield}->{name} ) {
# Yes, so keep the value
if ( ref($field->{'subfields'}->{$subfield} ) eq 'ARRAY' ) {
# if it was an array, just keep first element.
$bib->{$fieldname}=$field->{'subfields'}->{$subfield}[0];
} else {
$bib->{$fieldname}=$field->{'subfields'}->{$subfield};
} # if array
print "$field->{'tag'} $subfield $fieldname=$bib->{$fieldname}\n" if $debug;
# see if this field should have trailing chars dropped
if ($strip=$tagmap{ $field->{'tag'} }->{$subfield}->{striptrail} ) {
$strip=~s//\\/; # backquote each char
$stripregex='[ ' . $strip . ']+$'; # remove trailing spaces also
$bib->{$fieldname}=~s/$stripregex//;
# also strip leading spaces
$bib->{$fieldname}=~s/^ +//;
} # if strip
print "Found subfield $field->{'tag'} $subfield " .
"$fieldname = $bib->{$fieldname}\n" if $debug;
} # if tagmap exists
} # foreach subfield
if ($field->{'tag'} eq '001') {
$bib->{controlnumber}=$field->{'indicator'};
}
if ($field->{'tag'} eq '015') {
$bib->{lccn}=$field->{'subfields'}->{'a'};
$bib->{lccn}=~s/^\s*//;
$bib->{lccn}=~s/^C//;
($bib->{lccn}) = (split(/\s+/, $bib->{lccn}))[0];
}
if ($field->{'tag'} eq '260') {
$publicationyear=$field->{'subfields'}->{'c'};
if ($publicationyear=~/c(\d\d\d\d)/) {
$copyrightdate=$1;
}
if ($publicationyear=~/[^c](\d\d\d\d)/) {
$publicationyear=$1;
} elsif ($copyrightdate) {
$publicationyear=$copyrightdate;
} else {
$publicationyear=~/(\d\d\d\d)/;
$publicationyear=$1;
}
}
if ($field->{'tag'} eq '700') {
my $name=$field->{'subfields'}->{'a'};
if ( defined($field->{'subfields'}->{'e'})
and $field->{'subfields'}->{'e'}=~/ill/) {
$illustrator=$name;
} else {
$additionalauthors.="$name\n";
}
}
if ($field->{'tag'} =~/^5/) {
$notes.="$field->{'subfields'}->{'a'}\n";
}
if ($field->{'tag'} =~/65\d/) {
my $sub;
my $subject=$field->{'subfields'}->{'a'};
$subject=~s/\.$//;
print "Subject=$subject\n" if $debug;
foreach $subjectsubfield ( 'x','y','z' ) {
if ($subdivision=$field->{'subfields'}->{$subjectsubfield}) {
if ( ref($subdivision) eq 'ARRAY' ) {
foreach $s (@$subdivision) {
$s=~s/\.$//;
$subject.=" -- $s";
} # foreach subdivision
} else {
$subdivision=~s/\.$//;
$subject.=" -- $subdivision";
} # if array
} # if subfield exists
} # foreach subfield
print "Subject=$subject\n" if $debug;
push @subjects, $subject;
} # if tag 65x
} # foreach field
($publicationyear ) && ($bib->{publicationyear}=$publicationyear );
($copyrightdate ) && ($bib->{copyrightdate}=$copyrightdate );
($additionalauthors ) && ($bib->{additionalauthors}=$additionalauthors );
($illustrator ) && ($bib->{illustrator}=$illustrator );
($notes ) && ($bib->{notes}=$notes );
($#subjects ) && ($bib->{subject}=\@subjects );
# Misc cleanup
if ($bib->{dewey}) {
$bib->{dewey}=~s/\///g; # drop any slashes
}
if ($bib->{lccn}) {
($bib->{lccn}) = (split(/\s+/, $bib->{lccn}))[0]; # only keep first word
}
if ( $bib->{isbn} ) {
$bib->{isbn}=~s/[^\d]*//g; # drop non-digits
};
if ( $bib->{issn} ) {
$bib->{issn}=~s/^\s*//;
($bib->{issn}) = (split(/\s+/, $bib->{issn}))[0];
};
if ( $bib->{'volume-number'} ) {
if ($bib->{'volume-number'}=~/(\d+).*(\d+)/ ) {
$bib->{'volume'}=$1;
$bib->{'number'}=$2;
} else {
$bib->{volume}=$bib->{'volume-number'};
}
delete $bib->{'volume-number'};
} # if volume-number
} else {
print "Error: extractmarcfields: input ref $record is " .
ref($record) . " not ARRAY. Contact sysadmin.\n";
}
print "</PRE>\n" if $debug;
return $bib;
} # sub extractmarcfields
#---------------------------------
#--------------------------
# Parse MARC data in file format with control-character separators
# May be multiple records.
sub parsemarcfileformat {
use strict;
# Input is one big text string
my $data=shift;
# Output is list of records. Each record is list of field hashes
my @records;
my $splitchar=chr(29);
my $splitchar2=chr(30);
my $splitchar3=chr(31);
my $debug=0;
my $record;
foreach $record (split(/$splitchar/, $data)) {
my @record;
my $directory=0;
my $tagcounter=0;
my %tag;
my $field;
my $leader=substr($record,0,24);
print "<pre>parse Leader:$leader</pre>\n" if $debug;
push (@record, {
'tag' => 'LDR',
'indicator' => $leader ,
} );
$record=substr($record,24);
foreach $field (split(/$splitchar2/, $record)) {
my %field;
my $tag;
my $indicator;
unless ($directory) {
# If we didn't already find a directory, extract one.
$directory=$field;
my $itemcounter=1;
my $counter2=0;
my $item;
my $length;
my $start;
while ($item=substr($directory,0,12)) {
# Pull out location of first field
$tag=substr($directory,0,3);
$length=substr($directory,3,4);
$start=substr($directory,7,6);
# Bump to next directory entry
$directory=substr($directory,12);
$tag{$counter2}=$tag;
$counter2++;
}
$directory=1;
next;
}
$tag=$tag{$tagcounter};
$tagcounter++;
$field{'tag'}=$tag;
my @subfields=split(/$splitchar3/, $field);
$indicator=$subfields[0];
$field{'indicator'}=$indicator;
print "<pre>parse indicator:$indicator</pre>\n" if $debug;
my $firstline=1;
unless ($#subfields==0) {
my %subfields;
my @subfieldlist;
my $i;
for ($i=1; $i<=$#subfields; $i++) {
my $text=$subfields[$i];
my $subfieldcode=substr($text,0,1);
my $subfield=substr($text,1);
# if this subfield already exists, do array
if ($subfields{$subfieldcode}) {
my $subfieldlist=$subfields{$subfieldcode};
if ( ref($subfieldlist) eq 'ARRAY' ) {
# Already an array, add on to it
print "$tag Adding to array $subfieldcode -- $subfield<br>\n" if $debug;
@subfieldlist=@$subfieldlist;
push (@subfieldlist, $subfield);
} else {
# Change simple value to array
print "$tag Arraying $subfieldcode -- $subfield<br>\n" if $debug;
@subfieldlist=($subfields{$subfieldcode}, $subfield);
}
# keep new array
$subfields{$subfieldcode}=\@subfieldlist;
} else {
# subfield doesn't exist yet, keep simple value
$subfields{$subfieldcode}=$subfield;
}
}
$field{'subfields'}=\%subfields;
}
push (@record, \%field);
} # foreach field in record
push (@records, \@record);
# $counter++;
}
print "</pre>" if $debug;
return @records;
} # sub parsemarcfileformat
#----------------------------------------------
sub taglabel {
my ($tag)=@_;
return $tagtext{$tag};
} # sub taglabel
#---------------------------------------------
# $Log$
# Revision 1.2 2002/07/02 20:30:15 tonnesen
# Merged SimpleMarc.pm over from rel-1-2
#
# Revision 1.1.2.4 2002/06/28 14:36:47 amillar
# Fix broken logic on illustrator vs. add'l author
#
# Revision 1.1.2.3 2002/06/26 20:54:32 tonnesen
# use warnings breaks on perl 5.005...
#
# Revision 1.1.2.2 2002/06/26 15:52:55 amillar
# Fix display of marc tag labels and indicators
#
# Revision 1.1.2.1 2002/06/26 07:27:35 amillar
# Moved acqui.simple MARC handling to new module SimpleMarc.pm
#