From dcfb3b835ca59b0dc3744ed35d11db85f41aaa06 Mon Sep 17 00:00:00 2001 From: tonnesen Date: Tue, 2 Jul 2002 20:30:15 +0000 Subject: [PATCH] Merged SimpleMarc.pm over from rel-1-2 --- C4/SimpleMarc.pm | 464 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 464 insertions(+) create mode 100755 C4/SimpleMarc.pm diff --git a/C4/SimpleMarc.pm b/C4/SimpleMarc.pm new file mode 100755 index 0000000000..65b7798e2e --- /dev/null +++ b/C4/SimpleMarc.pm @@ -0,0 +1,464 @@ +#!/usr/bin/perl + +# $Id$ + +package C4::SimpleMarc; + +# Routines for handling import of MARC data into Koha db + +# Koha library project www.koha.org + +# Licensed under the GPL + +use strict; + +# standard or CPAN modules used +use DBI; + +# Koha modules used +use C4::Database; + +require Exporter; + +use vars qw($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS); + +# set the version for version checking +$VERSION = 0.01; + +@ISA = qw(Exporter); +@EXPORT = qw( + &extractmarcfields + &parsemarcfileformat + &taglabel + %tagtext + %tagmap +); +%EXPORT_TAGS = ( ); # eg: TAG => [ qw!name1 name2! ], + +# your exported package globals go here, +# as well as any optionally exported functions + +@EXPORT_OK = qw( + %tagtext + %tagmap +); + +# non-exported package globals go here +use vars qw(@more $stuff); + +# initalize package globals, first exported ones + +my $Var1 = ''; +my %Hashit = (); + +# then the others (which are still accessible as $Some::Module::stuff) +my $stuff = ''; +my @more = (); + +# all file-scoped lexicals must be created before +# the functions below that use them. + +# file-private lexicals go here +my $priv_var = ''; +my %secret_hash = (); + +# here's a file-private function as a closure, +# callable as &$priv_func; it cannot be prototyped. +my $priv_func = sub { + # stuff goes here. + }; + +# make all your functions, whether exported or not; +#------------------------------------------------ + +#------------------ +# Constants + +my %tagtext = ( + 'LDR' => 'Leader', + '001' => 'Control number', + '003' => 'Control number identifier', + '005' => 'Date and time of latest transaction', + '006' => 'Fixed-length data elements -- additional material characteristics', + '007' => 'Physical description fixed field', + '008' => 'Fixed length data elements', + '010' => 'LCCN', + '015' => 'National library CN', + '020' => 'ISBN', + '022' => 'ISSN', + '024' => 'Other standard ID', + '035' => 'System control number', + '037' => 'Source of acquisition', + '040' => 'Cataloging source', + '041' => 'Language code', + '043' => 'Geographic area code', + '043' => 'Publishing country code', + '050' => 'Library of Congress call number', + '055' => 'Canadian classification number', + '060' => 'National Library of Medicine call number', + '082' => 'Dewey decimal call number', + '100' => 'Main entry -- Personal name', + '110' => 'Main entry -- Corporate name', + '130' => 'Main entry -- Uniform title', + '240' => 'Uniform title', + '245' => 'Title statement', + '246' => 'Varying form of title', + '250' => 'Edition statement', + '256' => 'Computer file characteristics', + '260' => 'Publication, distribution, etc.', + '263' => 'Projected publication date', + '300' => 'Physical description', + '306' => 'Playing time', + '440' => 'Series statement / Added entry -- Title', + '490' => 'Series statement', + '500' => 'General note', + '504' => 'Bibliography, etc. note', + '505' => 'Formatted contents note', + '508' => 'Creation/production credits note', + '510' => 'Citation/references note', + '511' => 'Participant or performer note', + '520' => 'Summary, etc. note', + '521' => 'Target audience note (ie age)', + '530' => 'Additional physical form available note', + '538' => 'System details note', + '586' => 'Awards note', + '600' => 'Subject added entry -- Personal name', + '610' => 'Subject added entry -- Corporate name', + '650' => 'Subject added entry -- Topical term', + '651' => 'Subject added entry -- Geographic name', + '656' => 'Index term -- Occupation', + '700' => 'Added entry -- Personal name', + '710' => 'Added entry -- Corporate name', + '730' => 'Added entry -- Uniform title', + '740' => 'Added entry -- Uncontrolled related/analytical title', + '800' => 'Series added entry -- Personal name', + '830' => 'Series added entry -- Uniform title', + '852' => 'Location', + '856' => 'Electronic location and access', +); + +# tag, subfield, field name, repeats, striptrailingchars +my %tagmap=( + '010'=>{'a'=>{name=> 'lccn', rpt=>0, striptrail=>' ' }}, + '015'=>{'a'=>{name=> 'lccn', rpt=>0 }}, + '020'=>{'a'=>{name=> 'isbn', rpt=>0 }}, + '022'=>{'a'=>{name=> 'issn', rpt=>0 }}, + '082'=>{'a'=>{name=> 'dewey', rpt=>0 }}, + '100'=>{'a'=>{name=> 'author', rpt=>0, striptrail=>',:;/-' }}, + '245'=>{'a'=>{name=> 'title', rpt=>0, striptrail=>',:;/' }, + 'b'=>{name=> 'subtitle', rpt=>0, striptrail=>',:;/' }}, + '260'=>{'a'=>{name=> 'place', rpt=>0, striptrail=>',:;/-' }, + 'b'=>{name=> 'publisher', rpt=>0, striptrail=>',:;/-' }, + 'c'=>{name=> 'year' , rpt=>0, striptrail=>'.,:;/-' }}, + '300'=>{'a'=>{name=> 'pages', rpt=>0, striptrail=>',:;/-' }, + 'c'=>{name=> 'size', rpt=>0, striptrail=>',:;/-' }}, + '362'=>{'a'=>{name=> 'volume-number', rpt=>0 }}, + '440'=>{'a'=>{name=> 'seriestitle', rpt=>0, striptrail=>',:;/' }, + 'v'=>{name=> 'volume-number',rpt=>0 }}, + '490'=>{'a'=>{name=> 'seriestitle', rpt=>0, striptrail=>',:;/' }, + 'v'=>{name=> 'volume-number',rpt=>0 }}, + '700'=>{'a'=>{name=> 'addtional-author-illus',rpt=>1, striptrail=>',:;/' }}, + '5xx'=>{'a'=>{name=> 'notes', rpt=>1 }}, + '65x'=>{'a'=>{name=> 'subject', rpt=>1, striptrail=>'.,:;/-' }}, +); + + +#------------------ +sub extractmarcfields { + use strict; + # input + my ( + $record, # pointer to list of MARC field hashes. + # Example: $record->[0]->{'tag'} = '100' # Author + # $record->[0]->{'subfields'}->{'a'} = subfieldvalue + )=@_; + + # return + my $bib; # pointer to hash of named output fields + # Example: $bib->{'author'} = "Twain, Mark"; + + my $debug=0; + + my ( + $field, # hash ref + $value, + $subfield, # Marc subfield [a-z] + $fieldname, # name of field "author", "title", etc. + $strip, # chars to remove from end of field + $stripregex, # reg exp pattern + ); + my ($lccn, $isbn, $issn, + $publicationyear, @subjects, $subject, + $controlnumber, + $notes, $additionalauthors, $illustrator, $copyrightdate, + $s, $subdivision, $subjectsubfield, + ); + + print "
\n" if $debug;
+
+    if ( ref($record) eq "ARRAY" ) {
+        foreach $field (@$record) {
+
+	    # Check each subfield in field
+	    foreach $subfield ( keys %{$field->{subfields}} ) {
+		# see if it is defined in our Marc to koha mapping table
+	    	if ( $fieldname=$tagmap{ $field->{'tag'} }->{$subfield}->{name} ) {
+		    # Yes, so keep the value
+		    if ( ref($field->{'subfields'}->{$subfield} ) eq 'ARRAY' ) {
+		        # if it was an array, just keep first element.
+		        $bib->{$fieldname}=$field->{'subfields'}->{$subfield}[0];
+		    } else {
+		        $bib->{$fieldname}=$field->{'subfields'}->{$subfield};
+		    } # if array
+		    print "$field->{'tag'} $subfield $fieldname=$bib->{$fieldname}\n" if $debug;
+		    # see if this field should have trailing chars dropped
+	    	    if ($strip=$tagmap{ $field->{'tag'} }->{$subfield}->{striptrail} ) {
+			$strip=~s//\\/; # backquote each char
+			$stripregex='[ ' . $strip . ']+$';  # remove trailing spaces also
+			$bib->{$fieldname}=~s/$stripregex//;
+			# also strip leading spaces
+			$bib->{$fieldname}=~s/^ +//;
+		    } # if strip
+		    print "Found subfield $field->{'tag'} $subfield " .
+			"$fieldname = $bib->{$fieldname}\n" if $debug;
+		} # if tagmap exists
+
+	    } # foreach subfield
+
+
+	    if ($field->{'tag'} eq '001') {
+		$bib->{controlnumber}=$field->{'indicator'};
+	    }
+	    if ($field->{'tag'} eq '015') {
+		$bib->{lccn}=$field->{'subfields'}->{'a'};
+		$bib->{lccn}=~s/^\s*//;
+		$bib->{lccn}=~s/^C//;
+		($bib->{lccn}) = (split(/\s+/, $bib->{lccn}))[0];
+	    }
+
+
+		if ($field->{'tag'} eq '260') {
+
+		    $publicationyear=$field->{'subfields'}->{'c'};
+		    if ($publicationyear=~/c(\d\d\d\d)/) {
+			$copyrightdate=$1;
+		    }
+		    if ($publicationyear=~/[^c](\d\d\d\d)/) {
+			$publicationyear=$1;
+		    } elsif ($copyrightdate) {
+			$publicationyear=$copyrightdate;
+		    } else {
+			$publicationyear=~/(\d\d\d\d)/;
+			$publicationyear=$1;
+		    }
+		}
+		if ($field->{'tag'} eq '700') {
+		    my $name=$field->{'subfields'}->{'a'};
+		    if ( defined($field->{'subfields'}->{'e'}) 
+		        and  $field->{'subfields'}->{'e'}=~/ill/) {
+			$illustrator=$name;
+		    } else {
+			$additionalauthors.="$name\n";
+		    }
+		}
+		if ($field->{'tag'} =~/^5/) {
+		    $notes.="$field->{'subfields'}->{'a'}\n";
+		}
+		if ($field->{'tag'} =~/65\d/) {
+		    my $sub;
+		    my $subject=$field->{'subfields'}->{'a'};
+		    $subject=~s/\.$//;
+		    print "Subject=$subject\n" if $debug;
+		    foreach $subjectsubfield ( 'x','y','z' ) {
+		      if ($subdivision=$field->{'subfields'}->{$subjectsubfield}) {
+			if ( ref($subdivision) eq 'ARRAY' ) {
+			    foreach $s (@$subdivision) {
+				$s=~s/\.$//;
+				$subject.=" -- $s";
+			    } # foreach subdivision
+			} else {
+			    $subdivision=~s/\.$//;
+			    $subject.=" -- $subdivision";
+			} # if array
+		      } # if subfield exists
+		    } # foreach subfield
+		    print "Subject=$subject\n" if $debug;
+		    push @subjects, $subject;
+		} # if tag 65x
+
+
+        } # foreach field
+	($publicationyear	) && ($bib->{publicationyear}=$publicationyear  );
+	($copyrightdate		) && ($bib->{copyrightdate}=$copyrightdate  );
+	($additionalauthors	) && ($bib->{additionalauthors}=$additionalauthors  );
+	($illustrator		) && ($bib->{illustrator}=$illustrator  );
+	($notes			) && ($bib->{notes}=$notes  );
+	($#subjects		) && ($bib->{subject}=\@subjects  );
+
+	# Misc cleanup
+	if ($bib->{dewey}) {
+	    $bib->{dewey}=~s/\///g;	# drop any slashes
+	}
+
+	if ($bib->{lccn}) {
+	   ($bib->{lccn}) = (split(/\s+/, $bib->{lccn}))[0]; # only keep first word
+	}
+
+	if ( $bib->{isbn} ) {
+	    $bib->{isbn}=~s/[^\d]*//g;	# drop non-digits
+	};
+
+	if ( $bib->{issn} ) {
+	    $bib->{issn}=~s/^\s*//;
+	    ($bib->{issn}) = (split(/\s+/, $bib->{issn}))[0];
+	};
+
+	if ( $bib->{'volume-number'} ) {
+	    if ($bib->{'volume-number'}=~/(\d+).*(\d+)/ ) {
+		$bib->{'volume'}=$1;
+		$bib->{'number'}=$2;
+	    } else {
+		$bib->{volume}=$bib->{'volume-number'};
+	    }
+	    delete $bib->{'volume-number'};
+	} # if volume-number
+
+    } else {
+	print "Error: extractmarcfields: input ref $record is " .
+		ref($record) . " not ARRAY. Contact sysadmin.\n";
+    }
+    print "
\n" if $debug; + + return $bib; + +} # sub extractmarcfields +#--------------------------------- + +#-------------------------- +# Parse MARC data in file format with control-character separators +# May be multiple records. +sub parsemarcfileformat { + use strict; + # Input is one big text string + my $data=shift; + # Output is list of records. Each record is list of field hashes + my @records; + + my $splitchar=chr(29); + my $splitchar2=chr(30); + my $splitchar3=chr(31); + my $debug=0; + my $record; + foreach $record (split(/$splitchar/, $data)) { + my @record; + my $directory=0; + my $tagcounter=0; + my %tag; + my $field; + + my $leader=substr($record,0,24); + print "
parse Leader:$leader
\n" if $debug; + push (@record, { + 'tag' => 'LDR', + 'indicator' => $leader , + } ); + + $record=substr($record,24); + foreach $field (split(/$splitchar2/, $record)) { + my %field; + my $tag; + my $indicator; + unless ($directory) { + # If we didn't already find a directory, extract one. + $directory=$field; + my $itemcounter=1; + my $counter2=0; + my $item; + my $length; + my $start; + while ($item=substr($directory,0,12)) { + # Pull out location of first field + $tag=substr($directory,0,3); + $length=substr($directory,3,4); + $start=substr($directory,7,6); + + # Bump to next directory entry + $directory=substr($directory,12); + $tag{$counter2}=$tag; + $counter2++; + } + $directory=1; + next; + } + $tag=$tag{$tagcounter}; + $tagcounter++; + $field{'tag'}=$tag; + my @subfields=split(/$splitchar3/, $field); + $indicator=$subfields[0]; + $field{'indicator'}=$indicator; + print "
parse indicator:$indicator
\n" if $debug; + my $firstline=1; + unless ($#subfields==0) { + my %subfields; + my @subfieldlist; + my $i; + for ($i=1; $i<=$#subfields; $i++) { + my $text=$subfields[$i]; + my $subfieldcode=substr($text,0,1); + my $subfield=substr($text,1); + # if this subfield already exists, do array + if ($subfields{$subfieldcode}) { + my $subfieldlist=$subfields{$subfieldcode}; + if ( ref($subfieldlist) eq 'ARRAY' ) { + # Already an array, add on to it + print "$tag Adding to array $subfieldcode -- $subfield
\n" if $debug; + @subfieldlist=@$subfieldlist; + push (@subfieldlist, $subfield); + } else { + # Change simple value to array + print "$tag Arraying $subfieldcode -- $subfield
\n" if $debug; + @subfieldlist=($subfields{$subfieldcode}, $subfield); + } + # keep new array + $subfields{$subfieldcode}=\@subfieldlist; + } else { + # subfield doesn't exist yet, keep simple value + $subfields{$subfieldcode}=$subfield; + } + } + $field{'subfields'}=\%subfields; + } + push (@record, \%field); + } # foreach field in record + push (@records, \@record); + # $counter++; + } + print "" if $debug; + return @records; +} # sub parsemarcfileformat + +#---------------------------------------------- +sub taglabel { + my ($tag)=@_; + + return $tagtext{$tag}; + +} # sub taglabel + +#--------------------------------------------- +# $Log$ +# Revision 1.2 2002/07/02 20:30:15 tonnesen +# Merged SimpleMarc.pm over from rel-1-2 +# +# Revision 1.1.2.4 2002/06/28 14:36:47 amillar +# Fix broken logic on illustrator vs. add'l author +# +# Revision 1.1.2.3 2002/06/26 20:54:32 tonnesen +# use warnings breaks on perl 5.005... +# +# Revision 1.1.2.2 2002/06/26 15:52:55 amillar +# Fix display of marc tag labels and indicators +# +# Revision 1.1.2.1 2002/06/26 07:27:35 amillar +# Moved acqui.simple MARC handling to new module SimpleMarc.pm +# -- 2.20.1