package C4::SimpleMarc;

# $Id$

# Routines for handling import of MARC data into Koha db

# Koha library project  www.koha.org

# Licensed under the GPL


# Copyright 2000-2002 Katipo Communications
#
# This file is part of Koha.
#
# Koha is free software; you can redistribute it and/or modify it under the
# terms of the GNU General Public License as published by the Free Software
# Foundation; either version 2 of the License, or (at your option) any later
# version.
#
# Koha is distributed in the hope that it will be useful, but WITHOUT ANY
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
# A PARTICULAR PURPOSE.  See the GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License along with
# Koha; if not, write to the Free Software Foundation, Inc., 59 Temple Place,
# Suite 330, Boston, MA  02111-1307 USA

use strict;
use DBI;
require Exporter;
use vars qw($VERSION @ISA @EXPORT @EXPORT_OK);

# set the version for version checking
$VERSION = 0.01;

=head1 NAME

C4::SimpleMarc - Functions for parsing MARC records and files

=head1 SYNOPSIS

  use C4::SimpleMarc;

=head1 DESCRIPTION

This module provides functions for parsing MARC records and files.

=head1 FUNCTIONS

=over 2

=cut

@ISA = qw(Exporter);
@EXPORT = qw(
	&extractmarcfields
	&parsemarcfileformat
	&taglabel
	%tagtext
	%tagmap
);

# FIXME - %tagtext and %tagmap are in both @EXPORT and @EXPORT_OK.
# They should be in one or the other, but not both (though preferably,
# things shouldn't get exported in the first place).
@EXPORT_OK   = qw(
	%tagtext
	%tagmap
);

#------------------------------------------------

#------------------
# Constants

# %tagtext maps MARC tags to descriptive names.
my %tagtext = (
    'LDR' => 'Leader',
    '001' => 'Control number',
    '003' => 'Control number identifier',
    '005' => 'Date and time of latest transaction',
    '006' => 'Fixed-length data elements -- additional material characteristics',
    '007' => 'Physical description fixed field',
    '008' => 'Fixed length data elements',
    '010' => 'LCCN',
    '015' => 'National library CN',
    '020' => 'ISBN',
    '022' => 'ISSN',
    '024' => 'Other standard ID',
    '035' => 'System control number',
    '037' => 'Source of acquisition',
    '040' => 'Cataloging source',
    '041' => 'Language code',
    '043' => 'Geographic area code',
    '043' => 'Publishing country code',
    '050' => 'Library of Congress call number',
    '055' => 'Canadian classification number',
    '060' => 'National Library of Medicine call number',
    '082' => 'Dewey decimal call number',
    '100' => 'Main entry -- Personal name',
    '110' => 'Main entry -- Corporate name',
    '130' => 'Main entry -- Uniform title',
    '240' => 'Uniform title',
    '245' => 'Title statement',
    '246' => 'Varying form of title',
    '250' => 'Edition statement',
    '256' => 'Computer file characteristics',
    '260' => 'Publication, distribution, etc.',
    '263' => 'Projected publication date',
    '300' => 'Physical description',
    '306' => 'Playing time',
    '440' => 'Series statement / Added entry -- Title',
    '490' => 'Series statement',
    '500' => 'General note',
    '504' => 'Bibliography, etc. note',
    '505' => 'Formatted contents note',
    '508' => 'Creation/production credits note',
    '510' => 'Citation/references note',
    '511' => 'Participant or performer note',
    '520' => 'Summary, etc. note',
    '521' => 'Target audience note (ie age)',
    '530' => 'Additional physical form available note',
    '538' => 'System details note',
    '586' => 'Awards note',
    '600' => 'Subject added entry -- Personal name',
    '610' => 'Subject added entry -- Corporate name',
    '650' => 'Subject added entry -- Topical term',
    '651' => 'Subject added entry -- Geographic name',
    '656' => 'Index term -- Occupation',
    '700' => 'Added entry -- Personal name',
    '710' => 'Added entry -- Corporate name',
    '730' => 'Added entry -- Uniform title',
    '740' => 'Added entry -- Uncontrolled related/analytical title',
    '800' => 'Series added entry -- Personal name',
    '830' => 'Series added entry -- Uniform title',
    '852' => 'Location',
    '856' => 'Electronic location and access',
);

# tag, subfield, field name, repeats, striptrailingchars
# FIXME - What is this? Can it be explained without a semester-long
# course in MARC?

# XXX - Maps MARC (field, subfield) tuples to Koha database field
# names (presumably in 'biblioitems'). $tagmap{$field}->{$subfield} is
# an anonymous hash of the form
#	{
#		name	=> "title",	# Name of Koha field
#		rpt	=> 0,		# I don't know what this is, but
#					# it's not used.
#		striptrail => ',:;/-',	# Lists the set of characters that
#					# should be stripped from the end
#					# of the MARC field.
#	}

my %tagmap=(
    '010'=>{'a'=>{name=> 'lccn',	rpt=>0, striptrail=>' ' 	}},
    '015'=>{'a'=>{name=> 'lccn',	rpt=>0	}},
    '020'=>{'a'=>{name=> 'isbn',	rpt=>0	}},
    '022'=>{'a'=>{name=> 'issn',	rpt=>0	}},
    '082'=>{'a'=>{name=> 'dewey',	rpt=>0	}},
    '100'=>{'a'=>{name=> 'author',	rpt=>0, striptrail=>',:;/-'	}},
    '245'=>{'a'=>{name=> 'title',	rpt=>0, striptrail=>',:;/'	},
            'b'=>{name=> 'subtitle',	rpt=>0, striptrail=>',:;/'	}},
    '260'=>{'a'=>{name=> 'place',	rpt=>0, striptrail=>',:;/-'	},
            'b'=>{name=> 'publisher',	rpt=>0, striptrail=>',:;/-'	},
            'c'=>{name=> 'year' ,	rpt=>0, striptrail=>'.,:;/-'	}},
    '300'=>{'a'=>{name=> 'pages',	rpt=>0, striptrail=>',:;/-'	},
            'c'=>{name=> 'size',	rpt=>0, striptrail=>',:;/-'	}},
    '362'=>{'a'=>{name=> 'volume-number',	rpt=>0	}},
    '440'=>{'a'=>{name=> 'seriestitle',	rpt=>0, striptrail=>',:;/'	},
            'v'=>{name=> 'volume-number',rpt=>0	}},
    '490'=>{'a'=>{name=> 'seriestitle',	rpt=>0, striptrail=>',:;/'	},
            'v'=>{name=> 'volume-number',rpt=>0	}},
    '700'=>{'a'=>{name=> 'addtional-author-illus',rpt=>1, striptrail=>',:;/'	}},
    '5xx'=>{'a'=>{name=> 'notes',	rpt=>1	}},
    '65x'=>{'a'=>{name=> 'subject',	rpt=>1, striptrail=>'.,:;/-'	}},
);


#------------------

=item extractmarcfields

  $biblioitem = &extractmarcfields($marc_record);

C<$marc_record> is a reference-to-array representing a MARC record;
each element is a reference-to-hash specifying a MARC field (possibly
with subfields).

C<&extractmarcfields> translates C<$marc_record> into a Koha
biblioitem. C<$biblioitem> is a reference-to-hash whose keys are named
after fields in the biblioitems table of the Koha database.

=cut
#'
# FIXME - Throughout:
#	$foo->{bar}->[baz]->{quux}
# can be rewritten as
#	$foo->{bar}[baz]{quux}
sub extractmarcfields {
    use strict;
    # input
    my (
	$record,	# pointer to list of MARC field hashes.
			# Example: $record->[0]->{'tag'} = '100' # Author
			# 	$record->[0]->{'subfields'}->{'a'} = subfieldvalue
    )=@_;

    # return
    my $bib;		# pointer to hash of named output fields
			# Example: $bib->{'author'} = "Twain, Mark";

    my $debug=0;

    my (
	$field, 	# hash ref
	$value,
	$subfield,	# Marc subfield [a-z]
	$fieldname,	# name of field "author", "title", etc.
	$strip,		# chars to remove from end of field
	$stripregex,	# reg exp pattern
    );
    my ($lccn, $isbn, $issn,
	$publicationyear, @subjects, $subject,
	$controlnumber,
	$notes, $additionalauthors, $illustrator, $copyrightdate,
	$s, $subdivision, $subjectsubfield,
    );

    print "<PRE>\n" if $debug;

    if ( ref($record) eq "ARRAY" ) {
        foreach $field (@$record) {

	    # Check each subfield in field
	    # FIXME - Would this code be more readable with
	    #	while (($subfieldname, $subfield) = each %{$field->{subfields}})
	    # ?
	    foreach $subfield ( keys %{$field->{subfields}} ) {
		# see if it is defined in our Marc to koha mapping table
		# FIXME - This if-clause takes up the entire loop.
		# This would be better rewritten as
		#	next unless defined($tagmap{...});
		# Then the body of the loop doesn't have to be
		# indented as much.
	    	if ( $fieldname=$tagmap{ $field->{'tag'} }->{$subfield}->{name} ) {
		    # Yes, so keep the value
		    if ( ref($field->{'subfields'}->{$subfield} ) eq 'ARRAY' ) {
		        # if it was an array, just keep first element.
		        $bib->{$fieldname}=$field->{'subfields'}->{$subfield}[0];
		    } else {
		        $bib->{$fieldname}=$field->{'subfields'}->{$subfield};
		    } # if array
		    print "$field->{'tag'} $subfield $fieldname=$bib->{$fieldname}\n" if $debug;
		    # see if this field should have trailing chars dropped
	    	    if ($strip=$tagmap{ $field->{'tag'} }->{$subfield}->{striptrail} ) {
			# FIXME - The next three lines can be rewritten as:
			#	$bib =~ s/[\Q$strip\E]+$//;
			$strip=~s//\\/; # backquote each char
			$stripregex='[ ' . $strip . ']+$';  # remove trailing spaces also
			$bib->{$fieldname}=~s/$stripregex//;
			# also strip leading spaces
			$bib->{$fieldname}=~s/^ +//;
		    } # if strip
		    print "Found subfield $field->{'tag'} $subfield " .
			"$fieldname = $bib->{$fieldname}\n" if $debug;
		} # if tagmap exists

	    } # foreach subfield

	    # Handle special fields and tags
	    if ($field->{'tag'} eq '001') {
		$bib->{controlnumber}=$field->{'indicator'};
	    }
	    if ($field->{'tag'} eq '015') {
		# FIXME - I think this can be rewritten as
		#	$field->{"subfields"}{"a"} =~ /^\s*C?(\S+)/ and
		#		$bib->{"lccn"} = $1;
		# This might break with invalid input, though.
		$bib->{lccn}=$field->{'subfields'}->{'a'};
		$bib->{lccn}=~s/^\s*//;
		$bib->{lccn}=~s/^C//;
		($bib->{lccn}) = (split(/\s+/, $bib->{lccn}))[0];
	    }


		# FIXME - Fix indentation
		if ($field->{'tag'} eq '260') {

		    $publicationyear=$field->{'subfields'}->{'c'};
		    # FIXME - "\d\d\d\d" can be rewritten as "\d{4}"
		    if ($publicationyear=~/c(\d\d\d\d)/) {
			$copyrightdate=$1;
		    }
		    if ($publicationyear=~/[^c](\d\d\d\d)/) {
			$publicationyear=$1;
		    } elsif ($copyrightdate) {
			$publicationyear=$copyrightdate;
		    } else {
			$publicationyear=~/(\d\d\d\d)/;
			$publicationyear=$1;
		    }
		}
		if ($field->{'tag'} eq '700') {
		    my $name=$field->{'subfields'}->{'a'};
		    if ( defined($field->{'subfields'}->{'e'})
		        and  $field->{'subfields'}->{'e'}=~/ill/) {
			$illustrator=$name;
		    } else {
			$additionalauthors.="$name\n";
		    }
		}
		if ($field->{'tag'} =~/^5/) {
		    $notes.="$field->{'subfields'}->{'a'}\n";
		}
		if ($field->{'tag'} =~/65\d/) {
		    my $sub;	# FIXME - Never used
		    my $subject=$field->{'subfields'}->{'a'};
		    $subject=~s/\.$//;
		    print "Subject=$subject\n" if $debug;
		    foreach $subjectsubfield ( 'x','y','z' ) {
		      # FIXME - $subdivision is only used in this
		      # loop. Make it 'my' here, rather than in the
		      # entire function.
		      # Ditto $subjectsubfield. Make it 'my' in the
		      # 'foreach' statement.
		      if ($subdivision=$field->{'subfields'}->{$subjectsubfield}) {
			if ( ref($subdivision) eq 'ARRAY' ) {
			    foreach $s (@$subdivision) {
				$s=~s/\.$//;
				$subject.=" -- $s";
			    } # foreach subdivision
			} else {
			    $subdivision=~s/\.$//;
			    $subject.=" -- $subdivision";
			} # if array
		      } # if subfield exists
		    } # foreach subfield
		    print "Subject=$subject\n" if $debug;
		    push @subjects, $subject;
		} # if tag 65x


        } # foreach field
        # FIXME - Why not do this up in the "Handle special fields and
        # tags" section?
	($publicationyear	) && ($bib->{publicationyear}=$publicationyear  );
	($copyrightdate		) && ($bib->{copyrightdate}=$copyrightdate  );
	($additionalauthors	) && ($bib->{additionalauthors}=$additionalauthors  );
	($illustrator		) && ($bib->{illustrator}=$illustrator  );
	($notes			) && ($bib->{notes}=$notes  );
	($#subjects		) && ($bib->{subject}=\@subjects  );
		# FIXME - This doesn't look right: for an array with
		# one element, $#subjects == 0, which is false. For an
		# array with 0 elements, $#subjects == -1, which is
		# true.

	# Misc cleanup
	if ($bib->{dewey}) {
	    $bib->{dewey}=~s/\///g;	# drop any slashes
					# FIXME - Why? Don't the
					# slashes mean something?
					# The Dewey code is NOT a number,
					# it's a string.
	}

	if ($bib->{lccn}) {
	   ($bib->{lccn}) = (split(/\s+/, $bib->{lccn}))[0]; # only keep first word
	}

	if ( $bib->{isbn} ) {
	    $bib->{isbn}=~s/[^\d]*//g;	# drop non-digits
			# FIXME - "[^\d]" can be rewritten as "\D"
			# FIXME - Does this include the check digit? If so,
			# it might be "X".
	};

	if ( $bib->{issn} ) {
	    $bib->{issn}=~s/^\s*//;
	    ($bib->{issn}) = (split(/\s+/, $bib->{issn}))[0];
	};

	if ( $bib->{'volume-number'} ) {
	    if ($bib->{'volume-number'}=~/(\d+).*(\d+)/ ) {
		$bib->{'volume'}=$1;
		$bib->{'number'}=$2;
	    } else {
		$bib->{volume}=$bib->{'volume-number'};
	    }
	    delete $bib->{'volume-number'};
	} # if volume-number

    } else {
	# FIXME - Style: this sort of error-checking should really go
	# closer to the actual test, e.g.:
	#	if (ref($record) ne "ARRAY")
	#	{
	#		die "Not an array!"
	#	}
	# then the rest of the code which follows can assume that the
	# input is good, and you don't have to indent as much.
	print "Error: extractmarcfields: input ref $record is " .
		ref($record) . " not ARRAY. Contact sysadmin.\n";
    }
    print "</PRE>\n" if $debug;

    return $bib;

} # sub extractmarcfields
#---------------------------------

#--------------------------

=item parsemarcfileformat

  @records = &parsemarcfileformat($marc_data);

Parses the contents of a MARC file.

C<$marc_data> is a string, the contents of a MARC file.
C<&parsemarcfileformat> parses this string into individual MARC
records and returns them.

C<@records> is an array of references-to-hash. Each element is a MARC
record; its keys are the MARC tags.

=cut
#'
# Parse MARC data in file format with control-character separators
#   May be multiple records.
# FIXME - Is the input ever likely to be more than a few Kb? If so, it
# might be worth changing this function to take a (read-only)
# reference-to-string, to avoid unnecessary copying.
sub parsemarcfileformat {
    use strict;
    # Input is one big text string
    my $data=shift;
    # Output is list of records.  Each record is list of field hashes
    my @records;

    my $splitchar=chr(29);	# \c]
    my $splitchar2=chr(30);	# \c^
    my $splitchar3=chr(31);	# \c_
    my $debug=0;
    my $record;
    foreach $record (split(/$splitchar/, $data)) {
	my @record;
	my $directory=0;
	my $tagcounter=0;
	my %tag;
	my $field;

	my $leader=substr($record,0,24);
	print "<pre>parse Leader:$leader</pre>\n" if $debug;
	push (@record, {
		'tag' => 'LDR',
		'indicator' => $leader ,
	} );

	$record=substr($record,24);
	foreach $field (split(/$splitchar2/, $record)) {
	    my %field;
	    my $tag;
	    my $indicator;
	    unless ($directory) {
		# If we didn't already find a directory, extract one.
		$directory=$field;
		my $itemcounter=1;
		my $counter2=0;
		my $item;
		my $length;
		my $start;
		while ($item=substr($directory,0,12)) {
		    # Pull out location of first field
		    $tag=substr($directory,0,3);
		    $length=substr($directory,3,4);
		    $start=substr($directory,7,6);

		    # Bump to next directory entry
		    $directory=substr($directory,12);
		    $tag{$counter2}=$tag;
		    $counter2++;
		}
		$directory=1;
		next;
	    }
	    $tag=$tag{$tagcounter};
	    $tagcounter++;
	    $field{'tag'}=$tag;
	    my @subfields=split(/$splitchar3/, $field);
	    $indicator=$subfields[0];
	    $field{'indicator'}=$indicator;
	    print "<pre>parse indicator:$indicator</pre>\n" if $debug;
	    my $firstline=1;
	    unless ($#subfields==0) {
		my %subfields;
		my @subfieldlist;
		my $i;
		for ($i=1; $i<=$#subfields; $i++) {
		    my $text=$subfields[$i];
		    my $subfieldcode=substr($text,0,1);
		    my $subfield=substr($text,1);
		    # if this subfield already exists, do array
		    if ($subfields{$subfieldcode}) {
			my $subfieldlist=$subfields{$subfieldcode};
			if ( ref($subfieldlist) eq 'ARRAY' ) {
                            # Already an array, add on to it
			    print "$tag Adding to array $subfieldcode -- $subfield<br>\n" if $debug;
			    @subfieldlist=@$subfieldlist;
			    push (@subfieldlist, $subfield);
			} else {
                            # Change simple value to array
			    print "$tag Arraying $subfieldcode -- $subfield<br>\n" if $debug;
			    @subfieldlist=($subfields{$subfieldcode}, $subfield);
			}
			# keep new array
			$subfields{$subfieldcode}=\@subfieldlist;
		    } else {
			# subfield doesn't exist yet, keep simple value
			$subfields{$subfieldcode}=$subfield;
		    }
		}
		$field{'subfields'}=\%subfields;
	    }
	    push (@record, \%field);
	} # foreach field in record
	push (@records, \@record);
	# $counter++;
    }
    print "</pre>" if $debug;
    return @records;
} # sub parsemarcfileformat

#----------------------------------------------

=item taglabel

  $label = &taglabel($tag);

Converts a MARC tag (a three-digit number, or "LDR") and returns a
descriptive label.

Note that although the tag looks like a number, it is treated here as
a string. Be sure to use

    $label = &taglabel("082");

and not

    $label = &taglabel(082);	# <-- Invalid octal number!

=cut
#'
# FIXME - Does this function mean that %tagtext doesn't need to be
# exported?
sub taglabel {
    my ($tag)=@_;

    return $tagtext{$tag};

} # sub taglabel

1;

#---------------------------------------------
# $Log$
# Revision 1.7  2002/10/13 08:30:38  arensb
# Deleted unused variables.
# Removed trailing whitespace.
#
# Revision 1.6  2002/10/10 04:44:28  arensb
# Added whitespace to make the POD work.
#
# Revision 1.5  2002/10/07 00:51:22  arensb
# Added POD and some comments.
#
# Revision 1.4  2002/10/05 09:53:11  arensb
# Merged with arensb-context branch: use C4::Context->dbh instead of
# &C4Connect, and generally prefer C4::Context over C4::Database.
#
# Revision 1.3.2.1  2002/10/04 02:57:38  arensb
# Removed useless "use C4::Database;" line.
#
# Revision 1.3  2002/08/14 18:12:52  tonnesen
# Added copyright statement to all .pl and .pm files
#
# Revision 1.2  2002/07/02 20:30:15  tonnesen
# Merged SimpleMarc.pm over from rel-1-2
#
# Revision 1.1.2.4  2002/06/28 14:36:47  amillar
# Fix broken logic on illustrator vs. add'l author
#
# Revision 1.1.2.3  2002/06/26 20:54:32  tonnesen
# use warnings breaks on perl 5.005...
#
# Revision 1.1.2.2  2002/06/26 15:52:55  amillar
# Fix display of marc tag labels and indicators
#
# Revision 1.1.2.1  2002/06/26 07:27:35  amillar
# Moved acqui.simple MARC handling to new module SimpleMarc.pm
#
__END__

=back

=head1 AUTHOR

Koha Developement team <info@koha.org>

=cut