C4/SimpleMarc.pm

   1 #!/usr/bin/perl
   2
   3 # $Id$
   4
   5 package C4::SimpleMarc;
   6
   7 # Routines for handling import of MARC data into Koha db
   8
   9 # Koha library project  www.koha.org
  10
  11 # Licensed under the GPL
  12
  13 use strict;
  14
  15 # standard or CPAN modules used
  16 use DBI;
  17
  18 # Koha modules used
  19 use C4::Database;
  20
  21 require Exporter;
  22
  23 use vars qw($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
  24
  25 # set the version for version checking
  26 $VERSION = 0.01;
  27
  28 @ISA = qw(Exporter);
  29 @EXPORT = qw(
  30         &extractmarcfields
  31         &parsemarcfileformat
  32         &taglabel
  33         %tagtext
  34         %tagmap
  35 );
  36 %EXPORT_TAGS = ( );     # eg: TAG => [ qw!name1 name2! ],
  37
  38 # your exported package globals go here,
  39 # as well as any optionally exported functions
  40
  41 @EXPORT_OK   = qw(
  42         %tagtext
  43         %tagmap
  44 );
  45
  46 # non-exported package globals go here
  47 use vars qw(@more $stuff);
  48
  49 # initalize package globals, first exported ones
  50
  51 my $Var1   = '';
  52 my %Hashit = ();
  53
  54 # then the others (which are still accessible as $Some::Module::stuff)
  55 my $stuff  = '';
  56 my @more   = ();
  57
  58 # all file-scoped lexicals must be created before
  59 # the functions below that use them.
  60
  61 # file-private lexicals go here
  62 my $priv_var    = '';
  63 my %secret_hash = ();
  64
  65 # here's a file-private function as a closure,
  66 # callable as &$priv_func;  it cannot be prototyped.
  67 my $priv_func = sub {
  68   # stuff goes here.
  69   };
  70
  71 # make all your functions, whether exported or not;
  72 #------------------------------------------------
  73
  74 #------------------
  75 # Constants
  76
  77 my %tagtext = (
  78     'LDR' => 'Leader',
  79     '001' => 'Control number',
  80     '003' => 'Control number identifier',
  81     '005' => 'Date and time of latest transaction',
  82     '006' => 'Fixed-length data elements -- additional material characteristics',
  83     '007' => 'Physical description fixed field',
  84     '008' => 'Fixed length data elements',
  85     '010' => 'LCCN',
  86     '015' => 'National library CN',
  87     '020' => 'ISBN',
  88     '022' => 'ISSN',
  89     '024' => 'Other standard ID',
  90     '035' => 'System control number',
  91     '037' => 'Source of acquisition',
  92     '040' => 'Cataloging source',
  93     '041' => 'Language code',
  94     '043' => 'Geographic area code',
  95     '043' => 'Publishing country code',
  96     '050' => 'Library of Congress call number',
  97     '055' => 'Canadian classification number',
  98     '060' => 'National Library of Medicine call number',
  99     '082' => 'Dewey decimal call number',
 100     '100' => 'Main entry -- Personal name',
 101     '110' => 'Main entry -- Corporate name',
 102     '130' => 'Main entry -- Uniform title',
 103     '240' => 'Uniform title',
 104     '245' => 'Title statement',
 105     '246' => 'Varying form of title',
 106     '250' => 'Edition statement',
 107     '256' => 'Computer file characteristics',
 108     '260' => 'Publication, distribution, etc.',
 109     '263' => 'Projected publication date',
 110     '300' => 'Physical description',
 111     '306' => 'Playing time',
 112     '440' => 'Series statement / Added entry -- Title',
 113     '490' => 'Series statement',
 114     '500' => 'General note',
 115     '504' => 'Bibliography, etc. note',
 116     '505' => 'Formatted contents note',
 117     '508' => 'Creation/production credits note',
 118     '510' => 'Citation/references note',
 119     '511' => 'Participant or performer note',
 120     '520' => 'Summary, etc. note',
 121     '521' => 'Target audience note (ie age)',
 122     '530' => 'Additional physical form available note',
 123     '538' => 'System details note',
 124     '586' => 'Awards note',
 125     '600' => 'Subject added entry -- Personal name',
 126     '610' => 'Subject added entry -- Corporate name',
 127     '650' => 'Subject added entry -- Topical term',
 128     '651' => 'Subject added entry -- Geographic name',
 129     '656' => 'Index term -- Occupation',
 130     '700' => 'Added entry -- Personal name',
 131     '710' => 'Added entry -- Corporate name',
 132     '730' => 'Added entry -- Uniform title',
 133     '740' => 'Added entry -- Uncontrolled related/analytical title',
 134     '800' => 'Series added entry -- Personal name',
 135     '830' => 'Series added entry -- Uniform title',
 136     '852' => 'Location',
 137     '856' => 'Electronic location and access',
 138 );
 139
 140 # tag, subfield, field name, repeats, striptrailingchars
 141 my %tagmap=(
 142     '010'=>{'a'=>{name=> 'lccn',        rpt=>0, striptrail=>' '         }},
 143     '015'=>{'a'=>{name=> 'lccn',        rpt=>0  }},
 144     '020'=>{'a'=>{name=> 'isbn',        rpt=>0  }},
 145     '022'=>{'a'=>{name=> 'issn',        rpt=>0  }},
 146     '082'=>{'a'=>{name=> 'dewey',       rpt=>0  }},
 147     '100'=>{'a'=>{name=> 'author',      rpt=>0, striptrail=>',:;/-'     }},
 148     '245'=>{'a'=>{name=> 'title',       rpt=>0, striptrail=>',:;/'      },
 149             'b'=>{name=> 'subtitle',    rpt=>0, striptrail=>',:;/'      }},
 150     '260'=>{'a'=>{name=> 'place',       rpt=>0, striptrail=>',:;/-'     },
 151             'b'=>{name=> 'publisher',   rpt=>0, striptrail=>',:;/-'     },
 152             'c'=>{name=> 'year' ,       rpt=>0, striptrail=>'.,:;/-'    }},
 153     '300'=>{'a'=>{name=> 'pages',       rpt=>0, striptrail=>',:;/-'     },
 154             'c'=>{name=> 'size',        rpt=>0, striptrail=>',:;/-'     }},
 155     '362'=>{'a'=>{name=> 'volume-number',       rpt=>0  }},
 156     '440'=>{'a'=>{name=> 'seriestitle', rpt=>0, striptrail=>',:;/'      },
 157             'v'=>{name=> 'volume-number',rpt=>0 }},
 158     '490'=>{'a'=>{name=> 'seriestitle', rpt=>0, striptrail=>',:;/'      },
 159             'v'=>{name=> 'volume-number',rpt=>0 }},
 160     '700'=>{'a'=>{name=> 'addtional-author-illus',rpt=>1, striptrail=>',:;/'    }},
 161     '5xx'=>{'a'=>{name=> 'notes',       rpt=>1  }},
 162     '65x'=>{'a'=>{name=> 'subject',     rpt=>1, striptrail=>'.,:;/-'    }},
 163 );
 164
 165
 166 #------------------
 167 sub extractmarcfields {
 168     use strict;
 169     # input
 170     my (
 171         $record,        # pointer to list of MARC field hashes.
 172                         # Example: $record->[0]->{'tag'} = '100' # Author
 173                         #       $record->[0]->{'subfields'}->{'a'} = subfieldvalue
 174     )=@_;
 175
 176     # return
 177     my $bib;            # pointer to hash of named output fields
 178                         # Example: $bib->{'author'} = "Twain, Mark";
 179
 180     my $debug=0;
 181
 182     my (
 183         $field,         # hash ref
 184         $value,
 185         $subfield,      # Marc subfield [a-z]
 186         $fieldname,     # name of field "author", "title", etc.
 187         $strip,         # chars to remove from end of field
 188         $stripregex,    # reg exp pattern
 189     );
 190     my ($lccn, $isbn, $issn,
 191         $publicationyear, @subjects, $subject,
 192         $controlnumber,
 193         $notes, $additionalauthors, $illustrator, $copyrightdate,
 194         $s, $subdivision, $subjectsubfield,
 195     );
 196
 197     print "<PRE>\n" if $debug;
 198
 199     if ( ref($record) eq "ARRAY" ) {
 200         foreach $field (@$record) {
 201
 202             # Check each subfield in field
 203             foreach $subfield ( keys %{$field->{subfields}} ) {
 204                 # see if it is defined in our Marc to koha mapping table
 205                 if ( $fieldname=$tagmap{ $field->{'tag'} }->{$subfield}->{name} ) {
 206                     # Yes, so keep the value
 207                     if ( ref($field->{'subfields'}->{$subfield} ) eq 'ARRAY' ) {
 208                         # if it was an array, just keep first element.
 209                         $bib->{$fieldname}=$field->{'subfields'}->{$subfield}[0];
 210                     } else {
 211                         $bib->{$fieldname}=$field->{'subfields'}->{$subfield};
 212                     } # if array
 213                     print "$field->{'tag'} $subfield $fieldname=$bib->{$fieldname}\n" if $debug;
 214                     # see if this field should have trailing chars dropped
 215                     if ($strip=$tagmap{ $field->{'tag'} }->{$subfield}->{striptrail} ) {
 216                         $strip=~s//\\/; # backquote each char
 217                         $stripregex='[ ' . $strip . ']+$';  # remove trailing spaces also
 218                         $bib->{$fieldname}=~s/$stripregex//;
 219                         # also strip leading spaces
 220                         $bib->{$fieldname}=~s/^ +//;
 221                     } # if strip
 222                     print "Found subfield $field->{'tag'} $subfield " .
 223                         "$fieldname = $bib->{$fieldname}\n" if $debug;
 224                 } # if tagmap exists
 225
 226             } # foreach subfield
 227
 228
 229             if ($field->{'tag'} eq '001') {
 230                 $bib->{controlnumber}=$field->{'indicator'};
 231             }
 232             if ($field->{'tag'} eq '015') {
 233                 $bib->{lccn}=$field->{'subfields'}->{'a'};
 234                 $bib->{lccn}=~s/^\s*//;
 235                 $bib->{lccn}=~s/^C//;
 236                 ($bib->{lccn}) = (split(/\s+/, $bib->{lccn}))[0];
 237             }
 238
 239
 240                 if ($field->{'tag'} eq '260') {
 241
 242                     $publicationyear=$field->{'subfields'}->{'c'};
 243                     if ($publicationyear=~/c(\d\d\d\d)/) {
 244                         $copyrightdate=$1;
 245                     }
 246                     if ($publicationyear=~/[^c](\d\d\d\d)/) {
 247                         $publicationyear=$1;
 248                     } elsif ($copyrightdate) {
 249                         $publicationyear=$copyrightdate;
 250                     } else {
 251                         $publicationyear=~/(\d\d\d\d)/;
 252                         $publicationyear=$1;
 253                     }
 254                 }
 255                 if ($field->{'tag'} eq '700') {
 256                     my $name=$field->{'subfields'}->{'a'};
 257                     if ( defined($field->{'subfields'}->{'e'})
 258                         and  $field->{'subfields'}->{'e'}=~/ill/) {
 259                         $illustrator=$name;
 260                     } else {
 261                         $additionalauthors.="$name\n";
 262                     }
 263                 }
 264                 if ($field->{'tag'} =~/^5/) {
 265                     $notes.="$field->{'subfields'}->{'a'}\n";
 266                 }
 267                 if ($field->{'tag'} =~/65\d/) {
 268                     my $sub;
 269                     my $subject=$field->{'subfields'}->{'a'};
 270                     $subject=~s/\.$//;
 271                     print "Subject=$subject\n" if $debug;
 272                     foreach $subjectsubfield ( 'x','y','z' ) {
 273                       if ($subdivision=$field->{'subfields'}->{$subjectsubfield}) {
 274                         if ( ref($subdivision) eq 'ARRAY' ) {
 275                             foreach $s (@$subdivision) {
 276                                 $s=~s/\.$//;
 277                                 $subject.=" -- $s";
 278                             } # foreach subdivision
 279                         } else {
 280                             $subdivision=~s/\.$//;
 281                             $subject.=" -- $subdivision";
 282                         } # if array
 283                       } # if subfield exists
 284                     } # foreach subfield
 285                     print "Subject=$subject\n" if $debug;
 286                     push @subjects, $subject;
 287                 } # if tag 65x
 288
 289
 290         } # foreach field
 291         ($publicationyear       ) && ($bib->{publicationyear}=$publicationyear  );
 292         ($copyrightdate         ) && ($bib->{copyrightdate}=$copyrightdate  );
 293         ($additionalauthors     ) && ($bib->{additionalauthors}=$additionalauthors  );
 294         ($illustrator           ) && ($bib->{illustrator}=$illustrator  );
 295         ($notes                 ) && ($bib->{notes}=$notes  );
 296         ($#subjects             ) && ($bib->{subject}=\@subjects  );
 297
 298         # Misc cleanup
 299         if ($bib->{dewey}) {
 300             $bib->{dewey}=~s/\///g;     # drop any slashes
 301         }
 302
 303         if ($bib->{lccn}) {
 304            ($bib->{lccn}) = (split(/\s+/, $bib->{lccn}))[0]; # only keep first word
 305         }
 306
 307         if ( $bib->{isbn} ) {
 308             $bib->{isbn}=~s/[^\d]*//g;  # drop non-digits
 309         };
 310
 311         if ( $bib->{issn} ) {
 312             $bib->{issn}=~s/^\s*//;
 313             ($bib->{issn}) = (split(/\s+/, $bib->{issn}))[0];
 314         };
 315
 316         if ( $bib->{'volume-number'} ) {
 317             if ($bib->{'volume-number'}=~/(\d+).*(\d+)/ ) {
 318                 $bib->{'volume'}=$1;
 319                 $bib->{'number'}=$2;
 320             } else {
 321                 $bib->{volume}=$bib->{'volume-number'};
 322             }
 323             delete $bib->{'volume-number'};
 324         } # if volume-number
 325
 326     } else {
 327         print "Error: extractmarcfields: input ref $record is " .
 328                 ref($record) . " not ARRAY. Contact sysadmin.\n";
 329     }
 330     print "</PRE>\n" if $debug;
 331
 332     return $bib;
 333
 334 } # sub extractmarcfields
 335 #---------------------------------
 336
 337 #--------------------------
 338 # Parse MARC data in file format with control-character separators
 339 #   May be multiple records.
 340 sub parsemarcfileformat {
 341     use strict;
 342     # Input is one big text string
 343     my $data=shift;
 344     # Output is list of records.  Each record is list of field hashes
 345     my @records;
 346
 347     my $splitchar=chr(29);
 348     my $splitchar2=chr(30);
 349     my $splitchar3=chr(31);
 350     my $debug=0;
 351     my $record;
 352     foreach $record (split(/$splitchar/, $data)) {
 353         my @record;
 354         my $directory=0;
 355         my $tagcounter=0;
 356         my %tag;
 357         my $field;
 358
 359         my $leader=substr($record,0,24);
 360         print "<pre>parse Leader:$leader</pre>\n" if $debug;
 361         push (@record, {
 362                 'tag' => 'LDR',
 363                 'indicator' => $leader ,
 364         } );
 365
 366         $record=substr($record,24);
 367         foreach $field (split(/$splitchar2/, $record)) {
 368             my %field;
 369             my $tag;
 370             my $indicator;
 371             unless ($directory) {
 372                 # If we didn't already find a directory, extract one.
 373                 $directory=$field;
 374                 my $itemcounter=1;
 375                 my $counter2=0;
 376                 my $item;
 377                 my $length;
 378                 my $start;
 379                 while ($item=substr($directory,0,12)) {
 380                     # Pull out location of first field
 381                     $tag=substr($directory,0,3);
 382                     $length=substr($directory,3,4);
 383                     $start=substr($directory,7,6);
 384
 385                     # Bump to next directory entry
 386                     $directory=substr($directory,12);
 387                     $tag{$counter2}=$tag;
 388                     $counter2++;
 389                 }
 390                 $directory=1;
 391                 next;
 392             }
 393             $tag=$tag{$tagcounter};
 394             $tagcounter++;
 395             $field{'tag'}=$tag;
 396             my @subfields=split(/$splitchar3/, $field);
 397             $indicator=$subfields[0];
 398             $field{'indicator'}=$indicator;
 399             print "<pre>parse indicator:$indicator</pre>\n" if $debug;
 400             my $firstline=1;
 401             unless ($#subfields==0) {
 402                 my %subfields;
 403                 my @subfieldlist;
 404                 my $i;
 405                 for ($i=1; $i<=$#subfields; $i++) {
 406                     my $text=$subfields[$i];
 407                     my $subfieldcode=substr($text,0,1);
 408                     my $subfield=substr($text,1);
 409                     # if this subfield already exists, do array
 410                     if ($subfields{$subfieldcode}) {
 411                         my $subfieldlist=$subfields{$subfieldcode};
 412                         if ( ref($subfieldlist) eq 'ARRAY' ) {
 413                             # Already an array, add on to it
 414                             print "$tag Adding to array $subfieldcode -- $subfield<br>\n" if $debug;
 415                             @subfieldlist=@$subfieldlist;
 416                             push (@subfieldlist, $subfield);
 417                         } else {
 418                             # Change simple value to array
 419                             print "$tag Arraying $subfieldcode -- $subfield<br>\n" if $debug;
 420                             @subfieldlist=($subfields{$subfieldcode}, $subfield);
 421                         }
 422                         # keep new array
 423                         $subfields{$subfieldcode}=\@subfieldlist;
 424                     } else {
 425                         # subfield doesn't exist yet, keep simple value
 426                         $subfields{$subfieldcode}=$subfield;
 427                     }
 428                 }
 429                 $field{'subfields'}=\%subfields;
 430             }
 431             push (@record, \%field);
 432         } # foreach field in record
 433         push (@records, \@record);
 434         # $counter++;
 435     }
 436     print "</pre>" if $debug;
 437     return @records;
 438 } # sub parsemarcfileformat
 439
 440 #----------------------------------------------
 441 sub taglabel {
 442     my ($tag)=@_;
 443
 444     return $tagtext{$tag};
 445
 446 } # sub taglabel
 447
 448 #---------------------------------------------
 449 # $Log$
 450 # Revision 1.2  2002/07/02 20:30:15  tonnesen
 451 # Merged SimpleMarc.pm over from rel-1-2
 452 #
 453 # Revision 1.1.2.4  2002/06/28 14:36:47  amillar
 454 # Fix broken logic on illustrator vs. add'l author
 455 #
 456 # Revision 1.1.2.3  2002/06/26 20:54:32  tonnesen
 457 # use warnings breaks on perl 5.005...
 458 #
 459 # Revision 1.1.2.2  2002/06/26 15:52:55  amillar
 460 # Fix display of marc tag labels and indicators
 461 #
 462 # Revision 1.1.2.1  2002/06/26 07:27:35  amillar
 463 # Moved acqui.simple MARC handling to new module SimpleMarc.pm
 464 #