C4/Ris.pm

   1 package C4::Ris;
   2
   3 # Original script :
   4 ## marc2ris: converts MARC21 and UNIMARC datasets to RIS format
   5 ##           See comments below for compliance with other MARC dialects
   6 ##
   7 ## usage: perl marc2ris < infile.marc > outfile.ris
   8 ##
   9 ## Dependencies: perl 5.6.0 or later
  10 ##               MARC::Record
  11 ##               MARC::Charset
  12 ##
  13 ## markus@mhoenicka.de 2002-11-16
  14
  15 ##   This program is free software; you can redistribute it and/or modify
  16 ##   it under the terms of the GNU General Public License as published by
  17 ##   the Free Software Foundation; either version 2 of the License, or
  18 ##   (at your option) any later version.
  19 ##
  20 ##   This program is distributed in the hope that it will be useful,
  21 ##   but WITHOUT ANY WARRANTY; without even the implied warranty of
  22 ##   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  23 ##   GNU General Public License for more details.
  24
  25 ##   You should have received a copy of the GNU General Public License
  26 ##   along with this program; if not, write to the Free Software
  27 ##   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  28
  29 ## Some background about MARC as understood by this script
  30 ## The default input format used in this script is MARC21, which
  31 ## superseded USMARC and CANMARC. The specification can be found at:
  32 ## http://lcweb.loc.gov/marc/
  33 ## UNIMARC follows the specification at:
  34 ## http://www.ifla.org/VI/3/p1996-1/sec-uni.htm
  35 ## UKMARC support is a bit shaky because there is no specification available
  36 ## for free. The wisdom used in this script was taken from a PDF document
  37 ## comparing UKMARC to MARC21 found at:
  38 ## www.bl.uk/services/bibliographic/marcchange.pdf
  39
  40
  41 # Modified 2008 by BibLibre for Koha
  42 # Modified 2011 by Catalyst
  43 # Modified 2011 by Equinox Software, Inc.
  44 #
  45 # This file is part of Koha.
  46 #
  47 # Koha is free software; you can redistribute it and/or modify it under the
  48 # terms of the GNU General Public License as published by the Free Software
  49 # Foundation; either version 2 of the License, or (at your option) any later
  50 # version.
  51 #
  52 # Koha is distributed in the hope that it will be useful, but WITHOUT ANY
  53 # WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
  54 # A PARTICULAR PURPOSE.  See the GNU General Public License for more details.
  55 #
  56 # You should have received a copy of the GNU General Public License along with
  57 # Koha; if not, write to the Free Software Foundation, Inc., 59 Temple Place,
  58 # Suite 330, Boston, MA  02111-1307 USA
  59 #
  60 #
  61
  62 #use strict;
  63 #use warnings;
  64
  65 use vars qw($VERSION @ISA @EXPORT);
  66
  67 # set the version for version checking
  68 $VERSION = 3.07.00.049;
  69
  70 @ISA = qw(Exporter);
  71
  72 # only export API methods
  73
  74 @EXPORT = qw(
  75   &marc2ris
  76 );
  77
  78
  79 =head1 marc2bibtex - Convert from UNIMARC to RIS
  80
  81   my ($ris) = marc2ris($record);
  82
  83 Returns a RIS scalar
  84
  85 C<$record> - a MARC::Record object
  86
  87 =cut
  88
  89 sub marc2ris {
  90     my ($record) = @_;
  91     my $output;
  92
  93     my $marcflavour = C4::Context->preference("marcflavour");
  94     my $intype = lc($marcflavour);
  95     my $marcprint = 0; # Debug flag;
  96
  97     # Let's redirect stdout
  98     open my $oldout, ">&STDOUT";
  99     my $outvar;
 100     close STDOUT;
 101     open STDOUT,'>', \$outvar;
 102
 103
 104         ## First we should check the character encoding. This may be
 105         ## MARC-8 or UTF-8. The former is indicated by a blank, the latter
 106         ## by 'a' at position 09 (zero-based) of the leader
 107         my $leader = $record->leader();
 108         if ($intype eq "marc21") {
 109             if ($leader =~ /^.{9}a/) {
 110                 print "<marc>---\r\n<marc>UTF-8 data\r\n" if $marcprint;
 111                 $utf = 1;
 112             }
 113             else {
 114                 print "<marc>---\r\n<marc>MARC-8 data\r\n" if $marcprint;
 115             }
 116         }
 117         ## else: other MARC formats do not specify the character encoding
 118         ## we assume it's *not* UTF-8
 119
 120         ## start RIS dataset
 121         &print_typetag($leader);
 122
 123         ## retrieve all author fields and collect them in a list
 124         my @author_fields;
 125
 126         if ($intype eq "unimarc") {
 127             ## Fields 700, 701, and 702 can contain author names
 128             @author_fields = ($record->field('700'), $record->field('701'), $record->field('702'));
 129         }
 130         else {  ## marc21, ukmarc
 131             ## Field 100 sometimes carries main author
 132             ## Field(s) 700 carry added entries - personal names
 133             @author_fields = ($record->field('100'), $record->field('700'));
 134         }
 135
 136         ## loop over all author fields
 137         foreach my $field (@author_fields) {
 138             if (length($field)) {
 139                 my $author = &get_author($field);
 140                 print "AU  - ",&charconv($author),"\r\n";
 141             }
 142         }
 143
 144         # ToDo: should we specify anonymous as author if we didn't find
 145         # one? or use one of the corporate/meeting names below?
 146
 147         ## add corporate names or meeting names as editors ??
 148         my @editor_fields;
 149
 150         if ($intype eq "unimarc") {
 151             ## Fields 710, 711, and 712 can carry corporate names
 152             ## Field(s) 720, 721, 722, 730 have additional candidates
 153             @editor_fields = ($record->field('710'), $record->field('711'), $record->field('712'), $record->field('720'), $record->field('721'), $record->field('722'), $record->field('730'));
 154         }
 155         else { ## marc21, ukmarc
 156             ## Fields 110 and 111 carry the main entries - corporate name and
 157             ## meeting name, respectively
 158             ## Field(s) 710, 711 carry added entries - personal names
 159             @editor_fields = ($record->field('110'), $record->field('111'), $record->field('710'), $record->field('711'));
 160         }
 161
 162         ## loop over all editor fields
 163         foreach my $field (@editor_fields) {
 164             if (length($field)) {
 165                 my $editor = &get_editor($field);
 166                 print "ED  - ",&charconv($editor),"\r\n";
 167             }
 168         }
 169
 170         ## get info from the title field
 171         if ($intype eq "unimarc") {
 172             &print_title($record->field('200'));
 173         }
 174         else { ## marc21, ukmarc
 175             &print_title($record->field('245'));
 176         }
 177
 178         ## series title
 179         if ($intype eq "unimarc") {
 180             &print_stitle($record->field('225'));
 181         }
 182         else { ## marc21, ukmarc
 183             &print_stitle($record->field('490'));
 184         }
 185
 186         ## ISBN/ISSN
 187         if ($intype eq "unimarc") {
 188             &print_isbn($record->field('010'));
 189             &print_issn($record->field('011'));
 190         }
 191         elsif ($intype eq "ukmarc") {
 192             &print_isbn($record->field('021'));
 193             ## this is just an assumption
 194             &print_issn($record->field('022'));
 195         }
 196         else { ## assume marc21
 197             &print_isbn($record->field('020'));
 198             &print_issn($record->field('022'));
 199         }
 200
 201         if ($intype eq "marc21") {
 202             &print_loc_callno($record->field('050'));
 203             &print_dewey($record->field('082'));
 204         }
 205         ## else: unimarc, ukmarc do not seem to store call numbers?
 206
 207         ## publication info
 208         if ($intype eq "unimarc") {
 209             &print_pubinfo($record->field('210'));
 210         }
 211         else { ## marc21, ukmarc
 212             if ($record->field('264')) {
 213                  &print_pubinfo($record->field('264'));
 214             }
 215             else {
 216             &print_pubinfo($record->field('260'));
 217             }
 218         }
 219
 220         ## 6XX fields contain KW candidates. We add all of them to a
 221         ## hash to eliminate duplicates
 222         my %kwpool;
 223
 224         if ($intype eq "unimarc") {
 225             foreach ('600', '601', '602', '604', '605', '606','607', '608', '610', '615', '620', '660'. '661', '670', '675', '676', '680', '686') {
 226                 &get_keywords(\%kwpool, "$_",$record->field($_));
 227             }
 228         }
 229         elsif ($intype eq "ukmarc") {
 230             foreach ('600', '610', '611', '630', '650', '651','653', '655', '660', '661', '668', '690', '691', '692', '695') {
 231                 &get_keywords(\%kwpool, "$_",$record->field($_));
 232             }
 233         }
 234         else { ## assume marc21
 235             foreach ('600', '610', '611', '630', '650', '651','653', '654', '655', '656', '657', '658') {
 236                 &get_keywords(\%kwpool, "$_",$record->field($_));
 237             }
 238         }
 239
 240         ## print all keywords found in the hash. The value of each hash
 241         ## entry is the number of occurrences, but we're not really interested
 242         ## in that and rather print the key
 243         while (my ($key, $value) = each %kwpool) {
 244             print "KW  - ", &charconv($key), "\r\n";
 245         }
 246
 247         ## 5XX have various candidates for notes and abstracts. We pool
 248         ## all notes-like stuff in one list.
 249         my @notepool;
 250
 251         ## these fields have notes candidates
 252         if ($intype eq "unimarc") {
 253             foreach ('300', '301', '302', '303', '304', '305', '306', '307', '308', '310', '311', '312', '313', '314', '315', '316', '317', '318', '320', '321', '322', '323', '324', '325', '326', '327', '328', '332', '333', '336', '337', '345') {
 254                 &pool_subx(\@notepool, $_, $record->field($_));
 255             }
 256         }
 257         elsif ($intype eq "ukmarc") {
 258             foreach ('500', '501', '502', '503', '504', '505', '506', '508', '514', '515', '516', '521', '524', '525', '528', '530', '531', '532', '533', '534', '535', '537', '538', '540', '541', '542', '544', '554', '555', '556', '557', '561', '563', '580', '583', '584', '586') {
 259                 &pool_subx(\@notepool, $_, $record->field($_));
 260         }
 261         }
 262         else { ## assume marc21
 263             foreach ('500', '501', '502', '504', '505', '506', '507', '508', '510', '511', '513', '514', '515', '516', '518', '521', '522', '524', '525', '526', '530', '533', '534', '535') {
 264                 &pool_subx(\@notepool, $_, $record->field($_));
 265             }
 266         }
 267
 268         my $allnotes = join "; ", @notepool;
 269
 270         if (length($allnotes) > 0) {
 271             print "N1  - ", &charconv($allnotes), "\r\n";
 272         }
 273
 274         ## 320/520 have the abstract
 275         if ($intype eq "unimarc") {
 276             &print_abstract($record->field('320'));
 277         }
 278         elsif ($intype eq "ukmarc") {
 279             &print_abstract($record->field('512'), $record->field('513'));
 280         }
 281         else { ## assume marc21
 282             &print_abstract($record->field('520'));
 283         }
 284
 285     # 856u has the URI
 286     if ($record->field('856')) {
 287         print_uri($record->field('856'));
 288     }
 289
 290         ## end RIS dataset
 291         print "ER  - \r\n";
 292
 293     # Let's re-redirect stdout
 294     close STDOUT;
 295     open STDOUT, ">&", $oldout;
 296
 297     return $outvar;
 298
 299 }
 300
 301
 302 ##********************************************************************
 303 ## print_typetag(): prints the first line of a RIS dataset including
 304 ## the preceeding newline
 305 ## Argument: the leader of a MARC dataset
 306 ## Returns: the value at leader position 06
 307 ##********************************************************************
 308 sub print_typetag {
 309   my ($leader)= @_;
 310     ## the keys of typehash are the allowed values at position 06
 311     ## of the leader of a MARC record, the values are the RIS types
 312     ## that might appropriately represent these types.
 313     my %ustypehash = (
 314                     "a" => "BOOK",
 315                     "c" => "MUSIC",
 316                     "d" => "MUSIC",
 317                     "e" => "MAP",
 318                     "f" => "MAP",
 319                     "g" => "ADVS",
 320                     "i" => "SOUND",
 321                     "j" => "SOUND",
 322                     "k" => "ART",
 323                     "m" => "DATA",
 324                     "o" => "GEN",
 325                     "p" => "GEN",
 326                     "r" => "ART",
 327                     "t" => "GEN",
 328                 );
 329
 330     my %unitypehash = (
 331                     "a" => "BOOK",
 332                     "b" => "BOOK",
 333                     "c" => "MUSIC",
 334                     "d" => "MUSIC",
 335                     "e" => "MAP",
 336                     "f" => "MAP",
 337                     "g" => "ADVS",
 338                     "i" => "SOUND",
 339                     "j" => "SOUND",
 340                     "k" => "ART",
 341                     "l" => "ELEC",
 342                     "m" => "ADVS",
 343                     "r" => "ART",
 344                 );
 345
 346     ## The type of a MARC record is found at position 06 of the leader
 347     my $typeofrecord = substr($leader, 6, 1);
 348
 349     ## ToDo: for books, field 008 positions 24-27 might have a few more
 350     ## hints
 351
 352     my %typehash;
 353
 354     ## the ukmarc here is just a guess
 355     if ($intype eq "marc21" || $intype eq "ukmarc") {
 356         %typehash = %ustypehash;
 357     }
 358     elsif ($intype eq "unimarc") {
 359         %typehash = %unitypehash;
 360     }
 361     else {
 362         ## assume MARC21 as default
 363         %typehash = %ustypehash;
 364     }
 365
 366     if (!exists $typehash{$typeofrecord}) {
 367         print "TY  - BOOK\r\n"; ## most reasonable default
 368         warn ("no type found - assume BOOK") if $marcprint;
 369     }
 370     else {
 371         print "TY  - $typehash{$typeofrecord}\r\n";
 372     }
 373
 374     ## use $typeofrecord as the return value, just in case
 375     $typeofrecord;
 376 }
 377
 378 ##********************************************************************
 379 ## normalize_author(): normalizes an authorname
 380 ## Arguments: authorname subfield a
 381 ##            authorname subfield b
 382 ##            authorname subfield c
 383 ##            name type if known: 0=direct order
 384 ##                               1=only surname or full name in
 385 ##                                 inverted order
 386 ##                               3=family, clan, dynasty name
 387 ## Returns: the normalized authorname
 388 ##********************************************************************
 389 sub normalize_author {
 390     my($rawauthora, $rawauthorb, $rawauthorc, $nametype) = @_;
 391
 392     if ($nametype == 0) {
 393         # ToDo: convert every input to Last[,(F.|First)[ (M.|Middle)[,Suffix]]]
 394         warn("name >>$rawauthora<< in direct order - leave as is") if $marcprint;
 395         return $rawauthora;
 396     }
 397     elsif ($nametype == 1) {
 398         ## start munging subfield a (the real name part)
 399         ## remove spaces after separators
 400         $rawauthora =~ s%([,.]+) *%$1%g;
 401
 402         ## remove trailing separators after spaces
 403         $rawauthora =~ s% *[,;:/]*$%%;
 404
 405         ## remove periods after a non-abbreviated name
 406         $rawauthora =~ s%(\w{2,})\.%$1%g;
 407
 408         ## start munging subfield b (something like the suffix)
 409         ## remove trailing separators after spaces
 410         $rawauthorb =~ s% *[,;:/]*$%%;
 411
 412         ## we currently ignore subfield c until someone complains
 413         if (length($rawauthorb) > 0) {
 414             return join ",", ($rawauthora, $rawauthorb);
 415         }
 416         else {
 417             return $rawauthora;
 418         }
 419     }
 420     elsif ($nametype == 3) {
 421         return $rawauthora;
 422     }
 423 }
 424
 425 ##********************************************************************
 426 ## get_author(): gets authorname info from MARC fields 100, 700
 427 ## Argument: field (100 or 700)
 428 ## Returns: an author string in the format found in the record
 429 ##********************************************************************
 430 sub get_author {
 431     my ($authorfield) = @_;
 432     my ($indicator);
 433
 434     ## the sequence of the name parts is encoded either in indicator
 435     ## 1 (marc21) or 2 (unimarc)
 436     if ($intype eq "unimarc") {
 437         $indicator = 2;
 438     }
 439     else { ## assume marc21
 440         $indicator = 1;
 441     }
 442
 443     print "<marc>:Author(Ind$indicator): ", $authorfield->indicator("$indicator"),"\r\n" if $marcprint;
 444     print "<marc>:Author(\$a): ", $authorfield->subfield('a'),"\r\n" if $marcprint;
 445     print "<marc>:Author(\$b): ", $authorfield->subfield('b'),"\r\n" if $marcprint;
 446     print "<marc>:Author(\$c): ", $authorfield->subfield('c'),"\r\n" if $marcprint;
 447     print "<marc>:Author(\$h): ", $authorfield->subfield('h'),"\r\n" if $marcprint;
 448     if ($intype eq "ukmarc") {
 449         my $authorname = $authorfield->subfield('a') . "," . $authorfield->subfield('h');
 450         normalize_author($authorname, $authorfield->subfield('b'), $authorfield->subfield('c'), $authorfield->indicator("$indicator"));
 451     }
 452     else {
 453         normalize_author($authorfield->subfield('a'), $authorfield->subfield('b'), $authorfield->subfield('c'), $authorfield->indicator("$indicator"));
 454     }
 455 }
 456
 457 ##********************************************************************
 458 ## get_editor(): gets editor info from MARC fields 110, 111, 710, 711
 459 ## Argument: field (110, 111, 710, or 711)
 460 ## Returns: an author string in the format found in the record
 461 ##********************************************************************
 462 sub get_editor {
 463     my ($editorfield) = @_;
 464
 465     if (!$editorfield) {
 466         return;
 467     }
 468     else {
 469         print "<marc>Editor(\$a): ", $editorfield->subfield('a'),"\r\n" if $marcprint;
 470         print "<marc>Editor(\$b): ", $editorfield->subfield('b'),"\r\n" if $marcprint;
 471         print "<marc>editor(\$c): ", $editorfield->subfield('c'),"\r\n" if $marcprint;
 472         return $editorfield->subfield('a');
 473     }
 474 }
 475
 476 ##********************************************************************
 477 ## print_title(): gets info from MARC field 245
 478 ## Arguments: field (245)
 479 ## Returns:
 480 ##********************************************************************
 481 sub print_title {
 482     my ($titlefield) = @_;
 483     if (!$titlefield) {
 484         print "<marc>empty title field (245)\r\n" if $marcprint;
 485         warn("empty title field (245)") if $marcprint;
 486     }
 487     else {
 488         print "<marc>Title(\$a): ",$titlefield->subfield('a'),"\r\n" if $marcprint;
 489         print "<marc>Title(\$b): ",$titlefield->subfield('b'),"\r\n" if $marcprint;
 490         print "<marc>Title(\$c): ",$titlefield->subfield('c'),"\r\n" if $marcprint;
 491
 492         ## The title is usually written in a very odd notation. The title
 493         ## proper ($a) often ends with a space followed by a separator like
 494         ## a slash or a colon. The subtitle ($b) doesn't start with a space
 495         ## so simple concatenation looks odd. We have to conditionally remove
 496         ## the separator and make sure there's a space between title and
 497         ## subtitle
 498
 499         my $clean_title = $titlefield->subfield('a');
 500
 501         my $clean_subtitle = $titlefield->subfield('b');
 502         $clean_title =~ s% *[/:;.]$%%;
 503         $clean_subtitle =~ s%^ *(.*) *[/:;.]$%$1%;
 504
 505         if (length($clean_title) > 0
 506             || (length($clean_subtitle) > 0 && $intype ne "unimarc")) {
 507             print "TI  - ", &charconv($clean_title);
 508
 509             ## subfield $b is relevant only for marc21/ukmarc
 510             if (length($clean_subtitle) > 0 && $intype ne "unimarc") {
 511                 print ": ",&charconv($clean_subtitle);
 512             }
 513             print "\r\n";
 514         }
 515
 516         ## The statement of responsibility is just this: horrors. There is
 517         ## no formal definition how authors, editors and the like should
 518         ## be written and designated. The field is free-form and resistant
 519         ## to all parsing efforts, so this information is lost on me
 520     }
 521 }
 522
 523 ##********************************************************************
 524 ## print_stitle(): prints info from series title field
 525 ## Arguments: field
 526 ## Returns:
 527 ##********************************************************************
 528 sub print_stitle {
 529     my ($titlefield) = @_;
 530
 531     if (!$titlefield) {
 532         print "<marc>empty series title field\r\n" if $marcprint;
 533     }
 534     else {
 535         print "<marc>Series title(\$a): ",$titlefield->subfield('a'),"\r\n" if $marcprint;
 536         my $clean_title = $titlefield->subfield('a');
 537
 538         $clean_title =~ s% *[/:;.]$%%;
 539
 540         if (length($clean_title) > 0) {
 541             print "T2  - ", &charconv($clean_title),"\r\n";
 542         }
 543
 544         if ($intype eq "unimarc") {
 545             print "<marc>Series vol(\$v): ",$titlefield->subfield('v'),"\r\n" if $marcprint;
 546             if (length($titlefield->subfield('v')) > 0) {
 547                 print "VL  - ", &charconv($titlefield->subfield('v')),"\r\n";
 548             }
 549         }
 550     }
 551 }
 552
 553 ##********************************************************************
 554 ## print_isbn(): gets info from MARC field 020
 555 ## Arguments: field (020)
 556 ##********************************************************************
 557 sub print_isbn {
 558     my($isbnfield) = @_;
 559
 560     if (!$isbnfield || length ($isbnfield->subfield('a')) == 0) {
 561         print "<marc>no isbn found (020\$a)\r\n" if $marcprint;
 562         warn("no isbn found") if $marcprint;
 563     }
 564     else {
 565         if (length ($isbnfield->subfield('a')) < 10) {
 566             print "<marc>truncated isbn (020\$a)\r\n" if $marcprint;
 567             warn("truncated isbn") if $marcprint;
 568         }
 569
 570         my $isbn = substr($isbnfield->subfield('a'), 0, 10);
 571         print "SN  - ", &charconv($isbn), "\r\n";
 572     }
 573 }
 574
 575 ##********************************************************************
 576 ## print_issn(): gets info from MARC field 022
 577 ## Arguments: field (022)
 578 ##********************************************************************
 579 sub print_issn {
 580     my($issnfield) = @_;
 581
 582     if (!$issnfield || length ($issnfield->subfield('a')) == 0) {
 583         print "<marc>no issn found (022\$a)\r\n" if $marcprint;
 584         warn("no issn found") if $marcprint;
 585     }
 586     else {
 587         if (length ($issnfield->subfield('a')) < 9) {
 588             print "<marc>truncated issn (022\$a)\r\n" if $marcprint;
 589             warn("truncated issn") if $marcprint;
 590         }
 591
 592         my $issn = substr($issnfield->subfield('a'), 0, 9);
 593         print "SN  - ", &charconv($issn), "\r\n";
 594     }
 595 }
 596
 597 ###
 598 # print_uri() prints info from 856 u
 599 ###
 600 sub print_uri {
 601     my @f856s = @_;
 602
 603     foreach my $f856 (@f856s) {
 604         if (my $uri = $f856->subfield('u')) {
 605                 print "UR  - ", charconv($uri), "\r\n";
 606         }
 607     }
 608 }
 609
 610 ##********************************************************************
 611 ## print_loc_callno(): gets info from MARC field 050
 612 ## Arguments: field (050)
 613 ##********************************************************************
 614 sub print_loc_callno {
 615     my($callnofield) = @_;
 616
 617     if (!$callnofield || length ($callnofield->subfield('a')) == 0) {
 618         print "<marc>no LOC call number found (050\$a)\r\n" if $marcprint;
 619         warn("no LOC call number found") if $marcprint;
 620     }
 621     else {
 622         print "AV  - ", &charconv($callnofield->subfield('a')), " ", &charconv($callnofield->subfield('b')), "\r\n";
 623     }
 624 }
 625
 626 ##********************************************************************
 627 ## print_dewey(): gets info from MARC field 082
 628 ## Arguments: field (082)
 629 ##********************************************************************
 630 sub print_dewey {
 631     my($deweyfield) = @_;
 632
 633     if (!$deweyfield || length ($deweyfield->subfield('a')) == 0) {
 634         print "<marc>no Dewey number found (082\$a)\r\n" if $marcprint;
 635         warn("no Dewey number found") if $marcprint;
 636     }
 637     else {
 638         print "U1  - ", &charconv($deweyfield->subfield('a')), " ", &charconv($deweyfield->subfield('2')), "\r\n";
 639     }
 640 }
 641
 642 ##********************************************************************
 643 ## print_pubinfo(): gets info from MARC field 260
 644 ## Arguments: field (260)
 645 ##********************************************************************
 646 sub print_pubinfo {
 647     my($pubinfofield) = @_;
 648
 649     if (!$pubinfofield) {
 650     print "<marc>no publication information found (260/264)\r\n" if $marcprint;
 651         warn("no publication information found") if $marcprint;
 652     }
 653     else {
 654         ## the following information is available in MARC21:
 655         ## $a place -> CY
 656         ## $b publisher -> PB
 657         ## $c date -> PY
 658         ## the corresponding subfields for UNIMARC:
 659         ## $a place -> CY
 660         ## $c publisher -> PB
 661         ## $d date -> PY
 662
 663         ## all of them are repeatable. We pool all places into a
 664         ## comma-separated list in CY. We also pool all publishers
 665         ## into a comma-separated list in PB.  We break the rule with
 666         ## the date field because this wouldn't make much sense. In
 667         ## this case, we use the first occurrence for PY, the second
 668         ## for Y2, and ignore the rest
 669
 670         my @pubsubfields = $pubinfofield->subfields();
 671         my @cities;
 672         my @publishers;
 673         my $pycounter = 0;
 674
 675         my $pubsub_place;
 676         my $pubsub_publisher;
 677         my $pubsub_date;
 678
 679         if ($intype eq "unimarc") {
 680             $pubsub_place = "a";
 681             $pubsub_publisher = "c";
 682             $pubsub_date = "d";
 683         }
 684         else { ## assume marc21
 685             $pubsub_place = "a";
 686             $pubsub_publisher = "b";
 687             $pubsub_date = "c";
 688         }
 689
 690         ## loop over all subfield list entries
 691         for my $tuple (@pubsubfields) {
 692             ## each tuple consists of the subfield code and the value
 693             if (@$tuple[0] eq $pubsub_place) {
 694                 ## strip any trailing crap
 695                 $_ = @$tuple[1];
 696                 s% *[,;:/]$%%;
 697                 ## pool all occurrences in a list
 698                 push (@cities, $_);
 699             }
 700             elsif (@$tuple[0] eq $pubsub_publisher) {
 701                 ## strip any trailing crap
 702                 $_ = @$tuple[1];
 703                 s% *[,;:/]$%%;
 704                 ## pool all occurrences in a list
 705                 push (@publishers, $_);
 706             }
 707             elsif (@$tuple[0] eq $pubsub_date) {
 708                 ## the dates are free-form, so we want to extract
 709                 ## a four-digit year and leave the rest as
 710                 ## "other info"
 711                 $protoyear = @$tuple[1];
 712                 print "<marc>Year (260\$c): $protoyear\r\n" if $marcprint;
 713
 714                 ## strip any separator chars at the end
 715                 $protoyear =~ s% *[\.;:/]*$%%;
 716
 717                 ## isolate a four-digit year. We discard anything
 718                 ## preceeding the year, but keep everything after
 719                 ## the year as other info.
 720                 $protoyear =~ s%\D*([0-9\-]{4})(.*)%$1///$2%;
 721
 722                 ## check what we've got. If there is no four-digit
 723                 ## year, make it up. If digits are replaced by '-',
 724                 ## replace those with 0s
 725
 726                 if (index($protoyear, "/") == 4) {
 727                     ## have year info
 728                     ## replace all '-' in the four-digit year
 729                     ## by '0'
 730                     substr($protoyear,0,4) =~ s!-!0!g;
 731                 }
 732                 else {
 733                     ## have no year info
 734                     print "<marc>no four-digit year found, use 0000\r\n" if $marcprint;
 735                     $protoyear = "0000///$protoyear";
 736                     warn("no four-digit year found, use 0000") if $marcprint;
 737                 }
 738
 739                 if ($pycounter == 0 && length($protoyear)) {
 740                     print "PY  - $protoyear\r\n";
 741                 }
 742                 elsif ($pycounter == 1 && length($_)) {
 743                     print "Y2  - $protoyear\r\n";
 744                 }
 745                 ## else: discard
 746             }
 747             ## else: discard
 748         }
 749
 750         ## now dump the collected CY and PB lists
 751         if (@cities > 0) {
 752             print "CY  - ", &charconv(join(", ", @cities)), "\r\n";
 753         }
 754         if (@publishers > 0) {
 755             print "PB  - ", &charconv(join(", ", @publishers)), "\r\n";
 756         }
 757     }
 758 }
 759
 760 ##********************************************************************
 761 ## get_keywords(): prints info from MARC fields 6XX
 762 ## Arguments: list of fields (6XX)
 763 ##********************************************************************
 764 sub get_keywords {
 765     my($href, $fieldname, @keywords) = @_;
 766
 767     ## a list of all possible subfields
 768     my @subfields = ('a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'x', 'y', 'z', '2', '3', '4');
 769
 770     ## loop over all 6XX fields
 771     foreach my $kwfield (@keywords) {
 772         if ($kwfield != undef) {
 773             ## authornames get special treatment
 774             if ($fieldname eq "600") {
 775                 my $val = normalize_author($kwfield->subfield('a'), $kwfield->subfield('b'), $kwfield->subfield('c'), $kwfield->indicator('1'));
 776                 ${$href}{$val} += 1;
 777                 print "<marc>Field $kwfield subfield a:", $kwfield->subfield('a'), "\r\n<marc>Field $kwfield subfield b:", $kwfield->subfield('b'), "\r\n<marc>Field $kwfield subfield c:", $kwfield->subfield('c'), "\r\n" if $marcprint;
 778             }
 779             else {
 780                 ## retrieve all available subfields
 781                 @kwsubfields = $kwfield->subfields();
 782
 783                 ## loop over all available subfield tuples
 784         foreach my $kwtuple (@kwsubfields) {
 785                     ## loop over all subfields to check
 786             foreach my $subfield (@subfields) {
 787                         ## [0] contains subfield code
 788                         if (@$kwtuple[0] eq $subfield) {
 789                             ## [1] contains value, remove trailing separators
 790                             @$kwtuple[1] =~ s% *[,;.:/]*$%%;
 791                             if (length(@$kwtuple[1]) > 0) {
 792                                 ## add to hash
 793                                 ${$href}{@$kwtuple[1]} += 1;
 794                                 print "<marc>Field $fieldname subfield $subfield:", @$kwtuple[1], "\r\n" if $marcprint;
 795                             }
 796                             ## we can leave the subfields loop here
 797                             last;
 798                         }
 799                     }
 800                 }
 801             }
 802         }
 803     }
 804 }
 805
 806 ##********************************************************************
 807 ## pool_subx(): adds contents of several subfields to a list
 808 ## Arguments: reference to a list
 809 ##            field name
 810 ##            list of fields (5XX)
 811 ##********************************************************************
 812 sub pool_subx {
 813     my($aref, $fieldname, @notefields) = @_;
 814
 815     ## we use a list that contains the interesting subfields
 816     ## for each field
 817     # ToDo: this is apparently correct only for marc21
 818     my @subfields;
 819
 820     if ($fieldname eq "500") {
 821         @subfields = ('a');
 822     }
 823     elsif ($fieldname eq "501") {
 824         @subfields = ('a');
 825     }
 826     elsif ($fieldname eq "502") {
 827         @subfields = ('a');
 828             }
 829     elsif ($fieldname eq "504") {
 830         @subfields = ('a', 'b');
 831     }
 832     elsif ($fieldname eq "505") {
 833         @subfields = ('a', 'g', 'r', 't', 'u');
 834     }
 835     elsif ($fieldname eq "506") {
 836         @subfields = ('a', 'b', 'c', 'd', 'e');
 837     }
 838     elsif ($fieldname eq "507") {
 839         @subfields = ('a', 'b');
 840     }
 841     elsif ($fieldname eq "508") {
 842         @subfields = ('a');
 843     }
 844     elsif ($fieldname eq "510") {
 845         @subfields = ('a', 'b', 'c', 'x', '3');
 846     }
 847     elsif ($fieldname eq "511") {
 848         @subfields = ('a');
 849     }
 850     elsif ($fieldname eq "513") {
 851         @subfields = ('a', 'b');
 852     }
 853     elsif ($fieldname eq "514") {
 854         @subfields = ('a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'm', 'u', 'z');
 855     }
 856     elsif ($fieldname eq "515") {
 857         @subfields = ('a');
 858     }
 859     elsif ($fieldname eq "516") {
 860         @subfields = ('a');
 861     }
 862     elsif ($fieldname eq "518") {
 863         @subfields = ('a', '3');
 864     }
 865     elsif ($fieldname eq "521") {
 866         @subfields = ('a', 'b', '3');
 867     }
 868     elsif ($fieldname eq "522") {
 869         @subfields = ('a');
 870     }
 871     elsif ($fieldname eq "524") {
 872         @subfields = ('a', '2', '3');
 873     }
 874     elsif ($fieldname eq "525") {
 875         @subfields = ('a');
 876     }
 877     elsif ($fieldname eq "526") {
 878         @subfields = ('a', 'b', 'c', 'd', 'i', 'x', 'z', '5');
 879     }
 880     elsif ($fieldname eq "530") {
 881         @subfields = ('a', 'b', 'c', 'd', 'u', '3');
 882     }
 883     elsif ($fieldname eq "533") {
 884         @subfields = ('a', 'b', 'c', 'd', 'e', 'f', 'm', 'n', '3');
 885     }
 886     elsif ($fieldname eq "534") {
 887         @subfields = ('a', 'b', 'c', 'e', 'f', 'k', 'l', 'm', 'n', 'p', 't', 'x', 'z');
 888     }
 889     elsif ($fieldname eq "535") {
 890         @subfields = ('a', 'b', 'c', 'd', 'g', '3');
 891     }
 892
 893     ## loop over all notefields
 894     foreach my $notefield (@notefields) {
 895         if ($notefield != undef) {
 896             ## retrieve all available subfield tuples
 897             @notesubfields = $notefield->subfields();
 898
 899             ## loop over all subfield tuples
 900         foreach my $notetuple (@notesubfields) {
 901                 ## loop over all subfields to check
 902         foreach my $subfield (@subfields) {
 903                     ## [0] contains subfield code
 904                     if (@$notetuple[0] eq $subfield) {
 905                         ## [1] contains value, remove trailing separators
 906                         print "<marc>field $fieldname subfield $subfield: ", @$notetuple[1], "\r\n" if $marcprint;
 907                         @$notetuple[1] =~ s% *[,;.:/]*$%%;
 908                         if (length(@$notetuple[1]) > 0) {
 909                             ## add to list
 910                             push @{$aref}, @$notetuple[1];
 911                         }
 912                         last;
 913                     }
 914                 }
 915             }
 916         }
 917     }
 918 }
 919
 920 ##********************************************************************
 921 ## print_abstract(): prints abstract fields
 922 ## Arguments: list of fields (520)
 923 ##********************************************************************
 924 sub print_abstract {
 925     # ToDo: take care of repeatable subfields
 926     my(@abfields) = @_;
 927
 928     ## we check the following subfields
 929     my @subfields = ('a', 'b');
 930
 931     ## we generate a list for all useful strings
 932     my @abstrings;
 933
 934     ## loop over all abfields
 935     foreach my $abfield (@abfields) {
 936         foreach my $field (@subfields) {
 937             if ( length( $abfield->subfield($field) ) > 0 ) {
 938                 my $ab = $abfield->subfield($field);
 939
 940                 print "<marc>field 520 subfield $field: $ab\r\n" if $marcprint;
 941
 942                 ## strip trailing separators
 943                 $ab =~ s% *[;,:./]*$%%;
 944
 945                 ## add string to the list
 946                 push( @abstrings, $ab );
 947             }
 948         }
 949     }
 950
 951     my $allabs = join "; ", @abstrings;
 952
 953     if (length($allabs) > 0) {
 954         print "N2  - ", &charconv($allabs), "\r\n";
 955     }
 956
 957 }
 958
 959
 960
 961 ##********************************************************************
 962 ## charconv(): converts to a different charset based on a global var
 963 ## Arguments: string
 964 ## Returns: string
 965 ##********************************************************************
 966 sub charconv {
 967     if ($utf) {
 968         ## return unaltered if already utf-8
 969         return @_;
 970     }
 971     elsif ($uniout eq "t") {
 972         ## convert to utf-8
 973         return marc8_to_utf8("@_");
 974     }
 975     else {
 976         ## return unaltered if no utf-8 requested
 977         return @_;
 978     }
 979 }
 980 1;