C4/Ris.pm

   1 package C4::Ris;
   2
   3 # Original script :
   4 ## marc2ris: converts MARC21 and UNIMARC datasets to RIS format
   5 ##           See comments below for compliance with other MARC dialects
   6 ##
   7 ## usage: perl marc2ris < infile.marc > outfile.ris
   8 ##
   9 ## Dependencies: perl 5.6.0 or later
  10 ##               MARC::Record
  11 ##               MARC::Charset
  12 ##
  13 ## markus@mhoenicka.de 2002-11-16
  14
  15 ##   This program is free software; you can redistribute it and/or modify
  16 ##   it under the terms of the GNU General Public License as published by
  17 ##   the Free Software Foundation; either version 2 of the License, or
  18 ##   (at your option) any later version.
  19 ##
  20 ##   This program is distributed in the hope that it will be useful,
  21 ##   but WITHOUT ANY WARRANTY; without even the implied warranty of
  22 ##   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  23 ##   GNU General Public License for more details.
  24
  25 ##   You should have received a copy of the GNU General Public License
  26 ##   along with this program; if not, write to the Free Software
  27 ##   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  28
  29 ## Some background about MARC as understood by this script
  30 ## The default input format used in this script is MARC21, which
  31 ## superseded USMARC and CANMARC. The specification can be found at:
  32 ## http://lcweb.loc.gov/marc/
  33 ## UNIMARC follows the specification at:
  34 ## http://www.ifla.org/VI/3/p1996-1/sec-uni.htm
  35 ## UKMARC support is a bit shaky because there is no specification available
  36 ## for free. The wisdom used in this script was taken from a PDF document
  37 ## comparing UKMARC to MARC21 found at:
  38 ## www.bl.uk/services/bibliographic/marcchange.pdf
  39
  40
  41 # Modified 2008 by BibLibre for Koha
  42 # Modified 2011 by Catalyst
  43 #
  44 # This file is part of Koha.
  45 #
  46 # Koha is free software; you can redistribute it and/or modify it under the
  47 # terms of the GNU General Public License as published by the Free Software
  48 # Foundation; either version 2 of the License, or (at your option) any later
  49 # version.
  50 #
  51 # Koha is distributed in the hope that it will be useful, but WITHOUT ANY
  52 # WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
  53 # A PARTICULAR PURPOSE.  See the GNU General Public License for more details.
  54 #
  55 # You should have received a copy of the GNU General Public License along with
  56 # Koha; if not, write to the Free Software Foundation, Inc., 59 Temple Place,
  57 # Suite 330, Boston, MA  02111-1307 USA
  58 #
  59 #
  60
  61 #use strict;
  62 #use warnings; FIXME - Bug 2505
  63
  64 use vars qw($VERSION @ISA @EXPORT);
  65
  66 # set the version for version checking
  67 $VERSION = 3.00;
  68
  69 @ISA = qw(Exporter);
  70
  71 # only export API methods
  72
  73 @EXPORT = qw(
  74   &marc2ris
  75 );
  76
  77
  78 =head1 marc2bibtex - Convert from UNIMARC to RIS
  79
  80   my ($ris) = marc2ris($record);
  81
  82 Returns a RIS scalar
  83
  84 C<$record> - a MARC::Record object
  85
  86 =cut
  87
  88 sub marc2ris {
  89     my ($record) = @_;
  90     my $output;
  91
  92     my $marcflavour = C4::Context->preference("marcflavour");
  93     my $intype = lc($marcflavour);
  94     my $marcprint = 0; # Debug flag;
  95
  96     # Let's redirect stdout
  97     open my $oldout, ">&STDOUT";
  98     my $outvar;
  99     close STDOUT;
 100     open STDOUT,'>', \$outvar;
 101
 102
 103         ## First we should check the character encoding. This may be
 104         ## MARC-8 or UTF-8. The former is indicated by a blank, the latter
 105         ## by 'a' at position 09 (zero-based) of the leader
 106         my $leader = $record->leader();
 107         if ($intype eq "marc21") {
 108             if ($leader =~ /^.{9}a/) {
 109                 print "<marc>---\r\n<marc>UTF-8 data\r\n" if $marcprint;
 110                 $utf = 1;
 111             }
 112             else {
 113                 print "<marc>---\r\n<marc>MARC-8 data\r\n" if $marcprint;
 114             }
 115         }
 116         ## else: other MARC formats do not specify the character encoding
 117         ## we assume it's *not* UTF-8
 118
 119         ## start RIS dataset
 120         &print_typetag($leader);
 121
 122         ## retrieve all author fields and collect them in a list
 123         my @author_fields;
 124
 125         if ($intype eq "unimarc") {
 126             ## Fields 700, 701, and 702 can contain author names
 127             @author_fields = ($record->field('700'), $record->field('701'), $record->field('702'));
 128         }
 129         else {  ## marc21, ukmarc
 130             ## Field 100 sometimes carries main author
 131             ## Field(s) 700 carry added entries - personal names
 132             @author_fields = ($record->field('100'), $record->field('700'));
 133         }
 134
 135         ## loop over all author fields
 136         foreach my $field (@author_fields) {
 137             if (length($field)) {
 138                 my $author = &get_author($field);
 139                 print "AU  - ",&charconv($author),"\r\n";
 140             }
 141         }
 142
 143         # ToDo: should we specify anonymous as author if we didn't find
 144         # one? or use one of the corporate/meeting names below?
 145
 146         ## add corporate names or meeting names as editors ??
 147         my @editor_fields;
 148
 149         if ($intype eq "unimarc") {
 150             ## Fields 710, 711, and 712 can carry corporate names
 151             ## Field(s) 720, 721, 722, 730 have additional candidates
 152             @editor_fields = ($record->field('710'), $record->field('711'), $record->field('712'), $record->field('720'), $record->field('721'), $record->field('722'), $record->field('730'));
 153         }
 154         else { ## marc21, ukmarc
 155             ## Fields 110 and 111 carry the main entries - corporate name and
 156             ## meeting name, respectively
 157             ## Field(s) 710, 711 carry added entries - personal names
 158             @editor_fields = ($record->field('110'), $record->field('111'), $record->field('710'), $record->field('711'));
 159         }
 160
 161         ## loop over all editor fields
 162         foreach my $field (@editor_fields) {
 163             if (length($field)) {
 164                 my $editor = &get_editor($field);
 165                 print "ED  - ",&charconv($editor),"\r\n";
 166             }
 167         }
 168
 169         ## get info from the title field
 170         if ($intype eq "unimarc") {
 171             &print_title($record->field('200'));
 172         }
 173         else { ## marc21, ukmarc
 174             &print_title($record->field('245'));
 175         }
 176
 177         ## series title
 178         if ($intype eq "unimarc") {
 179             &print_stitle($record->field('225'));
 180         }
 181         else { ## marc21, ukmarc
 182             &print_stitle($record->field('490'));
 183         }
 184
 185         ## ISBN/ISSN
 186         if ($intype eq "unimarc") {
 187             &print_isbn($record->field('010'));
 188             &print_issn($record->field('011'));
 189         }
 190         elsif ($intype eq "ukmarc") {
 191             &print_isbn($record->field('021'));
 192             ## this is just an assumption
 193             &print_issn($record->field('022'));
 194         }
 195         else { ## assume marc21
 196             &print_isbn($record->field('020'));
 197             &print_issn($record->field('022'));
 198         }
 199
 200         if ($intype eq "marc21") {
 201             &print_loc_callno($record->field('050'));
 202             &print_dewey($record->field('082'));
 203         }
 204         ## else: unimarc, ukmarc do not seem to store call numbers?
 205
 206         ## publication info
 207         if ($intype eq "unimarc") {
 208             &print_pubinfo($record->field('210'));
 209         }
 210         else { ## marc21, ukmarc
 211             &print_pubinfo($record->field('260'));
 212         }
 213
 214         ## 6XX fields contain KW candidates. We add all of them to a
 215         ## hash to eliminate duplicates
 216         my %kwpool;
 217
 218         if ($intype eq "unimarc") {
 219             foreach ('600', '601', '602', '604', '605', '606','607', '608', '610', '615', '620', '660'. '661', '670', '675', '676', '680', '686') {
 220                 &get_keywords(\%kwpool, "$_",$record->field($_));
 221             }
 222         }
 223         elsif ($intype eq "ukmarc") {
 224             foreach ('600', '610', '611', '630', '650', '651','653', '655', '660', '661', '668', '690', '691', '692', '695') {
 225                 &get_keywords(\%kwpool, "$_",$record->field($_));
 226             }
 227         }
 228         else { ## assume marc21
 229             foreach ('600', '610', '611', '630', '650', '651','653', '654', '655', '656', '657', '658') {
 230                 &get_keywords(\%kwpool, "$_",$record->field($_));
 231             }
 232         }
 233
 234         ## print all keywords found in the hash. The value of each hash
 235         ## entry is the number of occurrences, but we're not really interested
 236         ## in that and rather print the key
 237         while (my ($key, $value) = each %kwpool) {
 238             print "KW  - ", &charconv($key), "\r\n";
 239         }
 240
 241         ## 5XX have various candidates for notes and abstracts. We pool
 242         ## all notes-like stuff in one list.
 243         my @notepool;
 244
 245         ## these fields have notes candidates
 246         if ($intype eq "unimarc") {
 247             foreach ('300', '301', '302', '303', '304', '305', '306', '307', '308', '310', '311', '312', '313', '314', '315', '316', '317', '318', '320', '321', '322', '323', '324', '325', '326', '327', '328', '332', '333', '336', '337', '345') {
 248                 &pool_subx(\@notepool, $_, $record->field($_));
 249             }
 250         }
 251         elsif ($intype eq "ukmarc") {
 252             foreach ('500', '501', '502', '503', '504', '505', '506', '508', '514', '515', '516', '521', '524', '525', '528', '530', '531', '532', '533', '534', '535', '537', '538', '540', '541', '542', '544', '554', '555', '556', '557', '561', '563', '580', '583', '584', '586') {
 253                 &pool_subx(\@notepool, $_, $record->field($_));
 254             }
 255         }
 256         else { ## assume marc21
 257             foreach ('500', '501', '502', '504', '505', '506', '507', '508', '510', '511', '513', '514', '515', '516', '518', '521', '522', '524', '525', '526', '530', '533', '534', '535') {
 258                 &pool_subx(\@notepool, $_, $record->field($_));
 259             }
 260         }
 261
 262         my $allnotes = join "; ", @notepool;
 263
 264         if (length($allnotes) > 0) {
 265             print "N1  - ", &charconv($allnotes), "\r\n";
 266         }
 267
 268         ## 320/520 have the abstract
 269         if ($intype eq "unimarc") {
 270             &print_abstract($record->field('320'));
 271         }
 272         elsif ($intype eq "ukmarc") {
 273             &print_abstract($record->field('512'), $record->field('513'));
 274         }
 275         else { ## assume marc21
 276             &print_abstract($record->field('520'));
 277         }
 278
 279         # 856u has the URI
 280         if ($record->field('856')){
 281             print_uri($record->field('856'));
 282         }
 283
 284         ## end RIS dataset
 285         print "ER  - \r\n";
 286
 287     # Let's re-redirect stdout
 288     close STDOUT;
 289     open STDOUT, ">&", $oldout;
 290
 291     return $outvar;
 292
 293 }
 294
 295
 296 ##********************************************************************
 297 ## print_typetag(): prints the first line of a RIS dataset including
 298 ## the preceeding newline
 299 ## Argument: the leader of a MARC dataset
 300 ## Returns: the value at leader position 06
 301 ##********************************************************************
 302 sub print_typetag {
 303   my ($leader)= @_;
 304     ## the keys of typehash are the allowed values at position 06
 305     ## of the leader of a MARC record, the values are the RIS types
 306     ## that might appropriately represent these types.
 307     my %ustypehash = (
 308                     "a" => "BOOK",
 309                     "c" => "MUSIC",
 310                     "d" => "MUSIC",
 311                     "e" => "MAP",
 312                     "f" => "MAP",
 313                     "g" => "ADVS",
 314                     "i" => "SOUND",
 315                     "j" => "SOUND",
 316                     "k" => "ART",
 317                     "m" => "DATA",
 318                     "o" => "GEN",
 319                     "p" => "GEN",
 320                     "r" => "ART",
 321                     "t" => "GEN",
 322                 );
 323
 324     my %unitypehash = (
 325                     "a" => "BOOK",
 326                     "b" => "BOOK",
 327                     "c" => "MUSIC",
 328                     "d" => "MUSIC",
 329                     "e" => "MAP",
 330                     "f" => "MAP",
 331                     "g" => "ADVS",
 332                     "i" => "SOUND",
 333                     "j" => "SOUND",
 334                     "k" => "ART",
 335                     "l" => "ELEC",
 336                     "m" => "ADVS",
 337                     "r" => "ART",
 338                 );
 339
 340     ## The type of a MARC record is found at position 06 of the leader
 341     my $typeofrecord = substr($leader, 6, 1);
 342
 343     ## ToDo: for books, field 008 positions 24-27 might have a few more
 344     ## hints
 345
 346     my %typehash;
 347
 348     ## the ukmarc here is just a guess
 349     if ($intype eq "marc21" || $intype eq "ukmarc") {
 350         %typehash = %ustypehash;
 351     }
 352     elsif ($intype eq "unimarc") {
 353         %typehash = %unitypehash;
 354     }
 355     else {
 356         ## assume MARC21 as default
 357         %typehash = %ustypehash;
 358     }
 359
 360     if (!exists $typehash{$typeofrecord}) {
 361         print "TY  - BOOK\r\n"; ## most reasonable default
 362         warn ("no type found - assume BOOK") if $marcprint;
 363     }
 364     else {
 365         print "TY  - $typehash{$typeofrecord}\r\n";
 366     }
 367
 368     ## use $typeofrecord as the return value, just in case
 369     $typeofrecord;
 370 }
 371
 372 ##********************************************************************
 373 ## normalize_author(): normalizes an authorname
 374 ## Arguments: authorname subfield a
 375 ##            authorname subfield b
 376 ##            authorname subfield c
 377 ##            name type if known: 0=direct order
 378 ##                               1=only surname or full name in
 379 ##                                 inverted order
 380 ##                               3=family, clan, dynasty name
 381 ## Returns: the normalized authorname
 382 ##********************************************************************
 383 sub normalize_author {
 384     my($rawauthora, $rawauthorb, $rawauthorc, $nametype) = @_;
 385
 386     if ($nametype == 0) {
 387         # ToDo: convert every input to Last[,(F.|First)[ (M.|Middle)[,Suffix]]]
 388         warn("name >>$rawauthora<< in direct order - leave as is") if $marcprint;
 389         return $rawauthora;
 390     }
 391     elsif ($nametype == 1) {
 392         ## start munging subfield a (the real name part)
 393         ## remove spaces after separators
 394         $rawauthora =~ s%([,.]+) *%$1%g;
 395
 396         ## remove trailing separators after spaces
 397         $rawauthora =~ s% *[,;:/]*$%%;
 398
 399         ## remove periods after a non-abbreviated name
 400         $rawauthora =~ s%(\w{2,})\.%$1%g;
 401
 402         ## start munging subfield b (something like the suffix)
 403         ## remove trailing separators after spaces
 404         $rawauthorb =~ s% *[,;:/]*$%%;
 405
 406         ## we currently ignore subfield c until someone complains
 407         if (length($rawauthorb) > 0) {
 408             return join ",", ($rawauthora, $rawauthorb);
 409         }
 410         else {
 411             return $rawauthora;
 412         }
 413     }
 414     elsif ($nametype == 3) {
 415         return $rawauthora;
 416     }
 417 }
 418
 419 ##********************************************************************
 420 ## get_author(): gets authorname info from MARC fields 100, 700
 421 ## Argument: field (100 or 700)
 422 ## Returns: an author string in the format found in the record
 423 ##********************************************************************
 424 sub get_author {
 425     my ($authorfield) = @_;
 426     my ($indicator);
 427
 428     ## the sequence of the name parts is encoded either in indicator
 429     ## 1 (marc21) or 2 (unimarc)
 430     if ($intype eq "unimarc") {
 431         $indicator = 2;
 432     }
 433     else { ## assume marc21
 434         $indicator = 1;
 435     }
 436
 437     print "<marc>:Author(Ind$indicator): ", $authorfield->indicator("$indicator"),"\r\n" if $marcprint;
 438     print "<marc>:Author(\$a): ", $authorfield->subfield('a'),"\r\n" if $marcprint;
 439     print "<marc>:Author(\$b): ", $authorfield->subfield('b'),"\r\n" if $marcprint;
 440     print "<marc>:Author(\$c): ", $authorfield->subfield('c'),"\r\n" if $marcprint;
 441     print "<marc>:Author(\$h): ", $authorfield->subfield('h'),"\r\n" if $marcprint;
 442     if ($intype eq "ukmarc") {
 443         my $authorname = $authorfield->subfield('a') . "," . $authorfield->subfield('h');
 444         normalize_author($authorname, $authorfield->subfield('b'), $authorfield->subfield('c'), $authorfield->indicator("$indicator"));
 445     }
 446     else {
 447         normalize_author($authorfield->subfield('a'), $authorfield->subfield('b'), $authorfield->subfield('c'), $authorfield->indicator("$indicator"));
 448     }
 449 }
 450
 451 ##********************************************************************
 452 ## get_editor(): gets editor info from MARC fields 110, 111, 710, 711
 453 ## Argument: field (110, 111, 710, or 711)
 454 ## Returns: an author string in the format found in the record
 455 ##********************************************************************
 456 sub get_editor {
 457     my ($editorfield) = @_;
 458
 459     if (!$editorfield) {
 460         return;
 461     }
 462     else {
 463         print "<marc>Editor(\$a): ", $editorfield->subfield('a'),"\r\n" if $marcprint;
 464         print "<marc>Editor(\$b): ", $editorfield->subfield('b'),"\r\n" if $marcprint;
 465         print "<marc>editor(\$c): ", $editorfield->subfield('c'),"\r\n" if $marcprint;
 466         return $editorfield->subfield('a');
 467     }
 468 }
 469
 470 ##********************************************************************
 471 ## print_title(): gets info from MARC field 245
 472 ## Arguments: field (245)
 473 ## Returns:
 474 ##********************************************************************
 475 sub print_title {
 476     my ($titlefield) = @_;
 477     if (!$titlefield) {
 478         print "<marc>empty title field (245)\r\n" if $marcprint;
 479         warn("empty title field (245)") if $marcprint;
 480     }
 481     else {
 482         print "<marc>Title(\$a): ",$titlefield->subfield('a'),"\r\n" if $marcprint;
 483         print "<marc>Title(\$b): ",$titlefield->subfield('b'),"\r\n" if $marcprint;
 484         print "<marc>Title(\$c): ",$titlefield->subfield('c'),"\r\n" if $marcprint;
 485
 486         ## The title is usually written in a very odd notation. The title
 487         ## proper ($a) often ends with a space followed by a separator like
 488         ## a slash or a colon. The subtitle ($b) doesn't start with a space
 489         ## so simple concatenation looks odd. We have to conditionally remove
 490         ## the separator and make sure there's a space between title and
 491         ## subtitle
 492
 493         my $clean_title = $titlefield->subfield('a');
 494
 495         my $clean_subtitle = $titlefield->subfield('b');
 496         $clean_title =~ s% *[/:;.]$%%;
 497         $clean_subtitle =~ s%^ *(.*) *[/:;.]$%$1%;
 498
 499         if (length($clean_title) > 0
 500             || (length($clean_subtitle) > 0 && $intype ne "unimarc")) {
 501             print "TI  - ", &charconv($clean_title);
 502
 503             ## subfield $b is relevant only for marc21/ukmarc
 504             if (length($clean_subtitle) > 0 && $intype ne "unimarc") {
 505                 print ": ",&charconv($clean_subtitle);
 506             }
 507             print "\r\n";
 508         }
 509
 510         ## The statement of responsibility is just this: horrors. There is
 511         ## no formal definition how authors, editors and the like should
 512         ## be written and designated. The field is free-form and resistant
 513         ## to all parsing efforts, so this information is lost on me
 514     }
 515 }
 516
 517 ##********************************************************************
 518 ## print_stitle(): prints info from series title field
 519 ## Arguments: field
 520 ## Returns:
 521 ##********************************************************************
 522 sub print_stitle {
 523     my ($titlefield) = @_;
 524
 525     if (!$titlefield) {
 526         print "<marc>empty series title field\r\n" if $marcprint;
 527     }
 528     else {
 529         print "<marc>Series title(\$a): ",$titlefield->subfield('a'),"\r\n" if $marcprint;
 530         my $clean_title = $titlefield->subfield('a');
 531
 532         $clean_title =~ s% *[/:;.]$%%;
 533
 534         if (length($clean_title) > 0) {
 535             print "T2  - ", &charconv($clean_title),"\r\n";
 536         }
 537
 538         if ($intype eq "unimarc") {
 539             print "<marc>Series vol(\$v): ",$titlefield->subfield('v'),"\r\n" if $marcprint;
 540             if (length($titlefield->subfield('v')) > 0) {
 541                 print "VL  - ", &charconv($titlefield->subfield('v')),"\r\n";
 542             }
 543         }
 544     }
 545 }
 546
 547 ##********************************************************************
 548 ## print_isbn(): gets info from MARC field 020
 549 ## Arguments: field (020)
 550 ##********************************************************************
 551 sub print_isbn {
 552     my($isbnfield) = @_;
 553
 554     if (!$isbnfield || length ($isbnfield->subfield('a')) == 0) {
 555         print "<marc>no isbn found (020\$a)\r\n" if $marcprint;
 556         warn("no isbn found") if $marcprint;
 557     }
 558     else {
 559         if (length ($isbnfield->subfield('a')) < 10) {
 560             print "<marc>truncated isbn (020\$a)\r\n" if $marcprint;
 561             warn("truncated isbn") if $marcprint;
 562         }
 563
 564         my $isbn = substr($isbnfield->subfield('a'), 0, 10);
 565         print "SN  - ", &charconv($isbn), "\r\n";
 566     }
 567 }
 568
 569 ##********************************************************************
 570 ## print_issn(): gets info from MARC field 022
 571 ## Arguments: field (022)
 572 ##********************************************************************
 573 sub print_issn {
 574     my($issnfield) = @_;
 575
 576     if (!$issnfield || length ($issnfield->subfield('a')) == 0) {
 577         print "<marc>no issn found (022\$a)\r\n" if $marcprint;
 578         warn("no issn found") if $marcprint;
 579     }
 580     else {
 581         if (length ($issnfield->subfield('a')) < 9) {
 582             print "<marc>truncated issn (022\$a)\r\n" if $marcprint;
 583             warn("truncated issn") if $marcprint;
 584         }
 585
 586         my $issn = substr($issnfield->subfield('a'), 0, 9);
 587         print "SN  - ", &charconv($issn), "\r\n";
 588     }
 589 }
 590
 591 ###
 592 # print_uri() prints info from 856 u
 593 ###
 594 sub print_uri {
 595     my $f856 = shift;
 596
 597     if (my $uri = $f856->subfield('u')){
 598         print "UR  - ", charconv($uri), "\n";
 599     }
 600 }
 601
 602 ##********************************************************************
 603 ## print_loc_callno(): gets info from MARC field 050
 604 ## Arguments: field (050)
 605 ##********************************************************************
 606 sub print_loc_callno {
 607     my($callnofield) = @_;
 608
 609     if (!$callnofield || length ($callnofield->subfield('a')) == 0) {
 610         print "<marc>no LOC call number found (050\$a)\r\n" if $marcprint;
 611         warn("no LOC call number found") if $marcprint;
 612     }
 613     else {
 614         print "AV  - ", &charconv($callnofield->subfield('a')), " ", &charconv($callnofield->subfield('b')), "\r\n";
 615     }
 616 }
 617
 618 ##********************************************************************
 619 ## print_dewey(): gets info from MARC field 082
 620 ## Arguments: field (082)
 621 ##********************************************************************
 622 sub print_dewey {
 623     my($deweyfield) = @_;
 624
 625     if (!$deweyfield || length ($deweyfield->subfield('a')) == 0) {
 626         print "<marc>no Dewey number found (082\$a)\r\n" if $marcprint;
 627         warn("no Dewey number found") if $marcprint;
 628     }
 629     else {
 630         print "U1  - ", &charconv($deweyfield->subfield('a')), " ", &charconv($deweyfield->subfield('2')), "\r\n";
 631     }
 632 }
 633
 634 ##********************************************************************
 635 ## print_pubinfo(): gets info from MARC field 260
 636 ## Arguments: field (260)
 637 ##********************************************************************
 638 sub print_pubinfo {
 639     my($pubinfofield) = @_;
 640
 641     if (!$pubinfofield) {
 642         print "<marc>no publication information found (260)\r\n" if $marcprint;
 643         warn("no publication information found") if $marcprint;
 644     }
 645     else {
 646         ## the following information is available in MARC21:
 647         ## $a place -> CY
 648         ## $b publisher -> PB
 649         ## $c date -> PY
 650         ## the corresponding subfields for UNIMARC:
 651         ## $a place -> CY
 652         ## $c publisher -> PB
 653         ## $d date -> PY
 654
 655         ## all of them are repeatable. We pool all places into a
 656         ## comma-separated list in CY. We also pool all publishers
 657         ## into a comma-separated list in PB.  We break the rule with
 658         ## the date field because this wouldn't make much sense. In
 659         ## this case, we use the first occurrence for PY, the second
 660         ## for Y2, and ignore the rest
 661
 662         my @pubsubfields = $pubinfofield->subfields();
 663         my @cities;
 664         my @publishers;
 665         my $pycounter = 0;
 666
 667         my $pubsub_place;
 668         my $pubsub_publisher;
 669         my $pubsub_date;
 670
 671         if ($intype eq "unimarc") {
 672             $pubsub_place = "a";
 673             $pubsub_publisher = "c";
 674             $pubsub_date = "d";
 675         }
 676         else { ## assume marc21
 677             $pubsub_place = "a";
 678             $pubsub_publisher = "b";
 679             $pubsub_date = "c";
 680         }
 681
 682         ## loop over all subfield list entries
 683         for my $tuple (@pubsubfields) {
 684             ## each tuple consists of the subfield code and the value
 685             if (@$tuple[0] eq $pubsub_place) {
 686                 ## strip any trailing crap
 687                 $_ = @$tuple[1];
 688                 s% *[,;:/]$%%;
 689                 ## pool all occurrences in a list
 690                 push (@cities, $_);
 691             }
 692             elsif (@$tuple[0] eq $pubsub_publisher) {
 693                 ## strip any trailing crap
 694                 $_ = @$tuple[1];
 695                 s% *[,;:/]$%%;
 696                 ## pool all occurrences in a list
 697                 push (@publishers, $_);
 698             }
 699             elsif (@$tuple[0] eq $pubsub_date) {
 700                 ## the dates are free-form, so we want to extract
 701                 ## a four-digit year and leave the rest as
 702                 ## "other info"
 703                 $protoyear = @$tuple[1];
 704                 print "<marc>Year (260\$c): $protoyear\r\n" if $marcprint;
 705
 706                 ## strip any separator chars at the end
 707                 $protoyear =~ s% *[\.;:/]*$%%;
 708
 709                 ## isolate a four-digit year. We discard anything
 710                 ## preceeding the year, but keep everything after
 711                 ## the year as other info.
 712                 $protoyear =~ s%\D*([0-9\-]{4})(.*)%$1///$2%;
 713
 714                 ## check what we've got. If there is no four-digit
 715                 ## year, make it up. If digits are replaced by '-',
 716                 ## replace those with 0s
 717
 718                 if (index($protoyear, "/") == 4) {
 719                     ## have year info
 720                     ## replace all '-' in the four-digit year
 721                     ## by '0'
 722                     substr($protoyear,0,4) =~ s!-!0!g;
 723                 }
 724                 else {
 725                     ## have no year info
 726                     print "<marc>no four-digit year found, use 0000\r\n" if $marcprint;
 727                     $protoyear = "0000///$protoyear";
 728                     warn("no four-digit year found, use 0000") if $marcprint;
 729                 }
 730
 731                 if ($pycounter == 0 && length($protoyear)) {
 732                     print "PY  - $protoyear\r\n";
 733                 }
 734                 elsif ($pycounter == 1 && length($_)) {
 735                     print "Y2  - $protoyear\r\n";
 736                 }
 737                 ## else: discard
 738             }
 739             ## else: discard
 740         }
 741
 742         ## now dump the collected CY and PB lists
 743         if (@cities > 0) {
 744             print "CY  - ", &charconv(join(", ", @cities)), "\r\n";
 745         }
 746         if (@publishers > 0) {
 747             print "PB  - ", &charconv(join(", ", @publishers)), "\r\n";
 748         }
 749     }
 750 }
 751
 752 ##********************************************************************
 753 ## get_keywords(): prints info from MARC fields 6XX
 754 ## Arguments: list of fields (6XX)
 755 ##********************************************************************
 756 sub get_keywords {
 757     my($href, $fieldname, @keywords) = @_;
 758
 759     ## a list of all possible subfields
 760     my @subfields = ('a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'x', 'y', 'z', '2', '3', '4');
 761
 762     ## loop over all 6XX fields
 763     foreach $kwfield (@keywords) {
 764         if ($kwfield != undef) {
 765             ## authornames get special treatment
 766             if ($fieldname eq "600") {
 767                 my $val = normalize_author($kwfield->subfield('a'), $kwfield->subfield('b'), $kwfield->subfield('c'), $kwfield->indicator('1'));
 768                 ${$href}{$val} += 1;
 769                 print "<marc>Field $kwfield subfield a:", $kwfield->subfield('a'), "\r\n<marc>Field $kwfield subfield b:", $kwfield->subfield('b'), "\r\n<marc>Field $kwfield subfield c:", $kwfield->subfield('c'), "\r\n" if $marcprint;
 770             }
 771             else {
 772                 ## retrieve all available subfields
 773                 @kwsubfields = $kwfield->subfields();
 774
 775                 ## loop over all available subfield tuples
 776                 foreach $kwtuple (@kwsubfields) {
 777                     ## loop over all subfields to check
 778                     foreach $subfield (@subfields) {
 779                         ## [0] contains subfield code
 780                         if (@$kwtuple[0] eq $subfield) {
 781                             ## [1] contains value, remove trailing separators
 782                             @$kwtuple[1] =~ s% *[,;.:/]*$%%;
 783                             if (length(@$kwtuple[1]) > 0) {
 784                                 ## add to hash
 785                                 ${$href}{@$kwtuple[1]} += 1;
 786                                 print "<marc>Field $fieldname subfield $subfield:", @$kwtuple[1], "\r\n" if $marcprint;
 787                             }
 788                             ## we can leave the subfields loop here
 789                             last;
 790                         }
 791                     }
 792                 }
 793             }
 794         }
 795     }
 796 }
 797
 798 ##********************************************************************
 799 ## pool_subx(): adds contents of several subfields to a list
 800 ## Arguments: reference to a list
 801 ##            field name
 802 ##            list of fields (5XX)
 803 ##********************************************************************
 804 sub pool_subx {
 805     my($aref, $fieldname, @notefields) = @_;
 806
 807     ## we use a list that contains the interesting subfields
 808     ## for each field
 809     # ToDo: this is apparently correct only for marc21
 810     my @subfields;
 811
 812     if ($fieldname eq "500") {
 813         @subfields = ('a');
 814     }
 815     elsif ($fieldname eq "501") {
 816         @subfields = ('a');
 817     }
 818     elsif ($fieldname eq "502") {
 819         @subfields = ('a');
 820             }
 821     elsif ($fieldname eq "504") {
 822         @subfields = ('a', 'b');
 823     }
 824     elsif ($fieldname eq "505") {
 825         @subfields = ('a', 'g', 'r', 't', 'u');
 826     }
 827     elsif ($fieldname eq "506") {
 828         @subfields = ('a', 'b', 'c', 'd', 'e');
 829     }
 830     elsif ($fieldname eq "507") {
 831         @subfields = ('a', 'b');
 832     }
 833     elsif ($fieldname eq "508") {
 834         @subfields = ('a');
 835     }
 836     elsif ($fieldname eq "510") {
 837         @subfields = ('a', 'b', 'c', 'x', '3');
 838     }
 839     elsif ($fieldname eq "511") {
 840         @subfields = ('a');
 841     }
 842     elsif ($fieldname eq "513") {
 843         @subfields = ('a', 'b');
 844     }
 845     elsif ($fieldname eq "514") {
 846         @subfields = ('a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'm', 'u', 'z');
 847     }
 848     elsif ($fieldname eq "515") {
 849         @subfields = ('a');
 850     }
 851     elsif ($fieldname eq "516") {
 852         @subfields = ('a');
 853     }
 854     elsif ($fieldname eq "518") {
 855         @subfields = ('a', '3');
 856     }
 857     elsif ($fieldname eq "521") {
 858         @subfields = ('a', 'b', '3');
 859     }
 860     elsif ($fieldname eq "522") {
 861         @subfields = ('a');
 862     }
 863     elsif ($fieldname eq "524") {
 864         @subfields = ('a', '2', '3');
 865     }
 866     elsif ($fieldname eq "525") {
 867         @subfields = ('a');
 868     }
 869     elsif ($fieldname eq "526") {
 870         @subfields = ('a', 'b', 'c', 'd', 'i', 'x', 'z', '5');
 871     }
 872     elsif ($fieldname eq "530") {
 873         @subfields = ('a', 'b', 'c', 'd', 'u', '3');
 874     }
 875     elsif ($fieldname eq "533") {
 876         @subfields = ('a', 'b', 'c', 'd', 'e', 'f', 'm', 'n', '3');
 877     }
 878     elsif ($fieldname eq "534") {
 879         @subfields = ('a', 'b', 'c', 'e', 'f', 'k', 'l', 'm', 'n', 'p', 't', 'x', 'z');
 880     }
 881     elsif ($fieldname eq "535") {
 882         @subfields = ('a', 'b', 'c', 'd', 'g', '3');
 883     }
 884
 885     ## loop over all notefields
 886     foreach $notefield (@notefields) {
 887         if ($notefield != undef) {
 888             ## retrieve all available subfield tuples
 889             @notesubfields = $notefield->subfields();
 890
 891             ## loop over all subfield tuples
 892             foreach $notetuple (@notesubfields) {
 893                 ## loop over all subfields to check
 894                 foreach $subfield (@subfields) {
 895                     ## [0] contains subfield code
 896                     if (@$notetuple[0] eq $subfield) {
 897                         ## [1] contains value, remove trailing separators
 898                         print "<marc>field $fieldname subfield $subfield: ", @$notetuple[1], "\r\n" if $marcprint;
 899                         @$notetuple[1] =~ s% *[,;.:/]*$%%;
 900                         if (length(@$notetuple[1]) > 0) {
 901                             ## add to list
 902                             push @{$aref}, @$notetuple[1];
 903                         }
 904                         last;
 905                     }
 906                 }
 907             }
 908         }
 909     }
 910 }
 911
 912 ##********************************************************************
 913 ## print_abstract(): prints abstract fields
 914 ## Arguments: list of fields (520)
 915 ##********************************************************************
 916 sub print_abstract {
 917     # ToDo: take care of repeatable subfields
 918     my(@abfields) = @_;
 919
 920     ## we check the following subfields
 921     my @subfields = ('a', 'b');
 922
 923     ## we generate a list for all useful strings
 924     my @abstrings;
 925
 926     ## loop over all abfields
 927     foreach $abfield (@abfields) {
 928         foreach $field (@subfields) {
 929             if (length ($abfield->subfield($field)) > 0) {
 930                 my $ab = $abfield->subfield($field);
 931
 932                 print "<marc>field 520 subfield $field: $ab\r\n" if $marcprint;
 933
 934                 ## strip trailing separators
 935                 $ab =~ s% *[;,:./]*$%%;
 936
 937                 ## add string to the list
 938                 push (@abstrings, $ab);
 939             }
 940         }
 941     }
 942
 943     my $allabs = join "; ", @abstrings;
 944
 945     if (length($allabs) > 0) {
 946         print "N2  - ", &charconv($allabs), "\r\n";
 947     }
 948
 949 }
 950
 951
 952
 953 ##********************************************************************
 954 ## charconv(): converts to a different charset based on a global var
 955 ## Arguments: string
 956 ## Returns: string
 957 ##********************************************************************
 958 sub charconv {
 959     if ($utf) {
 960         ## return unaltered if already utf-8
 961         return @_;
 962     }
 963     elsif ($uniout eq "t") {
 964         ## convert to utf-8
 965         return marc8_to_utf8("@_");
 966     }
 967     else {
 968         ## return unaltered if no utf-8 requested
 969         return @_;
 970     }
 971 }
 972 1;