From 0bd7d912b894c8d469454fab3184d81c8a8d308b Mon Sep 17 00:00:00 2001 From: joshferraro Date: Wed, 4 May 2005 13:41:36 +0000 Subject: [PATCH] Files relevant to LibLime's spellcheck and suggest features are placed in this directory. --- .../make_spellcheck_suggest.pl | 237 ++++++++++++++++++ 1 file changed, 237 insertions(+) create mode 100755 misc/spellcheck_suggest/make_spellcheck_suggest.pl diff --git a/misc/spellcheck_suggest/make_spellcheck_suggest.pl b/misc/spellcheck_suggest/make_spellcheck_suggest.pl new file mode 100755 index 0000000000..0622a967aa --- /dev/null +++ b/misc/spellcheck_suggest/make_spellcheck_suggest.pl @@ -0,0 +1,237 @@ +#!/usr/bin/perl -w +## This Script creates a Koha suggest and spellcheck database +## for those features as visible on LibLime's opac: opac.liblime.com +## It also contains the needed specs for creating a table of +## queries for statistical purposes as well as a method of +## returning popular searches via the suggest and spellcheck. +## The code for running suggest and spellcheck can be found +## either in Koha 2.4 CVS (HEAD as of this writing) or at +## LibLime's website in the downlaods +## section: http://liblime.com/c/downloads.html +## +##Author: Joshua Ferraro jmf at liblime dot com +## +## TODO: add suggest features, merge the two of them? +## There are a few configurable variables. + +## CONFIGURABLE VARIABLES #################### +## + # Change this to where your Koha modules are (C4 directory) +use lib '/var/www/labs.liblime.com/cvsroots/koha/'; + # These are the tags that have meaningful data + # for the databases I've worked with (MARC21 only) + # you may need to change them depending on your data +my @tags=( +#Tag documentation from http://lcweb.loc.gov/marc/bibliographic/ecbdhome.html +"020a", # INTERNATIONAL STANDARD BOOK NUMBER +#"022a", # INTERNATIONAL STANDARD SERIAL NUMBER +"100a", # MAIN ENTRY--PERSONAL NAME +"110a", # MAIN ENTRY--CORPORATE NAME +#"110b", # Subordinate unit +#"110c", # Location of meeting +#"111a", # MAIN ENTRY--MEETING NAME +#"111c", # Location of meeting +"130a", # MAIN ENTRY--UNIFORM TITLE +"240a", # UNIFORM TITLE +"245a", # TITLE STATEMENT +"245b", # Remainder of title +"245c", # Statement of responsibility, etc. +"245p", # Name of part/section of a work +"246a", # VARYING FORM OF TITLE +"246b", # Remainder of title +#"260b", # PUBLICATION, DISTRIBUTION, ETC. (IMPRINT) +"440a", # SERIES STATEMENT/ADDED ENTRY--TITLE +"440p", # Name of part/section of a work +#"500a", # GENERAL NOTE +"505t", # FORMATTED CONTENTS NOTE (t is Title) +"511a", # PARTICIPANT OR PERFORMER NOTE +#"520a", # SUMMARY, ETC. +"534a", # ORIGINAL VERSION NOTE +#"534k", # Key title of original +#"534t", # Title statement of original +#"586a", # AWARDS NOTE +"600a", # SUBJECT ADDED ENTRY--PERSONAL NAME +"610a", # SUBJECT ADDED ENTRY--CORPORATE NAME +"611a", # SUBJECT ADDED ENTRY--MEETING NAME +"630a", # SUBJECT ADDED ENTRY--UNIFORM TITLE +"650a", # SUBJECT ADDED ENTRY--TOPICAL TERM +"651a", # SUBJECT ADDED ENTRY--GEOGRAPHIC NAME +"700a", # ADDED ENTRY--PERSONAL NAME +"710a", # ADDED ENTRY--CORPORATE NAME +#"711a", # ADDED ENTRY--MEETING NAME +#"720a", # ADDED ENTRY--UNCONTROLLED NAME +"730a", # ADDED ENTRY--UNIFORM TITLE +"740a", # ADDED ENTRY--UNCONTROLLED RELATED/ANALYTICAL TITLE +#"752a", # ADDED ENTRY--HIERARCHICAL PLACE NAME +"800a", # SERIES ADDED ENTRY--PERSONAL NAME +#"810a", # SERIES ADDED ENTRY--CORPORATE NAME +#"811a", # SERIES ADDED ENTRY--MEETING NAME +"830a", # SERIES ADDED ENTRY--UNIFORM TITLE +#"942k" # Holdings Branch ?? Unique to NPL?? +); +## Leave this next bit alone +use strict; +use C4::Context; +## + # SUGGEST DATABASE INFO + # You'll need to change this if you want to keep your 'suggest' database + # separate from your Koha database -- simply comment out the next line + # and uncomment the one after it, adding your site info (check out GRANT + # syntax in the mysql manual if you're unsure how enable authentication) +# +#my dbh2 = C4::Context->dbh; +# +#my $dbh2=DBI->connect("DBI:mysql::localhost","",""); +my $dbh2=DBI->connect("DBI:mysql:demosuggest:localhost","sugg","Free2cirC"); +######################################################################## +## End of most common configurable variables: in most cases you won't need +## edit any further ... of course feel free to indulge yourself ;-) +######################################################################## +my $dbh=C4::Context->dbh; +my $counter = 0; + +# Check for existance of suggest database and add if it doesn't. +print "Step 1 of 5: Checking to make sure suggest tables exist\n"; +my $check_tables_query = "select distinct resultcount from ?"; +my @tables = ("notdistinctspchk", "notdistinctsugg", "spellcheck", "suggestions"); +my %tables = ( notdistinctspchk => "( display varchar(40) not null default, + suggestion varchar(40) not null default, +foreach my $table (@tables) { + my $sth_check=$dbh2->prepare($check_tables_query) || die "cant prepare query: $DBI::errstr"; + my $rv = $sth_check->execute($table); + if($rv eq undef) { + print "$table missing ... creating it now\n"; + my $create_this = "CREATE TABLE \'$table\' \( + display varchar\(40\) NOT NULL default \'\', + suggestion varchar\(40\) NOT NULL default \'\', + resultcount varchar\(40\) NOT NULL default \'0\' + \) TYPE=MyISAM"; + my $sth_create = $dbh->prepare($create_this) || die "can't prepare query: $DBI::errstr"; + $sth_create->execute() || die "can't execute: $DBI::errstr"; + print "$table created ...\n"; + }else { + print "$table exists ... moving along\n"; + } +} +print "All tables present ... moving along\n"; + +print "Step 2 of 5: Deleting old data\n"; +my $clear_out = "DELETE FROM notdistinctspchk"; +# Clear out old data +my $sth_clear_out=$dbh2->prepare($clear_out) || die "cant prepare query"; +$sth_clear_out->execute(); +print "Step 3 of 5: Creating non-distinct table from various Koha tables\n"; +my $query_words = "SELECT DISTINCT word, COUNT(word) FROM marc_word"; +my $query_marc_subfields = "SELECT DISTINCT subfieldvalue, COUNT(subfieldvalue) FROM marc_subfield_table"; +my $query_titles = "SELECT DISTINCT title, COUNT(title) FROM biblio GROUP BY title"; +my $query_authors = "SELECT DISTINCT author, COUNT(author) FROM biblio GROUP BY author"; + +my @queries = ("$query_words", "$query_marc_subfields", "$query_titles", "$query_authors"); + +foreach my $query (@queries) { + + #we need to do some special stuff for marc_word and marc_subfield_table queries + if ($query eq $queries[0]) { #marc_word + my $listoftagsubfields; + my $notfirst; + foreach my $tag (@tags) { + $listoftagsubfields.="$tag, "; + if (!$notfirst) { + $query.=" WHERE tagsubfield=\'$tag\'"; + $notfirst = 1; + } else { + $query.=" OR tagsubfield=\'$tag\'"; + } + }#foreach + $query.=" GROUP BY word"; + print "Finished building marc_word list\n"; + print "Adding marc_word entries with the following tagsubfields:"."$listoftagsubfields"."\n"; + } + + if ($query eq $queries[1]) { #marc_subfield_table + my $listofsubfieldstuff; #for testing + my $notfirst; + foreach my $tag (@tags) { + my $justtag = $tag; + $justtag =~ s/\D\Z//; + my $subfieldcode = $&; + $listofsubfieldstuff.="$justtag, "."$subfieldcode, "; + if (!$notfirst) { + $query.=" WHERE (tag=\'$justtag\' and subfieldcode=\'$subfieldcode\')"; + $notfirst = 1; + } else { + $query.=" OR (tag=\'$justtag\' and subfieldcode=\'$subfieldcode\')"; + } + }#foreach + $query.=" GROUP BY subfieldvalue"; + print "Finished building marc_subfield_table list\n"; + print "Adding marc_subfield_table entries with the following tags and subfields:"."$listofsubfieldstuff"."\n"; + } + + my $sth=$dbh->prepare($query) || die "cant prepare query"; + $sth->execute(); + + my $insert = "INSERT INTO notdistinctspchk(suggestion,display,resultcount) VALUES(?,?,?)"; + + my $sth2=$dbh2->prepare($insert); + + while (my ($phraseterm,$count)=$sth->fetchrow_array) { + if ($phraseterm) { + #$display looks exactly like the DB + my $display = $phraseterm; + #except for a few things + $display =~s/ / /g; + $display =~ s/^\s+//; #remove leading whitespace + $display =~ s/\s+$//; #remove trailing whitespace + $display =~ s/(\.|\/)/ /g; + + #suggestion is tweaked for optimal searching + my $suggestion = $phraseterm; + $suggestion =~ tr/A-Z/a-z/; + $suggestion =~ s/(\.|\?|\:|\!|\'|,|\-|\"|\(|\)|\[|\]|\{|\})/ /g; + $suggestion =~s/(\Aand-or |\Aand\/or |\Aanon |\Aan |\Aa |\Abut |\Aby |\Ade |\Ader |\Adr |\Adu|et |\Afor |\Afrom |\Ain |\Ainto |\Ait |\Amy |\Anot |\Aon |\Aor |\Aper |\Apt |\Aspp |\Ato |\Avs |\Awith |\Athe )/ /g; + $suggestion =~s/( and-or | and\/or | anon | an | a | but | by | de | der | dr | du|et | for | from | in | into | it | my | not | on | or | per | pt | spp | to | vs | with | the )/ /g; + + $suggestion =~s/ / /g; + + $suggestion =~ s/^\s+//; #remove leading whitespace + $suggestion =~ s/\s+$//; #remove trailing whitespace + + if (length($suggestion)>2) { + $sth2->execute($suggestion,$display,$count) || die "can't execute write"; + $counter++; + } #if + } #if + }#while +print $counter." more records added...\n"; +$sth2->finish; +$sth->finish; +} + +# Now grab distincts from there and insert into our REAL database + +print "Step 4 of 5: Deleting old distinct entries\n"; +my $clear_distincts = "DELETE FROM spellcheck"; + +# Clear out old data +my $sth_clear_distincts=$dbh2->prepare($clear_distincts) || die "cant prepare query"; +$sth_clear_distincts->execute(); + +print "Step 5 of 5: Creating distinct spellcheck table out of non-distinct table\n"; +my $query_distincts = "SELECT DISTINCT suggestion, display, COUNT(display) FROM notdistinctspchk GROUP BY suggestion"; +my $insert_distincts = "INSERT INTO spellcheck(suggestion,display,resultcount) VALUES(?,?,?)"; +my $distinctcounter = 0; + +my $sth=$dbh2->prepare($query_distincts) || die "cant prepare query"; +$sth->execute(); +my $sth2=$dbh2->prepare($insert_distincts) || die "cant prepare query"; +while (my ($suggestion,$display,$count)=$sth->fetchrow_array) { + if ($count) { + $sth2->execute($suggestion,$display,$count) || die "can't execute write"; + $distinctcounter++; + } +} +print "Finished: total distinct items added to spellcheck: "."$distinctcounter\n"; + +$dbh->disconnect(); +$dbh2->disconnect(); -- 2.39.5