Files relevant to LibLime's spellcheck and suggest features are placed
in this directory.
This commit is contained in:
parent
e2085c673a
commit
0bd7d912b8
1 changed files with 237 additions and 0 deletions
237
misc/spellcheck_suggest/make_spellcheck_suggest.pl
Executable file
237
misc/spellcheck_suggest/make_spellcheck_suggest.pl
Executable file
|
@ -0,0 +1,237 @@
|
||||||
|
#!/usr/bin/perl -w
|
||||||
|
## This Script creates a Koha suggest and spellcheck database
|
||||||
|
## for those features as visible on LibLime's opac: opac.liblime.com
|
||||||
|
## It also contains the needed specs for creating a table of
|
||||||
|
## queries for statistical purposes as well as a method of
|
||||||
|
## returning popular searches via the suggest and spellcheck.
|
||||||
|
## The code for running suggest and spellcheck can be found
|
||||||
|
## either in Koha 2.4 CVS (HEAD as of this writing) or at
|
||||||
|
## LibLime's website in the downlaods
|
||||||
|
## section: http://liblime.com/c/downloads.html
|
||||||
|
##
|
||||||
|
##Author: Joshua Ferraro jmf at liblime dot com
|
||||||
|
##
|
||||||
|
## TODO: add suggest features, merge the two of them?
|
||||||
|
## There are a few configurable variables.
|
||||||
|
|
||||||
|
## CONFIGURABLE VARIABLES ####################
|
||||||
|
##
|
||||||
|
# Change this to where your Koha modules are (C4 directory)
|
||||||
|
use lib '/var/www/labs.liblime.com/cvsroots/koha/';
|
||||||
|
# These are the tags that have meaningful data
|
||||||
|
# for the databases I've worked with (MARC21 only)
|
||||||
|
# you may need to change them depending on your data
|
||||||
|
my @tags=(
|
||||||
|
#Tag documentation from http://lcweb.loc.gov/marc/bibliographic/ecbdhome.html
|
||||||
|
"020a", # INTERNATIONAL STANDARD BOOK NUMBER
|
||||||
|
#"022a", # INTERNATIONAL STANDARD SERIAL NUMBER
|
||||||
|
"100a", # MAIN ENTRY--PERSONAL NAME
|
||||||
|
"110a", # MAIN ENTRY--CORPORATE NAME
|
||||||
|
#"110b", # Subordinate unit
|
||||||
|
#"110c", # Location of meeting
|
||||||
|
#"111a", # MAIN ENTRY--MEETING NAME
|
||||||
|
#"111c", # Location of meeting
|
||||||
|
"130a", # MAIN ENTRY--UNIFORM TITLE
|
||||||
|
"240a", # UNIFORM TITLE
|
||||||
|
"245a", # TITLE STATEMENT
|
||||||
|
"245b", # Remainder of title
|
||||||
|
"245c", # Statement of responsibility, etc.
|
||||||
|
"245p", # Name of part/section of a work
|
||||||
|
"246a", # VARYING FORM OF TITLE
|
||||||
|
"246b", # Remainder of title
|
||||||
|
#"260b", # PUBLICATION, DISTRIBUTION, ETC. (IMPRINT)
|
||||||
|
"440a", # SERIES STATEMENT/ADDED ENTRY--TITLE
|
||||||
|
"440p", # Name of part/section of a work
|
||||||
|
#"500a", # GENERAL NOTE
|
||||||
|
"505t", # FORMATTED CONTENTS NOTE (t is Title)
|
||||||
|
"511a", # PARTICIPANT OR PERFORMER NOTE
|
||||||
|
#"520a", # SUMMARY, ETC.
|
||||||
|
"534a", # ORIGINAL VERSION NOTE
|
||||||
|
#"534k", # Key title of original
|
||||||
|
#"534t", # Title statement of original
|
||||||
|
#"586a", # AWARDS NOTE
|
||||||
|
"600a", # SUBJECT ADDED ENTRY--PERSONAL NAME
|
||||||
|
"610a", # SUBJECT ADDED ENTRY--CORPORATE NAME
|
||||||
|
"611a", # SUBJECT ADDED ENTRY--MEETING NAME
|
||||||
|
"630a", # SUBJECT ADDED ENTRY--UNIFORM TITLE
|
||||||
|
"650a", # SUBJECT ADDED ENTRY--TOPICAL TERM
|
||||||
|
"651a", # SUBJECT ADDED ENTRY--GEOGRAPHIC NAME
|
||||||
|
"700a", # ADDED ENTRY--PERSONAL NAME
|
||||||
|
"710a", # ADDED ENTRY--CORPORATE NAME
|
||||||
|
#"711a", # ADDED ENTRY--MEETING NAME
|
||||||
|
#"720a", # ADDED ENTRY--UNCONTROLLED NAME
|
||||||
|
"730a", # ADDED ENTRY--UNIFORM TITLE
|
||||||
|
"740a", # ADDED ENTRY--UNCONTROLLED RELATED/ANALYTICAL TITLE
|
||||||
|
#"752a", # ADDED ENTRY--HIERARCHICAL PLACE NAME
|
||||||
|
"800a", # SERIES ADDED ENTRY--PERSONAL NAME
|
||||||
|
#"810a", # SERIES ADDED ENTRY--CORPORATE NAME
|
||||||
|
#"811a", # SERIES ADDED ENTRY--MEETING NAME
|
||||||
|
"830a", # SERIES ADDED ENTRY--UNIFORM TITLE
|
||||||
|
#"942k" # Holdings Branch ?? Unique to NPL??
|
||||||
|
);
|
||||||
|
## Leave this next bit alone
|
||||||
|
use strict;
|
||||||
|
use C4::Context;
|
||||||
|
##
|
||||||
|
# SUGGEST DATABASE INFO
|
||||||
|
# You'll need to change this if you want to keep your 'suggest' database
|
||||||
|
# separate from your Koha database -- simply comment out the next line
|
||||||
|
# and uncomment the one after it, adding your site info (check out GRANT
|
||||||
|
# syntax in the mysql manual if you're unsure how enable authentication)
|
||||||
|
#
|
||||||
|
#my dbh2 = C4::Context->dbh;
|
||||||
|
#
|
||||||
|
#my $dbh2=DBI->connect("DBI:mysql:<add your database name here>:localhost","<add your mysql user here>","<add your password here>");
|
||||||
|
my $dbh2=DBI->connect("DBI:mysql:demosuggest:localhost","sugg","Free2cirC");
|
||||||
|
########################################################################
|
||||||
|
## End of most common configurable variables: in most cases you won't need
|
||||||
|
## edit any further ... of course feel free to indulge yourself ;-)
|
||||||
|
########################################################################
|
||||||
|
my $dbh=C4::Context->dbh;
|
||||||
|
my $counter = 0;
|
||||||
|
|
||||||
|
# Check for existance of suggest database and add if it doesn't.
|
||||||
|
print "Step 1 of 5: Checking to make sure suggest tables exist\n";
|
||||||
|
my $check_tables_query = "select distinct resultcount from ?";
|
||||||
|
my @tables = ("notdistinctspchk", "notdistinctsugg", "spellcheck", "suggestions");
|
||||||
|
my %tables = ( notdistinctspchk => "( display varchar(40) not null default,
|
||||||
|
suggestion varchar(40) not null default,
|
||||||
|
foreach my $table (@tables) {
|
||||||
|
my $sth_check=$dbh2->prepare($check_tables_query) || die "cant prepare query: $DBI::errstr";
|
||||||
|
my $rv = $sth_check->execute($table);
|
||||||
|
if($rv eq undef) {
|
||||||
|
print "$table missing ... creating it now\n";
|
||||||
|
my $create_this = "CREATE TABLE \'$table\' \(
|
||||||
|
display varchar\(40\) NOT NULL default \'\',
|
||||||
|
suggestion varchar\(40\) NOT NULL default \'\',
|
||||||
|
resultcount varchar\(40\) NOT NULL default \'0\'
|
||||||
|
\) TYPE=MyISAM";
|
||||||
|
my $sth_create = $dbh->prepare($create_this) || die "can't prepare query: $DBI::errstr";
|
||||||
|
$sth_create->execute() || die "can't execute: $DBI::errstr";
|
||||||
|
print "$table created ...\n";
|
||||||
|
}else {
|
||||||
|
print "$table exists ... moving along\n";
|
||||||
|
}
|
||||||
|
}
|
||||||
|
print "All tables present ... moving along\n";
|
||||||
|
|
||||||
|
print "Step 2 of 5: Deleting old data\n";
|
||||||
|
my $clear_out = "DELETE FROM notdistinctspchk";
|
||||||
|
# Clear out old data
|
||||||
|
my $sth_clear_out=$dbh2->prepare($clear_out) || die "cant prepare query";
|
||||||
|
$sth_clear_out->execute();
|
||||||
|
print "Step 3 of 5: Creating non-distinct table from various Koha tables\n";
|
||||||
|
my $query_words = "SELECT DISTINCT word, COUNT(word) FROM marc_word";
|
||||||
|
my $query_marc_subfields = "SELECT DISTINCT subfieldvalue, COUNT(subfieldvalue) FROM marc_subfield_table";
|
||||||
|
my $query_titles = "SELECT DISTINCT title, COUNT(title) FROM biblio GROUP BY title";
|
||||||
|
my $query_authors = "SELECT DISTINCT author, COUNT(author) FROM biblio GROUP BY author";
|
||||||
|
|
||||||
|
my @queries = ("$query_words", "$query_marc_subfields", "$query_titles", "$query_authors");
|
||||||
|
|
||||||
|
foreach my $query (@queries) {
|
||||||
|
|
||||||
|
#we need to do some special stuff for marc_word and marc_subfield_table queries
|
||||||
|
if ($query eq $queries[0]) { #marc_word
|
||||||
|
my $listoftagsubfields;
|
||||||
|
my $notfirst;
|
||||||
|
foreach my $tag (@tags) {
|
||||||
|
$listoftagsubfields.="$tag, ";
|
||||||
|
if (!$notfirst) {
|
||||||
|
$query.=" WHERE tagsubfield=\'$tag\'";
|
||||||
|
$notfirst = 1;
|
||||||
|
} else {
|
||||||
|
$query.=" OR tagsubfield=\'$tag\'";
|
||||||
|
}
|
||||||
|
}#foreach
|
||||||
|
$query.=" GROUP BY word";
|
||||||
|
print "Finished building marc_word list\n";
|
||||||
|
print "Adding marc_word entries with the following tagsubfields:"."$listoftagsubfields"."\n";
|
||||||
|
}
|
||||||
|
|
||||||
|
if ($query eq $queries[1]) { #marc_subfield_table
|
||||||
|
my $listofsubfieldstuff; #for testing
|
||||||
|
my $notfirst;
|
||||||
|
foreach my $tag (@tags) {
|
||||||
|
my $justtag = $tag;
|
||||||
|
$justtag =~ s/\D\Z//;
|
||||||
|
my $subfieldcode = $&;
|
||||||
|
$listofsubfieldstuff.="$justtag, "."$subfieldcode, ";
|
||||||
|
if (!$notfirst) {
|
||||||
|
$query.=" WHERE (tag=\'$justtag\' and subfieldcode=\'$subfieldcode\')";
|
||||||
|
$notfirst = 1;
|
||||||
|
} else {
|
||||||
|
$query.=" OR (tag=\'$justtag\' and subfieldcode=\'$subfieldcode\')";
|
||||||
|
}
|
||||||
|
}#foreach
|
||||||
|
$query.=" GROUP BY subfieldvalue";
|
||||||
|
print "Finished building marc_subfield_table list\n";
|
||||||
|
print "Adding marc_subfield_table entries with the following tags and subfields:"."$listofsubfieldstuff"."\n";
|
||||||
|
}
|
||||||
|
|
||||||
|
my $sth=$dbh->prepare($query) || die "cant prepare query";
|
||||||
|
$sth->execute();
|
||||||
|
|
||||||
|
my $insert = "INSERT INTO notdistinctspchk(suggestion,display,resultcount) VALUES(?,?,?)";
|
||||||
|
|
||||||
|
my $sth2=$dbh2->prepare($insert);
|
||||||
|
|
||||||
|
while (my ($phraseterm,$count)=$sth->fetchrow_array) {
|
||||||
|
if ($phraseterm) {
|
||||||
|
#$display looks exactly like the DB
|
||||||
|
my $display = $phraseterm;
|
||||||
|
#except for a few things
|
||||||
|
$display =~s/ / /g;
|
||||||
|
$display =~ s/^\s+//; #remove leading whitespace
|
||||||
|
$display =~ s/\s+$//; #remove trailing whitespace
|
||||||
|
$display =~ s/(\.|\/)/ /g;
|
||||||
|
|
||||||
|
#suggestion is tweaked for optimal searching
|
||||||
|
my $suggestion = $phraseterm;
|
||||||
|
$suggestion =~ tr/A-Z/a-z/;
|
||||||
|
$suggestion =~ s/(\.|\?|\:|\!|\'|,|\-|\"|\(|\)|\[|\]|\{|\})/ /g;
|
||||||
|
$suggestion =~s/(\Aand-or |\Aand\/or |\Aanon |\Aan |\Aa |\Abut |\Aby |\Ade |\Ader |\Adr |\Adu|et |\Afor |\Afrom |\Ain |\Ainto |\Ait |\Amy |\Anot |\Aon |\Aor |\Aper |\Apt |\Aspp |\Ato |\Avs |\Awith |\Athe )/ /g;
|
||||||
|
$suggestion =~s/( and-or | and\/or | anon | an | a | but | by | de | der | dr | du|et | for | from | in | into | it | my | not | on | or | per | pt | spp | to | vs | with | the )/ /g;
|
||||||
|
|
||||||
|
$suggestion =~s/ / /g;
|
||||||
|
|
||||||
|
$suggestion =~ s/^\s+//; #remove leading whitespace
|
||||||
|
$suggestion =~ s/\s+$//; #remove trailing whitespace
|
||||||
|
|
||||||
|
if (length($suggestion)>2) {
|
||||||
|
$sth2->execute($suggestion,$display,$count) || die "can't execute write";
|
||||||
|
$counter++;
|
||||||
|
} #if
|
||||||
|
} #if
|
||||||
|
}#while
|
||||||
|
print $counter." more records added...\n";
|
||||||
|
$sth2->finish;
|
||||||
|
$sth->finish;
|
||||||
|
}
|
||||||
|
|
||||||
|
# Now grab distincts from there and insert into our REAL database
|
||||||
|
|
||||||
|
print "Step 4 of 5: Deleting old distinct entries\n";
|
||||||
|
my $clear_distincts = "DELETE FROM spellcheck";
|
||||||
|
|
||||||
|
# Clear out old data
|
||||||
|
my $sth_clear_distincts=$dbh2->prepare($clear_distincts) || die "cant prepare query";
|
||||||
|
$sth_clear_distincts->execute();
|
||||||
|
|
||||||
|
print "Step 5 of 5: Creating distinct spellcheck table out of non-distinct table\n";
|
||||||
|
my $query_distincts = "SELECT DISTINCT suggestion, display, COUNT(display) FROM notdistinctspchk GROUP BY suggestion";
|
||||||
|
my $insert_distincts = "INSERT INTO spellcheck(suggestion,display,resultcount) VALUES(?,?,?)";
|
||||||
|
my $distinctcounter = 0;
|
||||||
|
|
||||||
|
my $sth=$dbh2->prepare($query_distincts) || die "cant prepare query";
|
||||||
|
$sth->execute();
|
||||||
|
my $sth2=$dbh2->prepare($insert_distincts) || die "cant prepare query";
|
||||||
|
while (my ($suggestion,$display,$count)=$sth->fetchrow_array) {
|
||||||
|
if ($count) {
|
||||||
|
$sth2->execute($suggestion,$display,$count) || die "can't execute write";
|
||||||
|
$distinctcounter++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
print "Finished: total distinct items added to spellcheck: "."$distinctcounter\n";
|
||||||
|
|
||||||
|
$dbh->disconnect();
|
||||||
|
$dbh2->disconnect();
|
Loading…
Reference in a new issue