From a887b004367a32659cc9b7f314663264b9f219bb Mon Sep 17 00:00:00 2001
From: rangi
Date: Sun, 22 May 2005 01:18:45 +0000
Subject: [PATCH] First cut of a search engine using Plucene NOT FOR
 PRODUCTION, but if anyone wishes to test/improve go right ahead

---
 koha-plucene/indexer.pl |  80 ++++++++++++++++++++++++
 koha-plucene/search.cgi | 122 ++++++++++++++++++++++++++++++++++++
 2 files changed, 202 insertions(+)
 create mode 100755 koha-plucene/indexer.pl
 create mode 100755 koha-plucene/search.cgi

diff --git a/koha-plucene/indexer.pl b/koha-plucene/indexer.pl
new file mode 100755
index 0000000000..ee8a16c6cb
--- /dev/null
+++ b/koha-plucene/indexer.pl
@@ -0,0 +1,80 @@
+#!/usr/bin/perl -w
+
+# This script builds a Plucene index of all the biblios in a Koha database.
+# It uses English (Porter) stemming at the moment, but that can be changed,
+# and it only indexes the author and title of each biblio.
+
+# Combine this with the search.cgi script to search Koha using Plucene
+# This is still a work in progress, use with caution
+
+# $Id$
+
+# Copyright 2005 Katipo Communications
+#
+# This file is part of Koha.
+#
+# Koha is free software; you can redistribute it and/or modify it under the
+# terms of the GNU General Public License as published by the Free Software
+# Foundation; either version 2 of the License, or (at your option) any later
+# version.
+#
+# Koha is distributed in the hope that it will be useful, but WITHOUT ANY
+# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
+# A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License along with
+# Koha; if not, write to the Free Software Foundation, Inc., 59 Temple Place,
+# Suite 330, Boston, MA 02111-1307 USA
+
+use lib '/usr/local/koha/intranet/modules';
+use strict;
+use C4::Context;
+use Plucene::Index::Writer;
+use Plucene::Plugin::Analyzer::PorterAnalyzer;
+use Plucene::Document;
+use Plucene::Document::Field;
+
+# Directory the index is written to (currently /tmp/plucene).
+# PLEASE change this if you want to use the script in production, and keep it
+# in sync with the directory search.cgi reads from
+my $index_dir = "/tmp/plucene";
+
+# connect to the database and fetch the columns we index for every biblio
+my $dbh = C4::Context->dbh();
+
+my $query = "SELECT biblionumber, title, author FROM biblio";
+my $sth   = $dbh->prepare($query)
+  or die "Cannot prepare biblio query: " . $dbh->errstr;
+
+$sth->execute()
+  or die "Cannot execute biblio query: " . $sth->errstr;
+
+# create an index writer
+my $writer = Plucene::Index::Writer->new(
+    $index_dir,
+    Plucene::Plugin::Analyzer::PorterAnalyzer->new(),
+    1    # Create the index from scratch
+);
+
+# For each biblio, add its information to the index
+
+while ( my $data = $sth->fetchrow_hashref() ) {
+
+    # title or author may come back undefined; index an empty string instead
+    # so every document carries both fields and no undef reaches Plucene
+    my $title  = defined $data->{'title'}  ? $data->{'title'}  : '';
+    my $author = defined $data->{'author'} ? $data->{'author'} : '';
+
+    my $doc = Plucene::Document->new();
+    $doc->add(
+        Plucene::Document::Field->Keyword( filename => $data->{'biblionumber'} )
+    );
+    $doc->add( Plucene::Document::Field->Text( title  => $title ) );
+    $doc->add( Plucene::Document::Field->Text( author => $author ) );
+    $writer->add_document($doc);
+}
+
+# merge the index segments into one, then release the writer explicitly so
+# the index is flushed now rather than whenever global destruction happens
+$writer->optimize;
+undef $writer;
diff --git a/koha-plucene/search.cgi b/koha-plucene/search.cgi
new file mode 100755
index 0000000000..3277159e3e
--- /dev/null
+++ b/koha-plucene/search.cgi
@@ -0,0 +1,122 @@
+#!/usr/bin/perl
+
+# script to search the plucene index of the database
+# most of this will be shifted to a module when it moves out of the proof of
+# concept stage
+
+# $Id$
+
+# Copyright 2005 Katipo Communications
+#
+# This file is part of Koha.
+#
+# Koha is free software; you can redistribute it and/or modify it under the
+# terms of the GNU General Public License as published by the Free Software
+# Foundation; either version 2 of the License, or (at your option) any later
+# version.
+#
+# Koha is distributed in the hope that it will be useful, but WITHOUT ANY
+# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
+# A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License along with
+# Koha; if not, write to the Free Software Foundation, Inc., 59 Temple Place,
+# Suite 330, Boston, MA 02111-1307 USA
+
+use strict;
+use warnings;
+
+use Plucene::Search::IndexSearcher;
+use Plucene::Plugin::Analyzer::PorterAnalyzer;
+use Plucene::QueryParser;
+use Plucene::Search::HitCollector;
+
+use C4::Auth;
+use C4::Interface::CGI::Output;
+
+use Data::Dumper;
+
+use CGI;
+my $cgi = CGI->new;
+
+# get a template, opac-pluceneresults.tmpl is currently an exact copy of
+# opac-searchresults.tmpl so just make a copy.
+my ( $template, $borrowernumber, $cookie ) = get_template_and_user(
+    {
+        template_name   => "opac-pluceneresults.tmpl",
+        query           => $cgi,
+        type            => "opac",
+        authnotrequired => 1,
+    }
+);
+
+# the script expects an input called query;
+my $query = $cgi->param('query');
+
+# tell the script what index to use (change this to match whatever is in
+# indexer.pl)
+my $searcher = Plucene::Search::IndexSearcher->new("/tmp/plucene/");
+
+# the important bit here is default=>"title"
+# that says if we don't specify what to search, search the title field
+my $parser = Plucene::QueryParser->new(
+    {
+        analyzer => Plucene::Plugin::Analyzer::PorterAnalyzer->new(),
+        default  => "title"
+    }
+);
+
+# only parse when we were given something to search for; the parser may die
+# on input it cannot handle, so trap that, note it in the error log and show
+# an empty result list instead of a server error
+my $parsed;
+if ( defined $query && $query =~ /\S/ ) {
+    $parsed = eval { $parser->parse($query) };
+    warn "Could not parse Plucene query: $@" if !$parsed && $@;
+}
+
+my @docs;
+
+if ($parsed) {
+
+    # build an array of results,
+    # we could use the $score to rank them, but it's currently not doing that
+    my $hc = Plucene::Search::HitCollector->new(
+        collect => sub {
+            my ( $self, $doc, $score ) = @_;
+            my $res = eval { $searcher->doc($doc) };
+            push @docs, $res if $res;
+        }
+    );
+
+    # do the search
+    $searcher->search_hc( $parsed, $hc );
+}
+
+# return the stored string of the named field of a Plucene document, or an
+# empty string when the field cannot be fetched or holds no value
+sub _field_string {
+    my ( $doc, $name ) = @_;
+    my $field = eval { $doc->get($name) };
+    return '' unless $field;
+    my $string = $field->string;
+    return defined $string ? $string : '';
+}
+
+# map the results into a format our template is expecting
+my @results = map {
+    +{
+        biblionumber => _field_string( $_, "filename" ),
+        title        => _field_string( $_, "title" ),
+        author       => _field_string( $_, "author" ),
+    }
+} @docs;
+
+# pass the results to the template
+my $num_records = @results;
+$template->param(
+    search_results => \@results,
+    numrecords     => $num_records,
+    searchdesc     => $query
+);
+output_html_with_http_headers $cgi, $cookie, $template->output;
-- 
2.39.5