Tatsuki SUGIURA
sugi****@users*****
2006年 7月 12日 (水) 20:41:57 JST
Index: slashjp/plugins/SearchToo/SearchToo/Classic.pm
diff -u /dev/null slashjp/plugins/SearchToo/SearchToo/Classic.pm:1.1
--- /dev/null Wed Jul 12 20:41:57 2006
+++ slashjp/plugins/SearchToo/SearchToo/Classic.pm Wed Jul 12 20:41:57 2006
@@ -0,0 +1,194 @@
+package Slash::SearchToo::Classic;
+
+use strict;
+use Slash::Utility;
+use Slash::DB::Utility;
+use vars qw($VERSION);
+use base 'Slash::DB::Utility';
+use base 'Slash::SearchToo';
+
+($VERSION) = ' $Revision: 1.1 $ ' =~ /\$Revision:\s+([^\s]+)/;
+
+# FRY: I did it!  And it's all thanks to the books at my local library.
+
+#################################################################
+# Constructor.
+#   $class - class name
+#   $user  - virtual user, stored in the object before sqlConnect()
+# Returns the new object, or nothing when the 'Search' plugin is not
+# enabled (every query in this class is delegated to Slash::Search).
+sub new {
+    my($class, $user) = @_;
+    my $self = {};
+
+    my $plugin = getCurrentStatic('plugin');
+    return unless $plugin->{'Search'};
+
+    bless($self, $class);
+    $self->{virtual_user} = $user;
+    $self->sqlConnect();
+
+    return $self;
+}
+
+#################################################################
+# Returns a hashref of the search types this backend offers.  A value
+# of 1 marks a standard type; a code ref (see 'test') supplies a custom
+# search routine.
+sub getOps {
+    my %ops = (
+        stories     => 1,
+        comments    => 1,
+        journals    => 1,
+        polls       => 1,
+        users       => 1,
+        submissions => 1,
+        test        => \&testSearch,
+    );
+    return \%ops;
+}
+
+#################################################################
+# Translate a SearchToo-style query into a call on the old
+# Slash::Search API.
+#   $type  - one of stories, comments, journals, polls, users,
+#            submissions; any other type performs no query
+#   $query - hashref; keys read here: query, topic, section, uid,
+#            author, submitter, sid, points_min, journal_only, note
+#   $opts  - hashref; keys read here: records_start, records_max, sort
+# Returns the $results hashref after it has been passed through
+# prepResults().
+sub findRecords {
+    my($self, $type, $query, $opts) = @_;
+
+    my(%processed);
+    my $results = {};
+    my $records = [];
+
+    my $constants = getCurrentStatic();
+    my $oldsearch = getObject('Slash::Search', { db_type => 'search' });
+
+
+    ### set up common query terms
+    my %terms = (
+        query => $query->{query},
+    );
+
+    if ($query->{topic}) {
+        my @topics = ref $query->{topic}
+            ? @{$query->{topic}}
+            : $query->{topic};
+        $processed{tid} = $topics[0] if @topics;
+        # API is expecting multiple args in _multi, so we fake it
+        $processed{_multi}{tid} = \@topics if @topics > 1;
+    }
+
+    if ($query->{section}) {
+        my $reader = getObject('Slash::DB', { db_type => 'reader' });
+        # get section name, for most compatibility with this API
+        my $skin = $reader->getSkin($query->{section});
+        $processed{section} = $skin->{name} if $skin && $skin->{name};
+    }
+
+    # these are only passed along when they are plain positive integers
+    for (qw(uid author submitter)) {
+        $processed{$_} = $query->{$_} if $query->{$_} && $query->{$_} =~ /^\d+$/;
+    }
+
+
+    ### set up common options
+    # old API cannot tell us total or matches
+    # undef if we cannot find for sure, or if not applicable
+    my $total   = undef;
+    my $matches = undef;
+    my $start   = $opts->{records_start} || 0;
+    my $max     = $opts->{records_max}   || $constants->{search_default_display};
+    # if we are not getting total number of matches, fetch an extra so we
+    # know if there are more, for pagination purposes
+    $max++ if !defined $matches;
+
+    # sort can be an arrayref, but old API can handle only one.
+    # Normalize the *extracted* value: comparing $opts->{sort} itself
+    # would compare a stringified reference whenever an arrayref is
+    # passed, and the sort would silently fall back to 0.
+    my $sort = ref $opts->{sort} ? $opts->{sort}[0] : $opts->{sort};
+    $sort = '' unless defined $sort;
+    $sort = ($sort eq 'date'      || $sort eq '1') ? 1 :
+            ($sort eq 'relevance' || $sort eq '2') ? 2 :
+            0;
+
+### options not used in this backend
+#    date_start => '', date_end => '',
+
+
+    ### dispatch to different queries
+    if ($type eq 'stories') {
+        for (qw(tid _multi section author submitter)) {
+            $terms{$_} = $processed{$_} if $processed{$_};
+        }
+
+        $records = $oldsearch->findStory(\%terms, $start, $max, $sort);
+    }
+
+    elsif ($type eq 'comments') {
+        for (qw(section)) {
+            $terms{$_} = $processed{$_} if $processed{$_};
+        }
+        %terms = (%terms,
+            sid       => $query->{sid},
+            threshold => $query->{points_min},
+        );
+
+        $records = $oldsearch->findComments(\%terms, $start, $max, $sort);
+    }
+
+    elsif ($type eq 'journals') {
+        for (qw(tid uid)) {
+            $terms{$_} = $processed{$_} if $processed{$_};
+        }
+
+        $records = $oldsearch->findJournalEntry(\%terms, $start, $max, $sort);
+    }
+
+    elsif ($type eq 'polls') {
+        for (qw(tid section uid)) {
+            $terms{$_} = $processed{$_} if $processed{$_};
+        }
+
+        $records = $oldsearch->findPollQuestion(\%terms, $start, $max, $sort);
+    }
+
+    elsif ($type eq 'users') {
+        # sigh, why is this ONE method passing info in an additional parameter?
+        $records = $oldsearch->findUsers(\%terms, $start, $max, $sort, $query->{journal_only});
+    }
+
+    elsif ($type eq 'submissions') {
+        for (qw(tid section uid)) {
+            $terms{$_} = $processed{$_} if $processed{$_};
+        }
+        %terms = (%terms,
+            note => $query->{note},
+        );
+
+        $records = $oldsearch->findSubmission(\%terms, $start, $max, $sort);
+    }
+
+    $self->prepResults($results, $records, [$total, $matches, $start, $max]);
+    return $results;
+}
+
+#################################################################
+# this is a way of adding extra search thingys; we could call another
+# search method (as defaultSearch calls findRecords), or just make this our
+# search method.
+#
+# Demo search over the letters 'a' .. 'z'.  Only $searchDB, $rss and
+# $opts are used; the remaining parameters are accepted and ignored.
+# Returns a hashref with keys results, noresults, template (a reference
+# to the template text) and, when $rss is true, rss.
+sub testSearch {
+    my($reader, $constants, $user, $form, $gSkin, $searchDB, $rss, $query, $opts) = @_;
+
+    my $results = {};
+    my $records = ['a' .. 'z'];
+    my $total   = 26;
+    my $matches = 26;
+    my $start   = $opts->{records_start} || 0;
+    my $max     = $opts->{records_max}   || 26;
+
+    # take at most $max records, and never index past the end of the
+    # list: $start .. $start + $max is $max + 1 elements and pads the
+    # page with undef entries once it runs off the end
+    my $last = $start + $max - 1;
+    $last = $#{$records} if $last > $#{$records};
+    $records = [ @{$records}[$start .. $last] ];
+    $searchDB->prepResults($results, $records, [$total, $matches, $start, $max]);
+
+    my %return;
+    $return{results}   = $results;
+    $return{noresults} = 'No results';
+    $return{template}  = \ <<EOT;
+[% FOREACH letter=results.records %]
+<p>[% letter %]</p>
+[% END %]
+[% PROCESS pagination %]
+<p>
+EOT
+
+    $return{rss} = {} if $rss;
+
+    return \%return;
+}
+
+1;
+
+__END__
Index: slashjp/plugins/SearchToo/SearchToo/Indexer.pm
diff -u /dev/null slashjp/plugins/SearchToo/SearchToo/Indexer.pm:1.1
--- /dev/null Wed Jul 12 20:41:57 2006
+++ slashjp/plugins/SearchToo/SearchToo/Indexer.pm Wed Jul 12 20:41:57 2006
@@ -0,0 +1,407 @@
+package Slash::SearchToo::Indexer;
+
+use strict;
+use File::Copy;
+use File::Find;
+use File::Path;
+use File::Spec::Functions;
+use Slash::Utility;
+use Slash::DB::Utility;
+use vars qw($VERSION);
+use base 'Slash::SearchToo';
+use base 'Slash::SearchToo::Classic';
+
+($VERSION) = ' $Revision: 1.1 $ ' =~ /\$Revision:\s+([^\s]+)/;
+
+# FRY: I did it!  And it's all thanks to the books at my local library.
+
+# This is a superclass for various SearchToo engines that do indexing etc.
+ + +################################################################# +# fields that will be combined into the content field, +# for indexing and tokenization; first field is main one to excerpt +our %content = ( + comments => [qw(comment subject)], + stories => [qw(introtext bodytext title)], +); + +# additional fields that will be indexed and tokenized +our %text = ( + comments => [ qw(tids) ], + stories => [ qw(tids) ], +); + +our %primary = ( + comments => 'cid', +); + +# turn into hashes +for my $hash (\%text, \%content) { + for my $type (keys %$hash) { + my $arr = $hash->{$type}; + $hash->{$type} = { map { ($_ => 1) } @$arr }; + $hash->{_array}{$type} = $arr; + } +} + +################################################################# +sub new { + my($class, $user) = @_; + my $plugin = getCurrentStatic('plugin'); + return unless $plugin->{'SearchToo'}; + + my $handled; + { no strict; + $handled = ${$class . '::handled'}; + } + + my $self = { + _fields => { + content => \%content, + text => \%text, + primary => \%primary, + }, + }; + $self->{_handled} = $handled if $handled; + + bless $self, $class; + $self->{virtual_user} = $user; + $self->sqlConnect(); + + return $self; +} + +################################################################# +sub findRecords { + my($self, $type, $query, $opts) = @_; + + # let Classic handle for now + return Slash::SearchToo::Classic::findRecords(@_) unless $self->_handled($type); + +slashProfInit(); +slashProf('findRecords setup'); + + my $constants = getCurrentStatic(); + + my $processed = $self->_fudge_data($query); + my $results = {}; + my $records = []; + + ### set up common query terms + my $terms = { + query => $query->{query}, + }; + + + ### set up common options + my $sopts = {}; + $sopts->{total} = 0; + $sopts->{matches} = 0; + $sopts->{start} = $opts->{records_start} || 0; + $sopts->{max} = $opts->{records_max} || $constants->{search_default_display}; + + # sort can be an arrayref, but stick with one for now + ## no 
way to sort by date yet + $sopts->{sort} = ref $opts->{sort} ? $opts->{sort}[0] : $opts->{sort}; + $sopts->{sort} = ($opts->{sort} eq 'date' || $opts->{sort} eq 1) ? 1 : + ($opts->{sort} eq 'relevance' || $opts->{sort} eq 2) ? 2 : + 0; + + ### dispatch to different queries + if ($type eq 'comments') { + for (qw(section)) { + $terms->{$_} = $processed->{$_} if $processed->{$_}; + } + %$terms = (%$terms, + sid => $query->{sid}, + points_min => $query->{points_min}, + ); + } + +slashProf('_findRecords', 'findRecords setup'); + $self->_findRecords($results, $records, $sopts, $terms, $opts); +slashProf('getRecords', '_findRecords'); + $self->getRecords($type => $records); +slashProf('prepResults', 'getRecords'); + $self->prepResults($results, $records, $sopts); +slashProf('', 'getRecords'); + +slashProfEnd(); + + return $results; + + +} + +################################################################# +sub addRecords { + my($self, $type, $data, $opts) = @_; + + return unless $self->_handled($type); + +slashProfInit(); +slashProf('addRecords setup'); + + $data = [ $data ] unless ref $data eq 'ARRAY'; + + my @documents; + +slashProf('prepare records', 'addRecords setup'); + + for my $record (@$data) { + next unless keys %$record; + my $processed = $self->_fudge_data($record); + my %document; + + if ($type eq 'comments') { + %document = ( + cid => $record->{cid}, + + date => $record->{date}, + points => $record->{points}, + + comment => $record->{comment}, + subject => $record->{subject}, + sid => $record->{discussion_id}, + primaryskid => $processed->{section}, + tids => join(' ', @{$processed->{topic}}), + ); + } + + push @documents, \%document; + } + + # so we can index outside the main dir + if ($opts->{dir}) { + $self->_dir($opts->{dir}); + } + + # only bother if not adding, i.e., if modifying; if adding we + # assume it is new + unless ($opts->{add}) { + $self->deleteRecords($type => [ map $_->{ $self->{_fields}{primary}{$type} }, @documents ]); + } + 
+slashProf('add docs', 'prepare records'); + + my $count = $self->_addRecords($type, \@documents, $opts); + +slashProf('', 'add docs'); + + # clear it out when we're done + if ($opts->{dir}) { + $self->_dir(''); + } + +slashProfEnd(); + + return $count; +} + +################################################################# +sub prepRecord { + my($self, $type, $data, $opts) = @_; + + return unless $self->_handled($type); + + # default to writer + my $db = $opts->{db} || getCurrentDB(); + my %record; + + $data = { $primary{$type} => $data } unless ref $data; + + # this could possibly be done to get a bunch of comments at once ... + if ($type eq 'comments') { + my $comment = $db->getComment($data->{cid}) or return {}; + for (qw(date points cid subject)) { + $record{$_} = $comment->{$_}; + } + + $record{comment} = $data->{comment} || $db->getCommentText($data->{cid}); + + my $discussion = $db->getDiscussion($comment->{sid}); + $record{discussion_id} = $discussion->{id}; + $record{section} = $discussion->{primaryskid}; + $record{topic} = $discussion->{stoid} + ? $db->getStoryTopicsRendered($discussion->{stoid}) + : $discussion->{topic}; + } + + return \%record; +} + +################################################################# +sub getRecords { + my($self, $type, $data, $opts) = @_; + + return unless $self->_handled($type); + + # default to ... search? reader? + my $db = $opts->{db} || getObject('Slash::DB', { type => 'reader' }); + my %record; + + if ($type eq 'comments') { + for my $datum (@$data) { + # just return the whole comment ... why not? 
+ my $comment = $db->getComment($datum->{cid}); + if ($comment) { + @{$datum}{keys %$comment} = values %$comment; + } else { + $datum = {}; + next; + } + if ($comment->{sid}) { + my $discussion = $db->getDiscussion($comment->{sid}); + @{$datum}{qw( + primaryskid url title + author_uid did + )} = @{$discussion}{qw( + primaryskid url title + uid id + )}; + } + } + } +} + +################################################################# +# handle delete too? +sub storeRecords { + my($self, $type, $data, $opts) = @_; +return; + return unless $self->_handled($type); + + my $slashdb = getCurrentDB(); + + $data = [ $data ] unless ref $data eq 'ARRAY'; + + my $count = 0; + for my $record (@$data) { + next unless $record; + + # deal with multiple instances of same type => id + $count++ if $slashdb->sqlInsert('search_index_dump', { + type => $type, + id => $record, + status => $opts->{add} ? 'new' : 'changed', + }); + } + + return $count; +} + +################################################################# +# move prepared index data to live +sub moveLive { + my($self, $type, $dir) = @_; + + return unless $self->can('_dir') && ($dir || $self->can('_backup_dir')); + + my $backup_dir = $self->_backup_dir($type, $dir); + $dir = $self->_dir($type, ''); + + my @time = localtime; + my $now = sprintf "-%04d%02d%02d-%02d%02d%02d", $time[5]+1900, $time[4]+1, $time[3], $time[2], $time[1], $time[0]; + $dir =~ s|/+$||; # just in case + my $olddir = $dir . $now; + my $tmpdir = $dir . '-tmp'; + + # copy staging to temp dir + _moveFind($backup_dir, $tmpdir); + # move live to backup + rename($dir, $olddir); + # move temp to live + rename($tmpdir, $dir); + + # kick old? 
+} + +################################################################# +sub _moveFind { + my($olddir, $newdir); + find(sub { + my($old) = $File::Find::name; + my $new = s/^\Q$olddir/$newdir/; + if (-d $old) { + eval { + mkpath($new, 0, 0775); + }; + if ($@) { + warn "Can't create path $new: $@"; + } + } elsif (-f _) { + copy($old, $new) or warn "Can't copy file $new: $!"; + } + }, $olddir); +} + +################################################################# +sub _field_exists { + my($self, $field, $key, $type) = @_; + return unless $field; + $type = $self->_type($type); + + return $self->{_fields}{$field}{$type}{$key}; +} + +################################################################# +sub _field_list { + my($self, $field, $type) = @_; + return unless $field; + $type = $self->_type($type); + + return $self->{_fields}{$field}{_array}{$type}; +} + +################################################################# +sub _primary { + my($self, $type) = @_; + $type = $self->_type($type); + + return $self->{_fields}{primary}{$type}; +} + +################################################################# +sub _handled { + my($self, $type) = @_; + $type = $self->_type($type); + return $type =~ $self->{_handled}; +} + +################################################################# +sub _type { + my($self, $type) = @_; + $self->{_type} = $type if defined $type; + return $self->{_type}; +} + +################################################################# +sub _class { + my($self) = @_; + unless ($self->{_class}) { + ($self->{_class} = lc ref $self) =~ s/^.+:://; + } + return $self->{_class}; +} + +################################################################# +sub _dir { + my($self, $type, $dir) = @_; + $self->{_dir} = $dir if defined $dir; + $self->{_dir} ||= catdir(getCurrentStatic('datadir'), 'search_index'); + + return catdir($self->{_dir}, $self->_class, $self->_type($type)); +} + +################################################################# +sub 
_backup_dir { + my($self, $type, $dir) = @_; + my $backup_dir = $dir || catdir(getCurrentStatic('datadir', 'search_index_tmp')); + + return $self->_dir($type, $backup_dir); +} + +1; + +__END__ Index: slashjp/plugins/SearchToo/SearchToo/Kinosearch.pm diff -u /dev/null slashjp/plugins/SearchToo/SearchToo/Kinosearch.pm:1.1 --- /dev/null Wed Jul 12 20:41:57 2006 +++ slashjp/plugins/SearchToo/SearchToo/Kinosearch.pm Wed Jul 12 20:41:57 2006 @@ -0,0 +1,330 @@ +package Slash::SearchToo::Kinosearch; + +# STILL IN PROGRESS NOT READY FOR USE + +use strict; +use File::Path; +use File::Spec::Functions; +use Slash::Utility; +use Slash::DB::Utility; +use vars qw($VERSION); +use base 'Slash::SearchToo::Indexer'; + +use Search::Kinosearch::KSearch; +use Search::Kinosearch::Kindexer; + +($VERSION) = ' $Revision: 1.1 $ ' =~ /\$Revision:\s+([^\s]+)/; + +# FRY: I did it! And it's all thanks to the books at my local library. + +our $handled = qr{^(?:comments)$}; + +our $backend = 'DB_File'; + +################################################################# +sub getOps { + my %ops = ( + stories => 1, + comments => 1, + journals => 1, + polls => 1, + users => 1, + submissions => 1, + ); + return \%ops; +} + +################################################################# +sub _findRecords { + my($self, $results, $records, $sopts, $terms, $opts) = @_; + + my $constants = getCurrentStatic(); + +slashProf('init search'); + + my $querystring = $terms->{query}; + # escape special chars + # none, allow all special chars +# $querystring =~ s/([&^|!{}[\]:\\])~*?/\\$1/g; # allowed: ()"+- + # normalize to lower case ??? 
+ $querystring =~ s/\b(?!AND|NOT|OR)(\w+)\b/\L$1/g; + + $sopts->{max}++; # until we get matches/num_hits working + my $searcher_opts = { + -num_results => $sopts->{max}, + -offset => $sopts->{start}, + -excerpt_field => $self->_field_list('content')->[0] + }; + + if ($sopts->{'sort'} == 1) { + $searcher_opts->{-sort_by} = 'timestamp'; + } elsif ($sopts->{'sort'} == 2) { + $searcher_opts->{-sort_by} = 'relevance'; + } + + my $searcher = $self->_searcher(undef, undef, $searcher_opts) or return $results; + + $searcher->add_query( + -string => $querystring, + -lowercase => 1, + -tokenize => 1, + -stem => 1, + -required => 1, + -fields => { # ??? adjust weights? + map { ( $_ => 1 ) } $self->_field_list('content') + } + ); + + +# if (length $terms->{points_min}) { # ??? +# # no need to bother with adding this to the query, since it's all comments +# if ($terms->{points_min} == $constants->{comment_minscore}) { +# delete $terms->{points_min}; +# } else { # ($terms{points_min} != $constants->{comment_maxscore}) { + delete $terms->{points_min}; +# } +# } + + for my $key (keys %$terms) { + next if $key eq 'query' || ! 
length($terms->{$key}); + + $searcher->add_query( + -string => $terms->{$key}, + -required => 1, + -fields => $key, + ); + } + +#use Data::Dumper; +#print Dumper $searcher; + +slashProf('search', 'init search'); + my $status = $searcher->process || {}; + + $sopts->{total} = $status->{num_docs}; + $sopts->{matches} = $status->{num_hits}; + +slashProf('fetch results', 'search'); + + while (my $obj = $searcher->fetch_result_hashref) { + my %data = ( + score => $obj->{score}, + $self->_primary => $obj->{doc_id}, + excerpt => $obj->{excerpt}, + ); + + push @$records, \%data; + } + +slashProf('', 'fetch results'); + +use Data::Dumper; +print Dumper $records; + + return 1; +} + +################################################################# +sub _addRecords { + my($self, $type, $documents, $opts) = @_; + + my $writer = $opts->{writer} || $self->_writer; + + if (!$writer->{_is_old}) { # ??? + for my $field (keys %{$documents->[0]}) { + $writer->define_field( + -name => $field, + # only store the main content field, for excerpting + -store => $self->_field_list('content')->[0] eq $field + ); + } + } + + my $count = 0; + my @docs; + for my $document (@$documents) { + my %doc; + # start new document by *id + $writer->new_document($document->{ $self->_primary }); + +#printf "%d:%s\n", $document->{ $self->_primary }, $document->{date}; + + # timestamp is Unix epoch + if ($document->{date}) { + $writer->set_document_timestamp(timeCalc(delete $document->{date}, "%s", 0)); + } + + for my $key (keys %$document) { + next unless length $document->{$key}; + next if $key eq $self->_primary; + + $writer->set_field($key => $document->{$key}); + + my $is_text = $self->_field_exists(text => $key); + my $is_content = $self->_field_exists(content => $key); + + if ($is_text || $is_content) { + $writer->lc_field($key) if $is_content; + $writer->tokenize_field($key); + $writer->stem_field($key) if $is_content; + } +#printf "%s:%s\n", $key, $document->{$key}; + } + + $writer->add_document; 
+#printf "%d\n\n", $count; + $count++; + } + + $writer->finish unless $opts->{writer}; + +# # only optimize if requested (as usual), and changes were made +# $self->optimize($type) if $opts->{optimize} && $count; + + return $count; +} + +################################################################# +# Plucene-specific helper methods +sub isIndexed { # ??? + my($self, $type, $id, $opts) = @_; + + return unless $self->_handled($type); + + my $preader = ($opts->{_reader} || $self->_reader) or return; + + my $found = $preader->doc_is_indexed($id); + +# $preader->close unless $opts->{_reader}; + + return $found || 0; +} + +################################################################# +sub optimize { # ??? + my($self, $type) = @_; + + return unless $self->_handled($type); + +slashProf('optimize'); + +slashProf('', 'optimize'); + + return 1; +} + +################################################################# +sub merge { # ??? + my($self, $type, $dirs, $opts) = @_; + + return unless $self->_handled($type); + +slashProf('merge'); + + my @alldirs; + for (@$dirs) { + push @alldirs, $self->_dir($type => $_); + } + my $dir = $self->_dir($type => $opts->{dir}); + ## backup $dir? + +slashProf('', 'merge'); + + return scalar @alldirs; +} + +################################################################# +sub deleteRecords { # ??? 
+ my($self, $type, $ids, $opts) = @_; + + return unless $self->_handled($type); + +slashProf('deleteRecords'); + + my $preader = $self->_reader or return; + + $ids = [ $ids ] unless ref $ids; + + my $count = 0; + for my $id (@$ids) { + my($found) = $self->isIndexed($type => $id, { _reader => $preader }); + if ($found) { + $count += $found; + $preader->delete_document($id); + } + } + +# # only optimize if requested (as usual), and changes were made +# $self->optimize($type) if $opts->{optimize} && $count; + +slashProf('', 'deleteRecords'); + + return $count; +} + +################################################################# +sub _searcher { + my($self, $type, $dir, $opts) = @_; + $dir = $self->_dir($type, $dir); + + my $constants = getCurrentStatic(); + $opts ||= {}; + + my $preader = $self->_reader($type) or return undef; + + return Search::Kinosearch::KSearch->new( + -stoplist => {}, + -kindex => $preader, + -any_or_all => 'all', + -sort_by => 'relevance', # relevance, timestamp + -allow_boolean => 0, + -allow_phrases => 0, +# -max_terms => 6, # ??? + -excerpt_length => $constants->{search_text_length}, + %$opts + ); +} + +################################################################# +sub _reader { + my($self, $type, $dir) = @_; + $dir = $self->_dir($type, $dir); + + return undef unless -e catdir($dir, 'kindex'); + + return Search::Kinosearch::Kindexer->new( + -stoplist => {}, + -mode => 'readonly', + -backend => $backend, + -kindexpath => catdir($dir, 'kindex'), + -kinodatapath => catdir($dir, 'kindex', 'kinodata'), + ); +} + +################################################################# +sub _writer { + my($self, $type, $dir) = @_; + $dir = $self->_dir($type, $dir); + + my $mode = -e catdir($dir, 'kindex') ? 
'overwrite' : 'create'; + + my $tmp = catdir($dir, 'ktemp'); + + mkpath($dir, 0, 0775) unless -e $dir; + mkpath($tmp, 0, 0775) unless -e $tmp; + + return Search::Kinosearch::Kindexer->new( + -stoplist => {}, + -mode => $mode, # create, overwrite, update, readonly + -backend => $backend, + -kindexpath => catdir($dir, 'kindex'), + -kinodatapath => catdir($dir, 'kindex', 'kinodata'), + -temp_directory => catdir($dir, 'ktemp'), + -enable_updates => 0, + -phrase_matching => 0, + ); +} + +1; + +__END__ Index: slashjp/plugins/SearchToo/SearchToo/Makefile.PL diff -u /dev/null slashjp/plugins/SearchToo/SearchToo/Makefile.PL:1.1 --- /dev/null Wed Jul 12 20:41:57 2006 +++ slashjp/plugins/SearchToo/SearchToo/Makefile.PL Wed Jul 12 20:41:57 2006 @@ -0,0 +1,11 @@ +use ExtUtils::MakeMaker; +# See lib/ExtUtils/MakeMaker.pm for details of how to influence +# the contents of the Makefile that is written. +WriteMakefile( + 'PM' => { + 'Classic.pm' => '$(INST_LIBDIR)/Slash/SearchToo/Classic.pm', + 'Indexer.pm' => '$(INST_LIBDIR)/Slash/SearchToo/Indexer.pm', + 'Kinosearch.pm' => '$(INST_LIBDIR)/Slash/SearchToo/Kinosearch.pm', + 'Plucene.pm' => '$(INST_LIBDIR)/Slash/SearchToo/Plucene.pm', + }, +); Index: slashjp/plugins/SearchToo/SearchToo/Plucene.pm diff -u /dev/null slashjp/plugins/SearchToo/SearchToo/Plucene.pm:1.1 --- /dev/null Wed Jul 12 20:41:57 2006 +++ slashjp/plugins/SearchToo/SearchToo/Plucene.pm Wed Jul 12 20:41:57 2006 @@ -0,0 +1,385 @@ +package Slash::SearchToo::Plucene; + +# STILL IN PROGRESS NOT READY FOR USE + +use strict; +use File::Spec::Functions; +use Slash::Utility; +use Slash::DB::Utility; +use Slash::SearchToo::Classic; +use vars qw($VERSION); +use base 'Slash::SearchToo::Indexer'; + +use Plucene::Document; +use Plucene::Document::DateSerializer; +use Plucene::Index::Writer; +use Plucene::QueryParser; +use Plucene::Search::HitCollector; +use Plucene::Search::IndexSearcher; +use Plucene::Search::TermQuery; + +($VERSION) = ' $Revision: 1.1 $ ' =~ 
/\$Revision:\s+([^\s]+)/; + +# FRY: I did it! And it's all thanks to the books at my local library. + +our $handled = qr{^(?:comments)$}; + +################################################################# +sub getOps { + my %ops = ( + stories => 1, + comments => 1, + journals => 1, + polls => 1, + users => 1, + submissions => 1, + ); + return \%ops; +} + +################################################################# +sub _findRecords { + my($self, $results, $records, $sopts, $terms, $opts) = @_; + + my $constants = getCurrentStatic(); + +slashProf('init search', 'findRecords setup'); + + my $parser = Plucene::QueryParser->new({ + analyzer => $self->_analyzer, + default => 'content' + }); + + my $querystring = $terms->{query}; + # escape special chars + $querystring =~ s/([-+&|!{}[\]:\\])~*?/\\$1/g; # allowed: ()"^ + # normalize to lower case + $querystring =~ s/\b(?!AND|NOT|OR)(\w+)\b/\L$1/g; + my $newquery; + eval { $newquery = $parser->parse('+(' . $querystring . ')') } or return $results; + + my $filter = 0; + if (length $terms->{points_min}) { + # no need to bother with adding this to the query, since it's all comments + if ($terms->{points_min} == $constants->{comment_minscore}) { + delete $terms->{points_min}; + } else { # ($terms{points_min} != $constants->{comment_maxscore}) { + $filter = Slash::SearchToo::Plucene::Filter->new({ + field => '_points_', + from => _get_sortable_points(delete $terms->{points_min}), + to => _get_sortable_points($constants->{comment_maxscore}), + }); + } + } + + for my $key (keys %$terms) { + next if $key eq 'query' || ! 
length($terms->{$key}); + my $term = Plucene::Index::Term->new({ + field => $key, + text => $terms->{$key} + }) or next; + my $term_query = Plucene::Search::TermQuery->new({ term => $term }) or next; + $newquery->add($term_query, 1); + } +#use Data::Dumper; +#print STDERR Dumper $newquery; +#print STDERR $newquery->to_string, ":$filter\n"; + + my $searcher = $self->_searcher or return $results; +slashProf('search', 'init search'); + my $docs = $searcher->search_top($newquery, $filter, $sopts->{start} + $sopts->{max}); + + $sopts->{total} = $searcher->max_doc; + $sopts->{matches} = $docs->total_hits; + +slashProf('fetch results', 'search'); + my $skip = $sopts->{start}; + for my $obj (sort { $b->{score} <=> $a->{score} } $docs->score_docs) { + if ($skip > 0) { + $skip--; + next; + } + + last if @$records >= $sopts->{max}; + + my($doc, $score) = @{$obj}{qw(doc score)}; + my $docobj = $searcher->doc($doc); + my %data = ( score => $score ); + + for my $field ($docobj->fields) { + my $name = $field->name; + next if $name =~ /^(?:content|id)$/; + $data{$name} = $field->string; + } + + push @$records, \%data; + } + + return 1; +} + +################################################################# +sub _addRecords { + my($self, $type, $documents, $opts) = @_; + + my $writer = $self->_writer; + + my $count = 0; + for my $document (@$documents) { + my $doc = Plucene::Document->new; + + # combine our text fields into one, and then remove them; we + # don't need them stored separately + # normalize to lower case + $document->{content} = lc join ' ', @{$document}{ @{$self->_field_list('content')} }; + delete @{$document}{ @{$self->_field_list('content')} }; + + $document->{_date_} = _get_sortable_date(delete $document->{date}); + $document->{_points_} = _get_sortable_points(delete $document->{points}); + + for my $key (keys %$document) { + next unless length $document->{$key}; + my $field; + + if ($key eq 'content' || $self->_field_exists(text => $key)) { + $field = 
Plucene::Document::Field->Text($key, $document->{$key}); + } else { + $field = Plucene::Document::Field->Keyword($key, $document->{$key}); + } + + $doc->add($field); + } + + $writer->add_document($doc); + $count++; + } + + undef $writer; + + # only optimize if requested (as usual), and changes were made + $self->optimize($type) if $opts->{optimize} && $count; + + return $count; + +} + +################################################################# +# Plucene-specific helper methods +sub isIndexed { + my($self, $type, $id, $opts) = @_; + + return unless $self->_handled($type); + + my $preader = ($opts->{_reader} || $self->_reader) or return; + + my $term = Plucene::Index::Term->new({ + field => $self->_primary, + text => $id + }); + + my $found = $preader->doc_freq($term); + + $preader->close unless $opts->{_reader}; + + return $found ? ($found, $term) : 0; +} + +################################################################# +sub optimize { + my($self, $type) = @_; + + return unless $self->_handled($type); + +slashProf('optimize'); + + my $writer = $self->_writer; + $writer->optimize; + undef $writer; +slashProf('', 'optimize'); + + return 1; +} + +################################################################# +sub merge { + my($self, $type, $dirs, $opts) = @_; + + return unless $self->_handled($type); + +slashProf('merge'); + + my @alldirs; + for (@$dirs) { + push @alldirs, $self->_dir($type => $_); + } + my $dir = $self->_dir($type => $opts->{dir}); + ## backup $dir? 
+ + if (@alldirs) { + my $writer = $self->_writer; + $writer->add_indexes(@alldirs); + } + +slashProf('', 'merge'); + + return scalar @alldirs; +} + +################################################################# +sub deleteRecords { + my($self, $type, $ids, $opts) = @_; + + return unless $self->_handled($type); + +slashProf('deleteRecords'); + + my $preader = $self->_reader or return; + + $ids = [ $ids ] unless ref $ids; + + my $count = 0; + for my $id (@$ids) { + my($found, $term) = $self->isIndexed($type => $id, { _reader => $preader }); + if ($found) { + $count += $found; + $preader->delete_term($term); + } + } + + $preader->close; + + # only optimize if requested (as usual), and changes were made + $self->optimize($type) if $opts->{optimize} && $count; + +slashProf('', 'deleteRecords'); + + return $count; +} + +################################################################# +# make it easier to sort by serializing the date +sub _get_sortable_date { + my($time, $format) = @_; + $format ||= '%Y-%m-%d %H:%M:%S'; + return freeze_date(Time::Piece->strptime($time, $format)); +} + +################################################################# +# make it easier to sort by converting to alphabet +sub _get_sortable_points { + my($points) = @_; + + my $constants = getCurrentStatic(); + my $min = $constants->{comment_minscore}; + my $max = $constants->{comment_maxscore}; + + $points = $points < $min + ? $min + : $points > $max + ? $max + : $points; + + my $start = $min; + my $finish = 'a'; + until ($start == $points) { + $start++; + $finish++; + } + + return $finish; +} + +################################################################# +sub _searcher { + my($self, $type, $dir) = @_; + $type = $self->_type($type); + $dir = $self->_dir($type, $dir); + + return $self->{_searcher}{$type}{$dir} if $self->{_searcher}{$type}{$dir}; + + return -e $dir + ? 
($self->{_searcher}{$type}{$dir} = Plucene::Search::IndexSearcher->new($dir)) + : undef; +} + +################################################################# +sub _reader { + my($self, $type, $dir) = @_; + $type = $self->_type($type); + $dir = $self->_dir($type, $dir); + + return $self->{_reader}{$type}{$dir} if $self->{_reader}{$type}{$dir}; + + return -e $dir + ? ($self->{_reader}{$type}{$dir} = Plucene::Index::Reader->open($dir)) + : undef; +} + +################################################################# +sub _writer { + my($self, $type, $dir) = @_; + $type = $self->_type($type); + $dir = $self->_dir($type, $dir); + + return $self->{_writer}{$type}{$dir} if $self->{_writer}{$type}{$dir}; + + return $self->{_writer}{$type}{$dir} = Plucene::Index::Writer->new( + $dir, $self->_analyzer, + -e catfile($dir, 'segments') ? 0 : 1 + ); +} + +################################################################# +sub close_searcher { + my($self, $type, $dir) = @_; + $type = $self->_type($type); + $dir = $self->_dir($type, $dir); + + my $searcher = delete $self->{_searcher}{$type}{$dir} or return; + $searcher->close; +} + +################################################################# +sub close_reader { + my($self, $type, $dir) = @_; + $type = $self->_type($type); + $dir = $self->_dir($type, $dir); + + my $preader = delete $self->{_reader}{$type}{$dir} or return; + $preader->close; +} + +################################################################# +sub close_writer { + my($self, $type, $dir) = @_; + $type = $self->_type($type); + $dir = $self->_dir($type, $dir); + + my $writer = delete $self->{_writer}{$type}{$dir} or return; + undef $writer; +} + +# maybe add our own analyzer ... 
+use Plucene::Analysis::StopAnalyzer;
+
+# Returns a new Plucene::Analysis::StopAnalyzer; this is the analyzer
+# handed to both the query parser and the index writer.
+sub _analyzer {
+    return Plucene::Analysis::StopAnalyzer->new;
+}
+
+#################################################################
+#################################################################
+# Range filter built on Plucene::Search::DateFilter.  The constructor
+# below stores the bounds exactly as they are passed in.
+package Slash::SearchToo::Plucene::Filter;
+use base 'Plucene::Search::DateFilter';
+
+# Constructor.
+#   $args - hashref with the keys field, from and to
+# Returns the new filter object.
+sub new {
+    my($class, $args) = @_;
+
+    my %filter = (
+        field => $args->{field},
+        from  => $args->{from},
+        to    => $args->{to},
+    );
+
+    return bless \%filter, $class;
+}
+
+
+1;
+
+__END__