[Slashdotjp-dev 479] CVS update: slashjp/plugins/SearchToo/SearchToo

Back to archive index

Tatsuki SUGIURA sugi****@users*****
2006年 7月 12日 (水) 20:41:57 JST


Index: slashjp/plugins/SearchToo/SearchToo/Classic.pm
diff -u /dev/null slashjp/plugins/SearchToo/SearchToo/Classic.pm:1.1
--- /dev/null	Wed Jul 12 20:41:57 2006
+++ slashjp/plugins/SearchToo/SearchToo/Classic.pm	Wed Jul 12 20:41:57 2006
@@ -0,0 +1,194 @@
+package Slash::SearchToo::Classic;
+
+use strict;
+use Slash::Utility;
+use Slash::DB::Utility;
+use vars qw($VERSION);
+use base 'Slash::DB::Utility';
+use base 'Slash::SearchToo';
+
+($VERSION) = ' $Revision: 1.1 $ ' =~ /\$Revision:\s+([^\s]+)/;
+
+# FRY: I did it!  And it's all thanks to the books at my local library.
+
+#################################################################
+sub new {
+	my($class, $user) = @_;
+	my $self = {};
+
+	my $plugin = getCurrentStatic('plugin');
+	return unless $plugin->{'Search'};
+
+	bless($self, $class);
+	$self->{virtual_user} = $user;
+	$self->sqlConnect();
+
+	return $self;
+}
+
+#################################################################
+sub getOps {
+	my %ops = (
+		stories		=> 1,
+		comments	=> 1,
+		journals	=> 1,
+		polls		=> 1,
+		users		=> 1,
+		submissions	=> 1,
+		test		=> \&testSearch,
+	);
+	return \%ops;
+}
+
+#################################################################
+sub findRecords {
+	# Search records of $type through the old Slash::Search API; returns the
+	# $results hashref after it has been handed to $self->prepResults.
+	my($self, $type, $query, $opts) = @_;
+
+	my(%processed);
+	my $results = {};
+	my $records = [];
+
+	my $constants = getCurrentStatic();
+	my $oldsearch = getObject('Slash::Search', { db_type => 'search' });
+
+	### set up common query terms
+	my %terms = (
+		query	=> $query->{query},
+	);
+
+	if ($query->{topic}) {
+		my @topics = ref $query->{topic}
+			? @{$query->{topic}}
+			: $query->{topic};
+		$processed{tid} = $topics[0] if @topics;
+		# API is expecting multiple args in _multi, so we fake it
+		$processed{_multi}{tid} = \@topics if @topics > 1;
+	}
+
+	if ($query->{section}) {
+		my $reader = getObject('Slash::DB', { db_type => 'reader' });
+		# get section name, for most compatibility with this API
+		my $skin = $reader->getSkin($query->{section});
+		$processed{section} = $skin->{name} if $skin && $skin->{name};
+	}
+
+	for (qw(uid author submitter)) {
+		$processed{$_} = $query->{$_} if $query->{$_} && $query->{$_} =~ /^\d+$/;
+	}
+
+	### set up common options
+	# old API cannot tell us total or matches
+	# undef if we cannot find for sure, or if not applicable
+	my $total	= undef;
+	my $matches	= undef;
+	my $start	= $opts->{records_start} || 0;
+	my $max		= $opts->{records_max}   || $constants->{search_default_display};
+	# if we are not getting total number of matches, fetch an extra so we
+	# know if there are more, for pagination purposes
+	$max++ if !defined $matches;
+
+	# sort can be an arrayref, but old API can handle only one; map that one to 1/2/0
+	my $sort  = ref $opts->{sort} ? $opts->{sort}[0] : $opts->{sort};
+	$sort = ($sort eq 'date'      || $sort eq 1) ? 1 :
+		($sort eq 'relevance' || $sort eq 2) ? 2 :
+		0;
+
+### options not used in this backend
+#	date_start => '', date_end => '',	
+
+
+	### dispatch to different queries
+	if ($type eq 'stories') {
+		for (qw(tid _multi section author submitter)) {
+			$terms{$_} = $processed{$_} if $processed{$_};
+		}
+
+		$records = $oldsearch->findStory(\%terms, $start, $max, $sort);
+	}
+
+	elsif ($type eq 'comments') {
+		for (qw(section)) {
+			$terms{$_} = $processed{$_} if $processed{$_};
+		}
+		%terms = (%terms,
+			sid		=> $query->{sid},
+			threshold	=> $query->{points_min},
+		);
+
+		$records = $oldsearch->findComments(\%terms, $start, $max, $sort);
+	}
+
+	elsif ($type eq 'journals') {
+		for (qw(tid uid)) {
+			$terms{$_} = $processed{$_} if $processed{$_};
+		}
+
+		$records = $oldsearch->findJournalEntry(\%terms, $start, $max, $sort);
+	}
+
+	elsif ($type eq 'polls') {
+		for (qw(tid section uid)) {
+			$terms{$_} = $processed{$_} if $processed{$_};
+		}
+
+		$records = $oldsearch->findPollQuestion(\%terms, $start, $max, $sort);
+	}
+
+	elsif ($type eq 'users') {
+		# sigh, why is this ONE method passing info in an additional parameter?
+		$records = $oldsearch->findUsers(\%terms, $start, $max, $sort, $query->{journal_only});
+	}
+
+	elsif ($type eq 'submissions') {
+		for (qw(tid section uid)) {
+			$terms{$_} = $processed{$_} if $processed{$_};
+		}
+		%terms = (%terms,
+			note		=> $query->{note},
+		);
+
+		$records = $oldsearch->findSubmission(\%terms, $start, $max, $sort);
+	}
+
+	$self->prepResults($results, $records, [$total, $matches, $start, $max]);
+	return $results;
+}
+
+#################################################################
+# this is a way of adding extra search thingys; we could call another
+# search method (as defaultSearch calls findRecords), or just make this our
+# search method.
+sub testSearch {
+	my($reader, $constants, $user, $form, $gSkin, $searchDB, $rss, $query, $opts) = @_;
+
+	my $results	= {};
+	my $records	= ['a' .. 'z'];
+	my $total	= 26;
+	my $matches	= 26;
+	my $start	= $opts->{records_start} || 0;
+	my $max		= $opts->{records_max} || 26;
+
+	$records = [ @{$records}[$start .. ($start + $max)] ];
+	$searchDB->prepResults($results, $records, [$total, $matches, $start, $max]);
+
+	my %return;
+	$return{results}   = $results;
+	$return{noresults} = 'No results';
+	$return{template}  = \ <<EOT;
+[% FOREACH letter=results.records %]
+<p>[% letter %]</p>
+[% END %]
+[% PROCESS pagination %]
+<p>
+EOT
+
+	$return{rss} = {} if $rss;
+
+	return \%return;
+}
+
+1;
+
+__END__
Index: slashjp/plugins/SearchToo/SearchToo/Indexer.pm
diff -u /dev/null slashjp/plugins/SearchToo/SearchToo/Indexer.pm:1.1
--- /dev/null	Wed Jul 12 20:41:57 2006
+++ slashjp/plugins/SearchToo/SearchToo/Indexer.pm	Wed Jul 12 20:41:57 2006
@@ -0,0 +1,407 @@
+package Slash::SearchToo::Indexer;
+
+use strict;
+use File::Copy;
+use File::Find;
+use File::Path;
+use File::Spec::Functions;
+use Slash::Utility;
+use Slash::DB::Utility;
+use vars qw($VERSION);
+use base 'Slash::SearchToo';
+use base 'Slash::SearchToo::Classic';
+
+($VERSION) = ' $Revision: 1.1 $ ' =~ /\$Revision:\s+([^\s]+)/;
+
+# FRY: I did it!  And it's all thanks to the books at my local library.
+
+# This is a superclass for various SearchToo engines that do indexing etc.
+
+
+#################################################################
+# fields that will be combined into the content field,
+# for indexing and tokenization; first field is main one to excerpt
+our %content = (
+	comments	=> [qw(comment subject)],
+	stories		=> [qw(introtext bodytext title)],
+);
+
+# additional fields that will be indexed and tokenized
+our %text = (
+	comments	=> [ qw(tids) ],
+	stories		=> [ qw(tids) ],
+);
+
+our %primary = (
+	comments	=> 'cid',
+);
+
+# turn into hashes
+for my $hash (\%text, \%content) {
+	for my $type (keys %$hash) {
+		my $arr = $hash->{$type};
+		$hash->{$type} = { map { ($_ => 1) } @$arr };
+		$hash->{_array}{$type} = $arr;
+	}
+}
+
+#################################################################
+sub new {
+	my($class, $user) = @_;
+	my $plugin = getCurrentStatic('plugin');
+	return unless $plugin->{'SearchToo'};
+
+	my $handled;
+	{ no strict;
+		$handled = ${$class . '::handled'};
+	}
+
+	my $self = {
+		_fields => {
+			content => \%content,
+			text	=> \%text,
+			primary	=> \%primary,
+		},
+	};
+	$self->{_handled} = $handled if $handled;
+
+	bless $self, $class;
+	$self->{virtual_user} = $user;
+	$self->sqlConnect();
+
+	return $self;
+}
+
+#################################################################
+sub findRecords {
+	# Search the index for $type; types this engine does not handle are
+	# delegated to Slash::SearchToo::Classic.  Returns the $results hashref.
+	my($self, $type, $query, $opts) = @_;
+
+	# let Classic handle for now
+	return Slash::SearchToo::Classic::findRecords(@_) unless $self->_handled($type);
+
+slashProfInit();
+slashProf('findRecords setup');
+
+	my $constants = getCurrentStatic();
+
+	my $processed = $self->_fudge_data($query);
+	my $results = {};
+	my $records = [];
+
+	### set up common query terms
+	my $terms = {
+		query	=> $query->{query},
+	};
+
+	### set up common options
+	my $sopts = {};
+	$sopts->{total}   = 0;
+	$sopts->{matches} = 0;
+	$sopts->{start}   = $opts->{records_start} || 0;
+	$sopts->{max}     = $opts->{records_max}   || $constants->{search_default_display};
+
+	# sort can be an arrayref, but stick with one for now
+	## no way to sort by date yet
+	$sopts->{sort} = ref $opts->{sort} ? $opts->{sort}[0] : $opts->{sort};
+	$sopts->{sort} = ($sopts->{sort} eq 'date'	|| $sopts->{sort} eq 1) ? 1 :
+			($sopts->{sort} eq 'relevance'	|| $sopts->{sort} eq 2) ? 2 :
+			0;
+
+	### dispatch to different queries
+	if ($type eq 'comments') {
+		for (qw(section)) {
+			$terms->{$_} = $processed->{$_} if $processed->{$_};
+		}
+		%$terms = (%$terms,
+			sid		=> $query->{sid},
+			points_min	=> $query->{points_min},
+		);
+	}
+
+slashProf('_findRecords', 'findRecords setup');
+	$self->_findRecords($results, $records, $sopts, $terms, $opts);
+slashProf('getRecords', '_findRecords');
+	$self->getRecords($type => $records);
+slashProf('prepResults', 'getRecords');
+	$self->prepResults($results, $records, $sopts);
+slashProf('', 'prepResults');
+
+slashProfEnd();
+
+	return $results;
+
+}
+
+#################################################################
+sub addRecords {
+	# Index one record (hashref) or several (arrayref) of $type; returns the value of _addRecords.
+	my($self, $type, $data, $opts) = @_;
+
+	return unless $self->_handled($type);
+
+slashProfInit();
+slashProf('addRecords setup');
+
+	$data = [ $data ] unless ref $data eq 'ARRAY';
+
+	my @documents;
+
+slashProf('prepare records', 'addRecords setup');
+
+	for my $record (@$data) {
+		next unless keys %$record;
+		my $processed = $self->_fudge_data($record);
+		my %document;
+
+		if ($type eq 'comments') {
+			%document = (
+				cid			=> $record->{cid},
+				date			=> $record->{date},
+				points			=> $record->{points},
+
+				comment			=> $record->{comment},
+				subject			=> $record->{subject},
+				sid			=> $record->{discussion_id},
+				primaryskid		=> $processed->{section},
+				tids			=> join(' ', @{$processed->{topic}}),
+			);
+		}
+
+		push @documents, \%document;
+	}
+
+	# so we can index outside the main dir; _dir takes ($type, $dir)
+	if ($opts->{dir}) {
+		$self->_dir($type, $opts->{dir});
+	}
+
+	# only bother if not adding, i.e., if modifying; if adding we
+	# assume it is new
+	unless ($opts->{add}) {
+		$self->deleteRecords($type => [ map $_->{ $self->{_fields}{primary}{$type} }, @documents ]);
+	}
+
+slashProf('add docs', 'prepare records');
+
+	my $count = $self->_addRecords($type, \@documents, $opts);
+
+slashProf('', 'add docs');
+
+	# clear it out when we're done (an empty $dir makes _dir fall back to its default)
+	if ($opts->{dir}) {
+		$self->_dir($type, '');
+	}
+
+slashProfEnd();
+
+	return $count;
+}
+
+#################################################################
+sub prepRecord {
+	my($self, $type, $data, $opts) = @_;
+
+	return unless $self->_handled($type);
+
+	# default to writer
+	my $db = $opts->{db} || getCurrentDB();
+	my %record;
+
+	$data = { $primary{$type} => $data } unless ref $data;
+
+	# this could possibly be done to get a bunch of comments at once ...
+	if ($type eq 'comments') {
+		my $comment = $db->getComment($data->{cid}) or return {};
+		for (qw(date points cid subject)) {
+			$record{$_} = $comment->{$_};
+		}
+
+		$record{comment} = $data->{comment} || $db->getCommentText($data->{cid});
+
+		my $discussion = $db->getDiscussion($comment->{sid});
+		$record{discussion_id}    = $discussion->{id};
+		$record{section}          = $discussion->{primaryskid};
+		$record{topic}            = $discussion->{stoid}
+			? $db->getStoryTopicsRendered($discussion->{stoid})
+			: $discussion->{topic};
+	}
+
+	return \%record;
+}
+
+#################################################################
+sub getRecords {
+	my($self, $type, $data, $opts) = @_;
+
+	return unless $self->_handled($type);
+
+	# default to the reader DB (key is db_type, as in the other getObject calls)
+	my $db = $opts->{db} || getObject('Slash::DB', { db_type => 'reader' });
+	# the elements of @$data are filled in (or emptied) in place
+
+	if ($type eq 'comments') {
+		for my $datum (@$data) {
+			# just return the whole comment ... why not?
+			my $comment = $db->getComment($datum->{cid});
+			if ($comment) {
+				@{$datum}{keys %$comment} = values %$comment;
+			} else {
+				$datum = {};
+				next;
+			}
+			if ($comment->{sid}) {
+				my $discussion = $db->getDiscussion($comment->{sid});
+				@{$datum}{qw(
+					primaryskid url title
+					author_uid did
+				)} = @{$discussion}{qw(
+					primaryskid url title
+					uid id
+				)};
+			}
+		}
+	}
+}
+
+#################################################################
+# handle delete too?
+sub storeRecords {
+	my($self, $type, $data, $opts) = @_;
+return;
+	return unless $self->_handled($type);
+
+	my $slashdb = getCurrentDB();
+
+	$data = [ $data ] unless ref $data eq 'ARRAY';
+
+	my $count = 0;
+	for my $record (@$data) {
+		next unless $record;
+
+		# deal with multiple instances of same type => id
+		$count++ if $slashdb->sqlInsert('search_index_dump', {
+			type	=> $type,
+			id	=> $record,
+			status	=> $opts->{add} ? 'new' : 'changed',
+		});
+	}
+
+	return $count;
+}
+
+#################################################################
+# move prepared index data to live
+sub moveLive {
+	my($self, $type, $dir) = @_;
+
+	return unless $self->can('_dir') && ($dir || $self->can('_backup_dir'));
+
+	my $backup_dir = $self->_backup_dir($type, $dir);
+	$dir = $self->_dir($type, '');
+
+	my @time = localtime;
+	my $now = sprintf "-%04d%02d%02d-%02d%02d%02d", $time[5]+1900, $time[4]+1, $time[3], $time[2], $time[1], $time[0];
+	$dir =~ s|/+$||; # just in case
+	my $olddir = $dir . $now;
+	my $tmpdir = $dir . '-tmp';
+
+	# copy staging to temp dir
+	_moveFind($backup_dir, $tmpdir);
+	# move live to backup
+	rename($dir, $olddir);
+	# move temp to live
+	rename($tmpdir, $dir);
+
+	# kick old?
+}
+
+#################################################################
+sub _moveFind {
+	my($olddir, $newdir) = @_;	# copy the tree under $olddir into $newdir
+	find(sub {
+		my($old) = $File::Find::name;
+		(my $new = $old) =~ s/^\Q$olddir/$newdir/;	# same path under the new root
+		if (-d $old) {
+			eval {
+				mkpath($new, 0, 0775);
+			};
+			if ($@) {
+				warn "Can't create path $new: $@";
+			}
+		} elsif (-f _) {
+			copy($old, $new) or warn "Can't copy file $new: $!";
+		}
+	}, $olddir);
+}
+
+#################################################################
+sub _field_exists {
+	my($self, $field, $key, $type) = @_;
+	return unless $field;
+	$type = $self->_type($type);
+
+	return $self->{_fields}{$field}{$type}{$key};
+}
+
+#################################################################
+sub _field_list {
+	my($self, $field, $type) = @_;
+	return unless $field;
+	$type = $self->_type($type);
+
+	return $self->{_fields}{$field}{_array}{$type};
+}
+
+#################################################################
+sub _primary {
+	my($self, $type) = @_;
+	$type = $self->_type($type);
+
+	return $self->{_fields}{primary}{$type};
+}
+
+#################################################################
+sub _handled {
+	my($self, $type) = @_;
+	$type = $self->_type($type);
+	return $type =~ $self->{_handled};
+}
+
+#################################################################
+sub _type {
+	my($self, $type) = @_;
+	$self->{_type} = $type if defined $type;
+	return $self->{_type};
+}
+
+#################################################################
+sub _class {
+	my($self) = @_;
+	unless ($self->{_class}) {
+		($self->{_class} = lc ref $self) =~ s/^.+:://;
+	}
+	return $self->{_class};
+}
+
+#################################################################
+sub _dir {
+	my($self, $type, $dir) = @_;
+	$self->{_dir} = $dir if defined $dir;
+	$self->{_dir} ||= catdir(getCurrentStatic('datadir'), 'search_index');
+
+	return catdir($self->{_dir}, $self->_class, $self->_type($type));
+}
+
+#################################################################
+sub _backup_dir {
+	my($self, $type, $dir) = @_;
+	my $backup_dir = $dir || catdir(getCurrentStatic('datadir'), 'search_index_tmp');
+	# NB: going through _dir also repoints $self->{_dir} at this directory
+	return $self->_dir($type, $backup_dir);
+}
+
+1;
+
+__END__
Index: slashjp/plugins/SearchToo/SearchToo/Kinosearch.pm
diff -u /dev/null slashjp/plugins/SearchToo/SearchToo/Kinosearch.pm:1.1
--- /dev/null	Wed Jul 12 20:41:57 2006
+++ slashjp/plugins/SearchToo/SearchToo/Kinosearch.pm	Wed Jul 12 20:41:57 2006
@@ -0,0 +1,330 @@
+package Slash::SearchToo::Kinosearch;
+
+# STILL IN PROGRESS NOT READY FOR USE
+
+use strict;
+use File::Path;
+use File::Spec::Functions;
+use Slash::Utility;
+use Slash::DB::Utility;
+use vars qw($VERSION);
+use base 'Slash::SearchToo::Indexer';
+
+use Search::Kinosearch::KSearch;
+use Search::Kinosearch::Kindexer;
+
+($VERSION) = ' $Revision: 1.1 $ ' =~ /\$Revision:\s+([^\s]+)/;
+
+# FRY: I did it!  And it's all thanks to the books at my local library.
+
+our $handled = qr{^(?:comments)$};
+
+our $backend = 'DB_File';
+
+#################################################################
+sub getOps {
+	my %ops = (
+		stories		=> 1,
+		comments	=> 1,
+		journals	=> 1,
+		polls		=> 1,
+		users		=> 1,
+		submissions	=> 1,
+	);
+	return \%ops;
+}
+
+#################################################################
+sub _findRecords {
+	# Run the query in $terms; pushes hits onto @$records, sets total/matches in $sopts.
+	my($self, $results, $records, $sopts, $terms, $opts) = @_;
+
+	my $constants = getCurrentStatic();
+
+slashProf('init search');
+
+	my $querystring = $terms->{query};
+	# escape special chars
+	# none, allow all special chars
+#	$querystring =~ s/([&^|!{}[\]:\\])~*?/\\$1/g; # allowed: ()"+-
+	# normalize to lower case ???
+	$querystring =~ s/\b(?!AND|NOT|OR)(\w+)\b/\L$1/g;
+
+	$sopts->{max}++;  # until we get matches/num_hits working
+	my $searcher_opts = {
+		-num_results	=> $sopts->{max},
+		-offset		=> $sopts->{start},
+		-excerpt_field	=> $self->_field_list('content')->[0]
+	};
+
+	if ($sopts->{'sort'} == 1) {
+		$searcher_opts->{-sort_by} = 'timestamp';
+	} elsif ($sopts->{'sort'} == 2) {
+		$searcher_opts->{-sort_by} = 'relevance';
+	}
+
+	my $searcher = $self->_searcher(undef, undef, $searcher_opts) or return $results;
+
+	$searcher->add_query(
+		-string    => $querystring,
+		-lowercase => 1,
+		-tokenize  => 1,
+		-stem      => 1,
+		-required  => 1,
+		-fields    => {  # ??? adjust weights?
+			map { ( $_ => 1 ) } @{ $self->_field_list('content') }
+		}
+	);
+
+#	if (length $terms->{points_min}) { # ???
+#		# no need to bother with adding this to the query, since it's all comments
+#		if ($terms->{points_min} == $constants->{comment_minscore}) {
+#			delete $terms->{points_min};
+#		} else { # ($terms{points_min} != $constants->{comment_maxscore}) {
+			delete $terms->{points_min};
+#		}
+#	}
+
+	for my $key (keys %$terms) {
+		next if $key eq 'query' || ! length($terms->{$key});
+
+		$searcher->add_query(
+			-string    => $terms->{$key},
+			-required  => 1,
+			-fields    => $key,
+		);
+	}
+
+#use Data::Dumper;
+#print Dumper $searcher;
+
+slashProf('search', 'init search');
+	my $status = $searcher->process || {};
+
+	$sopts->{total}   = $status->{num_docs};
+	$sopts->{matches} = $status->{num_hits};
+
+slashProf('fetch results', 'search');
+
+	while (my $obj = $searcher->fetch_result_hashref) {
+		my %data = (
+			score           => $obj->{score},
+			$self->_primary => $obj->{doc_id},
+			excerpt		=> $obj->{excerpt},
+		);
+
+		push @$records, \%data;
+	}
+
+slashProf('', 'fetch results');
+
+#use Data::Dumper;
+#print Dumper $records;
+
+	return 1;
+}
+
+#################################################################
+sub _addRecords {
+	my($self, $type, $documents, $opts) = @_;
+
+	my $writer = $opts->{writer} || $self->_writer;
+
+	if (!$writer->{_is_old}) { # ???
+		for my $field (keys %{$documents->[0]}) {
+			$writer->define_field(
+				-name   => $field,
+				# only store the main content field, for excerpting
+				-store  => $self->_field_list('content')->[0] eq $field
+			);
+		}
+	}
+
+	my $count = 0;
+	my @docs;
+	for my $document (@$documents) {
+		my %doc;
+		# start new document by *id
+		$writer->new_document($document->{ $self->_primary });
+
+#printf "%d:%s\n", $document->{ $self->_primary }, $document->{date};
+
+		# timestamp is Unix epoch
+		if ($document->{date}) {
+			$writer->set_document_timestamp(timeCalc(delete $document->{date}, "%s", 0));
+		}
+ 
+		for my $key (keys %$document) {
+			next unless length $document->{$key};
+			next if $key eq $self->_primary;
+
+			$writer->set_field($key => $document->{$key});
+
+			my $is_text    = $self->_field_exists(text    => $key);
+			my $is_content = $self->_field_exists(content => $key);
+
+			if ($is_text || $is_content) {
+				$writer->lc_field($key) if $is_content;
+				$writer->tokenize_field($key);
+				$writer->stem_field($key) if $is_content;
+			}
+#printf "%s:%s\n", $key, $document->{$key};
+		}
+
+		$writer->add_document;
+#printf "%d\n\n", $count;
+		$count++;
+	}
+
+	$writer->finish unless $opts->{writer};
+
+#	# only optimize if requested (as usual), and changes were made
+#	$self->optimize($type) if $opts->{optimize} && $count;
+
+	return $count;
+}
+
+#################################################################
+# Kinosearch-specific helper methods
+sub isIndexed { # ???
+	my($self, $type, $id, $opts) = @_;
+
+	return unless $self->_handled($type);
+
+	my $preader = ($opts->{_reader} || $self->_reader) or return;
+
+	my $found = $preader->doc_is_indexed($id);
+
+#	$preader->close unless $opts->{_reader};
+
+	return $found || 0;
+}
+
+#################################################################
+sub optimize { # ???
+	my($self, $type) = @_;
+
+	return unless $self->_handled($type);
+
+slashProf('optimize');
+
+slashProf('', 'optimize');
+
+	return 1;
+}
+
+#################################################################
+sub merge { # ???
+	my($self, $type, $dirs, $opts) = @_;
+
+	return unless $self->_handled($type);
+
+slashProf('merge');
+
+	my @alldirs;
+	for (@$dirs) {
+		push @alldirs, $self->_dir($type => $_);
+	}
+	my $dir = $self->_dir($type => $opts->{dir});
+	## backup $dir?
+
+slashProf('', 'merge');
+
+	return scalar @alldirs;
+}
+
+#################################################################
+sub deleteRecords { # ???
+	my($self, $type, $ids, $opts) = @_;
+
+	return unless $self->_handled($type);
+
+slashProf('deleteRecords');
+
+	my $preader = $self->_reader or return;
+
+	$ids = [ $ids ] unless ref $ids;
+
+	my $count = 0;
+	for my $id (@$ids) {
+		my($found) = $self->isIndexed($type => $id, { _reader => $preader });
+		if ($found) {
+			$count += $found;
+			$preader->delete_document($id);
+		}
+	}
+
+#	# only optimize if requested (as usual), and changes were made
+#	$self->optimize($type) if $opts->{optimize} && $count;
+
+slashProf('', 'deleteRecords');
+
+	return $count;
+}
+
+#################################################################
+sub _searcher {
+	my($self, $type, $dir, $opts) = @_;
+	$dir = $self->_dir($type, $dir);
+
+	my $constants = getCurrentStatic();
+	$opts ||= {};
+
+	my $preader = $self->_reader($type) or return undef;
+
+	return Search::Kinosearch::KSearch->new(
+		-stoplist		=> {},
+		-kindex			=> $preader,
+		-any_or_all		=> 'all',
+		-sort_by		=> 'relevance', # relevance, timestamp
+		-allow_boolean		=> 0,
+		-allow_phrases		=> 0,
+#		-max_terms		=> 6, # ???
+		-excerpt_length		=> $constants->{search_text_length},
+		%$opts
+	);
+}
+
+#################################################################
+sub _reader {
+	my($self, $type, $dir) = @_;
+	$dir = $self->_dir($type, $dir);
+
+	return undef unless -e catdir($dir, 'kindex');
+
+	return Search::Kinosearch::Kindexer->new(
+		-stoplist		=> {},
+		-mode			=> 'readonly',
+		-backend		=> $backend,
+		-kindexpath		=> catdir($dir, 'kindex'),
+		-kinodatapath		=> catdir($dir, 'kindex', 'kinodata'),
+	);
+}
+
+#################################################################
+sub _writer {
+	my($self, $type, $dir) = @_;
+	$dir = $self->_dir($type, $dir);
+
+	my $mode = -e catdir($dir, 'kindex') ? 'overwrite' : 'create';
+
+	my $tmp = catdir($dir, 'ktemp');
+
+	mkpath($dir, 0, 0775) unless -e $dir;
+	mkpath($tmp, 0, 0775) unless -e $tmp;
+
+	return Search::Kinosearch::Kindexer->new(
+		-stoplist		=> {},
+		-mode			=> $mode, # create, overwrite, update, readonly
+		-backend		=> $backend,
+		-kindexpath		=> catdir($dir, 'kindex'),
+		-kinodatapath		=> catdir($dir, 'kindex', 'kinodata'),
+		-temp_directory		=> catdir($dir, 'ktemp'),
+		-enable_updates		=> 0,
+		-phrase_matching	=> 0,
+	);
+}
+
+1;
+
+__END__
Index: slashjp/plugins/SearchToo/SearchToo/Makefile.PL
diff -u /dev/null slashjp/plugins/SearchToo/SearchToo/Makefile.PL:1.1
--- /dev/null	Wed Jul 12 20:41:57 2006
+++ slashjp/plugins/SearchToo/SearchToo/Makefile.PL	Wed Jul 12 20:41:57 2006
@@ -0,0 +1,11 @@
+use ExtUtils::MakeMaker;
+# See lib/ExtUtils/MakeMaker.pm for details of how to influence
+# the contents of the Makefile that is written.
+WriteMakefile(
+    'PM'	=> {
+    	'Classic.pm'		=> '$(INST_LIBDIR)/Slash/SearchToo/Classic.pm',
+    	'Indexer.pm'		=> '$(INST_LIBDIR)/Slash/SearchToo/Indexer.pm',
+    	'Kinosearch.pm'		=> '$(INST_LIBDIR)/Slash/SearchToo/Kinosearch.pm',
+    	'Plucene.pm'		=> '$(INST_LIBDIR)/Slash/SearchToo/Plucene.pm',
+    },
+);
Index: slashjp/plugins/SearchToo/SearchToo/Plucene.pm
diff -u /dev/null slashjp/plugins/SearchToo/SearchToo/Plucene.pm:1.1
--- /dev/null	Wed Jul 12 20:41:57 2006
+++ slashjp/plugins/SearchToo/SearchToo/Plucene.pm	Wed Jul 12 20:41:57 2006
@@ -0,0 +1,385 @@
+package Slash::SearchToo::Plucene;
+
+# STILL IN PROGRESS NOT READY FOR USE
+
+use strict;
+use File::Spec::Functions;
+use Slash::Utility;
+use Slash::DB::Utility;
+use Slash::SearchToo::Classic;
+use vars qw($VERSION);
+use base 'Slash::SearchToo::Indexer';
+
+use Plucene::Document;
+use Plucene::Document::DateSerializer;
+use Plucene::Index::Writer;
+use Plucene::QueryParser;
+use Plucene::Search::HitCollector;
+use Plucene::Search::IndexSearcher;
+use Plucene::Search::TermQuery;
+
+($VERSION) = ' $Revision: 1.1 $ ' =~ /\$Revision:\s+([^\s]+)/;
+
+# FRY: I did it!  And it's all thanks to the books at my local library.
+
+our $handled = qr{^(?:comments)$};
+
+#################################################################
+sub getOps {
+	my %ops = (
+		stories		=> 1,
+		comments	=> 1,
+		journals	=> 1,
+		polls		=> 1,
+		users		=> 1,
+		submissions	=> 1,
+	);
+	return \%ops;
+}
+
+#################################################################
+sub _findRecords {
+	my($self, $results, $records, $sopts, $terms, $opts) = @_;
+
+	my $constants = getCurrentStatic();
+
+slashProf('init search', 'findRecords setup');
+
+	my $parser = Plucene::QueryParser->new({
+		analyzer => $self->_analyzer,
+		default  => 'content'
+	});
+
+	my $querystring = $terms->{query};
+	# escape special chars
+	$querystring =~ s/([-+&|!{}[\]:\\])~*?/\\$1/g; # allowed: ()"^
+	# normalize to lower case
+	$querystring =~ s/\b(?!AND|NOT|OR)(\w+)\b/\L$1/g;
+	my $newquery;
+	eval { $newquery = $parser->parse('+(' . $querystring . ')') } or return $results;
+
+	my $filter = 0;
+	if (length $terms->{points_min}) {
+		# no need to bother with adding this to the query, since it's all comments
+		if ($terms->{points_min} == $constants->{comment_minscore}) {
+			delete $terms->{points_min};
+		} else { # ($terms{points_min} != $constants->{comment_maxscore}) {
+			$filter = Slash::SearchToo::Plucene::Filter->new({
+				field => '_points_',
+				from  => _get_sortable_points(delete $terms->{points_min}),
+				to    => _get_sortable_points($constants->{comment_maxscore}),
+			});
+		}
+	}
+
+	for my $key (keys %$terms) {
+		next if $key eq 'query' || ! length($terms->{$key});
+		my $term = Plucene::Index::Term->new({
+			field	=> $key,
+			text	=> $terms->{$key}
+		}) or next;
+		my $term_query = Plucene::Search::TermQuery->new({ term => $term }) or next;
+		$newquery->add($term_query, 1);
+	}
+#use Data::Dumper;
+#print STDERR Dumper $newquery;
+#print STDERR $newquery->to_string, ":$filter\n";
+
+	my $searcher = $self->_searcher or return $results;
+slashProf('search', 'init search');
+	my $docs = $searcher->search_top($newquery, $filter, $sopts->{start} + $sopts->{max});
+
+	$sopts->{total}   = $searcher->max_doc;
+	$sopts->{matches} = $docs->total_hits;
+
+slashProf('fetch results', 'search');
+	my $skip = $sopts->{start};
+	for my $obj (sort { $b->{score} <=> $a->{score} } $docs->score_docs) {
+		if ($skip > 0) {
+			$skip--;
+			next;
+		}
+
+		last if @$records >= $sopts->{max};
+
+		my($doc, $score) = @{$obj}{qw(doc score)};
+		my $docobj = $searcher->doc($doc);
+		my %data   = ( score => $score );
+
+		for my $field ($docobj->fields) {
+			my $name = $field->name;
+			next if $name =~ /^(?:content|id)$/;
+			$data{$name} = $field->string;
+		}
+
+		push @$records, \%data;
+	}
+
+	return 1;
+}
+
+#################################################################
+sub _addRecords {
+	my($self, $type, $documents, $opts) = @_;
+
+	my $writer = $self->_writer;
+
+	my $count = 0;
+	for my $document (@$documents) {
+		my $doc = Plucene::Document->new;
+
+		# combine our text fields into one, and then remove them; we
+		# don't need them stored separately
+		# normalize to lower case
+		$document->{content} = lc join ' ', @{$document}{ @{$self->_field_list('content')} };
+		delete @{$document}{ @{$self->_field_list('content')} };
+
+		$document->{_date_}   = _get_sortable_date(delete $document->{date});
+		$document->{_points_} = _get_sortable_points(delete $document->{points});
+
+		for my $key (keys %$document) {
+			next unless length $document->{$key};
+			my $field;
+
+			if ($key eq 'content' || $self->_field_exists(text => $key)) {
+				$field = Plucene::Document::Field->Text($key, $document->{$key});
+			} else {
+				$field = Plucene::Document::Field->Keyword($key, $document->{$key});
+			}
+
+			$doc->add($field);
+		}
+
+		$writer->add_document($doc);
+		$count++;
+	}
+
+	undef $writer;
+
+	# only optimize if requested (as usual), and changes were made
+	$self->optimize($type) if $opts->{optimize} && $count;
+
+	return $count;
+
+}
+
+#################################################################
+# Plucene-specific helper methods
+sub isIndexed {
+	my($self, $type, $id, $opts) = @_;
+
+	return unless $self->_handled($type);
+
+	my $preader = ($opts->{_reader} || $self->_reader) or return;
+
+	my $term = Plucene::Index::Term->new({
+		field	=> $self->_primary,
+		text	=> $id
+	});
+
+	my $found = $preader->doc_freq($term);
+
+	$preader->close unless $opts->{_reader};
+
+	return $found ? ($found, $term) : 0;
+}
+
+#################################################################
+sub optimize {
+	my($self, $type) = @_;
+
+	return unless $self->_handled($type);
+
+slashProf('optimize');
+
+	my $writer = $self->_writer;
+	$writer->optimize;
+	undef $writer;
+slashProf('', 'optimize');
+
+	return 1;
+}
+
+#################################################################
+sub merge {
+	my($self, $type, $dirs, $opts) = @_;
+
+	return unless $self->_handled($type);
+
+slashProf('merge');
+
+	my @alldirs;
+	for (@$dirs) {
+		push @alldirs, $self->_dir($type => $_);
+	}
+	my $dir = $self->_dir($type => $opts->{dir});
+	## backup $dir?
+
+	if (@alldirs) {
+		my $writer = $self->_writer;
+		$writer->add_indexes(@alldirs);
+	}
+
+slashProf('', 'merge');
+
+	return scalar @alldirs;
+}
+
+#################################################################
+sub deleteRecords {
+	my($self, $type, $ids, $opts) = @_;
+
+	return unless $self->_handled($type);
+
+slashProf('deleteRecords');
+
+	my $preader = $self->_reader or return;
+
+	$ids = [ $ids ] unless ref $ids;
+
+	my $count = 0;
+	for my $id (@$ids) {
+		my($found, $term) = $self->isIndexed($type => $id, { _reader => $preader });
+		if ($found) {
+			$count += $found;
+			$preader->delete_term($term);
+		}
+	}
+
+	$preader->close;
+
+	# only optimize if requested (as usual), and changes were made
+	$self->optimize($type) if $opts->{optimize} && $count;
+
+slashProf('', 'deleteRecords');
+
+	return $count;
+}
+
+#################################################################
+# make it easier to sort by serializing the date
+sub _get_sortable_date {
+	my($time, $format) = @_;
+	$format ||= '%Y-%m-%d %H:%M:%S';
+	return freeze_date(Time::Piece->strptime($time, $format));
+}
+
+#################################################################
+# make it easier to sort by converting to alphabet
+sub _get_sortable_points {
+	my($points) = @_;
+
+	my $constants = getCurrentStatic();
+	my $min = $constants->{comment_minscore};
+	my $max = $constants->{comment_maxscore};
+
+	$points = $points < $min
+		? $min
+		: $points > $max
+			? $max
+			: $points;
+
+	# a counted loop always terminates; the old "until ($start == $points)"
+	# form spun forever when $points was not a whole number
+	my $finish = 'a';
+	for (1 .. $points - $min) {
+		$finish++;
+	}
+
+	return $finish;
+}
+
+#################################################################
+sub _searcher {
+	my($self, $type, $dir) = @_;
+	$type = $self->_type($type);
+	$dir = $self->_dir($type, $dir);
+
+	return $self->{_searcher}{$type}{$dir} if $self->{_searcher}{$type}{$dir};
+
+	return -e $dir
+		? ($self->{_searcher}{$type}{$dir} = Plucene::Search::IndexSearcher->new($dir))
+		: undef;
+}
+
+#################################################################
+sub _reader {
+	my($self, $type, $dir) = @_;
+	$type = $self->_type($type);
+	$dir = $self->_dir($type, $dir);
+
+	return $self->{_reader}{$type}{$dir} if $self->{_reader}{$type}{$dir};
+
+	return -e $dir
+		? ($self->{_reader}{$type}{$dir} = Plucene::Index::Reader->open($dir))
+		: undef;
+}
+
+#################################################################
+sub _writer {
+	my($self, $type, $dir) = @_;
+	$type = $self->_type($type);
+	$dir = $self->_dir($type, $dir);
+
+	return $self->{_writer}{$type}{$dir} if $self->{_writer}{$type}{$dir};
+
+	return $self->{_writer}{$type}{$dir} = Plucene::Index::Writer->new(
+		$dir, $self->_analyzer,
+		-e catfile($dir, 'segments') ? 0 : 1
+	);
+}
+
+#################################################################
+sub close_searcher {
+	my($self, $type, $dir) = @_;
+	$type = $self->_type($type);
+	$dir = $self->_dir($type, $dir);
+
+	my $searcher = delete $self->{_searcher}{$type}{$dir} or return;
+	$searcher->close;
+}
+
+#################################################################
+sub close_reader {
+	my($self, $type, $dir) = @_;
+	$type = $self->_type($type);
+	$dir = $self->_dir($type, $dir);
+
+	my $preader = delete $self->{_reader}{$type}{$dir} or return;
+	$preader->close;
+}
+
+#################################################################
+sub close_writer {
+	my($self, $type, $dir) = @_;
+	$type = $self->_type($type);
+	$dir = $self->_dir($type, $dir);
+
+	my $writer = delete $self->{_writer}{$type}{$dir} or return;
+	undef $writer;
+}
+
+# maybe add our own analyzer ...
+use Plucene::Analysis::StopAnalyzer;
+sub _analyzer {
+	return Plucene::Analysis::StopAnalyzer->new;
+}
+
+#################################################################
+#################################################################
+package Slash::SearchToo::Plucene::Filter;
+use base 'Plucene::Search::DateFilter';
+
+sub new {
+	my($self, $args) = @_;
+	bless {
+		field => $args->{field},
+		from  => $args->{from},
+		to    => $args->{to},
+	}, $self;
+}
+
+
+1;
+
+__END__


Slashdotjp-dev メーリングリストの案内
Back to archive index