Kouhei Sutou
null+****@clear*****
Thu Feb 16 18:09:51 JST 2017
Kouhei Sutou 2017-02-16 18:09:51 +0900 (Thu, 16 Feb 2017) New Revision: fa629aa37a3d1a6270660490a0736dd6cbc42b81 https://github.com/groonga/groonga/commit/fa629aa37a3d1a6270660490a0736dd6cbc42b81 Message: mrb: support order by estimated size optimization It's disabled by default. You can enable it by GRN_ORDER_BY_ESTIMATED_SIZE_ENABLE=yes environment variable. Added files: lib/mrb/scripts/scan_info_data_size_estimator.rb Modified files: lib/ctx_impl_mrb.c lib/grn_mrb.h lib/mrb.c lib/mrb/scripts/expression.rb lib/mrb/scripts/expression_size_estimator.rb lib/mrb/scripts/expression_tree/function_call.rb lib/mrb/scripts/scan_info_builder.rb lib/mrb/scripts/sources.am test/mruby/run-test.rb test/mruby/suite/query_optimizer/index/test_match.rb Modified: lib/ctx_impl_mrb.c (+7 -1) =================================================================== --- lib/ctx_impl_mrb.c 2017-02-16 17:35:56 +0900 (c8929fd) +++ lib/ctx_impl_mrb.c 2017-02-16 18:09:51 +0900 (d442f46) @@ -1,6 +1,6 @@ /* -*- c-basic-offset: 2 -*- */ /* - Copyright(C) 2013-2016 Brazil + Copyright(C) 2013-2017 Brazil This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public @@ -196,6 +196,12 @@ grn_ctx_impl_mrb_init_bindings(grn_ctx *ctx) mrb->ud = ctx; ctx->impl->mrb.module = mrb_define_module(mrb, "Groonga"); + mrb_define_const(mrb, + ctx->impl->mrb.module, + "ORDER_BY_ESTIMATED_SIZE", + grn_mrb_is_order_by_estimated_size_enabled() ? + mrb_true_value() : + mrb_false_value()); mrb_define_class_method(mrb, ctx->impl->mrb.module, "init", mrb_groonga_init, MRB_ARGS_NONE()); mrb_funcall(mrb, mrb_obj_value(ctx->impl->mrb.module), "init", 0); Modified: lib/grn_mrb.h (+2 -1) =================================================================== --- lib/grn_mrb.h 2017-02-16 17:35:56 +0900 (832aa94) +++ lib/grn_mrb.h 2017-02-16 18:09:51 +0900 (2fd00b3) @@ -1,6 +1,6 @@ /* -*- c-basic-offset: 2 -*- */ /* - Copyright(C) 2013-2016 Brazil + Copyright(C) 2013-2017 Brazil This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public @@ -34,6 +34,7 @@ void grn_mrb_init_from_env(void); #ifdef GRN_WITH_MRUBY GRN_API mrb_value grn_mrb_load(grn_ctx *ctx, const char *path); GRN_API const char *grn_mrb_get_system_ruby_scripts_dir(grn_ctx *ctx); +grn_bool grn_mrb_is_order_by_estimated_size_enabled(void); #endif #ifdef __cplusplus Modified: lib/mrb.c (+19 -1) =================================================================== --- lib/mrb.c 2017-02-16 17:35:56 +0900 (931c94f) +++ lib/mrb.c 2017-02-16 18:09:51 +0900 (09704a3) @@ -1,6 +1,6 @@ /* -*- c-basic-offset: 2 -*- */ /* - Copyright(C) 2013-2015 Brazil + Copyright(C) 2013-2017 Brazil This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public @@ -38,6 +38,7 @@ #define E_LOAD_ERROR (mrb_class_get(mrb, "LoadError")) static char grn_mrb_ruby_scripts_dir[GRN_ENV_BUFFER_SIZE]; +static grn_bool grn_mrb_order_by_estimated_size_enable = GRN_FALSE; void grn_mrb_init_from_env(void) @@ -45,6 +46,23 @@ grn_mrb_init_from_env(void) grn_getenv("GRN_RUBY_SCRIPTS_DIR", grn_mrb_ruby_scripts_dir, GRN_ENV_BUFFER_SIZE); + { + char grn_order_by_estimated_size_enable_env[GRN_ENV_BUFFER_SIZE]; + grn_getenv("GRN_ORDER_BY_ESTIMATED_SIZE_ENABLE", + grn_order_by_estimated_size_enable_env, + GRN_ENV_BUFFER_SIZE); + if (strcmp(grn_order_by_estimated_size_enable_env, "yes") == 0) { + grn_mrb_order_by_estimated_size_enable = GRN_TRUE; + } else { + grn_mrb_order_by_estimated_size_enable = GRN_FALSE; + } + } +} + +grn_bool +grn_mrb_is_order_by_estimated_size_enabled(void) +{ + return grn_mrb_order_by_estimated_size_enable; } #ifdef GRN_WITH_MRUBY Modified: lib/mrb/scripts/expression.rb (+1 -0) =================================================================== --- lib/mrb/scripts/expression.rb 2017-02-16 17:35:56 +0900 (cd40160) +++ lib/mrb/scripts/expression.rb 2017-02-16 18:09:51 +0900 (27dadcb) @@ -6,6 +6,7 @@ require "expression_tree_builder" require "scan_info" require "scan_info_builder" +require "scan_info_data_size_estimator" require "expression_size_estimator" module Groonga Modified: lib/mrb/scripts/expression_size_estimator.rb (+2 -136) =================================================================== --- lib/mrb/scripts/expression_size_estimator.rb 2017-02-16 17:35:56 +0900 (13806b3) +++ lib/mrb/scripts/expression_size_estimator.rb 2017-02-16 18:09:51 +0900 (597b56f) @@ -31,7 +31,8 @@ module Groonga current_size = 0 end - size = estimate_data(data) + estimator = ScanInfoDataSizeEstimator.new(data, @table) + size = estimator.estimate case data.logical_op when Operator::AND current_size = size if size < current_size @@ -49,140 +50,5 @@ module Groonga end current_size end - - private - def estimate_data(data) - search_index = data.search_indexes.first - return @table_size if search_index.nil? - - index_column = resolve_index_column(search_index.index_column, - data.op) - return @table_size if index_column.nil? - - size = nil - case data.op - when Operator::MATCH, - Operator::FUZZY - size = estimate_match(data, index_column) - when Operator::REGEXP - size = estimate_regexp(data, index_column) - when Operator::EQUAL - size = estimate_equal(data, index_column) - when Operator::LESS, - Operator::LESS_EQUAL, - Operator::GREATER, - Operator::GREATER_EQUAL - size = estimate_range(data, index_column) - when Operator::PREFIX - size = estimate_prefix(data, index_column) - when Operator::CALL - size = estimate_call(data, index_column) - end - size || @table_size - end - - def resolve_index_column(index_column, operator) - while index_column.is_a?(Accessor) - index_info = index_column.find_index(operator) - return nil if index_info.nil? - index_column = index_info.index - end - - index_column - end - - def sampling_cursor_limit(n_terms) - limit = n_terms * 0.01 - if limit < 10 - 10 - elsif limit > 1000 - 1000 - else - limit.to_i - end - end - - def estimate_match(data, index_column) - index_column.estimate_size(:query => data.query.value) - end - - def estimate_regexp(data, index_column) - index_column.estimate_size(:query => data.query.value, - :mode => data.op) - end - - def estimate_equal(data, index_column) - lexicon = index_column.lexicon - query = data.query - if query.domain == lexicon.id - term_id = query.value - else - term_id = lexicon[query] - end - return 0 if term_id.nil? - - index_column.estimate_size(:term_id => term_id) - end - - def estimate_range(data, index_column) - lexicon = index_column.lexicon - n_terms = lexicon.size - return 0 if n_terms.zero? - - value = data.query.value - options = { - :limit => sampling_cursor_limit(n_terms), - } - case data.op - when Operator::LESS - options[:max] = value - options[:flags] = TableCursorFlags::LT - when Operator::LESS_EQUAL - options[:max] = value - options[:flags] = TableCursorFlags::LE - when Operator::GREATER - options[:min] = value - options[:flags] = TableCursorFlags::GT - when Operator::GREATER_EQUAL - options[:min] = value - options[:flags] = TableCursorFlags::GE - end - TableCursor.open(lexicon, options) do |cursor| - size = index_column.estimate_size(:lexicon_cursor => cursor) - size += 1 if cursor.next != ID::NIL - size - end - end - - def estimate_prefix(data, index_column) - lexicon = index_column.lexicon - n_terms = lexicon.size - return 0 if n_terms.zero? - - value = data.query.value - options = { - :min => value, - :limit => sampling_cursor_limit(n_terms), - :flags => TableCursorFlags::PREFIX, - } - TableCursor.open(lexicon, options) do |cursor| - size = index_column.estimate_size(:lexicon_cursor => cursor) - size += 1 if cursor.next != ID::NIL - size - end - end - - def estimate_call(data, index_column) - procedure = data.args[0] - arguments = data.args[1..-1].collect do |arg| - if arg.is_a?(::Groonga::Object) - ExpressionTree::Variable.new(arg) - else - ExpressionTree::Constant.new(arg) - end - end - node = ExpressionTree::FunctionCall.new(procedure, arguments) - node.estimate_size(@table) - end end end Modified: lib/mrb/scripts/expression_tree/function_call.rb (+12 -1) =================================================================== --- lib/mrb/scripts/expression_tree/function_call.rb 2017-02-16 17:35:56 +0900 (44c6213) +++ lib/mrb/scripts/expression_tree/function_call.rb 2017-02-16 18:09:51 +0900 (dd61480) @@ -20,10 +20,21 @@ module Groonga return table.size unles****@proce***** == "between" column, min, min_border, max, max_border = @arguments + index_info = column.column.find_index(Operator::CALL) return table.size if index_info.nil? - index_column = index_info.index + while index_column.is_a?(Groonga::Accessor) + if index_column.have_next? + index_column = index_column.next + else + index_column = index_column.object + index_info = index_column.find_index(Operator::CALL) + return table.size if index_info.nil? + index_column = index_info.index + end + end + lexicon = index_column.lexicon options = { :min => min.value, Modified: lib/mrb/scripts/scan_info_builder.rb (+43 -7) =================================================================== --- lib/mrb/scripts/scan_info_builder.rb 2017-02-16 17:35:56 +0900 (8b46054) +++ lib/mrb/scripts/scan_info_builder.rb 2017-02-16 18:09:51 +0900 (d5a72bc) @@ -7,6 +7,8 @@ module Groonga @expression = expression @operator = operator @record_exist = record_exist + @variable = @expression[0] + @table = Context.instance[@variable.domain] end RELATION_OPERATORS = [ @@ -55,7 +57,6 @@ module Groonga context = BuildContext.new(@expression) codes = context.codes - variable = context.variable while context.have_next? code = context.code code_op = context.code_op @@ -85,7 +86,7 @@ module Groonga when Operator::PUSH context.data ||= ScanInfoData.new(i) data = context.data - if code.value == variable + if code.value == @variable context.status = :var else data.args << code.value @@ -178,7 +179,6 @@ module Groonga n_relation_expressions = 0 n_logical_expressions = 0 status = :start - variable = @expression[0] codes =****@expre***** codes.each_with_index do |code, i| case code.op @@ -213,7 +213,7 @@ module Groonga n_relation_expressions += 1 status = :start else - if code.value == variable + if code.value == @variable status = :var else status = :const @@ -404,7 +404,45 @@ module Groonga end optimized_data_list << data end - optimized_data_list + + optimize_by_estimated_size(optimized_data_list) + end + + def optimize_by_estimated_size(data_list) + return data_list unless Groonga::ORDER_BY_ESTIMATED_SIZE + + start_index = nil + data_list.size.times do |i| + data = data_list[i] + if data.logical_op != Operator::AND + if start_index.nil? + start_index = i + else + sort_by_estimated_size!(data_list, start_index...i) + start_index = nil + end + end + end + unless start_index.nil? + sort_by_estimated_size!(data_list, start_index...data_list.size) + end + data_list + end + + def sort_by_estimated_size!(data_list, range) + target_data_list = data_list[range] + return if target_data_list.size < 2 + + start_logical_op = target_data_list.first.logical_op + sorted_data_list = target_data_list.sort_by do |data| + estimator = ScanInfoDataSizeEstimator.new(data, @table) + estimator.estimate + end + sorted_data_list.each do |sorted_data| + sorted_data.logical_op = Operator::AND + end + sorted_data_list.first.logical_op = start_logical_op + data_list[range] = sorted_data_list end def range_operations?(data, next_data) @@ -501,7 +539,6 @@ module Groonga class BuildContext attr_accessor :status - attr_reader :variable attr_reader :codes attr_reader :n_codes attr_reader :i @@ -510,7 +547,6 @@ module Groonga def initialize(expression) @expression = expression @status = :start - @variable = @expression[0] @current_data = nil @codes =****@expre***** @n_codes =****@codes***** Added: lib/mrb/scripts/scan_info_data_size_estimator.rb (+185 -0) 100644 =================================================================== --- /dev/null +++ lib/mrb/scripts/scan_info_data_size_estimator.rb 2017-02-16 18:09:51 +0900 (5d3dc4e) @@ -0,0 +1,185 @@ +module Groonga + class ScanInfoDataSizeEstimator + def initialize(data, table) + @data = data + @table = table + @table_size =****@table***** + end + + def estimate + search_index =****@data*****_indexes.first + return @table_size if search_index.nil? + + index_column = resolve_index_column(search_index.index_column) + return @table_size if index_column.nil? + + size = nil + case****@data***** + when Operator::MATCH, + Operator::FUZZY + size = estimate_match(index_column) + when Operator::REGEXP + size = estimate_regexp(index_column) + when Operator::EQUAL + size = estimate_equal(index_column) + when Operator::LESS, + Operator::LESS_EQUAL, + Operator::GREATER, + Operator::GREATER_EQUAL + size = estimate_range(index_column) + when Operator::PREFIX + size = estimate_prefix(index_column) + when Operator::CALL + size = estimate_call(index_column) + end + size || @table_size + end + + private + def resolve_index_column(index_column) + while index_column.is_a?(Accessor) + index_info = index_column.find_index(@data.op) + return nil if index_info.nil? + break if index_info.index == index_column + index_column = index_info.index + end + + index_column + end + + def sampling_cursor_limit(n_terms) + limit = n_terms * 0.01 + if limit < 10 + 10 + elsif limit > 1000 + 1000 + else + limit.to_i + end + end + + def estimate_match(index_column) + index_column.estimate_size(:query => @data.query.value) + end + + def estimate_regexp(index_column) + index_column.estimate_size(:query => @data.query.value, + :mode => @data.op) + end + + def estimate_equal(index_column) + query =****@data***** + if index_column.is_a?(Accessor) + table = index_column.object + if index_column.name == "_id" + if table.id?(query.value) + 1 + else + 0 + end + else + if table[query.value] + 1 + else + 0 + end + end + else + lexicon = index_column.lexicon + if query.domain == lexicon.id + term_id = query.value + else + term_id = lexicon[query] + end + return 0 if term_id.nil? + + index_column.estimate_size(:term_id => term_id) + end + end + + def estimate_range(index_column) + if index_column.is_a?(Table) + is_table_search = true + lexicon = index_column + elsif index_column.is_a?(Groonga::Accessor) + is_table_search = true + lexicon = index_column.object + else + is_table_search = false + lexicon = index_column.lexicon + end + n_terms = lexicon.size + return 0 if n_terms.zero? + + value =****@data***** + options = { + :limit => sampling_cursor_limit(n_terms), + } + case****@data***** + when Operator::LESS + options[:max] = value + options[:flags] = TableCursorFlags::LT + when Operator::LESS_EQUAL + options[:max] = value + options[:flags] = TableCursorFlags::LE + when Operator::GREATER + options[:min] = value + options[:flags] = TableCursorFlags::GT + when Operator::GREATER_EQUAL + options[:min] = value + options[:flags] = TableCursorFlags::GE + end + TableCursor.open(lexicon, options) do |cursor| + if is_table_search + size = 1 + else + size = index_column.estimate_size(:lexicon_cursor => cursor) + end + size += 1 if cursor.next != ID::NIL + size + end + end + + def estimate_prefix(index_column) + is_table_search = + (index_column.is_a?(Accessor) and + index_column.name == "_key") + if is_table_search + lexicon = index_column.object + else + lexicon = index_column.lexicon + end + n_terms = lexicon.size + return 0 if n_terms.zero? + + value =****@data***** + options = { + :min => value, + :limit => sampling_cursor_limit(n_terms), + :flags => TableCursorFlags::PREFIX, + } + TableCursor.open(lexicon, options) do |cursor| + if is_table_search + size = 1 + else + size = index_column.estimate_size(:lexicon_cursor => cursor) + end + size += 1 if cursor.next != ID::NIL + size + end + end + + def estimate_call(index_column) + procedure =****@data*****[0] + arguments =****@data*****[1..-1].collect do |arg| + if arg.is_a?(::Groonga::Object) + ExpressionTree::Variable.new(arg) + else + ExpressionTree::Constant.new(arg) + end + end + node = ExpressionTree::FunctionCall.new(procedure, arguments) + node.estimate_size(@table) + end + end +end Modified: lib/mrb/scripts/sources.am (+1 -0) =================================================================== --- lib/mrb/scripts/sources.am 2017-02-16 17:35:56 +0900 (718c8d0) +++ lib/mrb/scripts/sources.am 2017-02-16 18:09:51 +0900 (877168c) @@ -28,6 +28,7 @@ RUBY_SCRIPT_FILES = \ scan_info.rb \ scan_info_builder.rb \ scan_info_data.rb \ + scan_info_data_size_estimator.rb \ scan_info_search_index.rb \ table.rb \ table_cursor.rb \ Modified: test/mruby/run-test.rb (+1 -0) =================================================================== --- test/mruby/run-test.rb 2017-02-16 17:35:56 +0900 (60b1cc6) +++ test/mruby/run-test.rb 2017-02-16 18:09:51 +0900 (df59cd3) @@ -53,6 +53,7 @@ end ENV["GRN_PLUGINS_DIR"] = (build_top_dir_path + "plugins").to_s ENV["GRN_RUBY_SCRIPTS_DIR"] = (build_top_dir_path + "lib/mrb/scripts").to_s +ENV["GRN_ORDER_BY_ESTIMATED_SIZE_ENABLE"] = "yes" $LOAD_PATH.unshift((rroonga_dir_path + "ext" + "groonga").to_s) $LOAD_PATH.unshift((rroonga_dir_path + "lib").to_s) Modified: test/mruby/suite/query_optimizer/index/test_match.rb (+59 -0) =================================================================== --- test/mruby/suite/query_optimizer/index/test_match.rb 2017-02-16 17:35:56 +0900 (addc2f1) +++ test/mruby/suite/query_optimizer/index/test_match.rb 2017-02-16 18:09:51 +0900 (32cb8af) @@ -142,4 +142,63 @@ class TestIndexMatch < QueryOptimizerTestCase expr: <0..2> DUMP end + + sub_test_case("optimization by estimate size") do + def setup + super + 10.times do |i| + @logs.add(:message => "Groonga#{i}") + end + 2.times do |i| + @logs.add(:message => "Rroonga#{i}") + end + 100.times do |i| + @logs.add(:message => "Mroonga#{i}") + end + end + + def test_and + filter = "(message @ 'Groonga') && (message @ 'Rroonga')" + assert_equal(<<-DUMP, dump_plan(filter)) +[0] + op: <match> + logical_op: <or> + index: <[#<column:index Terms.Logs_message range:Logs sources:[Logs.message] flags:POSITION>]> + query: <"Rroonga"> + expr: <3..5> +[1] + op: <match> + logical_op: <and> + index: <[#<column:index Terms.Logs_message range:Logs sources:[Logs.message] flags:POSITION>]> + query: <"Groonga"> + expr: <0..2> + DUMP + end + + def test_and_and_not + filter = "(message @ 'Groonga') && " + filter << "(message @ 'Rroonga') &! " + filter << "(message @ 'Mroonga')" + assert_equal(<<-DUMP, dump_plan(filter)) +[0] + op: <match> + logical_op: <or> + index: <[#<column:index Terms.Logs_message range:Logs sources:[Logs.message] flags:POSITION>]> + query: <"Rroonga"> + expr: <3..5> +[1] + op: <match> + logical_op: <and> + index: <[#<column:index Terms.Logs_message range:Logs sources:[Logs.message] flags:POSITION>]> + query: <"Groonga"> + expr: <0..2> +[2] + op: <match> + logical_op: <and_not> + index: <[#<column:index Terms.Logs_message range:Logs sources:[Logs.message] flags:POSITION>]> + query: <"Mroonga"> + expr: <7..9> + DUMP + end + end end -------------- next part -------------- HTML����������������������������...Download