[Groonga-commit] groonga/groonga at fa629aa [master] mrb: support order by estimated size optimization

Back to archive index

Kouhei Sutou null+****@clear*****
Thu Feb 16 18:09:51 JST 2017


Kouhei Sutou	2017-02-16 18:09:51 +0900 (Thu, 16 Feb 2017)

  New Revision: fa629aa37a3d1a6270660490a0736dd6cbc42b81
  https://github.com/groonga/groonga/commit/fa629aa37a3d1a6270660490a0736dd6cbc42b81

  Message:
    mrb: support order by estimated size optimization
    
    It's disabled by default. You can enable it by
    GRN_ORDER_BY_ESTIMATED_SIZE_ENABLE=yes environment variable.

  Added files:
    lib/mrb/scripts/scan_info_data_size_estimator.rb
  Modified files:
    lib/ctx_impl_mrb.c
    lib/grn_mrb.h
    lib/mrb.c
    lib/mrb/scripts/expression.rb
    lib/mrb/scripts/expression_size_estimator.rb
    lib/mrb/scripts/expression_tree/function_call.rb
    lib/mrb/scripts/scan_info_builder.rb
    lib/mrb/scripts/sources.am
    test/mruby/run-test.rb
    test/mruby/suite/query_optimizer/index/test_match.rb

  Modified: lib/ctx_impl_mrb.c (+7 -1)
===================================================================
--- lib/ctx_impl_mrb.c    2017-02-16 17:35:56 +0900 (c8929fd)
+++ lib/ctx_impl_mrb.c    2017-02-16 18:09:51 +0900 (d442f46)
@@ -1,6 +1,6 @@
 /* -*- c-basic-offset: 2 -*- */
 /*
-  Copyright(C) 2013-2016 Brazil
+  Copyright(C) 2013-2017 Brazil
 
   This library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
@@ -196,6 +196,12 @@ grn_ctx_impl_mrb_init_bindings(grn_ctx *ctx)
 
   mrb->ud = ctx;
   ctx->impl->mrb.module = mrb_define_module(mrb, "Groonga");
+  mrb_define_const(mrb,
+                   ctx->impl->mrb.module,
+                   "ORDER_BY_ESTIMATED_SIZE",
+                   grn_mrb_is_order_by_estimated_size_enabled() ?
+                   mrb_true_value() :
+                   mrb_false_value());
   mrb_define_class_method(mrb, ctx->impl->mrb.module,
                           "init", mrb_groonga_init, MRB_ARGS_NONE());
   mrb_funcall(mrb, mrb_obj_value(ctx->impl->mrb.module), "init", 0);

  Modified: lib/grn_mrb.h (+2 -1)
===================================================================
--- lib/grn_mrb.h    2017-02-16 17:35:56 +0900 (832aa94)
+++ lib/grn_mrb.h    2017-02-16 18:09:51 +0900 (2fd00b3)
@@ -1,6 +1,6 @@
 /* -*- c-basic-offset: 2 -*- */
 /*
-  Copyright(C) 2013-2016 Brazil
+  Copyright(C) 2013-2017 Brazil
 
   This library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
@@ -34,6 +34,7 @@ void grn_mrb_init_from_env(void);
 #ifdef GRN_WITH_MRUBY
 GRN_API mrb_value grn_mrb_load(grn_ctx *ctx, const char *path);
 GRN_API const char *grn_mrb_get_system_ruby_scripts_dir(grn_ctx *ctx);
+grn_bool grn_mrb_is_order_by_estimated_size_enabled(void);
 #endif
 
 #ifdef __cplusplus

  Modified: lib/mrb.c (+19 -1)
===================================================================
--- lib/mrb.c    2017-02-16 17:35:56 +0900 (931c94f)
+++ lib/mrb.c    2017-02-16 18:09:51 +0900 (09704a3)
@@ -1,6 +1,6 @@
 /* -*- c-basic-offset: 2 -*- */
 /*
-  Copyright(C) 2013-2015 Brazil
+  Copyright(C) 2013-2017 Brazil
 
   This library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
@@ -38,6 +38,7 @@
 #define E_LOAD_ERROR (mrb_class_get(mrb, "LoadError"))
 
 static char grn_mrb_ruby_scripts_dir[GRN_ENV_BUFFER_SIZE];
+static grn_bool grn_mrb_order_by_estimated_size_enable = GRN_FALSE;
 
 void
 grn_mrb_init_from_env(void)
@@ -45,6 +46,23 @@ grn_mrb_init_from_env(void)
   grn_getenv("GRN_RUBY_SCRIPTS_DIR",
              grn_mrb_ruby_scripts_dir,
              GRN_ENV_BUFFER_SIZE);
+  {
+    char grn_order_by_estimated_size_enable_env[GRN_ENV_BUFFER_SIZE];
+    grn_getenv("GRN_ORDER_BY_ESTIMATED_SIZE_ENABLE",
+               grn_order_by_estimated_size_enable_env,
+               GRN_ENV_BUFFER_SIZE);
+    if (strcmp(grn_order_by_estimated_size_enable_env, "yes") == 0) {
+      grn_mrb_order_by_estimated_size_enable = GRN_TRUE;
+    } else {
+      grn_mrb_order_by_estimated_size_enable = GRN_FALSE;
+    }
+  }
+}
+
+grn_bool
+grn_mrb_is_order_by_estimated_size_enabled(void)
+{
+  return grn_mrb_order_by_estimated_size_enable;
 }
 
 #ifdef GRN_WITH_MRUBY

  Modified: lib/mrb/scripts/expression.rb (+1 -0)
===================================================================
--- lib/mrb/scripts/expression.rb    2017-02-16 17:35:56 +0900 (cd40160)
+++ lib/mrb/scripts/expression.rb    2017-02-16 18:09:51 +0900 (27dadcb)
@@ -6,6 +6,7 @@ require "expression_tree_builder"
 require "scan_info"
 require "scan_info_builder"
 
+require "scan_info_data_size_estimator"
 require "expression_size_estimator"
 
 module Groonga

  Modified: lib/mrb/scripts/expression_size_estimator.rb (+2 -136)
===================================================================
--- lib/mrb/scripts/expression_size_estimator.rb    2017-02-16 17:35:56 +0900 (13806b3)
+++ lib/mrb/scripts/expression_size_estimator.rb    2017-02-16 18:09:51 +0900 (597b56f)
@@ -31,7 +31,8 @@ module Groonga
             current_size = 0
           end
 
-          size = estimate_data(data)
+          estimator = ScanInfoDataSizeEstimator.new(data, @table)
+          size = estimator.estimate
           case data.logical_op
           when Operator::AND
             current_size = size if size < current_size
@@ -49,140 +50,5 @@ module Groonga
       end
       current_size
     end
-
-    private
-    def estimate_data(data)
-      search_index = data.search_indexes.first
-      return @table_size if search_index.nil?
-
-      index_column = resolve_index_column(search_index.index_column,
-                                          data.op)
-      return @table_size if index_column.nil?
-
-      size = nil
-      case data.op
-      when Operator::MATCH,
-           Operator::FUZZY
-        size = estimate_match(data, index_column)
-      when Operator::REGEXP
-        size = estimate_regexp(data, index_column)
-      when Operator::EQUAL
-        size = estimate_equal(data, index_column)
-      when Operator::LESS,
-           Operator::LESS_EQUAL,
-           Operator::GREATER,
-           Operator::GREATER_EQUAL
-        size = estimate_range(data, index_column)
-      when Operator::PREFIX
-        size = estimate_prefix(data, index_column)
-      when Operator::CALL
-        size = estimate_call(data, index_column)
-      end
-      size || @table_size
-    end
-
-    def resolve_index_column(index_column, operator)
-      while index_column.is_a?(Accessor)
-        index_info = index_column.find_index(operator)
-        return nil if index_info.nil?
-        index_column = index_info.index
-      end
-
-      index_column
-    end
-
-    def sampling_cursor_limit(n_terms)
-      limit = n_terms * 0.01
-      if limit < 10
-        10
-      elsif limit > 1000
-        1000
-      else
-        limit.to_i
-      end
-    end
-
-    def estimate_match(data, index_column)
-      index_column.estimate_size(:query => data.query.value)
-    end
-
-    def estimate_regexp(data, index_column)
-      index_column.estimate_size(:query => data.query.value,
-                                 :mode => data.op)
-    end
-
-    def estimate_equal(data, index_column)
-      lexicon = index_column.lexicon
-      query = data.query
-      if query.domain == lexicon.id
-        term_id = query.value
-      else
-        term_id = lexicon[query]
-      end
-      return 0 if term_id.nil?
-
-      index_column.estimate_size(:term_id => term_id)
-    end
-
-    def estimate_range(data, index_column)
-      lexicon = index_column.lexicon
-      n_terms = lexicon.size
-      return 0 if n_terms.zero?
-
-      value = data.query.value
-      options = {
-        :limit => sampling_cursor_limit(n_terms),
-      }
-      case data.op
-      when Operator::LESS
-        options[:max] = value
-        options[:flags] = TableCursorFlags::LT
-      when Operator::LESS_EQUAL
-        options[:max] = value
-        options[:flags] = TableCursorFlags::LE
-      when Operator::GREATER
-        options[:min] = value
-        options[:flags] = TableCursorFlags::GT
-      when Operator::GREATER_EQUAL
-        options[:min] = value
-        options[:flags] = TableCursorFlags::GE
-      end
-      TableCursor.open(lexicon, options) do |cursor|
-        size = index_column.estimate_size(:lexicon_cursor => cursor)
-        size += 1 if cursor.next != ID::NIL
-        size
-      end
-    end
-
-    def estimate_prefix(data, index_column)
-      lexicon = index_column.lexicon
-      n_terms = lexicon.size
-      return 0 if n_terms.zero?
-
-      value = data.query.value
-      options = {
-        :min => value,
-        :limit => sampling_cursor_limit(n_terms),
-        :flags => TableCursorFlags::PREFIX,
-      }
-      TableCursor.open(lexicon, options) do |cursor|
-        size = index_column.estimate_size(:lexicon_cursor => cursor)
-        size += 1 if cursor.next != ID::NIL
-        size
-      end
-    end
-
-    def estimate_call(data, index_column)
-      procedure = data.args[0]
-      arguments = data.args[1..-1].collect do |arg|
-        if arg.is_a?(::Groonga::Object)
-          ExpressionTree::Variable.new(arg)
-        else
-          ExpressionTree::Constant.new(arg)
-        end
-      end
-      node = ExpressionTree::FunctionCall.new(procedure, arguments)
-      node.estimate_size(@table)
-    end
   end
 end

  Modified: lib/mrb/scripts/expression_tree/function_call.rb (+12 -1)
===================================================================
--- lib/mrb/scripts/expression_tree/function_call.rb    2017-02-16 17:35:56 +0900 (44c6213)
+++ lib/mrb/scripts/expression_tree/function_call.rb    2017-02-16 18:09:51 +0900 (dd61480)
@@ -20,10 +20,21 @@ module Groonga
         return table.size unles****@proce***** == "between"
 
         column, min, min_border, max, max_border = @arguments
+
         index_info = column.column.find_index(Operator::CALL)
         return table.size if index_info.nil?
-
         index_column = index_info.index
+        while index_column.is_a?(Groonga::Accessor)
+          if index_column.have_next?
+            index_column = index_column.next
+          else
+            index_column = index_column.object
+            index_info = index_column.find_index(Operator::CALL)
+            return table.size if index_info.nil?
+            index_column = index_info.index
+          end
+        end
+
         lexicon = index_column.lexicon
         options = {
           :min => min.value,

  Modified: lib/mrb/scripts/scan_info_builder.rb (+43 -7)
===================================================================
--- lib/mrb/scripts/scan_info_builder.rb    2017-02-16 17:35:56 +0900 (8b46054)
+++ lib/mrb/scripts/scan_info_builder.rb    2017-02-16 18:09:51 +0900 (d5a72bc)
@@ -7,6 +7,8 @@ module Groonga
       @expression = expression
       @operator = operator
       @record_exist = record_exist
+      @variable = @expression[0]
+      @table = Context.instance[@variable.domain]
     end
 
     RELATION_OPERATORS = [
@@ -55,7 +57,6 @@ module Groonga
 
       context = BuildContext.new(@expression)
       codes = context.codes
-      variable = context.variable
       while context.have_next?
         code = context.code
         code_op = context.code_op
@@ -85,7 +86,7 @@ module Groonga
         when Operator::PUSH
           context.data ||= ScanInfoData.new(i)
           data = context.data
-          if code.value == variable
+          if code.value == @variable
             context.status = :var
           else
             data.args << code.value
@@ -178,7 +179,6 @@ module Groonga
       n_relation_expressions = 0
       n_logical_expressions = 0
       status = :start
-      variable = @expression[0]
       codes =****@expre*****
       codes.each_with_index do |code, i|
         case code.op
@@ -213,7 +213,7 @@ module Groonga
             n_relation_expressions += 1
             status = :start
           else
-            if code.value == variable
+            if code.value == @variable
               status = :var
             else
               status = :const
@@ -404,7 +404,45 @@ module Groonga
         end
         optimized_data_list << data
       end
-      optimized_data_list
+
+      optimize_by_estimated_size(optimized_data_list)
+    end
+
+    def optimize_by_estimated_size(data_list)
+      return data_list unless Groonga::ORDER_BY_ESTIMATED_SIZE
+
+      start_index = nil
+      data_list.size.times do |i|
+        data = data_list[i]
+        if data.logical_op != Operator::AND
+          if start_index.nil?
+            start_index = i
+          else
+            sort_by_estimated_size!(data_list, start_index...i)
+            start_index = nil
+          end
+        end
+      end
+      unless start_index.nil?
+        sort_by_estimated_size!(data_list, start_index...data_list.size)
+      end
+      data_list
+    end
+
+    def sort_by_estimated_size!(data_list, range)
+      target_data_list = data_list[range]
+      return if target_data_list.size < 2
+
+      start_logical_op = target_data_list.first.logical_op
+      sorted_data_list = target_data_list.sort_by do |data|
+        estimator = ScanInfoDataSizeEstimator.new(data, @table)
+        estimator.estimate
+      end
+      sorted_data_list.each do |sorted_data|
+        sorted_data.logical_op = Operator::AND
+      end
+      sorted_data_list.first.logical_op = start_logical_op
+      data_list[range] = sorted_data_list
     end
 
     def range_operations?(data, next_data)
@@ -501,7 +539,6 @@ module Groonga
 
     class BuildContext
       attr_accessor :status
-      attr_reader :variable
       attr_reader :codes
       attr_reader :n_codes
       attr_reader :i
@@ -510,7 +547,6 @@ module Groonga
       def initialize(expression)
         @expression = expression
         @status = :start
-        @variable = @expression[0]
         @current_data = nil
         @codes =****@expre*****
         @n_codes =****@codes*****

  Added: lib/mrb/scripts/scan_info_data_size_estimator.rb (+185 -0) 100644
===================================================================
--- /dev/null
+++ lib/mrb/scripts/scan_info_data_size_estimator.rb    2017-02-16 18:09:51 +0900 (5d3dc4e)
@@ -0,0 +1,185 @@
+module Groonga
+  class ScanInfoDataSizeEstimator
+    def initialize(data, table)
+      @data = data
+      @table = table
+      @table_size =****@table*****
+    end
+
+    def estimate
+      search_index =****@data*****_indexes.first
+      return @table_size if search_index.nil?
+
+      index_column = resolve_index_column(search_index.index_column)
+      return @table_size if index_column.nil?
+
+      size = nil
+      case****@data*****
+      when Operator::MATCH,
+           Operator::FUZZY
+        size = estimate_match(index_column)
+      when Operator::REGEXP
+        size = estimate_regexp(index_column)
+      when Operator::EQUAL
+        size = estimate_equal(index_column)
+      when Operator::LESS,
+           Operator::LESS_EQUAL,
+           Operator::GREATER,
+           Operator::GREATER_EQUAL
+        size = estimate_range(index_column)
+      when Operator::PREFIX
+        size = estimate_prefix(index_column)
+      when Operator::CALL
+        size = estimate_call(index_column)
+      end
+      size || @table_size
+    end
+
+    private
+    def resolve_index_column(index_column)
+      while index_column.is_a?(Accessor)
+        index_info = index_column.find_index(@data.op)
+        return nil if index_info.nil?
+        break if index_info.index == index_column
+        index_column = index_info.index
+      end
+
+      index_column
+    end
+
+    def sampling_cursor_limit(n_terms)
+      limit = n_terms * 0.01
+      if limit < 10
+        10
+      elsif limit > 1000
+        1000
+      else
+        limit.to_i
+      end
+    end
+
+    def estimate_match(index_column)
+      index_column.estimate_size(:query => @data.query.value)
+    end
+
+    def estimate_regexp(index_column)
+      index_column.estimate_size(:query => @data.query.value,
+                                 :mode => @data.op)
+    end
+
+    def estimate_equal(index_column)
+      query =****@data*****
+      if index_column.is_a?(Accessor)
+        table = index_column.object
+        if index_column.name == "_id"
+          if table.id?(query.value)
+            1
+          else
+            0
+          end
+        else
+          if table[query.value]
+            1
+          else
+            0
+          end
+        end
+      else
+        lexicon = index_column.lexicon
+        if query.domain == lexicon.id
+          term_id = query.value
+        else
+          term_id = lexicon[query]
+        end
+        return 0 if term_id.nil?
+
+        index_column.estimate_size(:term_id => term_id)
+      end
+    end
+
+    def estimate_range(index_column)
+      if index_column.is_a?(Table)
+        is_table_search = true
+        lexicon = index_column
+      elsif index_column.is_a?(Groonga::Accessor)
+        is_table_search = true
+        lexicon = index_column.object
+      else
+        is_table_search = false
+        lexicon = index_column.lexicon
+      end
+      n_terms = lexicon.size
+      return 0 if n_terms.zero?
+
+      value =****@data*****
+      options = {
+        :limit => sampling_cursor_limit(n_terms),
+      }
+      case****@data*****
+      when Operator::LESS
+        options[:max] = value
+        options[:flags] = TableCursorFlags::LT
+      when Operator::LESS_EQUAL
+        options[:max] = value
+        options[:flags] = TableCursorFlags::LE
+      when Operator::GREATER
+        options[:min] = value
+        options[:flags] = TableCursorFlags::GT
+      when Operator::GREATER_EQUAL
+        options[:min] = value
+        options[:flags] = TableCursorFlags::GE
+      end
+      TableCursor.open(lexicon, options) do |cursor|
+        if is_table_search
+          size = 1
+        else
+          size = index_column.estimate_size(:lexicon_cursor => cursor)
+        end
+        size += 1 if cursor.next != ID::NIL
+        size
+      end
+    end
+
+    def estimate_prefix(index_column)
+      is_table_search =
+        (index_column.is_a?(Accessor) and
+        index_column.name == "_key")
+      if is_table_search
+        lexicon = index_column.object
+      else
+        lexicon = index_column.lexicon
+      end
+      n_terms = lexicon.size
+      return 0 if n_terms.zero?
+
+      value =****@data*****
+      options = {
+        :min => value,
+        :limit => sampling_cursor_limit(n_terms),
+        :flags => TableCursorFlags::PREFIX,
+      }
+      TableCursor.open(lexicon, options) do |cursor|
+        if is_table_search
+          size = 1
+        else
+          size = index_column.estimate_size(:lexicon_cursor => cursor)
+        end
+        size += 1 if cursor.next != ID::NIL
+        size
+      end
+    end
+
+    def estimate_call(index_column)
+      procedure =****@data*****[0]
+      arguments =****@data*****[1..-1].collect do |arg|
+        if arg.is_a?(::Groonga::Object)
+          ExpressionTree::Variable.new(arg)
+        else
+          ExpressionTree::Constant.new(arg)
+        end
+      end
+      node = ExpressionTree::FunctionCall.new(procedure, arguments)
+      node.estimate_size(@table)
+    end
+  end
+end

  Modified: lib/mrb/scripts/sources.am (+1 -0)
===================================================================
--- lib/mrb/scripts/sources.am    2017-02-16 17:35:56 +0900 (718c8d0)
+++ lib/mrb/scripts/sources.am    2017-02-16 18:09:51 +0900 (877168c)
@@ -28,6 +28,7 @@ RUBY_SCRIPT_FILES =				\
 	scan_info.rb				\
 	scan_info_builder.rb			\
 	scan_info_data.rb			\
+	scan_info_data_size_estimator.rb	\
 	scan_info_search_index.rb		\
 	table.rb				\
 	table_cursor.rb				\

  Modified: test/mruby/run-test.rb (+1 -0)
===================================================================
--- test/mruby/run-test.rb    2017-02-16 17:35:56 +0900 (60b1cc6)
+++ test/mruby/run-test.rb    2017-02-16 18:09:51 +0900 (df59cd3)
@@ -53,6 +53,7 @@ end
 
 ENV["GRN_PLUGINS_DIR"] = (build_top_dir_path + "plugins").to_s
 ENV["GRN_RUBY_SCRIPTS_DIR"] = (build_top_dir_path + "lib/mrb/scripts").to_s
+ENV["GRN_ORDER_BY_ESTIMATED_SIZE_ENABLE"] = "yes"
 
 $LOAD_PATH.unshift((rroonga_dir_path + "ext" + "groonga").to_s)
 $LOAD_PATH.unshift((rroonga_dir_path + "lib").to_s)

  Modified: test/mruby/suite/query_optimizer/index/test_match.rb (+59 -0)
===================================================================
--- test/mruby/suite/query_optimizer/index/test_match.rb    2017-02-16 17:35:56 +0900 (addc2f1)
+++ test/mruby/suite/query_optimizer/index/test_match.rb    2017-02-16 18:09:51 +0900 (32cb8af)
@@ -142,4 +142,63 @@ class TestIndexMatch < QueryOptimizerTestCase
   expr:       <0..2>
     DUMP
   end
+
+  sub_test_case("optimization by estimate size") do
+    def setup
+      super
+      10.times do |i|
+        @logs.add(:message => "Groonga#{i}")
+      end
+      2.times do |i|
+        @logs.add(:message => "Rroonga#{i}")
+      end
+      100.times do |i|
+        @logs.add(:message => "Mroonga#{i}")
+      end
+    end
+
+    def test_and
+      filter = "(message @ 'Groonga') && (message @ 'Rroonga')"
+      assert_equal(<<-DUMP, dump_plan(filter))
+[0]
+  op:         <match>
+  logical_op: <or>
+  index:      <[#<column:index Terms.Logs_message range:Logs sources:[Logs.message] flags:POSITION>]>
+  query:      <"Rroonga">
+  expr:       <3..5>
+[1]
+  op:         <match>
+  logical_op: <and>
+  index:      <[#<column:index Terms.Logs_message range:Logs sources:[Logs.message] flags:POSITION>]>
+  query:      <"Groonga">
+  expr:       <0..2>
+      DUMP
+    end
+
+    def test_and_and_not
+      filter = "(message @ 'Groonga') && "
+      filter << "(message @ 'Rroonga') &! "
+      filter << "(message @ 'Mroonga')"
+      assert_equal(<<-DUMP, dump_plan(filter))
+[0]
+  op:         <match>
+  logical_op: <or>
+  index:      <[#<column:index Terms.Logs_message range:Logs sources:[Logs.message] flags:POSITION>]>
+  query:      <"Rroonga">
+  expr:       <3..5>
+[1]
+  op:         <match>
+  logical_op: <and>
+  index:      <[#<column:index Terms.Logs_message range:Logs sources:[Logs.message] flags:POSITION>]>
+  query:      <"Groonga">
+  expr:       <0..2>
+[2]
+  op:         <match>
+  logical_op: <and_not>
+  index:      <[#<column:index Terms.Logs_message range:Logs sources:[Logs.message] flags:POSITION>]>
+  query:      <"Mroonga">
+  expr:       <7..9>
+      DUMP
+    end
+  end
 end
-------------- next part --------------
HTML����������������������������...
Download 



More information about the Groonga-commit mailing list
Back to archive index