Kouhei Sutou
null+****@clear*****
Wed Mar 25 18:44:37 JST 2015
Kouhei Sutou 2015-03-25 18:44:37 +0900 (Wed, 25 Mar 2015) New Revision: 238b29166e0be28c84a56d2cb5f19e62812fc60d https://github.com/groonga/groonga/commit/238b29166e0be28c84a56d2cb5f19e62812fc60d Message: doc: add a document about scorer It's not completed yet. Added files: doc/source/example/reference/scorer/usage_one_no_argument_no_weight.log doc/source/example/reference/scorer/usage_one_no_argument_weight.log doc/source/example/reference/scorer/usage_one_one_argument_no_weight.log doc/source/example/reference/scorer/usage_setup_data.log doc/source/example/reference/scorer/usage_setup_schema.log doc/source/reference/scorer.rst doc/source/reference/scorers/scorer_tf_at_most.rst doc/source/reference/scorers/scorer_tf_idf.rst Modified files: doc/source/reference.rst Added: doc/source/example/reference/scorer/usage_one_no_argument_no_weight.log (+39 -0) 100644 =================================================================== --- /dev/null +++ doc/source/example/reference/scorer/usage_one_no_argument_no_weight.log 2015-03-25 18:44:37 +0900 (05cea6c) @@ -0,0 +1,39 @@ +Execution example:: + + select Memos \ + --match_columns "scorer_tf_idf(content)" \ + --query "Groonga" \ + --output_columns "content, _score" \ + --sortby "-_score" + # [ + # [ + # 0, + # 1337566253.89858, + # 0.000355720520019531 + # ], + # [ + # [ + # [ + # 2 + # ], + # [ + # [ + # "content", + # "Text" + # ], + # [ + # "_score", + # "Int32" + # ] + # ], + # [ + # "Groonga! Groonga! Groonga! 
Groonga is very fast!", + # 2 + # ], + # [ + # "Groonga is very easy full text search engine!", + # 1 + # ] + # ] + # ] + # ] Added: doc/source/example/reference/scorer/usage_one_no_argument_weight.log (+39 -0) 100644 =================================================================== --- /dev/null +++ doc/source/example/reference/scorer/usage_one_no_argument_weight.log 2015-03-25 18:44:37 +0900 (8c93087) @@ -0,0 +1,39 @@ +Execution example:: + + select Memos \ + --match_columns "scorer_tf_idf(content) * 10" \ + --query "Groonga" \ + --output_columns "content, _score" \ + --sortby "-_score" + # [ + # [ + # 0, + # 1337566253.89858, + # 0.000355720520019531 + # ], + # [ + # [ + # [ + # 2 + # ], + # [ + # [ + # "content", + # "Text" + # ], + # [ + # "_score", + # "Int32" + # ] + # ], + # [ + # "Groonga! Groonga! Groonga! Groonga is very fast!", + # 22 + # ], + # [ + # "Groonga is very easy full text search engine!", + # 10 + # ] + # ] + # ] + # ] Added: doc/source/example/reference/scorer/usage_one_one_argument_no_weight.log (+39 -0) 100644 =================================================================== --- /dev/null +++ doc/source/example/reference/scorer/usage_one_one_argument_no_weight.log 2015-03-25 18:44:37 +0900 (23e138d) @@ -0,0 +1,39 @@ +Execution example:: + + select Memos \ + --match_columns "scorer_tf_at_most(content, 2.0)" \ + --query "Groonga" \ + --output_columns "content, _score" \ + --sortby "-_score" + # [ + # [ + # 0, + # 1337566253.89858, + # 0.000355720520019531 + # ], + # [ + # [ + # [ + # 2 + # ], + # [ + # [ + # "content", + # "Text" + # ], + # [ + # "_score", + # "Int32" + # ] + # ], + # [ + # "Groonga! Groonga! Groonga! 
Groonga is very fast!", + # 2 + # ], + # [ + # "Groonga is very easy full text search engine!", + # 1 + # ] + # ] + # ] + # ] Added: doc/source/example/reference/scorer/usage_setup_data.log (+41 -0) 100644 =================================================================== --- /dev/null +++ doc/source/example/reference/scorer/usage_setup_data.log 2015-03-25 18:44:37 +0900 (ac422a1) @@ -0,0 +1,41 @@ +Execution example:: + + load --table Memos + [ + { + "_key": "memo1", + "title": "Groonga is easy", + "content": "Groonga is very easy full text search engine!" + }, + { + "_key": "memo2", + "title": "Mroonga is easy", + "content": "Mroonga is more easier full text search engine!" + }, + { + "_key": "memo3", + "title": "Rroonga is easy", + "content": "Ruby is very helpful." + }, + { + "_key": "memo4", + "title": "Groonga is fast", + "content": "Groonga! Groonga! Groonga! Groonga is very fast!" + }, + { + "_key": "memo5", + "title": "PGroonga is fast", + "content": "PGroonga is very fast!" + }, + { + "_key": "memo6", + "title": "PGroonga is useful", + "content": "SQL is easy because many client libraries exist." + }, + { + "_key": "memo7", + "title": "Mroonga is also useful", + "content": "MySQL has replication feature. Mroonga can use it." 
+ } + ] + # [[0, 1337566253.89858, 0.000355720520019531], 7] Added: doc/source/example/reference/scorer/usage_setup_schema.log (+16 -0) 100644 =================================================================== --- /dev/null +++ doc/source/example/reference/scorer/usage_setup_schema.log 2015-03-25 18:44:37 +0900 (2cd7c81) @@ -0,0 +1,16 @@ +Execution example:: + + table_create Memos TABLE_HASH_KEY ShortText + # [[0, 1337566253.89858, 0.000355720520019531], true] + column_create Memos title COLUMN_SCALAR ShortText + # [[0, 1337566253.89858, 0.000355720520019531], true] + column_create Memos content COLUMN_SCALAR Text + # [[0, 1337566253.89858, 0.000355720520019531], true] + table_create Terms TABLE_PAT_KEY ShortText \ + --default_tokenizer TokenBigram \ + --normalizer NormalizerAuto + # [[0, 1337566253.89858, 0.000355720520019531], true] + column_create Terms title_index COLUMN_INDEX|WITH_POSITION Memos title + # [[0, 1337566253.89858, 0.000355720520019531], true] + column_create Terms content_index COLUMN_INDEX|WITH_POSITION Memos content + # [[0, 1337566253.89858, 0.000355720520019531], true] Modified: doc/source/reference.rst (+1 -0) =================================================================== --- doc/source/reference.rst 2015-03-25 18:34:04 +0900 (2456b00) +++ doc/source/reference.rst 2015-03-25 18:44:37 +0900 (32cb58d) @@ -19,6 +19,7 @@ Reference manual reference/tokenizers reference/token_filters reference/query_expanders + reference/scorer reference/grn_expr reference/regular_expression reference/function Added: doc/source/reference/scorer.rst (+192 -0) 100644 =================================================================== --- /dev/null +++ doc/source/reference/scorer.rst 2015-03-25 18:44:37 +0900 (9075da1) @@ -0,0 +1,192 @@ +.. -*- rst -*- + +.. highlightlang:: none + +.. groonga-command +.. database: scorer + +Scorer +====== + +Summary +------- + +Groonga has scorer module that customizes score function. 
Score +function computes score of matched record. The default scorer function +uses the number of appeared terms. It is also known as TF (term +frequency). + +TF is a fast score function but it's not suitable for the following +cases: + + * Search query contains one or more frequently-appearing words such + as "the" and "a". + * Document contains many same keywords such as "They are keyword, + keyword, keyword ... and keyword". Search engine spammer may use + the technique. + +Score function can solve these cases. For example, `TF-IDF +<http://en.wikipedia.org/wiki/Tf%E2%80%93idf>`_ (term +frequency-inverse document frequency) can solve the first case. +`Okapi BM25 <http://en.wikipedia.org/wiki/Okapi_BM25>`_ can solve the +second case. But they are slower than TF. + +Groonga provides TF-IDF based scorer as +:doc:`/reference/scorers/scorer_tf_idf` but doesn't provide Okapi BM25 +based scorer yet. + +.. note:: + + You don't need to resolve scoring only by score function. Score + function highly depends on search query. You may use metadata of + matched record. + + For example, Google uses `PageRank + <http://en.wikipedia.org/wiki/PageRank>`_ for scoring. You may be + able to use data type ("title" data are important rather than + "memo" data), tag, geolocation and so on. + + Please don't think only about score function for scoring. + +Usage +----- + +This section describes how to use scorer. + +Here are a schema definition and sample data to show usage. + +Sample schema: + +.. groonga-command +.. include:: ../example/reference/scorer/usage_setup_schema.log +.. table_create Memos TABLE_HASH_KEY ShortText +.. column_create Memos title COLUMN_SCALAR ShortText +.. column_create Memos content COLUMN_SCALAR Text +.. +.. table_create Terms TABLE_PAT_KEY ShortText \ +.. --default_tokenizer TokenBigram \ +.. --normalizer NormalizerAuto +.. column_create Terms title_index COLUMN_INDEX|WITH_POSITION Memos title +.. 
column_create Terms content_index COLUMN_INDEX|WITH_POSITION Memos content + +Sample data: + +.. groonga-command +.. include:: ../example/reference/scorer/usage_setup_data.log +.. load --table Memos +.. [ +.. { +.. "_key": "memo1", +.. "title": "Groonga is easy", +.. "content": "Groonga is very easy full text search engine!" +.. }, +.. { +.. "_key": "memo2", +.. "title": "Mroonga is easy", +.. "content": "Mroonga is more easier full text search engine!" +.. }, +.. { +.. "_key": "memo3", +.. "title": "Rroonga is easy", +.. "content": "Ruby is very helpful." +.. }, +.. { +.. "_key": "memo4", +.. "title": "Groonga is fast", +.. "content": "Groonga! Groonga! Groonga! Groonga is very fast!" +.. }, +.. { +.. "_key": "memo5", +.. "title": "PGroonga is fast", +.. "content": "PGroonga is very fast!" +.. }, +.. { +.. "_key": "memo6", +.. "title": "PGroonga is useful", +.. "content": "SQL is easy because many client libraries exist." +.. }, +.. { +.. "_key": "memo7", +.. "title": "Mroonga is also useful", +.. "content": "MySQL has replication feature. Mroonga can use it." +.. } +.. ] + +You can specify custom score function in :ref:`select-match-columns`. +Here are the supported syntaxes:: + + SCORE_FUNCTION(COLUMN) + + SCORE_FUNCTION(COLUMN) * WEIGHT + + SCORE_FUNCTION(COLUMN, ARGUMENT1, ARGUMENT2, ...) + + SCORE_FUNCTION(COLUMN, ARGUMENT1, ARGUMENT2, ...) * WEIGHT + + SCORE_FUNCTION1(COLUMN1) * WEIGHT1 || + SCORE_FUNCTION2(COLUMN2) * WEIGHT2 || + ... + + SCORE_FUNCTION1(COLUMN1, ARGUMENT1, ARGUMENT2, ...) * WEIGHT1 || + SCORE_FUNCTION2(COLUMN2, ARGUMENT1, ARGUMENT2, ...) * WEIGHT2 || + ... + +Here is the simplest example: + +.. groonga-command +.. include:: ../example/reference/scorer/usage_one_no_argument_no_weight.log +.. select Memos \ +.. --match_columns "scorer_tf_idf(content)" \ +.. --query "Groonga" \ +.. --output_columns "content, _score" \ +.. --sortby "-_score" + +``Groonga! Groonga! Groonga! Groonga is very fast!`` contains 4 +``Groonga``. 
If you use TF based scorer that is the default scorer, +``_score`` is ``4``. But the actual ``_score`` is ``2`` because the +``select`` command uses TF-IDF based scorer ``scorer_tf_idf()``. + +Here is an example that uses weight: + +.. groonga-command +.. include:: ../example/reference/scorer/usage_one_no_argument_weight.log +.. select Memos \ +.. --match_columns "scorer_tf_idf(content) * 10" \ +.. --query "Groonga" \ +.. --output_columns "content, _score" \ +.. --sortby "-_score" + +``Groonga! Groonga! Groonga! Groonga is very fast!`` has ``22`` as +``_score``. It had ``2`` as ``_score`` in the previous example that +doesn't specify weight. + +Here is an example that uses scorer that requires one +argument. :doc:`/reference/scorers/scorer_tf_at_most` scorer requires +one argument. You can limit TF score by the scorer. + +.. groonga-command +.. include:: ../example/reference/scorer/usage_one_one_argument_no_weight.log +.. select Memos \ +.. --match_columns "scorer_tf_at_most(content, 2.0)" \ +.. --query "Groonga" \ +.. --output_columns "content, _score" \ +.. --sortby "-_score" + +``Groonga! Groonga! Groonga! Groonga is very fast!`` contains 4 +``Groonga``. If you use normal TF based scorer that is the default +scorer, ``_score`` is ``4``. But the actual ``_score`` is ``2`` +because the scorer used in the ``select`` command limits the maximum +score value to ``2``. + +TODO: Describe how to use multiple scorers in one match_columns. + +Built-in scorers +---------------- + +Here are built-in scorers: + +.. toctree:: + :maxdepth: 1 + :glob: + + scorers/* Added: doc/source/reference/scorers/scorer_tf_at_most.rst (+14 -0) 100644 =================================================================== --- /dev/null +++ doc/source/reference/scorers/scorer_tf_at_most.rst 2015-03-25 18:44:37 +0900 (34517f7) @@ -0,0 +1,14 @@ +.. -*- rst -*- + +.. highlightlang:: none + +.. groonga-command +.. 
database: scorer_tf_at_most + +``scorer_tf_at_most`` +===================== + +Summary +------- + +TODO Added: doc/source/reference/scorers/scorer_tf_idf.rst (+14 -0) 100644 =================================================================== --- /dev/null +++ doc/source/reference/scorers/scorer_tf_idf.rst 2015-03-25 18:44:37 +0900 (b819733) @@ -0,0 +1,14 @@ +.. -*- rst -*- + +.. highlightlang:: none + +.. groonga-command +.. database: scorer_tf_idf + +``scorer_tf_idf`` +================= + +Summary +------- + +TODO -------------- next part -------------- HTML����������������������������...Download