Kouhei Sutou
null+****@clear*****
Fri Mar 27 16:07:48 JST 2015
Kouhei Sutou 2015-03-27 16:07:48 +0900 (Fri, 27 Mar 2015) New Revision: c801f3cc2845daf3e314e2c663d2083d96a44e52 https://github.com/groonga/groonga/commit/c801f3cc2845daf3e314e2c663d2083d96a44e52 Message: doc: add scorer_tf_idf Added files: doc/source/reference/example/reference/scorers/scorer_tf_idf/usage_no_weight.log doc/source/reference/example/reference/scorers/scorer_tf_idf/usage_setup_data.log doc/source/reference/example/reference/scorers/scorer_tf_idf/usage_setup_schema.log Modified files: doc/source/reference/scorer.rst doc/source/reference/scorers/scorer_tf_idf.rst Added: doc/source/reference/example/reference/scorers/scorer_tf_idf/usage_no_weight.log (+51 -0) 100644 =================================================================== --- /dev/null +++ doc/source/reference/example/reference/scorers/scorer_tf_idf/usage_no_weight.log 2015-03-27 16:07:48 +0900 (cde9e9c) @@ -0,0 +1,51 @@ +Execution example:: + + select Logs \ + --match_columns "scorer_tf_idf(message)" \ + --query "Error OR Info" \ + --output_columns "message, _score" \ + --sortby "-_score" + # [ + # [ + # 0, + # 1337566253.89858, + # 0.000355720520019531 + # ], + # [ + # [ + # [ + # 5 + # ], + # [ + # [ + # "message", + # "Text" + # ], + # [ + # "_score", + # "Int32" + # ] + # ], + # [ + # "Info Info Info Info", + # 3 + # ], + # [ + # "Error", + # 2 + # ], + # [ + # "Info Info Info", + # 2 + # ], + # [ + # "Info Info", + # 1 + # ], + # [ + # "Info", + # 1 + # ] + # ] + # ] + # ] Added: doc/source/reference/example/reference/scorers/scorer_tf_idf/usage_setup_data.log (+19 -0) 100644 =================================================================== --- /dev/null +++ doc/source/reference/example/reference/scorers/scorer_tf_idf/usage_setup_data.log 2015-03-27 16:07:48 +0900 (4e08536) @@ -0,0 +1,19 @@ +Execution example:: + + load --table Logs + [ + {"message": "Error"}, + {"message": "Warning"}, + {"message": "Warning Warning"}, + {"message": "Warning Warning Warning"}, + {"message": "Info"}, + 
{"message": "Info Info"}, + {"message": "Info Info Info"}, + {"message": "Info Info Info Info"}, + {"message": "Notice"}, + {"message": "Notice Notice"}, + {"message": "Notice Notice Notice"}, + {"message": "Notice Notice Notice Notice"}, + {"message": "Notice Notice Notice Notice Notice"} + ] + # [[0, 1337566253.89858, 0.000355720520019531], 13] Added: doc/source/reference/example/reference/scorers/scorer_tf_idf/usage_setup_schema.log (+12 -0) 100644 =================================================================== --- /dev/null +++ doc/source/reference/example/reference/scorers/scorer_tf_idf/usage_setup_schema.log 2015-03-27 16:07:48 +0900 (700d4ab) @@ -0,0 +1,12 @@ +Execution example:: + + table_create Logs TABLE_NO_KEY + # [[0, 1337566253.89858, 0.000355720520019531], true] + column_create Logs message COLUMN_SCALAR Text + # [[0, 1337566253.89858, 0.000355720520019531], true] + table_create Terms TABLE_PAT_KEY ShortText \ + --default_tokenizer TokenBigram \ + --normalizer NormalizerAuto + # [[0, 1337566253.89858, 0.000355720520019531], true] + column_create Terms message_index COLUMN_INDEX|WITH_POSITION Logs message + # [[0, 1337566253.89858, 0.000355720520019531], true] Modified: doc/source/reference/scorer.rst (+1 -12) =================================================================== --- doc/source/reference/scorer.rst 2015-03-27 15:27:57 +0900 (df31670) +++ doc/source/reference/scorer.rst 2015-03-27 16:07:48 +0900 (9626641) @@ -35,18 +35,7 @@ Groonga provides TF-IDF based scorer as :doc:`/reference/scorers/scorer_tf_idf` but doesn't provide Okapi BM25 based scorer yet. -.. _note: - - You don't need to resolve scoring only by score function. Score - function is highly depends on search query. You may use metadata of - matched record. - - For example, Google uses `PageRank - <http://en.wikipedia.org/wiki/PageRank>`_ for scoring. You may be - able to use data type ("title" data are important rather than - "memo" data), tag, geolocation and so on. 
- - Please stop to think about only score function for scoring. +.. include:: scoring_note.rst Usage ----- Modified: doc/source/reference/scorers/scorer_tf_idf.rst (+91 -1) =================================================================== --- doc/source/reference/scorers/scorer_tf_idf.rst 2015-03-27 15:27:57 +0900 (525c1c3) +++ doc/source/reference/scorers/scorer_tf_idf.rst 2015-03-27 16:07:48 +0900 (97c7bad) @@ -17,4 +17,94 @@ Summary ------- -TODO +``scorer_tf_idf`` is a scorer based on the `TF-IDF +<http://en.wikipedia.org/wiki/Tf%E2%80%93idf>`_ (term +frequency-inverse document frequency) score function. + +To put it simply, TF (term frequency) divided by DF (document +frequency) is TF-IDF. "TF" means that "the number of occurrences is +more important". "TF divided by DF" means that "the number of +occurrences of an important term is more important". + +The default score function in Groonga is TF (term frequency). It +doesn't care about term importance but is fast. + +TF-IDF cares about term importance but is slower than TF. + +TF-IDF will compute a more suitable score than TF in many cases. +But it's not perfect. + +If a document contains the same keyword many times, such as "They are keyword, +keyword, keyword ... and keyword", its score increases with both TF and +TF-IDF. Search engine spammers may use this technique, and TF-IDF +doesn't guard against it. + +`Okapi BM25 <http://en.wikipedia.org/wiki/Okapi_BM25>`_ can solve the +case. But it's slower than TF-IDF and not implemented yet in +Groonga. + +.. include:: ../scoring_note.rst + +Usage +----- + +This section describes how to use this scorer. + +Here are a schema definition and sample data to show usage. + +Sample schema: + +.. groonga-command +.. include:: ../example/reference/scorers/scorer_tf_idf/usage_setup_schema.log +.. table_create Logs TABLE_NO_KEY +.. column_create Logs message COLUMN_SCALAR Text +.. +.. table_create Terms TABLE_PAT_KEY ShortText \ +.. --default_tokenizer TokenBigram \ +.. 
--normalizer NormalizerAuto +.. column_create Terms message_index COLUMN_INDEX|WITH_POSITION Logs message + +Sample data: + +.. groonga-command +.. include:: ../example/reference/scorers/scorer_tf_idf/usage_setup_data.log +.. load --table Logs +.. [ +.. {"message": "Error"}, +.. {"message": "Warning"}, +.. {"message": "Warning Warning"}, +.. {"message": "Warning Warning Warning"}, +.. {"message": "Info"}, +.. {"message": "Info Info"}, +.. {"message": "Info Info Info"}, +.. {"message": "Info Info Info Info"}, +.. {"message": "Notice"}, +.. {"message": "Notice Notice"}, +.. {"message": "Notice Notice Notice"}, +.. {"message": "Notice Notice Notice Notice"}, +.. {"message": "Notice Notice Notice Notice Notice"} +.. ] + +You specify ``scorer_tf_idf`` in :ref:`select-match-columns` like the +following: + +.. groonga-command +.. include:: ../example/reference/scorers/scorer_tf_idf/usage_no_weight.log +.. select Logs \ +.. --match_columns "scorer_tf_idf(message)" \ +.. --query "Error OR Info" \ +.. --output_columns "message, _score" \ +.. --sortby "-_score" + +Both the score of ``Info Info Info`` and the score of ``Error`` are +``2`` even though ``Info Info Info`` includes three ``Info`` terms, because +``Error`` is a more important term than ``Info``. The number of +documents that include ``Info`` is ``4``. The number of documents that +include ``Error`` is ``1``. A term that is included in fewer documents +is a more characteristic term, and a characteristic term +is an important term. + +See also +-------- + +* :doc:`../scorer` -------------- next part -------------- HTMLの添付ファイルを保管しました...Download