Yasuhiro Horimoto 2018-12-27 16:42:35 +0900 (Thu, 27 Dec 2018) Revision: f28e72d043ef9039529248af5708947e5b66057d https://github.com/groonga/groonga/commit/f28e72d043ef9039529248af5708947e5b66057d Message: doc: Separate from tokenizers page Added files: doc/source/example/reference/tokenizers/token-mecab-include-class-option.log doc/source/example/reference/tokenizers/token-mecab-include-form-option.log doc/source/example/reference/tokenizers/token-mecab-include-reading-option.log doc/source/example/reference/tokenizers/token-mecab-target-class-and-include-class-option.log doc/source/example/reference/tokenizers/token-mecab-use-reading-option.log doc/source/reference/tokenizers/token_mecab.rst Added: doc/source/example/reference/tokenizers/token-mecab-include-class-option.log (+116 -0) 100644 =================================================================== --- /dev/null +++ doc/source/example/reference/tokenizers/token-mecab-include-class-option.log 2018-12-27 16:42:35 +0900 (3b5d3409a) @@ -0,0 +1,116 @@ +Execution example:: + + tokenize 'TokenMecab("include_class", true)' '彼の名前は山田さんのはずです。' + # [ + # [ + # 0, + # 1545892715.887472, + # 0.03757452964782715 + # ], + # [ + # { + # "value": "彼", + # "position": 0, + # "force_prefix": false, + # "force_prefix_search": false, + # "metadata": { + # "class": "名詞", + # "subclass0": "代名詞", + # "subclass1": "一般" + # } + # }, + # { + # "value": "の", + # "position": 1, + # "force_prefix": false, + # "force_prefix_search": false, + # "metadata": { + # "class": "助詞", + # "subclass0": "連体化" + # } + # }, + # { + # "value": "名前", + # "position": 2, + # "force_prefix": false, + # "force_prefix_search": false, + # "metadata": { + # "class": "名詞", + # "subclass0": "一般" + # } + # }, + # { + # "value": "は", + # "position": 3, + # "force_prefix": false, + # "force_prefix_search": false, + # "metadata": { + # "class": "助詞", + # "subclass0": "係助詞" + # } + # }, + # { + # "value": "山田", + # "position": 4, + # "force_prefix": false, + # 
"force_prefix_search": false, + # "metadata": { + # "class": "名詞", + # "subclass0": "固有名詞", + # "subclass1": "人名", + # "subclass2": "姓" + # } + # }, + # { + # "value": "さん", + # "position": 5, + # "force_prefix": false, + # "force_prefix_search": false, + # "metadata": { + # "class": "名詞", + # "subclass0": "接尾", + # "subclass1": "人名" + # } + # }, + # { + # "value": "の", + # "position": 6, + # "force_prefix": false, + # "force_prefix_search": false, + # "metadata": { + # "class": "助詞", + # "subclass0": "連体化" + # } + # }, + # { + # "value": "はず", + # "position": 7, + # "force_prefix": false, + # "force_prefix_search": false, + # "metadata": { + # "class": "名詞", + # "subclass0": "非自立", + # "subclass1": "一般" + # } + # }, + # { + # "value": "です", + # "position": 8, + # "force_prefix": false, + # "force_prefix_search": false, + # "metadata": { + # "class": "助動詞" + # } + # }, + # { + # "value": "。", + # "position": 9, + # "force_prefix": false, + # "force_prefix_search": false, + # "metadata": { + # "class": "記号", + # "subclass0": "句点" + # } + # } + # ] + # ] Added: doc/source/example/reference/tokenizers/token-mecab-include-form-option.log (+104 -0) 100644 =================================================================== --- /dev/null +++ doc/source/example/reference/tokenizers/token-mecab-include-form-option.log 2018-12-27 16:42:35 +0900 (657849b47) @@ -0,0 +1,104 @@ +Execution example:: + + tokenize 'TokenMecab("include_form", true)' '彼の名前は山田さんのはずです。' + # [ + # [ + # 0, + # 1545892987.209944, + # 0.0004286766052246094 + # ], + # [ + # { + # "value": "彼", + # "position": 0, + # "force_prefix": false, + # "force_prefix_search": false, + # "metadata": { + # "base_form": "彼" + # } + # }, + # { + # "value": "の", + # "position": 1, + # "force_prefix": false, + # "force_prefix_search": false, + # "metadata": { + # "base_form": "の" + # } + # }, + # { + # "value": "名前", + # "position": 2, + # "force_prefix": false, + # "force_prefix_search": false, + # "metadata": { + # 
"base_form": "名前" + # } + # }, + # { + # "value": "は", + # "position": 3, + # "force_prefix": false, + # "force_prefix_search": false, + # "metadata": { + # "base_form": "は" + # } + # }, + # { + # "value": "山田", + # "position": 4, + # "force_prefix": false, + # "force_prefix_search": false, + # "metadata": { + # "base_form": "山田" + # } + # }, + # { + # "value": "さん", + # "position": 5, + # "force_prefix": false, + # "force_prefix_search": false, + # "metadata": { + # "base_form": "さん" + # } + # }, + # { + # "value": "の", + # "position": 6, + # "force_prefix": false, + # "force_prefix_search": false, + # "metadata": { + # "base_form": "の" + # } + # }, + # { + # "value": "はず", + # "position": 7, + # "force_prefix": false, + # "force_prefix_search": false, + # "metadata": { + # "base_form": "はず" + # } + # }, + # { + # "value": "です", + # "position": 8, + # "force_prefix": false, + # "force_prefix_search": false, + # "metadata": { + # "inflected_type": "特殊・デス", + # "inflected_form": "基本形", + # "base_form": "です" + # } + # }, + # { + # "value": "。", + # "position": 9, + # "force_prefix": false, + # "force_prefix_search": false, + # "metadata": { + # "base_form": "。" + # } + # } + # ] + # ] Added: doc/source/example/reference/tokenizers/token-mecab-include-reading-option.log (+102 -0) 100644 =================================================================== --- /dev/null +++ doc/source/example/reference/tokenizers/token-mecab-include-reading-option.log 2018-12-27 16:42:35 +0900 (9c5bf5f18) @@ -0,0 +1,102 @@ +Execution example:: + + tokenize 'TokenMecab("include_reading", true)' '彼の名前は山田さんのはずです。' + # [ + # [ + # 0, + # 1545892913.226588, + # 0.0003414154052734375 + # ], + # [ + # { + # "value": "彼", + # "position": 0, + # "force_prefix": false, + # "force_prefix_search": false, + # "metadata": { + # "reading": "カレ" + # } + # }, + # { + # "value": "の", + # "position": 1, + # "force_prefix": false, + # "force_prefix_search": false, + # "metadata": { + # "reading": "ノ" + # } 
+ # }, + # { + # "value": "名前", + # "position": 2, + # "force_prefix": false, + # "force_prefix_search": false, + # "metadata": { + # "reading": "ナマエ" + # } + # }, + # { + # "value": "は", + # "position": 3, + # "force_prefix": false, + # "force_prefix_search": false, + # "metadata": { + # "reading": "ハ" + # } + # }, + # { + # "value": "山田", + # "position": 4, + # "force_prefix": false, + # "force_prefix_search": false, + # "metadata": { + # "reading": "ヤマダ" + # } + # }, + # { + # "value": "さん", + # "position": 5, + # "force_prefix": false, + # "force_prefix_search": false, + # "metadata": { + # "reading": "サン" + # } + # }, + # { + # "value": "の", + # "position": 6, + # "force_prefix": false, + # "force_prefix_search": false, + # "metadata": { + # "reading": "ノ" + # } + # }, + # { + # "value": "はず", + # "position": 7, + # "force_prefix": false, + # "force_prefix_search": false, + # "metadata": { + # "reading": "ハズ" + # } + # }, + # { + # "value": "です", + # "position": 8, + # "force_prefix": false, + # "force_prefix_search": false, + # "metadata": { + # "reading": "デス" + # } + # }, + # { + # "value": "。", + # "position": 9, + # "force_prefix": false, + # "force_prefix_search": false, + # "metadata": { + # "reading": "。" + # } + # } + # ] + # ] Added: doc/source/example/reference/tokenizers/token-mecab-target-class-and-include-class-option.log (+39 -0) 100644 =================================================================== --- /dev/null +++ doc/source/example/reference/tokenizers/token-mecab-target-class-and-include-class-option.log 2018-12-27 16:42:35 +0900 (f35a0e8ae) @@ -0,0 +1,39 @@ +Execution example:: + + tokenize 'TokenMecab("target_class", "-名詞/非自立", "target_class", "-名詞/接尾/人名", "target_class", "名詞", "include_reading", true)' '彼の名前は山田さんのはずです。' + # [ + # [ + # 0, + # 1545893197.914959, + # 0.0003139972686767578 + # ], + # [ + # { + # "value": "彼", + # "position": 0, + # "force_prefix": false, + # "force_prefix_search": false, + # "metadata": { + # "reading": 
"カレ" + # } + # }, + # { + # "value": "名前", + # "position": 1, + # "force_prefix": false, + # "force_prefix_search": false, + # "metadata": { + # "reading": "ナマエ" + # } + # }, + # { + # "value": "山田", + # "position": 2, + # "force_prefix": false, + # "force_prefix_search": false, + # "metadata": { + # "reading": "ヤマダ" + # } + # } + # ] + # ] Added: doc/source/example/reference/tokenizers/token-mecab-use-reading-option.log (+72 -0) 100644 =================================================================== --- /dev/null +++ doc/source/example/reference/tokenizers/token-mecab-use-reading-option.log 2018-12-27 16:42:35 +0900 (c7ec801aa) @@ -0,0 +1,72 @@ +Execution example:: + + tokenize 'TokenMecab("use_reading", true)' '彼の名前は山田さんのはずです。' + # [ + # [ + # 0, + # 1545893087.556662, + # 0.0003693103790283203 + # ], + # [ + # { + # "value": "カレ", + # "position": 0, + # "force_prefix": false, + # "force_prefix_search": false + # }, + # { + # "value": "ノ", + # "position": 1, + # "force_prefix": false, + # "force_prefix_search": false + # }, + # { + # "value": "ナマエ", + # "position": 2, + # "force_prefix": false, + # "force_prefix_search": false + # }, + # { + # "value": "ハ", + # "position": 3, + # "force_prefix": false, + # "force_prefix_search": false + # }, + # { + # "value": "ヤマダ", + # "position": 4, + # "force_prefix": false, + # "force_prefix_search": false + # }, + # { + # "value": "サン", + # "position": 5, + # "force_prefix": false, + # "force_prefix_search": false + # }, + # { + # "value": "ノ", + # "position": 6, + # "force_prefix": false, + # "force_prefix_search": false + # }, + # { + # "value": "ハズ", + # "position": 7, + # "force_prefix": false, + # "force_prefix_search": false + # }, + # { + # "value": "デス", + # "position": 8, + # "force_prefix": false, + # "force_prefix_search": false + # }, + # { + # "value": "。", + # "position": 9, + # "force_prefix": false, + # "force_prefix_search": false + # } + # ] + # ] Added: doc/source/reference/tokenizers/token_mecab.rst 
(+156 -0) 100644 =================================================================== --- /dev/null +++ doc/source/reference/tokenizers/token_mecab.rst 2018-12-27 16:42:35 +0900 (87876a6d4) @@ -0,0 +1,156 @@ +.. -*- rst -*- + +.. highlightlang:: none + +.. groonga-command +.. database: tokenizers + +``TokenMecab`` +================ + +Summary +------- + +``TokenMecab`` is a tokenizer based on `MeCab +<https://taku910.github.io/mecab/>`_ part-of-speech and +morphological analyzer. + +MeCab doesn't depend on Japanese. You can use MeCab for other +languages by creating a dictionary for those languages. You can use `NAIST +Japanese Dictionary <http://osdn.jp/projects/naist-jdic/>`_ +for Japanese. + +You need to install an additional package to use ``TokenMecab``. +For details on how to install the additional package, see `how to install on each OS <http://groonga.org/docs/install.html>`_ . + +``TokenMecab`` is good for precision rather than recall. You can find +``東京都`` and ``京都`` texts by ``京都`` query with +:ref:`token-bigram` but ``東京都`` isn't expected. You can find only +``京都`` text by ``京都`` query with ``TokenMecab``. + +If you want to support neologisms, you need to keep updating your +MeCab dictionary. This incurs a maintenance cost. (:ref:`token-bigram` doesn't +require dictionary maintenance because :ref:`token-bigram` doesn't use +a dictionary.) `mecab-ipadic-NEologd : Neologism dictionary for MeCab +<https://github.com/neologd/mecab-ipadic-neologd>`_ may help you. + +Syntax +------ + +``TokenMecab`` has optional parameters:: + + TokenMecab + + TokenMecab("include_class", true) + + TokenMecab("target_class", "名詞") + + TokenMecab("include_form", true) + + TokenMecab("use_reading", true) + +Usage +----- + +Simple usage +------------ + +Here is an example of ``TokenMecab``. ``東京都`` is tokenized to ``東京`` +and ``都``. They don't include ``京都``: + +.. groonga-command +.. include:: ../../example/reference/tokenizers/token-mecab.log +.. 
tokenize TokenMecab "東京都" + +You can also specify options for ``TokenMecab``. +``TokenMecab`` has ``target_class`` option, ``include_class`` option, +``include_reading`` option, ``include_form`` option and ``use_reading`` option. + +The ``target_class`` option restricts tokens to the specified part-of-speech. +For example, you can extract only nouns as below. + +.. groonga-command +.. include:: ../../example/reference/tokenizers/token-mecab-target-class-option.log +.. tokenize 'TokenMecab("target_class", "名詞")' '彼の名前は山田さんのはずです。' + +``include_class`` option outputs class and subclass in MeCab’s metadata as below. + +.. groonga-command +.. include:: ../../example/reference/tokenizers/token-mecab-include-class-option.log +.. tokenize 'TokenMecab("include_class", true)' '彼の名前は山田さんのはずです。' + +You can exclude needless tokens by passing the class and subclass that this option outputs to ``target_class``. + +``include_reading`` outputs reading in MeCab’s metadata as below. + +.. groonga-command +.. include:: ../../example/reference/tokenizers/token-mecab-include-reading-option.log +.. tokenize 'TokenMecab("include_reading", true)' '彼の名前は山田さんのはずです。' + +You can get the reading of a token with this option. + +``include_form`` outputs inflected_type, inflected_form and base_form in MeCab’s metadata as below. + +.. groonga-command +.. include:: ../../example/reference/tokenizers/token-mecab-include-form-option.log +.. tokenize 'TokenMecab("include_form", true)' '彼の名前は山田さんのはずです。' + +``use_reading`` supports a search by kana. +This option is useful as a countermeasure against orthographical variants because it searches with kana. + +.. groonga-command +.. include:: ../../example/reference/tokenizers/token-mecab-use-reading-option.log +.. tokenize 'TokenMecab("use_reading", true)' '彼の名前は山田さんのはずです。' + +Advanced usage +-------------- + +``target_class`` option can also specify subclasses and exclude or add a specific +part-of-speech by using + or -. 
+So, you can also search for nouns while excluding non-independent words and suffixes of +person names as below. + +In this way you can search while excluding noise tokens. + +.. groonga-command +.. include:: ../../example/reference/tokenizers/token-mecab-target-class-option-complex.log +.. tokenize 'TokenMecab("target_class", "-名詞/非自立", "target_class", "-名詞/接尾/人名", "target_class", "名詞")' '彼の名前は山田さんのはずです。' + +In addition, you can get the reading of each token, with the noise excluded, by using the ``include_reading`` option as below. + +.. groonga-command +.. include:: ../../example/reference/tokenizers/token-mecab-target-class-and-include-class-option.log +.. tokenize 'TokenMecab("target_class", "-名詞/非自立", "target_class", "-名詞/接尾/人名", "target_class", "名詞", "include_reading", true)' '彼の名前は山田さんのはずです。' + +Parameters +---------- + +Optional parameter +^^^^^^^^^^^^^^^^^^ + +There are four optional parameters: ``include_class``, ``target_class``, ``include_form`` and ``use_reading``. + +``include_class`` +""""""""""""""""" + +Outputs class and subclass in MeCab’s metadata. + +``target_class`` +"""""""""""""""" + +Outputs only tokens of the specified part-of-speech. + +``include_form`` +"""""""""""""""" + +Outputs inflected_type, inflected_form and base_form in MeCab’s metadata. + +``use_reading`` +""""""""""""""" + +Outputs the reading of each token. + +See also +---------- + +* :doc:`../commands/tokenize` -------------- next part -------------- An HTML attachment was scrubbed... URL: <https://lists.osdn.me/mailman/archives/groonga-commit/attachments/20181227/dd3d6134/attachment-0001.html>