Kouhei Sutou
null+****@clear*****
Tue Mar 17 18:46:19 JST 2015
Kouhei Sutou 2015-03-17 18:46:19 +0900 (Tue, 17 Mar 2015) New Revision: cdfdb040a263ab4bb10ddb342317718e808364e4 https://github.com/groonga/groonga/commit/cdfdb040a263ab4bb10ddb342317718e808364e4 Message: doc: add document about regular expression Added files: doc/source/example/reference/regular_expression/anchor_z.log doc/source/example/reference/regular_expression/character_class_characters.log doc/source/example/reference/regular_expression/character_class_range.log doc/source/example/reference/regular_expression/choice.log doc/source/example/reference/regular_expression/group_back_reference.log doc/source/example/reference/regular_expression/group_scope_reducing.log doc/source/example/reference/regular_expression/index_definitions.log doc/source/example/reference/regular_expression/quantifier_plus.log doc/source/example/reference/regular_expression/search_by_index_filter.log doc/source/example/reference/regular_expression/search_by_index_query.log doc/source/example/reference/regular_expression/usage_filter.log doc/source/example/reference/regular_expression/usage_query.log doc/source/example/reference/regular_expression/usage_setup.log doc/source/reference/regular_expression.rst Added: doc/source/example/reference/regular_expression/anchor_z.log (+31 -0) 100644 =================================================================== --- /dev/null +++ doc/source/example/reference/regular_expression/anchor_z.log 2015-03-17 18:46:19 +0900 (b537ee6) @@ -0,0 +1,31 @@ +Execution example:: + + select Logs --filter 'message @~ "%\\\\z"' + # [ + # [ + # 0, + # 1337566253.89858, + # 0.000355720520019531 + # ], + # [ + # [ + # [ + # 1 + # ], + # [ + # [ + # "_id", + # "UInt32" + # ], + # [ + # "message", + # "Text" + # ] + # ], + # [ + # 2, + # "host1:[warning]: Remained disk space is less than 30%" + # ] + # ] + # ] + # ] Added: doc/source/example/reference/regular_expression/character_class_characters.log (+35 -0) 100644 
=================================================================== --- /dev/null +++ doc/source/example/reference/regular_expression/character_class_characters.log 2015-03-17 18:46:19 +0900 (e23d5b2) @@ -0,0 +1,35 @@ +Execution example:: + + select Logs --filter 'message @~ "[Dd]isk"' + # [ + # [ + # 0, + # 1337566253.89858, + # 0.000355720520019531 + # ], + # [ + # [ + # [ + # 2 + # ], + # [ + # [ + # "_id", + # "UInt32" + # ], + # [ + # "message", + # "Text" + # ] + # ], + # [ + # 2, + # "host1:[warning]: Remained disk space is less than 30%" + # ], + # [ + # 3, + # "host1:[error]: Disk full" + # ] + # ] + # ] + # ] Added: doc/source/example/reference/regular_expression/character_class_range.log (+31 -0) 100644 =================================================================== --- /dev/null +++ doc/source/example/reference/regular_expression/character_class_range.log 2015-03-17 18:46:19 +0900 (d4b0553) @@ -0,0 +1,31 @@ +Execution example:: + + select Logs --filter 'message @~ "[0-9][0-9]%"' + # [ + # [ + # 0, + # 1337566253.89858, + # 0.000355720520019531 + # ], + # [ + # [ + # [ + # 1 + # ], + # [ + # [ + # "_id", + # "UInt32" + # ], + # [ + # "message", + # "Text" + # ] + # ], + # [ + # 2, + # "host1:[warning]: Remained disk space is less than 30%" + # ] + # ] + # ] + # ] Added: doc/source/example/reference/regular_expression/choice.log (+35 -0) 100644 =================================================================== --- /dev/null +++ doc/source/example/reference/regular_expression/choice.log 2015-03-17 18:46:19 +0900 (ea17b9f) @@ -0,0 +1,35 @@ +Execution example:: + + select Logs --filter 'message @~ "warning|info"' + # [ + # [ + # 0, + # 1337566253.89858, + # 0.000355720520019531 + # ], + # [ + # [ + # [ + # 2 + # ], + # [ + # [ + # "_id", + # "UInt32" + # ], + # [ + # "message", + # "Text" + # ] + # ], + # [ + # 2, + # "host1:[warning]: Remained disk space is less than 30%" + # ], + # [ + # 5, + # "host2:[info]: Shutdown" + # ] + # ] + # ] + # ] Added: 
doc/source/example/reference/regular_expression/group_back_reference.log (+39 -0) 100644 =================================================================== --- /dev/null +++ doc/source/example/reference/regular_expression/group_back_reference.log 2015-03-17 18:46:19 +0900 (eec7c41) @@ -0,0 +1,39 @@ +Execution example:: + + select Logs --filter 'message @~ "e(r)\\\\1o\\\\1"' + # [ + # [ + # 0, + # 1337566253.89858, + # 0.000355720520019531 + # ], + # [ + # [ + # [ + # 3 + # ], + # [ + # [ + # "_id", + # "UInt32" + # ], + # [ + # "message", + # "Text" + # ] + # ], + # [ + # 1, + # "host1:[error]: No memory" + # ], + # [ + # 3, + # "host1:[error]: Disk full" + # ], + # [ + # 4, + # "host2:[error]: No memory" + # ] + # ] + # ] + # ] Added: doc/source/example/reference/regular_expression/group_scope_reducing.log (+35 -0) 100644 =================================================================== --- /dev/null +++ doc/source/example/reference/regular_expression/group_scope_reducing.log 2015-03-17 18:46:19 +0900 (297d935) @@ -0,0 +1,35 @@ +Execution example:: + + select Logs --filter 'message @~ "\\\\[(warning|info)\\\\]"' + # [ + # [ + # 0, + # 1337566253.89858, + # 0.000355720520019531 + # ], + # [ + # [ + # [ + # 2 + # ], + # [ + # [ + # "_id", + # "UInt32" + # ], + # [ + # "message", + # "Text" + # ] + # ], + # [ + # 2, + # "host1:[warning]: Remained disk space is less than 30%" + # ], + # [ + # 5, + # "host2:[info]: Shutdown" + # ] + # ] + # ] + # ] Added: doc/source/example/reference/regular_expression/index_definitions.log (+9 -0) 100644 =================================================================== --- /dev/null +++ doc/source/example/reference/regular_expression/index_definitions.log 2015-03-17 18:46:19 +0900 (303fdb0) @@ -0,0 +1,9 @@ +Execution example:: + + table_create RegexpLexicon TABLE_PAT_KEY ShortText \ + --default_tokenizer TokenRegexp \ + --normalizer NormalizerAuto + # [[0, 1337566253.89858, 0.000355720520019531], true] + column_create 
RegexpLexicon logs_message_index \ + COLUMN_INDEX|WITH_POSITION Logs message + # [[0, 1337566253.89858, 0.000355720520019531], true] Added: doc/source/example/reference/regular_expression/quantifier_plus.log (+39 -0) 100644 =================================================================== --- /dev/null +++ doc/source/example/reference/regular_expression/quantifier_plus.log 2015-03-17 18:46:19 +0900 (c366548) @@ -0,0 +1,39 @@ +Execution example:: + + select Logs --filter 'message @~ "er+or"' + # [ + # [ + # 0, + # 1337566253.89858, + # 0.000355720520019531 + # ], + # [ + # [ + # [ + # 3 + # ], + # [ + # [ + # "_id", + # "UInt32" + # ], + # [ + # "message", + # "Text" + # ] + # ], + # [ + # 1, + # "host1:[error]: No memory" + # ], + # [ + # 3, + # "host1:[error]: Disk full" + # ], + # [ + # 4, + # "host2:[error]: No memory" + # ] + # ] + # ] + # ] Added: doc/source/example/reference/regular_expression/search_by_index_filter.log (+39 -0) 100644 =================================================================== --- /dev/null +++ doc/source/example/reference/regular_expression/search_by_index_filter.log 2015-03-17 18:46:19 +0900 (961df57) @@ -0,0 +1,39 @@ +Execution example:: + + select Logs --filter 'message @~ "\\\\Ahost1:"' + # [ + # [ + # 0, + # 1337566253.89858, + # 0.000355720520019531 + # ], + # [ + # [ + # [ + # 3 + # ], + # [ + # [ + # "_id", + # "UInt32" + # ], + # [ + # "message", + # "Text" + # ] + # ], + # [ + # 1, + # "host1:[error]: No memory" + # ], + # [ + # 2, + # "host1:[warning]: Remained disk space is less than 30%" + # ], + # [ + # 3, + # "host1:[error]: Disk full" + # ] + # ] + # ] + # ] Added: doc/source/example/reference/regular_expression/search_by_index_query.log (+39 -0) 100644 =================================================================== --- /dev/null +++ doc/source/example/reference/regular_expression/search_by_index_query.log 2015-03-17 18:46:19 +0900 (a85281f) @@ -0,0 +1,39 @@ +Execution example:: + + select Logs --query 
message:~\\\\Ahost1 + # [ + # [ + # 0, + # 1337566253.89858, + # 0.000355720520019531 + # ], + # [ + # [ + # [ + # 3 + # ], + # [ + # [ + # "_id", + # "UInt32" + # ], + # [ + # "message", + # "Text" + # ] + # ], + # [ + # 1, + # "host1:[error]: No memory" + # ], + # [ + # 2, + # "host1:[warning]: Remained disk space is less than 30%" + # ], + # [ + # 3, + # "host1:[error]: Disk full" + # ] + # ] + # ] + # ] Added: doc/source/example/reference/regular_expression/usage_filter.log (+35 -0) 100644 =================================================================== --- /dev/null +++ doc/source/example/reference/regular_expression/usage_filter.log 2015-03-17 18:46:19 +0900 (e23d5b2) @@ -0,0 +1,35 @@ +Execution example:: + + select Logs --filter 'message @~ "[Dd]isk"' + # [ + # [ + # 0, + # 1337566253.89858, + # 0.000355720520019531 + # ], + # [ + # [ + # [ + # 2 + # ], + # [ + # [ + # "_id", + # "UInt32" + # ], + # [ + # "message", + # "Text" + # ] + # ], + # [ + # 2, + # "host1:[warning]: Remained disk space is less than 30%" + # ], + # [ + # 3, + # "host1:[error]: Disk full" + # ] + # ] + # ] + # ] Added: doc/source/example/reference/regular_expression/usage_query.log (+35 -0) 100644 =================================================================== --- /dev/null +++ doc/source/example/reference/regular_expression/usage_query.log 2015-03-17 18:46:19 +0900 (e63185b) @@ -0,0 +1,35 @@ +Execution example:: + + select Logs --query 'message:~"[Dd]isk"' + # [ + # [ + # 0, + # 1337566253.89858, + # 0.000355720520019531 + # ], + # [ + # [ + # [ + # 2 + # ], + # [ + # [ + # "_id", + # "UInt32" + # ], + # [ + # "message", + # "Text" + # ] + # ], + # [ + # 2, + # "host1:[warning]: Remained disk space is less than 30%" + # ], + # [ + # 3, + # "host1:[error]: Disk full" + # ] + # ] + # ] + # ] Added: doc/source/example/reference/regular_expression/usage_setup.log (+15 -0) 100644 =================================================================== --- /dev/null +++ 
doc/source/example/reference/regular_expression/usage_setup.log 2015-03-17 18:46:19 +0900 (5d76eec) @@ -0,0 +1,15 @@ +Execution example:: + + table_create Logs TABLE_NO_KEY + # [[0, 1337566253.89858, 0.000355720520019531], true] + column_create Logs message COLUMN_SCALAR Text + # [[0, 1337566253.89858, 0.000355720520019531], true] + load --table Logs + [ + {"message": "host1:[error]: No memory"}, + {"message": "host1:[warning]: Remained disk space is less than 30%"}, + {"message": "host1:[error]: Disk full"}, + {"message": "host2:[error]: No memory"}, + {"message": "host2:[info]: Shutdown"} + ] + # [[0, 1337566253.89858, 0.000355720520019531], 5] Added: doc/source/reference/regular_expression.rst (+397 -0) 100644 =================================================================== --- /dev/null +++ doc/source/reference/regular_expression.rst 2015-03-17 18:46:19 +0900 (406dd5b) @@ -0,0 +1,397 @@ +.. -*- rst -*- + +.. highlightlang:: none + +.. groonga-command +.. database: regular_expression + +Regular expression +================== + +Summary +------- + +Groonga supports pattern match by regular expression. Regular +expression is a widely used format to describe a pattern. Regular +expression is useful to represent complex patterns. + +In most cases, pattern match by regular expression is evaluated as +sequential search. It'll be slow for many records and many texts. + +In some cases, pattern match by regular expression can be evaluated +by index. It's much faster than sequential search. Patterns +that can be evaluated by index are described later. + +There are many regular expression syntaxes. Groonga uses the same +syntax as Ruby because Groonga uses the same regular expression +engine as Ruby. The regular expression engine is `Onigmo +<https://github.com/k-takata/Onigmo/>`_. A characteristic difference +from other regular expression syntaxes is the meaning of ``^`` and ``$``.
The regular +expression syntax in Ruby treats ``^`` as the beginning of line and ``$`` +as the end of line. ``^`` means the beginning of text and ``$`` +means the end of text in most other regular expression syntaxes. The regular +expression syntax in Ruby uses ``\A`` for the beginning of text and +``\z`` for the end of text. + +You can use regular expression in :ref:`select-query` and +:ref:`select-filter` options of :doc:`/reference/commands/select` +command. + +Usage +----- + +Here are a schema definition and sample data to show usage. There is +only one table, ``Logs``. ``Logs`` table has only ``message`` +column. Log messages are stored into the ``message`` column. + +.. groonga-command +.. include:: ../example/reference/regular_expression/usage_setup.log +.. table_create Logs TABLE_NO_KEY +.. column_create Logs message COLUMN_SCALAR Text +.. +.. load --table Logs +.. [ +.. {"message": "host1:[error]: No memory"}, +.. {"message": "host1:[warning]: Remained disk space is less than 30%"}, +.. {"message": "host1:[error]: Disk full"}, +.. {"message": "host2:[error]: No memory"}, +.. {"message": "host2:[info]: Shutdown"} +.. ] + +Here is an example that uses regular expression in +:ref:`select-query`. You need to use +``${COLUMN}:~${REGULAR_EXPRESSION}`` syntax. + +.. groonga-command +.. include:: ../example/reference/regular_expression/usage_query.log +.. select Logs --query 'message:~"[Dd]isk"' + +Here is an example that uses regular expression in +:ref:`select-filter`. You need to use +``${COLUMN} @~ ${REGULAR_EXPRESSION}`` syntax. + +.. groonga-command +.. include:: ../example/reference/regular_expression/usage_filter.log +.. select Logs --filter 'message @~ "[Dd]isk"' + +Index +----- + +Groonga can search records by regular expression with index. It's much +faster than sequential search. + +But it doesn't support all regular expression patterns. It supports +only the following regular expression patterns. More patterns will be +supported in the future.
+ + * Literal only pattern such as ``disk`` + * The beginning of text and literal only pattern such as ``\Adisk`` + * The end of text and literal only pattern such as ``disk\z`` + +You need to create an index for fast regular expression search. Here +are the requirements for the index: + + * Lexicon must be :ref:`table-pat-key` table. + * Lexicon must use :ref:`token-regexp` tokenizer. + * Index column must have ``WITH_POSITION`` flag. + +Other configurations such as lexicon's normalizer are optional. You +can choose what you like. If you want to use case-insensitive search, +use :ref:`normalizer-auto` normalizer. + +Here are recommended index definitions. In general, they are reasonable +index definitions. + +.. groonga-command +.. include:: ../example/reference/regular_expression/index_definitions.log +.. table_create RegexpLexicon TABLE_PAT_KEY ShortText \ +.. --default_tokenizer TokenRegexp \ +.. --normalizer NormalizerAuto +.. column_create RegexpLexicon logs_message_index \ +.. COLUMN_INDEX|WITH_POSITION Logs message + +Now, you can use index for regular expression search. The following +regular expression can be evaluated by index because it uses only "the +beginning of text" and "literal". + +.. groonga-command +.. include:: ../example/reference/regular_expression/search_by_index_query.log +.. select Logs --query message:~\\\\Ahost1 + +Here is an example that uses :ref:`select-filter` instead of +:ref:`select-query`. It uses the same regular expression as the +previous example. + +.. groonga-command +.. include:: ../example/reference/regular_expression/search_by_index_filter.log +.. select Logs --filter 'message @~ "\\\\Ahost1:"' + +``\`` escape will confuse you because there are some steps that +require escape between you and Groonga.
Here are steps that require +``\`` escape: + + * Shell, only when you pass a Groonga command from the command line like the + following:: + + % groonga /tmp/db select Logs --filter '"message @~ \"\\\\Ahost1:\""' + + ``--filter '"message @~ \"\\\\Ahost1:\""'`` is evaluated as the + following two arguments by shell: + + * ``--filter`` + * ``"message @~ \"\\\\Ahost1:\""`` + + * Groonga command parser, only when you pass a Groonga command in + command line style (``COMMAND ARG1_VALUE ARG2_VALUE ...``), not + HTTP path style + (``/d/COMMAND?ARG1_NAME=ARG1_VALUE&ARG2_NAME=ARG2_VALUE``). + + ``"message @~ \"\\\\Ahost1:\""`` is evaluated as the following + value by Groonga command parser: + + * ``message @~ "\\Ahost1:"`` + + * :doc:`/reference/grn_expr` parser. ``\`` escape is required in both + :doc:`/reference/grn_expr/query_syntax` and + :doc:`/reference/grn_expr/script_syntax`. + + ``"\\Ahost1:"`` string literal in script syntax is evaluated as + the following value: + + * ``\Ahost1:`` + + The value is evaluated as regular expression. + +Syntax +------ + +This section describes only commonly used syntaxes. See `Onigmo +syntax documentation +<https://github.com/k-takata/Onigmo/blob/master/doc/RE>`_ for other +syntaxes and details. + +.. _regular-expression-escape: + +Escape +^^^^^^ + +In regular expression, there are the following special characters: + + * ``\`` + * ``|`` + * ``(`` + * ``)`` + * ``[`` + * ``]`` + * ``.`` + * ``*`` + * ``+`` + * ``?`` + * ``{`` + * ``}`` + * ``^`` + * ``$`` + +If you want to write a pattern that matches these special characters as +they are, you need to escape them. + +You can escape them by putting ``\`` before the special character. Here +are regular expressions that match the special character itself: + + * ``\\`` + * ``\|`` + * ``\(`` + * ``\)`` + * ``\[`` + * ``\]`` + * ``\.`` + * ``\*`` + * ``\+`` + * ``\?`` + * ``\{`` + * ``\}`` + * ``\^`` + * ``\$`` + +.. groonga-command +.. include:: ../example/reference/regular_expression/choice.log +..
select Logs --filter 'message @~ "\\\\[error\\\\]"' + +If your regular expression doesn't work as you expected, check whether +some special characters are used without escaping. + +.. _regular-expression-choice: + +Choice +^^^^^^ + +Choice syntax is ``A|B``. The regular expression matches when either +``A`` pattern or ``B`` pattern is matched. + +.. groonga-command +.. include:: ../example/reference/regular_expression/choice.log +.. select Logs --filter 'message @~ "warning|info"' + +.. caution:: + + Regular expression that uses this syntax can't be evaluated by + index. + +.. _regular-expression-group: + +Group +^^^^^ + +Group syntax is ``(...)``. Group provides the following features: + + * Back reference + * Scope reducing + +You can refer to matched groups by ``\n`` (``n`` is the group number) +syntax. For example, ``e(r)\1o\1`` matches ``error`` because ``\1`` +is replaced with the match result (``r``) of the first group ``(r)``. + +.. groonga-command +.. include:: ../example/reference/regular_expression/group_back_reference.log +.. select Logs --filter 'message @~ "e(r)\\\\1o\\\\1"' + +You can also use more powerful back reference features. See `"8. Back +reference" section in Onigmo documentation +<https://github.com/k-takata/Onigmo/blob/master/doc/RE#L302>`_ for +details. + +Group syntax reduces scope. For example, ``\[(warning|info)\]`` +reduces choice syntax scope. The regular expression matches +``[warning]`` and ``[info]``. + +.. groonga-command +.. include:: ../example/reference/regular_expression/group_scope_reducing.log +.. select Logs --filter 'message @~ "\\\\[(warning|info)\\\\]"' + +You can also use more powerful group related features. See +`"7. Extended groups" section in Onigmo documentation +<https://github.com/k-takata/Onigmo/blob/master/doc/RE#L225>`_ for +details. + +.. caution:: + + Regular expression that uses this syntax can't be evaluated by + index. + +..
_regular-expression-character-class: + +Character class +^^^^^^^^^^^^^^^ + +Character class syntax is ``[...]``. Character class is useful to +specify multiple characters to be matched. + +For example, ``[Dd]`` matches ``D`` or ``d``. + +.. groonga-command +.. include:: ../example/reference/regular_expression/character_class_characters.log +.. select Logs --filter 'message @~ "[Dd]isk"' + +You can specify characters by range. For example, ``[0-9]`` matches +one digit. + +.. groonga-command +.. include:: ../example/reference/regular_expression/character_class_range.log +.. select Logs --filter 'message @~ "[0-9][0-9]%"' + +You can also use more powerful character class related features. See +`"6. Character class" section in Onigmo documentation +<https://github.com/k-takata/Onigmo/blob/master/doc/RE#L164>`_ for +details. + +.. caution:: + + Regular expression that uses this syntax can't be evaluated by + index. + +.. _regular-expression-anchor: + +Anchor +^^^^^^ + +There are the following commonly used anchor syntaxes. Some anchors +can be evaluated by index. + +.. list-table:: + :header-rows: 1 + + * - Anchor + - Description + - Index ready + * - ``^`` + - The beginning of line + - No + * - ``$`` + - The end of line + - No + * - ``\A`` + - The beginning of text + - Yes + * - ``\z`` + - The end of text + - Yes + +Here is an example that uses ``\z``. + +.. groonga-command +.. include:: ../example/reference/regular_expression/anchor_z.log +.. select Logs --filter 'message @~ "%\\\\z"' + +You can also use more anchors. See `"5. Anchors" section in Onigmo +documentation +<https://github.com/k-takata/Onigmo/blob/master/doc/RE#L152>`_ for +details. + +.. caution:: + + Regular expression that uses this syntax except ``\A`` and ``\z`` + can't be evaluated by index. + +.. _regular-expression-quantifier: + +Quantifier +^^^^^^^^^^ + +There are the following commonly used quantifier syntaxes. + +..
list-table:: + :header-rows: 1 + + * - Quantifier + - Description + * - ``?`` + - 0 or 1 time + * - ``*`` + - 0 or more times + * - ``+`` + - 1 or more times + +For example, ``er+or`` matches ``error``, ``errror`` and so on. + +.. groonga-command +.. include:: ../example/reference/regular_expression/quantifier_plus.log +.. select Logs --filter 'message @~ "er+or"' + +You can also use more quantifiers. See `"4. Quantifier" section in Onigmo +documentation +<https://github.com/k-takata/Onigmo/blob/master/doc/RE#L119>`_ for +details. + +.. caution:: + + Regular expression that uses this syntax can't be evaluated by + index. + +Others +^^^^^^ + +There are more syntaxes. If you're interested in them, see `Onigmo +documentation +<https://github.com/k-takata/Onigmo/blob/master/doc/RE>`_ for +details. You may be interested in "character type" and "character" +syntaxes. -------------- next part -------------- HTMLの添付ファイルを保管しました... Download