[Groonga-commit] groonga/groonga at b7b522a [master] Partially support keyword extraction from regexp search

Back to archive index

Kouhei Sutou null+****@clear*****
Sat Dec 9 17:00:27 JST 2017


Kouhei Sutou	2017-12-09 17:00:27 +0900 (Sat, 09 Dec 2017)

  New Revision: b7b522a1703688ceb5ca448f814bed04c4f3f8f6
  https://github.com/groonga/groonga/commit/b7b522a1703688ceb5ca448f814bed04c4f3f8f6

  Message:
    Partially support keyword extraction from regexp search
    
    GitHub: fix #787
    
    It enables highlight_html and snippet_html for regexp search.
    
    Reported by takagi01. Thanks!!!

  Added files:
    test/command/suite/select/function/highlight_html/regexp.expected
    test/command/suite/select/function/highlight_html/regexp.test
  Modified files:
    lib/expr.c

  Modified: lib/expr.c (+98 -0)
===================================================================
--- lib/expr.c    2017-12-08 19:16:48 +0900 (0c38e0ae2)
+++ lib/expr.c    2017-12-09 17:00:27 +0900 (738948c51)
@@ -9076,6 +9076,94 @@ grn_expr_syntax_expand_query_by_table(grn_ctx *ctx,
   GRN_API_RETURN(ctx->rc);
 }
 
+/*
+ * Extracts literal keywords from the regexp pattern in `regexp` and
+ * appends each non-empty one to `keywords` as a GRN_DB_TEXT element.
+ *
+ * TODO: It's a very loose implementation. It just splits the regexp
+ * at meta characters to extract keywords. For example, "r.*nga" has
+ * "r" and "nga" keywords. Only ".", "*", "\A" and "\z" are treated as
+ * meta characters; other characters are kept as literal keyword text.
+ */
+static void
+grn_expr_get_keywords_regexp(grn_ctx *ctx, grn_obj *keywords, grn_obj *regexp)
+{
+  const char *regexp_raw;
+  const char *regexp_raw_end;
+  grn_bool escaping = GRN_FALSE;
+  grn_obj keyword;
+
+  regexp_raw = GRN_TEXT_VALUE(regexp);
+  regexp_raw_end = regexp_raw + GRN_TEXT_LEN(regexp);
+
+  GRN_TEXT_INIT(&keyword, 0);
+  while (regexp_raw < regexp_raw_end) {
+    unsigned int char_len;
+    grn_bool is_separator = GRN_FALSE;
+
+    char_len = grn_charlen(ctx, regexp_raw, regexp_raw_end);
+    if (char_len == 0) {
+      /* regexp_raw would never advance: stop instead of spinning. */
+      break;
+    }
+
+    if (char_len > 1) {
+      /* Multibyte characters are always copied as literal text. */
+      escaping = GRN_FALSE;
+      GRN_TEXT_PUT(ctx, &keyword, regexp_raw, char_len);
+    } else if (escaping) {
+      escaping = GRN_FALSE;
+      switch (regexp_raw[0]) {
+      case 'A' :
+      case 'z' :
+        /* "\A" and "\z" are not literal text: end the keyword. */
+        is_separator = GRN_TRUE;
+        break;
+      default :
+        GRN_TEXT_PUTC(ctx, &keyword, regexp_raw[0]);
+        break;
+      }
+    } else {
+      switch (regexp_raw[0]) {
+      case '.' :
+      case '*' :
+        /* "a.b" must yield "a" and "b", not the bogus keyword "ab". */
+        is_separator = GRN_TRUE;
+        break;
+      case '\\' :
+        escaping = GRN_TRUE;
+        break;
+      default :
+        GRN_TEXT_PUTC(ctx, &keyword, regexp_raw[0]);
+        break;
+      }
+    }
+
+    if (is_separator && GRN_TEXT_LEN(&keyword) > 0) {
+      grn_vector_add_element(ctx,
+                             keywords,
+                             GRN_TEXT_VALUE(&keyword),
+                             GRN_TEXT_LEN(&keyword),
+                             0,
+                             GRN_DB_TEXT);
+      GRN_BULK_REWIND(&keyword);
+    }
+
+    regexp_raw += char_len;
+  }
+
+  if (GRN_TEXT_LEN(&keyword) > 0) {
+    grn_vector_add_element(ctx,
+                           keywords,
+                           GRN_TEXT_VALUE(&keyword),
+                           GRN_TEXT_LEN(&keyword),
+                           0,
+                           GRN_DB_TEXT);
+  }
+
+  GRN_OBJ_FIN(ctx, &keyword);
+}
+
 grn_rc
 grn_expr_get_keywords(grn_ctx *ctx, grn_obj *expr, grn_obj *keywords)
 {
@@ -9111,6 +9199,16 @@ grn_expr_get_keywords(grn_ctx *ctx, grn_obj *expr, grn_obj *keywords)
                                      GRN_DB_TEXT);
             }
             break;
+          case GRN_OP_REGEXP :
+            /* TODO: It should be refined. */
+            if (is_index_searchable_regexp(ctx, si->query)) {
+              if (keywords->header.type == GRN_PVECTOR) {
+                GRN_PTR_PUT(ctx, keywords, si->query);
+              } else {
+                grn_expr_get_keywords_regexp(ctx, keywords, si->query);
+              }
+            }
+            break;
           case GRN_OP_SIMILAR :
             if (keywords->header.type == GRN_VECTOR &&
                 GRN_BULK_VSIZE(&(si->index)) > 0) {

  Added: test/command/suite/select/function/highlight_html/regexp.expected (+37 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/select/function/highlight_html/regexp.expected    2017-12-09 17:00:27 +0900 (265cb9d70)
@@ -0,0 +1,37 @@
+table_create Entries TABLE_NO_KEY
+[[0,0.0,0.0],true]
+column_create Entries body COLUMN_SCALAR ShortText
+[[0,0.0,0.0],true]
+table_create Terms TABLE_PAT_KEY ShortText --default_tokenizer TokenRegexp --normalizer NormalizerAuto
+[[0,0.0,0.0],true]
+column_create Terms document_index COLUMN_INDEX|WITH_POSITION Entries body
+[[0,0.0,0.0],true]
+load --table Entries
+[
+{"body": "Mroonga is a MySQL storage engine based on Groonga. <b>Rroonga</b> is a Ruby binding of Groonga."}
+]
+[[0,0.0,0.0],1]
+select Entries   --filter "body @~ 'ro.*ga'"   --output_columns "highlight_html(body)"
+[
+  [
+    0,
+    0.0,
+    0.0
+  ],
+  [
+    [
+      [
+        1
+      ],
+      [
+        [
+          "highlight_html",
+          null
+        ]
+      ],
+      [
+        "M<span class=\"keyword\">ro</span>on<span class=\"keyword\">ga</span> is a MySQL storage engine based on G<span class=\"keyword\">ro</span>on<span class=\"keyword\">ga</span>. &lt;b&gt;R<span class=\"keyword\">ro</span>on<span class=\"keyword\">ga</span>&lt;/b&gt; is a Ruby binding of G<span class=\"keyword\">ro</span>on<span class=\"keyword\">ga</span>."
+      ]
+    ]
+  ]
+]

  Added: test/command/suite/select/function/highlight_html/regexp.test (+14 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/select/function/highlight_html/regexp.test    2017-12-09 17:00:27 +0900 (77eb47652)
@@ -0,0 +1,14 @@
+table_create Entries TABLE_NO_KEY
+column_create Entries body COLUMN_SCALAR ShortText
+
+table_create Terms TABLE_PAT_KEY ShortText --default_tokenizer TokenRegexp --normalizer NormalizerAuto
+column_create Terms document_index COLUMN_INDEX|WITH_POSITION Entries body
+
+load --table Entries
+[
+{"body": "Mroonga is a MySQL storage engine based on Groonga. <b>Rroonga</b> is a Ruby binding of Groonga."}
+]
+
+select Entries \
+  --filter "body @~ 'ro.*ga'" \
+  --output_columns "highlight_html(body)"
-------------- next part --------------
HTMLの添付ファイルを保管しました...
URL: https://lists.osdn.me/mailman/archives/groonga-commit/attachments/20171209/ec455164/attachment-0001.htm 



More information about the Groonga-commit mailing list
Back to archive index