[Groonga-commit] groonga/groonga at 9d75c4d [master] Add html_untag() function

Back to archive index

Kouhei Sutou null+****@clear*****
Wed Jun 26 17:58:01 JST 2013


Kouhei Sutou	2013-06-26 17:58:01 +0900 (Wed, 26 Jun 2013)

  New Revision: 9d75c4df92bc9c9323ee21f577ce059b1b65ddb9
  https://github.com/groonga/groonga/commit/9d75c4df92bc9c9323ee21f577ce059b1b65ddb9

  Message:
    Add html_untag() function
    
    It strips HTML tags from the given HTML and outputs only the text.
    
    TODO:
    - Support attributes
    - Support nested elements

  Added files:
    test/command/suite/select/function/html_untag/simple.expected
    test/command/suite/select/function/html_untag/simple.test
  Modified files:
    lib/proc.c

  Modified: lib/proc.c (+47 -0)
===================================================================
--- lib/proc.c    2013-06-26 16:31:07 +0900 (6d43027)
+++ lib/proc.c    2013-06-26 17:58:01 +0900 (8a8b5d8)
@@ -4011,6 +4011,50 @@ selector_sub_filter(grn_ctx *ctx, grn_obj *table, grn_obj *index,
   return run_sub_filter(ctx, table, nargs - 1, args + 1, res, op);
 }
 
+static grn_obj * /* html_untag(): returns a text object holding args[0] with tags removed, or NULL on error */
+func_html_untag(grn_ctx *ctx, int nargs, grn_obj **args,
+                grn_user_data *user_data) /* user_data is not referenced in this body */
+{ /* Copies the input byte by byte, dropping '<', '>' and every byte between a '<' and the next '>'. */
+  grn_obj *html; /* input value (args[0]) */
+  grn_obj *text; /* output value that receives the bytes outside tags */
+  const char *html_raw; /* raw bytes of the input */
+  int i, length; /* NOTE(review): GRN_TEXT_LEN's result type is not visible here -- confirm int cannot truncate it */
+  grn_bool in_tag = GRN_FALSE; /* GRN_TRUE from a '<' until the next '>' */
+
+  if (nargs != 1) { /* exactly one argument is accepted; zero or several are rejected alike */
+    ERR(GRN_INVALID_ARGUMENT, "HTML is missing");
+    return NULL;
+  }
+
+  /* TODO: type check -- args[0] is read as text below without verifying its type */
+  html = args[0];
+
+  text = GRN_PROC_ALLOC(html->header.domain, 0); /* result is requested with the input's domain; presumably owned by the proc framework -- confirm */
+  if (!text) { /* allocation failed: return NULL without raising a further error here */
+    return NULL;
+  }
+
+  html_raw = GRN_TEXT_VALUE(html);
+  length = GRN_TEXT_LEN(html);
+  for (i = 0; i < length; i++) { /* single pass over the input bytes */
+    switch (html_raw[i]) {
+    case '<' : /* tag opens; the '<' itself is never copied */
+      in_tag = GRN_TRUE;
+      break;
+    case '>' : /* any '>' closes the tag, even one inside a quoted attribute value; a stray '>' outside a tag is dropped too */
+      in_tag = GRN_FALSE;
+      break;
+    default :
+      if (!in_tag) { /* only bytes outside a tag reach the output */
+        GRN_TEXT_PUTC(ctx, text, html_raw[i]);
+      }
+      break;
+    }
+  }
+
+  return text; /* an unterminated '<' leaves in_tag set, so the rest of the input is dropped */
+}
+
 #define DEF_VAR(v,name_str) do {\
   (v).name = (name_str);\
   (v).name_size = GRN_STRLEN(name_str);\
@@ -4217,4 +4261,7 @@ grn_db_init_builtin_query(grn_ctx *ctx)
                                     func_sub_filter, NULL, NULL, 0, NULL);
     grn_proc_set_selector(ctx, selector_proc, selector_sub_filter);
   }
+
+  grn_proc_create(ctx, "html_untag", -1, GRN_PROC_FUNCTION,
+                  func_html_untag, NULL, NULL, 0, NULL);
 }

  Added: test/command/suite/select/function/html_untag/simple.expected (+11 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/select/function/html_untag/simple.expected    2013-06-26 17:58:01 +0900 (9600ba0)
@@ -0,0 +1,11 @@
+table_create Entries TABLE_NO_KEY
+[[0,0.0,0.0],true]
+column_create Entries content COLUMN_SCALAR Text
+[[0,0.0,0.0],true]
+load --table Entries
+[
+{"content": "I <em>am</em> a boy."}
+]
+[[0,0.0,0.0],1]
+select Entries   --output_columns "html_untag(content)"   --command_version 2
+[[0,0.0,0.0],[[[1],[["html_untag","null"]],["I am a boy."]]]]

  Added: test/command/suite/select/function/html_untag/simple.test (+11 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/select/function/html_untag/simple.test    2013-06-26 17:58:01 +0900 (cc736c0)
@@ -0,0 +1,11 @@
+table_create Entries TABLE_NO_KEY
+column_create Entries content COLUMN_SCALAR Text
+
+load --table Entries
+[
+{"content": "I <em>am</em> a boy."}
+]
+
+select Entries \
+  --output_columns "html_untag(content)" \
+  --command_version 2
-------------- next part --------------
HTMLの添付ファイルを保管しました...
Download 



More information about the Groonga-commit mailing list
Back to archive index