Kouhei Sutou
null+****@clear*****
Wed Jun 26 17:58:01 JST 2013
Kouhei Sutou 2013-06-26 17:58:01 +0900 (Wed, 26 Jun 2013) New Revision: 9d75c4df92bc9c9323ee21f577ce059b1b65ddb9 https://github.com/groonga/groonga/commit/9d75c4df92bc9c9323ee21f577ce059b1b65ddb9 Message: Add html_untag() function It strips HTML tag from HTML and outputs only text. TODO: - Support attribute - Support nested element Added files: test/command/suite/select/function/html_untag/simple.expected test/command/suite/select/function/html_untag/simple.test Modified files: lib/proc.c Modified: lib/proc.c (+47 -0) =================================================================== --- lib/proc.c 2013-06-26 16:31:07 +0900 (6d43027) +++ lib/proc.c 2013-06-26 17:58:01 +0900 (8a8b5d8) @@ -4011,6 +4011,50 @@ selector_sub_filter(grn_ctx *ctx, grn_obj *table, grn_obj *index, return run_sub_filter(ctx, table, nargs - 1, args + 1, res, op); } +static grn_obj * +func_html_untag(grn_ctx *ctx, int nargs, grn_obj **args, + grn_user_data *user_data) +{ + grn_obj *html; + grn_obj *text; + const char *html_raw; + int i, length; + grn_bool in_tag = GRN_FALSE; + + if (nargs != 1) { + ERR(GRN_INVALID_ARGUMENT, "HTML is missing"); + return NULL; + } + + /* TODO: type check */ + html = args[0]; + + text = GRN_PROC_ALLOC(html->header.domain, 0); + if (!text) { + return NULL; + } + + html_raw = GRN_TEXT_VALUE(html); + length = GRN_TEXT_LEN(html); + for (i = 0; i < length; i++) { + switch (html_raw[i]) { + case '<' : + in_tag = GRN_TRUE; + break; + case '>' : + in_tag = GRN_FALSE; + break; + default : + if (!in_tag) { + GRN_TEXT_PUTC(ctx, text, html_raw[i]); + } + break; + } + } + + return text; +} + #define DEF_VAR(v,name_str) do {\ (v).name = (name_str);\ (v).name_size = GRN_STRLEN(name_str);\ @@ -4217,4 +4261,7 @@ grn_db_init_builtin_query(grn_ctx *ctx) func_sub_filter, NULL, NULL, 0, NULL); grn_proc_set_selector(ctx, selector_proc, selector_sub_filter); } + + grn_proc_create(ctx, "html_untag", -1, GRN_PROC_FUNCTION, + func_html_untag, NULL, NULL, 0, NULL); } Added: 
test/command/suite/select/function/html_untag/simple.expected (+11 -0) 100644 =================================================================== --- /dev/null +++ test/command/suite/select/function/html_untag/simple.expected 2013-06-26 17:58:01 +0900 (9600ba0) @@ -0,0 +1,11 @@ +table_create Entries TABLE_NO_KEY +[[0,0.0,0.0],true] +column_create Entries content COLUMN_SCALAR Text +[[0,0.0,0.0],true] +load --table Entries +[ +{"content": "I <em>am</em> a boy."} +] +[[0,0.0,0.0],1] +select Entries --output_columns "html_untag(content)" --command_version 2 +[[0,0.0,0.0],[[[1],[["html_untag","null"]],["I am a boy."]]]] Added: test/command/suite/select/function/html_untag/simple.test (+11 -0) 100644 =================================================================== --- /dev/null +++ test/command/suite/select/function/html_untag/simple.test 2013-06-26 17:58:01 +0900 (cc736c0) @@ -0,0 +1,11 @@ +table_create Entries TABLE_NO_KEY +column_create Entries content COLUMN_SCALAR Text + +load --table Entries +[ +{"content": "I <em>am</em> a boy."} +] + +select Entries \ + --output_columns "html_untag(content)" \ + --command_version 2 -------------- next part -------------- HTMLの添付ファイルを保管しました...Download