[Groonga-commit] groonga/groonga at 42cee5f [master] nfkc: support generating table based decompose func

Back to archive index

Kouhei Sutou null+****@clear*****
Wed Jun 22 00:28:31 JST 2016


Kouhei Sutou	2016-06-22 00:28:31 +0900 (Wed, 22 Jun 2016)

  New Revision: 42cee5fd157c44ee955b42e70efec6ebce415da0
  https://github.com/groonga/groonga/commit/42cee5fd157c44ee955b42e70efec6ebce415da0

  Message:
    nfkc: support generating table based decompose func

  Modified files:
    lib/nfkc.rb

  Modified: lib/nfkc.rb (+131 -11)
===================================================================
--- lib/nfkc.rb    2016-06-21 19:02:53 +0900 (d403060)
+++ lib/nfkc.rb    2016-06-22 00:28:31 +0900 (464311a)
@@ -287,21 +287,24 @@ class TableGenerator < SwitchGenerator
     end
     byte_size_groups.keys.sort.each do |common_bytes|
       chars = byte_size_groups[common_bytes]
-      @output.puts(<<-TABLE_HEADER)
-
-static #{return_type}#{space}#{table_name(type, common_bytes)}[] = {
-      TABLE_HEADER
-
       lines = []
+      n_values = 0
       last_bytes = chars.collect {|char| char.bytes.last}
       last_bytes.min.step(last_bytes.max).each_slice(8) do |slice|
         values = slice.collect do |last_byte|
           yield((common_bytes + [last_byte]).pack("c*"))
         end
+        n_values += values.size
         lines << ("  " + values.join(", "))
       end
-      @output.puts(lines.join(",\n"))
 
+      next if n_values == 1
+
+      @output.puts(<<-TABLE_HEADER)
+
+static #{return_type}#{space}#{table_name(type, common_bytes)}[] = {
+      TABLE_HEADER
+      @output.puts(lines.join(",\n"))
       @output.puts(<<-TABLE_FOOTER)
 };
       TABLE_FOOTER
@@ -313,6 +316,7 @@ static #{return_type}#{space}#{table_name(type, common_bytes)}[] = {
                               char_map,
                               default,
                               return_type,
+                              options={},
                               &converter)
     byte_size_groups = char_map.keys.group_by do |from|
       bytes = from.bytes
@@ -325,15 +329,17 @@ static #{return_type}#{space}#{table_name(type, common_bytes)}[] = {
                                  byte_size_groups,
                                  &converter)
 
+    modifier = options[:internal] ? "static inline " : ""
     @output.puts(<<-HEADER)
 
-#{return_type}
+#{modifier}#{return_type}
 grn_nfkc#{@unicode_version}_#{function_type}(const unsigned char *utf8)
 {
     HEADER
 
     prev_common_bytes = []
     prev_n_common_bytes = 0
+    first_group = true
     byte_size_groups.keys.sort.each do |common_bytes|
       chars = byte_size_groups[common_bytes]
       chars_bytes = chars.collect(&:bytes).sort
@@ -341,7 +347,19 @@ grn_nfkc#{@unicode_version}_#{function_type}(const unsigned char *utf8)
       max = chars_bytes.last.last
       n_common_bytes = 0
       if common_bytes.empty?
-        @output.puts(<<-BODY)
+        if min == max
+          value = yield(chars.join(""))
+          @output.puts(<<-BODY)
+  if (utf8[0] < 0x80) {
+    if (utf8[0] == #{"%#04x" % min}) {
+      return #{value};
+    } else {
+      return #{default};
+    }
+  } else {
+          BODY
+        else
+          @output.puts(<<-BODY)
   if (utf8[0] < 0x80) {
     if (utf8[0] >= #{"%#04x" % min} && utf8[0] <= #{"%#04x" % max}) {
       return #{table_name(type, common_bytes)}[utf8[0] - #{"%#04x" % min}];
@@ -349,8 +367,15 @@ grn_nfkc#{@unicode_version}_#{function_type}(const unsigned char *utf8)
       return #{default};
     }
   } else {
-        BODY
+          BODY
+        end
       else
+        if first_group
+          @output.puts(<<-BODY)
+  {
+          BODY
+        end
+
         found_different_byte = false
         common_bytes.each_with_index do |common_byte, i|
           unless found_different_byte
@@ -388,16 +413,27 @@ grn_nfkc#{@unicode_version}_#{function_type}(const unsigned char *utf8)
 
         n = chars_bytes.first.size - 1
         indent = "  " * common_bytes.size
-        @output.puts(<<-BODY)
+        if min == max
+          value = yield(chars.join(""))
+          @output.puts(<<-BODY)
+    #{indent}if (utf8[#{n}] == #{"%#04x" % min}) {
+    #{indent}  return #{value};
+    #{indent}}
+    #{indent}break;
+          BODY
+        else
+          @output.puts(<<-BODY)
     #{indent}if (utf8[#{n}] >= #{"%#04x" % min} && utf8[#{n}] <= #{"%#04x" % max}) {
     #{indent}  return #{table_name(type, common_bytes)}[utf8[#{n}] - #{"%#04x" % min}];
     #{indent}}
     #{indent}break;
-        BODY
+          BODY
+        end
       end
 
       prev_common_bytes = common_bytes
       prev_n_common_bytes = n_common_bytes
+      first_group = false
     end
 
     # p [prev_common_bytes.collect{|x| "%#04x" % x}, prev_n_common_bytes]
@@ -452,6 +488,90 @@ grn_nfkc#{@unicode_version}_#{function_type}(const unsigned char *utf8)
     end
   end
 
+  def generate_map2(map2)
+    generate_compose(map2)
+  end
+
+  def generate_compose(map2)
+    # require "pp"
+    # p map2.size
+    # pp map2.keys.group_by {|x| x.chars[1]}.size
+    # pp map2.keys.group_by {|x| x.chars[1]}.collect {|k, vs| [k, k.codepoints, vs.size, vs.group_by {|x| x.chars[0].bytesize}.collect {|k2, vs2| [k2, vs2.size]}]}
+    # pp map2.keys.group_by {|x| x.chars[0].bytesize}.collect {|k, vs| [k, vs.size]}
+    # pp map2
+
+    suffix_char_map = {}
+    map2.each do |source, destination|
+      chars = source.chars
+      if chars.size != 2
+        STDERR.puts "caution: more than two chars in pattern #{chars.join('|')}"
+        return
+      end
+      prefix, suffix = chars
+      suffix_char_map[suffix] ||= {}
+      suffix_char_map[suffix][prefix] = destination
+    end
+
+    compose_func_type_name = "grn_nfkc#{@unicode_version}_compose_func"
+
+    @output.puts(<<-TYPEDEF)
+
+typedef const char *#{compose_func_type_name}(const unsigned char *prefix_utf8);
+TYPEDEF
+
+    suffix_char_map.each do |suffix, prefix_char_map|
+      suffix_bytes = suffix.bytes.collect {|byte| "%02x" % byte}.join("")
+      default = "NULL"
+      generate_char_converter("compose_prefix_#{suffix_bytes}",
+                              "compose_prefix_#{suffix_bytes}",
+                              prefix_char_map,
+                              default,
+                              "const char *",
+                              :internal => true) do |prefix|
+        prefix.force_encoding("UTF-8")
+        to = prefix_char_map[prefix]
+        if to
+          escaped_value = to.bytes.collect {|char| "\\x%02x" % char}.join("")
+          "\"#{escaped_value}\""
+        else
+          default
+        end
+      end
+    end
+
+    default = "NULL"
+    generate_char_converter("compose_suffix",
+                            "compose_suffix",
+                            suffix_char_map,
+                            default,
+                            "#{compose_func_type_name} *",
+                            :internal => true) do |suffix|
+      suffix.force_encoding("UTF-8")
+      if suffix_char_map.key?(suffix)
+        suffix_bytes = suffix.bytes.collect {|byte| "%02x" % byte}.join("")
+        "grn_nfkc#{@unicode_version}_compose_prefix_#{suffix_bytes}"
+      else
+        default
+      end
+    end
+
+    @output.puts(<<-BODY)
+
+const char *
+grn_nfkc#{@unicode_version}_map2(const unsigned char *prefix_utf8, const unsigned char *suffix_utf8)
+{
+  #{compose_func_type_name} *compose_func;
+
+  compose_func = grn_nfkc#{@unicode_version}_compose_suffix(suffix_utf8);
+  if (compose_func) {
+    return compose_func(prefix_utf8);
+  } else {
+    return NULL;
+  }
+}
+    BODY
+  end
+
   def to_bytes_map(char_map)
     bytes_map = {}
     char_map.each_key do |from|
-------------- next part --------------
HTMLの添付ファイルを保管しました...
Download 



More information about the Groonga-commit mailing list
Back to archive index