[Groonga-commit] groonga/groonga at e08f2ed [master] nfkc: support generating table based char_type converter

Back to archive index

Kouhei Sutou null+****@clear*****
Fri Feb 19 00:08:40 JST 2016


Kouhei Sutou	2016-02-19 00:08:40 +0900 (Fri, 19 Feb 2016)

  New Revision: e08f2ed4e3b1c34007a2439ed0cb3fadd2d6683e
  https://github.com/groonga/groonga/commit/e08f2ed4e3b1c34007a2439ed0cb3fadd2d6683e

  Message:
    nfkc: support generating table based char_type converter

  Modified files:
    lib/nfkc.rb

  Modified: lib/nfkc.rb (+89 -53)
===================================================================
--- lib/nfkc.rb    2016-02-18 23:42:03 +0900 (bf4fb58)
+++ lib/nfkc.rb    2016-02-19 00:08:40 +0900 (d403060)
@@ -1,7 +1,7 @@
 #!/usr/bin/env ruby
 # -*- coding: utf-8 -*-
 #
-# Copyright(C) 2010-2015 Brazil
+# Copyright(C) 2010-2016 Brazil
 #
 # This library is free software; you can redistribute it and/or
 # modify it under the terms of the GNU Lesser General Public
@@ -274,40 +274,79 @@ end
 
 class TableGenerator < SwitchGenerator
   private
-  def generate_map1(hash)
-    generate_decompose(hash)
+  def table_name(type, common_bytes)
+    suffix = common_bytes.collect {|byte| "%02x" % byte}.join("")
+    "grn_nfkc#{@unicode_version}_#{type}_table_#{suffix}"
   end
 
-  def generate_decompose(char_map)
+  def generate_char_convert_tables(type, char_map, return_type, byte_size_groups)
+    if return_type.end_with?("*")
+      space = ""
+    else
+      space = " "
+    end
+    byte_size_groups.keys.sort.each do |common_bytes|
+      chars = byte_size_groups[common_bytes]
+      @output.puts(<<-TABLE_HEADER)
+
+static #{return_type}#{space}#{table_name(type, common_bytes)}[] = {
+      TABLE_HEADER
+
+      lines = []
+      last_bytes = chars.collect {|char| char.bytes.last}
+      last_bytes.min.step(last_bytes.max).each_slice(8) do |slice|
+        values = slice.collect do |last_byte|
+          yield((common_bytes + [last_byte]).pack("c*"))
+        end
+        lines << ("  " + values.join(", "))
+      end
+      @output.puts(lines.join(",\n"))
+
+      @output.puts(<<-TABLE_FOOTER)
+};
+      TABLE_FOOTER
+    end
+  end
+
+  def generate_char_converter(type,
+                              function_type,
+                              char_map,
+                              default,
+                              return_type,
+                              &converter)
     byte_size_groups = char_map.keys.group_by do |from|
       bytes = from.bytes
       bytes[0..-2]
     end
 
-    generate_decompose_tables(char_map, byte_size_groups)
+    generate_char_convert_tables(type,
+                                 char_map,
+                                 return_type,
+                                 byte_size_groups,
+                                 &converter)
 
     @output.puts(<<-HEADER)
 
-const char *
-grn_nfkc#{@unicode_version}_map1(const unsigned char *utf8)
+#{return_type}
+grn_nfkc#{@unicode_version}_#{function_type}(const unsigned char *utf8)
 {
     HEADER
 
     prev_common_bytes = []
     prev_n_common_bytes = 0
     byte_size_groups.keys.sort.each do |common_bytes|
-      froms = byte_size_groups[common_bytes]
-      froms_bytes = froms.collect(&:bytes).sort
-      min = froms_bytes.first.last
-      max = froms_bytes.last.last
+      chars = byte_size_groups[common_bytes]
+      chars_bytes = chars.collect(&:bytes).sort
+      min = chars_bytes.first.last
+      max = chars_bytes.last.last
       n_common_bytes = 0
       if common_bytes.empty?
         @output.puts(<<-BODY)
   if (utf8[0] < 0x80) {
     if (utf8[0] >= #{"%#04x" % min} && utf8[0] <= #{"%#04x" % max}) {
-      return #{decompose_table_name(common_bytes)}[utf8[0] - #{"%#04x" % min}];
+      return #{table_name(type, common_bytes)}[utf8[0] - #{"%#04x" % min}];
     } else {
-      return NULL;
+      return #{default};
     }
   } else {
         BODY
@@ -347,11 +386,11 @@ grn_nfkc#{@unicode_version}_map1(const unsigned char *utf8)
           BODY
         end
 
-        n = froms_bytes.first.size - 1
+        n = chars_bytes.first.size - 1
         indent = "  " * common_bytes.size
         @output.puts(<<-BODY)
     #{indent}if (utf8[#{n}] >= #{"%#04x" % min} && utf8[#{n}] <= #{"%#04x" % max}) {
-    #{indent}  return #{decompose_table_name(common_bytes)}[utf8[#{n}] - #{"%#04x" % min}];
+    #{indent}  return #{table_name(type, common_bytes)}[utf8[#{n}] - #{"%#04x" % min}];
     #{indent}}
     #{indent}break;
         BODY
@@ -375,11 +414,44 @@ grn_nfkc#{@unicode_version}_map1(const unsigned char *utf8)
     @output.puts(<<-FOOTER)
   }
 
-  return NULL;
+  return #{default};
 }
     FOOTER
   end
 
+  def generate_blockcode_char_type(block_codes)
+    default = "GRN_CHAR_OTHERS"
+    generate_char_converter("char_type",
+                            "char_type",
+                            block_codes,
+                            default,
+                            "grn_char_type") do |char|
+      block_codes[char] || default
+    end
+  end
+
+  def generate_map1(hash)
+    generate_decompose(hash)
+  end
+
+  def generate_decompose(char_map)
+    default = "NULL"
+    generate_char_converter("decompose",
+                            "map1",
+                            char_map,
+                            default,
+                            "const char *") do |from|
+      from.force_encoding("UTF-8")
+      to = char_map[from]
+      if to
+        escaped_value = to.bytes.collect {|char| "\\x%02x" % char}.join("")
+        "\"#{escaped_value}\""
+      else
+        default
+      end
+    end
+  end
+
   def to_bytes_map(char_map)
     bytes_map = {}
     char_map.each_key do |from|
@@ -392,42 +464,6 @@ grn_nfkc#{@unicode_version}_map1(const unsigned char *utf8)
     end
     bytes_map
   end
-
-  def decompose_table_name(common_bytes)
-    suffix = common_bytes.collect {|byte| "%02x" % byte}.join("")
-    "grn_nfkc#{@unicode_version}_decompose_table_#{suffix}"
-  end
-
-  def generate_decompose_tables(char_map, byte_size_groups)
-    byte_size_groups.keys.sort.each do |common_bytes|
-      froms = byte_size_groups[common_bytes]
-      @output.puts(<<-TABLE_HEADER)
-
-static const char *#{decompose_table_name(common_bytes)}[] = {
-      TABLE_HEADER
-
-      lines = []
-      last_bytes = froms.collect {|from| from.bytes.last}
-      last_bytes.min.step(last_bytes.max).each_slice(8) do |slice|
-        values = slice.collect do |last_byte|
-          from = (common_bytes + [last_byte]).pack("c*").force_encoding("UTF-8")
-          to = char_map[from]
-          if to
-            escaped_value = to.bytes.collect {|char| "\\x%02x" % char}.join("")
-            "\"#{escaped_value}\""
-          else
-            "NULL"
-          end
-        end
-        lines << ("  " + values.join(", "))
-      end
-      @output.puts(lines.join(",\n"))
-
-      @output.puts(<<-TABLE_FOOTER)
-};
-      TABLE_FOOTER
-    end
-  end
 end
 
 def create_bc(option)
@@ -584,7 +620,7 @@ map2 = create_map2(map1)
 File.open("nfkc#{unicode_version}.c", "w") do |output|
   output.puts(<<-HEADER)
 /* -*- c-basic-offset: 2 -*- */
-/* Copyright(C) 2010-2015 Brazil
+/* Copyright(C) 2010-2016 Brazil
 
   This library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
-------------- next part --------------
HTMLの添付ファイルを保管しました...
Download 



More information about the Groonga-commit mailing list
Back to archive index