[Groonga-commit] groonga/groonga-normalizer-mysql [master] Add UCA collation dumper

Back to archive index

Kouhei Sutou null+****@clear*****
Sun Feb 10 18:42:36 JST 2013


Kouhei Sutou	2013-02-10 18:42:36 +0900 (Sun, 10 Feb 2013)

  New Revision: a4883526a389b0940bc17043e12f4b16d0c7454b
  https://github.com/groonga/groonga-normalizer-mysql/commit/a4883526a389b0940bc17043e12f4b16d0c7454b

  Log:
    Add UCA collation dumper

  Added files:
    tool/dump-difference-uca.rb

  Added: tool/dump-difference-uca.rb (+104 -0) 100644
===================================================================
--- /dev/null
+++ tool/dump-difference-uca.rb    2013-02-10 18:42:36 +0900 (9b201de)
@@ -0,0 +1,104 @@
+# Copyright (C) 2013  Kouhei Sutou <kou at clear-code.com>
+#
+# This library is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Library General Public
+# License as published by the Free Software Foundation; either
+# version 2 of the License, or (at your option) any later version.
+#
+# This library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# Library General Public License for more details.
+#
+# You should have received a copy of the GNU Library General Public
+# License along with this library; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+
+require "English"
+
+# The script takes exactly one argument, the path shown in the usage
+# text. With any other argument count it prints the usage line and exits
+# with a failure status.
+usage = "Usage: #{$0} MYSQL_SOURCE/strings/ctype-uca.c"
+unless ARGV.size == 1
+  puts(usage)
+  exit(false)
+end
+
+# Encodes one Unicode code point as a UTF-8 string.
+#
+# code_point - Integer code point, handed to Array#pack with the "U"
+#              directive.
+#
+# Returns the packed String.
+def code_point_to_utf8(code_point)
+  code_points = [code_point]
+  code_points.pack("U")
+end
+
+# Decodes the first UTF-8 character of a string into its code point.
+#
+# utf8 - String whose first character is decoded.
+#
+# Returns the code point as an Integer, or nil when the string is empty.
+# String#unpack always returns an Array, so the single decoded element is
+# extracted; returning the Array itself would not give back the Integer
+# that code_point_to_utf8 accepts.
+def utf8_to_code_point(utf8)
+  utf8.unpack("U").first
+end
+
+# Parser state while scanning the input line by line:
+#   current_page - number of the pageXXXdata table being read, or nil
+#   pages        - page number => flat list of the weights read so far
+#   in_length    - true while inside the uca_length[256] table
+#   lengths      - values read from uca_length; used later as the
+#                  per-page slice size when splitting the flat lists
+current_page = nil
+pages = {}
+in_length = false
+lengths = []
+ARGF.each_line do |line|
+  case line
+  when / page([\da-fA-F]{3})data\[\]=/
+    # The three-digit page number is hexadecimal (it is parsed with
+    # to_i(16)), so the letters a-f must be accepted as well as 0-9.
+    # Matching \d alone would silently skip every table whose number
+    # contains a hex letter, such as page0FFdata.
+    current_page = $1.to_i(16)
+    pages[current_page] = []
+  when /^\s*0x(?:[\da-z]+)(?:,\s*0x(?:[\da-z]+))*,?$/i
+    # A row of 0x-prefixed weights; ignored unless a page table is open.
+    next if current_page.nil?
+    weights = line.chomp.split(/,\s*/).collect do |component|
+      Integer(component)
+    end
+    pages[current_page].concat(weights)
+  when / uca_length\[256\]=/
+    in_length = true
+  when /^\d+(?:,\d+)*,?$/
+    # A row of decimal values; ignored unless inside uca_length.
+    next unless in_length
+    _lengths = line.chomp.split(/,/).collect {|length| Integer(length)}
+    lengths.concat(_lengths)
+  when /^\};$/
+    # End of an array initializer: leave whichever table was open.
+    current_page = nil
+    in_length = false
+  end
+end
+
+# Replace each page's flat weight list with one record per character.
+# The list is cut into slices of lengths[page] components; the slice at
+# index i belongs to code point (page << 8) + i.
+pages.each do |page, flatten_weights|
+  characters = []
+  flatten_weights.each_slice(lengths[page]).each_with_index do |weight, i|
+    if weight.any? {|component| !component.zero?}
+      # Drop trailing zero components so only the significant ones remain.
+      weight.pop while weight.last.zero?
+    else
+      # An all-zero slice collapses to a single zero.
+      weight = [0]
+    end
+    code_point = (page << 8) + i
+    characters << {
+      :weight     => weight,
+      :code_point => code_point,
+      :utf8       => code_point_to_utf8(code_point),
+    }
+  end
+  pages[page] = characters
+end
+
+# Pages in ascending page-number order.
+sorted_pages = pages.sort_by {|page, _characters| page}
+
+# Map each weight to the characters that carry it, visiting characters
+# page by page so every bucket keeps page/code-point order.
+ordered_characters = sorted_pages.flat_map {|_page, characters| characters}
+weight_based_characters = ordered_characters.group_by do |character|
+  character[:weight]
+end
+
+# Print every weight shared by two or more characters, followed by those
+# characters, and finish with the number of such weights.
+n_identicals = 0
+weight_based_characters.each do |weight, characters|
+  # A weight carried by a single character is not a collision.
+  next if characters.size == 1
+  n_identicals += 1
+  formatted_weight = weight.collect {|component| '%#07x' % component}.join(', ')
+  puts "weight: #{formatted_weight}"
+  characters.each do |character|
+    utf8 = character[:utf8]
+    code_point = character[:code_point]
+    p ["U+%04x" % code_point, utf8]
+  end
+end
+
+puts "Number of identical weights #{n_identicals}"
-------------- next part --------------
HTMLの添付ファイルを保管しました...
Download 



More information about the Groonga-commit mailing list
Back to archive index