[Groonga-commit] ranguba/chupa-text at 8b997e6 [master] Add body size limitation

Back to archive index
Kouhei Sutou null+****@clear*****
Sun Mar 3 05:30:32 JST 2019


Kouhei Sutou	2019-03-03 05:30:32 +0900 (Sun, 03 Mar 2019)

  Revision: 8b997e607d58b9ff052936a81ec216dcf2ce6a51
  https://github.com/ranguba/chupa-text/commit/8b997e607d58b9ff052936a81ec216dcf2ce6a51

  Message:
    Add body size limitation

  Modified files:
    lib/chupa-text/command/chupa-text.rb
    lib/chupa-text/data.rb
    lib/chupa-text/extractor.rb
    lib/chupa-text/utf8-converter.rb
    test/test-extractor.rb

  Modified: lib/chupa-text/command/chupa-text.rb (+8 -2)
===================================================================
--- lib/chupa-text/command/chupa-text.rb    2019-03-03 05:06:25 +0900 (e68f6c3)
+++ lib/chupa-text/command/chupa-text.rb    2019-03-03 05:30:32 +0900 (6f8482f)
@@ -1,4 +1,4 @@
-# Copyright (C) 2013-2017  Kouhei Sutou <kou****@clear*****>
+# Copyright (C) 2013-2019  Kouhei Sutou <kou****@clear*****>
 #
 # This library is free software; you can redistribute it and/or
 # modify it under the terms of the GNU Lesser General Public
@@ -49,6 +49,7 @@ module ChupaText
         @mime_formatter_options = {}
         @need_screenshot = true
         @expected_screenshot_size = [200, 200]
+        @max_body_size = nil
       end
 
       def run(*arguments)
@@ -143,6 +144,11 @@ module ChupaText
                   "(default: #{@expected_screenshot_size.join("x")})") do |size|
           @expected_screenshot_size = size
         end
+        parser.on("--max-body-size=BYTE", Integer,
+                  "The max byte of extracted body.",
+                  "(default: no limit)") do |size|
+          @max_body_size = size
+        end
 
         parser.separator("")
         parser.separator("Log related options:")
@@ -190,7 +196,7 @@ module ChupaText
       end
 
       def create_extractor
-        extractor = Extractor.new
+        extractor = Extractor.new(max_body_size: @max_body_size)
         extractor.apply_configuration(@configuration)
         extractor
       end

  Modified: lib/chupa-text/data.rb (+11 -3)
===================================================================
--- lib/chupa-text/data.rb    2019-03-03 05:06:25 +0900 (b156499)
+++ lib/chupa-text/data.rb    2019-03-03 05:30:32 +0900 (c42f534)
@@ -198,12 +198,20 @@ module ChupaText
       @need_screenshot
     end
 
-    def to_utf8_body_data
-      b = body
+    def to_utf8_body_data(max_body_size: nil)
+      b = nil
+      if max_body_size
+        open do |input|
+          b = input.read(max_body_size)
+        end
+      else
+        b = body
+      end
       return self if b.nil?
+
       converter = UTF8Converter.new(b)
       utf8_body = converter.convert
-      if b.equal?(utf8_body)
+      if max_body_size.nil? and b.equal?(utf8_body)
         self
       else
         TextData.new(utf8_body, source_data: self)

  Modified: lib/chupa-text/extractor.rb (+4 -3)
===================================================================
--- lib/chupa-text/extractor.rb    2019-03-03 05:06:25 +0900 (4ce2c11)
+++ lib/chupa-text/extractor.rb    2019-03-03 05:30:32 +0900 (c6eb6fa)
@@ -21,8 +21,9 @@ module ChupaText
   class Extractor
     include Loggable
 
-    def initialize
+    def initialize(max_body_size: nil)
       @decomposers = []
+      @max_body_size = max_body_size
     end
 
     # Sets the extractor up by the configuration. It adds decomposers
@@ -90,11 +91,11 @@ module ChupaText
       if decomposer.nil?
         if target.text_plain?
           debug {"#{log_tag}[extract][text-plain]"}
-          yield(target.to_utf8_body_data)
+          yield(target.to_utf8_body_data(max_body_size: @max_body_size))
         else
           debug {"#{log_tag}[extract][decomposer] not found"}
           if target.text?
-            yield(target.to_utf8_body_data)
+            yield(target.to_utf8_body_data(max_body_size: @max_body_size))
           end
         end
       else

  Modified: lib/chupa-text/utf8-converter.rb (+42 -24)
===================================================================
--- lib/chupa-text/utf8-converter.rb    2019-03-03 05:06:25 +0900 (ca72e90)
+++ lib/chupa-text/utf8-converter.rb    2019-03-03 05:30:32 +0900 (ca05602)
@@ -16,8 +16,9 @@
 
 module ChupaText
   class UTF8Converter
-    def initialize(string)
+    def initialize(string, max_size: nil)
       @string = string
+      @max_size = max_size
     end
 
     def convert
@@ -26,44 +27,51 @@ module ChupaText
       when Encoding::UTF_8
         bom_size, bom_encoding = detect_bom
         if bom_size
-          return @string.byteslice(bom_size,
-                                   @string.bytesize - bom_size)
+          utf8_string = @string.byteslice(bom_size,
+                                          @string.bytesize - bom_size)
         else
-          return @string
+          utf8_string = @string
         end
+        return truncate(utf8_string)
       when Encoding::ASCII_8BIT
-        return @string if @string.ascii_only?
+        return truncate(@string) if @string.ascii_only?
       else
-        return @string.encode(Encoding::UTF_8,
-                              invalid: :replace,
-                              undef: :replace,
-                              replace: "")
+        utf8_string = @string.encode(Encoding::UTF_8,
+                                     invalid: :replace,
+                                     undef: :replace,
+                                     replace: "")
+        return truncate(utf8_string)
       end
 
       bom_size, bom_encoding = detect_bom
       if bom_encoding
         string_without_bom = @string.byteslice(bom_size,
                                                @string.bytesize - bom_size)
-        return string_without_bom.encode(Encoding::UTF_8,
-                                         bom_encoding,
-                                         invalid: :replace,
-                                         undef: :replace,
-                                         replace: "")
+        utf8_string = string_without_bom.encode(Encoding::UTF_8,
+                                                bom_encoding,
+                                                invalid: :replace,
+                                                undef: :replace,
+                                                replace: "")
+        return truncate(utf8_string)
       end
 
       guessed_encoding = guess_encoding
       if guessed_encoding
-        @string.encode(Encoding::UTF_8,
-                       guessed_encoding,
-                       invalid: :replace,
-                       undef: :replace,
-                       replace: "")
+        truncate(@string.encode(Encoding::UTF_8,
+                                guessed_encoding,
+                                invalid: :replace,
+                                undef: :replace,
+                                replace: ""))
       else
-        utf8_body = @string.dup
-        utf8_body.force_encoding(Encoding::UTF_8)
-        utf8_body.scrub!("")
-        utf8_body.gsub!(/\p{Control}+/, "")
-        utf8_body
+        if @max_size
+          utf8_string = @string.byteslice(0, @max_size)
+        else
+          utf8_string = @string.dup
+        end
+        utf8_string.force_encoding(Encoding::UTF_8)
+        utf8_string.scrub!("")
+        utf8_string.gsub!(/\p{Control}+/, "")
+        utf8_string
       end
     end
 
@@ -113,5 +121,15 @@ module ChupaText
         @string.force_encoding(original_encoding)
       end
     end
+
+    def truncate(string)
+      if @max_size and string.bytesize > @max_size
+        truncated = string.byteslice(0, @max_size)
+        truncated.scrub!("")
+        truncated
+      else
+        string
+      end
+    end
   end
 end

  Modified: test/test-extractor.rb (+10 -0)
===================================================================
--- test/test-extractor.rb    2019-03-03 05:06:25 +0900 (0598f9f)
+++ test/test-extractor.rb    2019-03-03 05:30:32 +0900 (46ae862)
@@ -228,5 +228,15 @@ class TestExtractor < Test::Unit::TestCase
         assert_equal(["こんにちは"], extract(data))
       end
     end
+
+    sub_test_case("max body size") do
+      def test_last_invalid
+        @extractor = ChupaText::Extractor.new(max_body_size: 5)
+        data = ChupaText::Data.new
+        data.mime_type = "text/plain"
+        data.body = "こん"
+        assert_equal(["こ"], extract(data))
+      end
+    end
   end
 end
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <https://lists.osdn.me/mailman/archives/groonga-commit/attachments/20190303/ac1a2ce3/attachment-0001.html>


More information about the Groonga-commit mailing list
Back to archive index