Kouhei Sutou 2019-03-03 05:30:32 +0900 (Sun, 03 Mar 2019) Revision: 8b997e607d58b9ff052936a81ec216dcf2ce6a51 https://github.com/ranguba/chupa-text/commit/8b997e607d58b9ff052936a81ec216dcf2ce6a51 Message: Add body size limitation Modified files: lib/chupa-text/command/chupa-text.rb lib/chupa-text/data.rb lib/chupa-text/extractor.rb lib/chupa-text/utf8-converter.rb test/test-extractor.rb Modified: lib/chupa-text/command/chupa-text.rb (+8 -2) =================================================================== --- lib/chupa-text/command/chupa-text.rb 2019-03-03 05:06:25 +0900 (e68f6c3) +++ lib/chupa-text/command/chupa-text.rb 2019-03-03 05:30:32 +0900 (6f8482f) @@ -1,4 +1,4 @@ -# Copyright (C) 2013-2017 Kouhei Sutou <kou****@clear*****> +# Copyright (C) 2013-2019 Kouhei Sutou <kou****@clear*****> # # This library is free software; you can redistribute it and/or # modify it under the terms of the GNU Lesser General Public @@ -49,6 +49,7 @@ module ChupaText @mime_formatter_options = {} @need_screenshot = true @expected_screenshot_size = [200, 200] + @max_body_size = nil end def run(*arguments) @@ -143,6 +144,11 @@ module ChupaText "(default: #{@expected_screenshot_size.join("x")})") do |size| @expected_screenshot_size = size end + parser.on("--max-body-size=BYTE", Integer, + "The max byte of extracted body.", + "(default: no limit)") do |size| + @max_body_size = size + end parser.separator("") parser.separator("Log related options:") @@ -190,7 +196,7 @@ module ChupaText end def create_extractor - extractor = Extractor.new + extractor = Extractor.new(max_body_size: @max_body_size) extractor.apply_configuration(@configuration) extractor end Modified: lib/chupa-text/data.rb (+11 -3) =================================================================== --- lib/chupa-text/data.rb 2019-03-03 05:06:25 +0900 (b156499) +++ lib/chupa-text/data.rb 2019-03-03 05:30:32 +0900 (c42f534) @@ -198,12 +198,20 @@ module ChupaText @need_screenshot end - def to_utf8_body_data - b = body + def 
to_utf8_body_data(max_body_size: nil) + b = nil + if max_body_size + open do |input| + b = input.read(max_body_size) + end + else + b = body + end return self if b.nil? + converter = UTF8Converter.new(b) utf8_body = converter.convert - if b.equal?(utf8_body) + if max_body_size.nil? and b.equal?(utf8_body) self else TextData.new(utf8_body, source_data: self) Modified: lib/chupa-text/extractor.rb (+4 -3) =================================================================== --- lib/chupa-text/extractor.rb 2019-03-03 05:06:25 +0900 (4ce2c11) +++ lib/chupa-text/extractor.rb 2019-03-03 05:30:32 +0900 (c6eb6fa) @@ -21,8 +21,9 @@ module ChupaText class Extractor include Loggable - def initialize + def initialize(max_body_size: nil) @decomposers = [] + @max_body_size = max_body_size end # Sets the extractor up by the configuration. It adds decomposers @@ -90,11 +91,11 @@ module ChupaText if decomposer.nil? if target.text_plain? debug {"#{log_tag}[extract][text-plain]"} - yield(target.to_utf8_body_data) + yield(target.to_utf8_body_data(max_body_size: @max_body_size)) else debug {"#{log_tag}[extract][decomposer] not found"} if target.text? 
- yield(target.to_utf8_body_data) + yield(target.to_utf8_body_data(max_body_size: @max_body_size)) end end else Modified: lib/chupa-text/utf8-converter.rb (+42 -24) =================================================================== --- lib/chupa-text/utf8-converter.rb 2019-03-03 05:06:25 +0900 (ca72e90) +++ lib/chupa-text/utf8-converter.rb 2019-03-03 05:30:32 +0900 (ca05602) @@ -16,8 +16,9 @@ module ChupaText class UTF8Converter - def initialize(string) + def initialize(string, max_size: nil) @string = string + @max_size = max_size end def convert @@ -26,44 +27,51 @@ module ChupaText when Encoding::UTF_8 bom_size, bom_encoding = detect_bom if bom_size - return @string.byteslice(bom_size, - @string.bytesize - bom_size) + utf8_string = @string.byteslice(bom_size, + @string.bytesize - bom_size) else - return @string + utf8_string = @string end + return truncate(utf8_string) when Encoding::ASCII_8BIT - return @string if @string.ascii_only? + return truncate(@string) if @string.ascii_only? else - return @string.encode(Encoding::UTF_8, - invalid: :replace, - undef: :replace, - replace: "") + utf8_string = @string.encode(Encoding::UTF_8, + invalid: :replace, + undef: :replace, + replace: "") + return truncate(utf8_string) end bom_size, bom_encoding = detect_bom if bom_encoding string_without_bom = @string.byteslice(bom_size, @string.bytesize - bom_size) - return string_without_bom.encode(Encoding::UTF_8, - bom_encoding, - invalid: :replace, - undef: :replace, - replace: "") + utf8_string = string_without_bom.encode(Encoding::UTF_8, + bom_encoding, + invalid: :replace, + undef: :replace, + replace: "") + return truncate(utf8_string) end guessed_encoding = guess_encoding if guessed_encoding - @string.encode(Encoding::UTF_8, - guessed_encoding, - invalid: :replace, - undef: :replace, - replace: "") + truncate(@string.encode(Encoding::UTF_8, + guessed_encoding, + invalid: :replace, + undef: :replace, + replace: "")) else - utf8_body = @string.dup - 
utf8_body.force_encoding(Encoding::UTF_8) - utf8_body.scrub!("") - utf8_body.gsub!(/\p{Control}+/, "") - utf8_body + if @max_size + utf8_string = @string.byteslice(0, @max_size) + else + utf8_string = @string.dup + end + utf8_string.force_encoding(Encoding::UTF_8) + utf8_string.scrub!("") + utf8_string.gsub!(/\p{Control}+/, "") + utf8_string end end @@ -113,5 +121,15 @@ module ChupaText @string.force_encoding(original_encoding) end end + + def truncate(string) + if @max_size and string.bytesize > @max_size + truncated = string.byteslice(0, @max_size) + truncated.scrub!("") + truncated + else + string + end + end end end Modified: test/test-extractor.rb (+10 -0) =================================================================== --- test/test-extractor.rb 2019-03-03 05:06:25 +0900 (0598f9f) +++ test/test-extractor.rb 2019-03-03 05:30:32 +0900 (46ae862) @@ -228,5 +228,15 @@ class TestExtractor < Test::Unit::TestCase assert_equal(["こんにちは"], extract(data)) end end + + sub_test_case("max body size") do + def test_last_invalid + @extractor = ChupaText::Extractor.new(max_body_size: 5) + data = ChupaText::Data.new + data.mime_type = "text/plain" + data.body = "こん" + assert_equal(["こ"], extract(data)) + end + end end end -------------- next part -------------- An HTML attachment was scrubbed... URL: <https://lists.osdn.me/mailman/archives/groonga-commit/attachments/20190303/ac1a2ce3/attachment-0001.html>