Kouhei Sutou
null+****@clear*****
Wed Jul 5 16:30:52 JST 2017
Kouhei Sutou 2017-07-05 16:30:52 +0900 (Wed, 05 Jul 2017) New Revision: f71574c574dcb5aeadcb934bd156d67fae3f67f1 https://github.com/ranguba/chupa-text-decomposer-html/commit/f71574c574dcb5aeadcb934bd156d67fae3f67f1 Message: Support content based detection Modified files: lib/chupa-text/decomposers/html.rb test/test-html.rb Modified: lib/chupa-text/decomposers/html.rb (+10 -2) =================================================================== --- lib/chupa-text/decomposers/html.rb 2017-05-02 12:37:34 +0900 (3b0095c) +++ lib/chupa-text/decomposers/html.rb 2017-07-05 16:30:52 +0900 (b23c991) @@ -28,8 +28,16 @@ module ChupaText "application/xhtml+xml", ] def target?(data) - TARGET_EXTENSIONS.include?(data.extension) or - TARGET_MIME_TYPES.include?(data.mime_type) + return true if TARGET_EXTENSIONS.include?(data.extension) + return true if TARGET_MIME_TYPES.include?(data.mime_type) + + body = data.body + return false if body.nil? + + return true if body.start_with?("<!DOCTYPE html ") + return true if body.start_with?("<html") + + false end def decompose(data) Modified: test/test-html.rb (+26 -0) =================================================================== --- test/test-html.rb 2017-05-02 12:37:34 +0900 (a504a69) +++ test/test-html.rb 2017-07-05 16:30:52 +0900 (b5b24a7) @@ -51,6 +51,12 @@ class TestHTML < Test::Unit::TestCase def test_txt assert_false(@decomposer.target?(create_data("index.txt"))) end + + def test_php + assert do + not @decomposer.target?(create_data("index.php")) + end + end end sub_test_case("mime-type") do @@ -72,6 +78,26 @@ class TestHTML < Test::Unit::TestCase assert_false(@decomposer.target?(create_data("text/plain"))) end end + + sub_test_case("content") do + def create_data(body) + data = ChupaText::Data.new + data.body = body + data + end + + def test_doctype_html + assert do + @decomposer.target?(create_data("<!DOCTYPE html ")) + end + end + + def test_html + assert do + @decomposer.target?(create_data("<html")) + end + end + end end 
sub_test_case("decompose") do -------------- next part -------------- HTML����������������������������...Download