Kouhei Sutou 2019-02-25 16:08:02 +0900 (Mon, 25 Feb 2019) Revision: c3336f8690ad891f569a0991452eb6a6af1ac327 https://github.com/ranguba/chupa-text/commit/c3336f8690ad891f569a0991452eb6a6af1ac327 Message: docx: add support special characters Added files: test/fixture/docx/special-characters.docx Modified files: lib/chupa-text/decomposers/office-open-xml.rb test/decomposers/test-office-open-xml-document.rb Modified: lib/chupa-text/decomposers/office-open-xml.rb (+12 -3) =================================================================== --- lib/chupa-text/decomposers/office-open-xml.rb 2019-02-25 15:37:09 +0900 (3bcfb21) +++ lib/chupa-text/decomposers/office-open-xml.rb 2019-02-25 16:08:02 +0900 (4438557) @@ -14,6 +14,7 @@ # License along with this library; if not, write to the Free Software # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +require "cgi/util" require "rexml/parsers/sax2parser" require "rexml/sax2listener" @@ -65,7 +66,7 @@ module ChupaText private def parse(io, listener) - source = REXML::IOSource.new(io) + source = REXML::Source.new(io.read) parser = REXML::Parsers::SAX2Parser.new(source) parser.listen(listener) parser.parse @@ -99,11 +100,17 @@ module ChupaText end def characters(text) - @output << text if @in_target + add_text(text) end def cdata(content) - @output << content if @in_target + add_text(content) + end + + private + def add_text(text) + return unless @in_target + @output << CGI.unescapeHTML(text) end end @@ -164,6 +171,8 @@ module ChupaText def set_attribute(value) return if @name.nil?
+ + value = CGI.unescapeHTML(value) case @type when :w3cdtf value = Time.xmlschema(value) Modified: test/decomposers/test-office-open-xml-document.rb (+15 -0) =================================================================== --- test/decomposers/test-office-open-xml-document.rb 2019-02-25 15:37:09 +0900 (61fa8ea) +++ test/decomposers/test-office-open-xml-document.rb 2019-02-25 16:08:02 +0900 (031ec8a) @@ -125,5 +125,20 @@ Page2 BODY end end + + sub_test_case("special characters") do + def decompose + super(fixture_path("docx", "special-characters.docx")) + end + + def test_body + assert_equal([<<-BODY], decompose.collect(&:body)) +Ampersand: & +Reference: & +HTML: <a href=""> +Single quote: '' + BODY + end + end end end Added: test/fixture/docx/special-characters.docx (+23 -0) 100644 =================================================================== --- /dev/null +++ test/fixture/docx/special-characters.docx 2019-02-25 16:08:02 +0900 (412ad36) @@ -0,0 +1,23 @@ +MIME-Version: 1.0 +mime-type: application/vnd.openxmlformats-officedocument.wordprocessingml.document +uri: file:/tmp/fJrpPb_special-characters.docx +path: /tmp/fJrpPb_special-characters.docx +size: 4335 +Content-Type: multipart/mixed; boundary=6209a314891dc152454e6d1c1de1318340f6ccc9 + +--6209a314891dc152454e6d1c1de1318340f6ccc9 +mime-type: text/plain +uri: file:/tmp/fJrpPb_special-characters.txt +path: /tmp/fJrpPb_special-characters.txt +size: 65 +created_time: 2019-02-25 07:08:23 UTC +source-mime-types: ["application/pdf", "application/vnd.openxmlformats-officedocument.wordprocessingml.document"] +creator: Writer +producer: LibreOffice 5.2 + +Ampersand: & +Reference: & +HTML: <a href=""> +Single quote: '' + +--6209a314891dc152454e6d1c1de1318340f6ccc9-- -------------- next part -------------- An HTML attachment was scrubbed... URL: <https://lists.osdn.me/mailman/archives/groonga-commit/attachments/20190225/6a49367f/attachment-0001.html>