Kouhei Sutou 2019-03-01 09:46:54 +0900 (Fri, 01 Mar 2019) Revision: b16d07e37aab4c68cbe928a96bd704e38dc9e7fe https://github.com/ranguba/chupa-text-decomposer-spreadsheet/commit/b16d07e37aab4c68cbe928a96bd704e38dc9e7fe Message: Handle invalid format file Added files: test/fixture/xls/broken.xls Modified files: lib/chupa-text/decomposers/spreadsheet.rb test/helper.rb test/test-spreadsheet.rb Modified: lib/chupa-text/decomposers/spreadsheet.rb (+42 -14) =================================================================== --- lib/chupa-text/decomposers/spreadsheet.rb 2019-02-25 12:10:05 +0900 (1e3c2a3) +++ lib/chupa-text/decomposers/spreadsheet.rb 2019-03-01 09:46:54 +0900 (bde2f63) @@ -5,6 +5,8 @@ require "digest/sha1" module ChupaText module Decomposers class Spreadsheet < Decomposer + include Loggable + registry.register("spreadsheet", self) TARGET_EXTENSIONS = ["ods", "xls", "xlsx", "xlsm", "xml"] @@ -29,21 +31,47 @@ module ChupaText end def decompose(data) - book = Roo::Spreadsheet.open(data.path.to_s) - book.sheets.each do |sheet_name| - sheet = book.sheet(sheet_name) - body = sheet.to_csv - text_data = TextData.new(body, source_data: data) - text_data["name"] = sheet_name - text_data["digest"] = Digest::SHA1.hexdigest(body) - text_data["size"] = body.bytesize - text_data["first-row"] = sheet.first_row - text_data["last-row"] = sheet.last_row - text_data["first-column"] = sheet.first_column && sheet.first_column_as_letter - text_data["last-column"] = sheet.last_column && sheet.last_column_as_letter - yield text_data + open_book(data) do |book| + book.sheets.each do |sheet_name| + sheet = book.sheet(sheet_name) + body = sheet.to_csv + text_data = TextData.new(body, source_data: data) + text_data["name"] = sheet_name + text_data["digest"] = Digest::SHA1.hexdigest(body) + text_data["size"] = body.bytesize + text_data["first-row"] = sheet.first_row + text_data["last-row"] = sheet.last_row + text_data["first-column"] = sheet.first_column && sheet.first_column_as_letter + text_data["last-column"] = sheet.last_column && sheet.last_column_as_letter + yield text_data + end + end + end + + private + def open_book(data) + book = nil + begin + book = Roo::Spreadsheet.open(data.path.to_s) + rescue Ole::Storage::FormatError => format_error + error do + message = "#{log_tag} Invalid format: " + message << "#{format_error.class}: #{format_error.message}\n" + message << format_error.backtrace.join("\n") + message + end + return end - book.close + + begin + yield(book) + ensure + book.close + end + end + + def log_tag + "[decomposer][spreadsheet]" end end end Added: test/fixture/xls/broken.xls (+22 -0) 100644 =================================================================== --- /dev/null +++ test/fixture/xls/broken.xls 2019-03-01 09:46:54 +0900 (2e3fc1b) @@ -0,0 +1,22 @@ +MIME-Version: 1.0 +mime-type: application/vnd.ms-excel +uri: file:/tmp/C3k0Tj_broken.xls +path: /tmp/C3k0Tj_broken.xls +size: 7 +Content-Type: multipart/mixed; boundary=boundary + +--boundary +mime-type: text/plain +uri: file:/tmp/C3k0Tj_broken.txt +path: /tmp/C3k0Tj_broken.txt +size: 28 +created_time: 2019-03-01 00:47:17 UTC +source-mime-types: ["application/pdf", "application/vnd.ms-excel"] +creator: Calc +producer: LibreOffice 5.2 + +C3k0Tj_broken +Broken +Page 1 + +--boundary-- Modified: test/helper.rb (+29 -0) =================================================================== --- test/helper.rb 2019-02-25 12:10:05 +0900 (07d839f) +++ test/helper.rb 2019-03-01 09:46:54 +0900 (0648161) @@ -3,4 +3,33 @@ module Helper base_dir = File.expand_path(__dir__) File.join(base_dir, "fixture", *components) end + + class CaptureLogger + def initialize(output) + @output = output + end + + def error(message=nil) + @output << [:error, message || yield] + end + end + + def capture_log + original_logger = ChupaText.logger + begin + output = [] + ChupaText.logger = CaptureLogger.new(output) + yield + normalize_log(output) + ensure + ChupaText.logger = original_logger + end + end + + def normalize_log(log) + log.collect do |level, message| + message = message.split("\n", 2)[0] + [level, message] + end + end end Modified: test/test-spreadsheet.rb (+14 -0) =================================================================== --- test/test-spreadsheet.rb 2019-02-25 12:10:05 +0900 (97db9b7) +++ test/test-spreadsheet.rb 2019-03-01 09:46:54 +0900 (e59c34c) @@ -45,6 +45,20 @@ class TestSpreadsheet < Test::Unit::TestCase decompose("xls/multi-sheets.xls")) end + def test_xls_broken + log = capture_log do + assert_equal([], decompose("xls/broken.xls")) + end + assert_equal([ + [ + :error, + "[decomposer][spreadsheet] Invalid format: " + + "Ole::Storage::FormatError: OLE2 signature is invalid" + ], + ], + log) + end + def test_xlsx assert_equal([<<-SHEET1, <<-SHEET2, <<-SHEET3], "Sheet1 - A1","Sheet1 - B1" -------------- next part -------------- An HTML attachment was scrubbed... URL: <https://lists.osdn.me/mailman/archives/groonga-commit/attachments/20190301/9f10e3bd/attachment-0001.html>