[Groonga-commit] ranguba/chupa-text-decomposer-spreadsheet at b16d07e [master] Handle invalid format file

Back to archive index
Kouhei Sutou null+****@clear*****
Fri Mar 1 09:46:54 JST 2019


Kouhei Sutou	2019-03-01 09:46:54 +0900 (Fri, 01 Mar 2019)

  Revision: b16d07e37aab4c68cbe928a96bd704e38dc9e7fe
  https://github.com/ranguba/chupa-text-decomposer-spreadsheet/commit/b16d07e37aab4c68cbe928a96bd704e38dc9e7fe

  Message:
    Handle invalid format file

  Added files:
    test/fixture/xls/broken.xls
  Modified files:
    lib/chupa-text/decomposers/spreadsheet.rb
    test/helper.rb
    test/test-spreadsheet.rb

  Modified: lib/chupa-text/decomposers/spreadsheet.rb (+42 -14)
===================================================================
--- lib/chupa-text/decomposers/spreadsheet.rb    2019-02-25 12:10:05 +0900 (1e3c2a3)
+++ lib/chupa-text/decomposers/spreadsheet.rb    2019-03-01 09:46:54 +0900 (bde2f63)
@@ -5,6 +5,8 @@ require "digest/sha1"
 module ChupaText
   module Decomposers
     class Spreadsheet < Decomposer
+      include Loggable
+
       registry.register("spreadsheet", self)
 
       TARGET_EXTENSIONS = ["ods", "xls", "xlsx", "xlsm", "xml"]
@@ -29,21 +31,47 @@ module ChupaText
       end
 
       def decompose(data)
-        book = Roo::Spreadsheet.open(data.path.to_s)
-        book.sheets.each do |sheet_name|
-          sheet = book.sheet(sheet_name)
-          body = sheet.to_csv
-          text_data = TextData.new(body, source_data: data)
-          text_data["name"] = sheet_name
-          text_data["digest"] = Digest::SHA1.hexdigest(body)
-          text_data["size"] = body.bytesize
-          text_data["first-row"] = sheet.first_row
-          text_data["last-row"] = sheet.last_row
-          text_data["first-column"] = sheet.first_column && sheet.first_column_as_letter
-          text_data["last-column"] = sheet.last_column && sheet.last_column_as_letter
-          yield text_data
+        open_book(data) do |book|
+          book.sheets.each do |sheet_name|
+            sheet = book.sheet(sheet_name)
+            body = sheet.to_csv
+            text_data = TextData.new(body, source_data: data)
+            text_data["name"] = sheet_name
+            text_data["digest"] = Digest::SHA1.hexdigest(body)
+            text_data["size"] = body.bytesize
+            text_data["first-row"] = sheet.first_row
+            text_data["last-row"] = sheet.last_row
+            text_data["first-column"] = sheet.first_column && sheet.first_column_as_letter
+            text_data["last-column"] = sheet.last_column && sheet.last_column_as_letter
+            yield text_data
+          end
+        end
+      end
+
+      private
+      def open_book(data)
+        book = nil
+        begin
+          book = Roo::Spreadsheet.open(data.path.to_s)
+        rescue Ole::Storage::FormatError => format_error
+          error do
+            message = "#{log_tag} Invalid format: "
+            message << "#{format_error.class}: #{format_error.message}\n"
+            message << format_error.backtrace.join("\n")
+            message
+          end
+          return
         end
-        book.close
+
+        begin
+          yield(book)
+        ensure
+          book.close
+        end
+      end
+
+      def log_tag
+        "[decomposer][spreadsheet]"
       end
     end
   end

  Added: test/fixture/xls/broken.xls (+22 -0) 100644
===================================================================
--- /dev/null
+++ test/fixture/xls/broken.xls    2019-03-01 09:46:54 +0900 (2e3fc1b)
@@ -0,0 +1,22 @@
+MIME-Version: 1.0
+mime-type: application/vnd.ms-excel
+uri: file:/tmp/C3k0Tj_broken.xls
+path: /tmp/C3k0Tj_broken.xls
+size: 7
+Content-Type: multipart/mixed; boundary=boundary
+
+--boundary
+mime-type: text/plain
+uri: file:/tmp/C3k0Tj_broken.txt
+path: /tmp/C3k0Tj_broken.txt
+size: 28
+created_time: 2019-03-01 00:47:17 UTC
+source-mime-types: ["application/pdf", "application/vnd.ms-excel"]
+creator: Calc
+producer: LibreOffice 5.2
+
+C3k0Tj_broken
+Broken
+Page 1
+
+--boundary--

  Modified: test/helper.rb (+29 -0)
===================================================================
--- test/helper.rb    2019-02-25 12:10:05 +0900 (07d839f)
+++ test/helper.rb    2019-03-01 09:46:54 +0900 (0648161)
@@ -3,4 +3,33 @@ module Helper
     base_dir = File.expand_path(__dir__)
     File.join(base_dir, "fixture", *components)
   end
+
+  class CaptureLogger
+    def initialize(output)
+      @output = output
+    end
+
+    def error(message=nil)
+      @output << [:error, message || yield]
+    end
+  end
+
+  def capture_log
+    original_logger = ChupaText.logger
+    begin
+      output = []
+      ChupaText.logger = CaptureLogger.new(output)
+      yield
+      normalize_log(output)
+    ensure
+      ChupaText.logger = original_logger
+    end
+  end
+
+  def normalize_log(log)
+    log.collect do |level, message|
+      message = message.split("\n", 2)[0]
+      [level, message]
+    end
+  end
 end

  Modified: test/test-spreadsheet.rb (+14 -0)
===================================================================
--- test/test-spreadsheet.rb    2019-02-25 12:10:05 +0900 (97db9b7)
+++ test/test-spreadsheet.rb    2019-03-01 09:46:54 +0900 (e59c34c)
@@ -45,6 +45,20 @@ class TestSpreadsheet < Test::Unit::TestCase
                    decompose("xls/multi-sheets.xls"))
     end
 
+    def test_xls_broken
+      log = capture_log do
+        assert_equal([], decompose("xls/broken.xls"))
+      end
+      assert_equal([
+                     [
+                       :error,
+                       "[decomposer][spreadsheet] Invalid format: " +
+                       "Ole::Storage::FormatError: OLE2 signature is invalid"
+                     ],
+                   ],
+                   log)
+    end
+
     def test_xlsx
       assert_equal([<<-SHEET1, <<-SHEET2, <<-SHEET3],
 "Sheet1 - A1","Sheet1 - B1"
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <https://lists.osdn.me/mailman/archives/groonga-commit/attachments/20190301/9f10e3bd/attachment-0001.html>


More information about the Groonga-commit mailing list
Back to archive index