• R/O
  • SSH

chkcsv: Commit

Default repository for chkcsv.py.


Commit MetaInfo

Revision: 156a6dcf855a5b4bc4e16d406bf02debcc02cc45 (tree)
Time: 2020-01-27 04:41:23
Author: Dreas Nielsen <dreas.nielsen@gmai...>
Committer: Dreas Nielsen

Log Message

Added option to check the order of columns in the CSV file.

Change Summary

Incremental Difference

diff -r a877f76095f7 -r 156a6dcf855a chkcsv/chkcsv.py
--- a/chkcsv/chkcsv.py Fri Jan 10 05:55:25 2020 -0800
+++ b/chkcsv/chkcsv.py Sun Jan 26 11:41:23 2020 -0800
@@ -39,12 +39,14 @@
3939 # 2011-09-25 First version. Version 0.8.0.0. RDN.
4040 # 2018-10-27 Converted to run under both Python 2 and 3. Version 1.0.0. RDN.
4141 # 2019-01-02 Corrected handling of next() for csv library. Version 1.0.1. RDN.
42-# 2018-01-04 Added check for data rows with more columns than column headers.
42+# 2019-01-04 Added check for data rows with more columns than column headers.
4343 # Version 1.1.0. RDN.
44+# 2020-01-26 Added an option to check that the order of columns in the CSV
45+# file is the same as in the specifications. RDN.
4446 # ============================================================================
4547
46-_version = "1.1.0"
47-_vdate = "2019-01-04"
48+_version = "1.2.0"
49+_vdate = "2020-01-26"
4850
4951 import sys
5052 from optparse import OptionParser
@@ -271,11 +273,12 @@
271273 def dispatch(self, check_funcs, data):
272274 errlist = [ f(data) for f in check_funcs ]
273275 return [ e for e in errlist if e ]
274- def __init__(self, fmt_spec, colname, column_required_default, data_required_default):
276+ def __init__(self, fmt_spec, colname, column_required_default, data_required_default, column_position):
275277 self.name = colname
276278 self.data_required = data_required_default
277279 # By default, all columns are required unless there is a specification indicating that it is not.
278280 self.column_required = column_required_default
281+ self.column_position = column_position
279282 specs = fmt_spec.options(colname)
280283 # Get the value for each option, using an appropriate function for each expected value type.
281284 for spec in specs:
@@ -350,6 +353,8 @@
350353 parser.add_option("-l", "--linelength", action="store_false", dest="linelength",
351354 default=True,
352355 help="Allow rows of the CSV file to have fewer columns than in the column headers. The default is to report an error for short data rows. If short data rows are allowed, any row without enough columns to match the format specification will still be reported as an error.")
356+ parser.add_option("-p", "--position", action="store_true", dest="position", default=False,
357+ help="Position (order) of columns in the CSV file must match that in the specification.")
353358 parser.add_option("-i", "--case-insensitive", action="store_true", dest="caseinsensitive",
354359 default=False,
355360 help="Case-insensitive matching of column names in the format configuration file and the CSV file. The default is case-sensitive (i.e., column names must match exactly).")
@@ -420,6 +425,7 @@
420425 :param column_required: Whether or not the column must be in the CSV file to be checked.
421426 :param data_required: Whether or not a data value is required on every row of the CSV file.
422427 :param chkopts: The name of a section in the format specification file containing additional options.
428+ :rtype: A dictionary of column type checking functions, indexed by column name.
423429 """
424430 fmtspecs = ConfigParser()
425431 try:
@@ -431,13 +437,13 @@
431437 # Convert ConfigParser object into a list of CsvChecker objects
432438 speccols = [ sect for sect in fmtspecs.sections() if sect != chkopts ]
433439 cols = {}
434- for col in speccols:
435- cols[col] = CsvChecker(fmtspecs, col, column_required, data_required)
440+ for i, col in enumerate(speccols):
441+ cols[col] = CsvChecker(fmtspecs, col, column_required, data_required, i)
436442 return cols
437443
438444
439445 def check_csv_file(csv_fname, cols, halt_on_err, columnexit, \
440- linelength, caseinsensitive, encoding=None):
446+ linelength, caseinsensitive, encoding=None, match_position=False):
441447 """Check that all of the required columns and data are present in the CSV file, and that
442448 the data conform to the appropriate type and other specifications.
443449
@@ -448,6 +454,8 @@
448454 :param linelength: Whether to report an error if any data row has a different number of items than indicated by the column headers.
449455 :param caseinsensitive: Whether column names in the specifications and CSV file should be compared case-insensitively.
450456 :param encoding: The character encoding of the CSV file.
457+ :param match_position: Whether or not the position (order) of columns in the CSV file must match that in the specifications.
458+ :rtype: A list of error messages as strings.
451459 """
452460 errorlist = []
453461 dialect = csv.Sniffer().sniff(open(csv_fname, "rt").readline())
@@ -467,8 +475,8 @@
467475 if len(req_missing) > 0:
468476 errorlist.append(("The following columns are required, but are not present in the CSV file: %s." % ", ".join(req_missing), csv_fname, 1))
469477 return errorlist
470- # Exit if there are extra columns and the option to exit is set.
471- if columnexit:
478+ # Exit if there are extra columns and either the option to exit is set or the column positions must match.
479+ if columnexit or match_position:
472480 if caseinsensitive:
473481 speccols_l = [ c.lower() for c in cols ]
474482 extra = [ col for col in colnames if not (col.lower() in speccols_l) ]
@@ -477,6 +485,24 @@
477485 if len(extra) > 0:
478486 errorlist.append(("The following columns have no format specifications but are in the CSV file: %s." % u", ".join(extra), csv_fname, 1))
479487 return errorlist
488+ # Report an error if the position (order) of columns is required to be the same and it is not.
489+ if match_position:
490+ spec_col_order = [c[0] for c in sorted([(c[1].name, c[1].column_position) for c in cols.items()], key=lambda p: p[1])]
491+ same_order = True
492+ if caseinsensitive:
493+ colnames_l = [ c.lower() for c in colnames ]
494+ specnames_l = [ c.lower() for c in spec_col_order ]
495+ for i, cname in enumerate(colnames_l):
496+ if specnames_l[i] != cname:
497+ same_order = False
498+ break
499+ else:
500+ for i, cname in enumerate(colnames):
501+ if spec_col_order[i] != cname:
502+ same_order = False
503+ break
504+ if not same_order:
505+ errorlist.append(("The order of columns in the CSV file is not the same as in the specifications", csv_fname, 1))
480506 # Column names common to specifications and data file. These will be used
481507 # to index the cols dictionary to get the appropriate check method
482508 # and to index the CSV column name list (colnames) to get the column position.
@@ -549,7 +575,7 @@
549575 cols = read_format_specs(fmt_file, opts.column_required, opts.data_required, chkopts)
550576 # Check the file
551577 errorlist = check_csv_file(csv_file, cols, opts.haltonerror,
552- opts.columnexit, opts.linelength, opts.caseinsensitive, opts.encoding)
578+ opts.columnexit, opts.linelength, opts.caseinsensitive, opts.encoding, opts.position)
553579 if len(errorlist) > 0:
554580 show_errors(errorlist)
555581 return 1
diff -r a877f76095f7 -r 156a6dcf855a doc/source/conf.py
--- a/doc/source/conf.py Fri Jan 10 05:55:25 2020 -0800
+++ b/doc/source/conf.py Sun Jan 26 11:41:23 2020 -0800
@@ -48,7 +48,7 @@
4848
4949 # General information about the project.
5050 project = u'chkcsv'
51-copyright = u'2011-2019, Dreas Nielsen'
51+copyright = u'2011-2020, Dreas Nielsen'
5252 author = u'Dreas Nielsen'
5353
5454 # The version info for the project you're documenting, acts as replacement for
@@ -56,9 +56,9 @@
5656 # built documents.
5757 #
5858 # The short X.Y version.
59-version = u'1.1'
59+version = u'1.2'
6060 # The full version, including alpha/beta/rc tags.
61-release = u'1.1.0'
61+release = u'1.2.0'
6262
6363 # The language for content autogenerated by Sphinx. Refer to documentation
6464 # for a list of supported languages.
diff -r a877f76095f7 -r 156a6dcf855a doc/source/index.rst
--- a/doc/source/index.rst Fri Jan 10 05:55:25 2020 -0800
+++ b/doc/source/index.rst Sun Jan 26 11:41:23 2020 -0800
@@ -6,7 +6,7 @@
66 ``chkcsv.py`` is a Python module and program that checks the format
77 and content of a comma-separated-value (CSV) or similar delimited text
88 file. It can check whether required columns are present, and the type,
9-length, and pattern of each column.
9+length, pattern, and order of each column.
1010
1111
1212 Syntax and Options
@@ -66,6 +66,8 @@
6666 data rows are allowed, any row without enough
6767 columns to match the format specification will
6868 still be reported as an error.
69+ -p, --position Require that the position (order) of columns in the
70+ CSV file match that in the specifications.
6971 -i, --case-insensitive
7072 Case-insensitive matching of column names in
7173 the format configuration file and the CSV file.
@@ -332,7 +334,7 @@
332334 Copyright and License
333335 ================================
334336
335-Copyright (c) 2011-2019, R.Dreas Nielsen
337+Copyright (c) 2011-2020, R.Dreas Nielsen
336338
337339 This program is free software: you can redistribute it and/or modify it
338340 under the terms of the GNU General Public License as published by the
diff -r a877f76095f7 -r 156a6dcf855a setup.py
--- a/setup.py Fri Jan 10 05:55:25 2020 -0800
+++ b/setup.py Sun Jan 26 11:41:23 2020 -0800
@@ -1,7 +1,7 @@
11 from distutils.core import setup
22
33 setup(name='chkcsv',
4- version='1.1.0',
4+ version='1.2.0',
55 description="Checks the format of a CSV file with respect to a specifed set of column names and types.",
66 author='Dreas Nielsen',
77 author_email='dreas.nielsen@gmail.com',
@@ -26,8 +26,9 @@
2626 ],
2727 long_description="""``chkcsv.py`` is a Python module and program
2828 that checks the format of data in a CSV file. It can check whether required
29-columns and data are present, and the type of data in each column. Pattern
30-matching using regular expressions is supported.
29+columns and data are present, check whether the type of data in each column
30+matches the specifications, and check whether columns are in a specified
31+order. Pattern matching using regular expressions is supported.
3132
3233 Complete documentation is at http://chkcsv.osdn.io/."""
3334 )
Show on old repository browser