diff --git a/README.md b/README.md index ecd93b9..0ce25b3 100644 --- a/README.md +++ b/README.md @@ -49,7 +49,7 @@ Consider two CSV files: name: Pancakes age: 2 -The `--key=id` option means that the `id` column should be treated as the unique key, to identify which records have changed. +The `--key=id` option means that the `id` column should be treated as the unique key, to identify which records have changed. To use a combination of columns as the key, separate them with a comma, e.g., `--key=id1,id2`. The tool will automatically detect if your files are comma- or tab-separated. You can over-ride this automatic detection and force the tool to use a specific format using `--format=tsv` or `--format=csv`. @@ -125,4 +125,4 @@ If the columns in the CSV have changed, those added or removed columns will be i Suppose current directory contains two csv files : one.csv two.csv - $ docker run --rm -v $(pwd):/files csvdiff one.csv two.csv \ No newline at end of file + $ docker run --rm -v $(pwd):/files csvdiff one.csv two.csv diff --git a/csv_diff/__init__.py b/csv_diff/__init__.py index 650966d..973f686 100644 --- a/csv_diff/__init__.py +++ b/csv_diff/__init__.py @@ -2,6 +2,7 @@ from dictdiffer import diff import json import hashlib +from operator import itemgetter def load_csv(fp, key=None, dialect=None): @@ -18,7 +19,7 @@ def load_csv(fp, key=None, dialect=None): headings = next(fp) rows = [dict(zip(headings, line)) for line in fp] if key: - keyfn = lambda r: r[key] + keyfn = itemgetter(*key.split(",")) else: keyfn = lambda r: hashlib.sha1( json.dumps(r, sort_keys=True).encode("utf8") @@ -33,7 +34,7 @@ def load_json(fp, key=None): for item in raw_list: common_keys.update(item.keys()) if key: - keyfn = lambda r: r[key] + keyfn = itemgetter(*key.split(",")) else: keyfn = lambda r: hashlib.sha1( json.dumps(r, sort_keys=True).encode("utf8") diff --git a/csv_diff/cli.py b/csv_diff/cli.py index 919c81e..0a9c5ff 100644 --- a/csv_diff/cli.py +++ b/csv_diff/cli.py @@ -14,7 +14,10 @@ type=click.Path(exists=True, file_okay=True, dir_okay=False, allow_dash=False), ) @click.option( - "--key", type=str, default=None, help="Column to use as a unique ID for each row" + "--key", + type=str, + default=None, + help="Column(s) to use as a unique ID for each row. To use multiple keys, separate them with a comma, e.g., key1,key2", ) @click.option( "--format", diff --git a/tests/test_csv_diff.py b/tests/test_csv_diff.py index 0e3670f..298355e 100644 --- a/tests/test_csv_diff.py +++ b/tests/test_csv_diff.py @@ -51,6 +51,20 @@ 1,Cleo,5 2,Pancakes,3""" +ELEVEN = """state,county,pop +CA,Yikes,100 +NY,Beep,200 +CA,Zoinks,100 +NY,Zoinks,200 +""" + +TWELVE = """state,county,pop +CA,Yikes,100 +NY,Beep,200 +CA,Zoinks,300 +NY,Zoinks,200 +""" + def test_row_changed(): diff = compare( @@ -115,3 +129,17 @@ def test_tsv(): "columns_added": [], "columns_removed": [], } == diff + + +def test_multikey(): + diff = compare( + load_csv(io.StringIO(ELEVEN), key="state,county"), + load_csv(io.StringIO(TWELVE), key="state,county"), + ) + assert { + "added": [], + "removed": [], + "changed": [{"key": ("CA", "Zoinks"), "changes": {"pop": ["100", "300"]}}], + "columns_added": [], + "columns_removed": [], + } == diff