simonw · jsvine · Apr 2, 2021
diff --git a/README.md b/README.md
@@ -49,7 +49,7 @@ Consider two CSV files:
       name: Pancakes
       age: 2
 
-The `--key=id` option means that the `id` column should be treated as the unique key, to identify which records have changed.
+The `--key=id` option means that the `id` column should be treated as the unique key, to identify which records have changed. To use a combination of columns as a composite key, list the column names separated by commas with no spaces around them, e.g., `--key=id1,id2`.
 
 The tool will automatically detect if your files are comma- or tab-separated. You can over-ride this automatic detection and force the tool to use a specific format using `--format=tsv` or `--format=csv`.
 
@@ -125,4 +125,4 @@ If the columns in the CSV have changed, those added or removed columns will be i
 
 Suppose current directory contains two csv files : one.csv two.csv
 
-    $ docker run --rm -v $(pwd):/files csvdiff one.csv two.csv
+    $ docker run --rm -v $(pwd):/files csvdiff one.csv two.csv
diff --git a/csv_diff/__init__.py b/csv_diff/__init__.py
@@ -2,6 +2,7 @@
 from dictdiffer import diff
 import json
 import hashlib
+from operator import itemgetter
 
 
 def load_csv(fp, key=None, dialect=None):
@@ -18,7 +19,7 @@ def load_csv(fp, key=None, dialect=None):
     headings = next(fp)
     rows = [dict(zip(headings, line)) for line in fp]
     if key:
-        keyfn = lambda r: r[key]
+        keyfn = itemgetter(*key.split(","))
     else:
         keyfn = lambda r: hashlib.sha1(
             json.dumps(r, sort_keys=True).encode("utf8")
@@ -33,7 +34,7 @@ def load_json(fp, key=None):
     for item in raw_list:
         common_keys.update(item.keys())
     if key:
-        keyfn = lambda r: r[key]
+        keyfn = itemgetter(*key.split(","))
     else:
         keyfn = lambda r: hashlib.sha1(
             json.dumps(r, sort_keys=True).encode("utf8")

diff --git a/csv_diff/cli.py b/csv_diff/cli.py
@@ -14,7 +14,10 @@
     type=click.Path(exists=True, file_okay=True, dir_okay=False, allow_dash=False),
 )
 @click.option(
-    "--key", type=str, default=None, help="Column to use as a unique ID for each row"
+    "--key",
+    type=str,
+    default=None,
+    help="Column(s) to use as a unique ID for each row. To use multiple keys, separate them with a comma, e.g., key1,key2",
 )
 @click.option(
     "--format",

diff --git a/tests/test_csv_diff.py b/tests/test_csv_diff.py
@@ -51,6 +51,20 @@
 1,Cleo,5
 2,Pancakes,3"""
 
+ELEVEN = """state,county,pop
+CA,Yikes,100
+NY,Beep,200
+CA,Zoinks,100
+NY,Zoinks,200
+"""
+
+TWELVE = """state,county,pop
+CA,Yikes,100
+NY,Beep,200
+CA,Zoinks,300
+NY,Zoinks,200
+"""
+
 
 def test_row_changed():
     diff = compare(
@@ -115,3 +129,17 @@ def test_tsv():
         "columns_added": [],
         "columns_removed": [],
     } == diff
+
+
+def test_multikey():  # rows are identified by the (state, county) pair instead of a single column
+    diff = compare(
+        load_csv(io.StringIO(ELEVEN), key="state,county"),  # comma-separated list of key columns
+        load_csv(io.StringIO(TWELVE), key="state,county"),
+    )
+    assert {  # the fixtures differ only in the CA/Zoinks row, whose pop goes from 100 to 300
+        "added": [],
+        "removed": [],
+        "changed": [{"key": ("CA", "Zoinks"), "changes": {"pop": ["100", "300"]}}],  # composite key is reported as a tuple
+        "columns_added": [],
+        "columns_removed": [],
+    } == diff