Index.drop_duplicates() (#458)

dask · Dec 5, 2023 · commit 55b5b54
1 parent e323341
commit 55b5b54
Show file tree

Hide file tree

Showing 4 changed files with 25 additions and 15 deletions.
diff --git a/.gitignore b/.gitignore
@@ -7,3 +7,5 @@ bench/shakespeare.txt
 *.sw?
 .DS_STORE
 \.tox/
+.idea/
+.ipynb_checkpoints/
diff --git a/dask_expr/_reductions.py b/dask_expr/_reductions.py
@@ -207,6 +207,8 @@ def _lower(self):
         # Convert back to Series if necessary
         if is_series_like(self._meta):
             shuffled = shuffled[shuffled.columns[0]]
+        elif is_index_like(self._meta):
+            shuffled = shuffled.index
 
         # Blockwise aggregate
         result = Aggregate(
@@ -499,16 +501,14 @@ def split_by(self):
     def _meta(self):
         return self.chunk(meta_nonempty(self.frame._meta), **self.chunk_kwargs)
 
-    def _subset_kwargs(self):
-        if is_series_like(self.frame._meta):
-            return {}
-        return {"subset": self.subset}
-
     @property
     def chunk_kwargs(self):
-        if PANDAS_GE_200:
-            return {"ignore_index": self.ignore_index, **self._subset_kwargs()}
-        return self._subset_kwargs()
+        out = {}
+        if is_dataframe_like(self.frame._meta):
+            out["subset"] = self.subset
+        if PANDAS_GE_200 and not is_index_like(self.frame._meta):
+            out["ignore_index"] = self.ignore_index
+        return out
 
     def _simplify_up(self, parent):
         if self.subset is not None and isinstance(parent, Projection):

diff --git a/dask_expr/_shuffle.py b/dask_expr/_shuffle.py
@@ -1034,6 +1034,10 @@ def _lower(self):
         )
         return SortIndexBlockwise(index_set)
 
+    @property
+    def npartitions(self):
+        return len(self.new_divisions)
+
 
 class _SetPartitionsPreSetIndex(Blockwise):
     _parameters = ["frame", "new_divisions", "ascending", "na_position"]

diff --git a/dask_expr/tests/test_collection.py b/dask_expr/tests/test_collection.py
@@ -24,7 +24,7 @@
 @pytest.fixture
 def pdf():
     pdf = lib.DataFrame({"x": range(100)})
-    pdf["y"] = pdf.x * 10.0
+    pdf["y"] = pdf.x // 7  # Not unique; duplicates span different partitions
     yield pdf
 
 
@@ -949,17 +949,21 @@ def test_drop_duplicates(df, pdf, split_out):
         pdf.drop_duplicates(ignore_index=True),
         check_index=split_out is not True,
     )
-    assert_eq(
-        df.drop_duplicates(subset=["x"], split_out=split_out),
-        pdf.drop_duplicates(subset=["x"]),
-    )
     assert_eq(
         df.drop_duplicates(subset=["y"], split_out=split_out),
         pdf.drop_duplicates(subset=["y"]),
     )
     assert_eq(
-        df.x.drop_duplicates(split_out=split_out),
-        pdf.x.drop_duplicates(),
+        df.y.drop_duplicates(split_out=split_out),
+        pdf.y.drop_duplicates(),
+    )
+
+    actual = df.set_index("y").index.drop_duplicates(split_out=split_out)
+    if split_out is True:
+        actual = actual.compute().sort_values()  # shuffle is unordered
+    assert_eq(
+        actual,
+        pdf.set_index("y").index.drop_duplicates(),
     )
 
     with pytest.raises(KeyError, match="'a'"):
diff --git a/.gitignore b/.gitignore
@@ -7,3 +7,5 @@ bench/shakespeare.txt
 *.sw?
 .DS_STORE
 \.tox/
+.idea/
+.ipynb_checkpoints/