Skip to content

Commit

Permalink
Optimize the Spark and Dask `take` functions
Browse files Browse the repository at this point in the history
  • Loading branch information
goodwanghan authored Jan 6, 2024
1 parent 29f105d commit cc1dbe7
Show file tree
Hide file tree
Showing 2 changed files with 8 additions and 0 deletions.
5 changes: 5 additions & 0 deletions fugue_dask/execution_engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -506,6 +506,11 @@ def _partition_take(partition, n, presort):
).head(n)

else:
if len(_presort.keys()) == 0 and n == 1:
return d.drop_duplicates(
subset=partition_spec.partition_by, ignore_index=True, keep="first"
)

d = (
d.groupby(partition_spec.partition_by, dropna=False)
.apply(_partition_take, n=n, presort=_presort, meta=meta)
Expand Down
3 changes: 3 additions & 0 deletions fugue_spark/execution_engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -674,6 +674,9 @@ def _presort_to_col(_col: str, _asc: bool) -> Any:

# If partition exists
else:
if len(_presort.keys()) == 0 and n == 1:
return d.dropDuplicates(subset=partition_spec.partition_by)

w = Window.partitionBy([col(x) for x in partition_spec.partition_by])

if len(_presort.keys()) > 0:
Expand Down

0 comments on commit cc1dbe7

Please sign in to comment.