Refactor and update tests after changes in dask-expr

dask · Mar 4, 2024 · 98801b5 · 98801b5
1 parent 6a697c6
commit 98801b5
Show file tree

Hide file tree

Showing 8 changed files with 29 additions and 76 deletions.
diff --git a/dask_ml/preprocessing/data.py b/dask_ml/preprocessing/data.py
@@ -818,7 +818,6 @@ def inverse_transform(self, X: Union[ArrayLike, DataFrameType]) -> DataFrameType
                 # Bug in pandas <= 0.20.3 lost name
                 if series.name is None:
                     series.name = col
-                series.divisions = X.divisions
             else:
                 # pandas
                 series = pd.Series(
@@ -1031,7 +1030,6 @@ def inverse_transform(
                 # Bug in pandas <= 0.20.3 lost name
                 if series.name is None:
                     series.name = col
-                series.divisions = X.divisions
             else:
                 # pandas
                 series = pd.Series(

diff --git a/dask_ml/preprocessing/label.py b/dask_ml/preprocessing/label.py
@@ -102,7 +102,7 @@ def _check_array(self, y: Union[ArrayLike, SeriesType]):
                 y = np.asarray(y)
 
         if isinstance(y, dd.Series):
-            if pd.api.types.is_categorical_dtype(y):
+            if isinstance(y.dtype, pd.CategoricalDtype):
                 # TODO(dask-3784): just call y.cat.as_known()
                 # https://github.com/dask/dask/issues/3784
                 if not y.cat.known:

diff --git a/tests/ensemble/test_blockwise.py b/tests/ensemble/test_blockwise.py
@@ -61,9 +61,6 @@ def test_bad_chunking_raises(self):
             # this should *really* be a ValueError...
             clf.fit(X, y)
 
-    @pytest.mark.xfail(
-        DASK_EXPR_ENABLED, reason="dask-expr computing early into np.ndarray"
-    )
     def test_hard_voting_frame(self):
         X, y = dask_ml.datasets.make_classification(chunks=25)
         X = dd.from_dask_array(X)
@@ -133,7 +130,7 @@ def test_soft_voting_array(self):
 
     @pytest.mark.xfail(
         DASK_EXPR_ENABLED,
-        reason="AttributeError: 'Scalar' object has no attribute '_chunks'",
+        reason="AssertionError da.utils.assert_eq(result, result2)",
     )
     def test_soft_voting_frame(self):
         X, y = dask_ml.datasets.make_classification(chunks=25)

diff --git a/tests/model_selection/test_incremental.py b/tests/model_selection/test_incremental.py
@@ -37,7 +37,6 @@
 from dask_ml.model_selection._incremental import _partial_fit, _score, fit
 from dask_ml.model_selection.utils_test import LinearFunction, _MaybeLinearFunction
 from dask_ml.utils import ConstantFunction
-from tests.conftest import DASK_EXPR_ENABLED
 
 pytestmark = [
     pytest.mark.skipif(not DISTRIBUTED_2_5_0, reason="hangs"),
@@ -230,9 +229,6 @@ def additional_calls(scores):
         await asyncio.sleep(0.1)
 
 
-@pytest.mark.xfail(
-    DASK_EXPR_ENABLED, reason="TypeError: 'coroutine' object is not iterable"
-)
 @gen_cluster(client=True)
 async def test_search_basic(c, s, a, b):
     for decay_rate, input_type, memory in itertools.product(

diff --git a/tests/preprocessing/test_data.py b/tests/preprocessing/test_data.py
@@ -96,10 +96,6 @@ def test_input_types(self, dask_df, pandas_df):
             exclude="n_samples_seen_",
         )
 
-    @pytest.mark.xfail(
-        DASK_EXPR_ENABLED,
-        reason="AttributeError: can't set attribute 'divisions'",
-    )
     def test_inverse_transform(self):
         a = dpp.StandardScaler()
         result = a.inverse_transform(a.fit_transform(X))
@@ -437,10 +433,6 @@ def test_encode_subset_of_columns(self, daskify):
 
         tm.assert_frame_equal(result, df)
 
-    @pytest.mark.xfail(
-        DASK_EXPR_ENABLED,
-        reason="AttributeError: can't set attribute 'divisions'",
-    )
     @pytest.mark.parametrize("daskify", [False, True])
     def test_drop_first(self, daskify):
         if daskify:
@@ -479,8 +471,9 @@ def test_transform_raises(self):
             de.transform(dummy.drop("B", axis="columns"))
         assert rec.match("Columns of 'X' do not match the training")
 
-    @pytest.mark.skip(
-        reason='AssertionError: Attributes of DataFrame.iloc[:, 0] (column name="A") are different'
+    @pytest.mark.xfail(
+        DASK_EXPR_ENABLED,
+        reason="Attribute 'dtype' are different; int64 vs pyarrow[string]",
     )
     def test_inverse_transform(self):
         de = dpp.DummyEncoder()
@@ -492,14 +485,12 @@ def test_inverse_transform(self):
         )
         de.fit(df)
         assert_eq_df(df, de.inverse_transform(de.transform(df)))
+
+        # This fails w/ dask-expr: the dtype of col A differs (int64 vs pyarrow[string])
         assert_eq_df(df, de.inverse_transform(de.transform(df).values))
 
 
 class TestOrdinalEncoder:
-    @pytest.mark.xfail(
-        DASK_EXPR_ENABLED,
-        reason="AttributeError: can't set attribute 'divisions'",
-    )
     @pytest.mark.parametrize("daskify", [False, True])
     @pytest.mark.parametrize("values", [True, False])
     def test_basic(self, daskify, values):
@@ -544,10 +535,6 @@ def test_transform_raises(self):
             de.transform(dummy.drop("B", axis="columns"))
         assert rec.match("Columns of 'X' do not match the training")
 
-    @pytest.mark.xfail(
-        DASK_EXPR_ENABLED,
-        reason="AttributeError: can't set attribute 'divisions'",
-    )
     def test_inverse_transform(self):
         enc = dpp.OrdinalEncoder()
         df = dd.from_pandas(
@@ -635,10 +622,6 @@ def test_transformed_shape(self):
         # dask array with nan rows
         assert a.transform(X_nan_rows).shape[1] == n_cols
 
-    @pytest.mark.xfail(
-        DASK_EXPR_ENABLED,
-        reason="TypeError: No dispatch for <class 'dask_expr._collection.Scalar'>",
-    )
     @pytest.mark.parametrize("daskify", [False, True])
     def test_df_transform(self, daskify):
         frame = df
@@ -667,11 +650,12 @@ def test_transformer_params(self):
         assert pf._transformer.interaction_only is pf.interaction_only
         assert pf._transformer.include_bias is pf.include_bias
 
-    @pytest.mark.xfail(
+    mark = pytest.mark.xfail(
         DASK_EXPR_ENABLED,
-        reason="TypeError: No dispatch for <class 'dask_expr._collection.Scalar'>",
+        reason="dask-expr: NotImplementedError in assert_eq_df(res_df.iloc[:, 1:], frame, check_dtype=False)",
     )
-    @pytest.mark.parametrize("daskify", [True, False])
+
+    @pytest.mark.parametrize("daskify", [pytest.param(True, marks=mark), False])
     def test_df_transform_index(self, daskify):
         frame = copy(df)
         if not daskify:

diff --git a/tests/preprocessing/test_label.py b/tests/preprocessing/test_label.py
@@ -102,9 +102,6 @@ def test_fit_transform_categorical(self):
         assert result.dtype == "int8"
         assert result.dtype == result.compute().dtype
 
-    @pytest.mark.skip(
-        reason="DeprecationWarning: is_categorical_dtype is deprecated and will be removed in a future version."
-    )
     @pytest.mark.parametrize("array", [y, s])
     def test_inverse_transform(self, array):
         a = dpp.LabelEncoder()

diff --git a/tests/test_parallel_post_fit.py b/tests/test_parallel_post_fit.py
@@ -54,10 +54,6 @@ def test_laziness():
     assert 0 < x.compute() < 1
 
 
-@pytest.mark.xfail(
-    DASK_EXPR_ENABLED,
-    reason="AttributeError: 'MapPartitions' object has no attribute 'shape' / AttributeError: can't set attribute '_meta'",
-)
 def test_predict_meta_override():
     X = pd.DataFrame({"c_0": [1, 2, 3, 4]})
     y = np.array([1, 2, 3, 4])
@@ -66,13 +62,16 @@ def test_predict_meta_override():
     base.fit(pd.DataFrame(X), y)
 
     dd_X = dd.from_pandas(X, npartitions=2)
-    dd_X._meta = pd.DataFrame({"c_0": [5]})
 
-    # Failure when not proving predict_meta
-    # because of value dependent model
-    wrap = ParallelPostFit(base)
-    with pytest.raises(ValueError):
-        wrap.predict(dd_X)
+    if not DASK_EXPR_ENABLED:
+        # dask-expr cannot set _meta
+        dd_X._meta = pd.DataFrame({"c_0": [5]})
+
+        # Failure when not providing predict_meta
+        # because of the value-dependent model
+        wrap = ParallelPostFit(base)
+        with pytest.raises(ValueError):
+            wrap.predict(dd_X)
 
     # Success when providing meta over-ride
     wrap = ParallelPostFit(base, predict_meta=np.array([1]))
@@ -81,10 +80,6 @@ def test_predict_meta_override():
     assert_eq_ar(result, expected)
 
 
-@pytest.mark.xfail(
-    DASK_EXPR_ENABLED,
-    reason="AttributeError: 'MapPartitions' object has no attribute 'shape'",
-)
 def test_predict_proba_meta_override():
     X = pd.DataFrame({"c_0": [1, 2, 3, 4]})
     y = np.array([1, 2, 3, 4])
@@ -93,13 +88,16 @@ def test_predict_proba_meta_override():
     base.fit(pd.DataFrame(X), y)
 
     dd_X = dd.from_pandas(X, npartitions=2)
-    dd_X._meta = pd.DataFrame({"c_0": [5]})
 
-    # Failure when not proving predict_proba_meta
-    # because of value dependent model
-    wrap = ParallelPostFit(base)
-    with pytest.raises(ValueError):
-        wrap.predict_proba(dd_X)
+    if not DASK_EXPR_ENABLED:
+        # dask-expr cannot set _meta
+        dd_X._meta = pd.DataFrame({"c_0": [5]})
+
+        # Failure when not providing predict_proba_meta
+        # because of the value-dependent model
+        wrap = ParallelPostFit(base)
+        with pytest.raises(ValueError):
+            wrap.predict_proba(dd_X)
 
     # Success when providing meta over-ride
     wrap = ParallelPostFit(base, predict_proba_meta=np.array([[0.0, 0.1, 0.8, 0.1]]))
@@ -108,10 +106,6 @@ def test_predict_proba_meta_override():
     assert_eq_ar(result, expected)
 
 
-@pytest.mark.xfail(
-    DASK_EXPR_ENABLED,
-    reason="AttributeError: 'Scalar' object has no attribute 'shape'",
-)
 def test_transform_meta_override():
     X = pd.DataFrame({"cat_s": ["a", "b", "c", "d"]})
     dd_X = dd.from_pandas(X, npartitions=2)
@@ -148,10 +142,6 @@ def test_predict_correct_output_dtype():
     assert wrap_output.dtype == base_output.dtype
 
 
-@pytest.mark.xfail(
-    DASK_EXPR_ENABLED,
-    reason="AttributeError: 'MapPartitions' object has no attribute 'shape'",
-)
 @pytest.mark.parametrize("kind", ["numpy", "dask.dataframe", "dask.array"])
 def test_predict(kind):
     X, y = make_classification(chunks=100)
@@ -185,10 +175,6 @@ def test_predict(kind):
     assert_eq_ar(result, expected)
 
 
-@pytest.mark.xfail(
-    DASK_EXPR_ENABLED,
-    reason="AttributeError: 'MapPartitions' object has no attribute 'shape'",
-)
 @pytest.mark.parametrize("kind", ["numpy", "dask.dataframe", "dask.array"])
 def test_transform(kind):
     X, y = make_classification(chunks=100)

diff --git a/tests/test_partial.py b/tests/test_partial.py
@@ -14,7 +14,6 @@
 from dask_ml._partial import fit, predict
 from dask_ml.datasets import make_classification
 from dask_ml.wrappers import Incremental
-from tests.conftest import DASK_EXPR_ENABLED
 
 x = np.array([[1, 0], [2, 0], [3, 0], [4, 0], [0, 1], [0, 2], [3, 3], [4, 4]])
 
@@ -90,10 +89,6 @@ def test_fit_shuffle_blocks():
         )
 
 
-@pytest.mark.xfail(
-    DASK_EXPR_ENABLED,
-    reason="AttributeError: 'Scalar' object has no attribute 'shape'",
-)
 def test_dataframes():
     df = pd.DataFrame({"x": range(10), "y": [0, 1] * 5})
     ddf = dd.from_pandas(df, npartitions=2)