Skip to content

Commit

Permalink
Refactor and update tests after changes in dask-expr
Browse files Browse the repository at this point in the history
  • Loading branch information
milesgranger committed Mar 4, 2024
1 parent 6a697c6 commit 98801b5
Show file tree
Hide file tree
Showing 8 changed files with 29 additions and 76 deletions.
2 changes: 0 additions & 2 deletions dask_ml/preprocessing/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -818,7 +818,6 @@ def inverse_transform(self, X: Union[ArrayLike, DataFrameType]) -> DataFrameType
# Bug in pandas <= 0.20.3 lost name
if series.name is None:
series.name = col
series.divisions = X.divisions
else:
# pandas
series = pd.Series(
Expand Down Expand Up @@ -1031,7 +1030,6 @@ def inverse_transform(
# Bug in pandas <= 0.20.3 lost name
if series.name is None:
series.name = col
series.divisions = X.divisions
else:
# pandas
series = pd.Series(
Expand Down
2 changes: 1 addition & 1 deletion dask_ml/preprocessing/label.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,7 +102,7 @@ def _check_array(self, y: Union[ArrayLike, SeriesType]):
y = np.asarray(y)

if isinstance(y, dd.Series):
if pd.api.types.is_categorical_dtype(y):
if isinstance(y.dtype, pd.CategoricalDtype):
# TODO(dask-3784): just call y.cat.as_known()
# https://github.com/dask/dask/issues/3784
if not y.cat.known:
Expand Down
5 changes: 1 addition & 4 deletions tests/ensemble/test_blockwise.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,9 +61,6 @@ def test_bad_chunking_raises(self):
# this should *really* be a ValueError...
clf.fit(X, y)

@pytest.mark.xfail(
DASK_EXPR_ENABLED, reason="dask-expr computing early into np.ndarray"
)
def test_hard_voting_frame(self):
X, y = dask_ml.datasets.make_classification(chunks=25)
X = dd.from_dask_array(X)
Expand Down Expand Up @@ -133,7 +130,7 @@ def test_soft_voting_array(self):

@pytest.mark.xfail(
DASK_EXPR_ENABLED,
reason="AttributeError: 'Scalar' object has no attribute '_chunks'",
reason="AssertionError da.utils.assert_eq(result, result2)",
)
def test_soft_voting_frame(self):
X, y = dask_ml.datasets.make_classification(chunks=25)
Expand Down
4 changes: 0 additions & 4 deletions tests/model_selection/test_incremental.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,6 @@
from dask_ml.model_selection._incremental import _partial_fit, _score, fit
from dask_ml.model_selection.utils_test import LinearFunction, _MaybeLinearFunction
from dask_ml.utils import ConstantFunction
from tests.conftest import DASK_EXPR_ENABLED

pytestmark = [
pytest.mark.skipif(not DISTRIBUTED_2_5_0, reason="hangs"),
Expand Down Expand Up @@ -230,9 +229,6 @@ def additional_calls(scores):
await asyncio.sleep(0.1)


@pytest.mark.xfail(
DASK_EXPR_ENABLED, reason="TypeError: 'coroutine' object is not iterable"
)
@gen_cluster(client=True)
async def test_search_basic(c, s, a, b):
for decay_rate, input_type, memory in itertools.product(
Expand Down
34 changes: 9 additions & 25 deletions tests/preprocessing/test_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,10 +96,6 @@ def test_input_types(self, dask_df, pandas_df):
exclude="n_samples_seen_",
)

@pytest.mark.xfail(
DASK_EXPR_ENABLED,
reason="AttributeError: can't set attribute 'divisions'",
)
def test_inverse_transform(self):
a = dpp.StandardScaler()
result = a.inverse_transform(a.fit_transform(X))
Expand Down Expand Up @@ -437,10 +433,6 @@ def test_encode_subset_of_columns(self, daskify):

tm.assert_frame_equal(result, df)

@pytest.mark.xfail(
DASK_EXPR_ENABLED,
reason="AttributeError: can't set attribute 'divisions'",
)
@pytest.mark.parametrize("daskify", [False, True])
def test_drop_first(self, daskify):
if daskify:
Expand Down Expand Up @@ -479,8 +471,9 @@ def test_transform_raises(self):
de.transform(dummy.drop("B", axis="columns"))
assert rec.match("Columns of 'X' do not match the training")

@pytest.mark.skip(
reason='AssertionError: Attributes of DataFrame.iloc[:, 0] (column name="A") are different'
@pytest.mark.xfail(
DASK_EXPR_ENABLED,
reason="Attribute 'dtype' are different; int64 vs pyarrow[string]",
)
def test_inverse_transform(self):
de = dpp.DummyEncoder()
Expand All @@ -492,14 +485,12 @@ def test_inverse_transform(self):
)
de.fit(df)
assert_eq_df(df, de.inverse_transform(de.transform(df)))

# This fails with dask-expr: the dtype of column A differs (int64 vs pyarrow[string])
assert_eq_df(df, de.inverse_transform(de.transform(df).values))


class TestOrdinalEncoder:
@pytest.mark.xfail(
DASK_EXPR_ENABLED,
reason="AttributeError: can't set attribute 'divisions'",
)
@pytest.mark.parametrize("daskify", [False, True])
@pytest.mark.parametrize("values", [True, False])
def test_basic(self, daskify, values):
Expand Down Expand Up @@ -544,10 +535,6 @@ def test_transform_raises(self):
de.transform(dummy.drop("B", axis="columns"))
assert rec.match("Columns of 'X' do not match the training")

@pytest.mark.xfail(
DASK_EXPR_ENABLED,
reason="AttributeError: can't set attribute 'divisions'",
)
def test_inverse_transform(self):
enc = dpp.OrdinalEncoder()
df = dd.from_pandas(
Expand Down Expand Up @@ -635,10 +622,6 @@ def test_transformed_shape(self):
# dask array with nan rows
assert a.transform(X_nan_rows).shape[1] == n_cols

@pytest.mark.xfail(
DASK_EXPR_ENABLED,
reason="TypeError: No dispatch for <class 'dask_expr._collection.Scalar'>",
)
@pytest.mark.parametrize("daskify", [False, True])
def test_df_transform(self, daskify):
frame = df
Expand Down Expand Up @@ -667,11 +650,12 @@ def test_transformer_params(self):
assert pf._transformer.interaction_only is pf.interaction_only
assert pf._transformer.include_bias is pf.include_bias

@pytest.mark.xfail(
mark = pytest.mark.xfail(
DASK_EXPR_ENABLED,
reason="TypeError: No dispatch for <class 'dask_expr._collection.Scalar'>",
reason="dask-expr: NotImplementedError in assert_eq_df(res_df.iloc[:, 1:], frame, check_dtype=False)",
)
@pytest.mark.parametrize("daskify", [True, False])

@pytest.mark.parametrize("daskify", [pytest.param(True, marks=mark), False])
def test_df_transform_index(self, daskify):
frame = copy(df)
if not daskify:
Expand Down
3 changes: 0 additions & 3 deletions tests/preprocessing/test_label.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,9 +102,6 @@ def test_fit_transform_categorical(self):
assert result.dtype == "int8"
assert result.dtype == result.compute().dtype

@pytest.mark.skip(
reason="DeprecationWarning: is_categorical_dtype is deprecated and will be removed in a future version."
)
@pytest.mark.parametrize("array", [y, s])
def test_inverse_transform(self, array):
a = dpp.LabelEncoder()
Expand Down
50 changes: 18 additions & 32 deletions tests/test_parallel_post_fit.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,10 +54,6 @@ def test_laziness():
assert 0 < x.compute() < 1


@pytest.mark.xfail(
DASK_EXPR_ENABLED,
reason="AttributeError: 'MapPartitions' object has no attribute 'shape' / AttributeError: can't set attribute '_meta'",
)
def test_predict_meta_override():
X = pd.DataFrame({"c_0": [1, 2, 3, 4]})
y = np.array([1, 2, 3, 4])
Expand All @@ -66,13 +62,16 @@ def test_predict_meta_override():
base.fit(pd.DataFrame(X), y)

dd_X = dd.from_pandas(X, npartitions=2)
dd_X._meta = pd.DataFrame({"c_0": [5]})

# Failure when not providing predict_meta
# because of value dependent model
wrap = ParallelPostFit(base)
with pytest.raises(ValueError):
wrap.predict(dd_X)
if not DASK_EXPR_ENABLED:
# dask-expr cannot set _meta
dd_X._meta = pd.DataFrame({"c_0": [5]})

# Failure when not providing predict_meta
# because of value dependent model
wrap = ParallelPostFit(base)
with pytest.raises(ValueError):
wrap.predict(dd_X)

# Success when providing meta over-ride
wrap = ParallelPostFit(base, predict_meta=np.array([1]))
Expand All @@ -81,10 +80,6 @@ def test_predict_meta_override():
assert_eq_ar(result, expected)


@pytest.mark.xfail(
DASK_EXPR_ENABLED,
reason="AttributeError: 'MapPartitions' object has no attribute 'shape'",
)
def test_predict_proba_meta_override():
X = pd.DataFrame({"c_0": [1, 2, 3, 4]})
y = np.array([1, 2, 3, 4])
Expand All @@ -93,13 +88,16 @@ def test_predict_proba_meta_override():
base.fit(pd.DataFrame(X), y)

dd_X = dd.from_pandas(X, npartitions=2)
dd_X._meta = pd.DataFrame({"c_0": [5]})

# Failure when not providing predict_proba_meta
# because of value dependent model
wrap = ParallelPostFit(base)
with pytest.raises(ValueError):
wrap.predict_proba(dd_X)
if not DASK_EXPR_ENABLED:
# dask-expr cannot set _meta
dd_X._meta = pd.DataFrame({"c_0": [5]})

# Failure when not providing predict_proba_meta
# because of value dependent model
wrap = ParallelPostFit(base)
with pytest.raises(ValueError):
wrap.predict_proba(dd_X)

# Success when providing meta over-ride
wrap = ParallelPostFit(base, predict_proba_meta=np.array([[0.0, 0.1, 0.8, 0.1]]))
Expand All @@ -108,10 +106,6 @@ def test_predict_proba_meta_override():
assert_eq_ar(result, expected)


@pytest.mark.xfail(
DASK_EXPR_ENABLED,
reason="AttributeError: 'Scalar' object has no attribute 'shape'",
)
def test_transform_meta_override():
X = pd.DataFrame({"cat_s": ["a", "b", "c", "d"]})
dd_X = dd.from_pandas(X, npartitions=2)
Expand Down Expand Up @@ -148,10 +142,6 @@ def test_predict_correct_output_dtype():
assert wrap_output.dtype == base_output.dtype


@pytest.mark.xfail(
DASK_EXPR_ENABLED,
reason="AttributeError: 'MapPartitions' object has no attribute 'shape'",
)
@pytest.mark.parametrize("kind", ["numpy", "dask.dataframe", "dask.array"])
def test_predict(kind):
X, y = make_classification(chunks=100)
Expand Down Expand Up @@ -185,10 +175,6 @@ def test_predict(kind):
assert_eq_ar(result, expected)


@pytest.mark.xfail(
DASK_EXPR_ENABLED,
reason="AttributeError: 'MapPartitions' object has no attribute 'shape'",
)
@pytest.mark.parametrize("kind", ["numpy", "dask.dataframe", "dask.array"])
def test_transform(kind):
X, y = make_classification(chunks=100)
Expand Down
5 changes: 0 additions & 5 deletions tests/test_partial.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@
from dask_ml._partial import fit, predict
from dask_ml.datasets import make_classification
from dask_ml.wrappers import Incremental
from tests.conftest import DASK_EXPR_ENABLED

x = np.array([[1, 0], [2, 0], [3, 0], [4, 0], [0, 1], [0, 2], [3, 3], [4, 4]])

Expand Down Expand Up @@ -90,10 +89,6 @@ def test_fit_shuffle_blocks():
)


@pytest.mark.xfail(
DASK_EXPR_ENABLED,
reason="AttributeError: 'Scalar' object has no attribute 'shape'",
)
def test_dataframes():
df = pd.DataFrame({"x": range(10), "y": [0, 1] * 5})
ddf = dd.from_pandas(df, npartitions=2)
Expand Down

0 comments on commit 98801b5

Please sign in to comment.