
Commit

deploy: 81227a5
ogrisel committed Oct 26, 2023
1 parent fd0cf8d commit 9cfc35a
Showing 44 changed files with 1,024 additions and 706 deletions.
2 changes: 1 addition & 1 deletion .buildinfo
@@ -1,4 +1,4 @@
# Sphinx build info version 1
# This file hashes the configuration used when building these files. When it is not found, a full rebuild will be done.
config: 37341b2db80fe12eb750b88ba76a4dd1
config: 214d35bb16c7218d3b7aab3048acdedb
tags: 645f666f9bcd5a90fca523b33c5a78b7
Binary file modified .doctrees/auto_examples/sg_execution_times.doctree
Binary file modified .doctrees/environment.pickle
Binary file modified .doctrees/generated/hazardous.GradientBoostingIncidence.doctree
Binary file modified .doctrees/generated/hazardous.IPCWEstimator.doctree
@@ -15,7 +15,7 @@
},
"outputs": [],
"source": [
"import numpy as np\nfrom scipy.stats import weibull_min\nimport pandas as pd\nimport matplotlib.pyplot as plt\n\nfrom hazardous import GradientBoostingIncidence\nfrom lifelines import AalenJohansenFitter\n\nrng = np.random.default_rng(0)\nn_samples = 2_000\n\n# Non-informative covariate because scikit-learn estimators expect at least\n# one feature.\nX_dummy = np.zeros(shape=(n_samples, 1), dtype=np.float32)\n\nbase_scale = 1_000.0 # some arbitrary time unit\nt_max = 3.0 * base_scale\n\ndistributions = [\n {\"event_id\": 1, \"scale\": 10 * base_scale, \"shape\": 0.5},\n {\"event_id\": 2, \"scale\": 3 * base_scale, \"shape\": 1},\n {\"event_id\": 3, \"scale\": 2 * base_scale, \"shape\": 5},\n]\nevent_times = np.concatenate(\n [\n weibull_min.rvs(\n dist[\"shape\"],\n scale=dist[\"scale\"],\n size=n_samples,\n random_state=rng,\n ).reshape(-1, 1)\n for dist in distributions\n ],\n axis=1,\n)\nfirst_event_idx = np.argmin(event_times, axis=1)\n\ny_uncensored = pd.DataFrame(\n dict(\n event=first_event_idx + 1, # 0 is reserved as the censoring marker\n duration=event_times[np.arange(n_samples), first_event_idx],\n )\n)\ny_uncensored[\"event\"].value_counts().sort_index()"
"from time import perf_counter\nimport numpy as np\nfrom scipy.stats import weibull_min\nfrom sklearn.base import clone\nimport pandas as pd\nimport matplotlib.pyplot as plt\n\nfrom hazardous import GradientBoostingIncidence\nfrom lifelines import AalenJohansenFitter\n\nrng = np.random.default_rng(0)\nn_samples = 3_000\n\n# Non-informative covariate because scikit-learn estimators expect at least\n# one feature.\nX_dummy = np.zeros(shape=(n_samples, 1), dtype=np.float32)\n\nbase_scale = 1_000.0 # some arbitrary time unit\n\ndistributions = [\n {\"event_id\": 1, \"scale\": 10 * base_scale, \"shape\": 0.5},\n {\"event_id\": 2, \"scale\": 3 * base_scale, \"shape\": 1},\n {\"event_id\": 3, \"scale\": 3 * base_scale, \"shape\": 5},\n]\nevent_times = np.concatenate(\n [\n weibull_min.rvs(\n dist[\"shape\"],\n scale=dist[\"scale\"],\n size=n_samples,\n random_state=rng,\n ).reshape(-1, 1)\n for dist in distributions\n ],\n axis=1,\n)\nfirst_event_idx = np.argmin(event_times, axis=1)\n\ny_uncensored = pd.DataFrame(\n dict(\n event=first_event_idx + 1, # 0 is reserved as the censoring marker\n duration=event_times[np.arange(n_samples), first_event_idx],\n )\n)\ny_uncensored[\"event\"].value_counts().sort_index()\nt_max = y_uncensored[\"duration\"].max()"
]
},
{
@@ -51,7 +51,7 @@
},
"outputs": [],
"source": [
"aj = AalenJohansenFitter(calculate_variance=True, seed=0)\n\ngb_incidence = GradientBoostingIncidence(\n learning_rate=0.03,\n n_iter=100,\n max_leaf_nodes=5,\n hard_zero_fraction=0.1,\n min_samples_leaf=50,\n loss=\"ibs\",\n show_progressbar=False,\n random_state=0,\n)"
"calculate_variance = n_samples <= 5_000\naj = AalenJohansenFitter(calculate_variance=calculate_variance, seed=0)\n\ngb_incidence = GradientBoostingIncidence(\n learning_rate=0.03,\n n_iter=100,\n max_leaf_nodes=5,\n hard_zero_fraction=0.1,\n min_samples_leaf=50,\n loss=\"ibs\",\n show_progressbar=False,\n random_state=0,\n)"
]
},
{
@@ -69,7 +69,7 @@
},
"outputs": [],
"source": [
"def plot_cumulative_incidence_functions(distributions, y, gb_incidence=None, aj=None):\n _, axes = plt.subplots(figsize=(12, 4), ncols=len(distributions), sharey=True)\n\n # Compute the estimate of the CIFs on a coarse grid.\n coarse_timegrid = np.linspace(0, t_max, num=100)\n\n # Compute the theoretical CIFs by integrating the hazard functions on a\n # fine-grained time grid. Note that integration errors can accumulate quite\n # quickly if the time grid is resolution too coarse, especially for the\n # Weibull distribution with shape < 1.\n fine_time_grid = np.linspace(0, t_max, num=100_000)\n dt = np.diff(fine_time_grid)[0]\n all_hazards = np.stack(\n [weibull_hazard(fine_time_grid, **dist) for dist in distributions],\n axis=0,\n )\n any_event_hazards = all_hazards.sum(axis=0)\n any_event_survival = np.exp(-(any_event_hazards.cumsum(axis=-1) * dt))\n\n censoring_fraction = (y[\"event\"] == 0).mean()\n plt.suptitle(\n \"Cause-specific cumulative incidence functions\"\n f\" ({censoring_fraction:.1%} censoring)\"\n )\n\n for event_id, (ax, hazards_i) in enumerate(zip(axes, all_hazards), 1):\n theoretical_cif = (hazards_i * any_event_survival).cumsum(axis=-1) * dt\n ax.plot(\n fine_time_grid,\n theoretical_cif,\n linestyle=\"dashed\",\n label=\"Theoretical incidence\",\n ),\n ax.legend(loc=\"lower right\")\n\n if gb_incidence is not None:\n gb_incidence.set_params(event_of_interest=event_id)\n gb_incidence.fit(X_dummy, y)\n cif_pred = gb_incidence.predict_cumulative_incidence(\n X_dummy[0:1], coarse_timegrid\n )[0]\n ax.plot(\n coarse_timegrid,\n cif_pred,\n label=\"GradientBoostingIncidence\",\n )\n ax.legend(loc=\"lower right\")\n ax.set(title=f\"Event {event_id}\")\n\n if aj is not None:\n aj.fit(y[\"duration\"], y[\"event\"], event_of_interest=event_id)\n aj.plot(label=\"Aalen-Johansen\", ax=ax)\n\n\nplot_cumulative_incidence_functions(\n distributions, gb_incidence=gb_incidence, aj=aj, y=y_uncensored\n)"
"def plot_cumulative_incidence_functions(distributions, y, gb_incidence=None, aj=None):\n _, axes = plt.subplots(figsize=(12, 4), ncols=len(distributions), sharey=True)\n\n # Compute the estimate of the CIFs on a coarse grid.\n coarse_timegrid = np.linspace(0, t_max, num=100)\n\n # Compute the theoretical CIFs by integrating the hazard functions on a\n # fine-grained time grid. Note that integration errors can accumulate quite\n # quickly if the time grid is resolution too coarse, especially for the\n # Weibull distribution with shape < 1.\n fine_time_grid = np.linspace(0, t_max, num=1_000_000)\n dt = np.diff(fine_time_grid)[0]\n all_hazards = np.stack(\n [weibull_hazard(fine_time_grid, **dist) for dist in distributions],\n axis=0,\n )\n any_event_hazards = all_hazards.sum(axis=0)\n any_event_survival = np.exp(-(any_event_hazards.cumsum(axis=-1) * dt))\n\n censoring_fraction = (y[\"event\"] == 0).mean()\n plt.suptitle(\n \"Cause-specific cumulative incidence functions\"\n f\" ({censoring_fraction:.1%} censoring)\"\n )\n\n for event_id, (ax, hazards_i) in enumerate(zip(axes, all_hazards), 1):\n theoretical_cif = (hazards_i * any_event_survival).cumsum(axis=-1) * dt\n ax.plot(\n fine_time_grid,\n theoretical_cif,\n linestyle=\"dashed\",\n label=\"Theoretical incidence\",\n ),\n ax.legend(loc=\"lower right\")\n\n if gb_incidence is not None:\n tic = perf_counter()\n gb_incidence.set_params(event_of_interest=event_id)\n gb_incidence.fit(X_dummy, y)\n duration = perf_counter() - tic\n print(f\"GB Incidence for event {event_id} fit in {duration:.3f} s\")\n tic = perf_counter()\n cif_pred = gb_incidence.predict_cumulative_incidence(\n X_dummy[0:1], coarse_timegrid\n )[0]\n duration = perf_counter() - tic\n print(f\"GB Incidence for event {event_id} prediction in {duration:.3f} s\")\n ax.plot(\n coarse_timegrid,\n cif_pred,\n label=\"GradientBoostingIncidence\",\n )\n ax.legend(loc=\"lower right\")\n ax.set(title=f\"Event {event_id}\")\n\n if aj is not None:\n tic = perf_counter()\n aj.fit(y[\"duration\"], y[\"event\"], event_of_interest=event_id)\n duration = perf_counter() - tic\n print(f\"Aalen-Johansen for event {event_id} fit in {duration:.3f} s\")\n aj.plot(label=\"Aalen-Johansen\", ax=ax)\n\n\nplot_cumulative_incidence_functions(\n distributions, gb_incidence=gb_incidence, aj=aj, y=y_uncensored\n)"
]
},
{
@@ -87,7 +87,7 @@
},
"outputs": [],
"source": [
"censoring_times = rng.lognormal(\n mean=0.01 * t_max,\n sigma=t_max,\n size=n_samples,\n)\ny_censored = pd.DataFrame(\n dict(\n event=np.where(\n censoring_times < y_uncensored[\"duration\"], 0, y_uncensored[\"event\"]\n ),\n duration=np.minimum(censoring_times, y_uncensored[\"duration\"]),\n )\n)\ny_censored[\"event\"].value_counts().sort_index()"
"censoring_times = rng.lognormal(\n mean=5,\n sigma=3,\n size=n_samples,\n)\ny_censored = pd.DataFrame(\n dict(\n event=np.where(\n censoring_times < y_uncensored[\"duration\"], 0, y_uncensored[\"event\"]\n ),\n duration=np.minimum(censoring_times, y_uncensored[\"duration\"]),\n )\n)\ny_censored[\"event\"].value_counts().sort_index()"
]
},
{
@@ -108,6 +108,13 @@
"Note that the Aalen-Johansen estimator is unbiased and empirically recovers\nthe theoretical curves both with and without censoring. The\nGradientBoostingIncidence estimator also appears unbiased by censoring, but\nthe predicted curves are not smooth and not even monotonically increasing. By\nadjusting the hyper-parameters, notably the learning rate, the number of\nboosting iterations and leaf nodes, it is possible to somewhat control the\nsmoothness of the predicted curves, but it is likely that doing some kind of\nsmoothing post-processing could be beneficial (but maybe at the cost of\nintroducing some bias). This is left as future work.\n\nAlternatively, we could try to enable a monotonicity constraint at training\ntime, however, in practice this often causes a sever over-estimation bias for\nthe large time horizons:\n\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Finally let's try again to fit the GB Incidence models using a monotonicity\nconstraint:\n\n"
]
},
{
"cell_type": "code",
"execution_count": null,
@@ -116,7 +123,14 @@
},
"outputs": [],
"source": [
"gb_incidence.set_params(monotonic_incidence=\"at_training_time\")\nplot_cumulative_incidence_functions(\n distributions, gb_incidence=gb_incidence, y=y_censored\n)"
"monotonic_gb_incidence = clone(gb_incidence).set_params(\n monotonic_incidence=\"at_training_time\"\n)\nplot_cumulative_incidence_functions(\n distributions, gb_incidence=monotonic_gb_incidence, y=y_censored\n)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The resulting incidence curves are indeed monotonic. However, for smaller\ntraining set sizes of the training set, the resulting models can be\nsignificantly biased, in particular in regions where the CIFs is getting\nflatter. This effect diminishes with larger training set sizes (lower\nepistemic uncertainty).\n\n"
]
}
],
@@ -29,28 +29,29 @@
severe over-estimation bias for large time horizons.
"""
# %%
from time import perf_counter
import numpy as np
from scipy.stats import weibull_min
from sklearn.base import clone
import pandas as pd
import matplotlib.pyplot as plt

from hazardous import GradientBoostingIncidence
from lifelines import AalenJohansenFitter

rng = np.random.default_rng(0)
n_samples = 2_000
n_samples = 3_000

# Non-informative covariate because scikit-learn estimators expect at least
# one feature.
X_dummy = np.zeros(shape=(n_samples, 1), dtype=np.float32)

base_scale = 1_000.0 # some arbitrary time unit
t_max = 3.0 * base_scale

distributions = [
{"event_id": 1, "scale": 10 * base_scale, "shape": 0.5},
{"event_id": 2, "scale": 3 * base_scale, "shape": 1},
{"event_id": 3, "scale": 2 * base_scale, "shape": 5},
{"event_id": 3, "scale": 3 * base_scale, "shape": 5},
]
event_times = np.concatenate(
[
@@ -73,6 +74,7 @@
)
)
y_uncensored["event"].value_counts().sort_index()
t_max = y_uncensored["duration"].max()
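
# %%
#
# For reference, the cause-specific hazard of a Weibull distribution with the
# (shape, scale) parametrization used above is
# h(t) = (shape / scale) * (t / scale) ** (shape - 1).
# A minimal sketch of such a hazard helper follows; the name and the handling
# of t == 0 below are illustrative assumptions, not the library's own code:


def weibull_hazard_sketch(t, shape=1.0, scale=1.0, **ignored_kwargs):
    # The hazard diverges at t == 0 when shape < 1; clamp it to 0 there so that
    # numerical integration on a time grid starting at 0 stays finite.
    t = np.asarray(t, dtype=np.float64)
    with np.errstate(divide="ignore", invalid="ignore"):
        hazard = (shape / scale) * (t / scale) ** (shape - 1.0)
    return np.where((t == 0.0) & (shape < 1.0), 0.0, hazard)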

# %%
#
@@ -103,7 +105,8 @@ def weibull_hazard(t, shape=1.0, scale=1.0, **ignored_kwargs):
# them as reference to check that the estimators are unbiased by the censoring.
# Here are the two estimators of interest:

aj = AalenJohansenFitter(calculate_variance=True, seed=0)
calculate_variance = n_samples <= 5_000
aj = AalenJohansenFitter(calculate_variance=calculate_variance, seed=0)

gb_incidence = GradientBoostingIncidence(
learning_rate=0.03,
@@ -135,7 +138,7 @@ def plot_cumulative_incidence_functions(distributions, y, gb_incidence=None, aj=
# fine-grained time grid. Note that integration errors can accumulate quite
# quickly if the time grid resolution is too coarse, especially for the
# Weibull distribution with shape < 1.
fine_time_grid = np.linspace(0, t_max, num=100_000)
fine_time_grid = np.linspace(0, t_max, num=1_000_000)
dt = np.diff(fine_time_grid)[0]
all_hazards = np.stack(
[weibull_hazard(fine_time_grid, **dist) for dist in distributions],
@@ -161,11 +164,17 @@ def plot_cumulative_incidence_functions(distributions, y, gb_incidence=None, aj=
ax.legend(loc="lower right")

if gb_incidence is not None:
tic = perf_counter()
gb_incidence.set_params(event_of_interest=event_id)
gb_incidence.fit(X_dummy, y)
duration = perf_counter() - tic
print(f"GB Incidence for event {event_id} fit in {duration:.3f} s")
tic = perf_counter()
cif_pred = gb_incidence.predict_cumulative_incidence(
X_dummy[0:1], coarse_timegrid
)[0]
duration = perf_counter() - tic
print(f"GB Incidence for event {event_id} prediction in {duration:.3f} s")
ax.plot(
coarse_timegrid,
cif_pred,
@@ -175,7 +184,10 @@ def plot_cumulative_incidence_functions(distributions, y, gb_incidence=None, aj=
ax.set(title=f"Event {event_id}")

if aj is not None:
tic = perf_counter()
aj.fit(y["duration"], y["event"], event_of_interest=event_id)
duration = perf_counter() - tic
print(f"Aalen-Johansen for event {event_id} fit in {duration:.3f} s")
aj.plot(label="Aalen-Johansen", ax=ax)


@@ -192,8 +204,8 @@ def plot_cumulative_incidence_functions(distributions, y, gb_incidence=None, aj=
# parameters to control the amount of censoring: lowering the location bound
# increases the amount of censoring.
censoring_times = rng.lognormal(
mean=0.01 * t_max,
sigma=t_max,
mean=5,
sigma=3,
size=n_samples,
)
y_censored = pd.DataFrame(
@@ -226,8 +238,21 @@ def plot_cumulative_incidence_functions(distributions, y, gb_incidence=None, aj=
# time; however, in practice this often causes a severe over-estimation bias
# for large time horizons:

gb_incidence.set_params(monotonic_incidence="at_training_time")
# %%
#
# Finally, let's try again to fit the GB Incidence models using a monotonicity
# constraint:

monotonic_gb_incidence = clone(gb_incidence).set_params(
monotonic_incidence="at_training_time"
)
plot_cumulative_incidence_functions(
distributions, gb_incidence=gb_incidence, y=y_censored
distributions, gb_incidence=monotonic_gb_incidence, y=y_censored
)
# %%
#
# The resulting incidence curves are indeed monotonic. However, for smaller
# training set sizes, the resulting models can be significantly biased, in
# particular in regions where the CIFs are getting flatter. This effect
# diminishes with larger training set sizes (lower
# epistemic uncertainty).
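
# %%
#
# One possible smoothing post-processing mentioned earlier is to project each
# predicted CIF onto the set of non-decreasing curves, for instance with
# isotonic regression. The following is only a sketch of that idea (it is not
# part of the estimators used above) and it assumes a previously fitted
# gb_incidence model:

from sklearn.isotonic import IsotonicRegression

coarse_timegrid = np.linspace(0, t_max, num=100)
cif_pred = gb_incidence.predict_cumulative_incidence(
    X_dummy[0:1], coarse_timegrid
)[0]

# Enforce a non-decreasing CIF, clipped to the valid [0, 1] probability range.
monotone_cif_pred = IsotonicRegression(
    y_min=0.0, y_max=1.0, increasing=True
).fit_transform(coarse_timegrid, cif_pred)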