
Commit

deploy: 81227a5
ogrisel committed Oct 26, 2023
1 parent fd0cf8d commit 9cfc35a
Showing 44 changed files with 1,024 additions and 706 deletions.
2 changes: 1 addition & 1 deletion .buildinfo
@@ -1,4 +1,4 @@
# Sphinx build info version 1
# This file hashes the configuration used when building these files. When it is not found, a full rebuild will be done.
config: 37341b2db80fe12eb750b88ba76a4dd1
config: 214d35bb16c7218d3b7aab3048acdedb
tags: 645f666f9bcd5a90fca523b33c5a78b7
Binary file modified .doctrees/auto_examples/sg_execution_times.doctree
Binary file modified .doctrees/environment.pickle
Binary file modified .doctrees/generated/hazardous.GradientBoostingIncidence.doctree
Binary file modified .doctrees/generated/hazardous.IPCWEstimator.doctree
@@ -15,7 +15,7 @@
},
"outputs": [],
"source": [
"import numpy as np\nfrom scipy.stats import weibull_min\nimport pandas as pd\nimport matplotlib.pyplot as plt\n\nfrom hazardous import GradientBoostingIncidence\nfrom lifelines import AalenJohansenFitter\n\nrng = np.random.default_rng(0)\nn_samples = 2_000\n\n# Non-informative covariate because scikit-learn estimators expect at least\n# one feature.\nX_dummy = np.zeros(shape=(n_samples, 1), dtype=np.float32)\n\nbase_scale = 1_000.0 # some arbitrary time unit\nt_max = 3.0 * base_scale\n\ndistributions = [\n {\"event_id\": 1, \"scale\": 10 * base_scale, \"shape\": 0.5},\n {\"event_id\": 2, \"scale\": 3 * base_scale, \"shape\": 1},\n {\"event_id\": 3, \"scale\": 2 * base_scale, \"shape\": 5},\n]\nevent_times = np.concatenate(\n [\n weibull_min.rvs(\n dist[\"shape\"],\n scale=dist[\"scale\"],\n size=n_samples,\n random_state=rng,\n ).reshape(-1, 1)\n for dist in distributions\n ],\n axis=1,\n)\nfirst_event_idx = np.argmin(event_times, axis=1)\n\ny_uncensored = pd.DataFrame(\n dict(\n event=first_event_idx + 1, # 0 is reserved as the censoring marker\n duration=event_times[np.arange(n_samples), first_event_idx],\n )\n)\ny_uncensored[\"event\"].value_counts().sort_index()"
"from time import perf_counter\nimport numpy as np\nfrom scipy.stats import weibull_min\nfrom sklearn.base import clone\nimport pandas as pd\nimport matplotlib.pyplot as plt\n\nfrom hazardous import GradientBoostingIncidence\nfrom lifelines import AalenJohansenFitter\n\nrng = np.random.default_rng(0)\nn_samples = 3_000\n\n# Non-informative covariate because scikit-learn estimators expect at least\n# one feature.\nX_dummy = np.zeros(shape=(n_samples, 1), dtype=np.float32)\n\nbase_scale = 1_000.0 # some arbitrary time unit\n\ndistributions = [\n {\"event_id\": 1, \"scale\": 10 * base_scale, \"shape\": 0.5},\n {\"event_id\": 2, \"scale\": 3 * base_scale, \"shape\": 1},\n {\"event_id\": 3, \"scale\": 3 * base_scale, \"shape\": 5},\n]\nevent_times = np.concatenate(\n [\n weibull_min.rvs(\n dist[\"shape\"],\n scale=dist[\"scale\"],\n size=n_samples,\n random_state=rng,\n ).reshape(-1, 1)\n for dist in distributions\n ],\n axis=1,\n)\nfirst_event_idx = np.argmin(event_times, axis=1)\n\ny_uncensored = pd.DataFrame(\n dict(\n event=first_event_idx + 1, # 0 is reserved as the censoring marker\n duration=event_times[np.arange(n_samples), first_event_idx],\n )\n)\ny_uncensored[\"event\"].value_counts().sort_index()\nt_max = y_uncensored[\"duration\"].max()"
]
},
{
@@ -51,7 +51,7 @@
},
"outputs": [],
"source": [
"aj = AalenJohansenFitter(calculate_variance=True, seed=0)\n\ngb_incidence = GradientBoostingIncidence(\n learning_rate=0.03,\n n_iter=100,\n max_leaf_nodes=5,\n hard_zero_fraction=0.1,\n min_samples_leaf=50,\n loss=\"ibs\",\n show_progressbar=False,\n random_state=0,\n)"
"calculate_variance = n_samples <= 5_000\naj = AalenJohansenFitter(calculate_variance=calculate_variance, seed=0)\n\ngb_incidence = GradientBoostingIncidence(\n learning_rate=0.03,\n n_iter=100,\n max_leaf_nodes=5,\n hard_zero_fraction=0.1,\n min_samples_leaf=50,\n loss=\"ibs\",\n show_progressbar=False,\n random_state=0,\n)"
]
},
{
@@ -69,7 +69,7 @@
},
"outputs": [],
"source": [
"def plot_cumulative_incidence_functions(distributions, y, gb_incidence=None, aj=None):\n _, axes = plt.subplots(figsize=(12, 4), ncols=len(distributions), sharey=True)\n\n # Compute the estimate of the CIFs on a coarse grid.\n coarse_timegrid = np.linspace(0, t_max, num=100)\n\n # Compute the theoretical CIFs by integrating the hazard functions on a\n # fine-grained time grid. Note that integration errors can accumulate quite\n # quickly if the time grid is resolution too coarse, especially for the\n # Weibull distribution with shape < 1.\n fine_time_grid = np.linspace(0, t_max, num=100_000)\n dt = np.diff(fine_time_grid)[0]\n all_hazards = np.stack(\n [weibull_hazard(fine_time_grid, **dist) for dist in distributions],\n axis=0,\n )\n any_event_hazards = all_hazards.sum(axis=0)\n any_event_survival = np.exp(-(any_event_hazards.cumsum(axis=-1) * dt))\n\n censoring_fraction = (y[\"event\"] == 0).mean()\n plt.suptitle(\n \"Cause-specific cumulative incidence functions\"\n f\" ({censoring_fraction:.1%} censoring)\"\n )\n\n for event_id, (ax, hazards_i) in enumerate(zip(axes, all_hazards), 1):\n theoretical_cif = (hazards_i * any_event_survival).cumsum(axis=-1) * dt\n ax.plot(\n fine_time_grid,\n theoretical_cif,\n linestyle=\"dashed\",\n label=\"Theoretical incidence\",\n ),\n ax.legend(loc=\"lower right\")\n\n if gb_incidence is not None:\n gb_incidence.set_params(event_of_interest=event_id)\n gb_incidence.fit(X_dummy, y)\n cif_pred = gb_incidence.predict_cumulative_incidence(\n X_dummy[0:1], coarse_timegrid\n )[0]\n ax.plot(\n coarse_timegrid,\n cif_pred,\n label=\"GradientBoostingIncidence\",\n )\n ax.legend(loc=\"lower right\")\n ax.set(title=f\"Event {event_id}\")\n\n if aj is not None:\n aj.fit(y[\"duration\"], y[\"event\"], event_of_interest=event_id)\n aj.plot(label=\"Aalen-Johansen\", ax=ax)\n\n\nplot_cumulative_incidence_functions(\n distributions, gb_incidence=gb_incidence, aj=aj, y=y_uncensored\n)"
"def plot_cumulative_incidence_functions(distributions, y, gb_incidence=None, aj=None):\n _, axes = plt.subplots(figsize=(12, 4), ncols=len(distributions), sharey=True)\n\n # Compute the estimate of the CIFs on a coarse grid.\n coarse_timegrid = np.linspace(0, t_max, num=100)\n\n # Compute the theoretical CIFs by integrating the hazard functions on a\n # fine-grained time grid. Note that integration errors can accumulate quite\n # quickly if the time grid is resolution too coarse, especially for the\n # Weibull distribution with shape < 1.\n fine_time_grid = np.linspace(0, t_max, num=1_000_000)\n dt = np.diff(fine_time_grid)[0]\n all_hazards = np.stack(\n [weibull_hazard(fine_time_grid, **dist) for dist in distributions],\n axis=0,\n )\n any_event_hazards = all_hazards.sum(axis=0)\n any_event_survival = np.exp(-(any_event_hazards.cumsum(axis=-1) * dt))\n\n censoring_fraction = (y[\"event\"] == 0).mean()\n plt.suptitle(\n \"Cause-specific cumulative incidence functions\"\n f\" ({censoring_fraction:.1%} censoring)\"\n )\n\n for event_id, (ax, hazards_i) in enumerate(zip(axes, all_hazards), 1):\n theoretical_cif = (hazards_i * any_event_survival).cumsum(axis=-1) * dt\n ax.plot(\n fine_time_grid,\n theoretical_cif,\n linestyle=\"dashed\",\n label=\"Theoretical incidence\",\n ),\n ax.legend(loc=\"lower right\")\n\n if gb_incidence is not None:\n tic = perf_counter()\n gb_incidence.set_params(event_of_interest=event_id)\n gb_incidence.fit(X_dummy, y)\n duration = perf_counter() - tic\n print(f\"GB Incidence for event {event_id} fit in {duration:.3f} s\")\n tic = perf_counter()\n cif_pred = gb_incidence.predict_cumulative_incidence(\n X_dummy[0:1], coarse_timegrid\n )[0]\n duration = perf_counter() - tic\n print(f\"GB Incidence for event {event_id} prediction in {duration:.3f} s\")\n ax.plot(\n coarse_timegrid,\n cif_pred,\n label=\"GradientBoostingIncidence\",\n )\n ax.legend(loc=\"lower right\")\n ax.set(title=f\"Event {event_id}\")\n\n if aj is not None:\n tic = perf_counter()\n aj.fit(y[\"duration\"], y[\"event\"], event_of_interest=event_id)\n duration = perf_counter() - tic\n print(f\"Aalen-Johansen for event {event_id} fit in {duration:.3f} s\")\n aj.plot(label=\"Aalen-Johansen\", ax=ax)\n\n\nplot_cumulative_incidence_functions(\n distributions, gb_incidence=gb_incidence, aj=aj, y=y_uncensored\n)"
]
},
{
@@ -87,7 +87,7 @@
},
"outputs": [],
"source": [
"censoring_times = rng.lognormal(\n mean=0.01 * t_max,\n sigma=t_max,\n size=n_samples,\n)\ny_censored = pd.DataFrame(\n dict(\n event=np.where(\n censoring_times < y_uncensored[\"duration\"], 0, y_uncensored[\"event\"]\n ),\n duration=np.minimum(censoring_times, y_uncensored[\"duration\"]),\n )\n)\ny_censored[\"event\"].value_counts().sort_index()"
"censoring_times = rng.lognormal(\n mean=5,\n sigma=3,\n size=n_samples,\n)\ny_censored = pd.DataFrame(\n dict(\n event=np.where(\n censoring_times < y_uncensored[\"duration\"], 0, y_uncensored[\"event\"]\n ),\n duration=np.minimum(censoring_times, y_uncensored[\"duration\"]),\n )\n)\ny_censored[\"event\"].value_counts().sort_index()"
]
},
{
@@ -108,6 +108,13 @@
"Note that the Aalen-Johansen estimator is unbiased and empirically recovers\nthe theoretical curves both with and without censoring. The\nGradientBoostingIncidence estimator also appears unbiased by censoring, but\nthe predicted curves are not smooth and not even monotonically increasing. By\nadjusting the hyper-parameters, notably the learning rate, the number of\nboosting iterations and leaf nodes, it is possible to somewhat control the\nsmoothness of the predicted curves, but it is likely that doing some kind of\nsmoothing post-processing could be beneficial (but maybe at the cost of\nintroducing some bias). This is left as future work.\n\nAlternatively, we could try to enable a monotonicity constraint at training\ntime, however, in practice this often causes a sever over-estimation bias for\nthe large time horizons:\n\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Finally let's try again to fit the GB Incidence models using a monotonicity\nconstraint:\n\n"
]
},
{
"cell_type": "code",
"execution_count": null,
@@ -116,7 +123,14 @@
},
"outputs": [],
"source": [
"gb_incidence.set_params(monotonic_incidence=\"at_training_time\")\nplot_cumulative_incidence_functions(\n distributions, gb_incidence=gb_incidence, y=y_censored\n)"
"monotonic_gb_incidence = clone(gb_incidence).set_params(\n monotonic_incidence=\"at_training_time\"\n)\nplot_cumulative_incidence_functions(\n distributions, gb_incidence=monotonic_gb_incidence, y=y_censored\n)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The resulting incidence curves are indeed monotonic. However, for smaller\ntraining set sizes of the training set, the resulting models can be\nsignificantly biased, in particular in regions where the CIFs is getting\nflatter. This effect diminishes with larger training set sizes (lower\nepistemic uncertainty).\n\n"
]
}
],
@@ -29,28 +29,29 @@
severe over-estimation bias for large time horizons.
"""
# %%
from time import perf_counter
import numpy as np
from scipy.stats import weibull_min
from sklearn.base import clone
import pandas as pd
import matplotlib.pyplot as plt

from hazardous import GradientBoostingIncidence
from lifelines import AalenJohansenFitter

rng = np.random.default_rng(0)
n_samples = 2_000
n_samples = 3_000

# Non-informative covariate because scikit-learn estimators expect at least
# one feature.
X_dummy = np.zeros(shape=(n_samples, 1), dtype=np.float32)

base_scale = 1_000.0 # some arbitrary time unit
t_max = 3.0 * base_scale

distributions = [
{"event_id": 1, "scale": 10 * base_scale, "shape": 0.5},
{"event_id": 2, "scale": 3 * base_scale, "shape": 1},
{"event_id": 3, "scale": 2 * base_scale, "shape": 5},
{"event_id": 3, "scale": 3 * base_scale, "shape": 5},
]
event_times = np.concatenate(
[
@@ -73,6 +74,7 @@
)
)
y_uncensored["event"].value_counts().sort_index()
t_max = y_uncensored["duration"].max()
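
# %%
#
# For reference, the cause-specific hazard of a Weibull distribution with the
# (shape, scale) parametrization used above is
# h(t) = (shape / scale) * (t / scale) ** (shape - 1).
# A minimal sketch of such a hazard helper follows; the name and the handling
# of t == 0 below are illustrative assumptions, not the library's own code:


def weibull_hazard_sketch(t, shape=1.0, scale=1.0, **ignored_kwargs):
    # The hazard diverges at t == 0 when shape < 1; clamp it to 0 there so that
    # numerical integration on a time grid starting at 0 stays finite.
    t = np.asarray(t, dtype=np.float64)
    with np.errstate(divide="ignore", invalid="ignore"):
        hazard = (shape / scale) * (t / scale) ** (shape - 1.0)
    return np.where((t == 0.0) & (shape < 1.0), 0.0, hazard)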

# %%
#
@@ -103,7 +105,8 @@ def weibull_hazard(t, shape=1.0, scale=1.0, **ignored_kwargs):
# them as reference to check that the estimators are unbiased by the censoring.
# Here are the two estimators of interest:

aj = AalenJohansenFitter(calculate_variance=True, seed=0)
calculate_variance = n_samples <= 5_000
aj = AalenJohansenFitter(calculate_variance=calculate_variance, seed=0)

gb_incidence = GradientBoostingIncidence(
learning_rate=0.03,
@@ -135,7 +138,7 @@ def plot_cumulative_incidence_functions(distributions, y, gb_incidence=None, aj=
# fine-grained time grid. Note that integration errors can accumulate quite
# quickly if the time grid resolution is too coarse, especially for the
# Weibull distribution with shape < 1.
fine_time_grid = np.linspace(0, t_max, num=100_000)
fine_time_grid = np.linspace(0, t_max, num=1_000_000)
dt = np.diff(fine_time_grid)[0]
all_hazards = np.stack(
[weibull_hazard(fine_time_grid, **dist) for dist in distributions],
@@ -161,11 +164,17 @@ def plot_cumulative_incidence_functions(distributions, y, gb_incidence=None, aj=
ax.legend(loc="lower right")

if gb_incidence is not None:
tic = perf_counter()
gb_incidence.set_params(event_of_interest=event_id)
gb_incidence.fit(X_dummy, y)
duration = perf_counter() - tic
print(f"GB Incidence for event {event_id} fit in {duration:.3f} s")
tic = perf_counter()
cif_pred = gb_incidence.predict_cumulative_incidence(
X_dummy[0:1], coarse_timegrid
)[0]
duration = perf_counter() - tic
print(f"GB Incidence for event {event_id} prediction in {duration:.3f} s")
ax.plot(
coarse_timegrid,
cif_pred,
@@ -175,7 +184,10 @@ def plot_cumulative_incidence_functions(distributions, y, gb_incidence=None, aj=
ax.set(title=f"Event {event_id}")

if aj is not None:
tic = perf_counter()
aj.fit(y["duration"], y["event"], event_of_interest=event_id)
duration = perf_counter() - tic
print(f"Aalen-Johansen for event {event_id} fit in {duration:.3f} s")
aj.plot(label="Aalen-Johansen", ax=ax)


@@ -192,8 +204,8 @@ def plot_cumulative_incidence_functions(distributions, y, gb_incidence=None, aj=
# parameters to control the amount of censoring: lowering the location bound
# increases the amount of censoring.
censoring_times = rng.lognormal(
mean=0.01 * t_max,
sigma=t_max,
mean=5,
sigma=3,
size=n_samples,
)
y_censored = pd.DataFrame(
@@ -226,8 +238,21 @@ def plot_cumulative_incidence_functions(distributions, y, gb_incidence=None, aj=
# time; however, in practice this often causes a severe over-estimation bias
# for large time horizons:

gb_incidence.set_params(monotonic_incidence="at_training_time")
# %%
#
# Finally, let's try again to fit the GB Incidence models using a monotonicity
# constraint:

monotonic_gb_incidence = clone(gb_incidence).set_params(
monotonic_incidence="at_training_time"
)
plot_cumulative_incidence_functions(
distributions, gb_incidence=gb_incidence, y=y_censored
distributions, gb_incidence=monotonic_gb_incidence, y=y_censored
)
# %%
#
# The resulting incidence curves are indeed monotonic. However, for smaller
# training set sizes, the resulting models can be significantly biased, in
# particular in regions where the CIFs are getting flatter. This effect
# diminishes with larger training set sizes (lower
# epistemic uncertainty).
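
# %%
#
# One possible smoothing post-processing mentioned earlier is to project each
# predicted CIF onto the set of non-decreasing curves, for instance with
# isotonic regression. The following is only a sketch of that idea (it is not
# part of the estimators used above) and it assumes a previously fitted
# gb_incidence model:

from sklearn.isotonic import IsotonicRegression

coarse_timegrid = np.linspace(0, t_max, num=100)
cif_pred = gb_incidence.predict_cumulative_incidence(
    X_dummy[0:1], coarse_timegrid
)[0]

# Enforce a non-decreasing CIF, clipped to the valid [0, 1] probability range.
monotone_cif_pred = IsotonicRegression(
    y_min=0.0, y_max=1.0, increasing=True
).fit_transform(coarse_timegrid, cif_pred)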