FIX Several small fixes #780

Merged
merged 10 commits into from
Nov 25, 2024
15 changes: 9 additions & 6 deletions python_scripts/01_tabular_data_exploration.py
@@ -360,9 +360,12 @@
# We made important observations (which will be discussed later in more detail):
#
# * if your target variable is imbalanced (e.g., you have more samples from one
-# target category than another), you may need special techniques for training
-# and evaluating your machine learning model;
-# * having redundant (or highly correlated) columns can be a problem for some
-# machine learning algorithms;
-# * contrary to decision tree, linear models can only capture linear
-# interactions, so be aware of non-linear relationships in your data.
+# target category than another), you may need to be careful when interpreting
+# the values of performance metrics;
+# * columns can be redundant (or highly correlated), which is not necessarily a
+# problem, but may require special treatment as we will cover in future
+# notebooks;
+# * decision trees create prediction rules by comparing each feature to a
+# threshold value, resulting in decision boundaries that are always parallel
+# to the axes. In 2D, this means the boundaries are vertical or horizontal
+# line segments at the feature threshold values.
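As a side note, a minimal sketch of how the class imbalance mentioned in the first bullet could be inspected with pandas (the CSV path and the "class" column name are assumptions based on this notebook, not part of the diff):

import pandas as pd

# Path and target column name are assumed from the notebook's usual setup.
adult_census = pd.read_csv("../datasets/adult-census.csv")

# Relative frequency of each target category; values far from an even split
# indicate an imbalanced target.
print(adult_census["class"].value_counts(normalize=True))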
12 changes: 8 additions & 4 deletions python_scripts/cross_validation_learning_curve.py
@@ -102,10 +102,14 @@
# benefit to adding samples anymore or assessing the potential gain of adding
# more samples into the training set.
#
-# If we achieve a plateau and adding new samples in the training set does not
-# reduce the testing error, we might have reached the Bayes error rate using the
-# available model. Using a more complex model might be the only possibility to
-# reduce the testing error further.
+# If the testing error plateaus despite adding more training samples, it's
+# possible that the model has achieved its optimal performance. In this case,
+# using a more expressive model might help reduce the error further. Otherwise,
+# the error may have reached the Bayes error rate, the theoretical minimum error
+# due to inherent uncertainty not resolved by the available data. This minimum error is
+# non-zero whenever some of the variation of the target variable `y` depends on
+# external factors not fully observed in the features available in `X`, which is
+# almost always the case in practice.
#
# ## Summary
#
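A hedged sketch of how such a learning curve can be computed with scikit-learn (the estimator, dataset, and parameter values below are illustrative assumptions, not taken from this notebook):

import numpy as np
from sklearn.datasets import make_regression
from sklearn.model_selection import ShuffleSplit, learning_curve
from sklearn.tree import DecisionTreeRegressor

# Synthetic regression data standing in for the notebook's dataset (assumption).
X, y = make_regression(n_samples=1_000, n_features=5, noise=10.0, random_state=0)

cv = ShuffleSplit(n_splits=30, test_size=0.2, random_state=0)
train_sizes, train_scores, test_scores = learning_curve(
    DecisionTreeRegressor(max_depth=3),
    X,
    y,
    train_sizes=np.linspace(0.1, 1.0, 5),
    cv=cv,
    scoring="neg_mean_absolute_error",
)

# If the mean test error stops decreasing as the training size grows, adding
# more samples is unlikely to help this particular model.
print(train_sizes)
print(-test_scores.mean(axis=1))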
@@ -331,7 +331,7 @@ def plot_decision_boundary(model, title=None):
# from the previous models: its decision boundary can take a diagonal
# direction. Furthermore, we can observe that predictions are very confident in
# the low density regions of the feature space, even very close to the decision
-# boundary
+# boundary.
#
# We can obtain very similar results by using a kernel approximation technique
# such as the Nyström method with a polynomial kernel:
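The Nyström code itself is collapsed in this diff; a hedged sketch of what such a kernel-approximation pipeline could look like (dataset and hyperparameter values are assumptions for illustration):

from sklearn.datasets import make_moons
from sklearn.kernel_approximation import Nystroem
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Toy 2D classification data standing in for the notebook's dataset (assumption).
X, y = make_moons(n_samples=500, noise=0.2, random_state=0)

# Map the two input features into an approximate polynomial kernel space, then
# fit a linear classifier in that space; the resulting decision boundary is no
# longer constrained to be a straight line in the original feature space.
model = make_pipeline(
    StandardScaler(),
    Nystroem(kernel="poly", degree=3, n_components=100, random_state=0),
    LogisticRegression(),
)
model.fit(X, y)
print(f"Training accuracy: {model.score(X, y):.2f}")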
2 changes: 1 addition & 1 deletion python_scripts/logistic_regression.py
@@ -151,7 +151,7 @@
# by name or position. In the code above `logistic_regression[-1]` means the
# last step of the pipeline. Then you can access the attributes of that step such
# as `coef_`. Notice also that the `coef_` attribute is an array of shape (1,
-# `n_features`) an then we access it via its first entry. Alternatively one
+# `n_features`) and then we access it via its first entry. Alternatively one
# could use `coef_.ravel()`.
#
# We are now ready to visualize the weight values as a barplot:
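For context, a minimal self-contained sketch of the indexing pattern described above (the pipeline steps and data are illustrative assumptions, not the notebook's actual setup):

import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Small synthetic dataset (assumption) just to have a fitted pipeline.
rng = np.random.RandomState(0)
X = rng.normal(size=(100, 3))
y = (X[:, 0] + X[:, 1] > 0).astype(int)

logistic_regression = make_pipeline(StandardScaler(), LogisticRegression())
logistic_regression.fit(X, y)

# `[-1]` selects the last step of the pipeline (the fitted classifier).
# `coef_` has shape (1, n_features), so `[0]` picks the single row of weights;
# `.ravel()` would flatten it to the same 1D array.
weights = logistic_regression[-1].coef_[0]
print(weights)
print(logistic_regression[-1].coef_.ravel())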
4 changes: 3 additions & 1 deletion python_scripts/metrics_classification.py
@@ -347,7 +347,9 @@
# of the positive class).

# %%
-prevalence = target_test.value_counts()[1] / target_test.value_counts().sum()
+prevalence = (
+    target_test.value_counts()["donated"] / target_test.value_counts().sum()
+)
print(f"Prevalence of the class 'donated': {prevalence:.2f}")

# %% [markdown]
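As a side note, the same prevalence can be obtained with `value_counts(normalize=True)`; a small sketch with a stand-in Series (the real `target_test` from the notebook is not shown in this diff):

import pandas as pd

# Stand-in for the notebook's `target_test` Series (assumption).
target_test = pd.Series(
    ["donated", "not donated", "not donated", "donated", "not donated"]
)

# `normalize=True` returns relative frequencies, i.e. counts divided by the total.
prevalence = target_test.value_counts(normalize=True)["donated"]
print(f"Prevalence of the class 'donated': {prevalence:.2f}")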
2 changes: 1 addition & 1 deletion python_scripts/parameter_tuning_sol_03.py
@@ -153,7 +153,7 @@
# holding on any axis of the parallel coordinate plot. You can then slide (move)
# the range selection and cross two selections to see the intersections.
#
-# Selecting the best performing models (i.e. above an accuracy of ~0.68), we
+# Selecting the best performing models (i.e. above R2 score of ~0.68), we
# observe that **in this case**:
#
# - scaling the data is important. All the best performing models use scaled
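For reference, a hedged sketch of a parallel coordinates plot like the one discussed above (the column names and values are assumptions; the notebook's actual search results dataframe is not shown in this diff):

import pandas as pd
import plotly.express as px

# Stand-in for the hyperparameter search results (assumed columns and values).
cv_results = pd.DataFrame(
    {
        "learning_rate": [0.01, 0.1, 1.0, 0.05],
        "max_leaf_nodes": [10, 30, 100, 50],
        "mean_test_score": [0.55, 0.70, 0.62, 0.68],
    }
)

# One vertical axis per column, one line per model; dragging a range on the
# score axis highlights the best performing models and lets you read off the
# corresponding hyperparameter ranges.
fig = px.parallel_coordinates(
    cv_results,
    color="mean_test_score",
    color_continuous_scale=px.colors.sequential.Viridis,
)
fig.show()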