Skip to content

Commit

Permalink
Update docstrings
Browse files Browse the repository at this point in the history
  • Loading branch information
ovejabu committed Jun 5, 2024
1 parent b9afee8 commit 955d98a
Show file tree
Hide file tree
Showing 10 changed files with 554 additions and 30 deletions.
2 changes: 1 addition & 1 deletion pipeline_lib/core/data_container.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ def update(self, other: DataContainer) -> None:
Update the data in this container with another DataContainer's data.
Parameters
=========
----------
other : DataContainer
The DataContainer to copy data from.
"""
Expand Down
10 changes: 6 additions & 4 deletions pipeline_lib/core/random_state_generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,9 @@ def get_random_state() -> Optional[RandomState]:
"""
Get the global random state object.
Returns:
RandomState or None: The global random state object if initialized, else None.
Returns
-------
RandomState or None: The global random state object if initialized, else None.
"""
global _random_state
return _random_state
Expand All @@ -21,8 +22,9 @@ def initialize_random_state(seed: int):
"""
Initialize the global random state object with the provided seed.
Args:
seed (int): The seed value to initialize the random state object.
Parameters
----------
seed (int): The seed value to initialize the random state object.
"""
global _random_state
_random_state = np.random.RandomState(seed)
58 changes: 52 additions & 6 deletions pipeline_lib/core/steps/calculate_features.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ class UnsupportedFeatureError(Exception):


class CalculateFeaturesStep(PipelineStep):
"""Calculate features."""
"""Calculate datetime-related features from specified columns."""

used_for_prediction = True
used_for_training = True
Expand All @@ -24,7 +24,14 @@ def __init__(
datetime_columns: Optional[Union[List[str], str]] = None,
features: Optional[List[str]] = None,
) -> None:
"""Initialize CalculateFeaturesStep."""
"""Initialize CalculateFeaturesStep.
Parameters
----------
datetime_columns : Union[List[str], str], optional
The name of the column or columns containing datetime values, by default None
features : Optional[List[str]], optional
"""
super().__init__()
self.init_logger()
self.datetime_columns = datetime_columns
Expand Down Expand Up @@ -59,7 +66,18 @@ def __init__(
)

def _convert_column_to_datetime(self, df: pd.DataFrame, column: str) -> pd.DataFrame:
"""Convert a column to datetime."""
"""Convert a column to datetime.
Parameters
----------
df : pd.DataFrame
The DataFrame containing the column to convert
column : str
The name of the column to convert
Returns
-------
pd.DataFrame
The DataFrame with the column converted to datetime
"""
# Check if the column is already a datetime type
if not is_datetime64_any_dtype(df[column]):
try:
Expand All @@ -78,7 +96,15 @@ def _convert_column_to_datetime(self, df: pd.DataFrame, column: str) -> pd.DataF
return df

def _extract_feature(self, df: pd.DataFrame, column: str, feature: str) -> None:
"""Extract a single feature from a datetime column."""
"""Extract a single feature from a datetime column.
Parameters
----------
df : pd.DataFrame
The DataFrame containing the datetime column
column : str
The name of the datetime column
feature : str
"""
extractor = self.feature_extractors[feature]
feature_column = f"{column}_{feature}"

Expand All @@ -97,7 +123,16 @@ def _extract_feature(self, df: pd.DataFrame, column: str, feature: str) -> None:
raise ValueError(error_message)

def execute(self, data: DataContainer) -> DataContainer:
"""Execute the step."""
"""Execute the step.
Parameters
----------
data : DataContainer
The data container
Returns
-------
DataContainer
The updated data container
"""
self.logger.info("Calculating features")

if not data.is_train:
Expand All @@ -121,7 +156,18 @@ def execute(self, data: DataContainer) -> DataContainer:
def _create_datetime_features(
self, df: pd.DataFrame, log: Optional[bool] = False
) -> pd.DataFrame:
"""Create datetime features."""
"""Create datetime features.
Parameters
----------
df : pd.DataFrame
The DataFrame containing the datetime columns
log : Optional[bool], optional
Whether to log warnings and errors, by default False
Returns
-------
pd.DataFrame
The DataFrame with the datetime features added
"""
created_features = []

if self.datetime_columns:
Expand Down
22 changes: 22 additions & 0 deletions pipeline_lib/core/steps/calculate_metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,18 @@ def __init__(self) -> None:
self.init_logger()

def _calculate_metrics(self, true_values: pd.Series, predictions: pd.Series) -> dict:
"""Calculate metrics.
Parameters
----------
true_values : pd.Series
True values
predictions : pd.Series
Predictions
Returns
-------
dict
Metrics
"""
mae = mean_absolute_error(true_values, predictions)
rmse = np.sqrt(mean_squared_error(true_values, predictions))
r2 = r2_score(true_values, predictions)
Expand All @@ -39,6 +51,16 @@ def _calculate_metrics(self, true_values: pd.Series, predictions: pd.Series) ->
}

def execute(self, data: DataContainer) -> DataContainer:
"""Execute the step.
Parameters
----------
data : DataContainer
The data container
Returns
-------
DataContainer
The updated data container
"""
self.logger.debug("Starting metric calculation")

metrics = {}
Expand Down
18 changes: 16 additions & 2 deletions pipeline_lib/core/steps/calculate_reports.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,12 +16,26 @@ class CalculateReportsStep(PipelineStep):
used_for_training = True

def __init__(self, max_samples: int = 1000) -> None:
"""Initialize CalculateReportsStep."""
"""Initialize CalculateReportsStep.
Parameters
----------
max_samples : int, optional
Maximum number of samples to use for calculating SHAP values, by default 1000
"""
self.init_logger()
self.max_samples = max_samples

def execute(self, data: DataContainer) -> DataContainer:
"""Execute the step."""
"""Execute the step.
Parameters
----------
data : DataContainer
The data container
Returns
-------
DataContainer
The updated data container
"""
self.logger.info("Calculating reports")

model = data.model
Expand Down
100 changes: 98 additions & 2 deletions pipeline_lib/core/steps/clean.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@


class CleanStep(PipelineStep):
"""Clean tabular data."""

used_for_prediction = True
used_for_training = True

Expand All @@ -19,6 +21,22 @@ def __init__(
drop_ids: Optional[dict] = None,
filter: Optional[dict] = None,
):
"""Initialize CleanStep.
Parameters
----------
fill_missing : Optional[dict], optional
Dictionary containing column names and fill values, by default None
remove_outliers : Optional[dict], optional
Dictionary containing column names and outlier removal methods, by default None
convert_dtypes : Optional[dict], optional
Dictionary containing column names and data types, by default None
drop_na_columns : Optional[list], optional
List of column names to drop rows with missing values, by default None
drop_ids : Optional[dict], optional
Dictionary containing column names and IDs to drop, by default None
filter : Optional[dict], optional
Dictionary containing column names and filter conditions, by default None
"""
self.init_logger()
self.fill_missing = fill_missing
self.remove_outliers = remove_outliers
Expand All @@ -28,6 +46,16 @@ def __init__(
self.filter = filter

def execute(self, data: DataContainer) -> DataContainer:
"""Execute the step.
Parameters
----------
data : DataContainer
The data container
Returns
-------
DataContainer
The updated data container
"""
self.logger.info("Cleaning tabular data...")

if not data.is_train:
Expand All @@ -46,8 +74,16 @@ def execute(self, data: DataContainer) -> DataContainer:
return data

def _clean_df(self, df: pd.DataFrame) -> pd.DataFrame:
"""Clean the DataFrame."""

"""Clean the DataFrame.
Parameters
----------
df : pd.DataFrame
The DataFrame to clean
Returns
-------
pd.DataFrame
The cleaned DataFrame
"""
df = self._filter(df)

df = self._remove_outliers(df)
Expand All @@ -63,6 +99,16 @@ def _clean_df(self, df: pd.DataFrame) -> pd.DataFrame:
return df

def _filter(self, df: pd.DataFrame) -> pd.DataFrame:
"""Filter the DataFrame.
Parameters
----------
df : pd.DataFrame
The DataFrame to filter
Returns
-------
pd.DataFrame
The filtered DataFrame
"""
if self.filter:
original_rows = len(df)
for key, value in self.filter.items():
Expand All @@ -83,6 +129,16 @@ def _filter(self, df: pd.DataFrame) -> pd.DataFrame:
return df

def _remove_outliers(self, df: pd.DataFrame) -> pd.DataFrame:
"""Remove outliers from the DataFrame.
Parameters
----------
df : pd.DataFrame
The DataFrame to remove outliers from
Returns
-------
pd.DataFrame
The DataFrame without outliers
"""
if self.remove_outliers:
for column, method in self.remove_outliers.items():
if column in df.columns:
Expand Down Expand Up @@ -110,6 +166,16 @@ def _remove_outliers(self, df: pd.DataFrame) -> pd.DataFrame:
return df

def _fill_missing(self, df: pd.DataFrame) -> pd.DataFrame:
"""Fill missing values in the DataFrame.
Parameters
----------
df : pd.DataFrame
The DataFrame to fill missing values in
Returns
-------
pd.DataFrame
The DataFrame with missing values filled
"""
if self.fill_missing:
for column, fill_value in self.fill_missing.items():
if column in df.columns:
Expand All @@ -122,6 +188,16 @@ def _fill_missing(self, df: pd.DataFrame) -> pd.DataFrame:
return df

def _convert_dtypes(self, df: pd.DataFrame) -> pd.DataFrame:
"""Convert column data types in the DataFrame.
Parameters
----------
df : pd.DataFrame
The DataFrame to convert column data types in
Returns
-------
pd.DataFrame
The DataFrame with converted column data types
"""
if self.convert_dtypes:
for column, dtype in self.convert_dtypes.items():
if column in df.columns:
Expand All @@ -132,6 +208,16 @@ def _convert_dtypes(self, df: pd.DataFrame) -> pd.DataFrame:
return df

def _drop_na_columns(self, df: pd.DataFrame) -> pd.DataFrame:
"""Drop rows with missing values in the DataFrame.
Parameters
----------
df : pd.DataFrame
The DataFrame to drop rows with missing values in
Returns
-------
pd.DataFrame
The DataFrame without rows with missing values
"""
if self.drop_na_columns:
for column in self.drop_na_columns:
if column in df.columns:
Expand All @@ -146,6 +232,16 @@ def _drop_na_columns(self, df: pd.DataFrame) -> pd.DataFrame:
return df

def _drop_ids(self, df: pd.DataFrame) -> pd.DataFrame:
"""Drop rows with specific IDs in the DataFrame.
Parameters
----------
df : pd.DataFrame
The DataFrame to drop rows with specific IDs in
Returns
-------
pd.DataFrame
The DataFrame without rows with specific IDs
"""
if self.drop_ids:
for column, ids in self.drop_ids.items():
if column in df.columns:
Expand Down
Loading

0 comments on commit 955d98a

Please sign in to comment.