Skip to content

Commit

Permalink
Update docstrings
Browse files Browse the repository at this point in the history
  • Loading branch information
ovejabu committed Jun 5, 2024
1 parent b9afee8 commit 955d98a
Show file tree
Hide file tree
Showing 10 changed files with 554 additions and 30 deletions.
2 changes: 1 addition & 1 deletion pipeline_lib/core/data_container.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ def update(self, other: DataContainer) -> None:
Update the data in this container with another DataContainer's data.
Parameters
=========
----------
other : DataContainer
The DataContainer to copy data from.
"""
Expand Down
10 changes: 6 additions & 4 deletions pipeline_lib/core/random_state_generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,9 @@ def get_random_state() -> Optional[RandomState]:
"""
Get the global random state object.
Returns:
RandomState or None: The global random state object if initialized, else None.
Returns
-------
RandomState or None: The global random state object if initialized, else None.
"""
global _random_state
return _random_state
Expand All @@ -21,8 +22,9 @@ def initialize_random_state(seed: int):
"""
Initialize the global random state object with the provided seed.
Args:
seed (int): The seed value to initialize the random state object.
Parameters
----------
seed (int): The seed value to initialize the random state object.
"""
global _random_state
_random_state = np.random.RandomState(seed)
58 changes: 52 additions & 6 deletions pipeline_lib/core/steps/calculate_features.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ class UnsupportedFeatureError(Exception):


class CalculateFeaturesStep(PipelineStep):
"""Calculate features."""
"""Calculate datetime-related features from specified columns."""

used_for_prediction = True
used_for_training = True
Expand All @@ -24,7 +24,14 @@ def __init__(
datetime_columns: Optional[Union[List[str], str]] = None,
features: Optional[List[str]] = None,
) -> None:
"""Initialize CalculateFeaturesStep."""
"""Initialize CalculateFeaturesStep.
Parameters
----------
datetime_columns : Union[List[str], str], optional
The name of the column or columns containing datetime values, by default None
features : Optional[List[str]], optional
"""
super().__init__()
self.init_logger()
self.datetime_columns = datetime_columns
Expand Down Expand Up @@ -59,7 +66,18 @@ def __init__(
)

def _convert_column_to_datetime(self, df: pd.DataFrame, column: str) -> pd.DataFrame:
"""Convert a column to datetime."""
"""Convert a column to datetime.
Parameters
----------
df : pd.DataFrame
The DataFrame containing the column to convert
column : str
The name of the column to convert
Returns
-------
pd.DataFrame
The DataFrame with the column converted to datetime
"""
# Check if the column is already a datetime type
if not is_datetime64_any_dtype(df[column]):
try:
Expand All @@ -78,7 +96,15 @@ def _convert_column_to_datetime(self, df: pd.DataFrame, column: str) -> pd.DataF
return df

def _extract_feature(self, df: pd.DataFrame, column: str, feature: str) -> None:
"""Extract a single feature from a datetime column."""
"""Extract a single feature from a datetime column.
Parameters
----------
df : pd.DataFrame
The DataFrame containing the datetime column
column : str
The name of the datetime column
feature : str
"""
extractor = self.feature_extractors[feature]
feature_column = f"{column}_{feature}"

Expand All @@ -97,7 +123,16 @@ def _extract_feature(self, df: pd.DataFrame, column: str, feature: str) -> None:
raise ValueError(error_message)

def execute(self, data: DataContainer) -> DataContainer:
"""Execute the step."""
"""Execute the step.
Parameters
----------
data : DataContainer
The data container
Returns
-------
DataContainer
The updated data container
"""
self.logger.info("Calculating features")

if not data.is_train:
Expand All @@ -121,7 +156,18 @@ def execute(self, data: DataContainer) -> DataContainer:
def _create_datetime_features(
self, df: pd.DataFrame, log: Optional[bool] = False
) -> pd.DataFrame:
"""Create datetime features."""
"""Create datetime features.
Parameters
----------
df : pd.DataFrame
The DataFrame containing the datetime columns
log : Optional[bool], optional
Whether to log warnings and errors, by default False
Returns
-------
pd.DataFrame
The DataFrame with the datetime features added
"""
created_features = []

if self.datetime_columns:
Expand Down
22 changes: 22 additions & 0 deletions pipeline_lib/core/steps/calculate_metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,18 @@ def __init__(self) -> None:
self.init_logger()

def _calculate_metrics(self, true_values: pd.Series, predictions: pd.Series) -> dict:
"""Calculate metrics.
Parameters
----------
true_values : pd.Series
True values
predictions : pd.Series
Predictions
Returns
-------
dict
Metrics
"""
mae = mean_absolute_error(true_values, predictions)
rmse = np.sqrt(mean_squared_error(true_values, predictions))
r2 = r2_score(true_values, predictions)
Expand All @@ -39,6 +51,16 @@ def _calculate_metrics(self, true_values: pd.Series, predictions: pd.Series) ->
}

def execute(self, data: DataContainer) -> DataContainer:
"""Execute the step.
Parameters
----------
data : DataContainer
The data container
Returns
-------
DataContainer
The updated data container
"""
self.logger.debug("Starting metric calculation")

metrics = {}
Expand Down
18 changes: 16 additions & 2 deletions pipeline_lib/core/steps/calculate_reports.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,12 +16,26 @@ class CalculateReportsStep(PipelineStep):
used_for_training = True

def __init__(self, max_samples: int = 1000) -> None:
"""Initialize CalculateReportsStep."""
"""Initialize CalculateReportsStep.
Parameters
----------
max_samples : int, optional
Maximum number of samples to use for calculating SHAP values, by default 1000
"""
self.init_logger()
self.max_samples = max_samples

def execute(self, data: DataContainer) -> DataContainer:
"""Execute the step."""
"""Execute the step.
Parameters
----------
data : DataContainer
The data container
Returns
-------
DataContainer
The updated data container
"""
self.logger.info("Calculating reports")

model = data.model
Expand Down
100 changes: 98 additions & 2 deletions pipeline_lib/core/steps/clean.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@


class CleanStep(PipelineStep):
"""Clean tabular data."""

used_for_prediction = True
used_for_training = True

Expand All @@ -19,6 +21,22 @@ def __init__(
drop_ids: Optional[dict] = None,
filter: Optional[dict] = None,
):
"""Initialize CleanStep.
Parameters
----------
fill_missing : Optional[dict], optional
Dictionary containing column names and fill values, by default None
remove_outliers : Optional[dict], optional
Dictionary containing column names and outlier removal methods, by default None
convert_dtypes : Optional[dict], optional
Dictionary containing column names and data types, by default None
drop_na_columns : Optional[list], optional
List of column names to drop rows with missing values, by default None
drop_ids : Optional[dict], optional
Dictionary containing column names and IDs to drop, by default None
filter : Optional[dict], optional
Dictionary containing column names and filter conditions, by default None
"""
self.init_logger()
self.fill_missing = fill_missing
self.remove_outliers = remove_outliers
Expand All @@ -28,6 +46,16 @@ def __init__(
self.filter = filter

def execute(self, data: DataContainer) -> DataContainer:
"""Execute the step.
Parameters
----------
data : DataContainer
The data container
Returns
-------
DataContainer
The updated data container
"""
self.logger.info("Cleaning tabular data...")

if not data.is_train:
Expand All @@ -46,8 +74,16 @@ def execute(self, data: DataContainer) -> DataContainer:
return data

def _clean_df(self, df: pd.DataFrame) -> pd.DataFrame:
"""Clean the DataFrame."""

"""Clean the DataFrame.
Parameters
----------
df : pd.DataFrame
The DataFrame to clean
Returns
-------
pd.DataFrame
The cleaned DataFrame
"""
df = self._filter(df)

df = self._remove_outliers(df)
Expand All @@ -63,6 +99,16 @@ def _clean_df(self, df: pd.DataFrame) -> pd.DataFrame:
return df

def _filter(self, df: pd.DataFrame) -> pd.DataFrame:
"""Filter the DataFrame.
Parameters
----------
df : pd.DataFrame
The DataFrame to filter
Returns
-------
pd.DataFrame
The filtered DataFrame
"""
if self.filter:
original_rows = len(df)
for key, value in self.filter.items():
Expand All @@ -83,6 +129,16 @@ def _filter(self, df: pd.DataFrame) -> pd.DataFrame:
return df

def _remove_outliers(self, df: pd.DataFrame) -> pd.DataFrame:
"""Remove outliers from the DataFrame.
Parameters
----------
df : pd.DataFrame
The DataFrame to remove outliers from
Returns
-------
pd.DataFrame
The DataFrame without outliers
"""
if self.remove_outliers:
for column, method in self.remove_outliers.items():
if column in df.columns:
Expand Down Expand Up @@ -110,6 +166,16 @@ def _remove_outliers(self, df: pd.DataFrame) -> pd.DataFrame:
return df

def _fill_missing(self, df: pd.DataFrame) -> pd.DataFrame:
"""Fill missing values in the DataFrame.
Parameters
----------
df : pd.DataFrame
The DataFrame to fill missing values in
Returns
-------
pd.DataFrame
The DataFrame with missing values filled
"""
if self.fill_missing:
for column, fill_value in self.fill_missing.items():
if column in df.columns:
Expand All @@ -122,6 +188,16 @@ def _fill_missing(self, df: pd.DataFrame) -> pd.DataFrame:
return df

def _convert_dtypes(self, df: pd.DataFrame) -> pd.DataFrame:
"""Convert column data types in the DataFrame.
Parameters
----------
df : pd.DataFrame
The DataFrame to convert column data types in
Returns
-------
pd.DataFrame
The DataFrame with converted column data types
"""
if self.convert_dtypes:
for column, dtype in self.convert_dtypes.items():
if column in df.columns:
Expand All @@ -132,6 +208,16 @@ def _convert_dtypes(self, df: pd.DataFrame) -> pd.DataFrame:
return df

def _drop_na_columns(self, df: pd.DataFrame) -> pd.DataFrame:
"""Drop rows with missing values in the DataFrame.
Parameters
----------
df : pd.DataFrame
The DataFrame to drop rows with missing values in
Returns
-------
pd.DataFrame
The DataFrame without rows with missing values
"""
if self.drop_na_columns:
for column in self.drop_na_columns:
if column in df.columns:
Expand All @@ -146,6 +232,16 @@ def _drop_na_columns(self, df: pd.DataFrame) -> pd.DataFrame:
return df

def _drop_ids(self, df: pd.DataFrame) -> pd.DataFrame:
"""Drop rows with specific IDs in the DataFrame.
Parameters
----------
df : pd.DataFrame
The DataFrame to drop rows with specific IDs in
Returns
-------
pd.DataFrame
The DataFrame without rows with specific IDs
"""
if self.drop_ids:
for column, ids in self.drop_ids.items():
if column in df.columns:
Expand Down
Loading

0 comments on commit 955d98a

Please sign in to comment.