diff --git a/docs/source/logai.algorithms.nn_model.forecast_nn.rst b/docs/source/logai.algorithms.nn_model.forecast_nn.rst new file mode 100644 index 0000000..d734132 --- /dev/null +++ b/docs/source/logai.algorithms.nn_model.forecast_nn.rst @@ -0,0 +1,53 @@ +logai.algorithms.nn\_model.forecast\_nn package +=============================================== + +Submodules +---------- + +logai.algorithms.nn\_model.forecast\_nn.base\_nn module +------------------------------------------------------- + +.. automodule:: logai.algorithms.nn_model.forecast_nn.base_nn + :members: + :undoc-members: + :show-inheritance: + +logai.algorithms.nn\_model.forecast\_nn.cnn module +-------------------------------------------------- + +.. automodule:: logai.algorithms.nn_model.forecast_nn.cnn + :members: + :undoc-members: + :show-inheritance: + +logai.algorithms.nn\_model.forecast\_nn.lstm module +--------------------------------------------------- + +.. automodule:: logai.algorithms.nn_model.forecast_nn.lstm + :members: + :undoc-members: + :show-inheritance: + +logai.algorithms.nn\_model.forecast\_nn.transformer module +---------------------------------------------------------- + +.. automodule:: logai.algorithms.nn_model.forecast_nn.transformer + :members: + :undoc-members: + :show-inheritance: + +logai.algorithms.nn\_model.forecast\_nn.utils module +---------------------------------------------------- + +.. automodule:: logai.algorithms.nn_model.forecast_nn.utils + :members: + :undoc-members: + :show-inheritance: + +Module contents +--------------- + +.. automodule:: logai.algorithms.nn_model.forecast_nn + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/logai.algorithms.nn_model.logbert.rst b/docs/source/logai.algorithms.nn_model.logbert.rst new file mode 100644 index 0000000..8296557 --- /dev/null +++ b/docs/source/logai.algorithms.nn_model.logbert.rst @@ -0,0 +1,61 @@ +logai.algorithms.nn\_model.logbert package +========================================== + +Submodules +---------- + +logai.algorithms.nn\_model.logbert.configs module +------------------------------------------------- + +.. automodule:: logai.algorithms.nn_model.logbert.configs + :members: + :undoc-members: + :show-inheritance: + +logai.algorithms.nn\_model.logbert.eval\_metric\_utils module +------------------------------------------------------------- + +.. automodule:: logai.algorithms.nn_model.logbert.eval_metric_utils + :members: + :undoc-members: + :show-inheritance: + +logai.algorithms.nn\_model.logbert.predict module +------------------------------------------------- + +.. automodule:: logai.algorithms.nn_model.logbert.predict + :members: + :undoc-members: + :show-inheritance: + +logai.algorithms.nn\_model.logbert.predict\_utils module +-------------------------------------------------------- + +.. automodule:: logai.algorithms.nn_model.logbert.predict_utils + :members: + :undoc-members: + :show-inheritance: + +logai.algorithms.nn\_model.logbert.tokenizer\_utils module +---------------------------------------------------------- + +.. automodule:: logai.algorithms.nn_model.logbert.tokenizer_utils + :members: + :undoc-members: + :show-inheritance: + +logai.algorithms.nn\_model.logbert.train module +----------------------------------------------- + +.. automodule:: logai.algorithms.nn_model.logbert.train + :members: + :undoc-members: + :show-inheritance: + +Module contents +--------------- + +.. 
automodule:: logai.algorithms.nn_model.logbert + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/logai.algorithms.nn_model.rst b/docs/source/logai.algorithms.nn_model.rst index 99dd9c5..fe13d58 100644 --- a/docs/source/logai.algorithms.nn_model.rst +++ b/docs/source/logai.algorithms.nn_model.rst @@ -1,6 +1,15 @@ logai.algorithms.nn\_model package ================================== +Subpackages +----------- + +.. toctree:: + :maxdepth: 4 + + logai.algorithms.nn_model.forecast_nn + logai.algorithms.nn_model.logbert + Submodules ---------- diff --git a/docs/source/logai.applications.openset.anomaly_detection.configs.rst b/docs/source/logai.applications.openset.anomaly_detection.configs.rst new file mode 100644 index 0000000..3c334d7 --- /dev/null +++ b/docs/source/logai.applications.openset.anomaly_detection.configs.rst @@ -0,0 +1,21 @@ +logai.applications.openset.anomaly\_detection.configs package +============================================================= + +Submodules +---------- + +logai.applications.openset.anomaly\_detection.configs.schema module +------------------------------------------------------------------- + +.. automodule:: logai.applications.openset.anomaly_detection.configs.schema + :members: + :undoc-members: + :show-inheritance: + +Module contents +--------------- + +.. automodule:: logai.applications.openset.anomaly_detection.configs + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/logai.applications.openset.anomaly_detection.rst b/docs/source/logai.applications.openset.anomaly_detection.rst new file mode 100644 index 0000000..e7dfbb9 --- /dev/null +++ b/docs/source/logai.applications.openset.anomaly_detection.rst @@ -0,0 +1,29 @@ +logai.applications.openset.anomaly\_detection package +===================================================== + +Subpackages +----------- + +.. toctree:: + :maxdepth: 4 + + logai.applications.openset.anomaly_detection.configs + +Submodules +---------- + +logai.applications.openset.anomaly\_detection.openset\_anomaly\_detection\_workflow module +------------------------------------------------------------------------------------------ + +.. automodule:: logai.applications.openset.anomaly_detection.openset_anomaly_detection_workflow + :members: + :undoc-members: + :show-inheritance: + +Module contents +--------------- + +.. automodule:: logai.applications.openset.anomaly_detection + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/logai.applications.openset.rst b/docs/source/logai.applications.openset.rst new file mode 100644 index 0000000..a17dcd0 --- /dev/null +++ b/docs/source/logai.applications.openset.rst @@ -0,0 +1,18 @@ +logai.applications.openset package +================================== + +Subpackages +----------- + +.. toctree:: + :maxdepth: 4 + + logai.applications.openset.anomaly_detection + +Module contents +--------------- + +.. automodule:: logai.applications.openset + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/logai.applications.rst b/docs/source/logai.applications.rst index 7e120e3..e35471a 100644 --- a/docs/source/logai.applications.rst +++ b/docs/source/logai.applications.rst @@ -1,6 +1,14 @@ logai.applications package ========================== +Subpackages +----------- + +.. 
toctree:: + :maxdepth: 4 + + logai.applications.openset + Submodules ---------- diff --git a/examples/jupyter_notebook/tutorial_deep_ad.md b/examples/jupyter_notebook/tutorial_deep_ad.md index 584395f..f04ce78 100644 --- a/examples/jupyter_notebook/tutorial_deep_ad.md +++ b/examples/jupyter_notebook/tutorial_deep_ad.md @@ -236,7 +236,6 @@ workflow_config: Then to run the end to end log anomaly detection on the HDFS dataset using LSTM Anomaly Detector (a sequence-based deep-learning model), you can use a python script like below: ```python -import os from logai.applications.openset.anomaly_detection.openset_anomaly_detection_workflow import OpenSetADWorkflowConfig from logai.utils.file_utils import read_file from logai.utils.dataset_utils import split_train_dev_test_for_anomaly_detection @@ -299,6 +298,7 @@ print (predict_results) This kind of Anomaly Detection workflow for various Deep-Learning models and various experimental settings have also been automated in `logai.applications.openset.anomaly_detection.openset_anomaly_detection_workflow.OpenSetADWorkflow` class which can be easily invoked like the below example ```python +import os from logai.applications.openset.anomaly_detection.openset_anomaly_detection_workflow import OpenSetADWorkflow, get_openset_ad_config TEST_DATA_PATH = "tests/logai/test_data/HDFS_AD/HDFS_5k.log" diff --git a/logai/algorithms/anomaly_detection_algo/anomaly_detector_het.py b/logai/algorithms/anomaly_detection_algo/anomaly_detector_het.py index 5ec29af..b6d01c5 100644 --- a/logai/algorithms/anomaly_detection_algo/anomaly_detector_het.py +++ b/logai/algorithms/anomaly_detection_algo/anomaly_detector_het.py @@ -14,7 +14,7 @@ class HetAnomalyDetectionConfig(AnomalyDetectionConfig): """ - Heterogeneous Anomaly Detector Parameters + Heterogeneous Anomaly Detector Parameters. :param train_test_ratio: The ratio between test and training splits. """ @@ -26,7 +26,8 @@ class HetAnomalyDetector(AnomalyDetector): """ Anomaly Detector Wrapper to handle heterogeneous log feature dataframe which include various attributes of log. For each attribute, we build its specific anomaly detector if the data satisfies the requirement. - This current version only supports anomaly detection on the constants.LOGLINE_COUNTS field (i.e. frequency count of the log events) + This current version only supports anomaly detection on the constants.LOGLINE_COUNTS field (i.e. frequency count of + the log events). """ def __init__(self, config: HetAnomalyDetectionConfig): @@ -40,11 +41,11 @@ def __init__(self, config: HetAnomalyDetectionConfig): def preprocess(self, counter_df: pd.DataFrame): """ - Split raw log feature dataframe by unique attribute ID + Splits raw log feature dataframe by unique attribute ID. - :param counter_df: log feature dataframe must contain at least two columns + :param counter_df: A log feature dataframe that must contain at least two columns ['timestamp': datetime, constants.LOGLINE_COUNTS: int]. - The rest of columns combinations are treated as log attribute ID + The rest of columns combinations are treated as log attribute ID. :return: The processed log feature dataframe. """ ts_df = counter_df[[constants.LOG_COUNTS]] @@ -55,13 +56,13 @@ def preprocess(self, counter_df: pd.DataFrame): attr_list = counter_df["attribute"].unique() return attr_list - def fit_predict(self, log_feature: pd.DataFrame): + def fit_predict(self, log_feature: pd.DataFrame) -> pd.DataFrame: """ - Train and predict anomaly scores + Trains a model and predicts anomaly scores. 
- :param log_features: log feature dataframe must contain at least two columns + :param log_feature: A log feature dataframe that must contain at least two columns ['timestamp': datetime, constants.LOGLINE_COUNTS: int]. - The rest of columns combinations are treated as log attribute ID + The rest of columns combinations are treated as log attribute ID. :return: The predicted anomaly scores. """ res = pd.DataFrame() diff --git a/logai/algorithms/anomaly_detection_algo/dbl.py b/logai/algorithms/anomaly_detection_algo/dbl.py index 9da3bf0..927ed57 100644 --- a/logai/algorithms/anomaly_detection_algo/dbl.py +++ b/logai/algorithms/anomaly_detection_algo/dbl.py @@ -24,7 +24,7 @@ class DBLDetectorParams(Config): """ Dynamic Baseline Parameters. For more details on the paramaters see -https://opensource.salesforce.com/Merlion/v1.3.1/merlion.models.anomaly.html#module-merlion.models.anomaly.dbl +https://opensource.salesforce.com/Merlion/v1.3.1/merlion.models.anomaly.html#module-merlion.models.anomaly.dbl. :param threshold: The rule to use for thresholding anomaly scores. :param fixed_period: ``(t0, tf)``; Train the model on all datapoints occurring between t0 and tf (inclusive). @@ -46,10 +46,10 @@ class DBLDetectorParams(Config): @factory.register("detection", "dbl", DBLDetectorParams) class DBLDetector(AnomalyDetectionAlgo): """Dynamic baseline based time series anomaly detection. This is a wrapper class for the Dynamic Baseline - anomaly detection model from Merlion library + anomaly detection model from Merlion library. https://opensource.salesforce.com/Merlion/v1.3.1/merlion.models.anomaly.html#module-merlion.models.anomaly.dbl Current implementation only supports anomaly detection on the constants.LOGLINE_COUNTS class (which maintains - frequency counts of the log events ) + frequency counts of the log events). """ def __init__(self, params: DBLDetectorParams): dbl_config = DynamicBaselineConfig( @@ -65,9 +65,9 @@ def __init__(self, params: DBLDetectorParams): def fit(self, log_features: pd.DataFrame): """ - Train method of the Dynamic Baseline model + Training method of the Dynamic Baseline model. - :param log_features: log feature dataframe must only contain two columns + :param log_features: A log feature dataframe that must only contain two columns ['timestamp': datetime, constants.LOGLINE_COUNTS: int]. """ self._is_valid_ts_df(log_features) @@ -77,9 +77,10 @@ def fit(self, log_features: pd.DataFrame): def predict(self, log_features: pd.DataFrame): """ - Predict anomaly scores for log_feature["timestamp", constants.LOGLINE_COUNTS] + Predicts anomaly scores for log_feature["timestamp", constants.LOGLINE_COUNTS]. - :param log_features: log feature dataframe must contain two columns ['timestamp': datetime, 'counts': int]. + :param log_features: A log feature dataframe that must contain two columns + ['timestamp': datetime, 'counts': int]. :return: A dataframe of the predicted anomaly scores, e.g., index:log_features.index. value: anomaly score to indicate if anomaly or not. """ diff --git a/logai/algorithms/anomaly_detection_algo/distribution_divergence.py b/logai/algorithms/anomaly_detection_algo/distribution_divergence.py index bdc9236..d0394a3 100755 --- a/logai/algorithms/anomaly_detection_algo/distribution_divergence.py +++ b/logai/algorithms/anomaly_detection_algo/distribution_divergence.py @@ -35,8 +35,8 @@ def _js_divergence(p, q): class DistributionDivergenceParams(Config): """Parameters for distribution divergence based anomaly detector.
- :param n_bins: number of bins to use to discretize the continuous distribution into a discrete distribution - :param type: list of types of distribution divergences. The allowed types are Kullback–Leibler ("KL"), Jensen–Shannon + :param n_bins: The number of bins to use to discretize the continuous distribution into a discrete distribution. + :param type: A list of types of distribution divergences. The allowed types are Kullback–Leibler ("KL"), Jensen–Shannon ("JS"). It also allows a comma separated list of metrics like ("KL,JS" or "JS,KL"). """ n_bins: int = 100 diff --git a/logai/algorithms/anomaly_detection_algo/ets.py b/logai/algorithms/anomaly_detection_algo/ets.py index f9ba08a..a7fc76b 100644 --- a/logai/algorithms/anomaly_detection_algo/ets.py +++ b/logai/algorithms/anomaly_detection_algo/ets.py @@ -54,10 +54,9 @@ class ETSDetectorParams(Config): @factory.register("detection", "ets", ETSDetectorParams) class ETSDetector(AnomalyDetectionAlgo): - """ - ETS Anomaly Detector. This is a wrapper for the ETS based Anomaly Detector from Merlion library - https://opensource.salesforce.com/Merlion/v1.0.2/merlion.models.forecast.html#module-merlion.models.forecast.ets. - This current version only supports anomaly detection of the constants.LOGLINE_COUNTS (i.e. frequency count of log events). + """ETS Anomaly Detector. This is a wrapper for the ETS based Anomaly Detector from Merlion library + https://opensource.salesforce.com/Merlion/v1.0.2/merlion.models.forecast.html#module-merlion.models.forecast.ets. + This current version only supports anomaly detection of the constants.LOGLINE_COUNTS (i.e. frequency count of log events). """ def __init__(self, params: ETSDetectorParams): @@ -77,12 +76,11 @@ def __init__(self, params: ETSDetectorParams): def fit(self, log_features: pd.DataFrame): """ - Fit method to train ETS Anomaly Detector. - - :param log_features: log feature dataframe must only contain two columns + :param log_features: A log feature dataframe that must only contain two columns ['timestamp': datetime, constants.LOGLINE_COUNTS: int]. - :return: train_scores: anomaly scores dataframe + :return: train_scores: The anomaly scores dataframe ['index':log_features.index, 'timestamps': datetime, 'anom_score': scores, 'trainval': whether it is training set. """ @@ -100,11 +98,11 @@ def fit(self, log_features: pd.DataFrame): def predict(self, log_features: pd.DataFrame): """ - Predict anomaly scores for log_feature["timestamp", constants.LOGLINE_COUNTS]. + Predicts anomaly scores for log_feature["timestamp", constants.LOGLINE_COUNTS]. - :param log_features: log feature dataframe must only contain two columns + :param log_features: A log feature dataframe that must only contain two columns ['timestamp': datetime, constants.LOGLINE_COUNTS: int]. - :return: test_scores: anomaly scores dataframe + :return: test_scores: The anomaly scores dataframe ['index':log_features.index, 'timestamps': datetime, 'anom_score': scores, 'trainval': whether it is training set. """ diff --git a/logai/algorithms/anomaly_detection_algo/forecast_nn.py b/logai/algorithms/anomaly_detection_algo/forecast_nn.py index fcbdb97..72f1d7f 100755 --- a/logai/algorithms/anomaly_detection_algo/forecast_nn.py +++ b/logai/algorithms/anomaly_detection_algo/forecast_nn.py @@ -20,17 +20,13 @@ class ForcastBasedNeuralAD(NNAnomalyDetectionAlgo): """Forcasting based neural anomaly detection models taken from the deep-loglizer paper - (https://arxiv.org/pdf/2107.05908.pdf) + (https://arxiv.org/pdf/2107.05908.pdf).
- Inherits: - NNAnomalyDetectionAlgo : interface of neural anomaly detection algorithms + :param config: The parameters of general forecasting based neural anomaly detection models. """ def __init__(self, config: ForecastBasedNNParams): - """Initialization for forecasting based neural anomaly detection models. - - :param config: The config for general forecasting based neural anomaly detection models. - """ + self.model = None self.config = config @@ -41,9 +37,9 @@ def fit( ): """The fit method to train forecasting based neural anomaly detection models. - :param train_data: training dataset of type ForecastNNVectorizedDataset + :param train_data: The training dataset of type ForecastNNVectorizedDataset (consisting of session_idx, features, window_anomalies and window_labels). - :param dev_data: development dataset of type ForecastNNVectorizedDataset + :param dev_data: The development dataset of type ForecastNNVectorizedDataset (consisting of session_idx, features, window_anomalies and window_labels). """ dataloader_train = DataLoader( @@ -65,7 +61,7 @@ def predict(self, test_data: ForecastNNVectorizedDataset): :param test_data: The test dataset of type ForecastNNVectorizedDataset (consisting of session_idx, features, window_anomalies and window_labels). - :returns: A dict containing overall evaluation results. + :return: A dict containing overall evaluation results. """ dataloader_test = DataLoader( test_data.dataset, @@ -80,16 +76,11 @@ def predict(self, test_data: ForecastNNVectorizedDataset): @factory.register("detection", "lstm", LSTMParams) class ForecastBasedLSTM(ForcastBasedNeuralAD): """Forecasting based lstm model for log anomaly detection. - - Inherits: - ForcastBasedNeuralAD: base class for forecast based neural models for anomaly detection. + :param config: A config object containing parameters for LSTM based anomaly detection model. """ def __init__(self, config: LSTMParams): - """Initializing ForecastBasedLSTM object - - :param config: A config object containing parameters for LSTM based anomaly detection model. - """ + super().__init__(config) self.config = config self.model = LSTM(config=self.config) @@ -98,16 +89,10 @@ def __init__(self, config: LSTMParams): @factory.register("detection", "cnn", CNNParams) class ForecastBasedCNN(ForcastBasedNeuralAD): """Forecasting based cnn model for log anomaly detection. - - Inherits: - ForcastBasedNeuralAD: base class for forecast based neural models for anomaly detection. + :param config: A config object containing parameters for CNN based anomaly detection model. """ def __init__(self, config: CNNParams): - """Initializing ForecastBasedCNN object. - - :param config: A config object containing parameters for CNN based anomaly detection model. - """ super().__init__(config) self.config = config self.model = CNN(config=self.config) @@ -116,17 +101,11 @@ def __init__(self, config: CNNParams): @factory.register("detection", "transformer", TransformerParams) class ForecastBasedTransformer(ForcastBasedNeuralAD): """Forecasting based transformer model for log anomaly detection. - - Inherits: - ForcastBasedNeuralAD: base class for forecast based neural models for anomaly detection. + :param config: A config object containing parameters for Transformer based anomaly detection model. """ def __init__(self, config: TransformerParams): - """Initializing ForecastBasedTransformer object. - - :param config: A config object containing parameters for - Transformer based anomaly detection model. 
- """ + super().__init__(config) self.config = config self.model = Transformer(config=self.config) diff --git a/logai/algorithms/anomaly_detection_algo/isolation_forest.py b/logai/algorithms/anomaly_detection_algo/isolation_forest.py index f7242e9..4ace261 100755 --- a/logai/algorithms/anomaly_detection_algo/isolation_forest.py +++ b/logai/algorithms/anomaly_detection_algo/isolation_forest.py @@ -50,8 +50,8 @@ class IsolationForestDetector(AnomalyDetectionAlgo): """ def __init__(self, params: IsolationForestParams): """Constructor for isolation forest based anomaly detector. - - :param params: The object of type IsolationForestParams containing parameters of Isolation Forest. + + :param params: An object of IsolationForestParams containing parameters of Isolation Forest. """ self.model = IsolationForest( n_estimators=params.n_estimators, @@ -82,7 +82,7 @@ def predict(self, log_features: pd.DataFrame) -> pd.Series: """ Predicts anomalies. - :param log_features: The input for inference + :param log_features: The input for inference. :return: A pandas dataframe of the predicted anomaly scores. """ test_scores = self.model.predict(log_features) diff --git a/logai/algorithms/anomaly_detection_algo/local_outlier_factor.py b/logai/algorithms/anomaly_detection_algo/local_outlier_factor.py index 2814c0b..e85fd62 100755 --- a/logai/algorithms/anomaly_detection_algo/local_outlier_factor.py +++ b/logai/algorithms/anomaly_detection_algo/local_outlier_factor.py @@ -17,7 +17,7 @@ @dataclass class LOFParams(Config): - """Parameters of Locality Outlier Factors based Anomaly Detector + """Parameters of Locality Outlier Factors based Anomaly Detector . For more explanations of the parameters see https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.LocalOutlierFactor.html. :param n_neighbors: Number of neighbors to use by default for kneighbors queries. @@ -66,7 +66,7 @@ def fit(self, log_features: pd.DataFrame): Fits a LOF model. :param log_features: The input for model training. - :return: The scores of the training dataset. + :return: pandas.Dataframe : The scores of the training dataset. """ self.model.fit( np.array(log_features) @@ -80,7 +80,7 @@ def predict(self, log_features: pd.DataFrame) -> pd.Series: """ Predicts anomaly scores. - :param log_features: The input for inference + :param log_features: The input for inference. :return: A pandas dataframe of the predicted anomaly scores. """ test_scores = self.model.predict( diff --git a/logai/algorithms/anomaly_detection_algo/logbert.py b/logai/algorithms/anomaly_detection_algo/logbert.py index bc6897d..f83ecee 100755 --- a/logai/algorithms/anomaly_detection_algo/logbert.py +++ b/logai/algorithms/anomaly_detection_algo/logbert.py @@ -16,20 +16,17 @@ @factory.register("detection", "logbert", LogBERTConfig) class LogBERT(NNAnomalyDetectionAlgo): - """Logbert model for anomaly detection of logs. + """Logbert model for anomaly detection of logs + :param config: A config object for logbert model. """ def __init__(self, config: LogBERTConfig): - """initializing logBERT model - - :param config: The config object for logbert model. - """ self.logbert_train = LogBERTTrain(config=config) self.logbert_predict = LogBERTPredict(config=config) def fit(self, train_data: HFDataset, dev_data: HFDataset): - """Fit method for training logBERT model - + """Fit method for training logBERT model. + :param train_data: The training dataset of type huggingface Dataset object. :param dev_data: The development dataset of type huggingface Dataset object. 
""" @@ -37,9 +34,8 @@ def fit(self, train_data: HFDataset, dev_data: HFDataset): def predict(self, test_data: HFDataset) -> pd.DataFrame: """Predict method for running inference on logBERT model. - + :param test_data: The test dataset of type huggingface Dataset object. - :returns: A pandas dataframe containing the evaluation results - for each type of metric. + :return: A pandas dataframe object containing the evaluation results for each type of metric. """ return self.logbert_predict.predict(test_data) diff --git a/logai/algorithms/anomaly_detection_algo/one_class_svm.py b/logai/algorithms/anomaly_detection_algo/one_class_svm.py index 7b1b6de..ff06272 100755 --- a/logai/algorithms/anomaly_detection_algo/one_class_svm.py +++ b/logai/algorithms/anomaly_detection_algo/one_class_svm.py @@ -47,8 +47,8 @@ def __init__(self, params: OneClassSVMParams): """ OneClass SVM based Anomaly Detector. This is a wrapper class for the OneClassSVM model from scikit-learn library. For more details see https://scikit-learn.org/stable/modules/generated/sklearn.svm.OneClassSVM.html. - - :param params: OneClassSVMParams: The config to control one class SVM models. + + :param params: The parameters to control one class SVM models. """ self.model = OneClassSVM( kernel=params.kernel, diff --git a/logai/algorithms/categorical_encoding_algo/label_encoding.py b/logai/algorithms/categorical_encoding_algo/label_encoding.py index 6755523..f0b1aa7 100755 --- a/logai/algorithms/categorical_encoding_algo/label_encoding.py +++ b/logai/algorithms/categorical_encoding_algo/label_encoding.py @@ -25,7 +25,7 @@ def fit_transform(self, log_attributes: pd.DataFrame): """ Fits and transforms log_attributes into label encoding categories. - :param log_attributes: list of log attributes in text format. + :param log_attributes: A list of log attributes in text format. :return: The label encoding categories. """ diff --git a/logai/algorithms/categorical_encoding_algo/one_hot_encoding.py b/logai/algorithms/categorical_encoding_algo/one_hot_encoding.py index 110c0cc..679cc43 100755 --- a/logai/algorithms/categorical_encoding_algo/one_hot_encoding.py +++ b/logai/algorithms/categorical_encoding_algo/one_hot_encoding.py @@ -34,9 +34,6 @@ class OneHotEncoding(CategoricalEncodingAlgo): https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OneHotEncoder.html. """ def __init__(self, params: OneHotEncodingParams): - """ - Initializes the one-hot encoder. - """ self.model = OneHotEncoder( categories=params.categories, drop=params.drop, @@ -48,9 +45,9 @@ def __init__(self, params: OneHotEncodingParams): def fit_transform(self, log_attributes: pd.DataFrame) -> pd.DataFrame: """ Fits and transforms log attributes into one-hot encoding categories. - + :param log_attributes: A list of log attributes in text form. - :return: The categories in on-hot encoding. + :return: A pandas dataframe of categories in on-hot encoding. """ col_names = log_attributes.columns if len(col_names) == 1: diff --git a/logai/algorithms/categorical_encoding_algo/ordinal_encoding.py b/logai/algorithms/categorical_encoding_algo/ordinal_encoding.py index 830764c..1b6de81 100755 --- a/logai/algorithms/categorical_encoding_algo/ordinal_encoding.py +++ b/logai/algorithms/categorical_encoding_algo/ordinal_encoding.py @@ -49,9 +49,9 @@ def __init__(self, params: OrdinalEncodingParams): def fit_transform(self, log_attributes: pd.DataFrame) -> pd.DataFrame: """ Fits and transforms log attributes into ordinal encoding categories. 
- + :param log_attributes: A list of log attributes in text format. - :return: The ordinal encoding categories. + :return: A pandas dataframe of ordinal encoding categories. """ self.model.fit(log_attributes) res_column_names = ["{}-categorical".format(c) for c in log_attributes.columns] diff --git a/logai/algorithms/nn_model/forecast_nn/base_nn.py b/logai/algorithms/nn_model/forecast_nn/base_nn.py index ee40d7b..5452341 100755 --- a/logai/algorithms/nn_model/forecast_nn/base_nn.py +++ b/logai/algorithms/nn_model/forecast_nn/base_nn.py @@ -24,34 +24,29 @@ @dataclass class ForecastBasedNNParams(Config): - """Config for neural representation learning for logs using forecasting based self-supervised tasks - - Inherits: - Config : Config interface - - model_name: name of the model - metadata_filepath: path to file containing meta data (pretrained token embeddings in case if semantic log - representations are used in feature type) - output_dir: path to output directory where the model would be dumped - feature_type: (should be "semantics" or "sequential")type of log feature representations used for the log-lines or - log-sequences - label_type: type of label (should be "anomaly" or "next_log") based on whether supervised or unsupervised - (forcast based) model is being used - eval_type: (should be "session" or None) whether to aggregate and report the evaluation metrics at the level of - sessions (based on the span_id in the log data) or at the level of each logline - topk: the prediction at top-k to consider, when deciding whether an evaluation instance is an anomaly or not - embedding_dim: dimension of the embedding space. Both for sequential and semantic type feature representation, - the input log feature representation is passed through an embedding layer which projects it to the embedding_dim - hidden_size: dimension of the hidden representations - freeze: whether to freeze the embedding layer to use the pretrained embeddings or to further train it on the - given task - gpu: device number if gpu is used (otherwise -1 or None will use cpu) - patience: number of eval_steps, the model waits for performance on validation data to improve, before early - stopping the training - num_train_epochs: number of training epochs - batch_size: batch size - learning_rate: learning rate - + """ + Config for neural representation learning for logs using forecasting based self-supervised tasks. + + :param model_name: name of the model. + :param metadata_filepath: path to file containing meta data (pretrained token embeddings in case if + semantic log representations are used in feature type). + :param output_dir: path to output directory where the model would be dumped. + :param feature_type: (should be "semantics" or "sequential") type of log feature representations used + for the log-lines or log-sequences. + :param label_type: type of label (should be "anomaly" or "next_log") based on whether supervised + or unsupervised (forecast based) model is being used. + :param eval_type: (should be "session" or None) whether to aggregate and report the evaluation + metrics at the level of sessions (based on the span_id in the log data) or at the level of each logline. + :param topk: the prediction at top-k to consider, when deciding whether an evaluation instance is an anomaly or not. + :param embedding_dim: dimension of the embedding space. Both for sequential and semantic type feature representation, + the input log feature representation is passed through an embedding layer which projects it to the embedding_dim.
+ :param hidden_size: dimension of the hidden representations. + :param freeze: whether to freeze the embedding layer to use the pretrained embeddings or to further train it on the given task. + :param gpu: device number if gpu is used (otherwise -1 or None will use cpu). + :param patience: number of eval_steps, the model waits for performance on validation data to improve, before early stopping the training. + :param num_train_epochs: number of training epochs. + :param batch_size: batch size. + :param learning_rate: learning rate. """ model_name: str = None @@ -72,9 +67,12 @@ class ForecastBasedNNParams(Config): class Embedder(nn.Module): - """Learnable embedder for embedding loglines + """Learnable embedder for embedding loglines. - Inherits: torch nn.Module + :param vocab_size: vocabulary size. + :param embedding_dim: embedding dimension. + :param pretrain_matrix: torch.Tensor object containing the pretrained embedding of the vocabulary tokens. + :param freeze: Freeze embeddings to pretrained ones if set to True, otherwise makes the embeddings learnable. """ def __init__( @@ -84,16 +82,6 @@ def __init__( pretrain_matrix: np.array = None, freeze: bool = False, ): - """initializing embedder class - - Args: - vocab_size (int): vocabulary size - embedding_dim (int): embedding dimension - pretrain_matrix (tensor, optional): torch.Tensor object containing the - pretrained embedding of the vocabulary tokens. Defaults to None. - freeze (bool, optional): Freeze embeddings to pretrained ones if set to - True, otherwise makes the embeddings learnable. Defaults to False. - """ super(Embedder, self).__init__() if pretrain_matrix is not None: self.embedding_layer = nn.Embedding.from_pretrained( @@ -109,18 +97,13 @@ def forward(self, x): class ForecastBasedNN(nn.Module): - """Model for learning log representations through a forecasting based self-supervised task + """ + Model for learning log representations through a forecasting based self-supervised task. - Inherits: nn.Module + :param config: ForecastBasedNNParams config class for parameters of forecasting based neural log representation models. """ def __init__(self, config: ForecastBasedNNParams): - """Initialization of base class for forecasting based neural log representation learning - - Args: - config (ForecastBasedNNParams): config class for parameters of forecasting based - neural log representation models - """ super(ForecastBasedNN, self).__init__() self.config = config @@ -153,15 +136,12 @@ def __init__(self, config: ForecastBasedNNParams): ) def predict(self, test_loader: DataLoader, dtype: str = "test"): - """predict method on test data - - Args: - test_loader (Dataloader): dataloader (torch.utils.data.DataLoader) for test (or development) dataset - dtype (str, optional): can be of type "test" or "dev" based on which the predict method is called for. - Defaults to "test". + """ + Predict method on test data. - Returns: - dict : dict object containing the overall evaluation metrics for test (or dev) data + :param test_loader: dataloader (torch.utils.data.DataLoader) for test (or development) dataset. + :param dtype: can be of type "test" or "dev" based on which the predict method is called for. + :return: dict object containing the overall evaluation metrics for test (or dev) data. 
""" logging.info("Evaluating {} data.".format(dtype)) @@ -400,7 +380,7 @@ def __input2device(self, batch_input: dict): return {k: v.to(self.device) for k, v in batch_input.items()} def save_model(self): - """saving model to file as specified in config""" + """Saving model to file as specified in config""" logging.info("Saving model to {}".format(self.model_save_file)) try: torch.save( @@ -412,24 +392,20 @@ def save_model(self): torch.save(self.state_dict(), self.model_save_file) def load_model(self, model_save_file: str = ""): - """loading model from file + """Loading model from file. - Args: - model_save_file (str, optional): _description_. Defaults to "". + :param model_save_file: path to file where model would be saved. """ logging.info("Loading model from {}".format(self.model_save_file)) self.load_state_dict(torch.load(model_save_file, map_location=self.device)) def fit(self, train_loader: DataLoader, dev_loader: DataLoader = None): - """Fit method for training model - - Args: - train_loader (DataLoader): dataloader (torch.utils.data.DataLoader) for the train dataset - dev_loader (DataLoader, optional): dataloader (torch.utils.data.DataLoader) for the train dataset. - Defaults to None, for which no evaluation is run + """ + Fit method for training model - Returns: - dict : dict containing the best loss on dev dataset + :param train_loader: dataloader (torch.utils.data.DataLoader) for the train dataset. + :param dev_loader: dataloader (torch.utils.data.DataLoader) for the train dataset. Defaults to None, for which no evaluation is run. + :return: dict containing the best loss on dev dataset. """ self.to(self.device) logging.info( diff --git a/logai/algorithms/nn_model/forecast_nn/cnn.py b/logai/algorithms/nn_model/forecast_nn/cnn.py index 318a2c6..87c666c 100755 --- a/logai/algorithms/nn_model/forecast_nn/cnn.py +++ b/logai/algorithms/nn_model/forecast_nn/cnn.py @@ -19,30 +19,23 @@ @dataclass class CNNParams(ForecastBasedNNParams): - """Config for CNN based log representation learning - - Inherits: - ForecastBasedNNParams: Config class for storing parameters of forecast based neural models - - kernel_sizes: list = [2, 3, 4] # kernel size + """Config for CNN based log representation learning. + + :param kernel_sizes: the kernel size (default value: list = [2, 3, 4]). """ kernel_sizes: list = [2, 3, 4] class CNN(ForecastBasedNN): - """CNN based model for learning log representation through a self-supervised forecasting task over log sequences + """ + CNN based model for learning log representation through a self-supervised forecasting task over log sequences. - Args: - ForecastBasedNN: base class for forecasting based neural log representation learning + :param config: parameters for CNN log representation learning model. """ def __init__(self, config: CNNParams): - """initialization for CNN based log representation learning - - Args: - config (CNNParams): parameters for CNN log representation learning model - """ + super().__init__(config) self.config = config self.config.model_name = "cnn" @@ -65,14 +58,10 @@ def __init__(self, config: CNNParams): ) def forward(self, input_dict): - """forward method for cnn model - - Args: - input_dict (dict): dict containing the session_idx, features, window_anomalies - and window_labels as in ForecastNNVectorizedDataset object + """Forward method for cnn model. 
- Returns: - dict: dict containing loss and prediction tensor + :param input_dict : dict containing the session_idx, features, window_anomalies and window_labels as in ForecastNNVectorizedDataset object. + :return: dict containing loss and prediction tensor. """ if self.label_type == "anomaly": y = input_dict[ForecastNNVectorizedDataset.window_anomalies].long().view(-1) diff --git a/logai/algorithms/nn_model/forecast_nn/lstm.py b/logai/algorithms/nn_model/forecast_nn/lstm.py index 883e490..c882eeb 100755 --- a/logai/algorithms/nn_model/forecast_nn/lstm.py +++ b/logai/algorithms/nn_model/forecast_nn/lstm.py @@ -18,16 +18,12 @@ @dataclass class LSTMParams(ForecastBasedNNParams): - """Config for lstm based log representation learning - - Inherits: - ForecastBasedNNParams: base class for parameters of forecasting based neural log representation moels - - num_directions: int = 2 # whether bidirectional or unidirectional (left to right) model - num_layers: int = 1 # number of hidden layers in the neural network - max_token_len: int = None # maximum token length of the input - use_attention: bool = False # whether to use attention or not + """Config for lstm based log representation learning. + :param num_directions: whether bidirectional or unidirectional (left to right) model. + :param num_layers: number of hidden layers in the neural network. + :param max_token_len: maximum token length of the input. + :param use_attention: whether to use attention or not. """ num_directions: int = 2 @@ -37,15 +33,14 @@ class LSTMParams(ForecastBasedNNParams): class Attention(nn.Module): - """Attention model for lstm based log representation learning""" + """Attention model for lstm based log representation learning. - def __init__(self, input_size, max_seq_len): - """intiializing attention module + :param input_size: input dimension. + :param max_seq_len: maximum sequence length. + """ - Args: - input_size (int): input dimension - max_seq_len (int): maximum sequence length - """ + def __init__(self, input_size, max_seq_len): + super(Attention, self).__init__() self.atten_w = nn.Parameter(torch.randn(max_seq_len, input_size, 1)) self.atten_bias = nn.Parameter(torch.randn(max_seq_len, 1, 1)) @@ -75,18 +70,12 @@ def zeros(self, tensor): class LSTM(ForecastBasedNN): - """LSTM based model for learning log representation through a self-supervised forecasting task over log sequences - - Inherits: - ForecastBasedNN: base class for forecast based neural log representation model + """LSTM based model for learning log representation through a self-supervised forecasting task over log sequences. + :param config: parameters for lstm based model. """ def __init__(self, config: LSTMParams): - """initializing lstm model for log representation learning - - Args: - config (LSTMParams): parameters for lstm based model - """ + super().__init__(config) self.config = config self.config.model_name = "lstm" @@ -118,14 +107,10 @@ def __init__(self, config: LSTMParams): ) def forward(self, input_dict): - """forward method for - - Args: - input_dict (dict): dict containing the session_idx, features, window_anomalies - and window_labels as in ForecastNNVectorizedDataset object + """Forward method for lstm model. - Returns: - dict: dict containing loss and prediction tensor + :param input_dict: dict containing the session_idx, features, window_anomalies and window_labels as in ForecastNNVectorizedDataset object. + :return: dict containing loss and prediction tensor. 
""" if self.label_type == "anomaly": y = input_dict[ForecastNNVectorizedDataset.window_anomalies].long().view(-1) diff --git a/logai/algorithms/nn_model/forecast_nn/transformer.py b/logai/algorithms/nn_model/forecast_nn/transformer.py index af68e12..13285f0 100755 --- a/logai/algorithms/nn_model/forecast_nn/transformer.py +++ b/logai/algorithms/nn_model/forecast_nn/transformer.py @@ -16,14 +16,10 @@ @dataclass class TransformerParams(ForecastBasedNNParams): - """Config for transformer based log representation learning - - Inherits: - ForecastBasedNNParams : base class for parameters of forecasting based neural log representation moels - - nhead: int = 4 # number of attention heads - num_layers: int = 1 # number of hidden layers in the neural network + """Config for transformer based log representation learning. + :param nhead: number of attention heads. + :param num_layers: number of hidden layers in the neural network. """ nhead: int = 4 @@ -32,10 +28,7 @@ class TransformerParams(ForecastBasedNNParams): class Transformer(ForecastBasedNN): """Transformer based model for learning log representation through a self-supervised forecasting task over - log sequences - - Inherits: - ForecastBasedNN : base class for forecast based neural log representation learning + log sequences. """ def __init__(self, config: TransformerParams): @@ -55,14 +48,10 @@ def __init__(self, config: TransformerParams): self.prediction_layer = nn.Linear(self.config.embedding_dim, num_labels) def forward(self, input_dict): - """forward method of transformer based model - - Args: - input_dict (dict): dict containing the session_idx, features, window_anomalies - and window_labels as in ForecastNNVectorizedDataset object + """Forward method of transformer based model. - Returns: - dict: dict containing loss and prediction tensor + :param input_dict: dict containing the session_idx, features, window_anomalies and window_labels as in ForecastNNVectorizedDataset object. + :return: dict containing loss and prediction tensor. """ if self.label_type == "anomaly": y = input_dict[ForecastNNVectorizedDataset.window_anomalies].long().view(-1) diff --git a/logai/algorithms/nn_model/forecast_nn/utils.py b/logai/algorithms/nn_model/forecast_nn/utils.py index 3db8491..30d4365 100755 --- a/logai/algorithms/nn_model/forecast_nn/utils.py +++ b/logai/algorithms/nn_model/forecast_nn/utils.py @@ -12,22 +12,18 @@ def tensor2flatten_arr(tensor): - """convert tensor to flat numpy array - - Args: - tensor (torch.Tensor): tensor object - - Returns: - numpy array: flat numpy array + """Convert tensor to flat numpy array. + + :param tensor: tensor object + :return: flat numpy array """ return tensor.data.cpu().numpy().reshape(-1) def seed_everything(seed=1234): - """fix the random seeds throughout the python environment + """Fix the random seeds throughout the python environment - Args: - seed (int, optional): Seed value. Defaults to 1234. + :param seed: Seed value. Defaults to 1234. """ random.seed(seed) os.environ["PYTHONHASHSEED"] = str(seed) @@ -36,16 +32,12 @@ def seed_everything(seed=1234): def set_device(gpu: int = None): - """set device (cpu or gpu). Use -1 to specify cpu. - If not manually set device would be automatically set to gpu - if gpu is available otherwise cpu would be used - - Args: - gpu (int, optional): device number of gpu (use -1 for cpu). - Defaults to None. - - Returns: - torch.device : torch device type object + """Set device (cpu or gpu). Use -1 to specify cpu. + If not manually set device would be automatically set to gpu. 
+ if gpu is available; otherwise cpu would be used. + + :param gpu: device number of gpu (use -1 for cpu). Defaults to None. + :return: torch device type object. """ if not gpu: if torch.cuda.is_available(): diff --git a/logai/algorithms/nn_model/logbert/configs.py b/logai/algorithms/nn_model/logbert/configs.py index 927ad12..b5f5736 100755 --- a/logai/algorithms/nn_model/logbert/configs.py +++ b/logai/algorithms/nn_model/logbert/configs.py @@ -11,36 +11,31 @@ @dataclass class LogBERTConfig(Config): - """Config for logBERT model - - Inherits: - Config: config interface - - pretrain_from_scratch: bool = True # whether to do pretraining from scratch or intialize with the - HuggingFace pretrained LM. - model_name: str = "bert-base-cased" # name of the model using HuggingFace standardized naming - model_dirname: str = None # name of the directory where the model would be saved. Directory of this name - would be created inside `output_dir`, if it does not exist - mlm_probability: float = 0.15 # probability of the tokens to be masked during MLM trainning - mask_ngram: int = 1 # length of ngrams that are masked during inference - max_token_len: int = 384 # maximum token length of the input - learning_rate: float = 1e-5 # learning rate - weight_decay: float = 0.0001 # parameter to use weight decay of the learning rate - per_device_train_batch_size: int = 50 # training batch size per gpu device - per_device_eval_batch_size: int = 256 # evaluation batch size per gpu device - eval_accumulation_steps: int = 1000 # parameter to accumulate the evaluation results over the steps - num_eval_shards: int = 10 # parameter to shard the evaluation data (to avoid any OOM issue) - evaluation_strategy: str = "steps" # either steps or epoch, based on whether the unit of the eval_steps - parameter is "steps" or "epoch" - num_train_epochs: int = 20 # number of training epochs - logging_steps: int = 10 # number of steps after which the output is logged - save_steps: int = 50 # number of steps after which the model is saved - eval_steps: int = 50 # number of steps after which evaluation is run - resume_from_checkpoint: bool = True # whether to resume from a given model checkpoint. If set to true, - it will find the latest checkpoint saved in the dir and use that to load the model - output_dir: str = None # output directory where the model would be saved. - tokenizer_dirpath: str = None # path to directory containing the tokenizer - + """Config for logBERT model. + + :param pretrain_from_scratch: bool = True : whether to do pretraining from scratch or initialize with the HuggingFace pretrained LM. + :param model_name: str = "bert-base-cased" : name of the model using HuggingFace standardized naming. + :param model_dirname: str = None : name of the directory where the model would be saved. Directory of this + name would be created inside `output_dir`, if it does not exist. + :param mlm_probability: float = 0.15 : probability of the tokens to be masked during MLM training. + :param mask_ngram: int = 1 : length of ngrams that are masked during inference. + :param max_token_len: int = 384 : maximum token length of the input. + :param learning_rate: float = 1e-5 : learning rate. + :param weight_decay: float = 0.0001 : parameter to use weight decay of the learning rate. + :param per_device_train_batch_size: int = 50 : training batch size per gpu device. + :param per_device_eval_batch_size: int = 256 : evaluation batch size per gpu device.
+ :param eval_accumulation_steps: int = 1000 : parameter to accumulate the evaluation results over the steps. + :param num_eval_shards: int = 10 : parameter to shard the evaluation data (to avoid any OOM issue). + :param evaluation_strategy: str = "steps" : either steps or epoch, based on whether the unit of the eval_steps + parameter is "steps" or "epoch". + :param num_train_epochs: int = 20 : number of training epochs. + :param logging_steps: int = 10 : number of steps after which the output is logged. + :param save_steps: int = 50 : number of steps after which the model is saved. + :param eval_steps: int = 50 : number of steps after which evaluation is run. + :param resume_from_checkpoint: bool = True : whether to resume from a given model checkpoint. + If set to true, it will find the latest checkpoint saved in the dir and use that to load the model. + :param output_dir: str = None : output directory where the model would be saved. + :param tokenizer_dirpath: str = None : path to directory containing the tokenizer. """ pretrain_from_scratch: bool = True diff --git a/logai/algorithms/nn_model/logbert/eval_metric_utils.py b/logai/algorithms/nn_model/logbert/eval_metric_utils.py index b7ba6af..e2db87f 100755 --- a/logai/algorithms/nn_model/logbert/eval_metric_utils.py +++ b/logai/algorithms/nn_model/logbert/eval_metric_utils.py @@ -14,15 +14,14 @@ def __plot_roc(x, y, label, y_name, x_name, fig_name): - """plotting roc curve - - Args: - x (np.array): array of x values - y (np.array): array of y values - label (np.array): array of label values - y_name (str): y axis label - x_name (str): x axis label - fig_name (str): figure name + """Plotting roc curve. + + :param x:(np.array): array of x values. + :param y:(np.array): array of y values. + :param label:(np.array): array of label values. + :param y_name:(str): y axis label. + :param x_name:(str): x axis label. + :param fig_name:(str): figure name. """ plt.plot(x, y, label=label) plt.ylabel(y_name) @@ -37,12 +36,11 @@ def __plot_roc(x, y, label, y_name, x_name, fig_name): def __plot_scores_kde(scores_pos, scores_neg, fig_name): - """plotting kernel density estimation of positive and negative scores + """Plotting kernel density estimation of positive and negative scores. - Args: - scores_pos (np.array): array of positive scores - scores_neg (np.array): array of negative scores - fig_name (str): figure name + :param scores_pos: (np.array): array of positive scores. + :param scores_neg:(np.array): array of negative scores. + :param fig_name: (str): figure name. 
""" plt.rcParams["figure.figsize"] = [7.00, 3.50] plt.rcParams["figure.autolayout"] = True @@ -68,31 +66,26 @@ def __plot_scores_kde(scores_pos, scores_neg, fig_name): def compute_metrics(eval_metrics_per_instance_series, test_labels, test_counts=None): - """computing evaluation metric scores for anomaly detection - - Args: - eval_metrics_per_instance_series (dict): dict object consisting of eval metrics for each instance index - test_labels (dict): gold labels for each instance index - test_counts (dict): counts of each instance index - - Raises: - Exception: IndexError if the indices of eval_metrics_per_instance_series do not match with - indices of test_labels - - Returns: - list of tuples: list of tuples containing labels and scores computed for each index - y: list of anomaly label for each instance - loss_mean: list of mean loss (over all masked non-padded tokens) for each instance - loss_max: list of max loss (over all masked non-padded tokens) for each instance - loss_top6_mean: list of mean loss (averaged over top-k masked non-padded tokens) for each instance, k = 6 - (following LanoBERT paper https://arxiv.org/pdf/2111.09564.pdf) - scores_top6_max_prob: for each instance, we take the max prob. score obtained and average over the top-k masked - (non-padded) token prediction, k = 6 - scores_top6_min_logprob: for each instance, we take the min logprob score obtained and average over the - top-k masked (non-padded) token prediction, k = 6 - scores_top6_max_entropy: for each instance we take the max entropy score obtained and average over the top-k - masked (non-padded) token prediction, k = 6 - + """Computing evaluation metric scores for anomaly detection. + + :param eval_metrics_per_instance_series:(dict): dict object consisting + of eval metrics for each instance index. + :param test_labels:(dict): gold labels for each instance index. + :param test_counts:(dict): counts of each instance index. + :raises: Exception: IndexError if the indices of eval_metrics_per_instance_series + do not match with indices of test_labels. + :return: list of tuples containing labels and scores computed for each index. + - y: list of anomaly label for each instance. + - loss_mean: list of mean loss (over all masked non-padded tokens) for each instance. + - loss_max: list of max loss (over all masked non-padded tokens) for each instance. + - loss_top6_mean: list of mean loss (averaged over top-k masked non-padded tokens) for each + instance, k = 6(following LanoBERT paper https://arxiv.org/pdf/2111.09564.pdf). + - scores_top6_max_prob: for each instance, we take the max prob. score obtained and average + over the top-k masked (non-padded) token prediction, k = 6. + - scores_top6_min_logprob: for each instance, we take the min logprob score obtained and average + over the top-k masked (non-padded) token prediction, k = 6. + - scores_top6_max_entropy: for each instance we take the max entropy score obtained and average + over the top-k masked (non-padded) token prediction, k = 6. 
""" eval_metrics_per_instance_series["indices"] = eval_metrics_per_instance_series[ "indices" @@ -189,22 +182,21 @@ def __compute_auc_roc( plot_graph=False, plot_histogram=False, ): - """computing AUROC for each of the type of metrics - - Args: - y (list): list of anomaly labels for each instances - loss_mean (list): list of mean loss (over all masked non-padded tokens) for each instance - loss_max (list): list of max loss (over all masked non-padded tokens) for each instance - loss_top6_mean (list): list of mean loss (averaged over top-k masked non-padded tokens) for each instance, - k = 6 (following LanoBERT paper https://arxiv.org/pdf/2111.09564.pdf) - scores_top6_max_prob (list): for each instance, we take the max prob. score obtained and average over the - top-k masked (non-padded) token prediction, k = 6 - scores_top6_min_logprob (list): for each instance, we take the min logprob score obtained and average over the - top-k masked (non-padded) token prediction, k = 6 - scores_top6_max_entropy (list): for each instance we take the max entropy score obtained and average over the - top-k masked (non-padded) token prediction, k = 6 - plot_graph (bool, optional): whether to plot roc graph. Defaults to False. - plot_histogram (bool, optional): whether to plot scores histogram. Defaults to False. + """Computing AUROC for each of the type of metrics + + :param y: (list): list of anomaly labels for each instances. + :param loss_mean: (list): list of mean loss (over all masked non-padded tokens) for each instance. + :param loss_max: (list): list of max loss (over all masked non-padded tokens) for each instance. + :param loss_top6_mean: (list): list of mean loss (averaged over top-k masked non-padded tokens) + for each instance, k = 6 (following LanoBERT paper https://arxiv.org/pdf/2111.09564.pdf). + :param scores_top6_max_prob: (list): for each instance, we take the max prob. score obtained + and average over the top-k masked (non-padded) token prediction, k = 6. + :param scores_top6_min_logprob: (list): for each instance, we take the min logprob score obtained + and average over the top-k masked (non-padded) token prediction, k = 6. + :param scores_top6_max_entropy: (list): for each instance we take the max entropy score obtained + and average over the top-k masked (non-padded) token prediction, k = 6. + :param plot_graph: (bool, optional): whether to plot roc graph. Defaults to False. + :param plot_histogram: (bool, optional): whether to plot scores histogram. Defaults to False. """ __compute_auc_roc_for_metric( @@ -254,14 +246,13 @@ def __compute_auc_roc( def __compute_auc_roc_for_metric( y, metric, metric_name_str, plot_graph=False, plot_histogram=False ): - """computing AUROC for each metric + """Computing AUROC for each metric. - Args: - y (list): list of anomaly labels for each instance - metric (list): list of metric scores for each instance - metric_name_str (str): name of metric - plot_graph (bool, optional): Whether to plot ROC graph. Defaults to False. - plot_histogram (bool, optional): Whether to plot histogram of metric scores. Defaults to False. + :param y: (list): list of anomaly labels for each instance. + :param metric: (list): list of metric scores for each instance. + :param metric_name_str: (str): name of metric. + :param plot_graph: (bool, optional): Whether to plot ROC graph. Defaults to False. + :param plot_histogram: (bool, optional): Whether to plot histogram of metric scores. Defaults to False. 
""" metric_pos = np.array([metric[i] for i in range(len(metric)) if y[i] == 1]) diff --git a/logai/algorithms/nn_model/logbert/predict.py b/logai/algorithms/nn_model/logbert/predict.py index c0e5d00..c775076 100755 --- a/logai/algorithms/nn_model/logbert/predict.py +++ b/logai/algorithms/nn_model/logbert/predict.py @@ -25,14 +25,12 @@ class LogBERTPredict: - """Class for running inference on logBERT model for unsupervised log anomaly detection""" + """Class for running inference on logBERT model for unsupervised log anomaly detection. - def __init__(self, config: LogBERTConfig): - """Initializing logBERTPredict class + :param config: config object describing the parameters of logbert model. + """ - Args: - config (LogBERTConfig): config object describing the parameters of logbert model - """ + def __init__(self, config: LogBERTConfig): self.config = config @@ -137,13 +135,10 @@ def load_model(self): ) def predict(self, test_dataset: HFDataset): - """Method for running inference on logbert to predict anomalous loglines in test dataset - - Args: - test_dataset (HFDataset): test dataset of type huggingface Dataset object + """Method for running inference on logbert to predict anomalous loglines in test dataset. - Returns: - dict: dict containing instance-wise loss and scores + :param test_dataset: test dataset of type huggingface Dataset object. + :return: dict containing instance-wise loss and scores. """ if not self.model: self.load_model() diff --git a/logai/algorithms/nn_model/logbert/predict_utils.py b/logai/algorithms/nn_model/logbert/predict_utils.py index 8f67b19..1526066 100755 --- a/logai/algorithms/nn_model/logbert/predict_utils.py +++ b/logai/algorithms/nn_model/logbert/predict_utils.py @@ -12,20 +12,15 @@ class Predictor(Trainer): - """Custom Trainer object for running the inference of logBERT model for unsupervised anomaly detection - - Inherits: - Trainer: HuggingFace trainer class + """Custom Trainer object for running the inference of logBERT model for unsupervised anomaly detection. """ def get_test_dataloader(self, test_dataset: Dataset) -> DataLoader: """ - Returns the test [`~torch.utils.data.DataLoader`]. - Subclass and override this method if you want to inject some custom behavior. - Args: - test_dataset (`torch.utils.data.Dataset`, *optional*): - The test dataset to use. If it is an `datasets.Dataset`, columns not accepted by the `model.forward()` - method are automatically removed. It must implement `__len__`. + Returns the test [`~torch.utils.data.DataLoader`]. Subclass and override this method if you want to inject some custom behavior. + + :param test_dataset: (`torch.utils.data.Dataset`, *optional*): The test dataset to use. If it is an `datasets.Dataset`, + columns not accepted by the `model.forward()` method are automatically removed. It must implement `__len__`. """ if isinstance(test_dataset, torch.utils.data.IterableDataset): @@ -62,11 +57,6 @@ def compute_loss(self, model, inputs, return_outputs=False): Subclass and override for custom behavior. """ - """print ('Input: ', tokenizer.decode(inputs['input_ids'].cpu().data.numpy().tolist()[0])) - i = inputs['labels'].cpu().data.numpy().tolist()[0] - i = list(filter((-100).__ne__, i)) - print ('Output: ', tokenizer.decode(i))""" - if self.label_smoother is not None and "labels" in inputs: labels = inputs.pop("labels") else: @@ -95,11 +85,8 @@ class PredictionLabelSmoother(LabelSmoother): """ Adds label-smoothing on a pre-computed output from a Transformers model. 
- Args: - epsilon (:obj:`float`, `optional`, defaults to 0.1): - The label smoothing factor. - ignore_index (:obj:`int`, `optional`, defaults to -100): - The index in the labels to ignore when computing the loss. + :param epsilon: (:obj:`float`, `optional`, defaults to 0.1): The label smoothing factor. + :param ignore_index: (:obj:`int`, `optional`, defaults to -100): The index in the labels to ignore when computing the loss. """ epsilon: float = 0.0 diff --git a/logai/algorithms/nn_model/logbert/tokenizer_utils.py b/logai/algorithms/nn_model/logbert/tokenizer_utils.py index 5423f53..8f1b593 100755 --- a/logai/algorithms/nn_model/logbert/tokenizer_utils.py +++ b/logai/algorithms/nn_model/logbert/tokenizer_utils.py @@ -10,13 +10,10 @@ def get_tokenizer(tokenizer_dirpath): - """get huggingface tokenizer object from a given directory path + """Get huggingface tokenizer object from a given directory path. - Args: - tokenizer_dirpath (str): absolute path to directory containing pretrained tokenizer - - Returns: - AutoTokenizer: tokenizer object + :param tokenizer_dirpath: (str): absolute path to directory containing pretrained tokenizer. + :return: AutoTokenizer: tokenizer object. """ return AutoTokenizer.from_pretrained(tokenizer_dirpath, use_fast=True) @@ -24,8 +21,7 @@ def get_tokenizer(tokenizer_dirpath): def get_special_tokens(): """gets special tokens - Returns: - list: list of special tokens + :return: list of special tokens """ return [ "[UNK]", @@ -44,36 +40,27 @@ def get_special_tokens(): def get_special_token_ids(tokenizer): - """get ids of special tokens, given a tokenizer object - - Args: - tokenizer (AutoTokenizer): tokenizer object + """Get ids of special tokens, given a tokenizer object. - Returns: - list: list of token ids of special tokens + :param tokenizer: (AutoTokenizer): tokenizer object. + :return: list of token ids of special tokens. """ return [tokenizer.convert_tokens_to_ids(x) for x in get_special_tokens()] def get_tokenizer_vocab(tokenizer_dirpath): - """get vocabulary from a given tokenizer directory path - - Args: - tokenizer_dirpath (str): absolute path to directory containing pretrained tokenizer + """Get vocabulary from a given tokenizer directory path. - Returns: - list: list of vocabulary words + :param tokenizer_dirpath: (str): absolute path to directory containing pretrained tokenizer. + :return: list of vocabulary words. """ return open(os.path.join(tokenizer_dirpath, "vocab.txt")).readlines() def get_mask_id(tokenizer): - """get id of mask token, given a tokenizer object - - Args: - tokenizer (AutoTokenizer): tokenizer object - - Returns: - int: id of mask token + """Get id of mask token, given a tokenizer object. + + :param tokenizer: (AutoTokenizer): tokenizer object. + :return: id of mask token. """ return tokenizer.convert_tokens_to_ids("[MASK]") diff --git a/logai/algorithms/nn_model/logbert/train.py b/logai/algorithms/nn_model/logbert/train.py index e6c9d9e..597010e 100755 --- a/logai/algorithms/nn_model/logbert/train.py +++ b/logai/algorithms/nn_model/logbert/train.py @@ -70,10 +70,9 @@ def _initialize_trainer(self, model, train_dataset, dev_dataset): ) def get_model_checkpoint(self): - """Get the latest dumped checkpoint from the model directory path mentioned in logBERTConfig + """Get the latest dumped checkpoint from the model directory path mentioned in logBERTConfig. 
- Returns: - str: path to model checkpoint (or name of model in case of a pretrained model from hugging face) + :return: path to model checkpoint (or name of model in case of a pretrained model from hugging face). """ if os.path.exists(self.model_dirpath) and os.listdir(self.model_dirpath): checkpoint_dir = "checkpoint-" + str( @@ -93,11 +92,10 @@ def get_model_checkpoint(self): return model_checkpoint def fit(self, train_dataset: HFDataset, dev_dataset: HFDataset): - """fit method for training logbert model + """Fit method for training logbert model. - Args: - train_dataset (HFDataset): training dataset of type huggingface Dataset object - dev_dataset (HFDataset): development dataset of type huggingface Dataset object + :param train_dataset: training dataset of type huggingface Dataset object. + :param dev_dataset: development dataset of type huggingface Dataset object. """ model_checkpoint = self.get_model_checkpoint() @@ -114,6 +112,6 @@ def fit(self, train_dataset: HFDataset, dev_dataset: HFDataset): self.trainer.train() def evaluate(self): - """evaluate methof for evaluating logbert model on dev data using perplexity metric""" + """Evaluates the logbert model on dev data using the perplexity metric.""" eval_results = self.trainer.evaluate() logging.info("Perplexity: {}".format(math.exp(eval_results["eval_loss"]))) diff --git a/logai/algorithms/nn_model/transformers.py b/logai/algorithms/nn_model/transformers.py index 2b1cd33..eba2da9 100755 --- a/logai/algorithms/nn_model/transformers.py +++ b/logai/algorithms/nn_model/transformers.py @@ -27,14 +27,14 @@ @dataclass class TransformerAlgoConfig(Config): - """Config class for Transformer based model for log classification tasks + """Config class for Transformer based model for log classification tasks. """ tokenizer_config: dict = {"name": "auto", "model": "bert-base-cased"} trainer_config: dict = {} class LogDataset(torch.utils.data.Dataset): - """Wrapper class for Log Dataset, to wrap over torch Dataset class + """Wrapper class for Log Dataset, to wrap over torch Dataset class. """ def __init__(self, encodings, labels): self.encodings = encodings @@ -55,7 +55,7 @@ class TransformerAlgo: For e.g. log anomaly detection is one type of log classfication task where the labels are Normal (Label 0) or Anomalous (Label 1). Currently it supports only binary classification, to change this `num_labels` of AutoModelForSequenceClassification - has to be changed accordingly along with the prediction logic in predict method + has to be changed accordingly along with the prediction logic in predict method. """ def __init__(self, config: TransformerAlgoConfig): self.config = config @@ -69,22 +69,20 @@ def __init__(self, config: TransformerAlgoConfig): return def save(self, output_dir: str): - """save model in given directory + """Save model in given directory. - Args: - output_dir (str): path to output directory where model should be dumped + :param output_dir: The path to output directory where model should be dumped. """ self.trainer.save_model(output_dir) return def train(self, train_logs: pd.Series, train_labels: pd.Series): - """train method for Transformer based pretrained language model with + """Train method for Transformer based pretrained language model with a sequence classification head for supervised log classification task. - Internally this method also splits the available training logs into train and dev data + Internally this method also splits the available training logs into train and dev data.
- Args: - train_logs (pd.Series): training log vectors data (after LogVectorizer) - train_labels (pd.Series): training label data + :param train_logs: The training log vectors data (after LogVectorizer). + :param train_labels: The training label data. """ train_logs = train_logs.rename(constants.LOG_EVENTS) if not self.tokenizer: @@ -135,11 +133,10 @@ def train(self, train_logs: pd.Series, train_labels: pd.Series): def train_with_native_torch(self, train_logs: pd.Series, train_labels: pd.Series): """ - Train models in native torch way. Use as needed + Train models in native torch way. - Args: - train_logs (pd.Series): training log features data (after LogVectorizer) - train_labels (pd.Series): label data for training logs + :param train_logs: The training log features data (after LogVectorizer). + :param train_labels: The label data for training logs. """ if not self.tokenizer: if self.config.tokenizer_config["name"] == "auto": @@ -164,16 +161,13 @@ def train_with_native_torch(self, train_logs: pd.Series, train_labels: pd.Series return def predict(self, test_logs: pd.Series, test_labels: pd.Series) -> Tuple[pd.Series, np.ndarray, Dict[str, float]]: - """Predict method for running evaluation on test log data + """Predict method for running evaluation on test log data. - Args: - test_logs (pd.Series): test log features data (output of LogVectorizer) - test_labels (pd.Series): labels of test log data - - Returns: - res (pd.Series): Predicted test labels as pandas Series object - label_ids (`np.ndarray`, *optional*): True test labels (if the dataset contained some). - metrics (`Dict[str, float]`, *optional*): The potential dictionary of metrics + :param test_logs: The test log features data (output of LogVectorizer). + :param test_labels: The labels of test log data. + :return: - res (pd.Series): Predicted test labels as pandas Series object. + - label_ids (`np.ndarray`, *optional*): True test labels (if the dataset contained some). + - metrics (`Dict[str, float]`, *optional*): The potential dictionary of metrics. """ test_logs = test_logs.rename(constants.LOG_EVENTS) diff --git a/logai/algorithms/parsing_algo/drain.py b/logai/algorithms/parsing_algo/drain.py index 0fbb611..eefe79d 100755 --- a/logai/algorithms/parsing_algo/drain.py +++ b/logai/algorithms/parsing_algo/drain.py @@ -121,24 +121,6 @@ def __init__(self): @factory.register("parsing", "drain", DrainParams) class Drain(ParsingAlgo): def __init__(self, params: DrainParams, profiler=NullProfiler()): - - """ - Attributes - ---------- - depth : max depth levels of log clusters. Minimum is 2. - For example, for depth==4: - Root is considered depth level 1. - Token count is considered depth level 2. - First log token is considered depth level 3. - Log clusters below first token node are considered depth level 4. - sim_th : similarity threshold - if percentage of similar tokens for a log message is below this - number, a new log cluster will be created. - max_children : max number of children of an internal node - max_clusters : max number of tracked clusters (unlimited by default). - When this number is reached, model starts replacing old clusters - with a new ones according to the LRU policy. - extra_delimiters: delimiters to apply when splitting log message into words (in addition to whitespace). 
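Since this patch drops the long parameter description from `Drain.__init__`, a hedged usage sketch may be useful here; it assumes `DrainParams` exposes the fields named above as dataclass attributes and that `Drain` follows the same fit/parse interface as the other parsing algorithms in this package:

.. code-block:: python

    import pandas as pd
    from logai.algorithms.parsing_algo.drain import Drain, DrainParams

    # Assumed: DrainParams accepts the documented fields as keyword arguments.
    params = DrainParams(
        depth=4,           # max depth of the parse tree (the code requires at least 3)
        sim_th=0.4,        # similarity threshold for matching an existing log cluster
        max_children=100,  # max number of children of an internal node
    )

    parser = Drain(params)
    loglines = pd.Series([
        "Connection from 10.0.0.1 closed",
        "Connection from 10.0.0.2 closed",
    ])
    parser.fit(loglines)
    print(parser.parse(loglines))  # one parsed template per input logline (assumed API)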
- """ if params.depth < 3: raise ValueError("depth argument must be at least 3") diff --git a/logai/algorithms/parsing_algo/iplom.py b/logai/algorithms/parsing_algo/iplom.py index 1879a97..5082e52 100755 --- a/logai/algorithms/parsing_algo/iplom.py +++ b/logai/algorithms/parsing_algo/iplom.py @@ -94,7 +94,7 @@ def fit(self, loglines: pd.Series): def parse(self, loglines: pd.Series) -> pd.Series: """Parsing method to parse the raw log data. - :param loglines: The raw log data + :param loglines: The raw log data. :returns: The parsed log data. """ self._Step1(loglines) diff --git a/logai/algorithms/vectorization_algo/fasttext.py b/logai/algorithms/vectorization_algo/fasttext.py index 4fac662..a80b773 100755 --- a/logai/algorithms/vectorization_algo/fasttext.py +++ b/logai/algorithms/vectorization_algo/fasttext.py @@ -22,7 +22,7 @@ class FastTextParams(Config): Configuration for FastText vectorizer. For more details on the parameters see https://radimrehurek.com/gensim/models/fasttext.html. - :param vector_size: The size of vector + :param vector_size: The size of vector. :param window: The maximum distance between the current and predicted word within a sentence. :param min_count: Ignores all words with total frequency lower than this. :param sample: The threshold for configuring which higher-frequency words are randomly downsampled. diff --git a/logai/algorithms/vectorization_algo/forecast_nn.py b/logai/algorithms/vectorization_algo/forecast_nn.py index ece5dea..4906c5a 100644 --- a/logai/algorithms/vectorization_algo/forecast_nn.py +++ b/logai/algorithms/vectorization_algo/forecast_nn.py @@ -22,7 +22,12 @@ class ForecastNNVectorizedDataset: - """Class for storing vectorized dataset for forecasting based neural models.""" + """Class for storing vectorized dataset for forecasting based neural models. + :param logline_features: (np.array): list of vectorized log-sequences + :param labels: (list or pd.Series or np.array): list of labels (anomalous or non-anomalous) for each log sequence. + :param nextlogline_ids: (list or pd.Series or np.array): list of ids of next loglines, for each log sequence + :param span_ids: (list or pd.Series or np.array): list of ids of log sequences. + """ session_idx: str = "session_idx" features: str = "features" @@ -30,14 +35,7 @@ class ForecastNNVectorizedDataset: window_labels: str = "window_labels" def __init__(self, logline_features, labels, nextlogline_ids, span_ids): - """ - :param logline_features (np.array): The list of vectorized log-sequences. - :param labels (list or pd.Series or np.array): The list of labels (anomalous or non-anomalous) - for each log sequence. - :param nextlogline_ids (list or pd.Series or np.array): The list of ids of next loglines - for each log sequence. - :param span_ids (list or pd.Series or np.array): The list of ids of log sequences. - """ + self.dataset = [] for data_i, label_i, next_i, index_i in zip( logline_features, labels, nextlogline_ids, span_ids @@ -59,7 +57,7 @@ class ForecastNNVectorizerParams(Config): :param feature_type: The type of log feature representation where the supported types "semantics" and "sequential". :param label_type: The type of label, anomaly or next_log, which corresponds to the supervised and the forecasting based unsupervised setting. - :param sep_token: The separator token used when constructing the log sequences during log grouping/partitioning. + :param sep_token: The separator token used when constructing the log sequences during log grouping/partitioning. 
(default = "[SEP]") :param max_token_len: The maximum token length of the input. :param min_token_count: The minimum number of occurrences of a token in the training data, for it to be considered in the vocab. @@ -88,13 +86,12 @@ class ForecastNNVectorizerParams(Config): @factory.register("vectorization", "forecast_nn", ForecastNNVectorizerParams) class ForecastNN(VectorizationAlgo): - """Vectorizer Class for forecast based neural models for log representation learning.""" + """Vectorizer Class for forecast based neural models for log representation learning. + + :param config: config object specifying parameters of forecast based neural log repersentation learning model. + """ def __init__(self, config: ForecastNNVectorizerParams): - """ - :param config: The config object specifying parameters of forecast based neural - log repersentation learning model. - """ self.meta_data = {} self.config = config self.config.vectorizer_model_dirpath = os.path.join( @@ -145,7 +142,7 @@ def _process_logsequence(self, data_sequence): def fit(self, logrecord: LogRecordObject): """Fit method to train vectorizer. - :param logrecord: The logrecord object to train the vectorizer on. + :param logrecord: A log record object to train the vectorizer on. """ if self.sequential_vectorizer.vocab is None or ( self.config.feature_type == "semantics" @@ -171,8 +168,8 @@ def fit(self, logrecord: LogRecordObject): def transform(self, logrecord: LogRecordObject): """Transform method to run vectorizer on logrecord object. - :param logrecord: The logrecord object to be vectorized - :return: A ForecastNNVectorizedDataset object containing the vectorized dataset. + :param logrecord: A log record object to be vectorized. + :return: ForecastNNVectorizedDataset object containing the vectorized dataset. """ if self.config.feature_type == "sequential": logline_features = self.sequential_vectorizer.transform( diff --git a/logai/algorithms/vectorization_algo/logbert.py b/logai/algorithms/vectorization_algo/logbert.py index 026bf7c..555d531 100755 --- a/logai/algorithms/vectorization_algo/logbert.py +++ b/logai/algorithms/vectorization_algo/logbert.py @@ -29,18 +29,19 @@ @dataclass class LogBERTVectorizerParams(Config): - """Config class for logBERT Vectorizer. - - :param model_name: The name of the model , using HuggingFace standardized naming. - :param use_fast: Whether to use fast tokenization or not. - :param truncation: Whether to truncate the input to max_token_len. - :param max_token_len: The maximum token length of input. - :param max_vocab_size: The maximum size of the vocabulary. - :param custom_tokens: The list of custom tokens. - :param train_batch_size: The batch size during training the vectorizer. - :param output_dir: The path to directory where the output would be saved. - :param tokenizer_dirpath: The path to the tokenizer where the vectorizer (logbert tokenizer) would be saved. - :param num_proc: The number of processes to be used when tokenizing. + """Config class for logBERT Vectorizer + + :param model_name: name of the model , using HuggingFace standardized naming. + :param use_fast: whether to use fast tokenization or not. + :param truncation: whether to truncate the input to max_token_len. + :param max_token_len: maximum token length of input, if truncation is set to true. + :param max_vocab_size: maximum size of the vocabulary. + :param custom_tokens: list of custom tokens. + :param train_batch_size: batch size during training the vectorizer. 
+ :param output_dir: path to directory where the output would be saved. + :param tokenizer_dirpath: path to the tokenizer where the vectorizer (logbert tokenizer) would be saved. + :param num_proc: number of processes to be used when tokenizing. + """ model_name: str = "" @@ -58,14 +59,13 @@ class LogBERTVectorizerParams(Config): @factory.register("vectorization", "logbert", LogBERTVectorizerParams) class LogBERT(VectorizationAlgo): """Vectorizer class for logbert. + + :param config: A config object for specifying + parameters of log bert vectorizer. """ def __init__(self, config: LogBERTVectorizerParams): - """ - :param config: The config object for specifying - parameters of log bert vectorizer. - """ - + self.config = config self.special_tokens = ["[UNK]", "[PAD]", "[CLS]", "[SEP]", "[MASK]"] @@ -104,7 +104,7 @@ def __init__(self, config: LogBERTVectorizerParams): def fit(self, logrecord: LogRecordObject): """Fit method for training vectorizer for logbert. - :param logrecord: The logrecord object containing the training + :param logrecord: A log record object containing the training dataset over which vectorizer is trained. """ @@ -166,9 +166,9 @@ def _clean_dataset(self, logrecord: LogRecordObject): def transform(self, logrecord: LogRecordObject): """Transform method for running vectorizer over logrecord object. - :param logrecord: The logrecord object containing the dataset + :param logrecord: A log record object containing the dataset to be vectorized. - :return: A HuggingFace dataset object. + :return: HuggingFace dataset object. """ cleaned_logrecord = self._clean_dataset(logrecord) dataset = self._get_hf_dataset(cleaned_logrecord) diff --git a/logai/algorithms/vectorization_algo/semantic.py b/logai/algorithms/vectorization_algo/semantic.py index 109e008..fdf4539 100644 --- a/logai/algorithms/vectorization_algo/semantic.py +++ b/logai/algorithms/vectorization_algo/semantic.py @@ -25,17 +25,15 @@ @dataclass class SemanticVectorizerParams(Config): """ - Configuration of Semantic vectorization of loglines (or sequence of log lines) using models like word2vc, - glove and fastText. - - :param max_token_len: The maximum token length of the input. - :param min_token_count: The minimum count of occurrences of a token in training data for it to be - considered in the vocab. - :param sep_token: The separator token used to separate log lines in input log sequence. - :param embedding_dim: The embedding dimension of the learnt token embeddings. - :param window: The window size parameter for word2vec and fastText models. - :param embedding_type: The type of embedding, currently supports glove, word2vec and fastText. - :param model_save_dir: The path to directory where vectorizer models would be saved. + Configuration of Semantic vectorization of loglines (or sequence of log lines) using models like word2vc, glove and fastText. + + :param max_token_len: maximum token length of the input. + :param min_token_count: minimum count of occurrences of a token in training data for it to be considered in the vocab. + :param sep_token: separator token used to separate log lines in input log sequence. Default is "[SEP]". + :param embedding_dim: embedding dimension of the learnt token embeddings. + :param window: window size parameter for word2vec and fastText models. + :param embedding_type: type of embedding, currently supports glove, word2vec and fastText. Default is "fasttext". + :param model_save_dir: path to directory where vectorizer models would be saved. 
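To make the semantic vectorizer parameters above concrete, here is a hedged sketch; it assumes `SemanticVectorizerParams` accepts these fields as keyword arguments and that `Semantic` provides the usual fit/transform pair over a pandas Series of (parsed) loglines:

.. code-block:: python

    import pandas as pd
    from logai.algorithms.vectorization_algo.semantic import Semantic, SemanticVectorizerParams

    params = SemanticVectorizerParams(
        max_token_len=10,
        embedding_dim=100,
        embedding_type="fasttext",          # default per the docstring above
        model_save_dir="./semantic_model",  # illustrative path
    )

    vectorizer = Semantic(params)
    loglines = pd.Series([
        "Connection from * closed",
        "Failed to allocate * bytes",
    ])
    vectorizer.fit(loglines)               # trains the embedding model and vocab (assumed API)
    print(vectorizer.transform(loglines))  # pd.Series of vectorized loglines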
""" max_token_len: int = 10 @@ -53,12 +51,12 @@ class Semantic(VectorizationAlgo): Semantic vectorizer to convert loglines into token ids based on a embedding model and vocabulary (like word2vec, glove and fastText). It supports either pretrained models and pretrained vocabulary or training word embedding models like Word2Vec or FastText on the given training data. + + :param params: A config object for semantic vectorizer. """ def __init__(self, params: SemanticVectorizerParams): - """ - :param params: The config object for semantic vectorizer. - """ + self.params = params self.model = None self.vocab = None @@ -203,8 +201,7 @@ def transform(self, loglines: pd.Series) -> pd.Series: """Transform method to run semantic vectorizer on loglines. :param loglines: The pandas Series containing the data to be vectorized. - Each data instance should be a logline or sequence of loglines concatenated by separator - token. + Each data instance should be a logline or sequence of loglines concatenated by separator token. :return: The vectorized log data. """ log_vectors = [] diff --git a/logai/algorithms/vectorization_algo/sequential.py b/logai/algorithms/vectorization_algo/sequential.py index 068a597..3fe69bf 100755 --- a/logai/algorithms/vectorization_algo/sequential.py +++ b/logai/algorithms/vectorization_algo/sequential.py @@ -32,12 +32,13 @@ class SequentialVectorizerParams(Config): @factory.register("vectorization", "sequential", SequentialVectorizerParams) class Sequential(VectorizationAlgo): - """Sequential Vectorizer to convert a sequence of loglines to sequence of log ids.""" + """Sequential Vectorizer to convert a sequence of loglines to sequence of log ids. + + :param params: A config object for storing parameters of Sequential Vectorizer. + """ def __init__(self, params: SequentialVectorizerParams): - """ - :param params: The config object for storing parameters of Sequential Vectorizer. - """ + self.params = params self.log_padding = "" self.log_oov = "" @@ -55,7 +56,7 @@ def _clean_data(self, data): def fit(self, loglines: pd.Series): """Fit method for training the sequential vectorizer. - :param loglines (pd.Series): A pandas Series object containing the dataset on + :param loglines: A pandas Series object containing the dataset on which semantic vectorizer is trained (and the vocab is built). Each data instance should be a logline or sequence of loglines concatenated by separator token. """ @@ -82,7 +83,7 @@ def fit(self, loglines: pd.Series): def transform(self, loglines: pd.Series) -> pd.Series: """Transform method for applying sequential vectorizer to loglines. - :param loglines (pd.Series): A pandas Series containing the data to be vectorized. + :param loglines: A pandas Series containing the data to be vectorized. Each data instance should be a logline or sequence of loglines concatenated by separator token. :return: The vectorized loglines. """ diff --git a/logai/algorithms/vectorization_algo/tfidf.py b/logai/algorithms/vectorization_algo/tfidf.py index d5161e8..e76252b 100755 --- a/logai/algorithms/vectorization_algo/tfidf.py +++ b/logai/algorithms/vectorization_algo/tfidf.py @@ -148,7 +148,5 @@ def transform(self, loglines: pd.Series) -> pd.Series: def summary(self): """ Generates model summary. - - :return: TfidfVectorizer.summary. 
""" return self.model.summary() diff --git a/logai/algorithms/vectorization_algo/word2vec.py b/logai/algorithms/vectorization_algo/word2vec.py index 146b77d..94c1699 100755 --- a/logai/algorithms/vectorization_algo/word2vec.py +++ b/logai/algorithms/vectorization_algo/word2vec.py @@ -38,21 +38,20 @@ class Word2VecParams(Config): class Word2Vec(VectorizationAlgo): """ Word2Vec algorithm for converting raw log data into word2vec vectors. This is a wrapper class for the Word2Vec - model from gensim library https://radimrehurek.com/gensim/models/word2vec.html. + model from gensim library https://radimrehurek.com/gensim/models/word2vec.html + + :param max_token_len: The max token length to vectorize, longer sentences will be chopped. """ def __init__(self, params: Word2VecParams): - """ - :param max_token_len: The max token length to vectorize, longer sentences will be chopped. - """ self.params = params self.model = None def fit(self, loglines: pd.Series): """ - Fits Word2Vec model. + Fits a Word2Vec model. - :param loglines: The parsed loglines. + :param loglines: Parsed loglines. """ max_token_len = self.params.max_token_len @@ -98,6 +97,6 @@ def transform(self, loglines: pd.Series) -> pd.Series: def summary(self): """ - Generate model summary. + Generates model summary. """ return self.model.summary() diff --git a/logai/applications/application_interfaces.py b/logai/applications/application_interfaces.py index 079cc3e..606d12f 100755 --- a/logai/applications/application_interfaces.py +++ b/logai/applications/application_interfaces.py @@ -24,6 +24,21 @@ @dataclass class WorkFlowConfig(Config): + """config class for end to end workflow. + + :param data_loader_config: A config object for data loader. + :param open_set_data_loader_config: A config object for data loader for opensource public log datasets. + :param preprocessor_config: A config object for log preprocessor. + :param log_parser_config: A config object for log parser. + :param log_vectorizer_config: A config object for log vectorizer. + :param partitioner_config: A config object for log partitioner. + :param open_set_partitioner_config: A config object for log partitioner for opensource public log datasets. + :param categorical_encoder_config: A config object for categorical encoder of log data. + :param feature_extractor_config: A config object for log feature extractor. + :param anomaly_detection_config: A config object for log anomaly detector. + :param nn_anomaly_detection_config: A config object for neural anomaly detector. + :param clustering_config: A config object for log clustering algorithm. + """ data_loader_config: object = None open_set_data_loader_config: object = None preprocessor_config: object = None diff --git a/logai/applications/auto_log_summarization.py b/logai/applications/auto_log_summarization.py index caca04b..853f13b 100644 --- a/logai/applications/auto_log_summarization.py +++ b/logai/applications/auto_log_summarization.py @@ -22,14 +22,13 @@ class AutoLogSummarization: """ The unified framework for log parsing analysis. How to use, the design of this analysis app should follow the general workflow of - automated log parsing. The workflow should be able to control solely by `WorkFlowConfig` + automated log parsing. The workflow should be able to control solely by `WorkFlowConfig`. + + :param config: The configuration that controls the behavior this app. 
""" def __init__(self, config: WorkFlowConfig): - """ - - :param config: WorkFlowConfig: the configuratiion that controls the behavior this app - """ + self.config = config self._parsing_results = pd.DataFrame() self._attributes = None @@ -52,9 +51,10 @@ def log_patterns(self): def get_parameter_list(self, log_pattern): """ - For a given log pattern, return the dynamic parameters - :param log_pattern: str: input log pattern - :return: pd.DataFrame: parameter list with Values, valuecounts and position + For a given log pattern, return the dynamic parameters. + + :param log_pattern: The input log pattern. + :return: The parameter list with Values, valuecounts and position. """ para_list = pd.DataFrame(None, columns=["position", "value_counts", "values"]) if self._parsing_results.empty or not log_pattern: @@ -79,26 +79,18 @@ def get_parameter_list(self, log_pattern): def recognize_parameter_entity(self, para_list): """ Placeholder for log parameter entity recognization - :param para_list: - :return: """ pass def summarize_numeric_paramters(self, paras: list): """ Placeholder for numeric parameter summarization - :param para_list: - :return: """ - - return + pass def find_log_pattern(self, logline: str, return_para_list: bool = True): """ Find the log pattern for a given logline, return all dynamic parameters in this log pattern if needed. - :param logline: - :param return_para_list: - :return: """ log_pattern = None para_list = None @@ -124,7 +116,6 @@ def find_log_pattern(self, logline: str, return_para_list: bool = True): def execute(self): """ Execute auto log parsing analysis. Store the results and index for searching. - :return: """ # load data logrecord = self._load_data() diff --git a/logai/applications/openset/anomaly_detection/openset_anomaly_detection_workflow.py b/logai/applications/openset/anomaly_detection/openset_anomaly_detection_workflow.py index 4efb655..ed839ce 100755 --- a/logai/applications/openset/anomaly_detection/openset_anomaly_detection_workflow.py +++ b/logai/applications/openset/anomaly_detection/openset_anomaly_detection_workflow.py @@ -34,12 +34,7 @@ def validate_config_dict(workflow_config_dict): """Method to validate the config dict with the schema - Args: - workflow_config_dict (dict): dict containing config - for anomaly detection workflow on open log datasets - - Raises: - SchemaError: Schema Error + :param workflow_config_dict: (dict): dict containing config for anomaly detection workflow on open log datasets """ try: config_schema.validate(workflow_config_dict) @@ -54,20 +49,14 @@ def get_openset_ad_config( parse_logline: bool, training_type: str, ): - """Method to dynamically set some of the config parameters based on the given arguments - List of all possible supported anomaly detection types and vectorizer types - configurations can be found in the config yaml file - Avoid this function if you are directly setting all config parameters manually + """Method to dynamically set some of the config parameters based on the given arguments. List of all possible supported anomaly detection types and vectorizer types configurations can be found in the config yaml file. Avoid this function if you are directly setting all config parameters manually - Args: - config_filename (str): Name of the config file (currently supports hdfs and bgl) - anomaly_detection_type (str): string describing the type of anomaly detection - vectorizer_type (str): string describing the type of vectorizer. 
- parse_logline (bool): Whether to use log parsing or not - training_type (str): Whether to use "supervised" or "unsupervised" training - - Returns: - OpenSetADWorkflowConfig: config object of type OpenSetADWorkflowConfig + :param config_filename: (str): Name of the config file (currently supports hdfs and bgl) + :param anomaly_detection_type: (str): string describing the type of anomaly detection + :param vectorizer_type: (str): string describing the type of vectorizer. + :param parse_logline: (bool): Whether to use log parsing or not + :param training_type: (str): Whether to use "supervised" or "unsupervised" training + :return: OpenSetADWorkflowConfig: config object of type OpenSetADWorkflowConfig """ config_path = os.path.join( os.path.dirname(__file__), "configs", "{}.yaml".format(config_filename) @@ -113,23 +102,17 @@ def get_openset_ad_config( @dataclass class OpenSetADWorkflowConfig(WorkFlowConfig): - """Config for Log Anomaly Detection workflow on Open Log datasets - - Inherits: - WorkFlowConfig: Config object for specifying workflow parameters - - dataset_name: str = None # name of the public open dataset - label_filepath: str = None # path to the separate file (if any) containing the anomaly detection labels - output_dir: str = None # path to output directory where all intermediate and final outputs would be dumped - parse_logline: bool = False # whether to parse or not - training_type: str = None # should be either supervised or unsupervised - deduplicate_test: bool = False # whether to de-duplicate the instances in the test data, while maintaining a count - of the number of each duplicated instance - test_data_frac_pos: float = 0.8 # fraction of the logs having positive class used for teest - test_data_frac_neg: float = 0.8 # fraction of the logs having negative class used for test - train_test_shuffle: bool = False # whether to use chronological ordering of the logs or to shuffle them when - creating the train test splits - + """Config for Log Anomaly Detection workflow on Open Log dataset Inherits: WorkFlowConfig: Config object for specifying workflow parameters + + :param dataset_name: str = None: name of the public open dataset + :param label_filepath: str = None: path to the separate file (if any) containing the anomaly detection labels + :param output_dir: str = None : path to output directory where all intermediate and final outputs would be dumped + :param parse_logline: bool = False : whether to parse or not + :param training_type: str = None: should be either supervised or unsupervised + :param deduplicate_test: bool = False : whether to de-duplicate the instances in the test data, while maintaining a count of the number of each duplicated instance + :param test_data_frac_pos: float = 0.8 : fraction of the logs having positive class used for teest + :param test_data_frac_neg: float = 0.8 : fraction of the logs having negative class used for test + :param train_test_shuffle: bool = False : whether to use chronological ordering of the logs or to shuffle them when creating the train test splits """ dataset_name: str = None # name of the public open dataset @@ -150,13 +133,12 @@ class OpenSetADWorkflowConfig(WorkFlowConfig): class OpenSetADWorkflow: - def __init__(self, config: OpenSetADWorkflowConfig): - """Initializing the log anomaly detection workflow for open log datasets + """log anomaly detection workflow for open log datasets - Args: - config (OpenSetADWorkflowConfig): config object specifying - parameters for log anomaly detection over open datasets - """ + 
:param config: (OpenSetADWorkflowConfig): config object specifying parameters for log anomaly detection over open datasets + """ + def __init__(self, config: OpenSetADWorkflowConfig): + self.config = config def _get_parse_type_str(self): @@ -257,8 +239,7 @@ def load_anomaly_detector(self): def load_data(self): """loads logrecord object from raw log dataset - Returns: - LogRecordObject : logrecord object created from the raw log dataset + :return: LogRecordObject : logrecord object created from the raw log dataset """ self.load_dataloader() logrecord = self.dataloader.load_data() @@ -272,11 +253,8 @@ def load_data(self): def preprocess_log_data(self, logrecord): """preprocesses logrecord object by doing custom dataset specific data cleaning and formatting - Args: - logrecord (LogRecordObject): log record object to be preprocessed - - Returns: - LogRecordObject: preprocessed lgo record object using custom dataset-specific preprocessing + :param logrecord: (LogRecordObject): log record object to be preprocessed + :return: LogRecordObject: preprocessed lgo record object using custom dataset-specific preprocessing """ self.load_preprocessor() preprocessed_filepath = self._get_output_filename(suffix="preprocessed") @@ -298,11 +276,8 @@ def preprocess_log_data(self, logrecord): def parse_log_data(self, logrecord): """parse logrecord object by applying standard log parsers as specified in the Config - Args: - logrecord (LogRecordObject): logrecord object to be parsed - - Returns: - LogRecordObject: parsed logrecord object + :param logrecord: (LogRecordObject): logrecord object to be parsed + :return: LogRecordObject: parsed logrecord object """ self.load_parser() parsed_filepath = self._get_output_filename(suffix=self._get_parse_type_str()) @@ -324,11 +299,8 @@ def parse_log_data(self, logrecord): def partition_log_data(self, logrecord: LogRecordObject): """partitioning logrecord object by applying session or sliding window based partitions - Args: - logrecord (LogRecordObject): logrecord object to be partitioned - - Returns: - logrecord: partitioned logrecord object + :param logrecord: (LogRecordObject): logrecord object to be partitioned + :return: logrecord: partitioned logrecord object """ self.load_partitioner() output_filepath_suffix = ( @@ -349,16 +321,12 @@ def partition_log_data(self, logrecord: LogRecordObject): return logrecord def generate_train_dev_test_data(self, logrecord: LogRecordObject): - """splitting open log datasets into train dev and test splits according to the parameters - specified in the config object - - Args: - logrecord (LogRecordObject): logrecord object to be split into train, dev and test + """splitting open log datasets into train dev and test splits according to the parameters specified in the config object - Returns: - train_data: logrecord object containing training dataset - dev_data: logrecord object containing dev dataset - test_data: logrecord object containing test dataset + :param logrecord: (LogRecordObject): logrecord object to be split into train, dev and test + :return: - train_data: logrecord object containing training dataset. + - dev_data: logrecord object containing dev dataset. 
+ - test_data: logrecord object containing test dataset. """ output_filepath_suffix = ( self._get_parse_type_str() @@ -407,15 +375,10 @@ def generate_train_dev_test_data(self, logrecord: LogRecordObject): return train_data, dev_data, test_data def dedup_data(self, logrecord: LogRecordObject): - """Method to run deduplication of log records, where loglines having same body - and span id is collapsed into a single logline. The original occurrent count values of these - loglines is added as a pandas Series object in the 'attributes' property of the logrecord object. + """Method to run deduplication of log records, where loglines having the same body and span id are collapsed into a single logline. The original occurrence count values of these loglines are added as a pandas Series object in the 'attributes' property of the logrecord object. - Args: - logrecord (LogRecordObject): logrecord object to be deduplicated - - Returns: - LogRecordObject: resulting logrecord object + :param logrecord: (LogRecordObject): logrecord object to be deduplicated. + :return: LogRecordObject: resulting logrecord object. """ self.load_deduper() old_data_len = len(logrecord.body) @@ -449,10 +412,9 @@ def dedup_data(self, logrecord: LogRecordObject): def run_data_processing_workflow(self): """Running data processing pipeline for log anomaly detection workflow - Returns: - train_data: logrecord object containing training dataset - dev_data: logrecord object containing dev dataset - test_data: logrecord object containing test dataset + :return: - train_data: logrecord object containing training dataset. + - dev_data: logrecord object containing dev dataset. + - test_data: logrecord object containing test dataset. """ logrecord = self.load_data() logrecord = self.preprocess_log_data(logrecord=logrecord) @@ -473,13 +435,9 @@ def run_data_processing_workflow(self): def vectorizer_transform(self, logrecord: LogRecordObject, output_filename=None): """Applying vectorization on a logrecord object based on the kind of vectorizer specific in Config - Args: - logrecord (LogRecordObject): logrecord containing data to be vectorized - output_filename (str, optional): path to output file where the vectorized log data would be dumped. - Defaults to None. - - Returns: - vectorized_output : vectorized data + :param logrecord: (LogRecordObject): logrecord containing data to be vectorized + :param output_filename: (str, optional): path to output file where the vectorized log data would be dumped. Defaults to None.
+ :return: vectorized_output : vectorized data """ if output_filename and os.path.exists(output_filename): vectorized_output = pkl.load(open(output_filename, "rb")) @@ -492,15 +450,12 @@ def vectorizer_transform(self, logrecord: LogRecordObject, output_filename=None) def run_vectorizer(self, train_logrecord, dev_logrecord, test_logrecord): """Wrapper method for applying vectorization on train, dev and test logrecord objects - Args: - train_logrecord (LogRecordObject): logrecord object of the training dataset - dev_logrecord (LogRecordObject): logrecord object of the dev dataset - test_logrecord (LogRecordObject): logrecord object of the test dataset - - Returns: - train_data : vectorized train data - dev_data: vectorized dev data - test_data: vectorized test data + :param train_logrecord: (LogRecordObject): logrecord object of the training dataset + :param dev_logrecord: (LogRecordObject): logrecord object of the dev dataset + :param test_logrecord: (LogRecordObject): logrecord object of the test dataset + :return: - train_data : vectorized train data. + - dev_data: vectorized dev data. + - test_data: vectorized test data. """ self.load_vectorizer() self.vectorizer.fit(train_logrecord) @@ -535,10 +490,9 @@ def run_vectorizer(self, train_logrecord, dev_logrecord, test_logrecord): def run_anomaly_detection(self, train_data, dev_data, test_data): """Method to train and run inference of anomaly detector - Args: - train_data: vectorized version of the train dataset - dev_data: vectorized version of the dev dataset - test_data: vectorized version of the test dataset + :param train_data: vectorized version of the train dataset + :param dev_data: vectorized version of the dev dataset + :param test_data: vectorized version of the test dataset """ self.load_anomaly_detector() self.anomaly_detector.fit(train_data, dev_data) @@ -582,24 +536,3 @@ def execute(self): ) logging.info("Going to Anomaly Detection") self.run_anomaly_detection(train_data, dev_data, test_data) - - -if __name__ == "__main__": - # kwargs = { - # "config_filename": "hdfs", - # "anomaly_detection_type": "logbert_AD", - # "vectorizer_type": "logbert" , - # "parse_logline": False , - # "training_type": "unsupervised" - # } - - kwargs = { - "config_filename": "hdfs", - "anomaly_detection_type": "lstm_sequential_supervised_parsed_AD", - "vectorizer_type": "forecast_nn_sequential", - "parse_logline": True, - "training_type": "supervised", - } - config = get_openset_ad_config(**kwargs) - openset_AD_workflow = OpenSetADWorkflow(config) - openset_AD_workflow.execute() diff --git a/logai/dataloader/data_loader.py b/logai/dataloader/data_loader.py index 0c37f0e..87465a7 100755 --- a/logai/dataloader/data_loader.py +++ b/logai/dataloader/data_loader.py @@ -38,19 +38,19 @@ class FileDataLoader: def __init__(self, config: DataLoaderConfig): """ - Initialize FileDataLoader by consuming the configuration. + Initializes FileDataLoader by consuming the configuration. """ self.config = config def load_data(self) -> LogRecordObject: """ - Load log data with given configuration. + Loads log data with given configuration. Currently support file formats: - csv - tsv - other plain text format such as .log with proper parsing configurations - :return: LogRecordObject: the logs read from log files and converted into LogRecordObject. + :return: The logs read from log files and converted into LogRecordObject. 
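Stepping back to the workflow level, the `__main__` demo removed from `openset_anomaly_detection_workflow.py` above doubles as a usage example for the whole open-set anomaly detection pipeline; a hedged reconstruction of it is kept here for reference:

.. code-block:: python

    from logai.applications.openset.anomaly_detection.openset_anomaly_detection_workflow import (
        OpenSetADWorkflow,
        get_openset_ad_config,
    )

    # Same settings as the removed demo block.
    config = get_openset_ad_config(
        config_filename="hdfs",
        anomaly_detection_type="lstm_sequential_supervised_parsed_AD",
        vectorizer_type="forecast_nn_sequential",
        parse_logline=True,
        training_type="supervised",
    )

    workflow = OpenSetADWorkflow(config)
    workflow.execute()  # data processing, vectorization and anomaly detection end to end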
""" kwargs = self.config.reader_args fpath = self.config.filepath @@ -76,7 +76,7 @@ def _read_logs(self, fpath): def _log_to_dataframe(self, fpath, log_format): """ - Function to transform log file to dataframe + Function to transform log file to dataframe. """ headers = [] splitters = re.split(r"(<[^<>]+>)", log_format) @@ -164,6 +164,6 @@ class DefaultDataLoader: # TODO: Placeholder to implement connector based data loader. def __init__(self): """ - Initialize default data loader + Initializes default data loader. """ self._logger = logging.Logger() diff --git a/logai/dataloader/data_loader_utils.py b/logai/dataloader/data_loader_utils.py index 2e51a81..0ffd969 100755 --- a/logai/dataloader/data_loader_utils.py +++ b/logai/dataloader/data_loader_utils.py @@ -14,7 +14,7 @@ def generate_logformat_regex(log_format): """ Function to generate regular expression to split log messages. - return: headers, regex + return: headers, regex. """ headers = [] splitters = re.split(r"(<[^<>]+>)", log_format) @@ -57,7 +57,7 @@ def log_to_dataframe(log_file, regex, headers): def load_data(filename, log_format): """ - Load log from given file and format. + Loads log from given file and format. :param filename: Files to read. :param log_format: Target log format. diff --git a/logai/dataloader/data_model.py b/logai/dataloader/data_model.py index fe1323f..4be68d2 100755 --- a/logai/dataloader/data_model.py +++ b/logai/dataloader/data_model.py @@ -17,7 +17,21 @@ @dataclass class LogRecordObject: """ - Log record object data model. + Log record object data model, compatible with log and event record definition in OpenTelemetry + https://opentelemetry.io/docs/reference/specification/logs/data-model/#log-and-event-record-definition. + + :param timestamp: The timestamp information of the log data. + :param attributes: The attributes of the log data (typically structured data with quantitative or categorical fields). + :param resource: The field denoting data source information generating the log data. + :param trace_id: The request trace id associated with the log data, if any. + :param span_id: The request span id associated with the log data, if any. + :param severity_text: The severity description or log level information. + :param severity_number: The severity number indicating log level. + :param body: The body of the log record, which contains the main information of the log. It can be consisting + of either unstructured, semi-structured or structured information. + :param labels: Any associated label information with the log (for e.g. binary anomaly label indicating + whether each line is anomalous or not). + :param _index: The indices of the log data. """ timestamp: pd.DataFrame = pd.DataFrame() @@ -73,7 +87,7 @@ def to_dataframe(self): def from_dataframe(cls, data: pd.DataFrame, meta_data: dict = None): """ Converts pandas.DataFrame to log record object. - + :param data: The log data in pandas dataframe. :param meta_data: A dictionary that maps data.columns to fields of LogRecordObject. :return: A LogRecordObject object. @@ -96,13 +110,9 @@ def from_dataframe(cls, data: pd.DataFrame, meta_data: dict = None): def save_to_csv(self, filepath: str): """ - Saves the logrecord object to a file. - - Args: - filepath: The absolute path to filename where the logrecord object would be saved. - - Raises: - Exception: Supports only file extensions (.csv, .json, and .pickle or .pkl). + Saves a log record object to file. + + :param filepath: The absolute path to filename where the logrecord object would be saved. 
""" f = pathlib.Path(filepath) filepath_metadata = filepath.replace(f.suffix, "_metadata.json") @@ -122,14 +132,10 @@ def load_from_csv(cls, filepath): return cls.from_dataframe(data=data, meta_data=meta_data) def select_by_index(self, indices: list, inplace: bool = False): - """Select a subset of a logrecord object based on selected indices. - - Args: - indices: A list of indices to select. - inplace: Performs operation inplace or not. Defaults to False. - - Returns: - LogRecordObject: Resulting logrecord object created from the selected indices. + """Selects a subset of a logrecord object based on selected indices. + + :param indices: A list of indices to select inplace (bool, optional): performs operation inplace or not. + :return: LogRecordObject: The resulting logr ecord object created from the selected indices. """ if not inplace: target = LogRecordObject() @@ -143,14 +149,10 @@ def select_by_index(self, indices: list, inplace: bool = False): return target def filter_by_index(self, indices: list, inplace: bool = False): - """Select a subset of a logrecord object by removing certain indices. - - Args: - indices: A list of indices to remove. - inplace: Performs operation inplace or not. Defaults to False. - - Returns: - LogRecordObject: Resulting logrecord object created after removing the indices. + """Selects a subset of a logrecord object by removing certain indices. + + :param indices: A list of indices to remove inplace (bool, optional): performs operation inplace or not. + :return: The resulting log record object created after removing the indices. """ if not inplace: target = LogRecordObject() @@ -165,9 +167,8 @@ def filter_by_index(self, indices: list, inplace: bool = False): def dropna(self): """Method to drop entries containing NaN or null values in the logrecord object. - - Returns: - LogRecordObject: Modified logrecord object after removing entries with NaN or null values. + + :return: The modified logrecord object after removing entries with NaN or null values. """ null_body = self.body.isnull() null_body = null_body[null_body[constants.LOGLINE_NAME] == True] diff --git a/logai/dataloader/openset_data_loader.py b/logai/dataloader/openset_data_loader.py index cea3b7a..0e42a25 100755 --- a/logai/dataloader/openset_data_loader.py +++ b/logai/dataloader/openset_data_loader.py @@ -16,9 +16,9 @@ def get_config(dataset_name, filepath) -> DataLoaderConfig: """ - Retrieve the configuration of open log datasets to load data. - - :param dataset_name: Supported log dataset name from ("hdfs", "bgl", "HealthApp"). + Retrieves the configuration of open log datasets to load data. + + :param dataset_name: The supported log dataset name from ("hdfs", "bgl", "HealthApp"). :param filepath: The log file path. :return: The configuration to load open log datasets. """ diff --git a/logai/information_extraction/categorical_encoder.py b/logai/information_extraction/categorical_encoder.py index 7869768..00cddb6 100755 --- a/logai/information_extraction/categorical_encoder.py +++ b/logai/information_extraction/categorical_encoder.py @@ -73,7 +73,7 @@ def __init__(self, config: CategoricalEncoderConfig): def fit_transform(self, features: pd.Series) -> Tuple[pd.DataFrame, list]: """ - Transform the str features into categories. + Transforms the str features into categories. :param features: A list of features. :return: A list of encoded features. 
diff --git a/logai/information_extraction/feature_extractor.py b/logai/information_extraction/feature_extractor.py index a408d78..ca03147 100644 --- a/logai/information_extraction/feature_extractor.py +++ b/logai/information_extraction/feature_extractor.py @@ -17,6 +17,13 @@ @dataclass class FeatureExtractorConfig(Config): """Config class for Feature Extractor. + + :param group_by_category: Which fields of the dataframe object to group by. + :param group_by_time: Grouping log lines by the time frequency, using the notations in + https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#offset-aliases. + :param sliding_window: The length of the sliding window . + :param steps: The step-size of sliding window. + :param max_feature_len: The pad the log vector to this size. """ group_by_category: list = None group_by_time: str = None @@ -37,11 +44,10 @@ def __post_init__(self): def _get_group_counter(attributes: pd.DataFrame, group_by_category: list) -> pd.Series: """ - TODO: merge counting with other feature extraction functions - - :param attributes: pd.Dataframe or pd.Series for counting - :param group_by_category: selected attributes for grouping and counting - :return: + Merge counting with other feature extraction functions. + + :param attributes: pd.Dataframe or pd.Series for counting. + :param group_by_category: selected attributes for grouping and counting. """ attributes["group_index"] = attributes.index group_counter_list = ( @@ -61,9 +67,8 @@ class FeatureExtractor: Generate feature sets: 1. log features: generate feature set from log vectors. 2. log event sequence: concatenating all loglines belongs to the same log event. - 1. log event counter vector: for each log event. - 2. log vector - + 3. log event counter vector: for each log event. + 4. log vector Note: 1. counter vector 2. sematic vector @@ -85,12 +90,13 @@ def convert_to_counter_vector( timestamps: pd.Series = None, ) -> pd.DataFrame: """ - Convert logs to log counter vector, after grouping log data based on the FeatureExtractor config. - - :param log_pattern: An unstructured part of the log data. + Converts logs to log counter vector, after grouping log data based on the FeatureExtractor config. + + :param log_pattern: The unstructured part of the log data. :param attributes: The log attributes. :param timestamps: The timestamps. - :return: A pandas dataframe containing the counts of the log-events after grouping. + + :return: The dataframe object containing the counts of the log-events after grouping. """ # TODO: Implement sliding window for counter vectors input_df = self._get_input_df(log_pattern, attributes, timestamps) @@ -110,17 +116,18 @@ def convert_to_feature_vector( timestamps: pd.Series, ) -> pd.DataFrame: """ - Converting log data into feature vector, by combining the log vectors (can be output + Converts log data into feature vector, by combining the log vectors (can be output of LogVectorizer) with other numerical or categorical attributes of the logs, after grouping based on the FeatureExtractorConfig. - - - Return: - (pd.DataFrame): event_index_list: modified log data (pd.DataFrame) consisting of the converted - feature vector form of the input log data after applying the log grouping. It contains an - "event_index" field which maintains the sequence of log event ids where these ids correspond to the - original input dataframe's indices. - block_list: pd.DataFrame + + :param log_vectors: Numeric features of the logs (for e.g. 
+        :param attributes: Categorical or numerical attributes used for grouping; numerical attributes can also serve as additional features.
+        :param timestamps: A pd.Series object containing the timestamp data of the loglines.
+
+        :return: ``event_index_list``: modified log data (pd.DataFrame) consisting of the converted feature vector form of the input log data
+            after applying the log grouping. It contains an "event_index" field which maintains the sequence of
+            log event ids where these ids correspond to the original input dataframe's indices.
+            ``block_list``: pd.DataFrame object.
         """
         if log_vectors is None or log_vectors.empty:
             feature_df = None
@@ -144,21 +151,18 @@ def convert_to_sequence(
         attributes: pd.DataFrame = None,
         timestamps: pd.Series = None,
     ):
-        """Converting log data into sequence using sliding window technique, as defined in FeatureExtractorConfig.
-
-        Args:
-            log_pattern (pd.Series, optional): pd.Series object that encapsulates the entire arbitrary unstructured part of the log data (for example,
-                can be the unstructured part of the raw log data or the output of the output of the log parser). Defaults to None.
-            attributes (pd.DataFrame, optional): structured part (attributes) of the raw log data. Defaults to None.
-            timestamps (pd.Series, optional): timestamps data corresponding to the log lines. Defaults to None.
-
-        Returns:
-            (pd.DataFrame): event_index_list: modified log data consisting of the sequence form of the structured and unstructured input
-            data (i.e. log_pattern and attributes arguments) after running sliding window. For the unstructured part, the returned DataFrame
-            contains an "event_index" field which maintains the sequence of log event ids where these ids correspond to the
-            original input dataframe's indices.
-            (pd.Series): event_sequence: Contains the concatenating form of the unstructured input data (i.e. log_pattern argument),
-            after concatenating the unstructured data for each sliding window.
+        """Converts log data into sequences using the sliding window technique, as defined in FeatureExtractorConfig.
+
+        :param log_pattern: A pd.Series object that encapsulates the entire arbitrary unstructured part of the log data
+            (for example, the unstructured part of the raw log data or the output of the log parser).
+        :param attributes: The structured part (attributes) of the raw log data.
+        :param timestamps: The timestamps data corresponding to the log lines.
+        :return: ``event_index_list``: pd.DataFrame object of modified log data consisting of the sequence form of the
+            structured and unstructured input data (i.e. log_pattern and attributes arguments) after running sliding
+            window. For the unstructured part, the returned DataFrame contains an "event_index" field which maintains
+            the sequence of log event ids where these ids correspond to the original input dataframe's indices.
+            ``event_sequence``: pd.Series object containing the concatenated form of the unstructured input data
+            (i.e. the log_pattern argument), obtained by concatenating the unstructured data within each sliding window.
         """
         # TODO: Converting sequence by sliding windows.
         # Partioning: length of sequence, step
diff --git a/logai/information_extraction/log_parser.py b/logai/information_extraction/log_parser.py
index 8a25678..6a38139 100755
--- a/logai/information_extraction/log_parser.py
+++ b/logai/information_extraction/log_parser.py
@@ -41,14 +41,11 @@ def from_dict(cls, config_dict):
 class LogParser:
     """
     Implementation of log parser for free-form text loglines.
+
+    :param config: The log parser configuration.
     """
 
     def __init__(self, config: object):
-        """
-        Initialization of log parser.
-
-        :param config: LogParserConfig: log parser configuration.
-        """
         name = config.parsing_algorithm.lower()
         config_class = factory.get_config_class("parsing", name)
         algorithm_class = factory.get_algorithm_class("parsing", name)
@@ -59,17 +56,15 @@ def __init__(self, config: object):
     def fit(self, loglines: pd.Series):
         """
         Trains log parser with training loglines.
-
-        :param loglines: pd.Series: the list of loglines for training
+        :param loglines: A pd.Series object containing the list of loglines for training.
         """
         self.parser.fit(loglines)
 
     def parse(self, loglines: pd.Series) -> pd.DataFrame:
         """
         Uses the trained log parser to parse loglines.
-
-        :param loglines: pd.Series: the loglines for parsing.
-        :return: pd.DataFrame: a dataframe of parsed result ["loglines", "parsed_loglines", "parameter_list"].
+        :param loglines: A pd.Series object containing the loglines for parsing.
+        :return: A dataframe of parsed result ["loglines", "parsed_loglines", "parameter_list"].
         """
         if self.parser is None:
             raise RuntimeError("Parser is None.")
@@ -87,8 +82,7 @@ def parse(self, loglines: pd.Series) -> pd.DataFrame:
     def fit_parse(self, loglines: pd.Series) -> pd.DataFrame:
         """
         Trains and parses the given loglines.
-
-        :param loglines: The list of loglines to train and parse.
+        :param loglines: A pd.Series object containing the list of loglines to train and parse.
         :return: A dataframe of parsed result ["loglines", "parsed_loglines", "parameter_list"].
         """
         try:
@@ -100,8 +94,7 @@ def fit_parse(self, loglines: pd.Series) -> pd.DataFrame:
 
     def save(self, out_path):
         """
-        Save the parser model.
-
+        Saves the parser model.
         :param out_path: The directory to save parser models.
         """
 
@@ -121,7 +114,6 @@ def save(self, out_path):
     def load(self, model_path):
         """
         Loads existing parser models.
-
        :param model_path: The directory to load parser models.
         """
 
@@ -134,8 +126,8 @@ def get_parameter_list(row):
     """
     Returns parameter list of the loglines.
 
-    :param row: A row in dataframe as function input containing ['logline', 'parsed_logline'].
-    :return: A list of dynamic parameters.
+    :param row: The row in the dataframe used as function input, containing ['logline', 'parsed_logline'].
+    :return: The list of dynamic parameters.
     """
     parameter_list = []
     if not isinstance(row.logline, str) or not isinstance(row.parsed_logline, str):
diff --git a/logai/information_extraction/log_vectorizer.py b/logai/information_extraction/log_vectorizer.py
index a37bceb..6823604 100755
--- a/logai/information_extraction/log_vectorizer.py
+++ b/logai/information_extraction/log_vectorizer.py
@@ -16,6 +16,10 @@
 @dataclass
 class VectorizerConfig(Config):
     """Config class for Vectorizer.
+
+    :param algo_name: The name of the vectorizer algorithm.
+    :param algo_param: The parameters of the vectorizer algorithm.
+    :param custom_param: Additional custom parameters to be passed to the vectorizer algorithm.
""" algo_name: str = "word2vec" algo_param: object = None @@ -46,14 +50,14 @@ def __init__(self, config: VectorizerConfig): def fit(self, loglines: pd.Series): """Fit method for LogVectorizer, to train the vectorizer model on the training data. - + :param loglines: A pandas Series object containing the training raw log data. """ self.vectorizer.fit(loglines) def transform(self, loglines: pd.Series) -> pd.Series: """Transform method for LogVectorizer, to transform the raw log text data to vectors. - + :param loglines: A pandas Series object containing the test raw log data. :return: A pandas Series object containing the vectorized log data. """ diff --git a/logai/preprocess/bgl_preprocessor.py b/logai/preprocess/bgl_preprocessor.py index 32b6f1b..e0c2bcc 100755 --- a/logai/preprocess/bgl_preprocessor.py +++ b/logai/preprocess/bgl_preprocessor.py @@ -15,22 +15,16 @@ class BGLPreprocessor(OpenSetPreprocessor): """ Custom preprocessor for Open log dataset BGL. - - Inherits: - OpenSetPreprocessor: log preprocessor class for open log datasets. """ def __init__(self, config: PreprocessorConfig): super().__init__(config) def _get_ids(self, logrecord: LogRecordObject) -> pd.Series: - """get ids of loglines - - Args: - logrecord (LogRecordObject): logrecord object containing the BGL data - - Returns: - pd.Series: containing the ids of the loglines + """Get ids of loglines. + + :param logrecord: logrecord object containing the BGL data. + :return: pd.Series object containing the ids of the loglines. """ time_unit_in_secs = 60 # 21600.0 # 6 hours ids = logrecord.span_id[constants.SPAN_ID].astype(int) @@ -39,12 +33,9 @@ def _get_ids(self, logrecord: LogRecordObject) -> pd.Series: return session_ids def _get_labels(self, logrecord: LogRecordObject): - """get anomaly detection labels of loglines - - Args: - logrecord (LogRecordObject): logrecord object containing the BGL data - - Returns: - pd.Series: containing the labels of the loglines + """Get anomaly detection labels of loglines. + + :param logrecord: logrecord object containing the BGL data. + :return:pd.Series object containing the labels of the loglines. """ return logrecord.labels[constants.LABELS].apply(lambda x: int(x != "-")) diff --git a/logai/preprocess/hdfs_preprocessor.py b/logai/preprocess/hdfs_preprocessor.py index 61cc98c..a9ce832 100755 --- a/logai/preprocess/hdfs_preprocessor.py +++ b/logai/preprocess/hdfs_preprocessor.py @@ -15,9 +15,6 @@ class HDFSPreprocessor(OpenSetPreprocessor): """ Custom Preprocessor for open log dataset HDFS. - - Inherits: - OpenSetPreprocessor: log preprocessor class for open log datasets. """ def __init__(self, config: PreprocessorConfig, label_file: str): @@ -26,13 +23,10 @@ def __init__(self, config: PreprocessorConfig, label_file: str): self.label_file = label_file def _get_labels(self, logrecord: LogRecordObject): - """get anomaly detection labels of loglines - - Args: - logrecord (LogRecordObject): logrecord object containing hdfs data - - Returns: - pd.Series: containing the anomaly detection labels of loglines + """Get anomaly detection labels of loglines. + + :param: logrecord: logrecord object containing hdfs data. + :return: pd.Series object containing the anomaly detection labels of loglines. 
""" blk_df = pd.read_csv(self.label_file, header=0) anomaly_blk = set(blk_df[blk_df["Label"] == "Anomaly"]["BlockId"]) @@ -45,13 +39,10 @@ def _get_labels(self, logrecord: LogRecordObject): return labels def _get_ids(self, logrecord: LogRecordObject): - """get ids of loglines - - Args: - logrecord (LogRecordObject): logrecord object containing hdfs data - - Returns: - pd.Series: containing the ids of the loglines + """Get ids of loglines. + + :param logrecord: logrecord object containing hdfs data. + :return:pd.Series object containing the ids of the loglines. """ predefined_ids = logrecord.body[" BLOCK "] predefined_ids = predefined_ids.apply(lambda x: self.id_separator.join(set(x))) diff --git a/logai/preprocess/openset_partitioner.py b/logai/preprocess/openset_partitioner.py index 8ddb9e5..a28b2e7 100644 --- a/logai/preprocess/openset_partitioner.py +++ b/logai/preprocess/openset_partitioner.py @@ -20,6 +20,10 @@ @dataclass class OpenSetPartitionerConfig(Config): """Config for Partitioner for open log datasets. + + :param sliding_window: The size of sliding window. + :param session_window: A boolean flag whether to use session based partitioning or not. + :param logsequence_delim: The delimiter string for concatenating log sequences. """ sliding_window: int = 0 @@ -28,12 +32,12 @@ class OpenSetPartitionerConfig(Config): class OpenSetPartitioner: + """Partitioner class for Open log datasets. + + :param config: A config object specifying parameters of log partititoning for open log datasets. + """ def __init__(self, config: OpenSetPartitionerConfig): - """Initialization of Partitioner class for Open log datasets. - - :param config: A config object specifying - parameters of log partititoning for open log datasets. - """ + self.config = config if config.sliding_window > 0: @@ -90,10 +94,9 @@ def _get_next_data_succeeding_sliding_window(self, data, sliding_windows, field) def generate_sliding_window(self, logrecord): """Method to generate sliding window based log sequences from a logrecord object. - - :param logrecord: A logrecord object to be partitioned into sliding windows. - :return: A LogRecordObject object where the body of logrecord object contains the generated - log sequences. + + :param logrecord: A log record object to be partitioned into sliding windows. + :return: LogRecordObject where the body of logrecord object contains the generated log sequences. """ log_data = logrecord.to_dataframe() partitioned_data = self._get_group_sliding_window( @@ -114,12 +117,11 @@ def generate_sliding_window(self, logrecord): return logrecord def generate_session_window(self, logrecord): - """Method to generate session window based log sequences from a logrecord object given some - ids at the logline level. - - :param logrecord: A logrecord object to be partitioned into session windows. - :return: A LogRecordObject object where the body of logrecord object contains the generated - log sequences. + """Method to generate session window based log sequences from a logrecord object given some. + ids at the logline level + + :param logrecord: A log record object to be partitioned into session windows. + :return: LogRecordObject where the body of logrecord object contains the generated log sequences. """ partitioned_data = self.feature_extractor.convert_to_counter_vector( @@ -141,10 +143,9 @@ def generate_session_window(self, logrecord): def partition(self, logrecord): """Wrapper function for applying partitioning on a logrecord object based on the Config parameters. 
-
-        :param logrecord: A logrecord object to be partitioned into session or sliding windows.
-        :return: A LogRecordObject object where the body of logrecord object contains the generated
-            log sequences.
+
+        :param logrecord: A log record object to be partitioned into session or sliding windows.
+        :return: A LogRecordObject whose body contains the generated log sequences.
         """
         if self.config.sliding_window > 0:
             logrecord = self.generate_sliding_window(logrecord)
diff --git a/logai/preprocess/openset_preprocessor.py b/logai/preprocess/openset_preprocessor.py
index dd66818..6acb23f 100755
--- a/logai/preprocess/openset_preprocessor.py
+++ b/logai/preprocess/openset_preprocessor.py
@@ -13,6 +13,10 @@
 class OpenSetPreprocessor(Preprocessor):
+    """Preprocessor class for Open log datasets.
+
+    :param config: A config object specifying parameters of log preprocessing for open log datasets.
+    """
     def __init__(self, config: PreprocessorConfig):
         super().__init__(config)
         self.config = config
@@ -31,8 +35,8 @@ def _format_ids(self, data_id: pd.Series):
 
     def clean_log(self, logrecord: LogRecordObject) -> LogRecordObject:
         """Preprocessing cleaning of logrecord object creating from open log datasets.
-
-        :param logrecord: A logrecord object containing the raw log data from open datasets.
+
+        :param logrecord: A log record object containing the raw log data from open datasets.
         :return: The cleaned logrecord object.
         """
         preprocessed_loglines, custom_patterns = super().clean_log(
diff --git a/logai/preprocess/partitioner.py b/logai/preprocess/partitioner.py
index 692113d..830a0b8 100755
--- a/logai/preprocess/partitioner.py
+++ b/logai/preprocess/partitioner.py
@@ -15,17 +15,17 @@
 @dataclass
 class PartitionerConfig(Config):
-    """Config class for Partitioner.
+    """Config class for Partitioner.
 
-    :param group_by_category: A list of fields to group log data by.
-    :param group_by_time: A string-type argument to specify grouping by time, supported types
+    :param group_by_category: The list of fields to group log data by.
+    :param group_by_time: The string-type argument to specify grouping by time; supported types are listed at
        https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#offset-aliases.
     :param sliding_window: The sliding window length if partitioning loglines into sliding windows.
-    :param sep_token: The separator token string to be used as delimiter, when grouping log data.
-    :param exclude_last_window: Whether to exclude the last window when doing
-        sliding window based grouping of log data.
-    :param exclude_smaller_windows: Whether to exclude windows of length smaller
-        than the given `sliding_window` argument.
+    :param sep_token: The separator token string to be used as the delimiter when grouping log data.
+    :param exclude_last_window: A boolean (default False) indicating whether to exclude the last window
+        when doing sliding window based grouping of log data.
+    :param exclude_smaller_windows: A boolean (default False) indicating whether to exclude windows of
+        length smaller than the given `sliding_window` argument.
     """
     group_by_category: list = None
     group_by_time: str = None
@@ -50,20 +50,20 @@ def __init__(self, config: PartitionerConfig):
 
     def sliding_window(self, loglines: pd.Series) -> pd.Series:
         """
-        Conduct sliding window log partitioning.
-
-        :param loglines: A series of loglines.
-        :return: A series of logline sequence after sliding window.
+        Conducts sliding window log partitioning.
+
+        :param loglines: The series of loglines.
+        :return: The series of logline sequences after sliding window.
         """
         partitioned_loglines = self._sliding_window(loglines)
         return pd.Series(partitioned_loglines, name=loglines.name)
 
     def group_counter(self, logrecord_df: pd.DataFrame) -> pd.DataFrame:
         """
-        Group log record by given categories and return counter vectors.
-
+        Groups log records by given categories and returns counter vectors.
+
         :param logrecord_df: The log record dataframe.
-        :return: The log counter vector dataframe after grouping
+        :return: The log counter vector dataframe after grouping.
         """
         if not self._valid_columns(logrecord_df.columns.values):
             raise ValueError("Make sure logrecord has the columns to group by.")
@@ -104,11 +104,11 @@ def group_sliding_window(
         self, logrecord_df: pd.DataFrame, logline_col_name=constants.LOGLINE_NAME
     ) -> pd.DataFrame:
         """
-        Group log record by sliding window based on the sliding window length, and returns the
-        resulting pandas DataFrame object.
-
-        :param logrecord_df: A pandas DataFrame object on which grouping is to be applied.
-        :return: A pd.DataFrame object after sliding window based grouping.
+        Groups log records by sliding window based on the sliding window length, and returns
+        the resulting pandas DataFrame object.
+
+        :param logrecord_df: A pandas DataFrame on which grouping is to be applied.
+        :return: A pandas DataFrame after sliding window based grouping.
         """
         if not self._valid_columns(logrecord_df.columns):
             raise ValueError("Make sure logrecord has the columns to group by.")
diff --git a/logai/preprocess/preprocessor.py b/logai/preprocess/preprocessor.py
index c313983..a8cda3f 100755
--- a/logai/preprocess/preprocessor.py
+++ b/logai/preprocess/preprocessor.py
@@ -36,8 +36,8 @@ def __init__(self, config: PreprocessorConfig):
     def clean_log(self, loglines: pd.Series) -> pd.Series:
         """Cleans the input log data.
 
-        :param loglines: The raw loglines data to be cleaned.
-        :return: The cleaned loglines data.
+        :param loglines: The raw loglines data to be cleaned.
+        :return: A pd.Series object of the cleaned loglines data.
         """
         cleaned_log = loglines
         terms = pd.DataFrame()
diff --git a/logai/preprocess/thunderbird_preprocessor.py b/logai/preprocess/thunderbird_preprocessor.py
index 22021a0..a074974 100755
--- a/logai/preprocess/thunderbird_preprocessor.py
+++ b/logai/preprocess/thunderbird_preprocessor.py
@@ -20,23 +20,17 @@ def __init__(self, config: PreprocessorConfig):
         super().__init__(config)
 
     def _get_ids(self, logrecord: LogRecordObject) -> pd.Series:
-        """get ids of loglines
-
-        Args:
-            logrecord (LogRecordObject): logrecord object
-
-        Returns:
-            pd.Series: pandas series containing the ids of te loglines
+        """Get ids of loglines.
+
+        :param logrecord: A log record object.
+        :return: A pandas Series containing the ids of the loglines.
         """
         return logrecord.span_id[constants.SPAN_ID]
 
     def _get_labels(self, logrecord: LogRecordObject):
-        """get anomaly detection labels of loglines
-
-        Args:
-            logrecord (LogRecordObject): logrecord object containing hdfs data
-
-        Returns:
-            pd.Series: containing the anomaly detection labels of loglines
+        """Get anomaly detection labels of loglines.
+
+        :param logrecord: A log record object containing the Thunderbird data.
+        :return: The anomaly detection labels of the loglines.
""" return logrecord.labels[constants.LABELS].apply(lambda x: int(x != "-")) diff --git a/logai/utils/dataset_utils.py b/logai/utils/dataset_utils.py index 77fb006..ad607fa 100644 --- a/logai/utils/dataset_utils.py +++ b/logai/utils/dataset_utils.py @@ -19,23 +19,22 @@ def split_train_dev_test_for_anomaly_detection( test_data_frac_pos_class=None, shuffle=False, ): - """Util method to split a logrecord object into train dev and test splits, where the splitting - and the fractions are based on the SPAN_ID field of the logrecord. + """Util method to split a logrecord object into train dev and test splits, where the splitting fractions + are based on the SPAN_ID field of the logrecord. + + :param logrecord: (LogRecordObject): input logrecord object to be split into train, dev and test splits + :param training_type: (str): 'supervised' or 'unsupervised' + :param test_data_frac_neg_class: (float): fraction of the negative class to be . Defaults to None. + :param test_data_frac_pos_class: (float, optional): when supervised mode is selected, fraction of the + positive class data to be used for test data. (fraction for dev data is fixed).For unsupervised mode this value is fixed to 1.0 + :param shuffle: (bool, optional): whether to shuffle the log data when splitting into train and test. + If False, then it uses the chronological ordering, where the first (chronologically first) split will + constitute train data, second one development data and third one as test data. Defaults to False. + + :return: - logrecord_train: logrecord object containing train data. + - logrecord_dev: logrecord object containing dev data. + - logrecord_test: logrecord object containing test data. - Args: - logrecord (LogRecordObject): input logrecord object to be split into train, dev and test splits - training_type (str): 'supervised' or 'unsupervised' - test_data_frac_neg_class (float): fraction of the negative class to be . Defaults to None. - test_data_frac_pos_class (float, optional): when supervised mode is selected, fraction of the positive class - data to be used for test data. (fraction for dev data is fixed).For unsupervised mode this value is fixed to 1.0 - shuffle (bool, optional): whether to shuffle the log data when splitting into train and test. If False, - then it uses the chronological ordering, where the first (chronologically first) split will constitute train data, - second one development data and third one as test data. Defaults to False. - - Returns: - logrecord_train: logrecord object containing train data - logrecord_dev: logrecord object containing dev data - logrecord_test: logrecord object containing test data """ if training_type not in ["supervised", "unsupervised"]: diff --git a/logai/utils/file_utils.py b/logai/utils/file_utils.py index 665b077..638e073 100644 --- a/logai/utils/file_utils.py +++ b/logai/utils/file_utils.py @@ -14,13 +14,10 @@ def file_exists(path: str): - """Util function to check if file exists. - - Args: - path (str): path to file. - - Returns: - bool: if file exists or not. + """util function to check if file exists + + :param path: (str): path to file + :return: bool: if file exists or not """ return os.path.exists(path) @@ -28,11 +25,8 @@ def file_exists(path: str): def read_file(filepath: str): """Reads yaml, json, csv or pickle files. - Args: - filepath (str): path to file. - - Returns: - object : data object containing file contents. 
+    :param filepath: The path to the file.
+    :return: The data object containing the file contents.
     """
     file_type = pathlib.Path(filepath).suffix
     if file_type == ".yaml":
diff --git a/logai/utils/functions.py b/logai/utils/functions.py
index 4f977dc..f25c8c0 100644
--- a/logai/utils/functions.py
+++ b/logai/utils/functions.py
@@ -17,15 +17,12 @@
 def pad(x, max_len: np.array, padding_value: int = 0):
-    """Method to trim or pad any 1-d numpy array to a given max length with the given padding value.
-
-    Args:
-        x (np.array): given 1-d numpy array to be padded/trimmed.
-        max_len (int): maximum length of padded/trimmed output.
-        padding_value (int, optional): padding value. Defaults to 0.
-
-    Returns:
-        np.array: padded/trimmed numpy array.
+    """Method to trim or pad any 1-d numpy array to a given max length with the given padding value.
+
+    :param x: The 1-d numpy array to be padded/trimmed.
+    :param max_len: The maximum length of the padded/trimmed output.
+    :param padding_value: The padding value. Defaults to 0.
+    :return: The padded/trimmed numpy array.
     """
     flattened_vector = x
     fill_size = max_len - len(flattened_vector)
@@ -83,7 +80,7 @@ def pd_to_timeseries(log_features: pd.Series):
     """
     Covert pandas.DataFrame to merlion.TimeSeries for log counter vectors.
     :param log_features: log feature dataframe must only contain two columns
-        ['timestamp': datetime, constants.LOGLINE_COUNTS: int].
+        ['timestamp': datetime, constants.LOGLINE_COUNTS: int].
     :return: merlion.TimeSeries type.
     """
     ts_df = log_features[constants.LOG_COUNTS]
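
To make the contract of the pad utility above concrete, a small illustrative sketch (values are hypothetical; behaviour as described in the docstring)::

    import numpy as np
    from logai.utils.functions import pad

    vec = np.array([3, 1, 4, 1, 5])
    padded = pad(vec, max_len=8)   # padded with the default padding_value 0 to length 8
    trimmed = pad(vec, max_len=3)  # trimmed to length 3, per the docstring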