diff --git a/scripts/kruize_metrics.py b/scripts/kruize_metrics.py index 00cba2647..5f7016ca1 100644 --- a/scripts/kruize_metrics.py +++ b/scripts/kruize_metrics.py @@ -25,7 +25,7 @@ import os import argparse -csv_headers = ["timestamp","listRecommendations_count_success","listExperiments_count_success","createExperiment_count_success","updateResults_count_success","updateRecommendations_count_success","generatePlots_count_success","loadRecommendationsByExperimentName_count_success","loadRecommendationsByExperimentNameAndDate_count_success","loadResultsByExperimentName_count_success","loadExperimentByName_count_success","addRecommendationToDB_count_success","addResultToDB_count_success","addBulkResultsToDBAndFetchFailedResults_count_success","addExperimentToDB_count_success","addPerformanceProfileToDB_count_success","loadPerformanceProfileByName_count_success","loadAllPerformanceProfiles_count_success","listRecommendations_count_failure","listExperiments_count_failure","createExperiment_count_failure","updateResults_count_failure","updateRecommendations_count_failure","generatePlots_count_failure","loadRecommendationsByExperimentName_count_failure","loadRecommendationsByExperimentNameAndDate_count_failure","loadResultsByExperimentName_count_failure","loadExperimentByName_count_failure","addRecommendationToDB_count_failure","addResultToDB_count_failure","addBulkResultsToDBAndFetchFailedResults_count_failure","addExperimentToDB_count_failure","addPerformanceProfileToDB_count_failure","loadPerformanceProfileByName_count_failure","loadAllPerformanceProfiles_count_failure","listRecommendations_sum_success","listExperiments_sum_success","createExperiment_sum_success","updateResults_sum_success","updateRecommendations_sum_success","generatePlots_sum_success","loadRecommendationsByExperimentName_sum_success","loadRecommendationsByExperimentNameAndDate_sum_success","loadResultsByExperimentName_sum_success","loadExperimentByName_sum_success","addRecommendationToDB_sum_success","addResultToDB_sum_success","addBulkResultsToDBAndFetchFailedResults_sum_success","addExperimentToDB_sum_success","addPerformanceProfileToDB_sum_success","loadPerformanceProfileByName_sum_success","loadAllPerformanceProfiles_sum_success","listRecommendations_sum_failure","listExperiments_sum_failure","createExperiment_sum_failure","updateResults_sum_failure","updateRecommendations_sum_failure","generatePlots_sum_failure","loadRecommendationsByExperimentName_sum_failure","loadRecommendationsByExperimentNameAndDate_sum_failure","loadResultsByExperimentName_sum_failure","loadExperimentByName_sum_failure","addRecommendationToDB_sum_failure","addResultToDB_sum_failure","addBulkResultsToDBAndFetchFailedResults_sum_failure","addExperimentToDB_sum_failure","addPerformanceProfileToDB_sum_failure","loadPerformanceProfileByName_sum_failure","loadAllPerformanceProfiles_sum_failure","loadAllRecommendations_sum_failure","loadAllExperiments_sum_failure","loadAllResults_sum_failure","loadAllRecommendations_sum_success","loadAllExperiments_sum_success","loadAllResults_sum_success","listRecommendations_max_success","listExperiments_max_success","createExperiment_max_success","updateResults_max_success","updateRecommendations_max_success","generatePlots_max_success","loadRecommendationsByExperimentName_max_success","loadRecommendationsByExperimentNameAndDate_max_success","loadResultsByExperimentName_max_success","loadExperimentByName_max_success","addRecommendationToDB_max_success","addResultToDB_max_success","addBulkResultsToDBAndFetchFailedResults_max_success","addExperimentToDB_max_success","addPerformanceProfileToDB_max_success","loadPerformanceProfileByName_max_success","loadAllPerformanceProfiles_max_success","kruizedb_cpu_max","kruizedb_memory","kruize_cpu_max","kruize_memory","kruize_results","db_size","updateResultsPerCall_success","updateRecommendationsPerCall_success","updateRecommendations_notifications_total"] +csv_headers = ["timestamp","listRecommendations_count_success","listExperiments_count_success","createExperiment_count_success","updateResults_count_success","updateRecommendations_count_success","createBulkJob_count_success", "bulkJobs_count_running" , "jobStatus_count_success", "bulk_getExperimentMap_count_success", "runBulkJob_count_success","importMetadata_count_success", "generatePlots_count_success","loadRecommendationsByExperimentName_count_success","loadRecommendationsByExperimentNameAndDate_count_success","loadResultsByExperimentName_count_success","loadExperimentByName_count_success","addRecommendationToDB_count_success","addResultToDB_count_success","addBulkResultsToDBAndFetchFailedResults_count_success","addExperimentToDB_count_success","addPerformanceProfileToDB_count_success","loadPerformanceProfileByName_count_success","loadAllPerformanceProfiles_count_success","listRecommendations_count_failure","listExperiments_count_failure","createExperiment_count_failure","updateResults_count_failure","updateRecommendations_count_failure","createBulkJob_count_failure","jobStatus_count_failure","runBulkJob_count_failure","importMetadata_count_failure","generatePlots_count_failure","loadRecommendationsByExperimentName_count_failure","loadRecommendationsByExperimentNameAndDate_count_failure","loadResultsByExperimentName_count_failure","loadExperimentByName_count_failure","addRecommendationToDB_count_failure","addResultToDB_count_failure","addBulkResultsToDBAndFetchFailedResults_count_failure","addExperimentToDB_count_failure","addPerformanceProfileToDB_count_failure","loadPerformanceProfileByName_count_failure","loadAllPerformanceProfiles_count_failure","listRecommendations_sum_success","listExperiments_sum_success","createExperiment_sum_success","updateResults_sum_success","updateRecommendations_sum_success","createBulkJob_sum_success", "jobStatus_sum_success", "bulk_getExperimentMap_sum_success", "runBulkJob_sum_success","importMetadata_sum_success", "generatePlots_sum_success","loadRecommendationsByExperimentName_sum_success","loadRecommendationsByExperimentNameAndDate_sum_success","loadResultsByExperimentName_sum_success","loadExperimentByName_sum_success","addRecommendationToDB_sum_success","addResultToDB_sum_success","addBulkResultsToDBAndFetchFailedResults_sum_success","addExperimentToDB_sum_success","addPerformanceProfileToDB_sum_success","loadPerformanceProfileByName_sum_success","loadAllPerformanceProfiles_sum_success","listRecommendations_sum_failure","listExperiments_sum_failure","createExperiment_sum_failure","updateResults_sum_failure","updateRecommendations_sum_failure","createBulkJob_sum_failure", "jobStatus_sum_failure", "bulk_getExperimentMap_sum_failure", "runBulkJob_sum_failure","importMetadata_sum_failure","generatePlots_sum_failure","loadRecommendationsByExperimentName_sum_failure","loadRecommendationsByExperimentNameAndDate_sum_failure","loadResultsByExperimentName_sum_failure","loadExperimentByName_sum_failure","addRecommendationToDB_sum_failure","addResultToDB_sum_failure","addBulkResultsToDBAndFetchFailedResults_sum_failure","addExperimentToDB_sum_failure","addPerformanceProfileToDB_sum_failure","loadPerformanceProfileByName_sum_failure","loadAllPerformanceProfiles_sum_failure","loadAllRecommendations_sum_failure","loadAllExperiments_sum_failure","loadAllResults_sum_failure","loadAllRecommendations_sum_success","loadAllExperiments_sum_success","loadAllResults_sum_success","listRecommendations_max_success","listExperiments_max_success","createExperiment_max_success","updateResults_max_success","updateRecommendations_max_success","createBulkJob_max_success", "jobStatus_max_success", "bulk_getExperimentMap_max_success", "runBulkJob_max_success","importMetadata_max_success","generatePlots_max_success","loadRecommendationsByExperimentName_max_success","loadRecommendationsByExperimentNameAndDate_max_success","loadResultsByExperimentName_max_success","loadExperimentByName_max_success","addRecommendationToDB_max_success","addResultToDB_max_success","addBulkResultsToDBAndFetchFailedResults_max_success","addExperimentToDB_max_success","addPerformanceProfileToDB_max_success","loadPerformanceProfileByName_max_success","loadAllPerformanceProfiles_max_success","kruizedb_cpu_max","kruizedb_memory","kruize_cpu_max","kruize_memory","kruize_results","db_size","updateResultsPerCall_success","updateRecommendationsPerCall_success","BulkJobPerCall_success", "updateRecommendations_notifications_total"] queries_map_total = { @@ -34,6 +34,12 @@ "createExperiment_count_success": "sum((kruizeAPI_count{api=\"createExperiment\",application=\"Kruize\",status=\"success\"}))", "updateResults_count_success": "sum((kruizeAPI_count{api=\"updateResults\",application=\"Kruize\",status=\"success\"}))", "updateRecommendations_count_success": "sum((kruizeAPI_count{api=\"updateRecommendations\",application=\"Kruize\",status=\"success\"}))", + "createBulkJob_count_success": "sum((kruizeAPI_count{api=\"bulk\",application=\"Kruize\",method=\"createBulkJob\",status=\"success\"}))", + "bulkJobs_count_running": "sum((kruizeAPI_active_jobs_count{api=\"bulk\",application=\"Kruize\",method=\"runBulkJob\",status=\"running\"}))", + "jobStatus_count_success": "sum((kruizeAPI_count{api=\"bulk\",application=\"Kruize\",method=\"jobStatus\",status=\"success\"}))", + "bulk_getExperimentMap_count_success": "sum((kruizeAPI_count{api=\"bulk\",application=\"Kruize\",method=\"getExperimentMap\",status=\"success\"}))", + "runBulkJob_count_success": "sum((kruizeAPI_count{api=\"bulk\",application=\"Kruize\",method=\"runBulkJob\",status=\"success\"}))", + "importMetadata_count_success": "sum((kruizeAPI_count{api=\"datasources\",application=\"Kruize\",method=\"importMetadata\",status=\"success\"}))", "generatePlots_count_success": "sum((KruizeMethod_count{method=\"generatePlots\",application=\"Kruize\",status=\"success\"}))", "loadRecommendationsByExperimentName_count_success": "sum((kruizeDB_count{method=\"loadRecommendationsByExperimentName\",application=\"Kruize\",status=\"success\"}))", "loadRecommendationsByExperimentNameAndDate_count_success": "sum((kruizeDB_count{method=\"loadRecommendationsByExperimentNameAndDate\",application=\"Kruize\",status=\"success\"}))", @@ -51,6 +57,10 @@ "createExperiment_count_failure": "sum((kruizeAPI_count{api=\"createExperiment\",application=\"Kruize\",status=\"failure\"}))", "updateResults_count_failure": "sum((kruizeAPI_count{api=\"updateResults\",application=\"Kruize\",status=\"failure\"}))", "updateRecommendations_count_failure": "sum((kruizeAPI_count{api=\"updateRecommendations\",application=\"Kruize\",status=\"failure\"}))", + "createBulkJob_count_failure": "sum((kruizeAPI_count{api=\"bulk\",application=\"Kruize\",method=\"createBulkJob\",status=\"failure\"}))", + "jobStatus_count_failure": "sum((kruizeAPI_count{api=\"bulk\",application=\"Kruize\",method=\"jobStatus\",status=\"failure\"}))", + "runBulkJob_count_failure": "sum((kruizeAPI_count{api=\"bulk\",application=\"Kruize\",method=\"runBulkJob\",status=\"failure\"}))", + "importMetadata_count_failure": "sum((kruizeAPI_count{api=\"datasources\",application=\"Kruize\",method=\"importMetadata\",status=\"failure\"}))", "generatePlots_count_failure": "sum((KruizeMethod_count{method=\"generatePlots\",application=\"Kruize\",status=\"failure\"}))", "loadRecommendationsByExperimentName_count_failure": "sum((kruizeDB_count{method=\"loadRecommendationsByExperimentName\",application=\"Kruize\",status=\"failure\"}))", "loadRecommendationsByExperimentNameAndDate_count_failure": "sum((kruizeDB_count{method=\"loadRecommendationsByExperimentNameAndDate\",application=\"Kruize\",status=\"failure\"}))", @@ -68,6 +78,11 @@ "createExperiment_sum_success": "sum((kruizeAPI_sum{api=\"createExperiment\",application=\"Kruize\",status=\"success\"}))", "updateResults_sum_success": "sum((kruizeAPI_sum{api=\"updateResults\",application=\"Kruize\",status=\"success\"}))", "updateRecommendations_sum_success": "sum((kruizeAPI_sum{api=\"updateRecommendations\",application=\"Kruize\",status=\"success\"}))", + "createBulkJob_sum_success": "sum((kruizeAPI_sum{api=\"bulk\",application=\"Kruize\",method=\"createBulkJob\",status=\"success\"}))", + "jobStatus_sum_success": "sum((kruizeAPI_sum{api=\"bulk\",application=\"Kruize\",method=\"jobStatus\",status=\"success\"}))", + "bulk_getExperimentMap_sum_success": "sum((kruizeAPI_sum{api=\"bulk\",application=\"Kruize\",method=\"getExperimentMap\",status=\"success\"}))", + "runBulkJob_sum_success": "sum((kruizeAPI_sum{api=\"bulk\",application=\"Kruize\",method=\"runBulkJob\",status=\"success\"}))", + "importMetadata_sum_success": "sum((kruizeAPI_sum{api=\"datasources\",application=\"Kruize\",method=\"importMetadata\",status=\"success\"}))", "generatePlots_sum_success": "sum((KruizeMethod_sum{method=\"generatePlots\",application=\"Kruize\",status=\"success\"}))", "loadRecommendationsByExperimentName_sum_success": "sum((kruizeDB_sum{method=\"loadRecommendationsByExperimentName\",application=\"Kruize\",status=\"success\"}))", "loadRecommendationsByExperimentNameAndDate_sum_success": "sum((kruizeDB_sum{method=\"loadRecommendationsByExperimentNameAndDate\",application=\"Kruize\",status=\"success\"}))", @@ -85,6 +100,11 @@ "createExperiment_sum_failure": "sum((kruizeAPI_sum{api=\"createExperiment\",application=\"Kruize\",status=\"failure\"}))", "updateResults_sum_failure": "sum((kruizeAPI_sum{api=\"updateResults\",application=\"Kruize\",status=\"failure\"}))", "updateRecommendations_sum_failure": "sum((kruizeAPI_sum{api=\"updateRecommendations\",application=\"Kruize\",status=\"failure\"}))", + "createBulkJob_sum_failure": "sum((kruizeAPI_sum{api=\"bulk\",application=\"Kruize\",method=\"createBulkJob\",status=\"failure\"}))", + "jobStatus_sum_failure": "sum((kruizeAPI_sum{api=\"bulk\",application=\"Kruize\",method=\"jobStatus\",status=\"failure\"}))", + "bulk_getExperimentMap_sum_failure": "sum((kruizeAPI_sum{api=\"bulk\",application=\"Kruize\",method=\"getExperimentMap\",status=\"failure\"}))", + "runBulkJob_sum_failure": "sum((kruizeAPI_sum{api=\"bulk\",application=\"Kruize\",method=\"runBulkJob\",status=\"failure\"}))", + "importMetadata_sum_failure": "sum((kruizeAPI_sum{api=\"datasources\",application=\"Kruize\",method=\"importMetadata\",status=\"failure\"}))", "generatePlots_sum_failure": "sum((KruizeMethod_sum{method=\"generatePlots\",application=\"Kruize\",status=\"failure\"}))", "loadRecommendationsByExperimentName_sum_failure": "sum((kruizeDB_sum{method=\"loadRecommendationsByExperimentName\",application=\"Kruize\",status=\"failure\"}))", "loadRecommendationsByExperimentNameAndDate_sum_failure": "sum((kruizeDB_sum{method=\"loadRecommendationsByExperimentNameAndDate\",application=\"Kruize\",status=\"failure\"}))", @@ -108,6 +128,11 @@ "createExperiment_max_success": "max(max_over_time(kruizeAPI_max{{api=\"createExperiment\",application=\"Kruize\",status=\"success\"}}[6h]))", "updateResults_max_success": "max(max_over_time(kruizeAPI_max{{api=\"updateResults\",application=\"Kruize\",status=\"success\"}}[6h]))", "updateRecommendations_max_success": "max(max_over_time(kruizeAPI_max{{api=\"updateRecommendations\",application=\"Kruize\",status=\"success\"}}[6h]))", + "createBulkJob_max_success": "max(max_over_time(kruizeAPI_max{{api=\"bulk\",application=\"Kruize\",method=\"createBulkJob\",status=\"success\"}}[6h]))", + "jobStatus_max_success": "max(max_over_time(kruizeAPI_max{{api=\"bulk\",application=\"Kruize\",method=\"jobStatus\",status=\"success\"}}[6h]))", + "bulk_getExperimentMap_max_success": "max(max_over_time(kruizeAPI_max{{api=\"bulk\",application=\"Kruize\",method=\"getExperimentMap\",status=\"success\"}}[6h]))", + "runBulkJob_max_success": "max(max_over_time(kruizeAPI_max{{api=\"bulk\",application=\"Kruize\",method=\"runBulkJob\",status=\"success\"}}[6h]))", + "importMetadata_max_success": "max(max_over_time(kruizeAPI_max{{api=\"datasources\",application=\"Kruize\",method=\"importMetadata\",status=\"success\"}}[6h]))", "generatePlots_max_success": "max(max_over_time(KruizeMethod_max{{method=\"generatePlots\",application=\"Kruize\",status=\"success\"}}[6h]))", "loadRecommendationsByExperimentName_max_success": "max(max_over_time(kruizeDB_max{{method=\"loadRecommendationsByExperimentName\",application=\"Kruize\",status=\"success\"}}[6h]))", "loadRecommendationsByExperimentNameAndDate_max_success": "max(max_over_time(kruizeDB_max{{method=\"loadRecommendationsByExperimentNameAndDate\",application=\"Kruize\",status=\"success\"}}[6h]))", @@ -204,7 +229,15 @@ def run_queries(map_type,server,prometheus_url=None): results_map["updateRecommendationsPerCall_success"] = sum_success / count_success except ValueError: print("Error: Unable to convert values to floats.") - + if "runBulkJob_sum_success" in results_map and "runBulkJob_count_success" in results_map: + if results_map["runBulkJob_sum_success"] and results_map["runBulkJob_count_success"]: + try: + sum_success = round(float(results_map["runBulkJob_sum_success"]),10) + count_success = round(float(results_map["runBulkJob_count_success"]),10) + if count_success != 0: + results_map["BulkJobPerCall_success"] = sum_success / count_success + except ValueError: + print("Error: Unable to convert values to floats.") except Exception as e: print(f"AN ERROR OCCURED: {e}") sys.exit(1) @@ -320,6 +353,12 @@ def main(argv): "createExperiment_count_success": f"sum(increase(kruizeAPI_count{{api=\"createExperiment\",application=\"Kruize\",status=\"success\"}}[{time_duration}]))", "updateResults_count_success": f"sum(increase(kruizeAPI_count{{api=\"updateResults\",application=\"Kruize\",status=\"success\"}}[{time_duration}]))", "updateRecommendations_count_success": f"sum(increase(kruizeAPI_count{{api=\"updateRecommendations\",application=\"Kruize\",status=\"success\"}}[{time_duration}]))", + "createBulkJob_count_success": f"sum(increase(kruizeAPI_count{{api=\"bulk\",application=\"Kruize\",method=\"createBulkJob\",status=\"success\"}}[{time_duration}]))", + "bulkJobs_count_running": f"sum(increase(kruizeAPI_active_jobs_count{{api=\"bulk\",application=\"Kruize\",method=\"runBulkJob\",status=\"running\"}}[{time_duration}]))", + "jobStatus_count_success": f"sum(increase(kruizeAPI_count{{api=\"bulk\",application=\"Kruize\",method=\"jobStatus\",status=\"success\"}}[{time_duration}]))", + "bulk_getExperimentMap_count_success": f"sum(increase(kruizeAPI_count{{api=\"bulk\",application=\"Kruize\",method=\"getExperimentMap\",status=\"success\"}}[{time_duration}]))", + "runBulkJob_count_success": f"sum(increase(kruizeAPI_count{{api=\"bulk\",application=\"Kruize\",method=\"runBulkJob\",status=\"success\"}}[{time_duration}]))", + "importMetadata_count_success": f"sum(increase(kruizeAPI_count{{api=\"datasources\",application=\"Kruize\",method=\"importMetadata\",status=\"success\"}}[{time_duration}]))", "generatePlots_count_success": f"sum(increase(KruizeMethod_count{{method=\"generatePlots\",application=\"Kruize\",status=\"success\"}}[{time_duration}]))", "loadRecommendationsByExperimentName_count_success": f"sum(increase(kruizeDB_count{{method=\"loadRecommendationsByExperimentName\",application=\"Kruize\",status=\"success\"}}[{time_duration}]))", "loadRecommendationsByExperimentNameAndDate_count_success": f"sum(increase(kruizeDB_count{{method=\"loadRecommendationsByExperimentNameAndDate\",application=\"Kruize\",status=\"success\"}}[{time_duration}]))", @@ -337,6 +376,10 @@ def main(argv): "createExperiment_count_failure": f"sum(increase(kruizeAPI_count{{api=\"createExperiment\",application=\"Kruize\",status=\"failure\"}}[{time_duration}]))", "updateResults_count_failure": f"sum(increase(kruizeAPI_count{{api=\"updateResults\",application=\"Kruize\",status=\"failure\"}}[{time_duration}]))", "updateRecommendations_count_failure": f"sum(increase(kruizeAPI_count{{api=\"updateRecommendations\",application=\"Kruize\",status=\"failure\"}}[{time_duration}]))", + "createBulkJob_count_failure": f"sum(increase(kruizeAPI_count{{api=\"bulk\",application=\"Kruize\",method=\"createBulkJob\",status=\"failure\"}}[{time_duration}]))", + "jobStatus_count_failure": f"sum(increase(kruizeAPI_count{{api=\"bulk\",application=\"Kruize\",method=\"jobStatus\",status=\"failure\"}}[{time_duration}]))", + "runBulkJob_count_failure": f"sum(increase(kruizeAPI_count{{api=\"bulk\",application=\"Kruize\",method=\"runBulkJob\",status=\"failure\"}}[{time_duration}]))", + "importMetadata_count_failure": f"sum(increase(kruizeAPI_count{{api=\"datasources\",application=\"Kruize\",method=\"importMetadata\",status=\"failure\"}}[{time_duration}]))", "generatePlots_count_failure": f"sum(increase(KruizeMethod_count{{method=\"generatePlots\",application=\"Kruize\",status=\"failure\"}}[{time_duration}]))", "loadRecommendationsByExperimentName_count_failure": f"sum(increase(kruizeDB_count{{method=\"loadRecommendationsByExperimentName\",application=\"Kruize\",status=\"failure\"}}[{time_duration}]))", "loadRecommendationsByExperimentNameAndDate_count_failure": f"sum(increase(kruizeDB_count{{method=\"loadRecommendationsByExperimentNameAndDate\",application=\"Kruize\",status=\"failure\"}}[{time_duration}]))", @@ -354,6 +397,11 @@ def main(argv): "createExperiment_sum_success": f"sum(increase(kruizeAPI_sum{{api=\"createExperiment\",application=\"Kruize\",status=\"success\"}}[{time_duration}]))", "updateResults_sum_success": f"sum(increase(kruizeAPI_sum{{api=\"updateResults\",application=\"Kruize\",status=\"success\"}}[{time_duration}]))", "updateRecommendations_sum_success": f"sum(increase(kruizeAPI_sum{{api=\"updateRecommendations\",application=\"Kruize\",status=\"success\"}}[{time_duration}]))", + "createBulkJob_sum_success": f"sum(increase(kruizeAPI_sum{{api=\"bulk\",application=\"Kruize\",method=\"createBulkJob\",status=\"success\"}}[{time_duration}]))", + "jobStatus_sum_success": f"sum(increase(kruizeAPI_sum{{api=\"bulk\",application=\"Kruize\",method=\"jobStatus\",status=\"success\"}}[{time_duration}]))", + "bulk_getExperimentMap_sum_success": f"sum(increase(kruizeAPI_sum{{api=\"bulk\",application=\"Kruize\",method=\"getExperimentMap\",status=\"success\"}}[{time_duration}]))", + "runBulkJob_sum_success": f"sum(increase(kruizeAPI_sum{{api=\"bulk\",application=\"Kruize\",method=\"runBulkJob\",status=\"success\"}}[{time_duration}]))", + "importMetadata_sum_success": f"sum(increase(kruizeAPI_sum{{api=\"datasources\",application=\"Kruize\",method=\"importMetadata\",status=\"success\"}}[{time_duration}]))", "generatePlots_sum_success": f"sum(increase(KruizeMethod_sum{{method=\"generatePlots\",application=\"Kruize\",status=\"success\"}}[{time_duration}]))", "loadRecommendationsByExperimentName_sum_success": f"sum(increase(kruizeDB_sum{{method=\"loadRecommendationsByExperimentName\",application=\"Kruize\",status=\"success\"}}[{time_duration}]))", "loadRecommendationsByExperimentNameAndDate_sum_success": f"sum(increase(kruizeDB_sum{{method=\"loadRecommendationsByExperimentNameAndDate\",application=\"Kruize\",status=\"success\"}}[{time_duration}]))", @@ -371,6 +419,11 @@ def main(argv): "createExperiment_max_success": f"max(max_over_time(kruizeAPI_max{{api=\"createExperiment\",application=\"Kruize\",status=\"success\"}}[{time_duration}]))", "updateResults_max_success": f"max(max_over_time(kruizeAPI_max{{api=\"updateResults\",application=\"Kruize\",status=\"success\"}}[{time_duration}]))", "updateRecommendations_max_success": f"max(max_over_time(kruizeAPI_max{{api=\"updateRecommendations\",application=\"Kruize\",status=\"success\"}}[{time_duration}]))", + "createBulkJob_max_success": f"max(max_over_time(kruizeAPI_max{{api=\"bulk\",application=\"Kruize\",method=\"createBulkJob\",status=\"success\"}}[{time_duration}]))", + "jobStatus_max_success": f"max(max_over_time(kruizeAPI_max{{api=\"bulk\",application=\"Kruize\",method=\"jobStatus\",status=\"success\"}}[{time_duration}]))", + "bulk_getExperimentMap_max_success": f"max(max_over_time(kruizeAPI_max{{api=\"bulk\",application=\"Kruize\",method=\"getExperimentMap\",status=\"success\"}}[{time_duration}]))", + "runBulkJob_max_success": f"max(max_over_time(kruizeAPI_max{{api=\"bulk\",application=\"Kruize\",method=\"runBulkJob\",status=\"success\"}}[{time_duration}]))", + "importMetadata_max_success": f"max(max_over_time(kruizeAPI_max{{api=\"datasources\",application=\"Kruize\",method=\"importMetadata\",status=\"success\"}}[{time_duration}]))", "generatePlots_max_success": f"max(max_over_time(KruizeMethod_max{{method=\"generatePlots\",application=\"Kruize\",status=\"success\"}}[{time_duration}]))", "loadRecommendationsByExperimentName_max_success": f"max(max_over_time(kruizeDB_max{{method=\"loadRecommendationsByExperimentName\",application=\"Kruize\",status=\"success\"}}[{time_duration}]))", "loadRecommendationsByExperimentNameAndDate_max_success": f"max(max_over_time(kruizeDB_max{{method=\"loadRecommendationsByExperimentNameAndDate\",application=\"Kruize\",status=\"success\"}}[{time_duration}]))", diff --git a/src/main/java/com/autotune/analyzer/services/BulkService.java b/src/main/java/com/autotune/analyzer/services/BulkService.java index 4f507f51a..c695bc70a 100644 --- a/src/main/java/com/autotune/analyzer/services/BulkService.java +++ b/src/main/java/com/autotune/analyzer/services/BulkService.java @@ -18,9 +18,11 @@ import com.autotune.analyzer.serviceObjects.BulkInput; import com.autotune.analyzer.serviceObjects.BulkJobStatus; import com.autotune.analyzer.workerimpl.BulkJobManager; +import com.autotune.utils.MetricsConfig; import com.fasterxml.jackson.databind.ObjectMapper; import com.fasterxml.jackson.databind.ser.impl.SimpleBeanPropertyFilter; import com.fasterxml.jackson.databind.ser.impl.SimpleFilterProvider; +import io.micrometer.core.instrument.Timer; import org.json.JSONObject; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -65,6 +67,8 @@ public void init(ServletConfig config) throws ServletException { */ @Override protected void doGet(HttpServletRequest req, HttpServletResponse resp) throws ServletException, IOException { + String statusValue = "failure"; + Timer.Sample timerJobStatus = Timer.start(MetricsConfig.meterRegistry()); try { String jobID = req.getParameter(JOB_ID); String verboseParam = req.getParameter(VERBOSE); @@ -107,12 +111,18 @@ protected void doGet(HttpServletRequest req, HttpServletResponse resp) throws Se objectMapper.setFilterProvider(filters); String jsonResponse = objectMapper.writeValueAsString(jobDetails); resp.getWriter().write(jsonResponse); + statusValue = "success"; } catch (Exception e) { e.printStackTrace(); } } } catch (Exception e) { e.printStackTrace(); + } finally { + if (null != timerJobStatus) { + MetricsConfig.timerJobStatus = MetricsConfig.timerBJobStatus.tag("status", statusValue).register(MetricsConfig.meterRegistry()); + timerJobStatus.stop(MetricsConfig.timerJobStatus); + } } } @@ -124,28 +134,38 @@ protected void doGet(HttpServletRequest req, HttpServletResponse resp) throws Se */ @Override protected void doPost(HttpServletRequest request, HttpServletResponse response) throws ServletException, IOException { - // Set response type - response.setContentType(JSON_CONTENT_TYPE); - response.setCharacterEncoding(CHARACTER_ENCODING); + String statusValue = "failure"; + Timer.Sample timerCreateBulkJob = Timer.start(MetricsConfig.meterRegistry()); + try { + // Set response type + response.setContentType(JSON_CONTENT_TYPE); + response.setCharacterEncoding(CHARACTER_ENCODING); - // Create ObjectMapper instance - ObjectMapper objectMapper = new ObjectMapper(); + // Create ObjectMapper instance + ObjectMapper objectMapper = new ObjectMapper(); - // Read the request payload and map to RequestPayload class - BulkInput payload = objectMapper.readValue(request.getInputStream(), BulkInput.class); + // Read the request payload and map to RequestPayload class + BulkInput payload = objectMapper.readValue(request.getInputStream(), BulkInput.class); - // Generate a unique jobID - String jobID = UUID.randomUUID().toString(); - BulkJobStatus jobStatus = new BulkJobStatus(jobID, IN_PROGRESS, Instant.now()); - jobStatusMap.put(jobID, jobStatus); - // Submit the job to be processed asynchronously - executorService.submit(new BulkJobManager(jobID, jobStatus, payload)); + // Generate a unique jobID + String jobID = UUID.randomUUID().toString(); + BulkJobStatus jobStatus = new BulkJobStatus(jobID, IN_PROGRESS, Instant.now()); + jobStatusMap.put(jobID, jobStatus); + // Submit the job to be processed asynchronously + executorService.submit(new BulkJobManager(jobID, jobStatus, payload)); - // Just sending a simple success response back - // Return the jobID to the user - JSONObject jsonObject = new JSONObject(); - jsonObject.put(JOB_ID, jobID); - response.getWriter().write(jsonObject.toString()); + // Just sending a simple success response back + // Return the jobID to the user + JSONObject jsonObject = new JSONObject(); + jsonObject.put(JOB_ID, jobID); + response.getWriter().write(jsonObject.toString()); + statusValue = "success"; + } finally { + if (null != timerCreateBulkJob) { + MetricsConfig.timerCreateBulkJob = MetricsConfig.timerBCreateBulkJob.tag("status", statusValue).register(MetricsConfig.meterRegistry()); + timerCreateBulkJob.stop(MetricsConfig.timerCreateBulkJob); + } + } } diff --git a/src/main/java/com/autotune/analyzer/workerimpl/BulkJobManager.java b/src/main/java/com/autotune/analyzer/workerimpl/BulkJobManager.java index 48cc0026b..d032e2b50 100644 --- a/src/main/java/com/autotune/analyzer/workerimpl/BulkJobManager.java +++ b/src/main/java/com/autotune/analyzer/workerimpl/BulkJobManager.java @@ -15,7 +15,6 @@ *******************************************************************************/ package com.autotune.analyzer.workerimpl; - import com.autotune.analyzer.kruizeObject.RecommendationSettings; import com.autotune.analyzer.serviceObjects.*; import com.autotune.analyzer.utils.AnalyzerConstants; @@ -27,9 +26,11 @@ import com.autotune.operator.KruizeDeploymentInfo; import com.autotune.utils.GenericRestApiClient; import com.autotune.utils.KruizeConstants; +import com.autotune.utils.MetricsConfig; import com.autotune.utils.Utils; import com.fasterxml.jackson.core.JsonProcessingException; import com.google.gson.Gson; +import io.micrometer.core.instrument.Timer; import org.apache.http.conn.ConnectTimeoutException; import org.json.JSONObject; import org.slf4j.Logger; @@ -48,6 +49,7 @@ import java.util.*; import java.util.concurrent.ExecutorService; import java.util.concurrent.Executors; +import java.util.concurrent.TimeUnit; import java.util.regex.Matcher; import java.util.regex.Pattern; @@ -55,7 +57,6 @@ import static com.autotune.utils.KruizeConstants.KRUIZE_BULK_API.*; import static com.autotune.utils.KruizeConstants.KRUIZE_BULK_API.NotificationConstants.*; - /** * The `run` method processes bulk input to create experiments and generates resource optimization recommendations. * It handles the creation of experiment names based on various data source components, makes HTTP POST requests @@ -118,6 +119,9 @@ private static Map parseLabelString(String labelString) { @Override public void run() { + String statusValue = "failure"; + MetricsConfig.activeJobs.incrementAndGet(); + Timer.Sample timerRunJob = Timer.start(MetricsConfig.meterRegistry()); DataSourceMetadataInfo metadataInfo = null; DataSourceManager dataSourceManager = new DataSourceManager(); DataSourceInfo datasource = null; @@ -153,82 +157,110 @@ public void run() { } else { ExecutorService createExecutor = Executors.newFixedThreadPool(bulk_thread_pool_size); ExecutorService generateExecutor = Executors.newFixedThreadPool(bulk_thread_pool_size); - for (CreateExperimentAPIObject apiObject : createExperimentAPIObjectMap.values()) { - DataSourceInfo finalDatasource = datasource; - createExecutor.submit(() -> { - String experiment_name = apiObject.getExperimentName(); - BulkJobStatus.Experiment experiment = jobData.addExperiment(experiment_name); - try { - // send request to createExperiment API for experiment creation - GenericRestApiClient apiClient = new GenericRestApiClient(finalDatasource); - apiClient.setBaseURL(KruizeDeploymentInfo.experiments_url); - GenericRestApiClient.HttpResponseWrapper responseCode; - boolean expriment_exists = false; + try { + for (CreateExperimentAPIObject apiObject : createExperimentAPIObjectMap.values()) { + DataSourceInfo finalDatasource = datasource; + createExecutor.submit(() -> { + String experiment_name = apiObject.getExperimentName(); + BulkJobStatus.Experiment experiment = jobData.addExperiment(experiment_name); try { - responseCode = apiClient.callKruizeAPI("[" + new Gson().toJson(apiObject) + "]"); - LOGGER.debug("API Response code: {}", responseCode); - if (responseCode.getStatusCode() == HttpURLConnection.HTTP_CREATED) { - expriment_exists = true; - } else if (responseCode.getStatusCode() == HttpURLConnection.HTTP_CONFLICT) { - expriment_exists = true; - } else { - experiment.setNotification(new BulkJobStatus.Notification(BulkJobStatus.NotificationType.ERROR, responseCode.getResponseBody().toString(), responseCode.getStatusCode())); - } - } catch (Exception e) { - e.printStackTrace(); - experiment.setNotification(new BulkJobStatus.Notification(BulkJobStatus.NotificationType.ERROR, e.getMessage(), HttpURLConnection.HTTP_BAD_REQUEST)); - } finally { - if (!expriment_exists) { - LOGGER.info("Processing experiment {}", jobData.getProcessed_experiments()); - jobData.setProcessed_experiments(jobData.getProcessed_experiments() + 1); - } - synchronized (new Object()) { - if (jobData.getTotal_experiments() == jobData.getProcessed_experiments()) { - setFinalJobStatus(COMPLETED, null, null, finalDatasource); + // send request to createExperiment API for experiment creation + GenericRestApiClient apiClient = new GenericRestApiClient(finalDatasource); + apiClient.setBaseURL(KruizeDeploymentInfo.experiments_url); + GenericRestApiClient.HttpResponseWrapper responseCode; + boolean expriment_exists = false; + try { + responseCode = apiClient.callKruizeAPI("[" + new Gson().toJson(apiObject) + "]"); + LOGGER.debug("API Response code: {}", responseCode); + if (responseCode.getStatusCode() == HttpURLConnection.HTTP_CREATED) { + expriment_exists = true; + } else if (responseCode.getStatusCode() == HttpURLConnection.HTTP_CONFLICT) { + expriment_exists = true; + } else { + experiment.setNotification(new BulkJobStatus.Notification(BulkJobStatus.NotificationType.ERROR, responseCode.getResponseBody().toString(), responseCode.getStatusCode())); + } + } catch (Exception e) { + e.printStackTrace(); + experiment.setNotification(new BulkJobStatus.Notification(BulkJobStatus.NotificationType.ERROR, e.getMessage(), HttpURLConnection.HTTP_BAD_REQUEST)); + } finally { + if (!expriment_exists) { + LOGGER.info("Processing experiment {}", jobData.getProcessed_experiments()); + jobData.setProcessed_experiments(jobData.getProcessed_experiments() + 1); + } + synchronized (new Object()) { + if (jobData.getTotal_experiments() == jobData.getProcessed_experiments()) { + setFinalJobStatus(COMPLETED, null, null, finalDatasource); + } } } - } - if (expriment_exists) { - generateExecutor.submit(() -> { - // send request to generateRecommendations API - GenericRestApiClient recommendationApiClient = new GenericRestApiClient(finalDatasource); - String encodedExperimentName; - encodedExperimentName = URLEncoder.encode(experiment_name, StandardCharsets.UTF_8); - recommendationApiClient.setBaseURL(String.format(KruizeDeploymentInfo.recommendations_url, encodedExperimentName)); - GenericRestApiClient.HttpResponseWrapper recommendationResponseCode = null; - try { - recommendationResponseCode = recommendationApiClient.callKruizeAPI(null); - LOGGER.debug("API Response code: {}", recommendationResponseCode); - if (recommendationResponseCode.getStatusCode() == HttpURLConnection.HTTP_CREATED) { - experiment.getRecommendations().setStatus(NotificationConstants.Status.PROCESSED); - } else { + if (expriment_exists) { + generateExecutor.submit(() -> { + // send request to generateRecommendations API + GenericRestApiClient recommendationApiClient = new GenericRestApiClient(finalDatasource); + String encodedExperimentName; + encodedExperimentName = URLEncoder.encode(experiment_name, StandardCharsets.UTF_8); + recommendationApiClient.setBaseURL(String.format(KruizeDeploymentInfo.recommendations_url, encodedExperimentName)); + GenericRestApiClient.HttpResponseWrapper recommendationResponseCode = null; + try { + recommendationResponseCode = recommendationApiClient.callKruizeAPI(null); + LOGGER.debug("API Response code: {}", recommendationResponseCode); + if (recommendationResponseCode.getStatusCode() == HttpURLConnection.HTTP_CREATED) { + experiment.getRecommendations().setStatus(NotificationConstants.Status.PROCESSED); + } else { + experiment.getRecommendations().setStatus(NotificationConstants.Status.FAILED); + experiment.setNotification(new BulkJobStatus.Notification(BulkJobStatus.NotificationType.ERROR, recommendationResponseCode.getResponseBody().toString(), recommendationResponseCode.getStatusCode())); + } + } catch (Exception e) { + e.printStackTrace(); experiment.getRecommendations().setStatus(NotificationConstants.Status.FAILED); - experiment.setNotification(new BulkJobStatus.Notification(BulkJobStatus.NotificationType.ERROR, recommendationResponseCode.getResponseBody().toString(), recommendationResponseCode.getStatusCode())); - } - } catch (Exception e) { - e.printStackTrace(); - experiment.getRecommendations().setStatus(NotificationConstants.Status.FAILED); - experiment.getRecommendations().setNotifications(new BulkJobStatus.Notification(BulkJobStatus.NotificationType.ERROR, e.getMessage(), HttpURLConnection.HTTP_INTERNAL_ERROR)); - } finally { - jobData.setProcessed_experiments(jobData.getProcessed_experiments() + 1); - synchronized (new Object()) { - if (jobData.getTotal_experiments() == jobData.getProcessed_experiments()) { - setFinalJobStatus(COMPLETED, null, null, finalDatasource); + experiment.getRecommendations().setNotifications(new BulkJobStatus.Notification(BulkJobStatus.NotificationType.ERROR, e.getMessage(), HttpURLConnection.HTTP_INTERNAL_ERROR)); + } finally { + jobData.setProcessed_experiments(jobData.getProcessed_experiments() + 1); + synchronized (new Object()) { + if (jobData.getTotal_experiments() == jobData.getProcessed_experiments()) { + setFinalJobStatus(COMPLETED, null, null, finalDatasource); + } } } - } - }); - } - } catch (Exception e) { - e.printStackTrace(); - experiment.setNotification(new BulkJobStatus.Notification(BulkJobStatus.NotificationType.ERROR, e.getMessage(), HttpURLConnection.HTTP_INTERNAL_ERROR)); - jobData.setProcessed_experiments(jobData.getProcessed_experiments() + 1); - if (jobData.getTotal_experiments() == jobData.getProcessed_experiments()) { - setFinalJobStatus(COMPLETED, null, null, finalDatasource); + }); + } + } catch (Exception e) { + e.printStackTrace(); + experiment.setNotification(new BulkJobStatus.Notification(BulkJobStatus.NotificationType.ERROR, e.getMessage(), HttpURLConnection.HTTP_INTERNAL_ERROR)); + jobData.setProcessed_experiments(jobData.getProcessed_experiments() + 1); + if (jobData.getTotal_experiments() == jobData.getProcessed_experiments()) { + setFinalJobStatus(COMPLETED, null, null, finalDatasource); + } } + }); + } + } finally { + // Shutdown createExecutor and wait for it to finish + createExecutor.shutdown(); + while (!createExecutor.isTerminated()) { + try { + createExecutor.awaitTermination(1, TimeUnit.MINUTES); + } catch (InterruptedException e) { + Thread.currentThread().interrupt(); + break; } - }); + } + + // Shutdown generateExecutor and wait for it to finish + generateExecutor.shutdown(); + while (!generateExecutor.isTerminated()) { + try { + generateExecutor.awaitTermination(1, TimeUnit.MINUTES); + } catch (InterruptedException e) { + Thread.currentThread().interrupt(); + break; + } + } + + if (jobData.getTotal_experiments() == jobData.getProcessed_experiments()) { + statusValue = "success"; + } } } } @@ -249,6 +281,12 @@ public void run() { LOGGER.error(e.getMessage()); e.printStackTrace(); setFinalJobStatus(FAILED, String.valueOf(HttpURLConnection.HTTP_INTERNAL_ERROR), new BulkJobStatus.Notification(BulkJobStatus.NotificationType.ERROR, e.getMessage(), HttpURLConnection.HTTP_INTERNAL_ERROR), datasource); + } finally { + if (null != timerRunJob) { + MetricsConfig.timerRunJob = MetricsConfig.timerBRunJob.tag("status", statusValue).register(MetricsConfig.meterRegistry()); + timerRunJob.stop(MetricsConfig.timerRunJob); + } + MetricsConfig.activeJobs.decrementAndGet(); } } @@ -286,34 +324,43 @@ public void setFinalJobStatus(String status, String notificationKey, BulkJobStat } Map getExperimentMap(String labelString, BulkJobStatus jobData, DataSourceMetadataInfo metadataInfo, DataSourceInfo datasource) throws Exception { - Map createExperimentAPIObjectMap = new HashMap<>(); - Collection dataSourceCollection = metadataInfo.getDataSourceHashMap().values(); - for (DataSource ds : dataSourceCollection) { - HashMap clusterHashMap = ds.getDataSourceClusterHashMap(); - for (DataSourceCluster dsc : clusterHashMap.values()) { - HashMap namespaceHashMap = dsc.getDataSourceNamespaceHashMap(); - for (DataSourceNamespace namespace : namespaceHashMap.values()) { - HashMap dataSourceWorkloadHashMap = namespace.getDataSourceWorkloadHashMap(); - if (dataSourceWorkloadHashMap != null) { - for (DataSourceWorkload dsw : dataSourceWorkloadHashMap.values()) { - HashMap dataSourceContainerHashMap = dsw.getDataSourceContainerHashMap(); - if (dataSourceContainerHashMap != null) { - for (DataSourceContainer dc : dataSourceContainerHashMap.values()) { - // Experiment name - dynamically constructed - String experiment_name = frameExperimentName(labelString, dsc, namespace, dsw, dc); - // create JSON to be passed in the createExperimentAPI - List createExperimentAPIObjectList = new ArrayList<>(); - CreateExperimentAPIObject apiObject = prepareCreateExperimentJSONInput(dc, dsc, dsw, namespace, - experiment_name, createExperimentAPIObjectList); - createExperimentAPIObjectMap.put(experiment_name, apiObject); + String statusValue = "failure"; + Timer.Sample timerGetExpMap = Timer.start(MetricsConfig.meterRegistry()); + try { + Map createExperimentAPIObjectMap = new HashMap<>(); + Collection dataSourceCollection = metadataInfo.getDataSourceHashMap().values(); + for (DataSource ds : dataSourceCollection) { + HashMap clusterHashMap = ds.getDataSourceClusterHashMap(); + for (DataSourceCluster dsc : clusterHashMap.values()) { + HashMap namespaceHashMap = dsc.getDataSourceNamespaceHashMap(); + for (DataSourceNamespace namespace : namespaceHashMap.values()) { + HashMap dataSourceWorkloadHashMap = namespace.getDataSourceWorkloadHashMap(); + if (dataSourceWorkloadHashMap != null) { + for (DataSourceWorkload dsw : dataSourceWorkloadHashMap.values()) { + HashMap dataSourceContainerHashMap = dsw.getDataSourceContainerHashMap(); + if (dataSourceContainerHashMap != null) { + for (DataSourceContainer dc : dataSourceContainerHashMap.values()) { + // Experiment name - dynamically constructed + String experiment_name = frameExperimentName(labelString, dsc, namespace, dsw, dc); + // create JSON to be passed in the createExperimentAPI + List createExperimentAPIObjectList = new ArrayList<>(); + CreateExperimentAPIObject apiObject = prepareCreateExperimentJSONInput(dc, dsc, dsw, namespace, + experiment_name, createExperimentAPIObjectList); + createExperimentAPIObjectMap.put(experiment_name, apiObject); + } } } } } } } + return createExperimentAPIObjectMap; + } finally { + if (null != timerGetExpMap) { + MetricsConfig.timerGetExpMap = MetricsConfig.timerBGetExpMap.tag("status", statusValue).register(MetricsConfig.meterRegistry()); + timerGetExpMap.stop(MetricsConfig.timerGetExpMap); + } } - return createExperimentAPIObjectMap; } private String getLabels(BulkInput.FilterWrapper filter) { diff --git a/src/main/java/com/autotune/common/datasource/DataSourceManager.java b/src/main/java/com/autotune/common/datasource/DataSourceManager.java index a8401970c..a71d80083 100644 --- a/src/main/java/com/autotune/common/datasource/DataSourceManager.java +++ b/src/main/java/com/autotune/common/datasource/DataSourceManager.java @@ -24,6 +24,8 @@ import com.autotune.database.dao.ExperimentDAOImpl; import com.autotune.database.service.ExperimentDBService; import com.autotune.utils.KruizeConstants; +import com.autotune.utils.MetricsConfig; +import io.micrometer.core.instrument.Timer; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -65,15 +67,25 @@ public DataSourceManager() { * @return */ public DataSourceMetadataInfo importMetadataFromDataSource(DataSourceInfo dataSourceInfo, String uniqueKey, long startTime, long endTime, int steps) throws DataSourceDoesNotExist, IOException, NoSuchAlgorithmException, KeyStoreException, KeyManagementException { - if (null == dataSourceInfo) { - throw new DataSourceDoesNotExist(KruizeConstants.DataSourceConstants.DataSourceErrorMsgs.MISSING_DATASOURCE_INFO); - } - DataSourceMetadataInfo dataSourceMetadataInfo = dataSourceMetadataOperator.createDataSourceMetadata(dataSourceInfo, uniqueKey, startTime, endTime, steps); - if (null == dataSourceMetadataInfo) { - LOGGER.error(KruizeConstants.DataSourceConstants.DataSourceMetadataErrorMsgs.DATASOURCE_METADATA_INFO_NOT_AVAILABLE, "for datasource {}" + dataSourceInfo.getName()); - return null; + String statusValue = "failure"; + io.micrometer.core.instrument.Timer.Sample timerImportMetadata = Timer.start(MetricsConfig.meterRegistry()); + try { + if (null == dataSourceInfo) { + throw new DataSourceDoesNotExist(KruizeConstants.DataSourceConstants.DataSourceErrorMsgs.MISSING_DATASOURCE_INFO); + } + DataSourceMetadataInfo dataSourceMetadataInfo = dataSourceMetadataOperator.createDataSourceMetadata(dataSourceInfo, uniqueKey, startTime, endTime, steps); + if (null == dataSourceMetadataInfo) { + LOGGER.error(KruizeConstants.DataSourceConstants.DataSourceMetadataErrorMsgs.DATASOURCE_METADATA_INFO_NOT_AVAILABLE, "for datasource {}" + dataSourceInfo.getName()); + return null; + } + statusValue = "success"; + return dataSourceMetadataInfo; + } finally { + if (null != timerImportMetadata) { + MetricsConfig.timerImportMetadata = MetricsConfig.timerBImportMetadata.tag("status", statusValue).register(MetricsConfig.meterRegistry()); + timerImportMetadata.stop(MetricsConfig.timerImportMetadata); + } } - return dataSourceMetadataInfo; } /** @@ -84,6 +96,8 @@ public DataSourceMetadataInfo importMetadataFromDataSource(DataSourceInfo dataSo * @throws DataSourceDoesNotExist Thrown when the provided data source information is null. */ public DataSourceMetadataInfo getMetadataFromDataSource(DataSourceInfo dataSource) { + String statusValue = "failure"; + io.micrometer.core.instrument.Timer.Sample timerGetMetadata = Timer.start(MetricsConfig.meterRegistry()); try { if (null == dataSource) { throw new DataSourceDoesNotExist(KruizeConstants.DataSourceConstants.DataSourceErrorMsgs.MISSING_DATASOURCE_INFO); @@ -94,11 +108,17 @@ public DataSourceMetadataInfo getMetadataFromDataSource(DataSourceInfo dataSourc LOGGER.error(KruizeConstants.DataSourceConstants.DataSourceMetadataErrorMsgs.DATASOURCE_METADATA_INFO_NOT_AVAILABLE, "for datasource {}" + dataSourceName); return null; } + statusValue = "success"; return dataSourceMetadataInfo; } catch (DataSourceDoesNotExist e) { LOGGER.error(e.getMessage()); } catch (Exception e) { LOGGER.error("Loading saved datasource metadata failed: {} ", e.getMessage()); + } finally { + if (null != timerGetMetadata) { + MetricsConfig.timerGetMetadata = MetricsConfig.timerBGetMetadata.tag("status", statusValue).register(MetricsConfig.meterRegistry()); + timerGetMetadata.stop(MetricsConfig.timerGetMetadata); + } } return null; } diff --git a/src/main/java/com/autotune/utils/MetricsConfig.java b/src/main/java/com/autotune/utils/MetricsConfig.java index 002d1411a..b7afa12f7 100644 --- a/src/main/java/com/autotune/utils/MetricsConfig.java +++ b/src/main/java/com/autotune/utils/MetricsConfig.java @@ -1,6 +1,8 @@ package com.autotune.utils; import io.micrometer.core.instrument.Counter; +import io.micrometer.core.instrument.Gauge; +import io.micrometer.core.instrument.Metrics; import io.micrometer.core.instrument.Timer; import io.micrometer.core.instrument.binder.jvm.ClassLoaderMetrics; import io.micrometer.core.instrument.binder.jvm.JvmGcMetrics; @@ -10,6 +12,8 @@ import io.micrometer.prometheus.PrometheusConfig; import io.micrometer.prometheus.PrometheusMeterRegistry; +import java.util.concurrent.atomic.AtomicInteger; + public class MetricsConfig { public static Timer timerListRec, timerListExp, timerCreateExp, timerUpdateResults, timerUpdateRecomendations; @@ -17,20 +21,26 @@ public class MetricsConfig { public static Timer timerLoadAllRec, timerLoadAllExp, timerLoadAllResults; public static Timer timerAddRecDB, timerAddResultsDB, timerAddExpDB, timerAddBulkResultsDB; public static Timer timerAddPerfProfileDB, timerLoadPerfProfileName, timerLoadAllPerfProfiles; - public static Counter timerKruizeNotifications; + public static Timer timerImportMetadata, timerGetMetadata; + public static Timer timerJobStatus, timerCreateBulkJob, timerGetExpMap, timerCreateBulkExp, timerGenerateBulkRec, timerRunJob; + public static Counter timerKruizeNotifications , timerBulkJobs; public static Timer.Builder timerBListRec, timerBListExp, timerBCreateExp, timerBUpdateResults, timerBUpdateRecommendations; public static Timer.Builder timerBLoadRecExpName, timerBLoadResultsExpName, timerBLoadExpName, timerBLoadRecExpNameDate, timerBBoxPlots; public static Timer.Builder timerBLoadAllRec, timerBLoadAllExp, timerBLoadAllResults; public static Timer.Builder timerBAddRecDB, timerBAddResultsDB, timerBAddExpDB, timerBAddBulkResultsDB; public static Timer.Builder timerBAddPerfProfileDB, timerBLoadPerfProfileName, timerBLoadAllPerfProfiles; - public static Counter.Builder timerBKruizeNotifications; + public static Counter.Builder timerBKruizeNotifications, timerBBulkJobs; public static PrometheusMeterRegistry meterRegistry; public static Timer timerListDS, timerImportDSMetadata, timerListDSMetadata; public static Timer.Builder timerBListDS, timerBImportDSMetadata, timerBListDSMetadata; + public static Timer.Builder timerBImportMetadata, timerBGetMetadata; + public static Timer.Builder timerBJobStatus, timerBCreateBulkJob, timerBGetExpMap, timerBCreateBulkExp, timerBGenerateBulkRec, timerBRunJob; private static MetricsConfig INSTANCE; public String API_METRIC_DESC = "Time taken for Kruize APIs"; public String DB_METRIC_DESC = "Time taken for KruizeDB methods"; public String METHOD_METRIC_DESC = "Time taken for Kruize methods"; + public static final AtomicInteger activeJobs = new AtomicInteger(0); + public static Gauge.Builder timerBBulkRunJobs; private MetricsConfig() { meterRegistry = new PrometheusMeterRegistry(PrometheusConfig.DEFAULT); @@ -62,12 +72,21 @@ private MetricsConfig() { timerBImportDSMetadata = Timer.builder("kruizeAPI").description(API_METRIC_DESC).tag("api", "dsmetadata").tag("method", "POST"); timerBListDSMetadata = Timer.builder("kruizeAPI").description(API_METRIC_DESC).tag("api", "dsmetadata").tag("method", "GET"); timerBKruizeNotifications = Counter.builder("KruizeNotifications").description("Kruize notifications").tag("api", "updateRecommendations"); + + timerBImportMetadata = Timer.builder("kruizeAPI").description(API_METRIC_DESC).tag("api", "datasources").tag("method", "importMetadata"); + timerBGetMetadata = Timer.builder("kruizeAPI").description(API_METRIC_DESC).tag("api", "datasources").tag("method", "getMetadata"); + timerBJobStatus = Timer.builder("kruizeAPI").description(API_METRIC_DESC).tag("api", "bulk").tag("method", "jobStatus"); + timerBCreateBulkJob = Timer.builder("kruizeAPI").description(API_METRIC_DESC).tag("api", "bulk").tag("method", "createBulkJob"); + timerBGetExpMap = Timer.builder("kruizeAPI").description(API_METRIC_DESC).tag("api", "bulk").tag("method", "getExperimentMap"); + timerBRunJob = Timer.builder("kruizeAPI").description(API_METRIC_DESC).tag("api", "bulk").tag("method", "runBulkJob"); + timerBBulkRunJobs = Gauge.builder("kruizeAPI_active_jobs_count", activeJobs, AtomicInteger::get).description("No.of bulk jobs running").tags("api", "bulk", "method", "runBulkJob" , "status", "running"); + timerBBulkRunJobs.register(meterRegistry); + new ClassLoaderMetrics().bindTo(meterRegistry); new ProcessorMetrics().bindTo(meterRegistry); new JvmGcMetrics().bindTo(meterRegistry); new JvmMemoryMetrics().bindTo(meterRegistry); meterRegistry.config().namingConvention(NamingConvention.dot); - } public static PrometheusMeterRegistry meterRegistry() {