Skip to content

Commit

Permalink
fix conflicts
Browse files Browse the repository at this point in the history
Signed-off-by: Saad Khan <[email protected]>
  • Loading branch information
khansaad committed Nov 14, 2024
2 parents a2c4e34 + 3f68ee7 commit 103d548
Show file tree
Hide file tree
Showing 19 changed files with 1,001 additions and 878 deletions.
62 changes: 1 addition & 61 deletions .github/workflows/test-on-pr.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -13,68 +13,8 @@ on:
workflow_dispatch:

jobs:
# This workflow contains two jobs called "build autotune" and "build crc"
build_autotune:
# The type of runner that the job will run on
runs-on: ubuntu-20.04

steps:
# Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it
- uses: actions/checkout@v3
- name: Setup Minikube
uses: manusa/[email protected]
with:
minikube version: 'v1.16.0'
kubernetes version: 'v1.19.2'
- name: Build autotune
run: |
echo Build autotune
pr_number=${{ github.event.pull_request.number }}
echo "pr_number=${pr_number}" >> "$GITHUB_ENV"
./build.sh -i autotune_operator:pr_${pr_number}
docker images | grep autotune
- name: Check cluster info on minikube
run: |
kubectl cluster-info
kubectl get pods -n kube-system
- name: Install Prometheus on minikube
run: |
echo Install Prometheus on minikube
cd scripts
./prometheus_on_minikube.sh -as
- name: Deploy kruize in experiment mode
run: |
echo Deploy kruize in experiment mode
cp ./manifests/autotune/autotune-operator-deployment.yaml_template ./manifests/autotune/autotune-operator-deployment.yaml_template.old
sed -e "s/imagePullPolicy: Always/imagePullPolicy: IfNotPresent/g" ./manifests/autotune/autotune-operator-deployment.yaml_template.old > ./manifests/autotune/autotune-operator-deployment.yaml_template
echo "***************************************************************"
cat ./manifests/autotune/autotune-operator-deployment.yaml_template
echo "***************************************************************"
echo "PR_NUMBER = ${{ env.pr_number }}"
./deploy.sh -c minikube -i autotune_operator:pr_${{ env.pr_number }}
sleep 20
- name: Capture ffdc logs
if: always()
run: |
echo "Capturing ffdc logs"
./scripts/ffdc.sh -d ${GITHUB_WORKSPACE}
- name: Archive results
if: always()
run: |
cd ${GITHUB_WORKSPACE}
tar cvf autotune_results.tar kruize_*log.txt
- name: Upload results
if: always()
uses: actions/upload-artifact@v3
with:
name: autotune-results
path: ./autotune_results.tar
retention-days: 2


# This workflow builds the kruize image and runs an end-to-end test to validate the remote monitoring workflow
build_crc:
# The type of runner that the job will run on
runs-on: ubuntu-20.04
Expand Down
4 changes: 2 additions & 2 deletions Dockerfile.autotune
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
##########################################################
# Build Docker Image
##########################################################
FROM registry.access.redhat.com/ubi9/ubi-minimal:9.4 as mvnbuild-jdk21
FROM registry.access.redhat.com/ubi9/ubi-minimal:9.5 as mvnbuild-jdk21
ARG USER=autotune
ARG AUTOTUNE_HOME=/home/$USER

Expand Down Expand Up @@ -48,7 +48,7 @@ RUN jlink --strip-debug --compress 2 --no-header-files --no-man-pages --module-p
# Runtime Docker Image
##########################################################
# Use ubi-minimal as the base image
FROM registry.access.redhat.com/ubi9/ubi-minimal:9.4
FROM registry.access.redhat.com/ubi9/ubi-minimal:9.5

ARG AUTOTUNE_VERSION
ARG USER=autotune
Expand Down
182 changes: 92 additions & 90 deletions design/BulkAPI.md
Original file line number Diff line number Diff line change
Expand Up @@ -109,79 +109,84 @@ GET /bulk?job_id=123e4567-e89b-12d3-a456-426614174000&verbose=true
**Body (JSON):**
When `verbose=true` is passed, the response includes additional detailed information about the job.

Example 1 (job in progress, with per-experiment recommendation status):

```json
{
"status": "IN_PROGRESS",
"total_experiments": 23,
"processed_experiments": 22,
"data": {
"experiments": {
"new": [
"prometheus-1|default|monitoring|node-exporter(daemonset)|node-exporter",
"prometheus-1|default|cadvisor|cadvisor(daemonset)|cadvisor",
"prometheus-1|default|monitoring|alertmanager-main(statefulset)|config-reloader",
"prometheus-1|default|monitoring|alertmanager-main(statefulset)|alertmanager",
"prometheus-1|default|monitoring|prometheus-operator(deployment)|kube-rbac-proxy",
"prometheus-1|default|kube-system|coredns(deployment)|coredns",
"prometheus-1|default|monitoring|prometheus-k8s(statefulset)|config-reloader",
"prometheus-1|default|monitoring|blackbox-exporter(deployment)|kube-rbac-proxy",
"prometheus-1|default|monitoring|prometheus-operator(deployment)|prometheus-operator",
"prometheus-1|default|monitoring|node-exporter(daemonset)|kube-rbac-proxy",
"prometheus-1|default|monitoring|kube-state-metrics(deployment)|kube-rbac-proxy-self",
"prometheus-1|default|monitoring|kube-state-metrics(deployment)|kube-state-metrics",
"prometheus-1|default|monitoring|kruize(deployment)|kruize",
"prometheus-1|default|monitoring|blackbox-exporter(deployment)|module-configmap-reloader",
"prometheus-1|default|monitoring|prometheus-k8s(statefulset)|prometheus",
"prometheus-1|default|monitoring|kube-state-metrics(deployment)|kube-rbac-proxy-main",
"prometheus-1|default|kube-system|kube-proxy(daemonset)|kube-proxy",
"prometheus-1|default|monitoring|prometheus-adapter(deployment)|prometheus-adapter",
"prometheus-1|default|monitoring|grafana(deployment)|grafana",
"prometheus-1|default|kube-system|kindnet(daemonset)|kindnet-cni",
"prometheus-1|default|monitoring|kruize-db-deployment(deployment)|kruize-db",
"prometheus-1|default|monitoring|blackbox-exporter(deployment)|blackbox-exporter"
],
"updated": [],
"failed": null
"job_id": "5798a2df-6c67-467b-a3c2-befe634a0e3a",
"job_start_time": "2024-10-09T18:09:31.549Z",
"job_end_time": null,
"experiments": [
{
"name": "prometheus-1|default|kube-system|coredns(deployment)|coredns",
"recommendations": {
"status": "unprocessed"
}
},
"recommendations": {
"data": {
"processed": [
"prometheus-1|default|monitoring|alertmanager-main(statefulset)|config-reloader",
"prometheus-1|default|monitoring|node-exporter(daemonset)|node-exporter",
"prometheus-1|default|local-path-storage|local-path-provisioner(deployment)|local-path-provisioner",
"prometheus-1|default|monitoring|alertmanager-main(statefulset)|alertmanager",
"prometheus-1|default|monitoring|prometheus-operator(deployment)|kube-rbac-proxy",
"prometheus-1|default|kube-system|coredns(deployment)|coredns",
"prometheus-1|default|monitoring|blackbox-exporter(deployment)|kube-rbac-proxy",
"prometheus-1|default|monitoring|prometheus-k8s(statefulset)|config-reloader",
"prometheus-1|default|monitoring|prometheus-operator(deployment)|prometheus-operator",
"prometheus-1|default|monitoring|node-exporter(daemonset)|kube-rbac-proxy",
"prometheus-1|default|monitoring|kube-state-metrics(deployment)|kube-rbac-proxy-self",
"prometheus-1|default|monitoring|kube-state-metrics(deployment)|kube-state-metrics",
"prometheus-1|default|monitoring|kruize(deployment)|kruize",
"prometheus-1|default|monitoring|blackbox-exporter(deployment)|module-configmap-reloader",
"prometheus-1|default|monitoring|prometheus-k8s(statefulset)|prometheus",
"prometheus-1|default|monitoring|kube-state-metrics(deployment)|kube-rbac-proxy-main",
"prometheus-1|default|kube-system|kube-proxy(daemonset)|kube-proxy",
"prometheus-1|default|monitoring|prometheus-adapter(deployment)|prometheus-adapter",
"prometheus-1|default|monitoring|grafana(deployment)|grafana",
"prometheus-1|default|kube-system|kindnet(daemonset)|kindnet-cni",
"prometheus-1|default|monitoring|kruize-db-deployment(deployment)|kruize-db",
"prometheus-1|default|monitoring|blackbox-exporter(deployment)|blackbox-exporter"
],
"processing": [
"prometheus-1|default|cadvisor|cadvisor(daemonset)|cadvisor"
],
"unprocessed": [
],
"failed": []
{
"name": "prometheus-1|default|kube-system|kindnet(deployment)|kindnet-cni",
"recommendations": {
"status": "processed"
}
},
{
"name": "prometheus-1|default|monitoring|kruize(deployment)|kruize",
"recommendations": {
"status": "processing"
}
},
{
"name": "prometheus-1|default|monitoring|kruize(deployment)|kruize",
"recommendations": {
"status": "failed",
"notifications": {
"400": {
"type": "error",
"message": "Not able to fetch metrics",
"code": 400
}
}
}
},
{
"name": "prometheus-1|default|monitoring|kruize(deployment)|kruize",
"recommendations": {
"status": "failed",
"notifications": {
"400": {
"type": "error",
"message": "Not able to fetch metrics",
"code": 400
}
}
}
}
]
}
```

Example 2 (job failed, with a job-level notification):

```json
{
"status": "FAILED",
"total_experiments": 0,
"processed_experiments": 0,
"notifications": {
"503": {
"type": "ERROR",
"message": "HttpHostConnectException: Unable to connect to the data source. Please try again later. (receive series from Addr: 10.96.192.138:10901 LabelSets: {prometheus=\"monitoring/k8stage\", prometheus_replica=\"prometheus-k8stage-0\"},{prometheus=\"monitoring/k8stage\", prometheus_replica=\"prometheus-k8stage-1\"},{replica=\"thanos-ruler-0\", ruler_cluster=\"\"} MinTime: 1730222825216 MaxTime: 1731412800000: rpc error: code = Unknown desc = receive series from 01JBV2JN5SVN84D3HD5MVSGN3A: load chunks: get range reader: Please reduce your request rate)",
"code": 503
}
},
"job_id": "5798a2df-6c67-467b-a3c2-befe634a0e3a",
"job_start_time": "2024-10-09T18:09:31.549Z",
"job_end_time": null
"job_id": "270fa4d9-2701-4ca0-b056-74229cc28498",
"job_start_time": "2024-11-12T15:05:46.362Z",
"job_end_time": "2024-11-12T15:06:05.301Z"
}

```

### Response Parameters
Expand All @@ -205,40 +210,37 @@ resource optimization in Kubernetes environments. Below is a breakdown of the JS
- **Type**: `Integer`
- **Description**: Number of experiments that have been processed so far.

- **data**:
- **Type**: `Object`
- **Description**: Contains detailed information about the experiments and recommendations being processed.
- **experiments**:
- **Type**: `Array`
- **Description**: Array of experiment objects, each containing details about individual experiments.

- Each object in the `experiments` array has the following structure:

- **experiments**:
- **new**:
- **Type**: `Array of Strings`
- **Description**: List of new experiments that have been identified but not yet processed.
| Field | Type | Description |
|-------------------------|--------------|--------------------------------------------------------------------------|
| `name` | `string` | Name of the experiment, typically indicating a service name and deployment context. |
| `notifications` | `object` | Notifications specific to this experiment (if any). |
| `recommendations` | `object` | Recommendation status and notifications specific to this experiment. |

- **updated**:
- **Type**: `Array of Strings`
- **Description**: List of experiments that were previously processed but have now been updated.
#### Recommendation Object

- **failed**:
- **Type**: `null or Array`
- **Description**: List of experiments that failed during processing. If no failures, the value is `null`.
The `recommendations` field within each experiment provides information about the recommendation processing status and
any errors encountered.

- **recommendations**:
- **data**:
- **processed**:
- **Type**: `Array of Strings`
- **Description**: List of experiments for which recommendations have already been processed.
| Field | Type | Description |
|-------------------------|--------------|--------------------------------------------------------------------------|
| `status` | `string` | Status of the recommendation (e.g., `"unprocessed"`, `"processed"`, `"processing"`, `"failed"`). |
| `notifications` | `object` | Notifications related to recommendation processing. |

- **processing**:
- **Type**: `Array of Strings`
- **Description**: List of experiments that are currently being processed for recommendations.
#### Notification Object

- **unprocessed**:
- **Type**: `Array of Strings`
- **Description**: List of experiments that have not yet been processed for recommendations.
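Both the `notifications` and `recommendations.notifications` fields may contain error messages or warnings. Each is an object keyed by the notification code (for example, `"400"`), and every entry has the following fields: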

- **failed**:
- **Type**: `Array of Strings`
- **Description**: List of experiments for which the recommendation process failed.
| Field | Type | Description |
|-------------------------|--------------|----------------------------------------------------------------------------|
| `type` | `string` | Type of notification (e.g., `"info"`, `"error"`, `"warning"`). |
| `message` | `string` | Description of the notification message. |
| `code` | `integer` | HTTP-like code indicating the type of error (e.g., `400` for bad request). |

- **job_id**:
- **Type**: `String`
Expand Down
2 changes: 1 addition & 1 deletion pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
<properties>
<fabric8-version>4.13.2</fabric8-version>
<org-json-version>20240303</org-json-version>
<jetty-version>9.4.55.v20240627</jetty-version>
<jetty-version>10.0.24</jetty-version>
<slf4j-version>2.17.1</slf4j-version>
<java-version>17</java-version>
<prometheus-simpleclient>0.14.1</prometheus-simpleclient>
Expand Down
25 changes: 20 additions & 5 deletions src/main/java/com/autotune/Autotune.java
Original file line number Diff line number Diff line change
Expand Up @@ -24,14 +24,20 @@
import com.autotune.analyzer.utils.AnalyzerConstants;
import com.autotune.common.datasource.DataSourceCollection;
import com.autotune.common.datasource.DataSourceInfo;
import com.autotune.common.exceptions.datasource.DataSourceAlreadyExist;
import com.autotune.common.exceptions.datasource.DataSourceNotServiceable;
import com.autotune.common.exceptions.datasource.UnsupportedDataSourceProvider;
import com.autotune.database.helper.DBConstants;
import com.autotune.database.init.KruizeHibernateUtil;
import com.autotune.experimentManager.core.ExperimentManager;
import com.autotune.operator.InitializeDeployment;
import com.autotune.operator.KruizeDeploymentInfo;
import com.autotune.service.HealthService;
import com.autotune.service.InitiateListener;
import com.autotune.utils.*;
import com.autotune.utils.CloudWatchAppender;
import com.autotune.utils.KruizeConstants;
import com.autotune.utils.MetricsConfig;
import com.autotune.utils.ServerContext;
import com.autotune.utils.filter.KruizeCORSFilter;
import io.prometheus.client.exporter.MetricsServlet;
import io.prometheus.client.hotspot.DefaultExports;
Expand All @@ -50,12 +56,17 @@

import javax.servlet.DispatcherType;
import java.io.File;
import java.io.IOException;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.security.KeyManagementException;
import java.security.KeyStoreException;
import java.security.NoSuchAlgorithmException;
import java.util.EnumSet;
import java.util.HashMap;
import java.util.Scanner;

import static com.autotune.utils.KruizeConstants.DataSourceConstants.DataSourceErrorMsgs.DATASOURCE_CONNECTION_FAILED;
import static com.autotune.utils.ServerContext.*;

public class Autotune {
Expand Down Expand Up @@ -112,7 +123,11 @@ public static void main(String[] args) {
// load available datasources from db
loadDataSourcesFromDB();
// setting up DataSources
setUpDataSources();
try {
setUpDataSources();
} catch (Exception e) {
LOGGER.error(DATASOURCE_CONNECTION_FAILED, e.getMessage());
}
// checking available DataSources
checkAvailableDataSources();
// load available metric profiles from db
Expand All @@ -124,7 +139,7 @@ public static void main(String[] args) {
//Regenerate a Hibernate session following the creation of new tables
KruizeHibernateUtil.buildSessionFactory();
} catch (Exception | K8sTypeNotSupportedException | MonitoringAgentNotSupportedException |
MonitoringAgentNotFoundException e) {
MonitoringAgentNotFoundException e) {
e.printStackTrace();
System.exit(1);
}
Expand Down Expand Up @@ -170,7 +185,7 @@ public static void main(String[] args) {
/**
* Set up the data sources available at installation time from config file
*/
private static void setUpDataSources() {
private static void setUpDataSources() throws UnsupportedDataSourceProvider, DataSourceNotServiceable, DataSourceAlreadyExist, IOException, NoSuchAlgorithmException, KeyStoreException, KeyManagementException {
DataSourceCollection dataSourceCollection = DataSourceCollection.getInstance();
dataSourceCollection.addDataSourcesFromConfigFile(KruizeConstants.CONFIG_FILE);
}
Expand All @@ -190,7 +205,7 @@ private static void checkAvailableDataSources() {
DataSourceCollection dataSourceCollection = DataSourceCollection.getInstance();
LOGGER.info(KruizeConstants.DataSourceConstants.DataSourceInfoMsgs.CHECKING_AVAILABLE_DATASOURCE);
HashMap<String, DataSourceInfo> dataSources = dataSourceCollection.getDataSourcesCollection();
for (String name: dataSources.keySet()) {
for (String name : dataSources.keySet()) {
DataSourceInfo dataSource = dataSources.get(name);
String dataSourceName = dataSource.getName();
String url = dataSource.getUrl().toString();
Expand Down
Loading

0 comments on commit 103d548

Please sign in to comment.