From 2d98f7a12ed23ad66bb00d558468f8c45d6eb680 Mon Sep 17 00:00:00 2001 From: Justin Brooks Date: Thu, 19 Oct 2023 19:36:00 -0400 Subject: [PATCH] Add otel! (#36) Co-authored-by: George Scott --- charts/operator-wandb/Chart.lock | 9 +- charts/operator-wandb/Chart.yaml | 6 +- .../charts/app/templates/deployment.yaml | 8 + .../charts/app/templates/service.yaml | 7 +- .../operator-wandb/charts/console/values.yaml | 8 +- charts/operator-wandb/charts/otel/.helmignore | 23 +++ charts/operator-wandb/charts/otel/Chart.yaml | 15 ++ charts/operator-wandb/charts/otel/README.md | 7 + .../charts/otel/templates/_config.tpl | 93 +++++++++ .../charts/otel/templates/_helpers.tpl | 102 ++++++++++ .../charts/otel/templates/_receivers.tpl | 180 ++++++++++++++++++ .../charts/otel/templates/clusterrole.yaml | 54 ++++++ .../otel/templates/clusterrolebinding.yaml | 23 +++ .../charts/otel/templates/configmap.yaml | 18 ++ .../charts/otel/templates/deamonset.yaml | 145 ++++++++++++++ .../charts/otel/templates/service.yaml | 26 +++ .../charts/otel/templates/serviceaccount.yaml | 15 ++ charts/operator-wandb/charts/otel/values.yaml | 59 ++++++ .../charts/prometheus/values.yaml | 4 + .../operator-wandb/charts/weave/values.yaml | 6 +- charts/operator-wandb/values.yaml | 27 +++ 21 files changed, 821 insertions(+), 14 deletions(-) create mode 100644 charts/operator-wandb/charts/otel/.helmignore create mode 100644 charts/operator-wandb/charts/otel/Chart.yaml create mode 100644 charts/operator-wandb/charts/otel/README.md create mode 100644 charts/operator-wandb/charts/otel/templates/_config.tpl create mode 100644 charts/operator-wandb/charts/otel/templates/_helpers.tpl create mode 100644 charts/operator-wandb/charts/otel/templates/_receivers.tpl create mode 100644 charts/operator-wandb/charts/otel/templates/clusterrole.yaml create mode 100644 charts/operator-wandb/charts/otel/templates/clusterrolebinding.yaml create mode 100644 charts/operator-wandb/charts/otel/templates/configmap.yaml create mode 
100644 charts/operator-wandb/charts/otel/templates/deamonset.yaml create mode 100644 charts/operator-wandb/charts/otel/templates/service.yaml create mode 100644 charts/operator-wandb/charts/otel/templates/serviceaccount.yaml create mode 100644 charts/operator-wandb/charts/otel/values.yaml diff --git a/charts/operator-wandb/Chart.lock b/charts/operator-wandb/Chart.lock index 9e67be2e..7e3157aa 100644 --- a/charts/operator-wandb/Chart.lock +++ b/charts/operator-wandb/Chart.lock @@ -19,6 +19,9 @@ dependencies: version: 0.1.0 - name: redis repository: https://charts.bitnami.com/bitnami - version: 18.1.0 -digest: sha256:0e062062405e017968fb5ad0e5064936cb55e2b441ddb1c2048f34eaf6de11a8 -generated: "2023-09-27T12:33:43.680199603-04:00" + version: 18.1.5 +- name: otel + repository: file://charts/otel + version: 0.1.0 +digest: sha256:d6f7dbed1f8fcbbd34d18b0911891fb27eeef0021092b69ea35e7ca5dcede038 +generated: "2023-10-16T19:07:10.090393-04:00" diff --git a/charts/operator-wandb/Chart.yaml b/charts/operator-wandb/Chart.yaml index 2c33849a..23d8267c 100644 --- a/charts/operator-wandb/Chart.yaml +++ b/charts/operator-wandb/Chart.yaml @@ -2,7 +2,7 @@ apiVersion: v2 name: operator-wandb description: A Helm chart for deploying W&B to Kubernetes type: application -version: 0.10.24 +version: 0.10.25 appVersion: 1.0.0 icon: https://wandb.ai/logo.svg @@ -40,3 +40,7 @@ dependencies: version: "18.*.*" condition: redis.install repository: https://charts.bitnami.com/bitnami + - name: otel + version: "*.*.*" + repository: file://charts/otel + condition: otel.install diff --git a/charts/operator-wandb/charts/app/templates/deployment.yaml b/charts/operator-wandb/charts/app/templates/deployment.yaml index 786f8fee..b8d95841 100644 --- a/charts/operator-wandb/charts/app/templates/deployment.yaml +++ b/charts/operator-wandb/charts/app/templates/deployment.yaml @@ -76,6 +76,9 @@ spec: - name: prometheus containerPort: 8181 protocol: TCP + - name: gorilla-statsd + containerPort: 8125 + protocol: 
UDP env: - name: HOST value: "{{ .Values.global.host }}" @@ -153,6 +156,11 @@ spec: - name: GORILLA_SESSION_LENGTH value: "{{ .Values.global.auth.sessionLengthHours }}h" + + - name: GORILLA_STATSD_PORT + value: "8125" + - name: GORILLA_STATSD_HOST + value: "0.0.0.0" - name: BUCKET value: "{{ include "app.bucket" . }}" diff --git a/charts/operator-wandb/charts/app/templates/service.yaml b/charts/operator-wandb/charts/app/templates/service.yaml index 8fbdc6e0..20e3fd9c 100644 --- a/charts/operator-wandb/charts/app/templates/service.yaml +++ b/charts/operator-wandb/charts/app/templates/service.yaml @@ -1,4 +1,3 @@ -{{- if .Values.enabled }} apiVersion: v1 kind: Service metadata: @@ -23,6 +22,8 @@ spec: - port: 8181 protocol: TCP name: prometheus + - port: 8125 + protocol: UDP + name: gorilla-statsd selector: - {{- include "app.labels" . | nindent 4 }} -{{- end }} \ No newline at end of file + {{- include "app.labels" . | nindent 4 }} \ No newline at end of file diff --git a/charts/operator-wandb/charts/console/values.yaml b/charts/operator-wandb/charts/console/values.yaml index 3a7fc07f..b1c7a542 100644 --- a/charts/operator-wandb/charts/console/values.yaml +++ b/charts/operator-wandb/charts/console/values.yaml @@ -38,8 +38,8 @@ resources: # specify resources, uncomment the following lines, adjust them as necessary, # and remove the curly braces after 'resources:'. requests: - cpu: 500m - memory: 1Gi + cpu: 200m + memory: 200Mi limits: - cpu: 4000m - memory: 8Gi + cpu: 1 + memory: 500Mi diff --git a/charts/operator-wandb/charts/otel/.helmignore b/charts/operator-wandb/charts/otel/.helmignore new file mode 100644 index 00000000..0e8a0eb3 --- /dev/null +++ b/charts/operator-wandb/charts/otel/.helmignore @@ -0,0 +1,23 @@ +# Patterns to ignore when building packages. +# This supports shell glob matching, relative path matching, and +# negation (prefixed with !). Only one pattern per line. 
+.DS_Store +# Common VCS dirs +.git/ +.gitignore +.bzr/ +.bzrignore +.hg/ +.hgignore +.svn/ +# Common backup files +*.swp +*.bak +*.tmp +*.orig +*~ +# Various IDEs +.project +.idea/ +*.tmproj +.vscode/ diff --git a/charts/operator-wandb/charts/otel/Chart.yaml b/charts/operator-wandb/charts/otel/Chart.yaml new file mode 100644 index 00000000..95db1884 --- /dev/null +++ b/charts/operator-wandb/charts/otel/Chart.yaml @@ -0,0 +1,15 @@ +apiVersion: v2 +name: otel +type: application +description: A Helm chart for Kubernetes + +version: 0.1.0 +appVersion: "0.33.0" + +home: https://wandb.ai +icon: https://wandb.ai/logo.svg + +maintainers: + - name: wandb + email: support@wandb.com + url: https://wandb.com diff --git a/charts/operator-wandb/charts/otel/README.md b/charts/operator-wandb/charts/otel/README.md new file mode 100644 index 00000000..ad713f68 --- /dev/null +++ b/charts/operator-wandb/charts/otel/README.md @@ -0,0 +1,7 @@ +We had to create a separate chart because the official one does not support our needs: + +1. We need to export via otlphttp to the console server. The name of this service + is dynamic. The OTel Helm chart does not support dynamic pipeline values. +2. We could do the above as a config map and pass it into the agent; however, + the OTel Helm chart does not support custom config map names, because they need + to be based on the release name. diff --git a/charts/operator-wandb/charts/otel/templates/_config.tpl b/charts/operator-wandb/charts/otel/templates/_config.tpl new file mode 100644 index 00000000..2dde04ab --- /dev/null +++ b/charts/operator-wandb/charts/otel/templates/_config.tpl @@ -0,0 +1,93 @@ +{{- define "otel.config" -}} +{{- $data := deepCopy .Values.config }} +{{- $config := .Values.config }} +{{- $config = mustMergeOverwrite (include "otel.hostMetricsReceiver" . | fromYaml) $config }} +{{- $config = mustMergeOverwrite (include "otel.logsCollectionReceiver" . | fromYaml) $config }} +{{- $config = mustMergeOverwrite (include "otel.kubeletMetricsReceiver" . 
| fromYaml) $config }} +{{- $config = mustMergeOverwrite (include "otel.kubernetesEventReceiver" . | fromYaml) $config }} +{{- $config = mustMergeOverwrite (include "otel.kubernetesClusterReceiver" . | fromYaml) $config }} +{{- $config = mustMergeOverwrite (include "otel.sqlQueryReceiver" . | fromYaml) $config }} +{{- $config = mustMergeOverwrite (include "otel.statsdAppReceiver" . | fromYaml) $config }} +{{- $config = mustMergeOverwrite (include "otel.extensions" . | fromYaml) $config }} +{{- $config = mustMergeOverwrite (include "otel.processors" . | fromYaml) $config }} +{{- $config = mustMergeOverwrite (include "otel.service" . | fromYaml) $config }} +{{- $config = mustMergeOverwrite (include "otel.exporter" . | fromYaml) $config }} +{{- tpl (toYaml $config) . }} +{{- end }} + +{{- define "otel.exporter" -}} +exporters: + debug: {} + debug/detailed: + verbosity: detailed + prometheus: + endpoint: 0.0.0.0:9109 +{{- end }} + +{{- define "otel.extensions" -}} +extensions: + health_check: {} + memory_ballast: + size_in_percentage: 40 +{{- end }} + +{{- define "otel.processors" -}} +processors: + batch: {} + memory_limiter: + check_interval: 5s + limit_percentage: 80 + spike_limit_percentage: 25 + k8sattributes: + filter: + node_from_env_var: K8S_NODE_NAME + passthrough: false + pod_association: + - sources: + - from: resource_attribute + name: k8s.pod.ip + - sources: + - from: resource_attribute + name: k8s.pod.uid + - sources: + - from: connection + extract: + metadata: + - "k8s.namespace.name" + - "k8s.deployment.name" + - "k8s.statefulset.name" + - "k8s.daemonset.name" + - "k8s.cronjob.name" + - "k8s.job.name" + - "k8s.node.name" + - "k8s.pod.name" + - "k8s.pod.uid" + - "k8s.pod.start_time" + annotations: + - tag_name: $$1 + key_regex: (.*) + from: pod + labels: + - tag_name: $$1 + key_regex: (.*) + from: pod +{{- end }} + +{{- define "otel.service" -}} +service: + extensions: + - health_check + - memory_ballast + pipelines: + metrics: + exporters: [debug, 
prometheus] + processors: [memory_limiter, batch, k8sattributes] + receivers: [hostmetrics, k8s_cluster, kubeletstats, sqlquery] + logs: + exporters: [debug] + processors: [memory_limiter, batch] + receivers: [filelog] + telemetry: + metrics: + address: ${env:POD_IP}:8888 +{{- end }} \ No newline at end of file diff --git a/charts/operator-wandb/charts/otel/templates/_helpers.tpl b/charts/operator-wandb/charts/otel/templates/_helpers.tpl new file mode 100644 index 00000000..daad4c48 --- /dev/null +++ b/charts/operator-wandb/charts/otel/templates/_helpers.tpl @@ -0,0 +1,102 @@ +{{/* vim: set filetype=mustache: */}} + +{{/* +Expand the name of the chart. +*/}} +{{- define "otel.name" -}} +{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" }} +{{- end }} + +{{/* +Create a default fully qualified app name. +We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec). +If release name contains chart name it will be used as a full name. +*/}} +{{- define "otel.fullname" -}} +{{- if .Values.fullnameOverride }} +{{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" }} +{{- else }} +{{- $name := default .Chart.Name .Values.nameOverride }} +{{- if contains $name .Release.Name }} +{{- .Release.Name | trunc 63 | trimSuffix "-" }} +{{- else }} +{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" }} +{{- end }} +{{- end }} +{{- end }} + +{{/* +Create chart name and version as used by the chart label. +*/}} +{{- define "otel.chart" -}} +{{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" }} +{{- end }} + +{{/* +Common labels +*/}} +{{- define "otel.labels" -}} +helm.sh/chart: {{ include "otel.chart" . }} +{{ include "otel.selectorLabels" . }} +{{- if .Chart.AppVersion }} +app.kubernetes.io/version: {{ .Chart.AppVersion | quote }} +{{- end }} +wandb.com/app-name: {{ include "otel.chart" . 
}} +app.kubernetes.io/managed-by: {{ .Release.Service }} +{{- end }} + +{{/* +Selector labels +*/}} +{{- define "otel.selectorLabels" -}} +app.kubernetes.io/name: {{ include "otel.name" . }} +app.kubernetes.io/instance: {{ .Release.Name }} +{{- end }} + +{{/* +Create the name of the service account to use +*/}} +{{- define "otel.serviceAccountName" -}} +{{- if .Values.serviceAccount.create }} +{{- default (include "otel.fullname" .) .Values.serviceAccount.name }} +{{- else }} +{{- default "default" .Values.serviceAccount.name }} +{{- end }} +{{- end }} + +{{/* +Returns the extraEnv keys and values to inject into containers. + +Global values will override any chart-specific values. +*/}} +{{- define "otel.extraEnv" -}} +{{- $allExtraEnv := merge (default (dict) .local.extraEnv) .global.extraEnv -}} +{{- range $key, $value := $allExtraEnv }} +- name: {{ $key }} + value: {{ $value | quote }} +{{- end -}} +{{- end -}} + +{{/* +Returns a list of _common_ labels to be shared across all +app deployments and other shared objects. +*/}} +{{- define "otel.commonLabels" -}} +{{- $commonLabels := default (dict) .Values.common.labels -}} +{{- if $commonLabels }} +{{- range $key, $value := $commonLabels }} +{{ $key }}: {{ $value | quote }} +{{- end }} +{{- end -}} +{{- end -}} + +{{/* +Returns a list of _pod_ labels to be shared across all +app deployments. 
+*/}} +{{- define "otel.podLabels" -}} +{{- range $key, $value := .Values.pod.labels }} +{{ $key }}: {{ $value | quote }} +{{- end }} +{{- end -}} + diff --git a/charts/operator-wandb/charts/otel/templates/_receivers.tpl b/charts/operator-wandb/charts/otel/templates/_receivers.tpl new file mode 100644 index 00000000..9536162c --- /dev/null +++ b/charts/operator-wandb/charts/otel/templates/_receivers.tpl @@ -0,0 +1,180 @@ +{{- define "otel.hostMetricsReceiver" -}} +receivers: + hostmetrics: + root_path: /hostfs + collection_interval: 10s + scrapers: + cpu: + load: + memory: + disk: + filesystem: + exclude_mount_points: + mount_points: + - /dev/* + - /proc/* + - /sys/* + - /run/k3s/containerd/* + - /var/lib/docker/* + - /var/lib/kubelet/* + - /snap/* + match_type: regexp + exclude_fs_types: + fs_types: + - autofs + - binfmt_misc + - bpf + - cgroup2 + - configfs + - debugfs + - devpts + - devtmpfs + - fusectl + - hugetlbfs + - iso9660 + - mqueue + - nsfs + - overlay + - proc + - procfs + - pstore + - rpc_pipefs + - securityfs + - selinuxfs + - squashfs + - sysfs + - tracefs + match_type: strict + network: +{{- end }} + +{{- define "otel.logsCollectionReceiver" -}} +receivers: + filelog: + include: [ /var/log/pods/*/*/*.log ] + exclude: [] + start_at: end + include_file_path: true + include_file_name: false + operators: + # Find out which format is used by kubernetes + - type: router + id: get-format + routes: + - output: parser-docker + expr: 'body matches "^\\{"' + - output: parser-crio + expr: 'body matches "^[^ Z]+ "' + - output: parser-containerd + expr: 'body matches "^[^ Z]+Z"' + # Parse CRI-O format + - type: regex_parser + id: parser-crio + regex: '^(?P