Skip to content

Commit

Permalink
datadog-agent and datadog-agent-nvml: various fixes to work with latest
Browse files Browse the repository at this point in the history
- add libpcap
- symlink so build avoids downloading libpcap and uses system lib
- regen dep bump patch
- build with python 3.12 as integration dependencies require it
	ERROR: Package 'datadog-slurm' requires a different Python: 3.11.11 not in '>=3.12'
- compile datadog-agent-nvml with python 3.12
- patch to disable the GPU monitor, as it is causing test failures; the upstream issue is tracked in DataDog/datadog-agent#32419

Signed-off-by: James Rawlings <[email protected]>
  • Loading branch information
rawlingsj committed Dec 20, 2024
1 parent 1436136 commit 4757b1c
Show file tree
Hide file tree
Showing 4 changed files with 132 additions and 17 deletions.
12 changes: 9 additions & 3 deletions datadog-agent-nvml.yaml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
package:
name: datadog-agent-nvml
version: 1.0.9
epoch: 3
epoch: 4
description: "Checks NVIDIA Management Library (NVML) exposed metrics through the Datadog Agent and can correlate them with the exposed Kubernetes devices"
copyright:
- license: Apache-2.0
Expand All @@ -18,14 +18,15 @@ environment:
- busybox
- datadog-agent
- datadog-agent-core-integrations
- python-${{vars.python_version}}-dev # strictly requires python3.11
- py${{vars.python_version}}-grpcio
- python-${{vars.python_version}}-dev
- rsync

vars:
dd_conf: /etc/datadog-agent/conf.d
dd_home: / # agent being run by root expects /.
dd_shared: /opt/datadog-agent/embedded
python_version: "3.11"
python_version: "3.12"

pipeline:
# This integration wheel comes from the integrations-extras repository
Expand All @@ -35,6 +36,11 @@ pipeline:
tag: nvml-${{package.version}}
expected-commit: d38c5cdb4ab4d07f4432afb25e0ccd70341efb51

- runs: |
# use system python grpcio to avoid compiling it
# this makes the build use a system grpcio package which is greater than or equal to the one in the requirements.in
sed -i 's/grpcio==\(.*\)/grpcio>=\1/' ./nvml/requirements.in
- runs: |
# Create and activate a virtual environment.
python -m venv .venv
Expand Down
31 changes: 21 additions & 10 deletions datadog-agent.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -13,11 +13,12 @@ package:
- datadog-agent-core-integrations
- findutils
- grep
- libpcap
- libseccomp
- shadow

vars:
py-version: "3.11"
py-version: "3.12"
destd: /opt/datadog-agent

var-transforms:
Expand Down Expand Up @@ -62,27 +63,27 @@ environment:
- krb5-dev
- libbpf-dev
- libedit-dev
- libpcap-dev
- libzip
- linux-headers
- ninja
- openssf-compiler-options
- procps-dev
- py${{vars.py-version}}-pip
- py${{vars.py-version}}-semver
- python-${{vars.py-version}}-dev # strictly requires python3.11
- python-${{vars.py-version}}-dev
- systemd-dev
- util-linux-misc # unshare
- wget # Required for downloading clang-12 and kernel headers from debian
environment:
# CGo allows Go programs to call C code
CGO_ENABLED: "1"
# -Os optimizes the code for size; -I adds the directory to the compiler's include search path
CGO_CFLAGS: "-Os -I${{targets.destdir}}/usr/include/"
CGO_CFLAGS: "-Os -I/usr/include/"
# Pass options to the linker.
CGO_LDFLAGS: "-L${{targets.destdir}}/usr/lib/"
CGO_LDFLAGS: "-L/usr/lib/"
# disables generation of debugging information
# omits the symbol table and debug information, further reducing the size of the binary.
GOFLAGS: "-ldflags=-w -ldflags=-s"
GOFLAGS: "-ldflags=-w"
# The version of linux-headers to fetch kernel headers for
LINUX_HEADERS_VERSION: "5.10.0-0.deb10.29"
# The version of linux to fetch kernel headers for
Expand All @@ -95,6 +96,13 @@ pipeline:
tag: ${{package.version}}
expected-commit: 646618687e4f9351b5fe19cce678c9cd4b011e74

# disable GPU support for the agent as it causes test failures at runtime
# error: agent: undefined symbol: nvmlVgpuTypeGetCapabilities
# upstream issue to track https://github.com/DataDog/datadog-agent/issues/32419
- uses: patch
with:
patches: /home/build/disable-gpu.patch

# Install `invoke` (build) dependencies. We ultimately package with venv so
# these won't leak into the package.
- runs: |
Expand Down Expand Up @@ -135,6 +143,11 @@ pipeline:
wget "https://github.com/llvm/llvm-project/releases/download/llvmorg-12.0.1/clang+llvm-12.0.1-x86_64-linux-gnu-ubuntu-16.04.tar.xz" -O /tmp/clang.tar.xz -o /dev/null
echo "6b3cc55d3ef413be79785c4dc02828ab3bd6b887872b143e3091692fc6acefe7 /tmp/clang.tar.xz" | sha256sum --check
# symlink the system libpcap.a to /home/build/dev/lib/libpcap.a; otherwise the build will attempt to download libpcap
- runs: |
mkdir -p /home/build/dev/lib
ln -s /usr/lib/libpcap.a /home/build/dev/lib/libpcap.a
- runs: |
wget -O common.deb http://deb.debian.org/debian-security/pool/updates/main/l/linux-5.10/linux-headers-${LINUX_HEADERS_VERSION}-common_${LINUX_KERNEL_VERSION}_all.deb
dpkg -x common.deb /tmp/common
Expand Down Expand Up @@ -165,7 +178,6 @@ pipeline:
- runs: |
invoke -e rtloader.make \
--python-runtimes=3 \
--install-prefix="${{targets.destdir}}/usr" \
--cmake-options="\
-DCMAKE_INSTALL_LIBDIR=lib \
Expand All @@ -181,7 +193,6 @@ pipeline:
--bundle system-probe \
--bundle security-agent \
--exclude-rtloader \
--python-runtimes 3 \
--no-development \
--bundle-ebpf \
--embedded-path /usr/lib
Expand Down Expand Up @@ -287,7 +298,7 @@ subpackages:
with:
repository: https://github.com/DataDog/integrations-core
branch: ${{vars.datadog-major-minor-x}} # 7.59.x
expected-commit: cff91adb18fe879fd875d62d285a67deb69040fe # needs to be updated with each new release
expected-commit: 3189af0e0ae840c9a4bab3131662c7fd6b0de7fb # needs to be updated with each new release
- uses: patch
with:
patches: /home/build/int-core-datadog_checks_dev-pyproject-toml.patch /home/build/int-core-mysql-hatch-toml.patch /home/build/int-core-singlestore-hatch-toml.patch /home/build/int-core-agent_requirements-in.patch /home/build/int-core-snowflake-pyproject-toml.patch
Expand All @@ -299,7 +310,7 @@ subpackages:
python${{vars.py-version}} -m venv .venv
# Install locked dependencies
.venv/bin/pip install --require-hashes --only-binary=:all: --no-deps -r .deps/resolved/linux-${{build.arch}}_py3.txt
.venv/bin/pip install --require-hashes --only-binary=:all: --no-deps -r .deps/resolved/linux-${{build.arch}}_${{vars.py-version}}.txt
excludes="datadog_checks_base datadog_checks_dev datadog_checks_tests_helper docker_daemon esxi teleport"
checks=$(invoke -r /home/build agent.collect-integrations /home/integrations/ 3 linux --excluded "$excludes")
Expand Down
98 changes: 98 additions & 0 deletions datadog-agent/disable-gpu.patch
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
diff --git a/cmd/system-probe/modules/gpu.go b/cmd/system-probe/modules/gpu.go
index 27f3f0c..f1ceba6 100644
--- a/cmd/system-probe/modules/gpu.go
+++ b/cmd/system-probe/modules/gpu.go
@@ -8,20 +8,13 @@
package modules

import (
- "fmt"
- "net/http"
- "time"
-
- "github.com/NVIDIA/go-nvml/pkg/nvml"
- "go.uber.org/atomic"
-
+ "errors"
"github.com/DataDog/datadog-agent/cmd/system-probe/api/module"
"github.com/DataDog/datadog-agent/cmd/system-probe/config"
sysconfigtypes "github.com/DataDog/datadog-agent/cmd/system-probe/config/types"
"github.com/DataDog/datadog-agent/cmd/system-probe/utils"
- "github.com/DataDog/datadog-agent/pkg/gpu"
gpuconfig "github.com/DataDog/datadog-agent/pkg/gpu/config"
- "github.com/DataDog/datadog-agent/pkg/util/log"
+ "net/http"
)

var _ module.Module = &GPUMonitoringModule{}
@@ -32,53 +25,21 @@ var GPUMonitoring = module.Factory{
Name: config.GPUMonitoringModule,
ConfigNamespaces: gpuMonitoringConfigNamespaces,
Fn: func(_ *sysconfigtypes.Config, deps module.FactoryDependencies) (module.Module, error) {
-
- c := gpuconfig.NewConfig()
- probeDeps := gpu.ProbeDependencies{
- Telemetry: deps.Telemetry,
- //if the config parameter doesn't exist or is empty string, the default value is used as defined in go-nvml library
- //(https://github.com/NVIDIA/go-nvml/blob/main/pkg/nvml/lib.go#L30)
- NvmlLib: nvml.New(nvml.WithLibraryPath(c.NVMLLibraryPath)),
- }
-
- ret := probeDeps.NvmlLib.Init()
- if ret != nvml.SUCCESS && ret != nvml.ERROR_ALREADY_INITIALIZED {
- return nil, fmt.Errorf("unable to initialize NVML library: %v", ret)
- }
-
- t, err := gpu.NewProbe(c, probeDeps)
- if err != nil {
- return nil, fmt.Errorf("unable to start GPU monitoring: %w", err)
- }
-
- return &GPUMonitoringModule{
- Probe: t,
- lastCheck: atomic.NewInt64(0),
- }, nil
+ return nil, errors.New("GPU monitoring disabled at build time")
},
NeedsEBPF: func() bool {
- return true
+ return false
},
}

// GPUMonitoringModule is a module for GPU monitoring
type GPUMonitoringModule struct {
- *gpu.Probe
- lastCheck *atomic.Int64
}

// Register registers the GPU monitoring module
func (t *GPUMonitoringModule) Register(httpMux *module.Router) error {
httpMux.HandleFunc("/check", func(w http.ResponseWriter, _ *http.Request) {
- t.lastCheck.Store(time.Now().Unix())
- stats, err := t.Probe.GetAndFlush()
- if err != nil {
- log.Errorf("Error getting GPU stats: %v", err)
- w.WriteHeader(500)
- return
- }
-
- utils.WriteAsJSON(w, stats)
+ utils.WriteAsJSON(w, map[string]interface{}{})
})

return nil
@@ -86,12 +47,9 @@ func (t *GPUMonitoringModule) Register(httpMux *module.Router) error {

// GetStats returns the last check time
func (t *GPUMonitoringModule) GetStats() map[string]interface{} {
- return map[string]interface{}{
- "last_check": t.lastCheck.Load(),
- }
+ return map[string]interface{}{}
}

// Close closes the GPU monitoring module
func (t *GPUMonitoringModule) Close() {
- t.Probe.Close()
}
8 changes: 4 additions & 4 deletions datadog-agent/int-core-agent_requirements-in.patch
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@
diff --git a/agent_requirements.in b/agent_requirements.in
index b4c724713e..0713f9b365 100644
index 859d088..11f529c 100644
--- a/agent_requirements.in
+++ b/agent_requirements.in
@@ -66,7 +66,7 @@ semver==3.0.2
@@ -65,7 +65,7 @@ securesystemslib[crypto,pynacl]==0.28.0
semver==3.0.2
service-identity[idna]==24.1.0
simplejson==3.19.3
six==1.16.0
-snowflake-connector-python==3.12.1
+snowflake-connector-python==3.12.3; python_version > '3.0'
supervisor==4.2.5
tuf==4.0.0
uptime==3.0.1
uptime==3.0.1

0 comments on commit 4757b1c

Please sign in to comment.