Skip to content

Commit

Permalink
Merge pull request #516 from elezar/use-nvidia-runtime-for-mps
Browse files Browse the repository at this point in the history
Use nvidia runtime for MPS control
  • Loading branch information
elezar authored Feb 13, 2024
2 parents d546a3e + 101b1bb commit 2832c64
Show file tree
Hide file tree
Showing 8 changed files with 27 additions and 111 deletions.
6 changes: 0 additions & 6 deletions cmd/mps-control-daemon/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -60,12 +60,6 @@ func main() {
}

config.flags = []cli.Flag{
&cli.StringFlag{
Name: "nvidia-driver-root",
Value: "/",
Usage: "the root path for the NVIDIA driver installation (typical values are '/' or '/run/nvidia/driver')",
EnvVars: []string{"NVIDIA_DRIVER_ROOT"},
},
&cli.StringFlag{
Name: "config-file",
Usage: "the path to a config file as an alternative to command line options or environment variables",
Expand Down
15 changes: 6 additions & 9 deletions cmd/mps-control-daemon/mps/daemon.go
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,6 @@ func (d *Daemon) Envvars() envvars {
}

// Start starts the MPS daemon as a background process.
// The pipe and log dirs are also created relative to the driver-root.
func (d *Daemon) Start() error {
if err := d.setComputeMode(computeModeExclusiveProcess); err != nil {
return fmt.Errorf("error setting compute mode %v: %w", computeModeExclusiveProcess, err)
Expand All @@ -92,16 +91,16 @@ func (d *Daemon) Start() error {
klog.InfoS("Staring MPS daemon", "resource", d.rm.Resource())

pipeDir := d.pipeDir()
if err := os.MkdirAll(filepath.Join("/driver-root", pipeDir), 0755); err != nil {
if err := os.MkdirAll(pipeDir, 0755); err != nil {
return fmt.Errorf("error creating directory %v: %w", pipeDir, err)
}

logDir := d.logDir()
if err := os.MkdirAll(filepath.Join("/driver-root", logDir), 0755); err != nil {
if err := os.MkdirAll(logDir, 0755); err != nil {
return fmt.Errorf("error creating directory %v: %w", logDir, err)
}

mpsDaemon := exec.Command("chroot", "/driver-root", mpsControlBin, "-d")
mpsDaemon := exec.Command(mpsControlBin, "-d")
mpsDaemon.Env = append(mpsDaemon.Env, d.Envvars().toSlice()...)
if err := mpsDaemon.Run(); err != nil {
return err
Expand All @@ -120,7 +119,7 @@ func (d *Daemon) Start() error {
}
}

statusFile, err := os.Create(filepath.Join("/driver-root", d.startedFile()))
statusFile, err := os.Create(d.startedFile())
if err != nil {
return err
}
Expand All @@ -141,7 +140,7 @@ func (d *Daemon) Stop() error {
return fmt.Errorf("error setting compute mode %v: %w", computeModeDefault, err)
}

err = os.Remove(filepath.Join("/driver-root", d.startedFile()))
err = os.Remove(d.startedFile())
if err != nil && err != os.ErrNotExist {
return fmt.Errorf("failed to remove started file: %w", err)
}
Expand Down Expand Up @@ -177,7 +176,7 @@ func (d *Daemon) EchoPipeToControl(command string) (string, error) {
defer writer.Close()
defer reader.Close()

mpsDaemon := exec.Command("chroot", "/driver-root", mpsControlBin)
mpsDaemon := exec.Command(mpsControlBin)
mpsDaemon.Env = append(mpsDaemon.Env, d.Envvars().toSlice()...)

mpsDaemon.Stdin = reader
Expand All @@ -201,8 +200,6 @@ func (d *Daemon) EchoPipeToControl(command string) (string, error) {
func (d *Daemon) setComputeMode(mode computeMode) error {
for _, uuid := range d.Devices().GetUUIDs() {
cmd := exec.Command(
// TODO: This needs to be set up to handle non-rootfs paths such as GKE.
"chroot", "/driver-root",
"nvidia-smi",
"-i", uuid,
"-c", string(mode))
Expand Down
72 changes: 0 additions & 72 deletions cmd/mps-control-daemon/mps/find.go

This file was deleted.

6 changes: 1 addition & 5 deletions cmd/mps-control-daemon/mps/manager.go
Original file line number Diff line number Diff line change
Expand Up @@ -61,11 +61,7 @@ func New(opts ...Option) (Manager, error) {

// TODO: This should be controllable via an option
if m.nvmllib == nil {
driverLibraryPath, err := root("/driver-root").getDriverLibraryPath()
if err != nil {
return nil, fmt.Errorf("failed to locate driver libraries: %w", err)
}
m.nvmllib = nvml.New(nvml.WithLibraryPath(driverLibraryPath))
m.nvmllib = nvml.New()
}

return m, nil
Expand Down
2 changes: 1 addition & 1 deletion deployments/container/Dockerfile.ubi8
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ RUN dnf remove -y cuda-*

ENV NVIDIA_DISABLE_REQUIRE="true"
ENV NVIDIA_VISIBLE_DEVICES=all
ENV NVIDIA_DRIVER_CAPABILITIES=utility
ENV NVIDIA_DRIVER_CAPABILITIES=compute,utility

ARG VERSION="N/A"
ARG GIT_COMMIT="unknown"
Expand Down
2 changes: 1 addition & 1 deletion deployments/container/Dockerfile.ubuntu
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ RUN apt-get --purge -y autoremove cuda-*

ENV NVIDIA_DISABLE_REQUIRE="true"
ENV NVIDIA_VISIBLE_DEVICES=all
ENV NVIDIA_DRIVER_CAPABILITIES=utility
ENV NVIDIA_DRIVER_CAPABILITIES=compute,utility

ARG VERSION="N/A"
ARG GIT_COMMIT="unknown"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -176,6 +176,10 @@ spec:
- name: NVIDIA_MIG_MONITOR_DEVICES
value: all
{{- end }}
- name: NVIDIA_VISIBLE_DEVICES
value: all
- name: NVIDIA_DRIVER_CAPABILITIES
value: compute,utility
securityContext:
{{- include "nvidia-device-plugin.securityContext" . | nindent 10 }}
volumeMounts:
Expand All @@ -186,12 +190,13 @@ spec:
# This is required for CDI detection to work correctly.
- name: driver-root
mountPath: /driver-root
readOnly: true
{{- end }}
# The MPS /dev/shm is needed to allow for MPS daemon health-checking.
- name: mps-shm
mountPath: /driver-root/dev/shm
mountPath: /dev/shm
- name: mps-root
mountPath: /driver-root/mps
mountPath: /mps
- name: cdi-root
mountPath: /var/run/cdi
{{- if eq $hasConfigMap "true" }}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -149,7 +149,7 @@ spec:
exec:
command:
- cat
- /driver-root/mps/nvidia.com/gpu/.started
- /mps/nvidia.com/gpu/.started
initialDelaySeconds: 1
periodSeconds: 1
env:
Expand All @@ -158,8 +158,6 @@ spec:
fieldRef:
apiVersion: v1
fieldPath: spec.nodeName
- name: NVIDIA_VISIBLE_DEVICES
value: void
{{- if typeIs "string" .Values.migStrategy }}
- name: MIG_STRATEGY
value: "{{ .Values.migStrategy }}"
Expand All @@ -168,18 +166,21 @@ spec:
- name: CONFIG_FILE
value: /config/config.yaml
{{- end }}
{{- if ne $migStrategiesAreAllNone "true" }}
- name: NVIDIA_MIG_MONITOR_DEVICES
value: all
{{- end }}
- name: NVIDIA_VISIBLE_DEVICES
value: all
- name: NVIDIA_DRIVER_CAPABILITIES
value: compute,utility
securityContext:
privileged: true
volumeMounts:
{{- if typeIs "string" .Values.nvidiaDriverRoot }}
# We always mount the driver root at /driver-root in the container.
- name: driver-root
mountPath: /driver-root
{{- end }}
- name: mps-shm
mountPath: /driver-root/dev/shm
mountPath: /dev/shm
- name: mps-root
mountPath: /driver-root/mps
mountPath: /mps
{{- if eq $hasConfigMap "true" }}
- name: available-configs
mountPath: /available-configs
Expand All @@ -199,11 +200,6 @@ spec:
- name: mps-shm
hostPath:
path: /var/lib/kubelet/device-plugins/mps/shm
{{- if typeIs "string" .Values.nvidiaDriverRoot }}
- name: driver-root
hostPath:
path: {{ .Values.nvidiaDriverRoot }}
{{- end }}
{{- if eq $hasConfigMap "true" }}
- name: available-configs
configMap:
Expand Down

0 comments on commit 2832c64

Please sign in to comment.