Skip to content

Commit

Permalink
Allow mps root to be specified
Browse files Browse the repository at this point in the history
This change allows the MPS root on the host to be specified
and uses /run/nvidia/mps by default.

Signed-off-by: Evan Lezar <[email protected]>
  • Loading branch information
elezar committed Feb 13, 2024
1 parent 2832c64 commit 251bd9b
Show file tree
Hide file tree
Showing 8 changed files with 78 additions and 43 deletions.
3 changes: 3 additions & 0 deletions api/config/v1/flags.go
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@ type Flags struct {
type CommandLineFlags struct {
MigStrategy *string `json:"migStrategy" yaml:"migStrategy"`
FailOnInitError *bool `json:"failOnInitError" yaml:"failOnInitError"`
MpsRoot *string `json:"mpsRoot,omitempty" yaml:"mpsRoot,omitempty"`
NvidiaDriverRoot *string `json:"nvidiaDriverRoot,omitempty" yaml:"nvidiaDriverRoot,omitempty"`
GDSEnabled *bool `json:"gdsEnabled" yaml:"gdsEnabled"`
MOFEDEnabled *bool `json:"mofedEnabled" yaml:"mofedEnabled"`
Expand Down Expand Up @@ -116,6 +117,8 @@ func (f *Flags) UpdateFromCLIFlags(c *cli.Context, flags []cli.Flag) {
updateFromCLIFlag(&f.MigStrategy, c, n)
case "fail-on-init-error":
updateFromCLIFlag(&f.FailOnInitError, c, n)
case "mps-root":
updateFromCLIFlag(&f.MpsRoot, c, n)
case "nvidia-driver-root":
updateFromCLIFlag(&f.NvidiaDriverRoot, c, n)
case "gds-enabled":
Expand Down
16 changes: 8 additions & 8 deletions cmd/mps-control-daemon/mps/daemon.go
Original file line number Diff line number Diff line change
Expand Up @@ -50,10 +50,10 @@ type Daemon struct {
}

// NewDaemon creates an MPS daemon instance.
func NewDaemon(rm rm.ResourceManager) *Daemon {
func NewDaemon(rm rm.ResourceManager, root string) *Daemon {
return &Daemon{
rm: rm,
root: "/mps",
root: root,
}
}

Expand All @@ -77,8 +77,8 @@ func (e envvars) toSlice() []string {
// TODO: Set CUDA_VISIBLE_DEVICES to include only the devices for this resource type.
func (d *Daemon) Envvars() envvars {
return map[string]string{
"CUDA_MPS_PIPE_DIRECTORY": d.pipeDir(),
"CUDA_MPS_LOG_DIRECTORY": d.logDir(),
"CUDA_MPS_PIPE_DIRECTORY": d.PipeDir(),
"CUDA_MPS_LOG_DIRECTORY": d.LogDir(),
}
}

Expand All @@ -90,12 +90,12 @@ func (d *Daemon) Start() error {

klog.InfoS("Staring MPS daemon", "resource", d.rm.Resource())

pipeDir := d.pipeDir()
pipeDir := d.PipeDir()
if err := os.MkdirAll(pipeDir, 0755); err != nil {
return fmt.Errorf("error creating directory %v: %w", pipeDir, err)
}

logDir := d.logDir()
logDir := d.LogDir()
if err := os.MkdirAll(logDir, 0755); err != nil {
return fmt.Errorf("error creating directory %v: %w", logDir, err)
}
Expand Down Expand Up @@ -151,11 +151,11 @@ func (d *Daemon) resourceRoot() string {
return filepath.Join(d.root, string(d.rm.Resource()))
}

func (d *Daemon) pipeDir() string {
func (d *Daemon) PipeDir() string {
return filepath.Join(d.resourceRoot(), "pipe")
}

func (d *Daemon) logDir() string {
func (d *Daemon) LogDir() string {
return filepath.Join(d.resourceRoot(), "log")
}

Expand Down
2 changes: 1 addition & 1 deletion cmd/mps-control-daemon/mps/manager.go
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,7 @@ func (m *manager) Daemons() ([]*Daemon, error) {
klog.InfoS("Resource is not shared", "resource", "resource", resourceManager.Resource())
continue
}
daemon := NewDaemon(resourceManager)
daemon := NewDaemon(resourceManager, "/mps")
daemons = append(daemons, daemon)
}

Expand Down
9 changes: 9 additions & 0 deletions cmd/nvidia-device-plugin/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -119,6 +119,11 @@ func main() {
Usage: "the path where the NVIDIA driver root is mounted in the container; used for generating CDI specifications",
EnvVars: []string{"CONTAINER_DRIVER_ROOT"},
},
&cli.StringFlag{
Name: "mps-root",
Usage: "the path on the host where MPS-specific mounts and files are created by the MPS control daemon manager",
EnvVars: []string{"MPS_ROOT"},
},
}

err := c.Run(os.Args)
Expand All @@ -143,6 +148,10 @@ func validateFlags(config *spec.Config) error {
return fmt.Errorf("using --mig-strategy=mixed is not supported with MPS")
}

if config.Flags.MpsRoot == nil || *config.Flags.MpsRoot == "" {
return fmt.Errorf("using MPS requires --mps-root to be specified")
}

deviceListStrategies, err := spec.NewDeviceListStrategies(*config.Flags.Plugin.DeviceListStrategy)
if err != nil {
return err
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -136,6 +136,8 @@ spec:
name: nvidia-device-plugin-ctr
command: ["nvidia-device-plugin"]
env:
- name: MPS_ROOT
value: "{{ .Values.mps.root }}"
{{- if typeIs "string" .Values.migStrategy }}
- name: MIG_STRATEGY
value: "{{ .Values.migStrategy }}"
Expand Down Expand Up @@ -215,12 +217,11 @@ spec:
path: /var/lib/kubelet/device-plugins
- name: mps-root
hostPath:
# TODO: This should be /var/run/nvidia/mps
path: /var/lib/kubelet/device-plugins/mps
path: {{ .Values.mps.root }}
type: DirectoryOrCreate
- name: mps-shm
hostPath:
path: /var/lib/kubelet/device-plugins/mps/shm
path: {{ .Values.mps.root }}/shm
{{- if typeIs "string" .Values.nvidiaDriverRoot }}
- name: driver-root
hostPath:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -194,12 +194,11 @@ spec:
volumes:
- name: mps-root
hostPath:
# TODO: This should be /var/run/nvidia/mps
path: /var/lib/kubelet/device-plugins/mps
path: {{ .Values.mps.root }}
type: DirectoryOrCreate
- name: mps-shm
hostPath:
path: /var/lib/kubelet/device-plugins/mps/shm
path: {{ .Values.mps.root }}/shm
{{- if eq $hasConfigMap "true" }}
- name: available-configs
configMap:
Expand Down
7 changes: 7 additions & 0 deletions deployments/helm/nvidia-device-plugin/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -145,3 +145,10 @@ nfd:
- "0302"
deviceLabelFields:
- vendor

mps:
# root specifies the location where files and folders for managing MPS will
# be created. This includes a daemon-specific /dev/shm and pipe and log
# directories.
# Pipe and log directories will be created at {{ mps.root }}/{{ .ResourceName }}/pipe and {{ mps.root }}/{{ .ResourceName }}/log
root: "/run/nvidia/mps"
72 changes: 44 additions & 28 deletions internal/plugin/server.go
Original file line number Diff line number Diff line change
Expand Up @@ -151,7 +151,9 @@ func (plugin *NvidiaDevicePlugin) waitForMPSDaemon() error {
}
// TODO: Check the started file here.
// TODO: Have some retry strategy here.
if err := mps.NewDaemon(plugin.rm).AssertHealthy(); err != nil {
// TODO: /mps here represents the path relative to the /driver-root.
mpsDaemon := mps.NewDaemon(plugin.rm, "/mps")
if err := mpsDaemon.AssertHealthy(); err != nil {
return fmt.Errorf("error checking MPS daemon health: %w", err)
}
return nil
Expand Down Expand Up @@ -331,6 +333,9 @@ func (plugin *NvidiaDevicePlugin) getAllocateResponse(requestIds []string) (*plu
if err != nil {
return nil, fmt.Errorf("failed to get allocate response for CDI: %v", err)
}
if err := plugin.updateContainerAllocatResponseForMPS(&response, deviceIDs); err != nil {
return nil, fmt.Errorf("failed to update allocate response for MPS: %v", err)
}

if plugin.deviceListStrategies.Includes(spec.DeviceListStrategyEnvvar) {
response.Envs = plugin.apiEnvs(plugin.deviceListEnvvar, deviceIDs)
Expand All @@ -348,36 +353,47 @@ func (plugin *NvidiaDevicePlugin) getAllocateResponse(requestIds []string) (*plu
if *plugin.config.Flags.MOFEDEnabled {
response.Envs["NVIDIA_MOFED"] = "enabled"
}
return &response, nil
}

func (plugin NvidiaDevicePlugin) updateContainerAllocatResponseForMPS(response *pluginapi.ContainerAllocateResponse, deviceIDs []string) error {
if plugin.config.Sharing.SharingStrategy() != spec.SharingStrategyMPS {
return nil
}

// TODO: We should check that the deviceIDs are shared using MPS.

// TODO: We should generate a CDI specification for MPS
if plugin.config.Sharing.SharingStrategy() == spec.SharingStrategyMPS {
if response.Envs == nil {
response.Envs = make(map[string]string)
}
pipeDir := filepath.Join("/mps", string(plugin.rm.Resource()), "pipe")
response.Envs["CUDA_MPS_PIPE_DIRECTORY"] = pipeDir
response.Mounts = append(response.Mounts,
&pluginapi.Mount{
ContainerPath: pipeDir,
HostPath: filepath.Join("/var/lib/kubelet/device-plugins", pipeDir),
},
)
logDir := filepath.Join("/mps", string(plugin.rm.Resource()), "log")
response.Envs["CUDA_MPS_LOG_DIRECTORY"] = logDir
response.Mounts = append(response.Mounts,
&pluginapi.Mount{
ContainerPath: logDir,
HostPath: filepath.Join("/var/lib/kubelet/device-plugins", logDir),
},
)
response.Mounts = append(response.Mounts,
&pluginapi.Mount{
ContainerPath: "/dev/shm",
HostPath: "/var/lib/kubelet/device-plugins/mps/shm",
},
)

// TODO: We use the Daemon here just to construct the pipe and log dirs for the specified resource.
containerMpsDaemon := mps.NewDaemon(plugin.rm, "/mps")

containerPipeDir := containerMpsDaemon.PipeDir()
containerLogDir := containerMpsDaemon.LogDir()

if response.Envs == nil {
response.Envs = make(map[string]string)
}
response.Envs["CUDA_MPS_PIPE_DIRECTORY"] = containerPipeDir
response.Envs["CUDA_MPS_LOG_DIRECTORY"] = containerLogDir

return &response, nil
// TODO: We use the Daemon here just to construct the pipe and log dirs for the specified resource on the host.
hostMpsDaemon := mps.NewDaemon(plugin.rm, *plugin.config.Flags.MpsRoot)
response.Mounts = append(response.Mounts,
&pluginapi.Mount{
ContainerPath: containerPipeDir,
HostPath: hostMpsDaemon.PipeDir(),
},
&pluginapi.Mount{
ContainerPath: containerLogDir,
HostPath: hostMpsDaemon.LogDir(),
},
&pluginapi.Mount{
ContainerPath: "/dev/shm",
HostPath: filepath.Join(*plugin.config.Flags.MpsRoot, "shm"),
},
)
return nil
}

// getAllocateResponseForCDI returns the allocate response for the specified device IDs.
Expand Down

0 comments on commit 251bd9b

Please sign in to comment.