Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

wmeta: Add GPU entity #32019

Open
wants to merge 4 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions comp/core/tagger/collectors/workloadmeta_extract.go
Original file line number Diff line number Diff line change
Expand Up @@ -149,6 +149,8 @@ func (c *WorkloadMetaCollector) processEvents(evBundle workloadmeta.EventBundle)
// tagInfos = append(tagInfos, c.handleProcess(ev)...) No tags for now
case workloadmeta.KindKubernetesDeployment:
tagInfos = append(tagInfos, c.handleKubeDeployment(ev)...)
case workloadmeta.KindGPU:
// tagInfos = append(tagInfos, c.handleGPU(ev)...) No tags for now
default:
log.Errorf("cannot handle event for entity %q with kind %q", entityID.ID, entityID.Kind)
}
Expand Down
8 changes: 8 additions & 0 deletions comp/core/workloadmeta/def/component.go
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,14 @@ type Component interface {
// to all entities with kind KindProcess.
ListProcesses() []*Process

// GetGPU returns metadata about a GPU device. It fetches the entity
// with kind KindGPU and the given ID.
GetGPU(id string) (*GPU, error)

// ListGPUs returns metadata about all known GPU devices, equivalent
// to all entities with kind KindGPU.
ListGPUs() []*GPU

// ListProcessesWithFilter returns all the processes for which the passed
// filter evaluates to true.
ListProcessesWithFilter(filterFunc EntityFilterFunc[*Process]) []*Process
Expand Down
62 changes: 62 additions & 0 deletions comp/core/workloadmeta/def/types.go
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@ const (
KindECSTask Kind = "ecs_task"
KindContainerImageMetadata Kind = "container_image_metadata"
KindProcess Kind = "process"
KindGPU Kind = "gpu"
)

// Source is the source name of an entity.
Expand Down Expand Up @@ -1349,3 +1350,64 @@ func (e EventBundle) Acknowledge() {
// InitHelper is the signature of a helper that receives a context, the
// workloadmeta Component and the config component, so that an init hook can
// perform additional start-time configuration. It returns an error on failure.
type InitHelper func(context.Context, Component, config.Component) error

// GPU represents a GPU resource. It embeds EntityID and EntityMeta and adds
// GPU-specific fields.
type GPU struct {
EntityID
EntityMeta
// Vendor is the name of the manufacturer of the device (e.g., NVIDIA).
Vendor string

// Device is the commercial name of the device (e.g., Tesla V100) as returned
// by the device driver (NVML for NVIDIA GPUs). Note that some models might
// have some additional information like the memory size (e.g., Tesla
// A100-SXM2-80GB); the exact format of this field is vendor and device
// specific.
Device string
// ActivePIDs is a list of process IDs associated with this GPU.
// NOTE(review): presumably the processes currently using the device —
// confirm against the collector that populates it. GPU.Merge drops the
// destination's list whenever the incoming entity carries a non-nil one.
ActivePIDs []int
}

// Compile-time check that *GPU satisfies the Entity interface.
var _ Entity = (*GPU)(nil)

// GetID implements Entity#GetID. It returns the EntityID embedded in the GPU.
func (g GPU) GetID() EntityID {
return g.EntityID
}

// Merge implements Entity#Merge. It merges e into g and returns an error when
// e is not a *GPU; otherwise it returns whatever merge reports.
func (g *GPU) Merge(e Entity) error {
	switch src := e.(type) {
	case *GPU:
		// When the incoming entity carries a (non-nil) list of active PIDs,
		// clear ours first so that merge() ends up with the latest list from
		// the source.
		if src.ActivePIDs != nil {
			g.ActivePIDs = nil
		}

		return merge(g, src)
	default:
		return fmt.Errorf("cannot merge GPU with different kind %T", e)
	}
}

// DeepCopy implements Entity#DeepCopy. It returns a pointer to a copy of the
// GPU produced by deepcopy.Copy.
func (g GPU) DeepCopy() Entity {
	clone := new(GPU)
	*clone = deepcopy.Copy(g).(GPU)
	return clone
}

// String implements Entity#String. It renders the entity ID, the entity
// metadata and the GPU-specific fields (vendor, device, active PIDs), each on
// its own line. The verbose flag is forwarded to the embedded EntityID and
// EntityMeta String methods.
func (g GPU) String(verbose bool) string {
	var out strings.Builder

	// emit writes its operands space-separated followed by a newline. The
	// result of Fprintln is deliberately discarded, as in the original code.
	emit := func(args ...interface{}) {
		_, _ = fmt.Fprintln(&out, args...)
	}

	emit("----------- Entity ID -----------")
	emit(g.EntityID.String(verbose))

	emit("----------- Entity Meta -----------")
	emit(g.EntityMeta.String(verbose))

	emit("Vendor:", g.Vendor)
	emit("Device:", g.Device)
	emit("Active PIDs:", g.ActivePIDs)

	return out.String()
}
33 changes: 33 additions & 0 deletions comp/core/workloadmeta/def/types_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -149,3 +149,36 @@ func TestMergeECSContainer(t *testing.T) {
assert.Nil(t, container2.ECSContainer)
assert.EqualValues(t, container1.ECSContainer.DisplayName, "ecs-container-1")
}

// TestMergeGPU checks that merging a second GPU entity with the same ID into
// an existing one fills in a Device that was empty on the destination, keeps
// the Vendor both sides agree on, and replaces (rather than appends to) the
// destination's list of active PIDs.
func TestMergeGPU(t *testing.T) {
	gpu1 := GPU{
		EntityID: EntityID{
			Kind: KindGPU,
			ID:   "gpu-1-id",
		},
		EntityMeta: EntityMeta{
			Name: "gpu-1",
		},
		Vendor:     "nvidia",
		Device:     "",
		ActivePIDs: []int{123, 456},
	}
	gpu2 := GPU{
		EntityID: EntityID{
			Kind: KindGPU,
			ID:   "gpu-1-id",
		},
		EntityMeta: EntityMeta{
			Name: "gpu-1",
		},
		Vendor:     "nvidia",
		Device:     "tesla",
		ActivePIDs: []int{654},
	}

	err := gpu1.Merge(&gpu2)
	assert.NoError(t, err)

	// testify's signature is (t, expected, actual); keeping that order makes
	// the "expected"/"actual" labels in failure output correct.
	assert.Equal(t, "tesla", gpu1.Device)
	assert.ElementsMatch(t, []int{654}, gpu1.ActivePIDs)
	assert.Equal(t, "nvidia", gpu1.Vendor)
}
22 changes: 22 additions & 0 deletions comp/core/workloadmeta/impl/store.go
Original file line number Diff line number Diff line change
Expand Up @@ -401,6 +401,28 @@ func (w *workloadmeta) ListKubernetesMetadata(filterFunc wmdef.EntityFilterFunc[
return metadata
}

// GetGPU implements Store#GetGPU. It looks up the entity of kind KindGPU with
// the given id and returns it as a *wmdef.GPU, or the lookup error if
// getEntityByKind fails.
func (w *workloadmeta) GetGPU(id string) (*wmdef.GPU, error) {
	raw, lookupErr := w.getEntityByKind(wmdef.KindGPU, id)
	if lookupErr != nil {
		return nil, lookupErr
	}

	gpu := raw.(*wmdef.GPU)
	return gpu, nil
}

// ListGPUs implements Store#ListGPUs. It returns every entity of kind KindGPU
// held by the store, converted to *wmdef.GPU. The returned slice is never nil.
func (w *workloadmeta) ListGPUs() []*wmdef.GPU {
	stored := w.listEntitiesByKind(wmdef.KindGPU)

	// The final length is known up front, so fill the slice by index.
	gpus := make([]*wmdef.GPU, len(stored))
	for idx, entity := range stored {
		gpus[idx] = entity.(*wmdef.GPU)
	}

	return gpus
}

// Notify implements Store#Notify
func (w *workloadmeta) Notify(events []wmdef.CollectorEvent) {
if len(events) > 0 {
Expand Down
Loading