diff --git a/comp/core/tagger/collectors/workloadmeta_extract.go b/comp/core/tagger/collectors/workloadmeta_extract.go index 062931c0e3311..eee5f075d1a2b 100644 --- a/comp/core/tagger/collectors/workloadmeta_extract.go +++ b/comp/core/tagger/collectors/workloadmeta_extract.go @@ -149,6 +149,8 @@ func (c *WorkloadMetaCollector) processEvents(evBundle workloadmeta.EventBundle) // tagInfos = append(tagInfos, c.handleProcess(ev)...) No tags for now case workloadmeta.KindKubernetesDeployment: tagInfos = append(tagInfos, c.handleKubeDeployment(ev)...) + case workloadmeta.KindGPU: + // tagInfos = append(tagInfos, c.handleGPU(ev)...) No tags for now default: log.Errorf("cannot handle event for entity %q with kind %q", entityID.ID, entityID.Kind) } diff --git a/comp/core/workloadmeta/def/component.go b/comp/core/workloadmeta/def/component.go index 328555d2e525c..fcc04e13c34a0 100644 --- a/comp/core/workloadmeta/def/component.go +++ b/comp/core/workloadmeta/def/component.go @@ -96,6 +96,14 @@ type Component interface { // to all entities with kind KindProcess. ListProcesses() []*Process + // GetGPU returns metadata about a GPU device. It fetches the entity + // with kind KindGPU and the given ID. + GetGPU(id string) (*GPU, error) + + // ListGPUs returns metadata about all known GPU devices, equivalent + // to all entities with kind KindGPU. + ListGPUs() []*GPU + // ListProcessesWithFilter returns all the processes for which the passed // filter evaluates to true. ListProcessesWithFilter(filterFunc EntityFilterFunc[*Process]) []*Process diff --git a/comp/core/workloadmeta/def/types.go b/comp/core/workloadmeta/def/types.go index 47315eca9db06..ad7dbbeda21e3 100644 --- a/comp/core/workloadmeta/def/types.go +++ b/comp/core/workloadmeta/def/types.go @@ -47,6 +47,7 @@ const ( KindECSTask Kind = "ecs_task" KindContainerImageMetadata Kind = "container_image_metadata" KindProcess Kind = "process" + KindGPU Kind = "gpu" ) // Source is the source name of an entity. @@ -1349,3 +1350,64 @@ func (e EventBundle) Acknowledge() { // InitHelper this should be provided as a helper to allow passing the component into // the inithook for additional start-time configutation. type InitHelper func(context.Context, Component, config.Component) error + +// GPU represents a GPU resource. +type GPU struct { + EntityID + EntityMeta + // Vendor is the name of the manufacturer of the device (e.g., NVIDIA) + Vendor string + + // Device is the comercial name of the device (e.g., Tesla V100) as returned + // by the device driver (NVML for NVIDIA GPUs). Note that some models might + // have some additional information like the memory size (e.g., Tesla + // A100-SXM2-80GB), the exact format of this field is vendor and device + // specific. + Device string + ActivePIDs []int +} + +var _ Entity = &GPU{} + +// GetID implements Entity#GetID. +func (g GPU) GetID() EntityID { + return g.EntityID +} + +// Merge implements Entity#Merge. +func (g *GPU) Merge(e Entity) error { + gg, ok := e.(*GPU) + if !ok { + return fmt.Errorf("cannot merge GPU with different kind %T", e) + } + + // If the source has active PIDs, remove the ones from the destination so merge() takes latest active PIDs from the soure + if gg.ActivePIDs != nil { + g.ActivePIDs = nil + } + + return merge(g, gg) +} + +// DeepCopy implements Entity#DeepCopy. +func (g GPU) DeepCopy() Entity { + cp := deepcopy.Copy(g).(GPU) + return &cp +} + +// String implements Entity#String. +func (g GPU) String(verbose bool) string { + var sb strings.Builder + + _, _ = fmt.Fprintln(&sb, "----------- Entity ID -----------") + _, _ = fmt.Fprintln(&sb, g.EntityID.String(verbose)) + + _, _ = fmt.Fprintln(&sb, "----------- Entity Meta -----------") + _, _ = fmt.Fprintln(&sb, g.EntityMeta.String(verbose)) + + _, _ = fmt.Fprintln(&sb, "Vendor:", g.Vendor) + _, _ = fmt.Fprintln(&sb, "Device:", g.Device) + _, _ = fmt.Fprintln(&sb, "Active PIDs:", g.ActivePIDs) + + return sb.String() +} diff --git a/comp/core/workloadmeta/def/types_test.go b/comp/core/workloadmeta/def/types_test.go index d64d49d9e7b4b..27d4d68b44951 100644 --- a/comp/core/workloadmeta/def/types_test.go +++ b/comp/core/workloadmeta/def/types_test.go @@ -149,3 +149,36 @@ func TestMergeECSContainer(t *testing.T) { assert.Nil(t, container2.ECSContainer) assert.EqualValues(t, container1.ECSContainer.DisplayName, "ecs-container-1") } + +func TestMergeGPU(t *testing.T) { + gpu1 := GPU{ + EntityID: EntityID{ + Kind: KindGPU, + ID: "gpu-1-id", + }, + EntityMeta: EntityMeta{ + Name: "gpu-1", + }, + Vendor: "nvidia", + Device: "", + ActivePIDs: []int{123, 456}, + } + gpu2 := GPU{ + EntityID: EntityID{ + Kind: KindGPU, + ID: "gpu-1-id", + }, + EntityMeta: EntityMeta{ + Name: "gpu-1", + }, + Vendor: "nvidia", + Device: "tesla", + ActivePIDs: []int{654}, + } + + err := gpu1.Merge(&gpu2) + assert.NoError(t, err) + assert.Equal(t, gpu1.Device, "tesla") + assert.ElementsMatch(t, gpu1.ActivePIDs, []int{654}) + assert.Equal(t, gpu1.Vendor, "nvidia") +} diff --git a/comp/core/workloadmeta/impl/store.go b/comp/core/workloadmeta/impl/store.go index 5b49df81f228f..b6288f36f453a 100644 --- a/comp/core/workloadmeta/impl/store.go +++ b/comp/core/workloadmeta/impl/store.go @@ -401,6 +401,28 @@ func (w *workloadmeta) ListKubernetesMetadata(filterFunc wmdef.EntityFilterFunc[ return metadata } +// GetGPU implements Store#GetGPU. +func (w *workloadmeta) GetGPU(id string) (*wmdef.GPU, error) { + entity, err := w.getEntityByKind(wmdef.KindGPU, id) + if err != nil { + return nil, err + } + + return entity.(*wmdef.GPU), nil +} + +// ListGPUs implements Store#ListGPUs. +func (w *workloadmeta) ListGPUs() []*wmdef.GPU { + entities := w.listEntitiesByKind(wmdef.KindGPU) + + gpuList := make([]*wmdef.GPU, 0, len(entities)) + for i := range entities { + gpuList = append(gpuList, entities[i].(*wmdef.GPU)) + } + + return gpuList +} + // Notify implements Store#Notify func (w *workloadmeta) Notify(events []wmdef.CollectorEvent) { if len(events) > 0 {