Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: calculate the overhead correctly #158

Merged
merged 1 commit into from
Dec 3, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
39 changes: 1 addition & 38 deletions pkg/providers/instancetype/instancetype.go
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,6 @@ import (
"github.com/patrickmn/go-cache"
"github.com/samber/lo"
corev1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/resource"
"k8s.io/apimachinery/pkg/util/sets"
"sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/log"
Expand Down Expand Up @@ -179,11 +178,6 @@ func (p *DefaultProvider) List(ctx context.Context, kc *v1alpha1.KubeletConfigur
return nil, fmt.Errorf("failed to get cluster CNI: %w", err)
}

nodeResourceOverhead, err := p.nodeOverhead(ctx)
if err != nil {
return nil, fmt.Errorf("failed to get node resource overhead: %w", err)
}

result := lo.Map(p.instanceTypesInfo, func(i *ecsclient.DescribeInstanceTypesResponseBodyInstanceTypesInstanceType, _ int) *cloudprovider.InstanceType {
zoneData := lo.Map(allZones.UnsortedList(), func(zoneID string, _ int) ZoneData {
if !p.instanceTypesOfferings[lo.FromPtr(i.InstanceTypeId)].Has(zoneID) || !vSwitchsZones.Has(zoneID) {
Expand All @@ -203,7 +197,7 @@ func (p *DefaultProvider) List(ctx context.Context, kc *v1alpha1.KubeletConfigur
// so that Karpenter is able to cache the set of InstanceTypes based on values that alter the set of instance types
// !!! Important !!!
offers := p.createOfferings(ctx, *i.InstanceTypeId, zoneData)
return NewInstanceType(ctx, nodeResourceOverhead, i, kc, p.region, nodeClass.Spec.SystemDisk, offers, clusterCNI)
return NewInstanceType(ctx, i, kc, p.region, nodeClass.Spec.SystemDisk, offers, clusterCNI)
})

// Filter out nil values
Expand All @@ -213,37 +207,6 @@ func (p *DefaultProvider) List(ctx context.Context, kc *v1alpha1.KubeletConfigur
return result, nil
}

// nodeOverhead derives a conservative per-node resource overhead by listing all
// existing cluster nodes and taking, across them, the maximum observed gap
// between capacity and allocatable for CPU (milli-cores) and memory (bytes).
// Returns an empty ResourceList and the error if the node list cannot be fetched.
func (p *DefaultProvider) nodeOverhead(ctx context.Context) (corev1.ResourceList, error) {
	var nodes corev1.NodeList
	if err := p.kubeClient.List(ctx, &nodes); err != nil {
		return corev1.ResourceList{}, err
	}

	// We are not sure how to compute the exact overhead of a node, so use the
	// maximum overhead observed on any existing node. Underestimating would make
	// new nodes appear smaller than provisioned and could trigger a loop of
	// repeated node creation.
	maxCPUOverHead := int64(0)
	maxMemoryOverHead := int64(0)
	for _, node := range nodes.Items {
		capacity := node.Status.Capacity
		allocatable := node.Status.Allocatable

		// Overhead is whatever the kubelet withholds: capacity minus allocatable.
		cpuOverHead := capacity.Cpu().MilliValue() - allocatable.Cpu().MilliValue()
		memoryOverHead := capacity.Memory().Value() - allocatable.Memory().Value()

		if cpuOverHead > maxCPUOverHead {
			maxCPUOverHead = cpuOverHead
		}
		if memoryOverHead > maxMemoryOverHead {
			maxMemoryOverHead = memoryOverHead
		}
	}

	return corev1.ResourceList{
		corev1.ResourceCPU:    *resource.NewMilliQuantity(maxCPUOverHead, resource.DecimalSI),
		corev1.ResourceMemory: *resource.NewQuantity(maxMemoryOverHead, resource.DecimalSI),
	}, nil
}

func (p *DefaultProvider) UpdateInstanceTypes(ctx context.Context) error {
// DO NOT REMOVE THIS LOCK ----------------------------------------------------------------------------
// We lock here so that multiple callers to getInstanceTypesOfferings do not result in cache misses and multiple
Expand Down
62 changes: 57 additions & 5 deletions pkg/providers/instancetype/types.go
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,52 @@ type ZoneData struct {
Available bool
}

func NewInstanceType(ctx context.Context, overhead corev1.ResourceList,
func calculateResourceOverhead(pods, cpuM, memoryMi int64) corev1.ResourceList {
jwcesign marked this conversation as resolved.
Show resolved Hide resolved
// referring to: https://help.aliyun.com/zh/ack/ack-managed-and-ack-dedicated/user-guide/resource-reservation-policy#0f5ffe176df7q
// CPU overhead calculation
cpuOverHead := calculateCPUOverhead(cpuM)

// TODO: In a real environment, the formula does not produce accurate results,
// consistently yielding values that are 200MiB larger than expected.
// Memory overhead: min(11*pods + 255, memoryMi*0.25)
memoryOverHead := int64(math.Min(float64(11*pods+255), float64(memoryMi)*0.25)) + 200

return corev1.ResourceList{
corev1.ResourceCPU: *resource.NewMilliQuantity(cpuOverHead, resource.DecimalSI),
corev1.ResourceMemory: *resources.Quantity(fmt.Sprintf("%dMi", memoryOverHead)),
}
}

// thresholds defines the tiered CPU reservation policy: for each tier the
// instance reaches (cpuM >= cores), the given fraction of one core (1000m) is
// reserved. Per the ACK resource reservation policy this is 6% of the first
// core, 1% of the second, and 0.5% each of the third and fourth.
var thresholds = [...]struct {
	cores    int64   // minimum milli-CPU for this tier to apply
	overhead float64 // fraction of one core (1000m) reserved by this tier
}{
	{1000, 0.06},
	// BUG FIX: the second-core tier must gate at 2000m, not 3000m. The previous
	// table had {3000, 0.01}, {3000, 0.005}, so 2-core instances reserved only
	// 60m (instead of 70m) and the 1% tier wrongly kicked in with the 3rd core.
	{2000, 0.01},
	{3000, 0.005},
	{4000, 0.005},
}

// calculateCPUOverhead returns the milli-CPU reserved for system overhead on an
// instance with cpuM milli-CPU of capacity, summing the per-core tiers above
// plus 0.25% of every milli-CPU beyond the fourth core.
func calculateCPUOverhead(cpuM int64) int64 {
	var cpuOverHead int64

	// Accumulate the reservation for each full-core tier the instance reaches.
	for _, t := range thresholds {
		if cpuM >= t.cores {
			cpuOverHead += int64(1000 * t.overhead)
		}
	}

	// Capacity beyond 4 cores reserves an additional 0.25%.
	if cpuM > 4000 {
		cpuOverHead += int64(float64(cpuM-4000) * 0.0025)
	}

	return cpuOverHead
}

func NewInstanceType(ctx context.Context,
info *ecsclient.DescribeInstanceTypesResponseBodyInstanceTypesInstanceType,
kc *v1alpha1.KubeletConfiguration, region string, systemDisk *v1alpha1.SystemDisk,
offerings cloudprovider.Offerings, clusterCNI string) *cloudprovider.InstanceType {
Expand All @@ -71,12 +116,15 @@ func NewInstanceType(ctx context.Context, overhead corev1.ResourceList,
Offerings: offerings,
Capacity: computeCapacity(ctx, info, kc.MaxPods, kc.PodsPerCore, systemDisk, clusterCNI),
Overhead: &cloudprovider.InstanceTypeOverhead{
// Follow overhead will be merged, so we can set only one overhead totally
KubeReserved: overhead,
KubeReserved: corev1.ResourceList{},
SystemReserved: corev1.ResourceList{},
EvictionThreshold: corev1.ResourceList{},
},
}

// Follow KubeReserved/SystemReserved/EvictionThreshold will be merged, so we can set only one overhead totally
it.Overhead.KubeReserved = calculateResourceOverhead(it.Capacity.Pods().Value(),
it.Capacity.Cpu().MilliValue(), extractMemory(info).Value()/MiBByteRatio)
if it.Requirements.Compatible(scheduling.NewRequirements(scheduling.NewRequirement(corev1.LabelOSStable, corev1.NodeSelectorOpIn, string(corev1.Windows)))) == nil {
it.Capacity[v1alpha1.ResourcePrivateIPv4Address] = *privateIPv4Address(info)
}
Expand Down Expand Up @@ -206,9 +254,13 @@ func cpu(info *ecsclient.DescribeInstanceTypesResponseBodyInstanceTypesInstanceT
return resources.Quantity(fmt.Sprint(*info.CpuCoreCount))
}

func memory(ctx context.Context, info *ecsclient.DescribeInstanceTypesResponseBodyInstanceTypesInstanceType) *resource.Quantity {
// extractMemory converts the instance type's memory size (reported by ECS in
// GiB) into a resource.Quantity.
func extractMemory(info *ecsclient.DescribeInstanceTypesResponseBodyInstanceTypesInstanceType) *resource.Quantity {
	return resources.Quantity(fmt.Sprintf("%fGi", tea.Float32Value(info.MemorySize)))
}

func memory(ctx context.Context, info *ecsclient.DescribeInstanceTypesResponseBodyInstanceTypesInstanceType) *resource.Quantity {
mem := extractMemory(info)
if mem.IsZero() {
return mem
}
Expand Down
Loading