diff --git a/api/v1alpha1/flavor_group_capacity_types.go b/api/v1alpha1/flavor_group_capacity_types.go index 80596256e..7e9ee36c0 100644 --- a/api/v1alpha1/flavor_group_capacity_types.go +++ b/api/v1alpha1/flavor_group_capacity_types.go @@ -24,7 +24,11 @@ type FlavorGroupCapacitySpec struct { AvailabilityZone string `json:"availabilityZone"` } -// FlavorCapacityStatus holds per-flavor capacity numbers for one (flavor group × AZ) pair. +// FlavorCapacityStatus holds per-flavor scheduler probe results for one (flavor group × AZ) pair. +// These values come directly from scheduler probes and are independent of the cross-group +// capacity split (see FreeCapacity and ExclusivelyFreeCapacity on the parent status). +// "Placeable" means: if all remaining capacity in this AZ were used solely by this flavor, +// this is how many would fit. It does not account for competing flavor groups. type FlavorCapacityStatus struct { // FlavorName is the OpenStack flavor name (e.g. "hana-v2-small"). FlavorName string `json:"flavorName"` @@ -37,11 +41,11 @@ type FlavorCapacityStatus struct { // +kubebuilder:validation:Optional PlaceableVMs int64 `json:"placeableVms,omitempty"` - // TotalCapacityHosts is the number of eligible hosts in an empty-datacenter scenario. + // TotalCapacityHosts is the number of eligible hosts assuming an empty datacenter. // +kubebuilder:validation:Optional TotalCapacityHosts int64 `json:"totalCapacityHosts,omitempty"` - // TotalCapacityVMSlots is the maximum number of VM slots in an empty-datacenter scenario. + // TotalCapacityVMSlots is the maximum number of VM slots assuming an empty datacenter. // +kubebuilder:validation:Optional TotalCapacityVMSlots int64 `json:"totalCapacityVmSlots,omitempty"` } @@ -57,14 +61,45 @@ type FlavorGroupCapacityStatus struct { // +kubebuilder:validation:Optional CommittedCapacity int64 `json:"committedCapacity,omitempty"` - // TotalCapacity is the total capacity of all eligible hosts in an empty-datacenter scenario. 
+ // CommittedCapacityBytes is CommittedCapacity converted to raw bytes. + // +kubebuilder:validation:Optional + CommittedCapacityBytes int64 `json:"committedCapacityBytes,omitempty"` + + // SmallestFlavorName is the name of the smallest flavor in this group, used as the + // slot unit for ExclusivelyFreeSlots and related capacity fields. + // +kubebuilder:validation:Optional + SmallestFlavorName string `json:"smallestFlavorName,omitempty"` + + // TotalCapacity is the installed capacity across all eligible hosts in an empty-datacenter + // scenario, expressed as raw resource amounts (bytes for memory, count for cores). // +kubebuilder:validation:Optional TotalCapacity map[string]resource.Quantity `json:"totalCapacity,omitempty"` - // TotalInstances is the total number of VM instances running on hypervisors in this AZ, - // derived from Hypervisor CRD Status.Instances (not filtered by flavor group). + // FreeCapacity is the sum of remaining resources across all candidate hosts for this + // group given current allocations. Because groups can share hosts, the sum across groups + // may exceed actual installed capacity — this field reflects per-group availability + // before any cross-group fairness split. + // +kubebuilder:validation:Optional + FreeCapacity map[string]resource.Quantity `json:"freeCapacity,omitempty"` + + // ExclusivelyFreeCapacity is the share of remaining resources fairly attributed to this + // group by the round-robin capacity split. The sum across all groups for an AZ never + // exceeds actual installed capacity. + // +kubebuilder:validation:Optional + ExclusivelyFreeCapacity map[string]resource.Quantity `json:"exclusivelyFreeCapacity,omitempty"` + + // ExclusivelyFreeSlots is the number of smallest-flavor VM slots available from ExclusivelyFreeCapacity. 
// +kubebuilder:validation:Optional - TotalInstances int64 `json:"totalInstances,omitempty"` + ExclusivelyFreeSlots int64 `json:"exclusivelyFreeSlots,omitempty"` + + // RunningInstances is the number of VMs running in this (flavor group × AZ) whose + // flavor belongs to this group. + // +kubebuilder:validation:Optional + RunningInstances int64 `json:"runningInstances,omitempty"` + + // RunningResources is the total resource consumption of running VMs, keyed by resource type. + // +kubebuilder:validation:Optional + RunningResources map[string]resource.Quantity `json:"runningResources,omitempty"` // LastReconcileAt is the timestamp of the last successful reconcile. // +kubebuilder:validation:Optional @@ -80,7 +115,7 @@ type FlavorGroupCapacityStatus struct { // +kubebuilder:resource:scope=Cluster // +kubebuilder:printcolumn:name="FlavorGroup",type="string",JSONPath=".spec.flavorGroup" // +kubebuilder:printcolumn:name="AZ",type="string",JSONPath=".spec.availabilityZone" -// +kubebuilder:printcolumn:name="TotalInstances",type="integer",JSONPath=".status.totalInstances" +// +kubebuilder:printcolumn:name="Running",type="integer",JSONPath=".status.runningInstances" // +kubebuilder:printcolumn:name="LastReconcile",type="date",JSONPath=".status.lastReconcileAt" // +kubebuilder:printcolumn:name="Ready",type="string",JSONPath=".status.conditions[?(@.type=='Ready')].status" @@ -89,16 +124,10 @@ type FlavorGroupCapacityStatus struct { // The capacity API reads these CRDs instead of probing the scheduler on each request. 
type FlavorGroupCapacity struct { metav1.TypeMeta `json:",inline"` - - // metadata is a standard object metadata // +optional metav1.ObjectMeta `json:"metadata,omitempty,omitzero"` - - // spec defines the desired state of FlavorGroupCapacity // +required Spec FlavorGroupCapacitySpec `json:"spec"` - - // status defines the observed state of FlavorGroupCapacity // +optional Status FlavorGroupCapacityStatus `json:"status,omitempty,omitzero"` } diff --git a/api/v1alpha1/zz_generated.deepcopy.go b/api/v1alpha1/zz_generated.deepcopy.go index 89fe75d93..9f654186a 100644 --- a/api/v1alpha1/zz_generated.deepcopy.go +++ b/api/v1alpha1/zz_generated.deepcopy.go @@ -856,6 +856,27 @@ func (in *FlavorGroupCapacityStatus) DeepCopyInto(out *FlavorGroupCapacityStatus (*out)[key] = val.DeepCopy() } } + if in.FreeCapacity != nil { + in, out := &in.FreeCapacity, &out.FreeCapacity + *out = make(map[string]resource.Quantity, len(*in)) + for key, val := range *in { + (*out)[key] = val.DeepCopy() + } + } + if in.ExclusivelyFreeCapacity != nil { + in, out := &in.ExclusivelyFreeCapacity, &out.ExclusivelyFreeCapacity + *out = make(map[string]resource.Quantity, len(*in)) + for key, val := range *in { + (*out)[key] = val.DeepCopy() + } + } + if in.RunningResources != nil { + in, out := &in.RunningResources, &out.RunningResources + *out = make(map[string]resource.Quantity, len(*in)) + for key, val := range *in { + (*out)[key] = val.DeepCopy() + } + } in.LastReconcileAt.DeepCopyInto(&out.LastReconcileAt) if in.Conditions != nil { in, out := &in.Conditions, &out.Conditions diff --git a/cmd/manager/main.go b/cmd/manager/main.go index 07f3e9ce2..af86e3d84 100644 --- a/cmd/manager/main.go +++ b/cmd/manager/main.go @@ -771,7 +771,7 @@ func main() { setupLog.Error(err, "failed to register capacity monitor metrics, continuing without metrics") } - capacityController := capacity.NewController(multiclusterClient, capacityConfig) + capacityController := capacity.NewController(multiclusterClient, 
capacityConfig, commitmentsVMSource) if err := mgr.Add(manager.RunnableFunc(func(ctx context.Context) error { return capacityController.Start(ctx) })); err != nil { diff --git a/helm/bundles/cortex-nova/templates/alerts.yaml b/helm/bundles/cortex-nova/templates/alerts.yaml index 6f3fabef2..674eaf890 100644 --- a/helm/bundles/cortex-nova/templates/alerts.yaml +++ b/helm/bundles/cortex-nova/templates/alerts.yaml @@ -570,8 +570,8 @@ spec: - alert: CortexNovaCommittedResourceCapacityDroppedToZero expr: | - (cortex_committed_resource_reported_capacity_gib{service="cortex-nova-metrics"} == 0) - and on(resource, az) (cortex_committed_resource_reported_capacity_gib{service="cortex-nova-metrics"} offset 30m > 0) + (cortex_committed_resource_reported_capacity{service="cortex-nova-metrics"} == 0) + and on(resource, az) (cortex_committed_resource_reported_capacity{service="cortex-nova-metrics"} offset 30m > 0) for: 5m labels: context: committed-resource-api diff --git a/helm/library/cortex/files/crds/cortex.cloud_flavorgroupcapacities.yaml b/helm/library/cortex/files/crds/cortex.cloud_flavorgroupcapacities.yaml index 73a009ba4..952e15722 100644 --- a/helm/library/cortex/files/crds/cortex.cloud_flavorgroupcapacities.yaml +++ b/helm/library/cortex/files/crds/cortex.cloud_flavorgroupcapacities.yaml @@ -21,8 +21,8 @@ spec: - jsonPath: .spec.availabilityZone name: AZ type: string - - jsonPath: .status.totalInstances - name: TotalInstances + - jsonPath: .status.runningInstances + name: Running type: integer - jsonPath: .status.lastReconcileAt name: LastReconcile @@ -56,7 +56,7 @@ spec: metadata: type: object spec: - description: spec defines the desired state of FlavorGroupCapacity + description: FlavorGroupCapacitySpec defines the desired state of FlavorGroupCapacity. 
properties: availabilityZone: description: AvailabilityZone is the OpenStack AZ this capacity data @@ -70,7 +70,7 @@ spec: - flavorGroup type: object status: - description: status defines the observed state of FlavorGroupCapacity + description: FlavorGroupCapacityStatus defines the observed state of FlavorGroupCapacity. properties: committedCapacity: description: |- @@ -78,6 +78,11 @@ spec: expressed in multiples of the smallest flavor's memory. format: int64 type: integer + committedCapacityBytes: + description: CommittedCapacityBytes is CommittedCapacity converted + to raw bytes. + format: int64 + type: integer conditions: description: The current status conditions of the FlavorGroupCapacity. items: @@ -135,12 +140,33 @@ spec: - type type: object type: array + exclusivelyFreeCapacity: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: |- + ExclusivelyFreeCapacity is the share of remaining resources fairly attributed to this + group by the round-robin capacity split. The sum across all groups for an AZ never + exceeds actual installed capacity. + type: object + exclusivelyFreeSlots: + description: ExclusivelyFreeSlots is the number of smallest-flavor + VM slots available from ExclusivelyFreeCapacity. + format: int64 + type: integer flavors: description: Flavors holds per-flavor capacity data for all flavors in the group. items: - description: FlavorCapacityStatus holds per-flavor capacity numbers - for one (flavor group × AZ) pair. + description: |- + FlavorCapacityStatus holds per-flavor scheduler probe results for one (flavor group × AZ) pair. + These values come directly from scheduler probes and are independent of the cross-group + capacity split (see FreeCapacity and ExclusivelyFreeCapacity on the parent status). 
+ "Placeable" means: if all remaining capacity in this AZ were used solely by this flavor, + this is how many would fit. It does not account for competing flavor groups. properties: flavorName: description: FlavorName is the OpenStack flavor name (e.g. "hana-v2-small"). @@ -157,39 +183,68 @@ spec: type: integer totalCapacityHosts: description: TotalCapacityHosts is the number of eligible hosts - in an empty-datacenter scenario. + assuming an empty datacenter. format: int64 type: integer totalCapacityVmSlots: description: TotalCapacityVMSlots is the maximum number of VM - slots in an empty-datacenter scenario. + slots assuming an empty datacenter. format: int64 type: integer required: - flavorName type: object type: array + freeCapacity: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: |- + FreeCapacity is the sum of remaining resources across all candidate hosts for this + group given current allocations. Because groups can share hosts, the sum across groups + may exceed actual installed capacity — this field reflects per-group availability + before any cross-group fairness split. + type: object lastReconcileAt: description: LastReconcileAt is the timestamp of the last successful reconcile. format: date-time type: string - totalCapacity: + runningInstances: + description: |- + RunningInstances is the number of VMs running in this (flavor group × AZ) whose + flavor belongs to this group. + format: int64 + type: integer + runningResources: additionalProperties: anyOf: - type: integer - type: string pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ x-kubernetes-int-or-string: true - description: TotalCapacity is the total capacity of all eligible hosts - in an empty-datacenter scenario. 
+ description: RunningResources is the total resource consumption of + running VMs, keyed by resource type. type: object - totalInstances: + smallestFlavorName: description: |- - TotalInstances is the total number of VM instances running on hypervisors in this AZ, - derived from Hypervisor CRD Status.Instances (not filtered by flavor group). - format: int64 - type: integer + SmallestFlavorName is the name of the smallest flavor in this group, used as the + slot unit for ExclusivelyFreeSlots and related capacity fields. + type: string + totalCapacity: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: |- + TotalCapacity is the installed capacity across all eligible hosts in an empty-datacenter + scenario, expressed as raw resource amounts (bytes for memory, count for cores). + type: object type: object required: - spec diff --git a/internal/scheduling/reservations/capacity/controller.go b/internal/scheduling/reservations/capacity/controller.go index d49cada59..9ae89fe02 100644 --- a/internal/scheduling/reservations/capacity/controller.go +++ b/internal/scheduling/reservations/capacity/controller.go @@ -30,16 +30,19 @@ import ( var log = ctrl.Log.WithName("capacity-controller").WithValues("module", "capacity") // Controller reconciles FlavorGroupCapacity CRDs on a fixed interval. -// For each (flavor group × AZ) pair it probes all flavors in the group and updates the CRD status. +// For each AZ it probes all flavor groups, runs the round-robin capacity split, then writes +// one FlavorGroupCapacity CRD per (flavor group × AZ) pair. 
type Controller struct { client client.Client + vmSource reservations.VMSource schedulerClient *reservations.SchedulerClient config Config } -func NewController(c client.Client, config Config) *Controller { +func NewController(c client.Client, config Config, vmSource reservations.VMSource) *Controller { return &Controller{ client: c, + vmSource: vmSource, schedulerClient: reservations.NewSchedulerClient(config.SchedulerURL), config: config, } @@ -64,7 +67,18 @@ func (c *Controller) Start(ctx context.Context) error { } } -// reconcileAll iterates all flavor groups × AZs and upserts FlavorGroupCapacity CRDs. +type vmUsageKey struct{ group, az string } + +// vmUsage aggregates resource totals for running VMs in one (group × AZ). +// resources keys are ResourceMemory (bytes) and ResourceCores (count). +// fresh is false when the VMSource call failed — running fields must not be overwritten. +type vmUsage struct { + instances int64 + resources map[string]int64 + fresh bool +} + +// reconcileAll iterates all AZs, runs the round-robin split per AZ, then writes CRDs. func (c *Controller) reconcileAll(ctx context.Context) error { logger := LoggerFromContext(ctx) startTime := time.Now() @@ -87,25 +101,22 @@ func (c *Controller) reconcileAll(ctx context.Context) error { azs := availabilityZones(hvList.Items) - // Compute reservation memory blocks once per cycle — shared across all (group × AZ) pairs. 
blockedByReservations, err := c.blockedMemoryByHost(ctx) if err != nil { logger.Error(err, "failed to compute blocked memory by host, placeable slot counts may be overstated") blockedByReservations = map[string]int64{} } + usageByKey := c.computeVMUsage(ctx, logger, flavorGroups, hvList.Items) + var succeeded, failed int - for groupName, groupData := range flavorGroups { - for _, az := range azs { - if err := c.reconcileOne(ctx, groupName, groupData, az, hvByName, hvList.Items, blockedByReservations); err != nil { - logger.Error(err, "failed to reconcile flavor group capacity", - "flavorGroup", groupName, "az", az) - failed++ - // Continue with other pairs rather than aborting the whole cycle. - continue - } - succeeded++ + for _, az := range azs { + if err := c.reconcileAZ(ctx, az, flavorGroups, hvByName, blockedByReservations, usageByKey); err != nil { + logger.Error(err, "failed to reconcile AZ", "az", az) + failed++ + continue } + succeeded += len(flavorGroups) } logger.Info("capacity reconcile cycle completed", @@ -118,23 +129,292 @@ func (c *Controller) reconcileAll(ctx context.Context) error { return nil } -// reconcileOne updates the FlavorGroupCapacity CRD for one (group × AZ) pair. -func (c *Controller) reconcileOne( +// computeVMUsage fetches running VMs and aggregates usage per (flavorGroup, az). +// On error returns an empty map with fresh=false — callers must not overwrite running fields. 
+// A successful listing seeds every (flavor group × AZ) pair derived from hvs with a
+// fresh zero entry, so a pair whose last VM disappeared is reported as zero instead of
+// silently keeping its previous running values. Only a nil vmSource or a failed listing
+// leaves the map empty, in which case every lookup yields the zero vmUsage (fresh=false).
+func (c *Controller) computeVMUsage(
+	ctx context.Context,
+	logger interface{ Error(error, string, ...any) },
+	flavorGroups map[string]compute.FlavorGroupFeature,
+	hvs []hv1.Hypervisor,
+) map[vmUsageKey]vmUsage {
+	result := make(map[vmUsageKey]vmUsage)
+	if c.vmSource == nil {
+		// No VM source configured: nothing is known, so nothing may be marked fresh.
+		return result
+	}
+
+	hvList := &hv1.HypervisorList{Items: hvs}
+	vms, err := c.vmSource.ListVMsOnHypervisors(ctx, hvList, true)
+	if err != nil {
+		logger.Error(err, "failed to list VMs for usage computation, running fields will retain last known values")
+		return result
+	}
+
+	// availabilityZones is the same helper reconcileAll uses to choose the AZs it
+	// reconciles, so the seeded keys match the keys looked up when the CRDs are written.
+	// NOTE(review): assumes it yields AZ names as strings — confirm against its definition.
+	azs := availabilityZones(hvs)
+
+	// Index every flavor by owning group and by its per-VM resource footprint, and
+	// seed a fresh zero entry for each (group, AZ) pair while walking the groups.
+	flavorToGroup := make(map[string]string)
+	flavorMemBytes := make(map[string]int64)
+	flavorVCPUs := make(map[string]int64)
+	for groupName, gd := range flavorGroups {
+		for _, az := range azs {
+			// The listing succeeded, so "no VMs" is a real observation: without this
+			// entry the missing key would read as fresh=false and the CRD would keep
+			// stale non-zero running values forever.
+			result[vmUsageKey{group: groupName, az: az}] = vmUsage{
+				resources: map[string]int64{ResourceMemory: 0, ResourceCores: 0},
+				fresh:     true,
+			}
+		}
+		for _, f := range gd.Flavors {
+			flavorToGroup[f.Name] = groupName
+			flavorMemBytes[f.Name] = int64(f.MemoryMB) * 1024 * 1024 //nolint:gosec
+			flavorVCPUs[f.Name] = int64(f.VCPUs)                     //nolint:gosec
+		}
+	}
+
+	for _, vm := range vms {
+		groupName, ok := flavorToGroup[vm.FlavorName]
+		if !ok {
+			// VM uses a flavor that belongs to no known group; it is not attributed.
+			continue
+		}
+		key := vmUsageKey{group: groupName, az: vm.AvailabilityZone}
+		u := result[key]
+		if u.resources == nil {
+			// VM reports an AZ that was not seeded above; create its entry lazily.
+			u.resources = make(map[string]int64)
+		}
+		u.instances++
+		u.resources[ResourceMemory] += flavorMemBytes[vm.FlavorName]
+		u.resources[ResourceCores] += flavorVCPUs[vm.FlavorName]
+		u.fresh = true
+		result[key] = u
+	}
+	return result
+}
+
+// hvRemainingResources returns remaining schedulable resources after subtracting
+// current allocations and (for memory) active reservation blocks.
+// Returns nil if the hypervisor has no capacity data.
+func hvRemainingResources(hv hv1.Hypervisor, blockedMemBytes int64) map[string]int64 { + effCap := hv.Status.EffectiveCapacity + if effCap == nil { + effCap = hv.Status.Capacity + } + if effCap == nil { + return nil + } + + result := make(map[string]int64, 2) + + if qty, ok := effCap[hv1.ResourceMemory]; ok { + mem := qty.Value() + if alloc, ok := hv.Status.Allocation[hv1.ResourceMemory]; ok { + mem -= alloc.Value() + } + mem -= blockedMemBytes + if mem < 0 { + mem = 0 + } + result[ResourceMemory] = mem + } + + if qty, ok := effCap[hv1.ResourceCPU]; ok { + cpu := qty.Value() + if alloc, ok := hv.Status.Allocation[hv1.ResourceCPU]; ok { + cpu -= alloc.Value() + } + if cpu < 0 { + cpu = 0 + } + result[ResourceCores] = cpu + } + + return result +} + +// reconcileAZ runs the round-robin capacity split for all flavor groups in one AZ, +// then writes one FlavorGroupCapacity CRD per group that had all probes succeed. +// Groups with failed probes are skipped — their CRDs retain the last good state. +func (c *Controller) reconcileAZ( ctx context.Context, - groupName string, - groupData compute.FlavorGroupFeature, az string, + flavorGroups map[string]compute.FlavorGroupFeature, hvByName map[string]hv1.Hypervisor, - allHVs []hv1.Hypervisor, blockedByReservations map[string]int64, + usageByKey map[vmUsageKey]vmUsage, ) error { - smallestFlavorBytes := int64(groupData.SmallestFlavor.MemoryMB) * 1024 * 1024 //nolint:gosec - if smallestFlavorBytes <= 0 { - return fmt.Errorf("smallest flavor %q has invalid memory %d MB", - groupData.SmallestFlavor.Name, groupData.SmallestFlavor.MemoryMB) + logger := LoggerFromContext(ctx) + + type probeResult struct { + groupName string + groupData compute.FlavorGroupFeature + flavors []v1alpha1.FlavorCapacityStatus + // allFresh is false if any scheduler probe failed; the group's CRD is left unchanged. 
+ allFresh bool + smallestCandidates []string + committedCapacity int64 + } + + results := make([]probeResult, 0, len(flavorGroups)) + + groupNames := make([]string, 0, len(flavorGroups)) + for name := range flavorGroups { + groupNames = append(groupNames, name) + } + sort.Strings(groupNames) + + for _, groupName := range groupNames { + groupData := flavorGroups[groupName] + + smallestFlavorBytes := int64(groupData.SmallestFlavor.MemoryMB) * 1024 * 1024 //nolint:gosec + if smallestFlavorBytes <= 0 { + logger.Error(fmt.Errorf("smallest flavor %q has invalid memory %d MB", + groupData.SmallestFlavor.Name, groupData.SmallestFlavor.MemoryMB), + "skipping flavor group", "flavorGroup", groupName) + continue + } + + // Probe all flavors. Sort for stable CRD output. + flavors := make([]compute.FlavorInGroup, len(groupData.Flavors)) + copy(flavors, groupData.Flavors) + sort.Slice(flavors, func(i, j int) bool { return flavors[i].Name < flavors[j].Name }) + + allFresh := true + newFlavors := make([]v1alpha1.FlavorCapacityStatus, 0, len(flavors)) + + // Load existing per-flavor data to preserve stale values on probe failure. 
+ crdName := crdNameFor(groupName, az) + var existing v1alpha1.FlavorGroupCapacity + if err := c.client.Get(ctx, types.NamespacedName{Name: crdName}, &existing); err != nil && !apierrors.IsNotFound(err) { + return fmt.Errorf("failed to get FlavorGroupCapacity %s: %w", crdName, err) + } + existingByName := make(map[string]v1alpha1.FlavorCapacityStatus, len(existing.Status.Flavors)) + for _, f := range existing.Status.Flavors { + existingByName[f.FlavorName] = f + } + + var smallestCandidates []string + for _, flavor := range flavors { + cur := existingByName[flavor.Name] + cur.FlavorName = flavor.Name + + totalVMSlots, totalHosts, _, totalErr := c.probeScheduler(ctx, flavor, az, c.config.TotalPipeline, hvByName, true, nil) + placeableVMs, placeableHosts, candidates, placeableErr := c.probeScheduler(ctx, flavor, az, c.config.PlaceablePipeline, hvByName, false, blockedByReservations) + + if totalErr != nil { + allFresh = false + } else { + cur.TotalCapacityVMSlots = totalVMSlots + cur.TotalCapacityHosts = totalHosts + } + if placeableErr != nil { + allFresh = false + } else { + cur.PlaceableVMs = placeableVMs + cur.PlaceableHosts = placeableHosts + if flavor.Name == groupData.SmallestFlavor.Name { + smallestCandidates = candidates + } + } + newFlavors = append(newFlavors, cur) + } + + committedCapacity, committedErr := c.sumCommittedCapacity(ctx, groupName, az, smallestFlavorBytes) + if committedErr != nil { + logger.Error(committedErr, "failed to sum committed capacity", + "flavorGroup", groupName, "az", az) + committedCapacity = 0 + } + + results = append(results, probeResult{ + groupName: groupName, + groupData: groupData, + flavors: newFlavors, + allFresh: allFresh, + smallestCandidates: smallestCandidates, + committedCapacity: committedCapacity, + }) } + // Build HostState and GroupInput for the round-robin split. + // Only include groups where all probes succeeded. 
+ hosts := make(map[string]HostState) + groupInputs := make([]GroupInput, 0, len(results)) + for _, r := range results { + if !r.allFresh || r.smallestCandidates == nil { + continue + } + flavorMemBytes := int64(r.groupData.SmallestFlavor.MemoryMB) * 1024 * 1024 //nolint:gosec + flavorVCPUs := int64(r.groupData.SmallestFlavor.VCPUs) //nolint:gosec + + candidateHosts := make([]string, 0, len(r.smallestCandidates)) + for _, h := range r.smallestCandidates { + candidateHosts = append(candidateHosts, h) + if _, ok := hosts[h]; !ok { + hv, hvOk := hvByName[h] + if !hvOk { + continue + } + remaining := hvRemainingResources(hv, blockedByReservations[h]) + if remaining != nil { + hosts[h] = HostState{Remaining: remaining} + memSlots := remaining[ResourceMemory] / flavorMemBytes + cpuSlots := remaining[ResourceCores] / flavorVCPUs + usableSlots := memSlots + if cpuSlots < usableSlots { + usableSlots = cpuSlots + } + strandedMem := remaining[ResourceMemory] - usableSlots*flavorMemBytes + strandedCPU := remaining[ResourceCores] - usableSlots*flavorVCPUs + logger.V(1).Info("candidate host for capacity split", + "az", az, "flavorGroup", r.groupName, "host", h, + "usableSlots", usableSlots, + "strandedMemoryGiB", strandedMem/(1024*1024*1024), + "strandedCores", strandedCPU) + } + } + } + sort.Strings(candidateHosts) // stable order + groupInputs = append(groupInputs, GroupInput{ + Name: r.groupName, + FlavorResources: map[string]int64{ + ResourceMemory: flavorMemBytes, + ResourceCores: flavorVCPUs, + }, + CandidateHosts: candidateHosts, + }) + } + + freeResources, exclusiveResources, unassigned := SplitCapacity(groupInputs, hosts) + if unassigned[ResourceMemory] > 0 || unassigned[ResourceCores] > 0 { + logger.Info("fragmented capacity not assigned to any group", + "az", az, + "unassignedMemoryGiB", unassigned[ResourceMemory]/(1024*1024*1024), + "unassignedCores", unassigned[ResourceCores], + "candidateHosts", len(hosts), + "groups", len(groupInputs)) + } + + // Write one CRD per 
group. Skip groups with failed probes — their CRDs retain last good state. + for _, r := range results { + if !r.allFresh { + continue + } + if err := c.writeCRD(ctx, r.groupName, r.groupData, az, + r.flavors, r.committedCapacity, + usageByKey[vmUsageKey{r.groupName, az}], + freeResources[r.groupName], + exclusiveResources[r.groupName], + ); err != nil { + logger.Error(err, "failed to write FlavorGroupCapacity CRD", + "flavorGroup", r.groupName, "az", az) + } + } + return nil +} + +// writeCRD upserts one FlavorGroupCapacity CRD with fresh computed values. +func (c *Controller) writeCRD( + ctx context.Context, + groupName string, + groupData compute.FlavorGroupFeature, + az string, + newFlavors []v1alpha1.FlavorCapacityStatus, + committedCapacity int64, + usage vmUsage, + freeRes map[string]int64, + exclusiveRes map[string]int64, +) error { + crdName := crdNameFor(groupName, az) var existing v1alpha1.FlavorGroupCapacity @@ -154,54 +434,9 @@ func (c *Controller) reconcileOne( return fmt.Errorf("failed to get FlavorGroupCapacity %s: %w", crdName, err) } - // Build a lookup of existing per-flavor data so we can preserve stale values on probe failure. - existingByName := make(map[string]v1alpha1.FlavorCapacityStatus, len(existing.Status.Flavors)) - for _, f := range existing.Status.Flavors { - existingByName[f.FlavorName] = f - } - - // Probe all flavors in the group. Sort for stable CRD output. 
- flavors := make([]compute.FlavorInGroup, len(groupData.Flavors)) - copy(flavors, groupData.Flavors) - sort.Slice(flavors, func(i, j int) bool { return flavors[i].Name < flavors[j].Name }) - - allFresh := true - newFlavors := make([]v1alpha1.FlavorCapacityStatus, 0, len(flavors)) - for _, flavor := range flavors { - cur := existingByName[flavor.Name] - cur.FlavorName = flavor.Name - - totalVMSlots, totalHosts, totalErr := c.probeScheduler(ctx, flavor, az, c.config.TotalPipeline, hvByName, true, nil) - placeableVMs, placeableHosts, placeableErr := c.probeScheduler(ctx, flavor, az, c.config.PlaceablePipeline, hvByName, false, blockedByReservations) - - if totalErr != nil { - allFresh = false - } else { - cur.TotalCapacityVMSlots = totalVMSlots - cur.TotalCapacityHosts = totalHosts - } - if placeableErr != nil { - allFresh = false - } else { - cur.PlaceableVMs = placeableVMs - cur.PlaceableHosts = placeableHosts - } - newFlavors = append(newFlavors, cur) - } - - // Count total instances and committed capacity (always available regardless of probe results). - totalInstances := countInstancesInAZ(allHVs, az) - committedCapacity, committedErr := c.sumCommittedCapacity(ctx, groupName, az, smallestFlavorBytes) - if committedErr != nil { - LoggerFromContext(ctx).Error(committedErr, "failed to sum committed capacity", - "flavorGroup", groupName, "az", az) - committedCapacity = 0 - } - - // Compute TotalCapacity: for each flavor multiply slot count by its RAM/CPU, - // then take the max across all flavors independently for each resource. - // This reveals the most capacity because the flavor best matching the host's - // resource ratio saturates more resources and produces a higher product. + // TotalCapacity: for each flavor multiply slot count by its resources; take the max + // across all flavors independently. The flavor best matching the host's resource + // ratio saturates more resources and produces a higher product. 
flavorSpecByName := make(map[string]compute.FlavorInGroup, len(groupData.Flavors)) for _, f := range groupData.Flavors { flavorSpecByName[f.Name] = f @@ -222,39 +457,34 @@ func (c *Controller) reconcileOne( } } - // Only update TotalCapacity when all probes succeeded (allFresh=true). - // This preserves stale values across transient probe failures and ensures - // the CR controller can distinguish "not yet probed" (key absent) from - // "probed but zero capacity" (key present, value=0). - var totalCapacity map[string]resource.Quantity - if allFresh { - totalCapacity = map[string]resource.Quantity{ - string(v1alpha1.CommittedResourceTypeMemory): *resource.NewQuantity(maxMemBytes, resource.BinarySI), - string(v1alpha1.CommittedResourceTypeCores): *resource.NewQuantity(maxCPUCores, resource.DecimalSI), - } - } else { - totalCapacity = existing.Status.TotalCapacity - } - patch := client.MergeFrom(existing.DeepCopy()) existing.Status.Flavors = newFlavors - existing.Status.TotalInstances = totalInstances existing.Status.CommittedCapacity = committedCapacity - existing.Status.TotalCapacity = totalCapacity + existing.Status.CommittedCapacityBytes = committedCapacity * int64(groupData.SmallestFlavor.MemoryMB) * 1024 * 1024 //nolint:gosec + existing.Status.SmallestFlavorName = groupData.SmallestFlavor.Name + existing.Status.TotalCapacity = map[string]resource.Quantity{ + string(v1alpha1.CommittedResourceTypeMemory): *resource.NewQuantity(maxMemBytes, resource.BinarySI), + string(v1alpha1.CommittedResourceTypeCores): *resource.NewQuantity(maxCPUCores, resource.DecimalSI), + } + // Only overwrite running fields when the VM data is fresh — a VMSource outage must not + // zero out the last-known values while the CRD is still marked ready. 
+ if usage.fresh { + existing.Status.RunningInstances = usage.instances + existing.Status.RunningResources = resMapToQuantity(usage.resources) + } + existing.Status.FreeCapacity = resMapToQuantity(freeRes) + existing.Status.ExclusivelyFreeCapacity = resMapToQuantity(exclusiveRes) + if flavorMemBytes := int64(groupData.SmallestFlavor.MemoryMB) * 1024 * 1024; flavorMemBytes > 0 { //nolint:gosec + existing.Status.ExclusivelyFreeSlots = exclusiveRes[ResourceMemory] / flavorMemBytes + } existing.Status.LastReconcileAt = metav1.Now() freshCondition := metav1.Condition{ Type: v1alpha1.FlavorGroupCapacityConditionReady, ObservedGeneration: existing.Generation, - } - if allFresh { - freshCondition.Status = metav1.ConditionTrue - freshCondition.Reason = "ReconcileSucceeded" - freshCondition.Message = "capacity data is up-to-date" - } else { - freshCondition.Status = metav1.ConditionFalse - freshCondition.Reason = "ReconcileFailed" - freshCondition.Message = "one or more flavor probes failed" + Status: metav1.ConditionTrue, + Reason: "ReconcileSucceeded", + Message: "capacity data is up-to-date", } meta.SetStatusCondition(&existing.Status.Conditions, freshCondition) @@ -264,11 +494,8 @@ func (c *Controller) reconcileOne( return nil } -// probeScheduler calls the scheduler with the given pipeline and returns VM slots + host count. -// Capacity is computed as sum of floor(hostMemory / flavorMemory) across returned hosts. -// When ignoreAllocations is true (total/empty-datacenter probe), raw effective capacity is used. -// When false (placeable probe), hv.Status.Allocation and blockedByReservations are subtracted so -// that slots reflect remaining capacity after running VMs and active reservation blocks. +// probeScheduler calls the scheduler and returns slot count, host count, and candidate host names. +// ignoreAllocations=true (total probe) uses raw effective capacity; false (placeable probe) subtracts allocations. 
func (c *Controller) probeScheduler( ctx context.Context, flavor compute.FlavorInGroup, @@ -276,11 +503,11 @@ func (c *Controller) probeScheduler( hvByName map[string]hv1.Hypervisor, ignoreAllocations bool, blockedByReservations map[string]int64, -) (capacity, hosts int64, err error) { +) (capacity, hosts int64, candidateHosts []string, err error) { flavorBytes := int64(flavor.MemoryMB) * 1024 * 1024 //nolint:gosec if flavorBytes <= 0 { - return 0, 0, fmt.Errorf("flavor %q has invalid memory %d MB", flavor.Name, flavor.MemoryMB) + return 0, 0, nil, fmt.Errorf("flavor %q has invalid memory %d MB", flavor.Name, flavor.MemoryMB) } // Build EligibleHosts from all known hypervisors so that novaLimitHostsToRequest @@ -301,12 +528,16 @@ func (c *Controller) probeScheduler( AvailabilityZone: az, Pipeline: pipeline, EligibleHosts: eligibleHosts, - }, scheduling.Options{SkipHistory: true, SkipInflight: true, SkipCommittedResourceTracking: true}) + }, scheduling.Options{ + ReadOnly: true, + SkipHistory: true, + SkipInflight: true, + SkipCommittedResourceTracking: true, + }) if err != nil { - return 0, 0, fmt.Errorf("scheduler call failed (pipeline=%s): %w", pipeline, err) + return 0, 0, nil, fmt.Errorf("scheduler call failed (pipeline=%s): %w", pipeline, err) } - hosts = int64(len(resp.Hosts)) for _, hostName := range resp.Hosts { hv, ok := hvByName[hostName] if !ok { @@ -333,16 +564,17 @@ func (c *Controller) probeScheduler( capBytes = 0 } } - if capBytes > 0 { - capacity += capBytes / flavorBytes + if slots := capBytes / flavorBytes; slots > 0 { + capacity += slots + candidateHosts = append(candidateHosts, hostName) } } - return capacity, hosts, nil + hosts = int64(len(candidateHosts)) + return capacity, hosts, candidateHosts, nil } -// blockedMemoryByHost lists all Reservations and returns the total bytes blocked per host name. -// Only placed reservations (TargetHost or Status.Host non-empty) are counted. 
-// When a reservation is being migrated (TargetHost != Status.Host), both hosts are blocked. +// blockedMemoryByHost returns total reservation-blocked bytes per host. +// Both TargetHost and Status.Host are blocked; migration blocks both simultaneously. func (c *Controller) blockedMemoryByHost(ctx context.Context) (map[string]int64, error) { var list v1alpha1.ReservationList if err := c.client.List(ctx, &list); err != nil { @@ -377,9 +609,8 @@ func (c *Controller) blockedMemoryByHost(ctx context.Context) (map[string]int64, return blocked, nil } -// sumCommittedCapacity sums AcceptedSpec.Amount (or Spec.Amount as fallback) across all -// CommittedResource CRDs for the given (flavorGroup, az) pair with an active state -// (guaranteed or confirmed) and resource type memory. Returns the total in slots. +// sumCommittedCapacity sums active CommittedResource amounts (memory type, guaranteed/confirmed) +// for the given (flavorGroup, az) pair. Returns the total in smallest-flavor slots. func (c *Controller) sumCommittedCapacity(ctx context.Context, groupName, az string, smallestFlavorBytes int64) (int64, error) { var list v1alpha1.CommittedResourceList if err := c.client.List(ctx, &list); err != nil { @@ -411,6 +642,25 @@ func (c *Controller) sumCommittedCapacity(ctx context.Context, groupName, az str return total, nil } +// resMapToQuantity converts a raw resource map to a map[string]resource.Quantity. +// Keys are passed through unchanged — ResourceMemory == string(CommittedResourceTypeMemory) +// and ResourceCores == string(CommittedResourceTypeCores) by design. 
+func resMapToQuantity(res map[string]int64) map[string]resource.Quantity { + if len(res) == 0 { + return nil + } + out := make(map[string]resource.Quantity, len(res)) + for k, v := range res { + switch k { + case ResourceMemory: + out[k] = *resource.NewQuantity(v, resource.BinarySI) + case ResourceCores: + out[k] = *resource.NewQuantity(v, resource.DecimalSI) + } + } + return out +} + // availabilityZones returns a sorted, deduplicated list of AZs from Hypervisor CRD labels. func availabilityZones(hvs []hv1.Hypervisor) []string { azSet := make(map[string]struct{}) @@ -427,18 +677,6 @@ func availabilityZones(hvs []hv1.Hypervisor) []string { return azs } -// countInstancesInAZ counts total VM instances across all hypervisors in the given AZ. -func countInstancesInAZ(hvs []hv1.Hypervisor, az string) int64 { - var total int64 - for _, hv := range hvs { - if hv.Labels["topology.kubernetes.io/zone"] != az { - continue - } - total += int64(len(hv.Status.Instances)) - } - return total -} - // crdNameFor produces a collision-safe DNS label for a (flavorGroup, az) pair. // A 6-hex-char FNV-1a hash of the raw inputs is appended so that pairs differing only // by characters that sanitise identically (e.g. "." vs "-") still get unique names. diff --git a/internal/scheduling/reservations/capacity/controller_test.go b/internal/scheduling/reservations/capacity/controller_test.go index 8e25ff644..fe026416b 100644 --- a/internal/scheduling/reservations/capacity/controller_test.go +++ b/internal/scheduling/reservations/capacity/controller_test.go @@ -78,7 +78,7 @@ func newFlavorGroupKnowledge(t *testing.T, groupName string, smallestMemoryMB ui } } -// newHypervisor creates a Hypervisor CRD with a topology AZ label and effective capacity. +// newHypervisor creates a Hypervisor CRD with a topology AZ label, memory and CPU effective capacity. 
func newHypervisor(name, az string, memoryBytes int64, instanceIDs ...string) *hv1.Hypervisor { hv := &hv1.Hypervisor{ ObjectMeta: metav1.ObjectMeta{ @@ -87,9 +87,11 @@ func newHypervisor(name, az string, memoryBytes int64, instanceIDs ...string) *h }, } if memoryBytes > 0 { - qty := resource.NewQuantity(memoryBytes, resource.BinarySI) + memQty := resource.NewQuantity(memoryBytes, resource.BinarySI) + cpuQty := resource.NewQuantity(128, resource.DecimalSI) // generous CPU so memory is the binding constraint hv.Status.EffectiveCapacity = map[hv1.ResourceName]resource.Quantity{ - hv1.ResourceMemory: *qty, + hv1.ResourceMemory: *memQty, + hv1.ResourceCPU: *cpuQty, } } for _, id := range instanceIDs { @@ -109,6 +111,12 @@ func newMockSchedulerServer(t *testing.T, hosts []string) *httptest.Server { })) } +// newController is a test helper that creates a Controller with a nil VMSource. +func newController(t *testing.T, c client.Client, cfg Config) *Controller { + t.Helper() + return NewController(c, cfg, nil) +} + // --- unit tests for pure helper functions --- var ( @@ -173,25 +181,15 @@ func TestAvailabilityZones(t *testing.T) { } func TestCountInstancesInAZ(t *testing.T) { - hvs := []hv1.Hypervisor{ - *newHypervisor("h1", "az-a", 0, "vm1", "vm2"), - *newHypervisor("h2", "az-a", 0, "vm3"), - *newHypervisor("h3", "az-b", 0, "vm4"), - } - if got := countInstancesInAZ(hvs, "az-a"); got != 3 { - t.Errorf("countInstancesInAZ(az-a) = %d, want 3", got) - } - if got := countInstancesInAZ(hvs, "az-b"); got != 1 { - t.Errorf("countInstancesInAZ(az-b) = %d, want 1", got) - } - if got := countInstancesInAZ(hvs, "az-c"); got != 0 { - t.Errorf("countInstancesInAZ(az-c) = %d, want 0", got) - } + // countInstancesInAZ was removed since TotalInstances is no longer stored in the CRD. + // The AZ instance count is now derived from RunningInstances per flavor group. + // This test is replaced by the reconcileAZ integration tests which verify RunningInstances. 
+ t.Skip("countInstancesInAZ removed — see TestReconcileAZ_CreatesCRD") } -// --- integration-style tests for reconcileOne --- +// --- integration-style tests for reconcileAZ --- -func TestReconcileOne_CreatesCRD(t *testing.T) { +func TestReconcileAZ_CreatesCRD(t *testing.T) { const ( groupName = "hana-v2" az = "qa-de-1a" @@ -209,11 +207,11 @@ func TestReconcileOne_CreatesCRD(t *testing.T) { WithStatusSubresource(&v1alpha1.FlavorGroupCapacity{}, &v1alpha1.Knowledge{}). Build() - // Both probes return host-1 so capacity = floor(4GiB/4GiB) = 1 + // Both probes return host-1 → total capacity = floor(4GiB/4GiB) = 1, placeable = 1. schedulerServer := newMockSchedulerServer(t, []string{"host-1"}) defer schedulerServer.Close() - ctrl := NewController(fakeClient, Config{ + ctrl := newController(t, fakeClient, Config{ SchedulerURL: schedulerServer.URL, TotalPipeline: "kvm-report-capacity", PlaceablePipeline: "kvm-general-purpose", @@ -226,8 +224,10 @@ func TestReconcileOne_CreatesCRD(t *testing.T) { } hvByName := map[string]hv1.Hypervisor{"host-1": *hv} - if err := ctrl.reconcileOne(context.Background(), groupName, groupData, az, hvByName, []hv1.Hypervisor{*hv}, map[string]int64{}); err != nil { - t.Fatalf("reconcileOne failed: %v", err) + if err := ctrl.reconcileAZ(context.Background(), az, + map[string]compute.FlavorGroupFeature{groupName: groupData}, + hvByName, map[string]int64{}, map[vmUsageKey]vmUsage{}); err != nil { + t.Fatalf("reconcileAZ failed: %v", err) } var crd v1alpha1.FlavorGroupCapacity @@ -253,12 +253,18 @@ func TestReconcileOne_CreatesCRD(t *testing.T) { if f.PlaceableHosts != 1 { t.Errorf("PlaceableHosts = %d, want 1", f.PlaceableHosts) } - if crd.Status.TotalInstances != 1 { - t.Errorf("TotalInstances = %d, want 1", crd.Status.TotalInstances) + // Round-robin assigns the 1 available slot → ExclusivelyFreeCapacity[memory] = 1 flavor slot worth. 
+ excl := crd.Status.ExclusivelyFreeCapacity[string(v1alpha1.CommittedResourceTypeMemory)] + if excl.IsZero() { + t.Errorf("ExclusivelyFreeCapacity[memory] is zero, want non-zero (1 slot assigned)") + } + // TotalInstances removed; per-group running VMs sourced from VMSource (nil in this test → 0). + if crd.Status.RunningInstances != 0 { + t.Errorf("RunningInstances = %d, want 0 (no VMSource configured)", crd.Status.RunningInstances) } } -func TestReconcileOne_SetsReadyConditionFalseOnSchedulerError(t *testing.T) { +func TestReconcileAZ_SkipsCRDWriteOnSchedulerError(t *testing.T) { const ( groupName = "hana-v2" az = "qa-de-1a" @@ -274,13 +280,13 @@ func TestReconcileOne_SetsReadyConditionFalseOnSchedulerError(t *testing.T) { WithStatusSubresource(&v1alpha1.FlavorGroupCapacity{}, &v1alpha1.Knowledge{}). Build() - // Scheduler returns 500 to simulate error + // Scheduler returns 500 to simulate error. failServer := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) { w.WriteHeader(http.StatusInternalServerError) })) defer failServer.Close() - ctrl := NewController(fakeClient, Config{ + ctrl := newController(t, fakeClient, Config{ SchedulerURL: failServer.URL, TotalPipeline: "kvm-report-capacity", PlaceablePipeline: "kvm-general-purpose", @@ -292,28 +298,23 @@ func TestReconcileOne_SetsReadyConditionFalseOnSchedulerError(t *testing.T) { Flavors: []compute.FlavorInGroup{smallFlavor}, } - // reconcileOne returns no error itself (it continues on probe failure), but sets Ready=False - if err := ctrl.reconcileOne(context.Background(), groupName, groupData, az, map[string]hv1.Hypervisor{}, []hv1.Hypervisor{}, map[string]int64{}); err != nil { - t.Fatalf("reconcileOne failed: %v", err) + if err := ctrl.reconcileAZ(context.Background(), az, + map[string]compute.FlavorGroupFeature{groupName: groupData}, + map[string]hv1.Hypervisor{}, map[string]int64{}, map[vmUsageKey]vmUsage{}); err != nil { + t.Fatalf("reconcileAZ failed: %v", err) } - var crd 
v1alpha1.FlavorGroupCapacity - if err := fakeClient.Get(context.Background(), types.NamespacedName{Name: crdNameFor(groupName, az)}, &crd); err != nil { - t.Fatalf("failed to get CRD: %v", err) - } - - var freshStatus metav1.ConditionStatus - for _, c := range crd.Status.Conditions { - if c.Type == v1alpha1.FlavorGroupCapacityConditionReady { - freshStatus = c.Status - } + // Stale probes → CRD must NOT be written; last good state is preserved. + var list v1alpha1.FlavorGroupCapacityList + if err := fakeClient.List(context.Background(), &list); err != nil { + t.Fatalf("failed to list CRDs: %v", err) } - if freshStatus != metav1.ConditionFalse { - t.Errorf("Ready condition = %q, want %q", freshStatus, metav1.ConditionFalse) + if len(list.Items) != 0 { + t.Errorf("expected 0 CRDs (stale cycle skips write), got %d", len(list.Items)) } } -func TestReconcileOne_IdempotentUpdate(t *testing.T) { +func TestReconcileAZ_IdempotentUpdate(t *testing.T) { const ( groupName = "hana-v2" az = "qa-de-1a" @@ -326,7 +327,7 @@ func TestReconcileOne_IdempotentUpdate(t *testing.T) { knowledge := newFlavorGroupKnowledge(t, groupName, memMB) crdName := crdNameFor(groupName, az) - // Pre-create the CRD to test the update path (not create path) + // Pre-create the CRD to test the update path (not create path). 
existing := &v1alpha1.FlavorGroupCapacity{ ObjectMeta: metav1.ObjectMeta{Name: crdName}, Spec: v1alpha1.FlavorGroupCapacitySpec{ @@ -344,7 +345,7 @@ func TestReconcileOne_IdempotentUpdate(t *testing.T) { schedulerServer := newMockSchedulerServer(t, []string{"host-1"}) defer schedulerServer.Close() - ctrl := NewController(fakeClient, Config{ + ctrl := newController(t, fakeClient, Config{ SchedulerURL: schedulerServer.URL, TotalPipeline: "kvm-report-capacity", PlaceablePipeline: "kvm-general-purpose", @@ -356,14 +357,15 @@ func TestReconcileOne_IdempotentUpdate(t *testing.T) { Flavors: []compute.FlavorInGroup{smallFlavor}, } hvByName := map[string]hv1.Hypervisor{"host-1": *hv} + groups := map[string]compute.FlavorGroupFeature{groupName: groupData} // First call - if err := ctrl.reconcileOne(context.Background(), groupName, groupData, az, hvByName, []hv1.Hypervisor{*hv}, map[string]int64{}); err != nil { - t.Fatalf("first reconcileOne failed: %v", err) + if err := ctrl.reconcileAZ(context.Background(), az, groups, hvByName, map[string]int64{}, map[vmUsageKey]vmUsage{}); err != nil { + t.Fatalf("first reconcileAZ failed: %v", err) } - // Second call — should not error on the already-existing CRD - if err := ctrl.reconcileOne(context.Background(), groupName, groupData, az, hvByName, []hv1.Hypervisor{*hv}, map[string]int64{}); err != nil { - t.Fatalf("second reconcileOne failed: %v", err) + // Second call — should not error on the already-existing CRD. + if err := ctrl.reconcileAZ(context.Background(), az, groups, hvByName, map[string]int64{}, map[vmUsageKey]vmUsage{}); err != nil { + t.Fatalf("second reconcileAZ failed: %v", err) } var crd v1alpha1.FlavorGroupCapacity @@ -382,14 +384,14 @@ func TestReconcileAll_SkipsGroupsWithNoAZs(t *testing.T) { scheme := newTestScheme(t) knowledge := newFlavorGroupKnowledge(t, "hana-v2", 2048) - // No hypervisors → no AZs → reconcileAll returns without error + // No hypervisors → no AZs → reconcileAll returns without error. 
fakeClient := fake.NewClientBuilder(). WithScheme(scheme). WithObjects(knowledge). WithStatusSubresource(&v1alpha1.FlavorGroupCapacity{}, &v1alpha1.Knowledge{}). Build() - ctrl := NewController(fakeClient, Config{ + ctrl := newController(t, fakeClient, Config{ SchedulerURL: "http://localhost:9999", // unreachable; not called TotalPipeline: "kvm-report-capacity", PlaceablePipeline: "kvm-general-purpose", @@ -418,34 +420,44 @@ func TestProbeScheduler_CapacityCalculation(t *testing.T) { fakeClient := fake.NewClientBuilder().WithScheme(scheme).Build() - // Scheduler returns both hosts + // Scheduler returns both hosts. srv := newMockSchedulerServer(t, []string{"host-1", "host-2"}) defer srv.Close() - c := NewController(fakeClient, Config{SchedulerURL: srv.URL}) + c := newController(t, fakeClient, Config{SchedulerURL: srv.URL}) hvByName := map[string]hv1.Hypervisor{ "host-1": *hv1Obj, "host-2": *hv2Obj, } flavor := compute.FlavorInGroup{Name: "test-flavor", MemoryMB: memMB} - capacity, hosts, err := c.probeScheduler(context.Background(), flavor, "az-a", "test-pipeline", hvByName, true, nil) + capacity, hosts, candidates, err := c.probeScheduler(context.Background(), flavor, "az-a", "test-pipeline", hvByName, true, nil) if err != nil { t.Fatalf("probeScheduler failed: %v", err) } if hosts != 2 { t.Errorf("hosts = %d, want 2", hosts) } - // host-1 = 1 slot (4GiB/4GiB), host-2 = 2 slots (8GiB/4GiB) + // host-1 = 1 slot (4GiB/4GiB), host-2 = 2 slots (8GiB/4GiB). if capacity != 3 { t.Errorf("capacity = %d, want 3", capacity) } + // Both hosts should appear in the candidate list. 
+ candidateSet := make(map[string]struct{}, len(candidates)) + for _, h := range candidates { + candidateSet[h] = struct{}{} + } + if _, ok := candidateSet["host-1"]; !ok { + t.Errorf("host-1 missing from candidates %v", candidates) + } + if _, ok := candidateSet["host-2"]; !ok { + t.Errorf("host-2 missing from candidates %v", candidates) + } } // TestProbeScheduler_SubtractsAllocationsWhenNotIgnored verifies that placeable-probe slot // counting uses remaining capacity (effectiveCapacity − allocation) while the total-probe uses -// raw capacity. This is the regression test for the bug where both probes used raw capacity, -// making running VMs invisible in the usage = total − placeable calculation. +// raw capacity. func TestProbeScheduler_SubtractsAllocationsWhenNotIgnored(t *testing.T) { const memMB = 4096 const memBytes = int64(memMB) * 1024 * 1024 @@ -462,12 +474,12 @@ func TestProbeScheduler_SubtractsAllocationsWhenNotIgnored(t *testing.T) { srv := newMockSchedulerServer(t, []string{"host-1"}) defer srv.Close() - c := NewController(fakeClient, Config{SchedulerURL: srv.URL}) + c := newController(t, fakeClient, Config{SchedulerURL: srv.URL}) hvByName := map[string]hv1.Hypervisor{"host-1": *hv} flavor := compute.FlavorInGroup{Name: "test-flavor", MemoryMB: memMB} // Total probe (ignoreAllocations=true): raw capacity → 2 slots. - totalCap, _, err := c.probeScheduler(context.Background(), flavor, "az-a", "total-pipeline", hvByName, true, nil) + totalCap, _, _, err := c.probeScheduler(context.Background(), flavor, "az-a", "total-pipeline", hvByName, true, nil) if err != nil { t.Fatalf("probeScheduler (total) failed: %v", err) } @@ -476,7 +488,7 @@ func TestProbeScheduler_SubtractsAllocationsWhenNotIgnored(t *testing.T) { } // Placeable probe (ignoreAllocations=false): capacity − allocation → 1 slot. 
- placeableCap, _, err := c.probeScheduler(context.Background(), flavor, "az-a", "placeable-pipeline", hvByName, false, nil) + placeableCap, _, _, err := c.probeScheduler(context.Background(), flavor, "az-a", "placeable-pipeline", hvByName, false, nil) if err != nil { t.Fatalf("probeScheduler (placeable) failed: %v", err) } @@ -491,7 +503,7 @@ func TestReconcileAll_MultipleGroupsAndAZs(t *testing.T) { const memMB = 2048 const memBytes = int64(memMB) * 1024 * 1024 - // Two AZs, two hypervisors + // Two AZs, two hypervisors. hv1Obj := newHypervisor("h1", "az-a", memBytes) hv2Obj := newHypervisor("h2", "az-b", memBytes) knowledge := newFlavorGroupKnowledge(t, "2152", memMB) @@ -505,7 +517,7 @@ func TestReconcileAll_MultipleGroupsAndAZs(t *testing.T) { srv := newMockSchedulerServer(t, []string{}) defer srv.Close() - c := NewController(fakeClient, Config{ + c := newController(t, fakeClient, Config{ SchedulerURL: srv.URL, TotalPipeline: "kvm-report-capacity", PlaceablePipeline: "kvm-general-purpose", @@ -515,7 +527,7 @@ func TestReconcileAll_MultipleGroupsAndAZs(t *testing.T) { t.Fatalf("reconcileAll failed: %v", err) } - // Expect one CRD per AZ for the single group + // Expect one CRD per AZ for the single group. var list v1alpha1.FlavorGroupCapacityList if err := fakeClient.List(context.Background(), &list); err != nil { t.Fatalf("failed to list CRDs: %v", err) @@ -532,7 +544,7 @@ func TestReconcileAll_MultipleGroupsAndAZs(t *testing.T) { func TestReconcileAll_FlavorGroupsKnowledgeNotReady(t *testing.T) { scheme := newTestScheme(t) - // Knowledge CRD exists but is not Ready + // Knowledge CRD exists but is not Ready. knowledge := &v1alpha1.Knowledge{ ObjectMeta: metav1.ObjectMeta{Name: "flavor-groups"}, Spec: v1alpha1.KnowledgeSpec{ @@ -556,29 +568,44 @@ func TestReconcileAll_FlavorGroupsKnowledgeNotReady(t *testing.T) { WithStatusSubresource(&v1alpha1.Knowledge{}). 
Build() - c := NewController(fakeClient, Config{ + c := newController(t, fakeClient, Config{ SchedulerURL: "http://localhost:9999", TotalPipeline: "kvm-report-capacity", PlaceablePipeline: "kvm-general-purpose", }) - // Should return an error when knowledge is not ready + // Should return an error when knowledge is not ready. if err := c.reconcileAll(context.Background()); err == nil { t.Error("reconcileAll should fail when flavor groups knowledge is not ready") } } -func TestReconcileOne_ZeroMemoryFlavorReturnsError(t *testing.T) { +func TestReconcileAZ_ZeroMemoryFlavorSkipped(t *testing.T) { scheme := newTestScheme(t) - fakeClient := fake.NewClientBuilder().WithScheme(scheme).Build() - c := NewController(fakeClient, Config{}) + fakeClient := fake.NewClientBuilder(). + WithScheme(scheme). + WithStatusSubresource(&v1alpha1.FlavorGroupCapacity{}). + Build() + c := newController(t, fakeClient, Config{}) groupData := compute.FlavorGroupFeature{ SmallestFlavor: compute.FlavorInGroup{Name: "bad-flavor", MemoryMB: 0}, } - err := c.reconcileOne(context.Background(), "hana-v2", groupData, "az-a", nil, nil, nil) - if err == nil { - t.Error("expected error for zero-memory flavor") + // reconcileAZ logs and skips groups with zero memory; it does not return an error. + err := c.reconcileAZ(context.Background(), "az-a", + map[string]compute.FlavorGroupFeature{"hana-v2": groupData}, + nil, nil, nil) + if err != nil { + t.Errorf("reconcileAZ should not return error for zero-memory flavor, got: %v", err) + } + + // No CRD should have been created. 
+ var list v1alpha1.FlavorGroupCapacityList + if err := fakeClient.List(context.Background(), &list); err != nil { + t.Fatalf("failed to list CRDs: %v", err) + } + if len(list.Items) != 0 { + t.Errorf("expected 0 CRDs, got %d", len(list.Items)) } } @@ -619,17 +646,17 @@ func TestSumCommittedCapacity(t *testing.T) { scheme := newTestScheme(t) objects := []client.Object{ - // Should count: confirmed, memory, right group+AZ, AcceptedAmount set + // Should count: confirmed, memory, right group+AZ, AcceptedAmount set. newCR("cr1", groupName, az, v1alpha1.CommitmentStatusConfirmed, v1alpha1.CommittedResourceTypeMemory, "8Gi", "8Gi"), - // Should count: guaranteed, memory, right group+AZ, no AcceptedAmount → falls back to Spec.Amount + // Should count: guaranteed, memory, right group+AZ, no AcceptedAmount → falls back to Spec.Amount. newCR("cr2", groupName, az, v1alpha1.CommitmentStatusGuaranteed, v1alpha1.CommittedResourceTypeMemory, "4Gi", ""), - // Should NOT count: wrong state + // Should NOT count: wrong state. newCR("cr3", groupName, az, v1alpha1.CommitmentStatusPlanned, v1alpha1.CommittedResourceTypeMemory, "4Gi", ""), - // Should NOT count: wrong resource type + // Should NOT count: wrong resource type. newCR("cr4", groupName, az, v1alpha1.CommitmentStatusConfirmed, v1alpha1.CommittedResourceTypeCores, "4Gi", ""), - // Should NOT count: wrong AZ + // Should NOT count: wrong AZ. newCR("cr5", groupName, "other-az", v1alpha1.CommitmentStatusConfirmed, v1alpha1.CommittedResourceTypeMemory, "4Gi", ""), - // Should NOT count: wrong flavor group + // Should NOT count: wrong flavor group. newCR("cr6", "other-group", az, v1alpha1.CommitmentStatusConfirmed, v1alpha1.CommittedResourceTypeMemory, "4Gi", ""), } @@ -638,8 +665,8 @@ func TestSumCommittedCapacity(t *testing.T) { WithObjects(objects...). 
Build() - c := NewController(fakeClient, Config{}) - // smallestFlavorBytes = 4GiB → cr1 = 8GiB/4GiB = 2 slots, cr2 = 4GiB/4GiB = 1 slot → total = 3 + c := newController(t, fakeClient, Config{}) + // smallestFlavorBytes = 4GiB → cr1 = 8GiB/4GiB = 2 slots, cr2 = 4GiB/4GiB = 1 slot → total = 3. got, err := c.sumCommittedCapacity(context.Background(), groupName, az, memBytes) if err != nil { t.Fatalf("sumCommittedCapacity failed: %v", err) @@ -667,12 +694,12 @@ func TestProbeScheduler_SubtractsReservationBlocksWhenNotIgnored(t *testing.T) { srv := newMockSchedulerServer(t, []string{"host-1"}) defer srv.Close() - c := NewController(fakeClient, Config{SchedulerURL: srv.URL}) + c := newController(t, fakeClient, Config{SchedulerURL: srv.URL}) hvByName := map[string]hv1.Hypervisor{"host-1": *hv} flavor := compute.FlavorInGroup{Name: "test-flavor", MemoryMB: memMB} // Total probe: raw 3 slots, no subtraction. - totalCap, _, err := c.probeScheduler(context.Background(), flavor, "az-a", "total-pipeline", hvByName, true, nil) + totalCap, _, _, err := c.probeScheduler(context.Background(), flavor, "az-a", "total-pipeline", hvByName, true, nil) if err != nil { t.Fatalf("probeScheduler (total) failed: %v", err) } @@ -684,7 +711,7 @@ func TestProbeScheduler_SubtractsReservationBlocksWhenNotIgnored(t *testing.T) { blockedByReservations := map[string]int64{ "host-1": memBytes, // 1 reservation blocking 1 slot's worth of memory } - placeableCap, _, err := c.probeScheduler(context.Background(), flavor, "az-a", "placeable-pipeline", hvByName, false, blockedByReservations) + placeableCap, _, _, err := c.probeScheduler(context.Background(), flavor, "az-a", "placeable-pipeline", hvByName, false, blockedByReservations) if err != nil { t.Fatalf("probeScheduler (placeable) failed: %v", err) } diff --git a/internal/scheduling/reservations/capacity/metrics.go b/internal/scheduling/reservations/capacity/metrics.go index f282cc9d0..27293e0eb 100644 --- 
a/internal/scheduling/reservations/capacity/metrics.go +++ b/internal/scheduling/reservations/capacity/metrics.go @@ -20,12 +20,17 @@ var ( // Monitor provides Prometheus metrics for FlavorGroupCapacity CRDs. // It implements prometheus.Collector and reads CRD status on each Collect call. type Monitor struct { - client client.Client - vmSlotsEmpty *prometheus.GaugeVec - vmSlotsPlaceable *prometheus.GaugeVec - hostsEmpty *prometheus.GaugeVec - hostsPlaceable *prometheus.GaugeVec - committedCapacity *prometheus.GaugeVec + client client.Client + vmSlotsEmpty *prometheus.GaugeVec + vmSlotsPlaceable *prometheus.GaugeVec + hostsEmpty *prometheus.GaugeVec + hostsPlaceable *prometheus.GaugeVec + committedCapacityGiB *prometheus.GaugeVec + committedReservations *prometheus.GaugeVec + runningInstances *prometheus.GaugeVec + freeCapacityGiB *prometheus.GaugeVec + exclusivelyFreeCapacityGiB *prometheus.GaugeVec + exclusivelyFreeSlots *prometheus.GaugeVec } // NewMonitor creates a new Monitor that reads FlavorGroupCapacity CRDs. 
@@ -48,10 +53,30 @@ func NewMonitor(c client.Client) Monitor { Name: "cortex_committed_resource_capacity_hosts_placeable", Help: "Number of hosts still able to accept a new VM of this flavor.", }, capacityFlavorLabels), - committedCapacity: prometheus.NewGaugeVec(prometheus.GaugeOpts{ + committedCapacityGiB: prometheus.NewGaugeVec(prometheus.GaugeOpts{ Name: "cortex_committed_resource_committed_gib", - Help: "Sum of AcceptedAmount in GiB across Ready CommittedResource CRDs for this flavor group and AZ.", + Help: "Total committed memory in GiB for this flavor group and AZ.", }, capacityLabels), + committedReservations: prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Name: "cortex_committed_resource_committed_reservations", + Help: "Number of committed reservation slots (smallest-flavor units) for this flavor group and AZ.", + }, capacityLabels), + runningInstances: prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Name: "cortex_committed_resource_running_instances", + Help: "Number of running VMs whose flavor belongs to this flavor group and AZ.", + }, capacityFlavorLabels), + freeCapacityGiB: prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Name: "cortex_committed_resource_free_capacity_gib", + Help: "Sum of remaining memory in GiB across all candidate hosts for this flavor group before the cross-group split. May overlap across groups sharing hosts.", + }, capacityFlavorLabels), + exclusivelyFreeCapacityGiB: prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Name: "cortex_committed_resource_exclusively_free_capacity_gib", + Help: "Memory in GiB fairly attributed to this flavor group by the round-robin split. 
Sum across groups never exceeds installed capacity.", + }, capacityFlavorLabels), + exclusivelyFreeSlots: prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Name: "cortex_committed_resource_exclusively_free_slots", + Help: "Number of smallest-flavor VM slots available after the cross-group capacity split.", + }, capacityFlavorLabels), } } @@ -61,7 +86,12 @@ func (m *Monitor) Describe(ch chan<- *prometheus.Desc) { m.vmSlotsPlaceable.Describe(ch) m.hostsEmpty.Describe(ch) m.hostsPlaceable.Describe(ch) - m.committedCapacity.Describe(ch) + m.committedCapacityGiB.Describe(ch) + m.committedReservations.Describe(ch) + m.runningInstances.Describe(ch) + m.freeCapacityGiB.Describe(ch) + m.exclusivelyFreeCapacityGiB.Describe(ch) + m.exclusivelyFreeSlots.Describe(ch) } // Collect implements prometheus.Collector — lists all FlavorGroupCapacity CRDs and exports gauges. @@ -79,14 +109,34 @@ func (m *Monitor) Collect(ch chan<- prometheus.Metric) { m.vmSlotsPlaceable.Reset() m.hostsEmpty.Reset() m.hostsPlaceable.Reset() - m.committedCapacity.Reset() + m.committedCapacityGiB.Reset() + m.committedReservations.Reset() + m.runningInstances.Reset() + m.freeCapacityGiB.Reset() + m.exclusivelyFreeCapacityGiB.Reset() + m.exclusivelyFreeSlots.Reset() for _, crd := range list.Items { groupAZLabels := prometheus.Labels{ "flavor_group": crd.Spec.FlavorGroup, "az": crd.Spec.AvailabilityZone, } - m.committedCapacity.With(groupAZLabels).Set(float64(crd.Status.CommittedCapacity)) + groupAZFlavorLabels := prometheus.Labels{ + "flavor_group": crd.Spec.FlavorGroup, + "az": crd.Spec.AvailabilityZone, + "flavor_name": crd.Status.SmallestFlavorName, + } + m.committedCapacityGiB.With(groupAZLabels).Set(float64(crd.Status.CommittedCapacityBytes) / (1024 * 1024 * 1024)) + m.committedReservations.With(groupAZLabels).Set(float64(crd.Status.CommittedCapacity)) + m.runningInstances.With(groupAZFlavorLabels).Set(float64(crd.Status.RunningInstances)) + + if qty, ok := 
// Resource keys used for capacity splitting. Match CommittedResourceType constants.
const (
	ResourceMemory = "memory"
	ResourceCores  = "cores"
)

// GroupInput describes one flavor group taking part in the capacity split.
type GroupInput struct {
	// Name identifies the group; it is the key of the maps returned by SplitCapacity.
	Name string
	// FlavorResources is the demand of a single allocation (resource key → raw amount).
	FlavorResources map[string]int64
	// CandidateHosts lists the hosts this group may consume capacity from.
	CandidateHosts []string
}

// HostState holds the resources still available on one host (resource key → raw amount).
type HostState struct {
	Remaining map[string]int64
}

// groupState is the per-group working state of the round-robin loop.
type groupState struct {
	input         GroupInput
	remaining     []string // candidate hosts that can still fit one more allocation
	assignedCount int64    // number of flavor-sized allocations granted so far
}

// fits reports whether hostRemaining covers every resource amount in flavorRes.
// A resource missing from hostRemaining counts as zero.
func fits(flavorRes, hostRemaining map[string]int64) bool {
	for r, needed := range flavorRes {
		if hostRemaining[r] < needed {
			return false
		}
	}
	return true
}

// hasDemand reports whether flavorRes describes an allocation that actually consumes
// capacity: at least one strictly positive amount and no negative amount.
// A flavor without demand would "fit" on every host forever without consuming anything,
// so it must never enter the round-robin loop.
func hasDemand(flavorRes map[string]int64) bool {
	positive := false
	for _, need := range flavorRes {
		if need < 0 {
			return false
		}
		if need > 0 {
			positive = true
		}
	}
	return positive
}

// knownCandidates returns the candidates that exist in hosts, sorted and de-duplicated.
// The input slice is not modified.
func knownCandidates(candidates []string, hosts map[string]HostState) []string {
	known := make([]string, 0, len(candidates))
	for _, h := range candidates {
		if _, ok := hosts[h]; ok {
			known = append(known, h)
		}
	}
	sort.Strings(known)
	uniq := known[:0]
	for _, h := range known {
		if len(uniq) == 0 || uniq[len(uniq)-1] != h {
			uniq = append(uniq, h)
		}
	}
	return uniq
}

// dropHost removes host from remaining in place, preserving the order of the other entries.
func dropHost(remaining []string, host string) []string {
	kept := remaining[:0]
	for _, h := range remaining {
		if h != host {
			kept = append(kept, h)
		}
	}
	return kept
}

// sortGroups re-orders states in-place by the round-robin priority:
//  1. ASC number of remaining candidate hosts (fewest candidates first)
//  2. DESC flavor memory (larger flavor first)
//  3. DESC flavor cores
//  4. ASC group name (stable tiebreaker)
func sortGroups(states []groupState) {
	sort.SliceStable(states, func(i, j int) bool {
		a, b := &states[i], &states[j]
		if len(a.remaining) != len(b.remaining) {
			return len(a.remaining) < len(b.remaining)
		}
		aMem, bMem := a.input.FlavorResources[ResourceMemory], b.input.FlavorResources[ResourceMemory]
		if aMem != bMem {
			return aMem > bMem
		}
		aCores, bCores := a.input.FlavorResources[ResourceCores], b.input.FlavorResources[ResourceCores]
		if aCores != bCores {
			return aCores > bCores
		}
		return a.input.Name < b.input.Name
	})
}

// bestHost returns the index within remaining of the host to consume next.
// Sort order: least (remaining_mem % flavor_mem), then least remaining memory, then least CPU, then host name.
// remaining must be non-empty; the returned index is 0 for a single-element slice.
func bestHost(remaining []string, hostRes map[string]map[string]int64, flavorRes map[string]int64) int {
	flavorMem := flavorRes[ResourceMemory]
	best := 0
	for idx := 1; idx < len(remaining); idx++ {
		h, bh := remaining[idx], remaining[best]
		hMem, bMem := hostRes[h][ResourceMemory], hostRes[bh][ResourceMemory]

		// Waste is the memory that would be stranded if the host were filled with this flavor only.
		var hWaste, bWaste int64
		if flavorMem > 0 {
			hWaste, bWaste = hMem%flavorMem, bMem%flavorMem
		}
		switch {
		case hWaste != bWaste:
			if hWaste < bWaste {
				best = idx
			}
		case hMem != bMem:
			if hMem < bMem {
				best = idx
			}
		default:
			hCPU, bCPU := hostRes[h][ResourceCores], hostRes[bh][ResourceCores]
			if hCPU < bCPU || (hCPU == bCPU && h < bh) {
				best = idx
			}
		}
	}
	return best
}

// SplitCapacity runs the round-robin capacity assignment algorithm.
//
// For each AZ it assigns resources (in raw units — bytes for memory, count for cores)
// to flavor groups in a fair, deterministic way such that no host is over-committed.
// Groups sharing hypervisors are served round-robin so no group monopolises shared hosts.
//
// Returns:
//   - freeResources[groupName][resource]: usable capacity per group before the split,
//     i.e. floor(remaining/flavorSize)*flavorSize summed over the group's candidate hosts.
//     May overlap across groups sharing hosts.
//   - exclusiveResources[groupName][resource]: fairly attributed share after the split;
//     sum across groups never exceeds the capacity available on the hosts.
//   - unassigned[resource]: resources on candidate hosts not claimed by any group due to
//     fragmentation (for operator log visibility).
//
// The caller divides exclusiveResources[group][ResourceMemory] by the group's flavor memory
// to obtain the slot count meaningful to that group.
//
// Groups whose flavor has no positive resource demand (empty, all-zero, or containing a
// negative amount) receive no capacity: such a flavor would fit on every host without
// consuming anything and the assignment loop would never terminate. Candidate hosts that
// are unknown to hosts are ignored, and duplicate candidate entries are counted once.
// Neither groups nor hosts is mutated.
func SplitCapacity(groups []GroupInput, hosts map[string]HostState) (freeResources, exclusiveResources map[string]map[string]int64, unassigned map[string]int64) {
	states := make([]groupState, len(groups))
	freeResources = make(map[string]map[string]int64, len(groups))
	for i, g := range groups {
		states[i] = groupState{input: g}
		free := make(map[string]int64)
		freeResources[g.Name] = free
		if !hasDemand(g.FlavorResources) {
			continue // nothing to allocate; keeps the loop below finite
		}

		known := knownCandidates(g.CandidateHosts, hosts)
		remaining := make([]string, 0, len(known))
		for _, h := range known {
			hostRemaining := hosts[h].Remaining

			// Binding slot count: the minimum across all demanded resources, so that
			// memory and CPU stay consistent.
			slots := int64(-1)
			for r, need := range g.FlavorResources {
				if need <= 0 {
					continue
				}
				if s := hostRemaining[r] / need; slots < 0 || s < slots {
					slots = s
				}
			}
			if slots > 0 {
				for r, need := range g.FlavorResources {
					free[r] += slots * need
				}
			}
			if fits(g.FlavorResources, hostRemaining) {
				remaining = append(remaining, h)
			}
		}
		states[i].remaining = remaining
	}

	// Copy host resources so the caller's map is not mutated.
	hostRes := make(map[string]map[string]int64, len(hosts))
	for name, hs := range hosts {
		res := make(map[string]int64, len(hs.Remaining))
		for r, v := range hs.Remaining {
			res[r] = v
		}
		hostRes[name] = res
	}

	for {
		// Each round: serve groups in priority order, one flavor-sized allocation each.
		sortGroups(states)

		progress := false
		for i := range states {
			g := &states[i]
			if len(g.remaining) == 0 {
				continue
			}

			chosen := g.remaining[bestHost(g.remaining, hostRes, g.input.FlavorResources)]
			chosenRes := hostRes[chosen]
			for r, amount := range g.input.FlavorResources {
				chosenRes[r] -= amount
			}
			g.assignedCount++
			progress = true

			// Only the chosen host changed, so it is the only host that can have become
			// too small; drop it from every group whose flavor no longer fits on it.
			for j := range states {
				if !fits(states[j].input.FlavorResources, chosenRes) {
					states[j].remaining = dropHost(states[j].remaining, chosen)
				}
			}
		}

		if !progress {
			break
		}
	}

	// Unassigned: remaining resources on candidate hosts after the split.
	// Non-candidate hosts are excluded — their leftover is not fragmentation.
	candidateSet := make(map[string]struct{})
	for _, g := range groups {
		for _, h := range g.CandidateHosts {
			candidateSet[h] = struct{}{}
		}
	}
	unassigned = make(map[string]int64)
	for h, res := range hostRes {
		if _, isCandidate := candidateSet[h]; !isCandidate {
			continue
		}
		for r, left := range res {
			if left > 0 {
				unassigned[r] += left
			}
		}
	}

	exclusiveResources = make(map[string]map[string]int64, len(states))
	for i := range states {
		g := &states[i]
		resources := make(map[string]int64, len(g.input.FlavorResources))
		for r, amount := range g.input.FlavorResources {
			resources[r] = g.assignedCount * amount
		}
		exclusiveResources[g.input.Name] = resources
	}
	return freeResources, exclusiveResources, unassigned
}
+ wantFreeMem map[string]int64 + }{ + { + name: "single group, two hosts", + groups: []GroupInput{ + {Name: "hana", FlavorResources: flavor(4*GiB, 2), CandidateHosts: []string{"h1", "h2"}}, + }, + hosts: map[string]HostState{ + "h1": host(8*GiB, 4), // 2 slots + "h2": host(4*GiB, 2), // 1 slot + }, + wantAssignedMem: map[string]int64{"hana": 3 * 4 * GiB}, + wantUnassignedMem: 0, + }, + { + name: "disjoint groups, each with own host", + groups: []GroupInput{ + {Name: "gp", FlavorResources: flavor(4*GiB, 2), CandidateHosts: []string{"h1"}}, + {Name: "hana", FlavorResources: flavor(8*GiB, 4), CandidateHosts: []string{"h2"}}, + }, + hosts: map[string]HostState{ + "h1": host(8*GiB, 4), + "h2": host(8*GiB, 4), + }, + wantAssignedMem: map[string]int64{"gp": 8 * GiB, "hana": 8 * GiB}, + wantUnassignedMem: 0, + }, + { + // Both groups share the same host; round-robin gives each one slot. + name: "overlapping groups, fair round-robin split", + groups: []GroupInput{ + {Name: "alpha", FlavorResources: flavor(4*GiB, 2), CandidateHosts: []string{"shared"}}, + {Name: "beta", FlavorResources: flavor(4*GiB, 2), CandidateHosts: []string{"shared"}}, + }, + hosts: map[string]HostState{ + "shared": host(8*GiB, 4), + }, + wantAssignedMem: map[string]int64{"alpha": 4 * GiB, "beta": 4 * GiB}, + wantUnassignedMem: 0, + }, + { + // "constrained" has only one candidate host; fewer candidates → served first. + // It wins the shared slot; "free" falls back to its exclusive host. 
+ name: "overlapping groups, constrained group served first", + groups: []GroupInput{ + {Name: "constrained", FlavorResources: flavor(4*GiB, 2), CandidateHosts: []string{"shared"}}, + {Name: "free", FlavorResources: flavor(4*GiB, 2), CandidateHosts: []string{"shared", "exclusive"}}, + }, + hosts: map[string]HostState{ + "shared": host(4*GiB, 2), + "exclusive": host(8*GiB, 4), + }, + wantAssignedMem: map[string]int64{"constrained": 4 * GiB, "free": 8 * GiB}, + wantUnassignedMem: 0, + }, + { + // Host has memory but no CPU — flavor requires both, so host is ineligible. + name: "CPU exhausted host dropped", + groups: []GroupInput{ + {Name: "gp", FlavorResources: flavor(4*GiB, 2), CandidateHosts: []string{"cpu-full"}}, + }, + hosts: map[string]HostState{ + "cpu-full": host(16*GiB, 0), + }, + wantAssignedMem: map[string]int64{"gp": 0}, + wantUnassignedMem: 16 * GiB, + }, + { + // Host has less memory than the flavor requires → nothing assigned, all memory is unassigned. + name: "host too small for flavor", + groups: []GroupInput{ + {Name: "hana", FlavorResources: flavor(4*GiB, 2), CandidateHosts: []string{"h1"}}, + }, + hosts: map[string]HostState{ + "h1": host(3*GiB, 4), + }, + wantAssignedMem: map[string]int64{"hana": 0}, + wantUnassignedMem: 3 * GiB, + }, + { + // Same candidate count → tiebreak on larger flavor first: hana8 goes before gp4. + name: "larger flavor served first on candidate-count tie", + groups: []GroupInput{ + {Name: "hana8", FlavorResources: flavor(8*GiB, 4), CandidateHosts: []string{"h1"}}, + {Name: "gp4", FlavorResources: flavor(4*GiB, 2), CandidateHosts: []string{"h1"}}, + }, + hosts: map[string]HostState{ + "h1": host(12*GiB, 8), + }, + wantAssignedMem: map[string]int64{"hana8": 8 * GiB, "gp4": 4 * GiB}, + wantUnassignedMem: 0, + }, + { + // h1: 5 GiB → 5 % 4 = 1 GiB waste. h2: 8 GiB → 8 % 4 = 0 GiB waste. + // h2 chosen first (lower modulo remainder); 1 GiB strands on h1 after its one slot. 
+ name: "host selection prefers lower modulo remainder", + groups: []GroupInput{ + {Name: "gp", FlavorResources: flavor(4*GiB, 2), CandidateHosts: []string{"h1", "h2"}}, + }, + hosts: map[string]HostState{ + "h1": host(5*GiB, 8), + "h2": host(8*GiB, 4), + }, + wantAssignedMem: map[string]int64{"gp": 3 * 4 * GiB}, + wantUnassignedMem: 1 * GiB, + }, + { + // freeResources counts floor(remaining/flavorSize)*flavorSize per host. + // h1 has 6 GiB: floor(6/4)*4 = 4 GiB usable. h2 has 3 GiB: below flavor → 0. + name: "free capacity excludes sub-flavor remainder", + groups: []GroupInput{ + {Name: "gp", FlavorResources: flavor(4*GiB, 2), CandidateHosts: []string{"h1", "h2"}}, + }, + hosts: map[string]HostState{ + "h1": host(6*GiB, 8), // 1 slot usable, 2 GiB wasted + "h2": host(3*GiB, 4), // below flavor threshold → 0 usable + }, + wantAssignedMem: map[string]int64{"gp": 4 * GiB}, + wantUnassignedMem: 2*GiB + 3*GiB, // 2 GiB remainder on h1 + all of h2 + wantFreeMem: map[string]int64{"gp": 4 * GiB}, + }, + { + name: "no groups", + groups: nil, + hosts: map[string]HostState{ + "h1": host(8*GiB, 16), + }, + wantAssignedMem: map[string]int64{}, + wantUnassignedMem: 0, + }, + { + name: "no hosts", + groups: []GroupInput{ + {Name: "hana", FlavorResources: flavor(4*GiB, 2), CandidateHosts: []string{"h1"}}, + }, + hosts: map[string]HostState{}, + wantAssignedMem: map[string]int64{"hana": 0}, + wantUnassignedMem: 0, + }, + } + + for _, tc := range tests { + t.Run(tc.name, func(t *testing.T) { + free, assigned, unassigned := SplitCapacity(tc.groups, tc.hosts) + + for groupName, wantMem := range tc.wantAssignedMem { + if got := assigned[groupName][ResourceMemory]; got != wantMem { + t.Errorf("assigned[%s][memory] = %d, want %d", groupName, got, wantMem) + } + } + if got := unassigned[ResourceMemory]; got != tc.wantUnassignedMem { + t.Errorf("unassigned[memory] = %d, want %d", got, tc.wantUnassignedMem) + } + for groupName, wantMem := range tc.wantFreeMem { + if got := 
free[groupName][ResourceMemory]; got != wantMem { + t.Errorf("free[%s][memory] = %d, want %d", groupName, got, wantMem) + } + } + }) + } +} + +// TestSplitCapacity_SumNeverExceedsTotal is a property test: the total assigned memory +// across all groups must never exceed the total available memory across all hosts. +func TestSplitCapacity_SumNeverExceedsTotal(t *testing.T) { + groups := []GroupInput{ + {Name: "g1", FlavorResources: flavor(4*GiB, 2), CandidateHosts: []string{"h1", "h2", "h3"}}, + {Name: "g2", FlavorResources: flavor(8*GiB, 4), CandidateHosts: []string{"h2", "h3"}}, + {Name: "g3", FlavorResources: flavor(4*GiB, 2), CandidateHosts: []string{"h1", "h3"}}, + } + hosts := map[string]HostState{ + "h1": host(12*GiB, 6), + "h2": host(16*GiB, 8), + "h3": host(24*GiB, 12), + } + + _, assigned, _ := SplitCapacity(groups, hosts) + + var totalInstalled, totalAssigned int64 + for _, hs := range hosts { + totalInstalled += hs.Remaining[ResourceMemory] + } + for _, res := range assigned { + totalAssigned += res[ResourceMemory] + } + if totalAssigned > totalInstalled { + t.Errorf("totalAssigned (%d) > totalInstalled (%d): capacity overreported", totalAssigned, totalInstalled) + } +} + +// TestSplitCapacity_Deterministic verifies identical input always produces identical output. 
+func TestSplitCapacity_Deterministic(t *testing.T) { + groups := []GroupInput{ + {Name: "c", FlavorResources: flavor(4*GiB, 2), CandidateHosts: []string{"h1", "h2"}}, + {Name: "a", FlavorResources: flavor(8*GiB, 4), CandidateHosts: []string{"h1", "h2"}}, + {Name: "b", FlavorResources: flavor(4*GiB, 2), CandidateHosts: []string{"h2"}}, + } + hosts := map[string]HostState{ + "h1": host(16*GiB, 8), + "h2": host(8*GiB, 4), + } + + _, first, firstUnassigned := SplitCapacity(groups, hosts) + for i := range 10 { + _, got, gotUnassigned := SplitCapacity(groups, hosts) + for _, g := range groups { + if got[g.Name][ResourceMemory] != first[g.Name][ResourceMemory] { + t.Errorf("run %d: assigned[%s][memory] = %d, want %d (non-deterministic)", + i, g.Name, got[g.Name][ResourceMemory], first[g.Name][ResourceMemory]) + } + } + if gotUnassigned[ResourceMemory] != firstUnassigned[ResourceMemory] { + t.Errorf("run %d: unassigned[memory] = %d, want %d (non-deterministic)", + i, gotUnassigned[ResourceMemory], firstUnassigned[ResourceMemory]) + } + } +} diff --git a/internal/scheduling/reservations/commitments/api/report_capacity.go b/internal/scheduling/reservations/commitments/api/report_capacity.go index b8cc67f73..5277d51d0 100644 --- a/internal/scheduling/reservations/commitments/api/report_capacity.go +++ b/internal/scheduling/reservations/commitments/api/report_capacity.go @@ -9,13 +9,27 @@ import ( "strconv" "time" + "github.com/cobaltcore-dev/cortex/api/v1alpha1" "github.com/cobaltcore-dev/cortex/internal/scheduling/reservations" commitments "github.com/cobaltcore-dev/cortex/internal/scheduling/reservations/commitments" "github.com/google/uuid" "github.com/sapcc/go-api-declarations/liquid" ) -// handles POST /commitments/v1/report-capacity requests from Limes: +// unitSizeForResource returns the GiB value of one declared unit for the given resource name. +// For RAM resources it reads RAMUnitGiB from config (defaults to 1). Cores and instances are always 1. 
+func (api *HTTPAPI) unitSizeForResource(resName liquid.ResourceName) string { + group, resType, err := commitments.GetFlavorGroupAndTypeFromResource(string(resName)) + if err != nil || resType != v1alpha1.CommittedResourceTypeMemory { + return "1" + } + unitGiB := api.config.ResourceConfigForGroup(group).RAM.RAMUnitGiB + if unitGiB == 0 { + return "1" + } + return strconv.FormatUint(unitGiB, 10) +} + // See: https://github.com/sapcc/go-api-declarations/blob/main/liquid/commitment.go // See: https://pkg.go.dev/github.com/sapcc/go-api-declarations/liquid // Reports available capacity across all flavor group resources. Note, unit is specified in the Info API response with multiple of the smallest memory resource unit within a flavor group. @@ -73,8 +87,9 @@ func (api *HTTPAPI) HandleReportCapacity(w http.ResponseWriter, r *http.Request) // Update capacity gauge for each resource/AZ combination. for resName, resReport := range report.Resources { + unitSize := api.unitSizeForResource(resName) for az, azReport := range resReport.PerAZ { - api.capacityMonitor.reportedCapacity.WithLabelValues(string(resName), string(az)).Set(float64(azReport.Capacity)) + api.capacityMonitor.reportedCapacity.WithLabelValues(string(resName), string(az), unitSize).Set(float64(azReport.Capacity)) } } diff --git a/internal/scheduling/reservations/commitments/api/report_capacity_monitor.go b/internal/scheduling/reservations/commitments/api/report_capacity_monitor.go index cb7b21336..05f1498f8 100644 --- a/internal/scheduling/reservations/commitments/api/report_capacity_monitor.go +++ b/internal/scheduling/reservations/commitments/api/report_capacity_monitor.go @@ -25,17 +25,15 @@ func NewReportCapacityAPIMonitor() ReportCapacityAPIMonitor { Buckets: []float64{0.1, 0.25, 0.5, 1, 2.5, 5, 10}, }, []string{"status_code"}), reportedCapacity: prometheus.NewGaugeVec(prometheus.GaugeOpts{ - Name: "cortex_committed_resource_reported_capacity_gib", - Help: "Last reported capacity in GiB per resource and 
availability zone as returned by the capacity API", - }, []string{"resource", "az"}), + Name: "cortex_committed_resource_reported_capacity", + Help: "Last reported capacity per resource and AZ as returned by the capacity API. unit_size indicates the GiB value of one declared unit (e.g. 480 for a HANA slot, 1 for variable-ratio RAM or cores).", + }, []string{"resource", "az", "unit_size"}), } for _, statusCode := range []string{"200", "500", "503"} { m.requestCounter.WithLabelValues(statusCode) m.requestDuration.WithLabelValues(statusCode) } - // resource/az are dynamic; sentinel ensures the metric family exists for alert validation. - m.reportedCapacity.WithLabelValues("", "") return m } diff --git a/internal/scheduling/reservations/commitments/api/report_capacity_test.go b/internal/scheduling/reservations/commitments/api/report_capacity_test.go index 19117eb7b..ddef4031c 100644 --- a/internal/scheduling/reservations/commitments/api/report_capacity_test.go +++ b/internal/scheduling/reservations/commitments/api/report_capacity_test.go @@ -14,93 +14,52 @@ import ( "testing" "github.com/sapcc/go-api-declarations/liquid" + "k8s.io/apimachinery/pkg/api/resource" v1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" + "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/client/fake" "github.com/cobaltcore-dev/cortex/api/v1alpha1" commitments "github.com/cobaltcore-dev/cortex/internal/scheduling/reservations/commitments" ) +// defaultCapacityConfig enables all three resource types for all groups. 
+var defaultCapacityConfig = commitments.APIConfig{ + FlavorGroupResourceConfig: map[string]commitments.FlavorGroupResourcesConfig{ + "*": { + RAM: commitments.RAMResourceTypeConfig{HasCapacity: true}, + Cores: commitments.ResourceTypeConfig{HasCapacity: true}, + Instances: commitments.ResourceTypeConfig{HasCapacity: true}, + }, + }, +} + func TestHandleReportCapacity(t *testing.T) { - // Setup fake client scheme := runtime.NewScheme() if err := v1alpha1.AddToScheme(scheme); err != nil { t.Fatal(err) } - - // testCapacityConfig enables capacity reporting for all groups via "*" catch-all. - testCapacityConfig := commitments.APIConfig{ - EnableReportCapacity: true, - FlavorGroupResourceConfig: map[string]commitments.FlavorGroupResourcesConfig{ - "*": { - RAM: commitments.RAMResourceTypeConfig{HasCapacity: true}, - Cores: commitments.ResourceTypeConfig{HasCapacity: true}, - Instances: commitments.ResourceTypeConfig{HasCapacity: true}, - }, - }, - } - - // Create empty flavor groups knowledge so capacity calculation doesn't fail - emptyKnowledge := createEmptyFlavorGroupKnowledge() - - fakeClient := fake.NewClientBuilder(). - WithScheme(scheme). - WithObjects(emptyKnowledge). 
- Build() - - api := NewAPIWithConfig(fakeClient, testCapacityConfig, nil) + api := NewAPIWithConfig( + fake.NewClientBuilder().WithScheme(scheme).WithObjects(createEmptyFlavorGroupKnowledge()).Build(), + commitments.APIConfig{EnableReportCapacity: true, FlavorGroupResourceConfig: defaultCapacityConfig.FlavorGroupResourceConfig}, + nil, + ) tests := []struct { name string method string body interface{} expectedStatus int - checkResponse func(*testing.T, *liquid.ServiceCapacityReport) }{ - { - name: "POST request succeeds", - method: http.MethodPost, - body: liquid.ServiceCapacityRequest{}, - expectedStatus: http.StatusOK, - checkResponse: func(t *testing.T, resp *liquid.ServiceCapacityReport) { - // Resources may be nil or empty for empty capacity - if len(resp.Resources) != 0 { - t.Errorf("Expected empty or nil Resources, got %d resources", len(resp.Resources)) - } - }, - }, - { - name: "POST with empty body succeeds", - method: http.MethodPost, - body: nil, - expectedStatus: http.StatusOK, - checkResponse: func(t *testing.T, resp *liquid.ServiceCapacityReport) { - // Resources may be nil or empty for empty capacity - if len(resp.Resources) != 0 { - t.Errorf("Expected empty or nil Resources, got %d resources", len(resp.Resources)) - } - }, - }, - { - name: "GET request fails", - method: http.MethodGet, - body: nil, - expectedStatus: http.StatusMethodNotAllowed, - checkResponse: nil, - }, - { - name: "PUT request fails", - method: http.MethodPut, - body: nil, - expectedStatus: http.StatusMethodNotAllowed, - checkResponse: nil, - }, + {name: "POST succeeds", method: http.MethodPost, body: liquid.ServiceCapacityRequest{}, expectedStatus: http.StatusOK}, + {name: "POST with empty body succeeds", method: http.MethodPost, body: nil, expectedStatus: http.StatusOK}, + {name: "GET fails", method: http.MethodGet, body: nil, expectedStatus: http.StatusMethodNotAllowed}, + {name: "PUT fails", method: http.MethodPut, body: nil, expectedStatus: http.StatusMethodNotAllowed}, } 
for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { - // Create request var req *http.Request if tt.body != nil { bodyBytes, err := json.Marshal(tt.body) @@ -111,314 +70,305 @@ func TestHandleReportCapacity(t *testing.T) { } else { req = httptest.NewRequest(tt.method, "/commitments/v1/report-capacity", http.NoBody) } - req = req.WithContext(context.Background()) - - // Create response recorder rr := httptest.NewRecorder() - - // Call handler - api.HandleReportCapacity(rr, req) - - // Check status code + api.HandleReportCapacity(rr, req.WithContext(context.Background())) if rr.Code != tt.expectedStatus { - t.Errorf("Expected status %d, got %d", tt.expectedStatus, rr.Code) - } - - // Check response if applicable - if tt.checkResponse != nil && rr.Code == http.StatusOK { - var resp liquid.ServiceCapacityReport - if err := json.NewDecoder(rr.Body).Decode(&resp); err != nil { - t.Fatalf("Failed to decode response: %v", err) - } - tt.checkResponse(t, &resp) + t.Errorf("status = %d, want %d", rr.Code, tt.expectedStatus) } }) } } +// TestCapacityCalculator covers calculator behavior across different scenarios. func TestCapacityCalculator(t *testing.T) { - // Setup fake client with Knowledge CRD scheme := runtime.NewScheme() if err := v1alpha1.AddToScheme(scheme); err != nil { t.Fatal(err) } - testCapacityConfig := commitments.APIConfig{ - FlavorGroupResourceConfig: map[string]commitments.FlavorGroupResourcesConfig{ - "*": { - RAM: commitments.RAMResourceTypeConfig{HasCapacity: true}, - Cores: commitments.ResourceTypeConfig{HasCapacity: true}, - Instances: commitments.ResourceTypeConfig{HasCapacity: true}, + const flavorMemBytes = 32752 * 1024 * 1024 // test flavor: 32752 MiB + + newCalculator := func(objects ...client.Object) *commitments.CapacityCalculator { + return commitments.NewCapacityCalculator( + fake.NewClientBuilder().WithScheme(scheme).WithObjects(objects...). 
+ WithStatusSubresource(&v1alpha1.FlavorGroupCapacity{}).Build(), + defaultCapacityConfig, + ) + } + + tests := []struct { + name string + checkFn func(t *testing.T) + }{ + { + name: "no knowledge → error", + checkFn: func(t *testing.T) { + _, err := newCalculator().CalculateCapacity(context.Background(), + liquid.ServiceCapacityRequest{AllAZs: []liquid.AvailabilityZone{"az-one"}}) + if err == nil || !strings.Contains(err.Error(), "not found") { + t.Errorf("expected not-found error, got %v", err) + } + }, + }, + { + name: "empty knowledge → 0 resources", + checkFn: func(t *testing.T) { + report, err := newCalculator(createEmptyFlavorGroupKnowledge()).CalculateCapacity( + context.Background(), liquid.ServiceCapacityRequest{AllAZs: []liquid.AvailabilityZone{"az-one"}}) + if err != nil { + t.Fatal(err) + } + if len(report.Resources) != 0 { + t.Errorf("expected 0 resources, got %d", len(report.Resources)) + } + }, + }, + { + name: "knowledge only → perAZ entries match requested AZs", + checkFn: func(t *testing.T) { + azs := []liquid.AvailabilityZone{"qa-de-1a", "qa-de-1b", "qa-de-1d"} + report, err := newCalculator(createTestFlavorGroupKnowledge(t)).CalculateCapacity( + context.Background(), liquid.ServiceCapacityRequest{AllAZs: azs}) + if err != nil { + t.Fatal(err) + } + if len(report.Resources) != 3 { + t.Fatalf("expected 3 resources, got %d", len(report.Resources)) + } + for _, res := range report.Resources { + verifyPerAZMatchesRequest(t, res, azs) + } + }, + }, + { + name: "empty AllAZs → empty perAZ maps", + checkFn: func(t *testing.T) { + report, err := newCalculator(createTestFlavorGroupKnowledge(t)).CalculateCapacity( + context.Background(), liquid.ServiceCapacityRequest{AllAZs: []liquid.AvailabilityZone{}}) + if err != nil { + t.Fatal(err) + } + for name, res := range report.Resources { + if len(res.PerAZ) != 0 { + t.Errorf("%s: expected empty PerAZ, got %d entries", name, len(res.PerAZ)) + } + } + }, + }, + { + name: "different AZ sets each get their own 
entries", + checkFn: func(t *testing.T) { + calc := newCalculator(createTestFlavorGroupKnowledge(t)) + req1 := liquid.ServiceCapacityRequest{AllAZs: []liquid.AvailabilityZone{"eu-de-1a", "eu-de-1b"}} + req2 := liquid.ServiceCapacityRequest{AllAZs: []liquid.AvailabilityZone{"us-west-1a", "us-west-1b", "us-west-1c"}} + for _, req := range []liquid.ServiceCapacityRequest{req1, req2} { + report, err := calc.CalculateCapacity(context.Background(), req) + if err != nil { + t.Fatal(err) + } + for _, res := range report.Resources { + verifyPerAZMatchesRequest(t, res, req.AllAZs) + } + } }, }, } - t.Run("CalculateCapacity returns error when no flavor groups knowledge exists", func(t *testing.T) { - fakeClient := fake.NewClientBuilder(). - WithScheme(scheme). - Build() - - calculator := commitments.NewCapacityCalculator(fakeClient, testCapacityConfig) - req := liquid.ServiceCapacityRequest{ - AllAZs: []liquid.AvailabilityZone{"az-one", "az-two"}, - } - _, err := calculator.CalculateCapacity(context.Background(), req) - if err == nil { - t.Fatal("Expected error when flavor groups knowledge doesn't exist, got nil") - } - if !strings.Contains(err.Error(), "not found") { - t.Errorf("Expected 'not found' error, got: %v", err) - } - }) - - t.Run("CalculateCapacity returns empty report when flavor groups knowledge exists but is empty", func(t *testing.T) { - // Create empty flavor groups knowledge - emptyKnowledge := createEmptyFlavorGroupKnowledge() - - fakeClient := fake.NewClientBuilder(). - WithScheme(scheme). - WithObjects(emptyKnowledge). 
- Build() - - calculator := commitments.NewCapacityCalculator(fakeClient, testCapacityConfig) - req := liquid.ServiceCapacityRequest{ - AllAZs: []liquid.AvailabilityZone{"az-one", "az-two"}, - } - report, err := calculator.CalculateCapacity(context.Background(), req) - if err != nil { - t.Fatalf("Expected no error, got: %v", err) - } - - if report.Resources == nil { - t.Error("Expected Resources map to be initialized") - } - - if len(report.Resources) != 0 { - t.Errorf("Expected 0 resources, got %d", len(report.Resources)) - } - }) - - t.Run("CalculateCapacity returns perAZ entries for all AZs from request", func(t *testing.T) { - flavorGroupKnowledge := createTestFlavorGroupKnowledge(t) - fakeClient := fake.NewClientBuilder(). - WithScheme(scheme). - WithObjects(flavorGroupKnowledge). - Build() - - calculator := commitments.NewCapacityCalculator(fakeClient, testCapacityConfig) - req := liquid.ServiceCapacityRequest{ - AllAZs: []liquid.AvailabilityZone{"qa-de-1a", "qa-de-1b", "qa-de-1d"}, - } - report, err := calculator.CalculateCapacity(context.Background(), req) - if err != nil { - t.Fatalf("Expected no error, got: %v", err) - } - - if len(report.Resources) != 3 { - t.Fatalf("Expected 3 resources (_ram, _cores, _instances), got %d", len(report.Resources)) - } - - // Verify all resources have exactly the requested AZs - verifyPerAZMatchesRequest(t, report.Resources["hw_version_test-group_ram"], req.AllAZs) - verifyPerAZMatchesRequest(t, report.Resources["hw_version_test-group_cores"], req.AllAZs) - verifyPerAZMatchesRequest(t, report.Resources["hw_version_test-group_instances"], req.AllAZs) - }) - - t.Run("CalculateCapacity with empty AllAZs returns empty perAZ maps", func(t *testing.T) { - flavorGroupKnowledge := createTestFlavorGroupKnowledge(t) - fakeClient := fake.NewClientBuilder(). - WithScheme(scheme). - WithObjects(flavorGroupKnowledge). - Build() + // CRD-value cases: all use fixed-ratio knowledge + one CRD for az-one. 
+ type crdValueCase struct { + name string + runningInstances int64 + exclusiveFreeBytes int64 + ready bool + checkAZ liquid.AvailabilityZone + wantCapacity uint64 + wantUsage *uint64 // nil = expect absent + cfg *commitments.APIConfig + wantResourceCount int // 0 = don't check + } + u := func(v uint64) *uint64 { return &v } - calculator := commitments.NewCapacityCalculator(fakeClient, testCapacityConfig) - req := liquid.ServiceCapacityRequest{AllAZs: []liquid.AvailabilityZone{}} - report, err := calculator.CalculateCapacity(context.Background(), req) - if err != nil { - t.Fatalf("Expected no error, got: %v", err) - } + crdCases := []crdValueCase{ + { + // running=200, exclusively_free=800 slots → capacity=1000, usage=200 + name: "ready CRD: capacity = running + exclusively free, usage = running", + runningInstances: 200, exclusiveFreeBytes: 800 * flavorMemBytes, ready: true, + checkAZ: "az-one", wantCapacity: 1000, wantUsage: u(200), + }, + { + // stale CRD: last-known capacity still reported, usage omitted + name: "stale CRD: capacity reported, usage absent", + runningInstances: 200, exclusiveFreeBytes: 800 * flavorMemBytes, ready: false, + checkAZ: "az-one", wantCapacity: 1000, wantUsage: nil, + }, + { + // CRD only covers az-one; az-two has no CRD → capacity=0 + name: "missing CRD for AZ: capacity=0", + runningInstances: 500, exclusiveFreeBytes: 400, ready: true, + checkAZ: "az-two", wantCapacity: 0, wantUsage: nil, + }, + } - if len(report.Resources) != 3 { - t.Fatalf("Expected 3 resources, got %d", len(report.Resources)) - } + for _, tc := range crdCases { + tests = append(tests, struct { + name string + checkFn func(t *testing.T) + }{ + name: tc.name, + checkFn: func(t *testing.T) { + cfg := defaultCapacityConfig + if tc.cfg != nil { + cfg = *tc.cfg + } + crd := createTestFlavorGroupCapacity(tc.runningInstances, tc.exclusiveFreeBytes, tc.ready) + calc := commitments.NewCapacityCalculator( + fake.NewClientBuilder().WithScheme(scheme). 
+ WithObjects(createTestFlavorGroupKnowledge(t), crd). + WithStatusSubresource(crd).Build(), + cfg, + ) + allAZs := []liquid.AvailabilityZone{"az-one"} + if tc.checkAZ != "az-one" { + allAZs = append(allAZs, tc.checkAZ) + } + report, err := calc.CalculateCapacity(context.Background(), + liquid.ServiceCapacityRequest{AllAZs: allAZs}) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if tc.wantResourceCount > 0 && len(report.Resources) != tc.wantResourceCount { + t.Fatalf("expected %d resources, got %d", tc.wantResourceCount, len(report.Resources)) + } + ramRes := report.Resources["hw_version_test-group_ram"] + if ramRes == nil { + t.Fatal("missing hw_version_test-group_ram") + } + az := ramRes.PerAZ[tc.checkAZ] + if az == nil { + t.Fatalf("missing entry for AZ %s", tc.checkAZ) + } + if az.Capacity != tc.wantCapacity { + t.Errorf("capacity = %d, want %d", az.Capacity, tc.wantCapacity) + } + if tc.wantUsage == nil { + if az.Usage.IsSome() { + t.Error("expected usage absent, got value") + } + } else { + if usage := az.Usage.UnwrapOr(99999); usage != *tc.wantUsage { + t.Errorf("usage = %d, want %d", usage, *tc.wantUsage) + } + } + }, + }) + } - for resName, res := range report.Resources { - if len(res.PerAZ) != 0 { - t.Errorf("%s: expected empty PerAZ, got %d entries", resName, len(res.PerAZ)) + // HasCapacity=false case — different config, checks resource count. 
+ tests = append(tests, struct { + name string + checkFn func(t *testing.T) + }{ + name: "HasCapacity=false omits resource from report", + checkFn: func(t *testing.T) { + cfg := commitments.APIConfig{ + FlavorGroupResourceConfig: map[string]commitments.FlavorGroupResourcesConfig{ + "*": { + RAM: commitments.RAMResourceTypeConfig{HasCapacity: true}, + Cores: commitments.ResourceTypeConfig{HasCapacity: true}, + Instances: commitments.ResourceTypeConfig{HasCapacity: false}, + }, + }, } - } - }) - - t.Run("CalculateCapacity responds to different AZ sets correctly", func(t *testing.T) { - flavorGroupKnowledge := createTestFlavorGroupKnowledge(t) - fakeClient := fake.NewClientBuilder(). - WithScheme(scheme). - WithObjects(flavorGroupKnowledge). - Build() - - calculator := commitments.NewCapacityCalculator(fakeClient, testCapacityConfig) - - req1 := liquid.ServiceCapacityRequest{ - AllAZs: []liquid.AvailabilityZone{"eu-de-1a", "eu-de-1b"}, - } - report1, err := calculator.CalculateCapacity(context.Background(), req1) - if err != nil { - t.Fatalf("Expected no error, got: %v", err) - } - - req2 := liquid.ServiceCapacityRequest{ - AllAZs: []liquid.AvailabilityZone{"us-west-1a", "us-west-1b", "us-west-1c", "us-west-1d"}, - } - report2, err := calculator.CalculateCapacity(context.Background(), req2) - if err != nil { - t.Fatalf("Expected no error, got: %v", err) - } - - // Verify reports have exactly the requested AZs - for _, res := range report1.Resources { - verifyPerAZMatchesRequest(t, res, req1.AllAZs) - } - for _, res := range report2.Resources { - verifyPerAZMatchesRequest(t, res, req2.AllAZs) - } - }) - - t.Run("CalculateCapacity reads capacity and usage from Ready CRD", func(t *testing.T) { - knowledge := createTestFlavorGroupKnowledge(t) - crd := createTestFlavorGroupCapacity(1000, 800, true) - fakeClient := fake.NewClientBuilder(). - WithScheme(scheme). - WithObjects(knowledge, crd). - WithStatusSubresource(crd). 
- Build() - - calculator := commitments.NewCapacityCalculator(fakeClient, testCapacityConfig) - req := liquid.ServiceCapacityRequest{AllAZs: []liquid.AvailabilityZone{"az-one"}} - report, err := calculator.CalculateCapacity(context.Background(), req) - if err != nil { - t.Fatalf("unexpected error: %v", err) - } - - ramRes := report.Resources["hw_version_test-group_ram"] - if ramRes == nil { - t.Fatal("expected hw_version_test-group_ram resource") - } - azReport := ramRes.PerAZ["az-one"] - if azReport == nil { - t.Fatal("expected az-one entry") - } - if azReport.Capacity != 1000 { - t.Errorf("expected capacity=1000, got %d", azReport.Capacity) - } - if !azReport.Usage.IsSome() { - t.Fatal("expected usage to be set for Ready CRD") - } - // usage = (total - placeable) slots = (1000 - 800) = 200 slots - if usage := azReport.Usage.UnwrapOr(0); usage != 200 { - t.Errorf("expected usage=200 (200 slots), got %d", usage) - } - }) - - t.Run("CalculateCapacity returns zero capacity for missing CRD", func(t *testing.T) { - knowledge := createTestFlavorGroupKnowledge(t) - // CRD exists only for az-one; az-two has no CRD - crd := createTestFlavorGroupCapacity(500, 400, true) - fakeClient := fake.NewClientBuilder(). - WithScheme(scheme). - WithObjects(knowledge, crd). - WithStatusSubresource(crd). 
- Build() - - calculator := commitments.NewCapacityCalculator(fakeClient, testCapacityConfig) - req := liquid.ServiceCapacityRequest{AllAZs: []liquid.AvailabilityZone{"az-one", "az-two"}} - report, err := calculator.CalculateCapacity(context.Background(), req) - if err != nil { - t.Fatalf("unexpected error: %v", err) - } - - ramRes := report.Resources["hw_version_test-group_ram"] - if ramRes == nil { - t.Fatal("expected hw_version_test-group_ram resource") - } - azTwo := ramRes.PerAZ["az-two"] - if azTwo == nil { - t.Fatal("expected az-two entry even without CRD") - } - if azTwo.Capacity != 0 { - t.Errorf("expected capacity=0 for missing CRD, got %d", azTwo.Capacity) - } + crd := createTestFlavorGroupCapacity(100, 80, true) + calc := commitments.NewCapacityCalculator( + fake.NewClientBuilder().WithScheme(scheme). + WithObjects(createTestFlavorGroupKnowledge(t), crd). + WithStatusSubresource(crd).Build(), + cfg, + ) + report, err := calc.CalculateCapacity(context.Background(), + liquid.ServiceCapacityRequest{AllAZs: []liquid.AvailabilityZone{"az-one"}}) + if err != nil { + t.Fatal(err) + } + if len(report.Resources) != 2 { + t.Errorf("expected 2 resources (ram, cores), got %d", len(report.Resources)) + } + if _, ok := report.Resources["hw_version_test-group_instances"]; ok { + t.Error("hw_version_test-group_instances should be absent") + } + }, }) - t.Run("CalculateCapacity omits usage for stale CRD (Ready=False)", func(t *testing.T) { - knowledge := createTestFlavorGroupKnowledge(t) - crd := createTestFlavorGroupCapacity(1000, 800, false) - fakeClient := fake.NewClientBuilder(). - WithScheme(scheme). - WithObjects(knowledge, crd). - WithStatusSubresource(crd). 
- Build() + for _, tt := range tests { + t.Run(tt.name, tt.checkFn) + } +} - calculator := commitments.NewCapacityCalculator(fakeClient, testCapacityConfig) - req := liquid.ServiceCapacityRequest{AllAZs: []liquid.AvailabilityZone{"az-one"}} - report, err := calculator.CalculateCapacity(context.Background(), req) - if err != nil { - t.Fatalf("unexpected error: %v", err) - } +// TestCapacityCalculator_VariableRatio tests RAM capacity/usage for variable-ratio groups. +// Covers both exact-MiB flavors and the 16 MiB vRAM offset case (hw_video:ram_max_mb=16), +// where the actual MemoryMB is 16 less than the nominal value. Both must be handled correctly. +func TestCapacityCalculator_VariableRatio(t *testing.T) { + scheme := runtime.NewScheme() + if err := v1alpha1.AddToScheme(scheme); err != nil { + t.Fatal(err) + } - ramRes := report.Resources["hw_version_test-group_ram"] - if ramRes == nil { - t.Fatal("expected hw_version_test-group_ram resource") - } - azReport := ramRes.PerAZ["az-one"] - if azReport == nil { - t.Fatal("expected az-one entry") - } - // Stale CRD: last-known capacity is still reported (1000 slots) - if azReport.Capacity != 1000 { - t.Errorf("expected last-known capacity=1000 for stale CRD, got %d", azReport.Capacity) - } - // Stale CRD: usage must be absent (None) - if azReport.Usage.IsSome() { - t.Error("expected usage to be absent (None) for stale CRD") - } - }) + const ramUnitGiB = 2 // 1 declared unit = 2 GiB - t.Run("CalculateCapacity omits resources with HasCapacity=false", func(t *testing.T) { - knowledge := createTestFlavorGroupKnowledge(t) - crd := createTestFlavorGroupCapacity(100, 80, true) - fakeClient := fake.NewClientBuilder(). - WithScheme(scheme). - WithObjects(knowledge, crd). - WithStatusSubresource(crd). - Build() + tests := []struct { + name string + flavorMemMiB int + wantRAMCap uint64 + wantRAMUsage uint64 + }{ + { + // Exact: 3 running + 5 free = 8 × 2 GiB = 8 declared units. 
+ name: "exact 2 GiB flavor (no vRAM offset)", + flavorMemMiB: 2048, + wantRAMCap: 8, + wantRAMUsage: 3, + }, + { + // 3 VMs × 2032 MiB = 6096 MiB. 6096 / 2048 = 2 (not 3) — undercount by 1 unit. + // Capacity: (3+5)×2032 MiB / 2048 MiB = 7 (not 8). Known limitation. + name: "2032 MiB flavor (16 MiB vRAM offset, hw_video:ram_max_mb=16)", + flavorMemMiB: 2032, + wantRAMCap: 7, + wantRAMUsage: 2, + }, + } - // Only RAM and Cores have capacity; Instances does not. - cfgNoInstances := commitments.APIConfig{ - FlavorGroupResourceConfig: map[string]commitments.FlavorGroupResourcesConfig{ - "*": { - RAM: commitments.RAMResourceTypeConfig{HasCapacity: true}, - Cores: commitments.ResourceTypeConfig{HasCapacity: true}, - Instances: commitments.ResourceTypeConfig{HasCapacity: false}, + for _, tc := range tests { + t.Run(tc.name, func(t *testing.T) { + flavorMemBytes := int64(tc.flavorMemMiB) * 1024 * 1024 + knowledge := createVariableRatioFlavorGroupKnowledge(t, tc.flavorMemMiB) + // 3 running VMs, 5 exclusively free slots + crd := createFlavorGroupCapacityWithResources(3, 5*flavorMemBytes, 3*flavorMemBytes, 3*8) + cfg := commitments.APIConfig{ + FlavorGroupResourceConfig: map[string]commitments.FlavorGroupResourcesConfig{ + "*": {RAM: commitments.RAMResourceTypeConfig{HasCapacity: true, RAMUnitGiB: ramUnitGiB}}, }, - }, - } - calculator := commitments.NewCapacityCalculator(fakeClient, cfgNoInstances) - req := liquid.ServiceCapacityRequest{AllAZs: []liquid.AvailabilityZone{"az-one"}} - report, err := calculator.CalculateCapacity(context.Background(), req) - if err != nil { - t.Fatalf("unexpected error: %v", err) - } - if len(report.Resources) != 2 { - t.Fatalf("expected 2 resources (ram, cores), got %d: %v", len(report.Resources), report.Resources) - } - if _, ok := report.Resources["hw_version_test-group_instances"]; ok { - t.Error("expected hw_version_test-group_instances to be absent (HasCapacity=false)") - } - }) + } + fakeClient := fake.NewClientBuilder(). 
+ WithScheme(scheme).WithObjects(knowledge, crd).WithStatusSubresource(crd).Build() + + report, err := commitments.NewCapacityCalculator(fakeClient, cfg).CalculateCapacity( + context.Background(), liquid.ServiceCapacityRequest{AllAZs: []liquid.AvailabilityZone{"az-one"}}) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + az := report.Resources["hw_version_test-group_ram"].PerAZ["az-one"] + if az.Capacity != tc.wantRAMCap { + t.Errorf("RAM capacity = %d, want %d", az.Capacity, tc.wantRAMCap) + } + if usage := az.Usage.UnwrapOr(99); usage != tc.wantRAMUsage { + t.Errorf("RAM usage = %d, want %d", usage, tc.wantRAMUsage) + } + }) + } } -// This follows the same semantics as nova liquid: the response must contain -// entries for all AZs in AllAZs, no more and no less. func verifyPerAZMatchesRequest(t *testing.T, res *liquid.ResourceCapacityReport, requestedAZs []liquid.AvailabilityZone) { t.Helper() if res == nil { @@ -430,142 +380,136 @@ func verifyPerAZMatchesRequest(t *testing.T, res *liquid.ResourceCapacityReport, } for _, az := range requestedAZs { if _, ok := res.PerAZ[az]; !ok { - t.Errorf("missing entry for requested AZ %s", az) + t.Errorf("missing entry for AZ %s", az) } } for az := range res.PerAZ { if !slices.Contains(requestedAZs, az) { - t.Errorf("unexpected AZ %s in response (not in request)", az) + t.Errorf("unexpected AZ %s in response", az) } } } -// createEmptyFlavorGroupKnowledge creates an empty flavor groups Knowledge CRD func createEmptyFlavorGroupKnowledge() *v1alpha1.Knowledge { - // Box empty array properly - emptyFeatures := []map[string]interface{}{} - raw, err := v1alpha1.BoxFeatureList(emptyFeatures) + raw, err := v1alpha1.BoxFeatureList([]map[string]interface{}{}) if err != nil { - panic(err) // Should never happen for empty slice + panic(err) } - return &v1alpha1.Knowledge{ - ObjectMeta: v1.ObjectMeta{ - Name: "flavor-groups", - // No namespace - Knowledge is cluster-scoped - }, + ObjectMeta: v1.ObjectMeta{Name: 
"flavor-groups"}, Spec: v1alpha1.KnowledgeSpec{ SchedulingDomain: v1alpha1.SchedulingDomainNova, - Extractor: v1alpha1.KnowledgeExtractorSpec{ - Name: "flavor_groups", - }, + Extractor: v1alpha1.KnowledgeExtractorSpec{Name: "flavor_groups"}, }, Status: v1alpha1.KnowledgeStatus{ - Conditions: []v1.Condition{ - { - Type: v1alpha1.KnowledgeConditionReady, - Status: "True", - }, - }, - Raw: raw, + Conditions: []v1.Condition{{Type: v1alpha1.KnowledgeConditionReady, Status: "True"}}, + Raw: raw, }, } } -// createTestFlavorGroupCapacity creates a FlavorGroupCapacity CRD for testing. -// totalSlots and placeableSlots are for the named smallest flavor entry. -// ready controls whether the Ready condition is True or False. -func createTestFlavorGroupCapacity(totalSlots, placeableSlots int64, ready bool) *v1alpha1.FlavorGroupCapacity { - const group = "test-group" - const az = "az-one" - const smallestFlavorName = "test_c8_m32" +// createTestFlavorGroupCapacity creates a FlavorGroupCapacity CRD for a fixed-ratio group. 
+func createTestFlavorGroupCapacity(runningInstances, exclusiveFreeMemBytes int64, ready bool) *v1alpha1.FlavorGroupCapacity { conditionStatus := v1.ConditionTrue if !ready { conditionStatus = v1.ConditionFalse } + status := v1alpha1.FlavorGroupCapacityStatus{ + Flavors: []v1alpha1.FlavorCapacityStatus{{FlavorName: "test_c8_m32"}}, + RunningInstances: runningInstances, + Conditions: []v1.Condition{{Type: v1alpha1.FlavorGroupCapacityConditionReady, Status: conditionStatus}}, + } + if exclusiveFreeMemBytes > 0 { + const flavorMemBytes = 32752 * 1024 * 1024 // test flavor memory size + status.ExclusivelyFreeCapacity = map[string]resource.Quantity{ + string(v1alpha1.CommittedResourceTypeMemory): *resource.NewQuantity(exclusiveFreeMemBytes, resource.BinarySI), + } + status.ExclusivelyFreeSlots = exclusiveFreeMemBytes / flavorMemBytes + } return &v1alpha1.FlavorGroupCapacity{ - ObjectMeta: v1.ObjectMeta{ - Name: group + "-" + az, - }, - Spec: v1alpha1.FlavorGroupCapacitySpec{ - FlavorGroup: group, - AvailabilityZone: az, - }, - Status: v1alpha1.FlavorGroupCapacityStatus{ - Flavors: []v1alpha1.FlavorCapacityStatus{ - { - FlavorName: smallestFlavorName, - TotalCapacityVMSlots: totalSlots, - PlaceableVMs: placeableSlots, - }, - }, - Conditions: []v1.Condition{ - { - Type: v1alpha1.FlavorGroupCapacityConditionReady, - Status: conditionStatus, - }, - }, - }, + ObjectMeta: v1.ObjectMeta{Name: "test-group-az-one"}, + Spec: v1alpha1.FlavorGroupCapacitySpec{FlavorGroup: "test-group", AvailabilityZone: "az-one"}, + Status: status, } } -// that accepts commitments (has fixed RAM/core ratio) +// createTestFlavorGroupKnowledge creates a fixed-ratio (HANA-style) flavor group Knowledge CRD. 
func createTestFlavorGroupKnowledge(t *testing.T) *v1alpha1.Knowledge { t.Helper() - features := []map[string]interface{}{ { "name": "test-group", "flavors": []map[string]interface{}{ - { - "name": "test_c8_m32", - "vcpus": 8, - "memoryMB": 32752, - "diskGB": 50, - }, - }, - "largestFlavor": map[string]interface{}{ - "name": "test_c8_m32", - "vcpus": 8, - "memoryMB": 32752, - "diskGB": 50, - }, - "smallestFlavor": map[string]interface{}{ - "name": "test_c8_m32", - "vcpus": 8, - "memoryMB": 32752, - "diskGB": 50, + {"name": "test_c8_m32", "vcpus": 8, "memoryMB": 32752, "diskGB": 50}, }, - // Fixed RAM/core ratio (4096 MiB per vCPU) - required for group to accept commitments - "ramCoreRatio": 4096, + "largestFlavor": map[string]interface{}{"name": "test_c8_m32", "vcpus": 8, "memoryMB": 32752, "diskGB": 50}, + "smallestFlavor": map[string]interface{}{"name": "test_c8_m32", "vcpus": 8, "memoryMB": 32752, "diskGB": 50}, + "ramCoreRatio": 4096, // fixed RAM/core ratio → slots-based reporting }, } - - // Use BoxFeatureList to properly format the features raw, err := v1alpha1.BoxFeatureList(features) if err != nil { t.Fatal(err) } - return &v1alpha1.Knowledge{ - ObjectMeta: v1.ObjectMeta{ - Name: "flavor-groups", - // No namespace - Knowledge is cluster-scoped + ObjectMeta: v1.ObjectMeta{Name: "flavor-groups"}, + Spec: v1alpha1.KnowledgeSpec{ + SchedulingDomain: v1alpha1.SchedulingDomainNova, + Extractor: v1alpha1.KnowledgeExtractorSpec{Name: "flavor_groups"}, + }, + Status: v1alpha1.KnowledgeStatus{ + Conditions: []v1.Condition{{Type: v1alpha1.KnowledgeConditionReady, Status: "True"}}, + Raw: raw, + }, + } +} + +// createVariableRatioFlavorGroupKnowledge creates a Knowledge CRD for a variable-ratio group +// (no ramCoreRatio, so RAM is reported in GiB units, not slots). 
+func createVariableRatioFlavorGroupKnowledge(t *testing.T, flavorMemMiB int) *v1alpha1.Knowledge { + t.Helper() + features := []map[string]interface{}{ + { + "name": "test-group", + "flavors": []map[string]interface{}{{"name": "test-flavor", "vcpus": 8, "memoryMB": flavorMemMiB, "diskGB": 50}}, + "largestFlavor": map[string]interface{}{"name": "test-flavor", "vcpus": 8, "memoryMB": flavorMemMiB, "diskGB": 50}, + "smallestFlavor": map[string]interface{}{"name": "test-flavor", "vcpus": 8, "memoryMB": flavorMemMiB, "diskGB": 50}, + // no ramCoreRatio → variable-ratio group }, + } + raw, err := v1alpha1.BoxFeatureList(features) + if err != nil { + t.Fatal(err) + } + return &v1alpha1.Knowledge{ + ObjectMeta: v1.ObjectMeta{Name: "flavor-groups"}, Spec: v1alpha1.KnowledgeSpec{ SchedulingDomain: v1alpha1.SchedulingDomainNova, - Extractor: v1alpha1.KnowledgeExtractorSpec{ - Name: "flavor_groups", - }, + Extractor: v1alpha1.KnowledgeExtractorSpec{Name: "flavor_groups"}, }, Status: v1alpha1.KnowledgeStatus{ - Conditions: []v1.Condition{ - { - Type: v1alpha1.KnowledgeConditionReady, - Status: "True", - }, + Conditions: []v1.Condition{{Type: v1alpha1.KnowledgeConditionReady, Status: "True", Reason: "ExtractorSucceeded"}}, + Raw: raw, + }, + } +} + +// createFlavorGroupCapacityWithResources creates a ready FlavorGroupCapacity CRD with +// RunningInstances, ExclusivelyFreeCapacity, and RunningResources all populated. 
+func createFlavorGroupCapacityWithResources(runningInstances, exclusiveFreeMemBytes, runningMemBytes, runningCores int64) *v1alpha1.FlavorGroupCapacity { + return &v1alpha1.FlavorGroupCapacity{ + ObjectMeta: v1.ObjectMeta{Name: "test-group-az-one"}, + Spec: v1alpha1.FlavorGroupCapacitySpec{FlavorGroup: "test-group", AvailabilityZone: "az-one"}, + Status: v1alpha1.FlavorGroupCapacityStatus{ + RunningInstances: runningInstances, + RunningResources: map[string]resource.Quantity{ + string(v1alpha1.CommittedResourceTypeMemory): *resource.NewQuantity(runningMemBytes, resource.BinarySI), + string(v1alpha1.CommittedResourceTypeCores): *resource.NewQuantity(runningCores, resource.DecimalSI), + }, + ExclusivelyFreeCapacity: map[string]resource.Quantity{ + string(v1alpha1.CommittedResourceTypeMemory): *resource.NewQuantity(exclusiveFreeMemBytes, resource.BinarySI), }, - Raw: raw, + Conditions: []v1.Condition{{Type: v1alpha1.FlavorGroupCapacityConditionReady, Status: v1.ConditionTrue}}, }, } } diff --git a/internal/scheduling/reservations/commitments/capacity.go b/internal/scheduling/reservations/commitments/capacity.go index 2f366404c..ee3fc1a99 100644 --- a/internal/scheduling/reservations/commitments/capacity.go +++ b/internal/scheduling/reservations/commitments/capacity.go @@ -28,25 +28,21 @@ func NewCapacityCalculator(client client.Client, conf APIConfig) *CapacityCalcul // CalculateCapacity computes per-AZ capacity for all flavor groups. // For each flavor group, three resources are reported: _ram, _cores, _instances. -// Capacity and usage are read from FlavorGroupCapacity CRDs pre-computed by the capacity controller. -// Usage is approximated from slot counts (total − placeable of the smallest flavor); this may -// slightly under-report usage when larger flavors are running, showing more free capacity than -// reality — acceptable for capacity planning purposes. 
+// All values are read from FlavorGroupCapacity CRDs pre-computed by the capacity controller: +// - Capacity: RunningInstances + ExclusivelyFreeCapacity converted to slots. +// - Usage: RunningInstances / RunningResources. func (c *CapacityCalculator) CalculateCapacity(ctx context.Context, req liquid.ServiceCapacityRequest) (liquid.ServiceCapacityReport, error) { - // Get all flavor groups from Knowledge CRDs (needed for smallest-flavor lookup). knowledge := &reservations.FlavorGroupKnowledgeClient{Client: c.client} flavorGroups, err := knowledge.GetAllFlavorGroups(ctx, nil) if err != nil { return liquid.ServiceCapacityReport{}, fmt.Errorf("failed to get flavor groups: %w", err) } - // Get version from Knowledge CRD (same as info API version). var infoVersion int64 = -1 if knowledgeCRD, err := knowledge.Get(ctx); err == nil && knowledgeCRD != nil && !knowledgeCRD.Status.LastContentChange.IsZero() { infoVersion = knowledgeCRD.Status.LastContentChange.Unix() } - // List all FlavorGroupCapacity CRDs and index by (flavorGroup, az). var capacityList v1alpha1.FlavorGroupCapacityList if err := c.client.List(ctx, &capacityList); err != nil { return liquid.ServiceCapacityReport{}, fmt.Errorf("failed to list FlavorGroupCapacity CRDs: %w", err) @@ -58,7 +54,6 @@ func (c *CapacityCalculator) CalculateCapacity(ctx context.Context, req liquid.S crdByKey[groupAZKey{crd.Spec.FlavorGroup, crd.Spec.AvailabilityZone}] = crd } - // Build capacity report for all flavor groups. report := liquid.ServiceCapacityReport{ InfoVersion: infoVersion, Resources: make(map[liquid.ResourceName]*liquid.ResourceCapacityReport), @@ -72,11 +67,7 @@ func (c *CapacityCalculator) CalculateCapacity(ctx context.Context, req liquid.S continue } - smallestFlavorName := groupData.SmallestFlavor.Name - // Add 16 MiB before dividing: flavors may reserve 16 MiB for video RAM (hw_video:ram_max_mb=16), - // so a nominal "2 GiB" flavor may report 2032 MiB. Adding 16 restores the intended GiB boundary. 
- memoryGiBPerSlot := (groupData.SmallestFlavor.MemoryMB + 16) / 1024 - vcpusPerSlot := groupData.SmallestFlavor.VCPUs + ramUnitBytes := int64(resCfg.RAM.RAMUnitMiB()) * 1024 * 1024 //nolint:gosec ramAZCapacity := make(map[liquid.AvailabilityZone]*liquid.AZResourceCapacityReport, len(req.AllAZs)) coresAZCapacity := make(map[liquid.AvailabilityZone]*liquid.AZResourceCapacityReport, len(req.AllAZs)) @@ -85,7 +76,6 @@ func (c *CapacityCalculator) CalculateCapacity(ctx context.Context, req liquid.S for _, az := range req.AllAZs { crd, ok := crdByKey[groupAZKey{groupName, string(az)}] if !ok { - // No CRD for this (group, AZ) pair — report zero. zero := &liquid.AZResourceCapacityReport{Capacity: 0} ramAZCapacity[az] = zero coresAZCapacity[az] = &liquid.AZResourceCapacityReport{Capacity: 0} @@ -93,55 +83,67 @@ func (c *CapacityCalculator) CalculateCapacity(ctx context.Context, req liquid.S continue } - // If the CRD data is stale, report last-known capacity but omit usage. if !apimeta.IsStatusConditionTrue(crd.Status.Conditions, v1alpha1.FlavorGroupCapacityConditionReady) { logger.Info("FlavorGroupCapacity CRD is stale, reporting capacity without usage", "flavorGroup", groupName, "az", az) } - // Find the smallest-flavor entry in the CRD status. - var smallest *v1alpha1.FlavorCapacityStatus - for i := range crd.Status.Flavors { - if crd.Status.Flavors[i].FlavorName == smallestFlavorName { - smallest = &crd.Status.Flavors[i] - break - } - } - if smallest == nil { - zero := &liquid.AZResourceCapacityReport{Capacity: 0} - ramAZCapacity[az] = zero - coresAZCapacity[az] = &liquid.AZResourceCapacityReport{Capacity: 0} - instancesAZCapacity[az] = &liquid.AZResourceCapacityReport{Capacity: 0} - continue - } + // ExclusivelyFreeSlots is pre-computed by the controller using min(memSlots, cpuSlots). + exclusiveFreeSlots := uint64(crd.Status.ExclusivelyFreeSlots) //nolint:gosec + + // Capacity = running + exclusively free, all derived from CRD bytes. 
+ runningInstances := uint64(crd.Status.RunningInstances) //nolint:gosec + instancesCapacity := runningInstances + exclusiveFreeSlots - totalSlots := uint64(smallest.TotalCapacityVMSlots) //nolint:gosec // slot count from CRD, realistically bounded + // RAM capacity: running bytes + exclusively free bytes → declared units. + // Fixed-ratio groups report in slots (1 unit = 1 instance). var ramCapacity uint64 if groupData.HasFixedRamCoreRatio() { - ramCapacity = totalSlots - } else { - ramCapacity = totalSlots * memoryGiBPerSlot + ramCapacity = instancesCapacity + } else if ramUnitBytes > 0 { + runningMemBytes := int64(0) + if qty, ok := crd.Status.RunningResources[string(v1alpha1.CommittedResourceTypeMemory)]; ok { + runningMemBytes = qty.Value() + } + freeMemBytes := int64(0) + if qty, ok := crd.Status.ExclusivelyFreeCapacity[string(v1alpha1.CommittedResourceTypeMemory)]; ok { + freeMemBytes = qty.Value() + } + ramCapacity = uint64(runningMemBytes+freeMemBytes) / uint64(ramUnitBytes) + } + + // Cores capacity: running cores + exclusively free cores. + var coresCapacity uint64 + runningCoresCount := int64(0) + if qty, ok := crd.Status.RunningResources[string(v1alpha1.CommittedResourceTypeCores)]; ok { + runningCoresCount = qty.Value() + } + freeCoresCount := int64(0) + if qty, ok := crd.Status.ExclusivelyFreeCapacity[string(v1alpha1.CommittedResourceTypeCores)]; ok { + freeCoresCount = qty.Value() } + coresCapacity = uint64(runningCoresCount + freeCoresCount) + ramEntry := &liquid.AZResourceCapacityReport{Capacity: ramCapacity} - coresEntry := &liquid.AZResourceCapacityReport{Capacity: totalSlots * vcpusPerSlot} - instancesEntry := &liquid.AZResourceCapacityReport{Capacity: totalSlots} + coresEntry := &liquid.AZResourceCapacityReport{Capacity: coresCapacity} + instancesEntry := &liquid.AZResourceCapacityReport{Capacity: instancesCapacity} - // Usage is approximated from slot counts. 
This may slightly under-report usage when - // larger flavors are running (safe direction: shows more free capacity than reality). + // Usage from actual running VMs — only when CRD data is fresh. if apimeta.IsStatusConditionTrue(crd.Status.Conditions, v1alpha1.FlavorGroupCapacityConditionReady) { - placeableSlots := uint64(smallest.PlaceableVMs) //nolint:gosec // slot count from CRD, realistically bounded - var usedSlots uint64 - if totalSlots > placeableSlots { - usedSlots = totalSlots - placeableSlots - } + instancesEntry.Usage = Some[uint64](runningInstances) + coresEntry.Usage = Some[uint64](uint64(runningCoresCount)) + if groupData.HasFixedRamCoreRatio() { - ramEntry.Usage = Some[uint64](usedSlots) - } else { - ramEntry.Usage = Some[uint64](usedSlots * memoryGiBPerSlot) + ramEntry.Usage = Some[uint64](runningInstances) + } else if ramUnitBytes > 0 { + runningMemBytes := int64(0) + if qty, ok := crd.Status.RunningResources[string(v1alpha1.CommittedResourceTypeMemory)]; ok { + runningMemBytes = qty.Value() + } + ramEntry.Usage = Some[uint64](uint64(runningMemBytes) / uint64(ramUnitBytes)) } - coresEntry.Usage = Some[uint64](usedSlots * vcpusPerSlot) - instancesEntry.Usage = Some[uint64](usedSlots) } + ramAZCapacity[az] = ramEntry coresAZCapacity[az] = coresEntry instancesAZCapacity[az] = instancesEntry