roachtest: ensure workload node has same zone as cluster if unspecified
Previously we randomized the default zones for GCE when unspecified
to avoid zone exhaustion errors. However, this ran into an issue with
workload nodes, which are created separately from the main cluster, as
a different default zone could be returned for each.

This change exposes the zone randomization logic up to roachtest so it
can assign the same default zone to both the CRDB cluster and the workload node.

Fixes: cockroachdb#129997
Release note: none
Epic: none
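
Below is a minimal, self-contained Go sketch of the problem and the fix described above: two independent calls to a randomized default-zone helper can disagree, while resolving the zone once and sharing it keeps the CRDB cluster and the workload node in the same zone. The helper pickDefaultZone and its zone list are illustrative stand-ins, not the actual roachprod/roachtest code.

package main

import (
	"fmt"
	"math/rand"
)

// defaultZones and pickDefaultZone stand in for roachprod's randomized
// GCE default-zone selection; both are illustrative only.
var defaultZones = []string{"us-east1-b", "us-east1-c", "us-east1-d"}

func pickDefaultZone() string {
	return defaultZones[rand.Intn(len(defaultZones))]
}

func main() {
	// Before: the CRDB cluster and the workload node each resolve their own
	// default zone, so the two picks may differ.
	crdbZone := pickDefaultZone()
	workloadZone := pickDefaultZone()
	fmt.Println("independent picks:", crdbZone, workloadZone) // may differ

	// After: resolve the default zone once and apply it to both sets of
	// provider options, so the picks always match.
	shared := pickDefaultZone()
	crdbZone, workloadZone = shared, shared
	fmt.Println("shared pick:", crdbZone, workloadZone) // always equal
}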
DarrylWong committed Sep 5, 2024
1 parent 31210e3 commit ef25fb7
Showing 9 changed files with 131 additions and 52 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -21,6 +21,7 @@ artifacts
/certs
# make stress, acceptance produce stress.test, acceptance.test
*.test*
pkg/cmd/roachtest/_runner-logs
# fuzz tests
work-Fuzz*
*-fuzz.zip
Binary file added pkg/cmd/roachtest/.DS_Store
Binary file not shown.
1 change: 1 addition & 0 deletions pkg/cmd/roachtest/BUILD.bazel
@@ -110,6 +110,7 @@ go_test(
"//pkg/roachprod/logger",
"//pkg/roachprod/vm",
"//pkg/roachprod/vm/azure",
"//pkg/roachprod/vm/gce",
"//pkg/testutils",
"//pkg/testutils/datapathutils",
"//pkg/testutils/echotest",
13 changes: 9 additions & 4 deletions pkg/cmd/roachtest/cluster.go
@@ -904,10 +904,6 @@ func (f *clusterFactory) newCluster(
if err != nil {
return nil, nil, err
}
if clusterCloud != spec.Local {
providerOptsContainer.SetProviderOpts(clusterCloud, providerOpts)
workloadProviderOptsContainer.SetProviderOpts(clusterCloud, workloadProviderOpts)
}

createFlagsOverride(&createVMOpts)
// Make sure expiration is changed if --lifetime override flag
@@ -930,6 +926,15 @@
//
// https://github.com/cockroachdb/cockroach/issues/67906#issuecomment-887477675
genName := f.genName(cfg)

// Set the zones used for the cluster. We call this in the loop as the default GCE zone
// is randomized to avoid zone exhaustion errors.
providerOpts, workloadProviderOpts = cfg.spec.SetRoachprodOptsZones(providerOpts, workloadProviderOpts, params, string(selectedArch))
if clusterCloud != spec.Local {
providerOptsContainer.SetProviderOpts(clusterCloud, providerOpts)
workloadProviderOptsContainer.SetProviderOpts(clusterCloud, workloadProviderOpts)
}

// Logs for creating a new cluster go to a dedicated log file.
var retryStr string
if i > 1 {
100 changes: 61 additions & 39 deletions pkg/cmd/roachtest/spec/cluster_spec.go
@@ -186,9 +186,7 @@ func awsMachineSupportsSSD(machineType string) bool {
return false
}

func getAWSOpts(
machineType string, zones []string, volumeSize, ebsThroughput int, localSSD bool,
) vm.ProviderOpts {
func getAWSOpts(machineType string, volumeSize, ebsThroughput int, localSSD bool) vm.ProviderOpts {
opts := aws.DefaultProviderOpts()
if volumeSize != 0 {
opts.DefaultEBSVolume.Disk.VolumeSize = volumeSize
@@ -204,15 +202,11 @@
} else {
opts.MachineType = machineType
}
if len(zones) != 0 {
opts.CreateZones = zones
}
return opts
}

func getGCEOpts(
machineType string,
zones []string,
volumeSize, localSSDCount int,
localSSD bool,
RAID0 bool,
@@ -233,9 +227,6 @@
if volumeSize != 0 {
opts.PDVolumeSize = volumeSize
}
if len(zones) != 0 {
opts.Zones = zones
}
opts.SSDCount = localSSDCount
if localSSD && localSSDCount > 0 {
// NB: As the default behavior for _roachprod_ (at least in AWS/GCP) is
@@ -253,12 +244,9 @@
return opts
}

func getAzureOpts(machineType string, zones []string, volumeSize int) vm.ProviderOpts {
func getAzureOpts(machineType string, volumeSize int) vm.ProviderOpts {
opts := azure.DefaultProviderOpts()
opts.MachineType = machineType
if len(zones) != 0 {
opts.Locations = zones
}
if volumeSize != 0 {
opts.NetworkDiskSize = int32(volumeSize)
}
@@ -418,25 +406,6 @@ func (s *ClusterSpec) RoachprodOpts(
}
}

zonesStr := params.Defaults.Zones
switch cloud {
case AWS:
if s.AWS.Zones != "" {
zonesStr = s.AWS.Zones
}
case GCE:
if s.GCE.Zones != "" {
zonesStr = s.GCE.Zones
}
}
var zones []string
if zonesStr != "" {
zones = strings.Split(zonesStr, ",")
if !s.Geo {
zones = zones[:1]
}
}

var workloadMachineType string
var err error
switch cloud {
@@ -460,27 +429,80 @@
var workloadProviderOpts vm.ProviderOpts
switch cloud {
case AWS:
providerOpts = getAWSOpts(machineType, zones, s.VolumeSize, s.AWS.VolumeThroughput,
providerOpts = getAWSOpts(machineType, s.VolumeSize, s.AWS.VolumeThroughput,
createVMOpts.SSDOpts.UseLocalSSD)
workloadProviderOpts = getAWSOpts(workloadMachineType, zones, s.VolumeSize, s.AWS.VolumeThroughput,
workloadProviderOpts = getAWSOpts(workloadMachineType, s.VolumeSize, s.AWS.VolumeThroughput,
createVMOpts.SSDOpts.UseLocalSSD)
case GCE:
providerOpts = getGCEOpts(machineType, zones, s.VolumeSize, ssdCount,
providerOpts = getGCEOpts(machineType, s.VolumeSize, ssdCount,
createVMOpts.SSDOpts.UseLocalSSD, s.RAID0, s.TerminateOnMigration,
s.GCE.MinCPUPlatform, vm.ParseArch(createVMOpts.Arch), s.GCE.VolumeType, s.UseSpotVMs,
)
workloadProviderOpts = getGCEOpts(workloadMachineType, zones, s.VolumeSize, ssdCount,
workloadProviderOpts = getGCEOpts(workloadMachineType, s.VolumeSize, ssdCount,
createVMOpts.SSDOpts.UseLocalSSD, s.RAID0, s.TerminateOnMigration,
s.GCE.MinCPUPlatform, vm.ParseArch(createVMOpts.Arch), s.GCE.VolumeType, s.UseSpotVMs,
)
case Azure:
providerOpts = getAzureOpts(machineType, zones, s.VolumeSize)
workloadProviderOpts = getAzureOpts(workloadMachineType, zones, s.VolumeSize)
providerOpts = getAzureOpts(machineType, s.VolumeSize)
workloadProviderOpts = getAzureOpts(workloadMachineType, s.VolumeSize)
}

return createVMOpts, providerOpts, workloadProviderOpts, selectedArch, nil
}

// SetRoachprodOptsZones updates the providerOpts with the VM zones as specified in the params/spec.
// We separate this logic from RoachprodOpts as we may need to call this multiple times in order to
// randomize the default GCE zone.
func (s *ClusterSpec) SetRoachprodOptsZones(
providerOpts, workloadProviderOpts vm.ProviderOpts, params RoachprodClusterConfig, arch string,
) (vm.ProviderOpts, vm.ProviderOpts) {
zonesStr := params.Defaults.Zones
cloud := params.Cloud
switch cloud {
case AWS:
if s.AWS.Zones != "" {
zonesStr = s.AWS.Zones
}
case GCE:
if s.GCE.Zones != "" {
zonesStr = s.GCE.Zones
}
}
var zones []string
if zonesStr != "" {
zones = strings.Split(zonesStr, ",")
if !s.Geo {
zones = zones[:1]
}
}

switch cloud {
case AWS:
if len(zones) == 0 {
if !s.Geo {
zones = aws.DefaultZones[:1]
} else {
zones = aws.DefaultZones
}
}
providerOpts.(*aws.ProviderOpts).CreateZones = zones
workloadProviderOpts.(*aws.ProviderOpts).CreateZones = zones
case GCE:
// We randomize the list of default zones for GCE for quota reasons, so decide the zone
// early to ensure that the workload node and CRDB cluster have the same default zone.
if len(zones) == 0 {
if !s.Geo {
zones = gce.DefaultZones(arch)[:1]
} else {
zones = gce.DefaultZones(arch)
}
}
providerOpts.(*gce.ProviderOpts).Zones = zones
workloadProviderOpts.(*gce.ProviderOpts).Zones = zones
}
return providerOpts, workloadProviderOpts
}

// Expiration is the lifetime of the cluster. It may be destroyed after
// the expiration has passed.
func (s *ClusterSpec) Expiration() time.Time {
50 changes: 50 additions & 0 deletions pkg/cmd/roachtest/test_test.go
@@ -32,6 +32,7 @@ import (
"github.com/cockroachdb/cockroach/pkg/roachprod/cloud"
"github.com/cockroachdb/cockroach/pkg/roachprod/logger"
"github.com/cockroachdb/cockroach/pkg/roachprod/vm"
"github.com/cockroachdb/cockroach/pkg/roachprod/vm/gce"
"github.com/cockroachdb/cockroach/pkg/testutils"
"github.com/cockroachdb/cockroach/pkg/util/stop"
"github.com/cockroachdb/cockroach/pkg/util/syncutil"
@@ -481,3 +482,52 @@ func TestNewCluster(t *testing.T) {
})
}
}

// Regression test for: https://github.com/cockroachdb/cockroach/issues/129997
// Tests that workload nodes are assigned the same default zone as the main CRDB cluster.
func TestGCESameDefaultZone(t *testing.T) {
ctx := context.Background()
factory := &clusterFactory{sem: make(chan struct{}, 1)}
cfg := clusterConfig{spec: spec.MakeClusterSpec(2, spec.WorkloadNode())}
setStatus := func(string) {}

defer func() {
create = roachprod.Create
}()

create = func(ctx context.Context, l *logger.Logger, username string, opts ...*cloud.ClusterCreateOpts) (retErr error) {
// Since we specified no zone for this cluster, roachtest should assign a default one for us.
// Check that it assigns the same default zone to both the CRDB cluster and the workload node.
require.Equal(t, len(opts), 2)
crdbZones := opts[0].ProviderOptsContainer[gce.ProviderName].(*gce.ProviderOpts).Zones
workloadZones := opts[1].ProviderOptsContainer[gce.ProviderName].(*gce.ProviderOpts).Zones
require.Equal(t, crdbZones, workloadZones)
// A bit of a workaround, we don't have a mock for registerCluster at this time which will panic if hit.
// Instead, just return an error to return early since we already tested the code paths we care about.
return &roachprod.ClusterAlreadyExistsError{}
}

testCases := []struct {
name string
geo bool
createMock func(ctx context.Context, l *logger.Logger, username string, opts ...*cloud.ClusterCreateOpts) (retErr error)
}{
{
name: "Separate GCE create calls for same cluster default to same zone",
geo: false,
},
{
name: "Separate GCE create calls for same geo cluster default to same zones",
geo: true,
},
}

for _, c := range testCases {
cfg.spec.Geo = c.geo
t.Run(c.name, func(t *testing.T) {
for i := 0; i < 100; i++ {
_, _, _ = factory.newCluster(ctx, cfg, setStatus, true)
}
})
}
}
Binary file added pkg/cmd/roachtest/testdata/.DS_Store
Binary file not shown.
8 changes: 4 additions & 4 deletions pkg/roachprod/vm/aws/aws.go
@@ -349,7 +349,7 @@ var defaultConfig = func() (cfg *awsConfig) {
return cfg
}()

// defaultZones is the list of availability zones used by default for
// DefaultZones is the list of availability zones used by default for
// cluster creation. If the geo flag is specified, nodes are
// distributed between zones.
//
@@ -358,7 +358,7 @@ var defaultConfig = func() (cfg *awsConfig) {
// doesn't support multi-regional buckets, thus resulting in material
// egress cost if the test loads from a different region. See
// https://github.com/cockroachdb/cockroach/issues/105968.
var defaultZones = []string{
var DefaultZones = []string{
"us-east-2a",
"us-west-2b",
"eu-west-2b",
@@ -425,7 +425,7 @@ func (o *ProviderOpts) ConfigureCreateFlags(flags *pflag.FlagSet) {
fmt.Sprintf("aws availability zones to use for cluster creation. If zones are formatted\n"+
"as AZ:N where N is an integer, the zone will be repeated N times. If > 1\n"+
"zone specified, the cluster will be spread out evenly by zone regardless\n"+
"of geo (default [%s])", strings.Join(defaultZones, ",")))
"of geo (default [%s])", strings.Join(DefaultZones, ",")))
flags.StringVar(&o.ImageAMI, ProviderName+"-image-ami",
o.ImageAMI, "Override image AMI to use. See https://awscli.amazonaws.com/v2/documentation/api/latest/reference/ec2/describe-images.html")
flags.BoolVar(&o.UseMultipleDisks, ProviderName+"-enable-multiple-stores",
@@ -594,7 +594,7 @@ func (p *Provider) Create(
}

if len(expandedZones) == 0 {
expandedZones = defaultZones
expandedZones = DefaultZones
}

// We need to make sure that the SSH keys have been distributed to all regions.
10 changes: 5 additions & 5 deletions pkg/roachprod/vm/gce/gcloud.go
@@ -910,7 +910,7 @@ type ProjectsVal struct {
AcceptMultipleProjects bool
}

// defaultZones is the list of zones used by default for cluster creation.
// DefaultZones is the list of zones used by default for cluster creation.
// If the geo flag is specified, nodes are distributed between zones.
// These are GCP zones available according to this page:
// https://cloud.google.com/compute/docs/regions-zones#available
@@ -920,7 +920,7 @@ type ProjectsVal struct {
// ARM64 builds), but we randomize the specific zone. This is to avoid
// "zone exhausted" errors in one particular zone, especially during
// nightly roachtest runs.
func defaultZones(arch string) []string {
func DefaultZones(arch string) []string {
zones := []string{"us-east1-b", "us-east1-c", "us-east1-d"}
if vm.ParseArch(arch) == vm.ArchARM64 {
// T2A instances are only available in us-central1 in NA.
@@ -1016,7 +1016,7 @@ func (o *ProviderOpts) ConfigureCreateFlags(flags *pflag.FlagSet) {
fmt.Sprintf("Zones for cluster. If zones are formatted as AZ:N where N is an integer, the zone\n"+
"will be repeated N times. If > 1 zone specified, nodes will be geo-distributed\n"+
"regardless of geo (default [%s])",
strings.Join(defaultZones(string(vm.ArchAMD64)), ",")))
strings.Join(DefaultZones(string(vm.ArchAMD64)), ",")))
flags.BoolVar(&o.preemptible, ProviderName+"-preemptible", false,
"use preemptible GCE instances (lifetime cannot exceed 24h)")
flags.BoolVar(&o.UseSpot, ProviderName+"-use-spot", false,
@@ -1173,9 +1173,9 @@ func computeZones(opts vm.CreateOpts, providerOpts *ProviderOpts) ([]string, err
}
if len(zones) == 0 {
if opts.GeoDistributed {
zones = defaultZones(opts.Arch)
zones = DefaultZones(opts.Arch)
} else {
zones = []string{defaultZones(opts.Arch)[0]}
zones = []string{DefaultZones(opts.Arch)[0]}
}
}
if providerOpts.useArmAMI() {
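
To make the zone-mismatch scenario concrete, here is a hedged sketch of the kind of randomization the DefaultZones comment describes ("we randomize the specific zone"). It assumes the zone list is shuffled before being returned; the actual gcloud.go implementation may randomize differently.

package main

import (
	"fmt"
	"math/rand"
)

// defaultZonesSketch assumes the zone list is shuffled before being returned.
// Zone names come from the snippet above; the real DefaultZones may differ.
func defaultZonesSketch() []string {
	zones := []string{"us-east1-b", "us-east1-c", "us-east1-d"}
	rand.Shuffle(len(zones), func(i, j int) { zones[i], zones[j] = zones[j], zones[i] })
	return zones
}

func main() {
	// Two independent calls can disagree on the first (default) zone, which is
	// the mismatch that SetRoachprodOptsZones now prevents by resolving zones once.
	fmt.Println(defaultZonesSketch()[0], defaultZonesSketch()[0])
}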
