Skip to content

Commit

Permalink
Merge pull request cockroachdb#130189 from DarrylWong/backport24.1-13…
Browse files Browse the repository at this point in the history
…0083

release-24.1: roachtest: ensure workload node has same zone as cluster if unspecified
  • Loading branch information
DarrylWong authored Sep 6, 2024
2 parents d89feeb + ef25fb7 commit 5b45f64
Show file tree
Hide file tree
Showing 9 changed files with 131 additions and 52 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ artifacts
/certs
# make stress, acceptance produce stress.test, acceptance.test
*.test*
pkg/cmd/roachtest/_runner-logs
# fuzz tests
work-Fuzz*
*-fuzz.zip
Expand Down
Binary file added pkg/cmd/roachtest/.DS_Store
Binary file not shown.
1 change: 1 addition & 0 deletions pkg/cmd/roachtest/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,7 @@ go_test(
"//pkg/roachprod/logger",
"//pkg/roachprod/vm",
"//pkg/roachprod/vm/azure",
"//pkg/roachprod/vm/gce",
"//pkg/testutils",
"//pkg/testutils/datapathutils",
"//pkg/testutils/echotest",
Expand Down
13 changes: 9 additions & 4 deletions pkg/cmd/roachtest/cluster.go
Original file line number Diff line number Diff line change
Expand Up @@ -904,10 +904,6 @@ func (f *clusterFactory) newCluster(
if err != nil {
return nil, nil, err
}
if clusterCloud != spec.Local {
providerOptsContainer.SetProviderOpts(clusterCloud, providerOpts)
workloadProviderOptsContainer.SetProviderOpts(clusterCloud, workloadProviderOpts)
}

createFlagsOverride(&createVMOpts)
// Make sure expiration is changed if --lifetime override flag
Expand All @@ -930,6 +926,15 @@ func (f *clusterFactory) newCluster(
//
// https://github.com/cockroachdb/cockroach/issues/67906#issuecomment-887477675
genName := f.genName(cfg)

// Set the zones used for the cluster. We call this in the loop as the default GCE zone
// is randomized to avoid zone exhaustion errors.
providerOpts, workloadProviderOpts = cfg.spec.SetRoachprodOptsZones(providerOpts, workloadProviderOpts, params, string(selectedArch))
if clusterCloud != spec.Local {
providerOptsContainer.SetProviderOpts(clusterCloud, providerOpts)
workloadProviderOptsContainer.SetProviderOpts(clusterCloud, workloadProviderOpts)
}

// Logs for creating a new cluster go to a dedicated log file.
var retryStr string
if i > 1 {
Expand Down
100 changes: 61 additions & 39 deletions pkg/cmd/roachtest/spec/cluster_spec.go
Original file line number Diff line number Diff line change
Expand Up @@ -186,9 +186,7 @@ func awsMachineSupportsSSD(machineType string) bool {
return false
}

func getAWSOpts(
machineType string, zones []string, volumeSize, ebsThroughput int, localSSD bool,
) vm.ProviderOpts {
func getAWSOpts(machineType string, volumeSize, ebsThroughput int, localSSD bool) vm.ProviderOpts {
opts := aws.DefaultProviderOpts()
if volumeSize != 0 {
opts.DefaultEBSVolume.Disk.VolumeSize = volumeSize
Expand All @@ -204,15 +202,11 @@ func getAWSOpts(
} else {
opts.MachineType = machineType
}
if len(zones) != 0 {
opts.CreateZones = zones
}
return opts
}

func getGCEOpts(
machineType string,
zones []string,
volumeSize, localSSDCount int,
localSSD bool,
RAID0 bool,
Expand All @@ -233,9 +227,6 @@ func getGCEOpts(
if volumeSize != 0 {
opts.PDVolumeSize = volumeSize
}
if len(zones) != 0 {
opts.Zones = zones
}
opts.SSDCount = localSSDCount
if localSSD && localSSDCount > 0 {
// NB: As the default behavior for _roachprod_ (at least in AWS/GCP) is
Expand All @@ -253,12 +244,9 @@ func getGCEOpts(
return opts
}

func getAzureOpts(machineType string, zones []string, volumeSize int) vm.ProviderOpts {
func getAzureOpts(machineType string, volumeSize int) vm.ProviderOpts {
opts := azure.DefaultProviderOpts()
opts.MachineType = machineType
if len(zones) != 0 {
opts.Locations = zones
}
if volumeSize != 0 {
opts.NetworkDiskSize = int32(volumeSize)
}
Expand Down Expand Up @@ -418,25 +406,6 @@ func (s *ClusterSpec) RoachprodOpts(
}
}

zonesStr := params.Defaults.Zones
switch cloud {
case AWS:
if s.AWS.Zones != "" {
zonesStr = s.AWS.Zones
}
case GCE:
if s.GCE.Zones != "" {
zonesStr = s.GCE.Zones
}
}
var zones []string
if zonesStr != "" {
zones = strings.Split(zonesStr, ",")
if !s.Geo {
zones = zones[:1]
}
}

var workloadMachineType string
var err error
switch cloud {
Expand All @@ -460,27 +429,80 @@ func (s *ClusterSpec) RoachprodOpts(
var workloadProviderOpts vm.ProviderOpts
switch cloud {
case AWS:
providerOpts = getAWSOpts(machineType, zones, s.VolumeSize, s.AWS.VolumeThroughput,
providerOpts = getAWSOpts(machineType, s.VolumeSize, s.AWS.VolumeThroughput,
createVMOpts.SSDOpts.UseLocalSSD)
workloadProviderOpts = getAWSOpts(workloadMachineType, zones, s.VolumeSize, s.AWS.VolumeThroughput,
workloadProviderOpts = getAWSOpts(workloadMachineType, s.VolumeSize, s.AWS.VolumeThroughput,
createVMOpts.SSDOpts.UseLocalSSD)
case GCE:
providerOpts = getGCEOpts(machineType, zones, s.VolumeSize, ssdCount,
providerOpts = getGCEOpts(machineType, s.VolumeSize, ssdCount,
createVMOpts.SSDOpts.UseLocalSSD, s.RAID0, s.TerminateOnMigration,
s.GCE.MinCPUPlatform, vm.ParseArch(createVMOpts.Arch), s.GCE.VolumeType, s.UseSpotVMs,
)
workloadProviderOpts = getGCEOpts(workloadMachineType, zones, s.VolumeSize, ssdCount,
workloadProviderOpts = getGCEOpts(workloadMachineType, s.VolumeSize, ssdCount,
createVMOpts.SSDOpts.UseLocalSSD, s.RAID0, s.TerminateOnMigration,
s.GCE.MinCPUPlatform, vm.ParseArch(createVMOpts.Arch), s.GCE.VolumeType, s.UseSpotVMs,
)
case Azure:
providerOpts = getAzureOpts(machineType, zones, s.VolumeSize)
workloadProviderOpts = getAzureOpts(workloadMachineType, zones, s.VolumeSize)
providerOpts = getAzureOpts(machineType, s.VolumeSize)
workloadProviderOpts = getAzureOpts(workloadMachineType, s.VolumeSize)
}

return createVMOpts, providerOpts, workloadProviderOpts, selectedArch, nil
}

// SetRoachprodOptsZones updates the providerOpts with the VM zones as specified in the params/spec.
// We separate this logic from RoachprodOpts as we may need to call this multiple times in order to
// randomize the default GCE zone.
func (s *ClusterSpec) SetRoachprodOptsZones(
providerOpts, workloadProviderOpts vm.ProviderOpts, params RoachprodClusterConfig, arch string,
) (vm.ProviderOpts, vm.ProviderOpts) {
zonesStr := params.Defaults.Zones
cloud := params.Cloud
switch cloud {
case AWS:
if s.AWS.Zones != "" {
zonesStr = s.AWS.Zones
}
case GCE:
if s.GCE.Zones != "" {
zonesStr = s.GCE.Zones
}
}
var zones []string
if zonesStr != "" {
zones = strings.Split(zonesStr, ",")
if !s.Geo {
zones = zones[:1]
}
}

switch cloud {
case AWS:
if len(zones) == 0 {
if !s.Geo {
zones = aws.DefaultZones[:1]
} else {
zones = aws.DefaultZones
}
}
providerOpts.(*aws.ProviderOpts).CreateZones = zones
workloadProviderOpts.(*aws.ProviderOpts).CreateZones = zones
case GCE:
// We randomize the list of default zones for GCE for quota reasons, so decide the zone
// early to ensure that the workload node and CRDB cluster have the same default zone.
if len(zones) == 0 {
if !s.Geo {
zones = gce.DefaultZones(arch)[:1]
} else {
zones = gce.DefaultZones(arch)
}
}
providerOpts.(*gce.ProviderOpts).Zones = zones
workloadProviderOpts.(*gce.ProviderOpts).Zones = zones
}
return providerOpts, workloadProviderOpts
}

// Expiration is the lifetime of the cluster. It may be destroyed after
// the expiration has passed.
func (s *ClusterSpec) Expiration() time.Time {
Expand Down
50 changes: 50 additions & 0 deletions pkg/cmd/roachtest/test_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ import (
"github.com/cockroachdb/cockroach/pkg/roachprod/cloud"
"github.com/cockroachdb/cockroach/pkg/roachprod/logger"
"github.com/cockroachdb/cockroach/pkg/roachprod/vm"
"github.com/cockroachdb/cockroach/pkg/roachprod/vm/gce"
"github.com/cockroachdb/cockroach/pkg/testutils"
"github.com/cockroachdb/cockroach/pkg/util/stop"
"github.com/cockroachdb/cockroach/pkg/util/syncutil"
Expand Down Expand Up @@ -481,3 +482,52 @@ func TestNewCluster(t *testing.T) {
})
}
}

// Regression test for: https://github.com/cockroachdb/cockroach/issues/129997
// Tests that workload nodes are assigned the same default zone as the main CRDB cluster.
func TestGCESameDefaultZone(t *testing.T) {
ctx := context.Background()
factory := &clusterFactory{sem: make(chan struct{}, 1)}
cfg := clusterConfig{spec: spec.MakeClusterSpec(2, spec.WorkloadNode())}
setStatus := func(string) {}

defer func() {
create = roachprod.Create
}()

create = func(ctx context.Context, l *logger.Logger, username string, opts ...*cloud.ClusterCreateOpts) (retErr error) {
// Since we specified no zone for this cluster, roachtest should assign a default one for us.
// Check that it assigns the same default zone to both the CRDB cluster and the workload node.
require.Equal(t, len(opts), 2)
crdbZones := opts[0].ProviderOptsContainer[gce.ProviderName].(*gce.ProviderOpts).Zones
workloadZones := opts[1].ProviderOptsContainer[gce.ProviderName].(*gce.ProviderOpts).Zones
require.Equal(t, crdbZones, workloadZones)
// A bit of a workaround, we don't have a mock for registerCluster at this time which will panic if hit.
// Instead, just return an error to return early since we already tested the code paths we care about.
return &roachprod.ClusterAlreadyExistsError{}
}

testCases := []struct {
name string
geo bool
createMock func(ctx context.Context, l *logger.Logger, username string, opts ...*cloud.ClusterCreateOpts) (retErr error)
}{
{
name: "Separate GCE create calls for same cluster default to same zone",
geo: false,
},
{
name: "Separate GCE create calls for same geo cluster default to same zones",
geo: true,
},
}

for _, c := range testCases {
cfg.spec.Geo = c.geo
t.Run(c.name, func(t *testing.T) {
for i := 0; i < 100; i++ {
_, _, _ = factory.newCluster(ctx, cfg, setStatus, true)
}
})
}
}
Binary file added pkg/cmd/roachtest/testdata/.DS_Store
Binary file not shown.
8 changes: 4 additions & 4 deletions pkg/roachprod/vm/aws/aws.go
Original file line number Diff line number Diff line change
Expand Up @@ -349,7 +349,7 @@ var defaultConfig = func() (cfg *awsConfig) {
return cfg
}()

// defaultZones is the list of availability zones used by default for
// DefaultZones is the list of availability zones used by default for
// cluster creation. If the geo flag is specified, nodes are
// distributed between zones.
//
Expand All @@ -358,7 +358,7 @@ var defaultConfig = func() (cfg *awsConfig) {
// doesn't support multi-regional buckets, thus resulting in material
// egress cost if the test loads from a different region. See
// https://github.com/cockroachdb/cockroach/issues/105968.
var defaultZones = []string{
var DefaultZones = []string{
"us-east-2a",
"us-west-2b",
"eu-west-2b",
Expand Down Expand Up @@ -425,7 +425,7 @@ func (o *ProviderOpts) ConfigureCreateFlags(flags *pflag.FlagSet) {
fmt.Sprintf("aws availability zones to use for cluster creation. If zones are formatted\n"+
"as AZ:N where N is an integer, the zone will be repeated N times. If > 1\n"+
"zone specified, the cluster will be spread out evenly by zone regardless\n"+
"of geo (default [%s])", strings.Join(defaultZones, ",")))
"of geo (default [%s])", strings.Join(DefaultZones, ",")))
flags.StringVar(&o.ImageAMI, ProviderName+"-image-ami",
o.ImageAMI, "Override image AMI to use. See https://awscli.amazonaws.com/v2/documentation/api/latest/reference/ec2/describe-images.html")
flags.BoolVar(&o.UseMultipleDisks, ProviderName+"-enable-multiple-stores",
Expand Down Expand Up @@ -594,7 +594,7 @@ func (p *Provider) Create(
}

if len(expandedZones) == 0 {
expandedZones = defaultZones
expandedZones = DefaultZones
}

// We need to make sure that the SSH keys have been distributed to all regions.
Expand Down
10 changes: 5 additions & 5 deletions pkg/roachprod/vm/gce/gcloud.go
Original file line number Diff line number Diff line change
Expand Up @@ -910,7 +910,7 @@ type ProjectsVal struct {
AcceptMultipleProjects bool
}

// defaultZones is the list of zones used by default for cluster creation.
// DefaultZones is the list of zones used by default for cluster creation.
// If the geo flag is specified, nodes are distributed between zones.
// These are GCP zones available according to this page:
// https://cloud.google.com/compute/docs/regions-zones#available
Expand All @@ -920,7 +920,7 @@ type ProjectsVal struct {
// ARM64 builds), but we randomize the specific zone. This is to avoid
// "zone exhausted" errors in one particular zone, especially during
// nightly roachtest runs.
func defaultZones(arch string) []string {
func DefaultZones(arch string) []string {
zones := []string{"us-east1-b", "us-east1-c", "us-east1-d"}
if vm.ParseArch(arch) == vm.ArchARM64 {
// T2A instances are only available in us-central1 in NA.
Expand Down Expand Up @@ -1016,7 +1016,7 @@ func (o *ProviderOpts) ConfigureCreateFlags(flags *pflag.FlagSet) {
fmt.Sprintf("Zones for cluster. If zones are formatted as AZ:N where N is an integer, the zone\n"+
"will be repeated N times. If > 1 zone specified, nodes will be geo-distributed\n"+
"regardless of geo (default [%s])",
strings.Join(defaultZones(string(vm.ArchAMD64)), ",")))
strings.Join(DefaultZones(string(vm.ArchAMD64)), ",")))
flags.BoolVar(&o.preemptible, ProviderName+"-preemptible", false,
"use preemptible GCE instances (lifetime cannot exceed 24h)")
flags.BoolVar(&o.UseSpot, ProviderName+"-use-spot", false,
Expand Down Expand Up @@ -1173,9 +1173,9 @@ func computeZones(opts vm.CreateOpts, providerOpts *ProviderOpts) ([]string, err
}
if len(zones) == 0 {
if opts.GeoDistributed {
zones = defaultZones(opts.Arch)
zones = DefaultZones(opts.Arch)
} else {
zones = []string{defaultZones(opts.Arch)[0]}
zones = []string{DefaultZones(opts.Arch)[0]}
}
}
if providerOpts.useArmAMI() {
Expand Down

0 comments on commit 5b45f64

Please sign in to comment.