Skip to content

Commit

Permalink
docs: Operator Upgrade Docs (#206)
Browse files Browse the repository at this point in the history
* docs: Operator Upgrade Docs

* adding apply output

* Egress
  • Loading branch information
zacharyblasczyk authored Jun 6, 2024
1 parent 45e1d74 commit fbf8379
Show file tree
Hide file tree
Showing 11 changed files with 667 additions and 3 deletions.
11 changes: 8 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -102,13 +102,14 @@ resources that lack official modules.
Users can update the EKS cluster version to the latest version offered by AWS. This can be done using the `eks_cluster_version` Terraform variable. Note that cluster and nodegroup version updates can only be done one version at a time. For example, if your current cluster version is `1.21` and the latest available version is `1.25`, you need to:

1. update the cluster version in the app_eks module from `1.21` to `1.22`
2. run `terraform apply`
2. run `terraform apply`
3. update the cluster version to `1.23`
4. run `terraform apply`
5. update the cluster version to `1.24`
...and so on and so forth.
...and so on and so forth.

Upgrades must be executed in a step-wise fashion from one version to the next. You cannot skip versions when upgrading EKS.

<!-- BEGIN_TF_DOCS -->

### Notes on EKS Add-ons
Expand Down Expand Up @@ -252,7 +253,11 @@ CLI and re-run the apply. Running pods will not be impacted.

## Migrations

#### Upgrading from 3.x -> 4.x
### Upgrading to Operator

See the [operator upgrade guide](./docs/operator-migration/readme.md).

### Upgrading from 3.x -> 4.x

- If egress access for retrieving the wandb/controller image is not available, `terraform apply` may fail.
- It's necessary to supply a license variable within the module, as shown:
Expand Down
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
1 change: 1 addition & 0 deletions docs/operator-migration/images/post-operator-k8s.svg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
1 change: 1 addition & 0 deletions docs/operator-migration/images/pre-operator-infra.svg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
1 change: 1 addition & 0 deletions docs/operator-migration/images/pre-operator-k8s.svg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
113 changes: 113 additions & 0 deletions docs/operator-migration/post-operator.tf.disabled
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
# AWS provider configuration for the post-operator example, pinned to us-west-2.
provider "aws" {
  region = "us-west-2"

  # Default tags configured at the provider level.
  default_tags {
    tags = {
      # NOTE(review): "Enviroment" looks like a misspelling of "Environment".
      # It is left unchanged here because renaming a tag key would change the
      # tags on already-deployed resources; confirm before correcting.
      Enviroment = "Example"
      Example    = "PublicDnsExternal"
      GithubOrg  = "wandb"
      GithubRepo = "terraform-aws-wandb"
    }
  }
}

# Terraform CLI and provider requirements for the post-operator configuration.
terraform {
  required_version = "~> 1.0"

  required_providers {
    aws = {
      source  = "hashicorp/aws"
      version = "~> 4.0" # Post-Operator
    }
    kubernetes = {
      source  = "hashicorp/kubernetes"
      version = "~> 2.23"
    }
    # This file configures a `provider "helm"` block, so the provider is
    # declared explicitly instead of relying on Terraform's implied
    # hashicorp/* source lookup. No version constraint is set here so that
    # provider selection stays governed by the constraints of the modules
    # that use it.
    helm = {
      source = "hashicorp/helm"
    }
  }
}

# Post-operator infrastructure: the wandb AWS module at 4.7.2 with the
# operator-specific inputs (marked "Post-Operator") supplied.
module "wandb_infra" {
  source  = "wandb/wandb/aws"
  version = "4.7.2"

  # --- General ---
  namespace           = var.namespace
  deletion_protection = false
  public_access       = true
  external_dns        = true

  # --- Operator-specific inputs ---
  enable_dummy_dns    = var.enable_dummy_dns    # Post-Operator
  enable_operator_alb = var.enable_operator_alb # Post-Operator

  # Add License Post-Operator
  license = var.wandb_license

  # Use standard sizing Post-Operator
  size = var.size

  # Set the External DNS Custom Domain Filter Post-Operator
  custom_domain_filter = var.custom_domain_filter

  aws_loadbalancer_controller_tags = var.aws_loadbalancer_controller_tags

  # --- DNS ---
  domain_name = var.domain_name
  subdomain   = var.subdomain
  zone_id     = var.zone_id

  # --- Database ---
  database_engine_version      = var.database_engine_version
  database_instance_class      = var.database_instance_class
  database_snapshot_identifier = var.database_snapshot_identifier
  database_sort_buffer_size    = var.database_sort_buffer_size

  database_performance_insights_kms_key_arn = null

  # --- Network access ---
  allowed_inbound_cidr      = var.allowed_inbound_cidr
  allowed_inbound_ipv6_cidr = ["::/0"]

  # --- EKS ---
  eks_cluster_version            = "1.25"
  kubernetes_public_access       = true
  kubernetes_public_access_cidrs = ["0.0.0.0/0"]

  # --- Object storage ---
  bucket_name        = var.bucket_name
  bucket_kms_key_arn = var.bucket_kms_key_arn
  use_internal_queue = true
}

# Authentication data for the cluster created by module.wandb_infra;
# its token is passed to the kubernetes and helm providers.
data "aws_eks_cluster_auth" "app_cluster" {
  name = module.wandb_infra.cluster_id
}

# Cluster details (endpoint, certificate authority, name) used to configure
# the kubernetes and helm providers.
data "aws_eks_cluster" "app_cluster" {
  name = module.wandb_infra.cluster_id
}

# Kubernetes provider pointed at the EKS cluster looked up above.
provider "kubernetes" {
  host                   = data.aws_eks_cluster.app_cluster.endpoint
  token                  = data.aws_eks_cluster_auth.app_cluster.token
  cluster_ca_certificate = base64decode(data.aws_eks_cluster.app_cluster.certificate_authority[0].data)

  # NOTE(review): both a static `token` and an `exec` credential plugin are
  # configured; confirm which one the provider uses and whether both are needed.
  exec {
    api_version = "client.authentication.k8s.io/v1beta1"
    command     = "aws"
    args        = ["eks", "get-token", "--cluster-name", data.aws_eks_cluster.app_cluster.name]
  }
}

# Enable the Helm provider, using the same cluster connection settings as the
# kubernetes provider.
provider "helm" {
  kubernetes {
    host                   = data.aws_eks_cluster.app_cluster.endpoint
    token                  = data.aws_eks_cluster_auth.app_cluster.token
    cluster_ca_certificate = base64decode(data.aws_eks_cluster.app_cluster.certificate_authority[0].data)

    # NOTE(review): both a static `token` and an `exec` credential plugin are
    # configured; confirm which one the provider uses and whether both are needed.
    exec {
      api_version = "client.authentication.k8s.io/v1beta1"
      command     = "aws"
      args        = ["eks", "get-token", "--cluster-name", data.aws_eks_cluster.app_cluster.name]
    }
  }
}

# Re-export the bucket queue name reported by the infrastructure module.
output "bucket_queue_name" {
  value = module.wandb_infra.bucket_queue_name
}

# Re-export the bucket name reported by the infrastructure module.
output "bucket_name" {
  value = module.wandb_infra.bucket_name
}
12 changes: 12 additions & 0 deletions docs/operator-migration/post-operator.tfvars
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# Deployment identity and DNS
namespace   = "operator-upgrade"
subdomain   = "operator-upgrade"
domain_name = "sandbox-aws.wandb.ml"
zone_id     = "Z032246913CW32RVRY0WU"

# License
wandb_license = "eyJh"

# wandb_version = "0.51.2"
# The version is no longer set here; it now comes from the Release Channel
# or is set in the User Spec.

# Needed Operator Variables for Upgrade
size                 = "small"
custom_domain_filter = "sandbox-aws.wandb.ml"
enable_dummy_dns     = true
enable_operator_alb  = true
112 changes: 112 additions & 0 deletions docs/operator-migration/pre-operator.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,112 @@
# AWS provider configuration for the pre-operator example, pinned to us-west-2.
provider "aws" {
  region = "us-west-2"

  # Default tags configured at the provider level.
  default_tags {
    tags = {
      # NOTE(review): "Enviroment" looks like a misspelling of "Environment".
      # It is left unchanged here because renaming a tag key would change the
      # tags on already-deployed resources; confirm before correcting.
      Enviroment = "Example"
      Example    = "PublicDnsExternal"
      GithubOrg  = "wandb"
      GithubRepo = "terraform-aws-wandb"
    }
  }
}

# Terraform CLI and provider requirements for the pre-operator configuration.
terraform {
  required_version = "~> 1.0"

  required_providers {
    kubernetes = {
      source  = "hashicorp/kubernetes"
      version = "~> 2.23"
    }
    aws = {
      source  = "hashicorp/aws"
      version = "~> 3.6" # Pre-Operator
    }
  }
}

# Pre-operator infrastructure: the wandb AWS module at 1.16.10.
module "wandb_infra" {
  source  = "wandb/wandb/aws"
  version = "1.16.10"

  # --- General ---
  namespace           = var.namespace
  deletion_protection = false
  public_access       = true
  external_dns        = true

  # --- DNS ---
  domain_name = var.domain_name
  subdomain   = var.subdomain
  zone_id     = var.zone_id

  # --- Database ---
  database_engine_version      = var.database_engine_version
  database_instance_class      = var.database_instance_class
  database_snapshot_identifier = var.database_snapshot_identifier
  database_sort_buffer_size    = var.database_sort_buffer_size

  database_performance_insights_kms_key_arn = null

  # --- Network access ---
  allowed_inbound_cidr      = var.allowed_inbound_cidr
  allowed_inbound_ipv6_cidr = ["::/0"]

  # --- EKS ---
  eks_cluster_version            = "1.25"
  kubernetes_public_access       = true
  kubernetes_public_access_cidrs = ["0.0.0.0/0"]

  # --- Object storage ---
  bucket_name        = var.bucket_name
  bucket_kms_key_arn = var.bucket_kms_key_arn
  use_internal_queue = true
}

# Authentication data for the cluster created by module.wandb_infra;
# its token is passed to the kubernetes provider.
data "aws_eks_cluster_auth" "app_cluster" {
  name = module.wandb_infra.cluster_id
}

# Cluster details (endpoint, certificate authority, name) used to configure
# the kubernetes provider.
data "aws_eks_cluster" "app_cluster" {
  name = module.wandb_infra.cluster_id
}

# Kubernetes provider pointed at the EKS cluster looked up above.
provider "kubernetes" {
  host                   = data.aws_eks_cluster.app_cluster.endpoint
  token                  = data.aws_eks_cluster_auth.app_cluster.token
  cluster_ca_certificate = base64decode(data.aws_eks_cluster.app_cluster.certificate_authority[0].data)

  # NOTE(review): both a static `token` and an `exec` credential plugin are
  # configured; confirm which one the provider uses and whether both are needed.
  exec {
    api_version = "client.authentication.k8s.io/v1beta1"
    command     = "aws"
    args        = ["eks", "get-token", "--cluster-name", data.aws_eks_cluster.app_cluster.name]
  }
}

# Pre-operator application deployment: the wandb Kubernetes module, wired to
# the outputs of module.wandb_infra.
module "wandb_app" {
  source  = "wandb/wandb/kubernetes"
  version = "1.12.0"

  # Explicit ordering on the infrastructure module.
  depends_on = [module.wandb_infra]

  # --- Application image and license ---
  license       = var.wandb_license
  wandb_image   = var.wandb_image
  wandb_version = var.wandb_version

  # --- Endpoint ---
  host         = module.wandb_infra.url
  service_port = module.wandb_infra.internal_app_port

  # --- Object storage ---
  bucket             = "s3://${module.wandb_infra.bucket_name}"
  bucket_aws_region  = module.wandb_infra.bucket_region
  bucket_kms_key_arn = module.wandb_infra.kms_key_arn
  bucket_queue       = "internal://"

  # --- Data stores ---
  database_connection_string = "mysql://${module.wandb_infra.database_connection_string}"
  redis_connection_string    = "redis://${module.wandb_infra.elasticache_connection_string}?tls=true&ttlInSeconds=604800"

  # --- Extra environment ---
  # The secret-store entry is listed first, so a matching key in
  # var.other_wandb_env takes precedence in the merge.
  other_wandb_env = merge(
    {
      "GORILLA_CUSTOMER_SECRET_STORE_SOURCE" = "aws-secretmanager://${var.namespace}?namespace=${var.namespace}"
    },
    var.other_wandb_env,
  )
}

# Re-export the bucket queue name reported by the infrastructure module.
output "bucket_queue_name" {
  value = module.wandb_infra.bucket_queue_name
}

# Re-export the bucket name reported by the infrastructure module.
output "bucket_name" {
  value = module.wandb_infra.bucket_name
}
7 changes: 7 additions & 0 deletions docs/operator-migration/pre-operator.tfvars
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# Deployment identity and DNS
namespace   = "operator-upgrade"
subdomain   = "operator-upgrade"
domain_name = "sandbox-aws.wandb.ml"
zone_id     = "Z032246913CW32RVRY0WU"

# License and application version
wandb_license = "eyJh"
wandb_version = "0.51.2"

# Not set in the pre-operator configuration.
# size = "small"
Loading

0 comments on commit fbf8379

Please sign in to comment.