Skip to content

Commit

Permalink
Merge pull request #5228 from GeorgianaElena/maap-cluster
Browse files Browse the repository at this point in the history
maap: new cluster
  • Loading branch information
GeorgianaElena authored Dec 5, 2024
2 parents b080510 + 52a9f88 commit 6a38b14
Show file tree
Hide file tree
Showing 9 changed files with 452 additions and 0 deletions.
1 change: 1 addition & 0 deletions .github/workflows/deploy-grafana-dashboards.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ jobs:
- cluster_name: jupyter-meets-the-earth
- cluster_name: kitware
- cluster_name: leap
- cluster_name: maap
- cluster_name: nasa-cryo
- cluster_name: nasa-ghg
- cluster_name: nasa-veda
Expand Down
37 changes: 37 additions & 0 deletions config/clusters/maap/cluster.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
# Cluster definition for the "maap" cluster, an EKS cluster on AWS.
name: maap
provider: aws # https://916098889494.signin.aws.amazon.com/console
# AWS-specific settings: the (sops-encrypted) deployer credentials file,
# plus the type, name and region of the cluster.
aws:
key: enc-deployer-credentials.secret.json
clusterType: eks
clusterName: maap
region: us-west-2
# NOTE(review): presumably means the cloud bill is not paid by 2i2c - confirm.
billing:
paid_by_us: false
# Helm values files for the support deployment; the "enc-" prefixed file is
# sops-encrypted.
support:
helm_chart_values_files:
- support.values.yaml
- enc-support.secret.values.yaml
# No hubs are deployed yet, hence the explicit empty list.
hubs:
[]
# Uncomment the lines below once the support infrastructure has been deployed
# and you are ready to add the first hub

# - name: staging
# # Tip: consider changing this to something more human friendly
# display_name: "maap - staging"
# domain: staging.maap.2i2c.cloud
# helm_chart: basehub
# helm_chart_values_files:
# - common.values.yaml
# - staging.values.yaml
# - enc-staging.secret.values.yaml

# - name: prod
# # Tip: consider changing this to something more human friendly
# display_name: "maap - prod"
# domain: prod.maap.2i2c.cloud
# helm_chart: basehub
# helm_chart_values_files:
# - common.values.yaml
# - prod.values.yaml
# - enc-prod.secret.values.yaml
25 changes: 25 additions & 0 deletions config/clusters/maap/enc-deployer-credentials.secret.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
{
"AccessKey": {
"AccessKeyId": "ENC[AES256_GCM,data:JMiFl1UnzusCQNlEOBsYvHa+9Uo=,iv:CC0kCAIAbQXtJE4aWfvXd63FWVSuO9To2L8aKkHRgo4=,tag:r2ZlXvm+UtsVyim0WI0M9Q==,type:str]",
"SecretAccessKey": "ENC[AES256_GCM,data:w6Agme4BM109uRDH2CXIp9ffqeD6xXe/Rw6ed2X8uN42CecK1vamNQ==,iv:7eEROA5OrThNMgq9dsHeVyFFsSUbksmt1kA0f5dBDXA=,tag:5UD9cGGNEKvw20Cril4evw==,type:str]",
"UserName": "ENC[AES256_GCM,data:GcAK1BJTZVmJGoVxeRb4zErA7RA371Y=,iv:6udAmDeSfJ2DO8j+/aINVF4PSjhQs+j5BxBSA2llB9Y=,tag:zYLlltSLTCH01wxrr5mffg==,type:str]"
},
"sops": {
"kms": null,
"gcp_kms": [
{
"resource_id": "projects/two-eye-two-see/locations/global/keyRings/sops-keys/cryptoKeys/similar-hubs",
"created_at": "2024-12-04T12:21:40Z",
"enc": "CiUA4OM7eOtAu8gt5nq+Tr+m64LsqMU7YruHfYzFWFswrGfKO5SgEkkAnGhyNghFbi9rWO0BUsWs199nUCTeQOOebtO8KFEMrbH5bejuZDyjRar2fU3WyUKxlBRuywgZySqZgJ9Ut+LDL+c2LdWZD+Qz"
}
],
"azure_kv": null,
"hc_vault": null,
"age": null,
"lastmodified": "2024-12-04T12:21:41Z",
"mac": "ENC[AES256_GCM,data:kuyRynza4+RG2CGJyYQgUqjLAEZiCrjRvTpR/ciO0yKoRhFzykkbg12J/1y4M4eqlsezvUfyqE+EUtsBaISH1mg8nIuchHi6sRz9XAjQeLX3cwrEPlItH7sUjjGOTbRhcHna+zXVoM2q6gxIpEdNaNq/vPtAKs9TGCRRkw1NfSQ=,iv:RvP7hU6/6kJOBStTO5FEACDPwDA5tBYvjEptdGDRcOA=,tag:as4VS4owv5yZ2c0s+lbZ8A==,type:str]",
"pgp": null,
"unencrypted_suffix": "_unencrypted",
"version": "3.8.1"
}
}
17 changes: 17 additions & 0 deletions config/clusters/maap/enc-support.secret.values.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
prometheusIngressAuthSecret:
username: ENC[AES256_GCM,data:1Fs5zwh1wn4/8KWnSoswC/KiW/1jw8CJxUSnOLne6KRI1W9uftsJt43FmRdzQMqsiadc291Jo74/YWBFBC1khw==,iv:ouHNVDQcyfsHQ7zj144fVEfqQX7oIez0uLmCDeO47dw=,tag:MxKMSNP+DVTBdQbBRIxA+Q==,type:str]
password: ENC[AES256_GCM,data:qtItFIiARguwpejHWHBDSoKOl4uilmXgEkC4nBonqqWoCkMBHBDFCAr7qbH+fwep+1+yNUkuDXKJE6l0zp/gqw==,iv:8Pcbr2lulRPc0wPYOtgLez2lBLa+PKfxmd/SA75VLpY=,tag:mzZukJ3yv+IPxxDO22O9Sg==,type:str]
sops:
kms: []
gcp_kms:
- resource_id: projects/two-eye-two-see/locations/global/keyRings/sops-keys/cryptoKeys/similar-hubs
created_at: "2024-12-04T11:17:12Z"
enc: CiUA4OM7eInxKKOnVMfm7f3ZEMUF8+vdF7TSx3WQo65HugraH6wMEkkAnGhyNpIACP7jUyAu/WPOXEmSwhwAXVaQGCMbgWbeuh0A+qvSUieMHE53t/VCgGa5n0Dnitr/jqchmhNaJQfs4GyoxgF3RbAp
azure_kv: []
hc_vault: []
age: []
lastmodified: "2024-12-04T11:17:12Z"
mac: ENC[AES256_GCM,data:9hrfgDF4tkpynItWcIkFTIGF8GRxeCXm0vcdMwcuNAx4E/vC/WMKxES3LFK2ygNzSljKZ3C76F3ipHjEioognquZQoEZWF22tAcJHFfc1VGa9iR6Dh22z4X33UcEZFELXBDJUPI01YWEOybqx74Khd13Yo8ht61vnUsDEbvEPTY=,iv:EwWG5H90WIEoX1T46DDaSvascSafppbtRvQPW9byerY=,tag:wDIatpNvUyHBzLSqzhabkQ==,type:str]
pgp: []
unencrypted_suffix: _unencrypted
version: 3.8.1
42 changes: 42 additions & 0 deletions config/clusters/maap/support.values.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
# Support chart values for the maap cluster.

# Turn on the auth secret for the Prometheus ingress. The matching
# username/password values live in enc-support.secret.values.yaml.
prometheusIngressAuthSecret:
enabled: true

# Expose the Prometheus server through an ingress at
# prometheus.maap.2i2c.cloud, with TLS from the prometheus-tls secret.
prometheus:
server:
ingress:
enabled: true
hosts:
- prometheus.maap.2i2c.cloud
tls:
- secretName: prometheus-tls
hosts:
- prometheus.maap.2i2c.cloud

# Grafana is served at grafana.maap.2i2c.cloud; GitHub authentication is
# enabled and restricted to the 2i2c-org organization.
grafana:
grafana.ini:
server:
root_url: https://grafana.maap.2i2c.cloud/
auth.github:
enabled: true
allowed_organizations: 2i2c-org
ingress:
hosts:
- grafana.maap.2i2c.cloud
tls:
- secretName: grafana-tls
hosts:
- grafana.maap.2i2c.cloud

# Enable aws-ce-grafana-backend for this cluster. Its service account is
# annotated with the IAM role it should use.
aws-ce-grafana-backend:
enabled: true
envBasedConfig:
clusterName: maap
serviceAccount:
annotations:
eks.amazonaws.com/role-arn: arn:aws:iam::916098889494:role/aws_ce_grafana_backend_iam_role

# Enable the cluster autoscaler with auto-discovery for the maap cluster
# in us-west-2.
cluster-autoscaler:
enabled: true
autoDiscovery:
clusterName: maap
awsRegion: us-west-2
262 changes: 262 additions & 0 deletions eksctl/maap.jsonnet
Original file line number Diff line number Diff line change
@@ -0,0 +1,262 @@
/*
This file is a jsonnet template of an eksctl cluster configuration file,
which is used with the eksctl CLI to both update and initialize an AWS EKS
based cluster.
This file has in turn been generated from eksctl/template.jsonnet which is
relevant to compare with for changes over time.
To use jsonnet to generate an eksctl configuration file from this, do:
jsonnet maap.jsonnet > maap.eksctl.yaml
References:
- https://eksctl.io/usage/schema/
*/
local ng = import "./libsonnet/nodegroup.jsonnet";

// Region of the cluster; used below as the cluster's metadata.region.
local clusterRegion = "us-west-2";
// Availability zones set on the cluster itself. The GPU node groups also
// use this list so they can be provisioned in any of these zones.
local masterAzs = ["us-west-2a", "us-west-2b", "us-west-2c"];
// Single availability zone that the core, notebook and dask node groups
// are placed in unless a node definition overrides availabilityZones.
local nodeAz = "us-west-2a";

// Notebook node pools. Each entry is merged on top of the shared notebook
// node group definition in `nodeGroups`, so fields set here take precedence.
// A `node.kubernetes.io/instance-type` label is added, so pods
// can request a particular kind of node with a nodeSelector
local notebookNodes = [
  // CPU pools: one per hub and instance size, staging first and then prod
  {
    instanceType: size,
    namePrefix: "nb-" + hub,
    labels+: { "2i2c/hub-name": hub },
    tags+: { "2i2c:hub-name": hub },
  }
  for hub in ["staging", "prod"]
  for size in ["r5.xlarge", "r5.4xlarge", "r5.16xlarge"]
] + [
  // GPU pools: one per hub
  {
    instanceType: "g4dn.xlarge",
    namePrefix: "gpu-" + hub,
    labels+: { "2i2c/hub-name": hub },
    tags+: {
      "2i2c:hub-name": hub,
      "k8s.io/cluster-autoscaler/node-template/resources/nvidia.com/gpu": "1",
    },
    taints+: {
      "nvidia.com/gpu": "present:NoSchedule",
    },
    // Allow provisioning GPUs across all AZs, to prevent the situation where
    // all GPUs in a single AZ are in use and no new nodes can be spawned
    availabilityZones: masterAzs,
  }
  for hub in ["staging", "prod"]
];

// Node definitions for dask worker nodes. Each entry is merged with our
// dask worker node definition, which uses spot instances.
// A `node.kubernetes.io/instance-type` label is set to the name of the
// *first* item in instancesDistribution.instanceTypes, to match
// what we do with notebook nodes. Pods can request a particular
// kind of node with a nodeSelector
//
// A not yet fully established policy is being developed about using a single
// node pool, see https://github.com/2i2c-org/infrastructure/issues/2687.
local daskNodes = [
  // One dask worker pool per hub
  {
    namePrefix: "dask-" + hub,
    labels+: { "2i2c/hub-name": hub },
    tags+: { "2i2c:hub-name": hub },
    instancesDistribution+: { instanceTypes: ["r5.4xlarge"] },
  }
  for hub in ["staging", "prod"]
];


// The eksctl ClusterConfig object that this template evaluates to.
{
apiVersion: 'eksctl.io/v1alpha5',
kind: 'ClusterConfig',
metadata+: {
name: "maap",
region: clusterRegion,
version: "1.30",
tags+: {
"ManagedBy": "2i2c",
"2i2c.org/cluster-name": $.metadata.name,
},
},
availabilityZones: masterAzs,
iam: {
withOIDC: true,
},
// If you add an addon to this config, run the create addon command.
//
// eksctl create addon --config-file=maap.eksctl.yaml
//
// Every addon listed below gets `version: "latest"` and the cluster's
// metadata tags; fields in the addon's own object are merged on top.
addons: [
{ version: "latest", tags: $.metadata.tags } + addon
for addon in
[
{ name: "coredns" },
{ name: "kube-proxy" },
{
// vpc-cni is an Amazon-maintained container networking interface
// (CNI), where a CNI is required for k8s networking. The aws-node
// DaemonSet in kube-system stems from installing this.
//
// Related docs: https://kubernetes.io/docs/concepts/extend-kubernetes/compute-storage-net/network-plugins/
// https://docs.aws.amazon.com/eks/latest/userguide/managing-vpc-cni.html
//
name: "vpc-cni",
attachPolicyARNs: ["arn:aws:iam::aws:policy/AmazonEKS_CNI_Policy"],
# FIXME: enabling network policy enforcement didn't work as of
# August 2024, what's wrong isn't clear.
#
# configurationValues ref: https://github.com/aws/amazon-vpc-cni-k8s/blob/HEAD/charts/aws-vpc-cni/values.yaml
configurationValues: |||
enableNetworkPolicy: "false"
|||,
},
{
// aws-ebs-csi-driver ensures that our PVCs are bound to PVs that
// couple to AWS EBS based storage, without it expect to see pods
// mounting a PVC failing to schedule and PVC resources that are
// unbound.
//
// Related docs: https://docs.aws.amazon.com/eks/latest/userguide/managing-ebs-csi.html
//
name: "aws-ebs-csi-driver",
wellKnownPolicies: {
ebsCSIController: true,
},
# configurationValues ref: https://github.com/kubernetes-sigs/aws-ebs-csi-driver/blob/HEAD/charts/aws-ebs-csi-driver/values.yaml
configurationValues: |||
defaultStorageClass:
enabled: true
|||,
},
]
],
// Node groups are built from three lists (core, notebook, dask) that are
// concatenated; every resulting group also gets the cluster name set.
nodeGroups: [
n + {clusterName: $.metadata.name} for n in
[
// Core node group: a single group in nodeAz, scaling between 1 and 6
// r5.xlarge nodes.
ng + {
namePrefix: 'core',
nameSuffix: 'a',
nameIncludeInstanceType: false,
availabilityZones: [nodeAz],
ssh: {
publicKeyPath: 'ssh-keys/maap.key.pub'
},
instanceType: "r5.xlarge",
minSize: 1,
maxSize: 6,
labels+: {
"hub.jupyter.org/node-purpose": "core",
"k8s.dask.org/node-purpose": "core",
},
tags+: {
"2i2c:node-purpose": "core"
},
},
] + [
// Notebook node groups: one per entry in notebookNodes. The entry `n` is
// merged last, so its plain fields (e.g. namePrefix, availabilityZones)
// override the defaults set here, while its `+:` fields are merged in.
ng + {
namePrefix: 'nb',
availabilityZones: [nodeAz],
minSize: 0,
maxSize: 500,
instanceType: n.instanceType,
ssh: {
publicKeyPath: 'ssh-keys/maap.key.pub'
},
labels+: {
"hub.jupyter.org/node-purpose": "user",
"k8s.dask.org/node-purpose": "scheduler"
},
taints+: {
"hub.jupyter.org_dedicated": "user:NoSchedule",
"hub.jupyter.org/dedicated": "user:NoSchedule",
},
tags+: {
"2i2c:node-purpose": "user"
},
} + n for n in notebookNodes
// Dask worker node groups: one per entry in daskNodes, merged the same way
// as the notebook groups. Skipped entirely when daskNodes is null.
] + ( if daskNodes != null then
[
ng + {
namePrefix: 'dask',
availabilityZones: [nodeAz],
minSize: 0,
maxSize: 500,
ssh: {
publicKeyPath: 'ssh-keys/maap.key.pub'
},
labels+: {
"k8s.dask.org/node-purpose": "worker"
},
taints+: {
"k8s.dask.org_dedicated" : "worker:NoSchedule",
"k8s.dask.org/dedicated" : "worker:NoSchedule",
},
tags+: {
"2i2c:node-purpose": "worker"
},
// No on-demand base capacity and 0% on-demand above it: workers are
// requested as spot instances.
instancesDistribution+: {
onDemandBaseCapacity: 0,
onDemandPercentageAboveBaseCapacity: 0,
spotAllocationStrategy: "capacity-optimized",
},
} + n for n in daskNodes
] else []
)
]
}
1 change: 1 addition & 0 deletions eksctl/ssh-keys/maap.key.pub
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIEKAnc9uvG/u94tT0iBOzgpcIbtzYqn18Mrm0MGGscJc [email protected]
Loading

0 comments on commit 6a38b14

Please sign in to comment.