An ETL pipeline with Amazon Redshift and AWS Glue example (#498)
Showing 7 changed files with 352 additions and 0 deletions.
@@ -0,0 +1,10 @@
### Scala and JVM
*.class
*.log
.bsp
.scala-build

# virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml
hs_err_pid*

kubeconfig.json
@@ -0,0 +1,224 @@
import besom.*
import besom.api.aws
import besom.json.*

@main def main = Pulumi.run {

  val clusterIdentifier = "my-redshift-cluster"
  val clusterDBName = "dev"
  val clusterDBUsername = "admin"
  val clusterDBPassword = "Password!123"
  val glueDBName = "my-glue-db"

  // Create an S3 bucket to store some raw data.
  val eventsBucket = aws.s3.Bucket(
    name = "events",
    aws.s3.BucketArgs(forceDestroy = true)
  )

  // Create a VPC.
  val vpc = aws.ec2.Vpc(
    name = "vpc",
    aws.ec2.VpcArgs(
      cidrBlock = "10.0.0.0/16",
      enableDnsHostnames = true
    )
  )

  // Create a private subnet within the VPC.
  val subnet = aws.ec2.Subnet(
    name = "subnet",
    aws.ec2.SubnetArgs(
      vpcId = vpc.id,
      cidrBlock = "10.0.1.0/24"
    )
  )

  // Declare a Redshift subnet group with the subnet ID.
  val subnetGroup = aws.redshift.SubnetGroup(
    name = "subnet-group",
    aws.redshift.SubnetGroupArgs(
      subnetIds = List(subnet.id)
    )
  )

  // Create an IAM role granting Redshift read-only access to S3.
  val redshiftRole = aws.iam.Role(
    name = "redshift-role",
    aws.iam.RoleArgs(
      assumeRolePolicy = json"""{
        "Version": "2012-10-17",
        "Statement": [
          {
            "Action": "sts:AssumeRole",
            "Effect": "Allow",
            "Principal": {
              "Service": "redshift.amazonaws.com"
            }
          }
        ]
      }""".map(_.prettyPrint),
      managedPolicyArns = List(
        aws.iam.enums.ManagedPolicy.AmazonS3ReadOnlyAccess.value
      )
    )
  )

  // Create a VPC endpoint so the cluster can read from S3 over the private network.
  val vpcEndpoint = aws.ec2.VpcEndpoint(
    name = "s3-vpc-endpoint",
    aws.ec2.VpcEndpointArgs(
      vpcId = vpc.id,
      serviceName = p"com.amazonaws.${aws.getRegion(aws.GetRegionArgs()).name}.s3",
      routeTableIds = List(vpc.mainRouteTableId)
    )
  )

  // Create a single-node Redshift cluster in the VPC.
  val cluster = aws.redshift.Cluster(
    name = "cluster",
    aws.redshift.ClusterArgs(
      clusterIdentifier = clusterIdentifier,
      databaseName = clusterDBName,
      masterUsername = clusterDBUsername,
      masterPassword = clusterDBPassword,
      nodeType = "ra3.xlplus",
      clusterSubnetGroupName = subnetGroup.name,
      clusterType = "single-node",
      publiclyAccessible = false,
      skipFinalSnapshot = true,
      vpcSecurityGroupIds = List(vpc.defaultSecurityGroupId),
      iamRoles = List(redshiftRole.arn)
    )
  )

  // Define an AWS cron expression of "every 15 minutes".
  // https://docs.aws.amazon.com/lambda/latest/dg/services-cloudwatchevents-expressions.html
  val every15minutes = "cron(0/15 * * * ? *)"

  // Create a Glue catalog database.
  val glueCatalogDB = aws.glue.CatalogDatabase(
    name = "glue-catalog-db",
    aws.glue.CatalogDatabaseArgs(
      name = glueDBName
    )
  )

  // Define an IAM role granting AWS Glue access to S3 and other Glue-required services.
  val glueRole = aws.iam.Role(
    name = "glue-role",
    aws.iam.RoleArgs(
      assumeRolePolicy = json"""{
        "Version": "2012-10-17",
        "Statement": [
          {
            "Action": "sts:AssumeRole",
            "Effect": "Allow",
            "Principal": {
              "Service": "glue.amazonaws.com"
            }
          }
        ]
      }""".map(_.prettyPrint),
      managedPolicyArns = List(
        aws.iam.enums.ManagedPolicy.AmazonS3FullAccess.value,
        aws.iam.enums.ManagedPolicy.AWSGlueServiceRole.value
      )
    )
  )

  // Create a Glue crawler to process the contents of the data bucket on a schedule.
  // https://docs.aws.amazon.com/glue/latest/dg/monitor-data-warehouse-schedule.html
  val glueCrawler = aws.glue.Crawler(
    name = "glue-crawler",
    aws.glue.CrawlerArgs(
      databaseName = glueCatalogDB.name,
      role = glueRole.arn,
      schedule = every15minutes,
      s3Targets = List(
        aws.glue.inputs.CrawlerS3TargetArgs(
          path = p"s3://${eventsBucket.bucket}"
        )
      )
    )
  )

  // Create a Glue connection to the Redshift cluster.
  val glueRedshiftConnection = aws.glue.Connection(
    name = "glue-redshift-connection",
    aws.glue.ConnectionArgs(
      connectionType = "JDBC",
      connectionProperties = Map(
        "JDBC_CONNECTION_URL" -> p"jdbc:redshift://${cluster.endpoint}/${clusterDBName}",
        "USERNAME" -> clusterDBUsername,
        "PASSWORD" -> clusterDBPassword
      ),
      physicalConnectionRequirements = aws.glue.inputs.ConnectionPhysicalConnectionRequirementsArgs(
        securityGroupIdLists = cluster.vpcSecurityGroupIds,
        availabilityZone = subnet.availabilityZone,
        subnetId = subnet.id
      )
    )
  )

  // Create an S3 bucket for Glue scripts and temporary storage.
  val glueJobBucket = aws.s3.Bucket(
    name = "glue-job-bucket",
    aws.s3.BucketArgs(
      forceDestroy = true
    )
  )

  // Upload a Glue job script.
  val glueJobScript = aws.s3.BucketObject(
    name = "glue-job.py",
    aws.s3.BucketObjectArgs(
      bucket = glueJobBucket.id,
      source = Asset.FileAsset("./glue-job.py")
    )
  )

  // Create a Glue job that runs our Python ETL script.
  val glueJob = aws.glue.Job(
    name = "glue-job",
    aws.glue.JobArgs(
      roleArn = glueRole.arn,
      glueVersion = "3.0",
      numberOfWorkers = 10,
      workerType = "G.1X",
      defaultArguments = Map(
        // Enabling job bookmarks helps you avoid loading duplicate data.
        // https://docs.aws.amazon.com/glue/latest/dg/monitor-continuations.html
        "--job-bookmark-option" -> "job-bookmark-enable",
        "--ConnectionName" -> glueRedshiftConnection.name,
        "--GlueDBName" -> glueDBName,
        "--GlueDBTableName" -> eventsBucket.bucket.map(_.replace("-", "_")),
        "--RedshiftDBName" -> clusterDBName,
        "--RedshiftDBTableName" -> "events",
        "--RedshiftRoleARN" -> redshiftRole.arn,
        "--TempDir" -> p"s3://${glueJobBucket.bucket}/glue-job-temp"
      ),
      connections = List(glueRedshiftConnection.name),
      command = aws.glue.inputs.JobCommandArgs(
        scriptLocation = p"s3://${glueJobBucket.bucket}/glue-job.py",
        pythonVersion = "3"
      )
    )
  )

  // Create a Glue trigger to run the job every 15 minutes.
  val glueJobTrigger = aws.glue.Trigger(
    name = "trigger",
    aws.glue.TriggerArgs(
      schedule = every15minutes,
      `type` = "SCHEDULED",
      actions = List(
        aws.glue.inputs.TriggerActionArgs(jobName = glueJob.name)
      )
    )
  )

  Stack(vpcEndpoint, glueCrawler, glueJobScript, glueJobTrigger).exports(
    dataBucketName = eventsBucket.bucket
  )
}
@@ -0,0 +1,3 @@
name: aws-redshift-glue-etl
description: An ETL pipeline with Amazon Redshift and AWS Glue
runtime: scala
@@ -0,0 +1,45 @@
# ETL pipeline with Amazon Redshift and AWS Glue

This example creates an ETL pipeline using Amazon Redshift and AWS Glue. The pipeline extracts data from an S3 bucket
with a Glue crawler, transforms it with a Python script wrapped in a Glue job, and loads it into a Redshift database
deployed in a VPC.

## Prerequisites

[Follow the instructions](https://www.pulumi.com/docs/clouds/aws/get-started/begin/)
to get started with Pulumi & AWS.

## Deploying

1. Create a new stack, which is an isolated deployment target for this example:

   ```bash
   pulumi stack init dev
   ```

2. Set the AWS region:

   ```bash
   pulumi config set aws:region us-west-2
   ```

3. Stand up the cluster:

   ```bash
   pulumi up
   ```

4. In a few moments, the Redshift cluster and Glue components will be up and running, and the S3 bucket name
   will be emitted as a Pulumi stack output:

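   The bucket name comes from the `dataBucketName` export at the end of the program, for example:

   ```bash
   pulumi stack output dataBucketName
   ```
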
5. Upload the included sample data file to S3 to verify the automation works as expected:

   ```bash
   aws s3 cp events-1.txt s3://$(pulumi stack output dataBucketName)
   ```

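   The crawler and the job each run on a 15-minute schedule, so it can take up to two cycles before rows appear
   in Redshift. One way to confirm the load (not part of this example, and assuming the AWS CLI with the Redshift
   Data API is available) is to query the cluster directly, using the identifier, database, and user defined in
   the program:

   ```bash
   # Query the `events` table that the Glue job loads.
   aws redshift-data execute-statement \
     --cluster-identifier my-redshift-cluster \
     --database dev \
     --db-user admin \
     --sql "SELECT * FROM events;"

   # Fetch the results using the statement ID returned by the previous command.
   aws redshift-data get-statement-result --id <statement-id>
   ```
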
6. When you're ready, destroy your stack and remove it:

   ```bash
   pulumi destroy --yes
   pulumi stack rm --yes
   ```
@@ -0,0 +1,3 @@
{"id": 1, "name": "An interesting event"}
{"id": 2, "name": "Another interesting event"}
{"id": 3, "name": "An event of monumental importance"}
@@ -0,0 +1,61 @@
import sys
from awsglue.utils import getResolvedOptions
from awsglue.transforms import ApplyMapping
from awsglue.context import GlueContext
from awsglue.job import Job
from pyspark.context import SparkContext

# Collect the arguments passed in by the glue.Job run.
args = getResolvedOptions(
    sys.argv,
    [
        "JOB_NAME",
        "TempDir",
        "ConnectionName",
        "GlueDBName",
        "GlueDBTableName",
        "RedshiftRoleARN",
        "RedshiftDBName",
        "RedshiftDBTableName",
    ],
)

glueContext = GlueContext(SparkContext.getOrCreate())

job = Job(glueContext)
job.init(args["JOB_NAME"], args)

# Extract all unprocessed data from the Glue catalog.
source0 = glueContext.create_dynamic_frame.from_catalog(
    database=args["GlueDBName"],
    table_name=args["GlueDBTableName"],
    additional_options={
        "jobBookmarkKeys": ["id"],
        "jobBookmarkKeysSortOrder": "asc",
    },
    transformation_ctx="source0",
)

# Transform the data (mostly just to show how to do so).
transformed0 = ApplyMapping.apply(
    frame=source0,
    mappings=[
        ("id", "int", "event_id", "int"),
        ("name", "string", "event_name", "string"),
    ],
)

# Load the data into the Redshift database.
glueContext.write_dynamic_frame.from_jdbc_conf(
    frame=transformed0,
    catalog_connection=args["ConnectionName"],
    connection_options={
        "database": args["RedshiftDBName"],
        "dbtable": args["RedshiftDBTableName"],
        "aws_iam_role": args["RedshiftRoleARN"],
    },
    redshift_tmp_dir=args["TempDir"],
)

# Call commit() to update the job bookmark for the next run.
job.commit()
@@ -0,0 +1,6 @@
//> using scala "3.3.1"
//> using options -Werror -Wunused:all -Wvalue-discard -Wnonunit-statement
//> using dep "org.virtuslab::besom-core:0.4.0-SNAPSHOT"
//> using dep "org.virtuslab::besom-aws:6.32.0-core.0.4-SNAPSHOT"

//> using repository sonatype:snapshots