Merge pull request apache#3304 from bamaer/3281
new AWS Redshift bulk loader transform apache#3281
hansva authored Oct 16, 2023
2 parents 4a30965 + 48a8822 commit 7a96618
Showing 15 changed files with 3,434 additions and 1 deletion.
6 changes: 6 additions & 0 deletions assemblies/plugins/tech/aws/pom.xml
@@ -36,6 +36,7 @@
<!-- Third-party dependencies -->
<aws-java-sdk-s3.version>1.12.279</aws-java-sdk-s3.version>
<aws-java-sdk-core.version>1.12.279</aws-java-sdk-core.version>
<redshift.jdbc.version>2.1.0.19</redshift.jdbc.version>

<!-- Test dependencies -->
<mockito.version>1.10.19</mockito.version>
@@ -72,5 +73,10 @@
<version>${aws-java-sdk-s3.version}</version>
<scope>compile</scope>
</dependency>
<dependency>
<groupId>com.amazon.redshift</groupId>
<artifactId>redshift-jdbc42</artifactId>
<version>${redshift.jdbc.version}</version>
</dependency>
</dependencies>
</project>
1 change: 1 addition & 0 deletions assemblies/plugins/tech/aws/src/assembly/assembly.xml
@@ -57,6 +57,7 @@
<include>joda-time:joda-time</include>
<include>com.amazonaws:aws-java-sdk-s3</include>
<include>com.amazonaws:aws-java-sdk-kms</include>
<include>com.amazon.redshift:redshift-jdbc42:jar</include>
</includes>
</dependencySet>
</dependencySets>
1 change: 1 addition & 0 deletions docs/hop-user-manual/modules/ROOT/nav.adoc
@@ -207,6 +207,7 @@ under the License.
*** xref:pipeline/transforms/processfiles.adoc[Process files]
*** xref:pipeline/transforms/propertyinput.adoc[Properties file Input]
*** xref:pipeline/transforms/propertyoutput.adoc[Properties file Output]
*** xref:pipeline/transforms/redshift-bulkloader.adoc[Redshift Bulk Loader]
*** xref:pipeline/transforms/regexeval.adoc[Regex Evaluation]
*** xref:pipeline/transforms/replacestring.adoc[Replace in String]
*** xref:pipeline/transforms/reservoirsampling.adoc[Reservoir Sampling]
@@ -40,7 +40,7 @@ TIP: replace boolean fields in your pipeline stream by string fields with "Y" or
!===
|===

- IMPORTANT: The PostgreSQL Bulk Loader is linked to the database type. It will fetch the JDBC driver from the hop/plugins/databases/postgresql/lib folder. +
+ IMPORTANT: The PostgreSQL Bulk Loader is linked to the database type. It will fetch the JDBC driver from the hop/lib/jdbc folder. +
+
Valid locations for the JDBC driver for this transform are the database plugin lib and the main hop/lib folder. It will not work in combination with the SHARED_JDBC_FOLDER variable.

@@ -0,0 +1,86 @@
////
Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements. See the NOTICE file
distributed with this work for additional information
regarding copyright ownership. The ASF licenses this file
to you under the Apache License, Version 2.0 (the
"License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing,
software distributed under the License is distributed on an
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
KIND, either express or implied. See the License for the
specific language governing permissions and limitations
under the License.
////
:documentationPath: /pipeline/transforms/
:language: en_US
:description: The Redshift Bulk Loader transform loads data from Apache Hop to AWS Redshift using the COPY command.

= image:transforms/icons/redshift.svg[Redshift Bulk Loader transform Icon, role="image-doc-icon"] Redshift Bulk Loader

[%noheader,cols="3a,1a", role="table-no-borders" ]
|===
|
== Description

The Redshift Bulk Loader transform loads data from Apache Hop to AWS Redshift using the https://docs.aws.amazon.com/redshift/latest/dg/r_COPY.html[`COPY`^] command.

TIP: Make sure your target Redshift table has a layout that is compatible with Parquet data types, e.g. use `int8` instead of `int4` columns.

|
== Supported Engines
[%noheader,cols="2,1a",frame=none, role="table-supported-engines"]
!===
!Hop Engine! image:check_mark.svg[Supported, 24]
!Spark! image:question_mark.svg[Maybe Supported, 24]
!Flink! image:question_mark.svg[Maybe Supported, 24]
!Dataflow! image:question_mark.svg[Maybe Supported, 24]
!===
|===
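The type-compatibility TIP above can be summarized as a small mapping. This sketch is illustrative only: the mapping is assumed from Redshift's `COPY`-from-Parquet behavior and the helper name is hypothetical, not taken from the Hop source.

```python
# Illustrative mapping (assumed, based on Redshift COPY-from-Parquet
# behavior; not from the Hop source): Parquet types and Redshift column
# types that accept them. Per the TIP above, prefer int8 over int4.
PARQUET_TO_REDSHIFT = {
    "INT64": "int8",                # bigint
    "INT32": "int4",                # integer
    "DOUBLE": "float8",             # double precision
    "BOOLEAN": "bool",
    "BYTE_ARRAY (UTF8)": "varchar",
}

def suggest_column_type(parquet_type: str) -> str:
    """Return a Redshift column type compatible with a Parquet type,
    falling back to varchar for anything unrecognized."""
    return PARQUET_TO_REDSHIFT.get(parquet_type, "varchar")
```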

IMPORTANT: The Redshift Bulk Loader is linked to the database type. It will fetch the JDBC driver from the hop/lib/jdbc folder.

== General Options

[options="header"]
|===
|Option|Description
|Transform name|Name of the transform.
|Connection|Name of the database connection on which the target table resides.
|Target schema|The name of the target schema to write data to.
|Target table|The name of the target table to write data to.
|AWS Authentication a|Choose which authentication method to use with the `COPY` command. Supported options are `AWS Credentials` and `IAM Role`. +

* Check the https://docs.aws.amazon.com/redshift/latest/dg/copy-usage_notes-access-permissions.html#copy-usage_notes-access-key-based[Key-based access control] docs for more information on the `AWS Credentials` option.
* Check the https://docs.aws.amazon.com/redshift/latest/dg/copy-usage_notes-access-permissions.html#copy-usage_notes-access-role-based[IAM Role] docs for more information on the `IAM Role` option.

|Use AWS system variables|(`AWS Credentials` only) Pick up the `AWS_ACCESS_KEY_ID` and `AWS_SECRET_ACCESS_KEY` values from your operating system's environment variables.
|AWS_ACCESS_KEY_ID|(if `AWS Credentials` is selected and `Use AWS system variables` is unchecked) Specify a value or variable for your `AWS_ACCESS_KEY_ID`.
|AWS_SECRET_ACCESS_KEY|(if `AWS Credentials` is selected and `Use AWS system variables` is unchecked) Specify a value or variable for your `AWS_SECRET_ACCESS_KEY`.
|IAM Role|(if `IAM Role` is selected) Specify the IAM Role to use, using the syntax `arn:aws:iam::<aws-account-id>:role/<role-name>`.
|Truncate table|Truncate the target table before loading data.
|Truncate on first row|Truncate the target table before loading data, but only when the first data row is received. The table is not truncated when the pipeline processes an empty stream (0 rows).
|Specify database fields|Specify the mapping between stream fields and database columns.
|===
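The two authentication options above translate into different credential clauses on the `COPY` statement Redshift executes. A minimal sketch of that mapping, assuming a hypothetical `auth` dict and helper name (this is not Hop's actual implementation):

```python
def credentials_clause(auth: dict) -> str:
    """Build the COPY authorization clause for either auth method.

    Sketch only: `auth` is a hypothetical dict, not a Hop API.
    """
    if auth["method"] == "IAM Role":
        # e.g. arn:aws:iam::<aws-account-id>:role/<role-name>
        return f"IAM_ROLE '{auth['role_arn']}'"
    # Key-based access control (the AWS Credentials option)
    return (
        f"CREDENTIALS 'aws_access_key_id={auth['access_key_id']};"
        f"aws_secret_access_key={auth['secret_access_key']}'"
    )
```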

== Main Options

[options="header"]
|===
|Option|Description
|Stream to S3 CSV|Write the current pipeline stream to a CSV file in an S3 bucket before performing the `COPY` load.
|Load from existing file|Do not stream the contents of the current pipeline; perform the `COPY` load from a pre-existing file in S3 instead. Supported formats are `CSV` (comma-delimited) and `Parquet`.
|Copy into Redshift from existing file|The path to the file in S3 to `COPY` the data from.
|===
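Both main options end in a `COPY ... FROM 's3://...'` statement; only the staging of the file differs. A hedged sketch of how the staged file's name could select the format clause (illustrative, with an assumed helper name, not the transform's code):

```python
def copy_from_clause(s3_path: str) -> str:
    """Pick the COPY source and format clause from the staged file's
    name. The transform supports CSV (comma-delimited) and Parquet
    files staged in S3."""
    fmt = "PARQUET" if s3_path.endswith(".parquet") else "CSV"
    return f"FROM '{s3_path}' FORMAT AS {fmt}"
```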

== Database fields

Map the current stream fields to the Redshift table's columns.

== Metadata Injection Support

All fields of this transform support metadata injection.
You can use this transform with Metadata Injection to pass metadata to your pipeline at runtime.