diff --git a/README.md b/README.md
index e06001c..90b29b6 100644
--- a/README.md
+++ b/README.md
@@ -21,9 +21,19 @@ This ETL (Extract, Transform, Load) project aims to extract metadata from Window
 ## Quick Start
 ### Prerequisites
-Ensure Docker and Docker Compose are installed on your machine to build and run the necessary containers for the application, database, and ELK stack.
-
+1. Ensure Docker and Docker Compose are installed on your machine to build and run the necessary containers for the application, database, and ELK stack.
+2. **You need at least 10 GB of free RAM to run the application.**
 ### Setup
+#### Development Environment (lazy start: one command and the app is up and running)
+1. **Environment Variables:**
+   All environment variables are set in the `docker-compose.dev.yaml` file.
+2. **Build and Run Docker Containers:**
+   - Navigate to the project directory and run the following command:
+     ```sh
+     docker compose -f docker-compose.dev.yaml up --build
+     ```
+
+#### Production Environment
 1. **Environment Variables:**
    - Create a `.env` file at the root of the project.
    - Fill the `.env` file with your specific configuration:
@@ -54,7 +64,8 @@ In your Dockerfile, you might have an entry similar to:
 ENTRYPOINT ["python3", "main.py"]
 CMD ["10000"]
 ```
-This means by default 10000 files will be downloaded unless the command in the docker-compose.prod.yaml overrides it (as in the example where it's set to 1000000).
+This means that by default 10000 files will be downloaded unless the `command` in `docker-compose.prod.yaml` overrides it (as in the example, where it is set to 1000000).
+Attention: the number of files to download should not exceed the number of files available in the S3 bucket.
 ### Accessing the Applications
 - **Spark Master Dashboard:** [http://localhost:8080/](http://localhost:8080/)
@@ -105,7 +116,8 @@ AWS Secret Access Key: test
 Default region name: your preferred region (e.g., us-east-1)
 Default output format: json
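+
+If you prefer a non-interactive setup, the same values can also be applied with `aws configure set` (a minimal sketch; pick whichever default region you prefer):
+
+```bash
+aws configure set aws_access_key_id test
+aws configure set aws_secret_access_key test
+aws configure set region us-east-1
+aws configure set output json
+```
+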
-## Create a Bucket
+## LocalStack S3 on localhost
+### Create a Bucket
 To create a bucket in LocalStack, use the AWS CLI command with the endpoint URL pointing to your LocalStack instance:
 ```bash
@@ -113,9 +125,30 @@ aws --endpoint-url=http://localhost:4566 s3 mb s3://my-bucket
 ```
 Replace my-bucket with your desired bucket name.
-## List Buckets
+### List Buckets
 To list all the buckets:
 ```bash
 aws --endpoint-url=http://localhost:4566 s3 ls
+```
+
+### List Bucket Contents
+To list the contents of a bucket:
+
+```bash
+aws --endpoint-url=http://localhost:4566 s3 ls s3://my-bucket
+```
+
+### List All Files in a Bucket
+To list all the files in a bucket:
+
+```bash
+aws --endpoint-url=http://localhost:4566 s3 ls s3://my-bucket --recursive
+```
+
+### Upload a File
+To upload a file to the bucket:
+
+```bash
+aws --endpoint-url=http://localhost:4566 s3 cp /path/to/local/file s3://my-bucket
+```
\ No newline at end of file
diff --git a/docker-compose.e2e.yaml b/docker-compose.dev.yaml
similarity index 100%
rename from docker-compose.e2e.yaml
rename to docker-compose.dev.yaml
diff --git a/docker-compose.yaml b/docker-compose.yaml
deleted file mode 100644
index 2915b41..0000000
--- a/docker-compose.yaml
+++ /dev/null
@@ -1,140 +0,0 @@
-# Copyright VMware, Inc.
-# SPDX-License-Identifier: APACHE-2.0
-
-version: '3.7'
-
-services:
-  db:
-    image: postgres
-    environment:
-      - POSTGRES_USER=${DB_USER}
-      - POSTGRES_PASSWORD=${DB_PASSWORD}
-      - POSTGRES_DB=${DB_NAME}
-    ports:
-      - "5432:5432"
-    networks:
-      - spark-network
-  adminer:
-    image: adminer
-    restart: always
-    ports:
-      - 8089:8080
-    networks:
-      - spark-network
-
-
-  spark-master:
-    build:
-      context: .
-      dockerfile: Dockerfile.spark
-    environment:
-      - SPARK_MODE=master
-      - SPARK_RPC_AUTHENTICATION_ENABLED=no
-      - SPARK_RPC_ENCRYPTION_ENABLED=no
-      - SPARK_LOCAL_STORAGE_ENCRYPTION_ENABLED=no
-      - SPARK_SSL_ENABLED=no
-      - SPARK_USER=spark
-      - AWS_ACCESS_KEY_ID=sadf
-      - AWS_ACCESS_KEY=sadf
-      - AWS_SECRET_KEY=asdf
-      - AWS_SECRET_ACCESS_KEY=asdf
-    ports:
-      - '8080:8080'
-      - '7078:7077'
-    networks:
-      - spark-network
-
-  spark-worker:
-    build:
-      context: .
-      dockerfile: Dockerfile.spark
-    environment:
-      - SPARK_MODE=worker
-      - SPARK_MASTER_URL=spark://spark-master:7077
-      - SPARK_WORKER_MEMORY=1G
-      - SPARK_WORKER_CORES=1
-      - SPARK_RPC_AUTHENTICATION_ENABLED=no
-      - SPARK_RPC_ENCRYPTION_ENABLED=no
-      - SPARK_LOCAL_STORAGE_ENCRYPTION_ENABLED=no
-      - SPARK_SSL_ENABLED=no
-      - SPARK_USER=spark
-      - AWS_ACCESS_KEY_ID=asdf
-      - AWS_ACCESS_KEY=asdf
-      - AWS_SECRET_KEY=asdf
-      - AWS_SECRET_ACCESS_KEY=asdf
-    networks:
-      - spark-network
-
-
-  python-app:
-    build:
-      context: .
-      dockerfile: Dockerfile.app
-    command: ["5"]
-    environment:
-      LOGGER_HOST: logstash
-      LOGGER_PORT: 5044
-      DB_USER: ${DB_USER}
-      DB_PASSWORD: ${DB_PASSWORD}
-      DB_NAME: ${DB_NAME}
-      DB_HOST: db
-      BUCKET_NAME: ${BUCKET_NAME}
-      PREFIX_LIST: 0/, 1/
-      APP_ENV: production
-    networks:
-      - spark-network
-    depends_on:
-      - db
-      - elasticsearch
-      - kibana
-      - logstash
-      - spark-master
-      - spark-worker
-
-
-  elasticsearch:
-    image: docker.elastic.co/elasticsearch/elasticsearch:7.15.1
-    container_name: elasticsearch
-    ports:
-      - "9200:9200"
-    environment:
-      - discovery.type=single-node
-    networks:
-      - spark-network
-  kibana:
-    image: docker.elastic.co/kibana/kibana:7.15.1
-    environment:
-      ELASTICSEARCH_URL: http://elasticsearch:9200
-    ports:
-      - "5601:5601"
-    networks:
-      - spark-network
-  logstash:
-    image: docker.elastic.co/logstash/logstash:7.15.1
-    container_name: logstash
-    volumes:
-      - ./logstash:/usr/share/logstash/pipeline
-    ports:
-      - "5044:5044"
-    environment:
-      - "ES_HOST=elasticsearch"
-      - "ES_PORT=9200"
-    networks:
-      - spark-network
-  localstack:
-    image: localstack/localstack
-    ports:
-      - "4566:4566"
-    environment:
-      - SERVICES=s3,dynamodb,sqs
-      - DEFAULT_REGION=us-east-1
-      - DATA_DIR=/tmp/localstack/data
-    networks:
-      - spark-network
-
-networks:
-  spark-network:
-    driver: bridge
\ No newline at end of file
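For reference, a minimal sketch of how `docker-compose.prod.yaml` can override the image's default `CMD ["10000"]` with the 1000000 mentioned in the README, reusing the `python-app` service name from the deleted compose file (illustrative only; the real production file may differ):

```yaml
# Illustrative excerpt; the actual docker-compose.prod.yaml may define more settings.
services:
  python-app:
    build:
      context: .
      dockerfile: Dockerfile.app
    # Overrides the image's default CMD ["10000"]; keep this at or below the
    # number of files actually available in the S3 bucket.
    command: ["1000000"]
```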