diff --git a/README.md b/README.md
index e06001c..90b29b6 100644
--- a/README.md
+++ b/README.md
@@ -21,9 +21,19 @@ This ETL (Extract, Transform, Load) project aims to extract metadata from Window
## Quick Start
### Prerequisites
-Ensure Docker and Docker Compose are installed on your machine to build and run the necessary containers for the application, database, and ELK stack.
-
+1. Ensure Docker and Docker Compose are installed on your machine to build and run the necessary containers for the application, database, and ELK stack (see the quick check below).
+2. **You need at least 10 GB of free RAM to run the application.**
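+
+A quick way to confirm Docker and the Compose plugin are available (a minimal check; the version output will differ on your machine):
+```sh
+# Verify that Docker and the Compose plugin are installed
+docker --version
+docker compose version
+```
+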
### Setup
+#### Development Environment (one command and the app is up and running)
+1. **Environment Variables:**
+   All environment variables are already set in the docker-compose.dev.yaml file.
+2. **Build and Run Docker Containers:**
+ - Navigate to the project directory and run the following command:
+ ```sh
+ docker compose -f docker-compose.dev.yaml up --build
+ ```
+
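+If you prefer to keep your terminal free, the same stack can also be started in the background (a sketch using standard Docker Compose flags):
+```sh
+# Start the dev stack detached, then follow the logs
+docker compose -f docker-compose.dev.yaml up --build -d
+docker compose -f docker-compose.dev.yaml logs -f
+```
+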
+#### Production Environment
1. **Environment Variables:**
- Create a `.env` file at the root of the project.
- Fill the `.env` file with your specific configuration:
@@ -54,7 +64,8 @@ In your Dockerfile, you might have an entry similar to:
ENTRYPOINT ["python3", "main.py"]
CMD ["10000"]
```
-This means by default 10000 files will be downloaded unless the command in the docker-compose.prod.yaml overrides it (as in the example where it's set to 1000000).
+This means that by default 10000 files will be downloaded, unless the command in docker-compose.prod.yaml overrides it (as in the example, where it is set to 1000000).
+**Attention:** The number of files to download should not exceed the number of files in the S3 bucket.
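+
+For example, assuming the application service is named `python-app` (as in the compose file in this repository; adjust to match your docker-compose.prod.yaml), the default can also be overridden from the command line without editing any file:
+```sh
+# One-off run of the app service with a custom file count (overrides the Dockerfile CMD);
+# "python-app" is an assumed service name -- use the one from your docker-compose.prod.yaml.
+docker compose -f docker-compose.prod.yaml run --rm python-app 1000000
+```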
### Accessing the Applications
- **Spark Master Dashboard:** [http://localhost:8080/](http://localhost:8080/)
@@ -105,7 +116,8 @@ AWS Secret Access Key: test
Default region name: your preferred region (e.g., us-east-1)
Default output format: json
-## Create a Bucket
+## LocalStack S3 on localhost
+### Create a Bucket
To create a bucket in LocalStack, use the AWS CLI command with the endpoint URL pointing to your LocalStack instance:
```bash
@@ -113,9 +125,30 @@ aws --endpoint-url=http://localhost:4566 s3 mb s3://my-bucket
```
Replace my-bucket with your desired bucket name.
-## List Buckets
+### List Buckets
To list all the buckets:
```bash
aws --endpoint-url=http://localhost:4566 s3 ls
+```
+
+### List Bucket Contents
+To list the contents of a bucket:
+
+```bash
+aws --endpoint-url=http://localhost:4566 s3 ls s3://my-bucket
+```
+
+### List All Files in a Bucket
+To list all the files in a bucket recursively:
+
+```bash
+aws --endpoint-url=http://localhost:4566 s3 ls s3://my-bucket --recursive
+```
+
+### Upload a File
+To upload a file to the bucket:
+
+```bash
+aws --endpoint-url=http://localhost:4566 s3 cp /path/to/local/file s3://my-bucket
```
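+
+### Upload a Directory
+To copy an entire local directory into the bucket (a minimal example; replace the path and bucket name with your own):
+
+```bash
+aws --endpoint-url=http://localhost:4566 s3 cp /path/to/local/dir s3://my-bucket --recursive
+```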
\ No newline at end of file
diff --git a/docker-compose.e2e.yaml b/docker-compose.dev.yaml
similarity index 100%
rename from docker-compose.e2e.yaml
rename to docker-compose.dev.yaml
diff --git a/docker-compose.yaml b/docker-compose.yaml
deleted file mode 100644
index 2915b41..0000000
--- a/docker-compose.yaml
+++ /dev/null
@@ -1,140 +0,0 @@
-# Copyright VMware, Inc.
-# SPDX-License-Identifier: APACHE-2.0
-
-version: '3.7'
-
-services:
- db:
- image: postgres
- environment:
- - POSTGRES_USER=${DB_USER}
- - POSTGRES_PASSWORD=${DB_PASSWORD}
- - POSTGRES_DB=${DB_NAME}
- ports:
- - "5432:5432"
- networks:
- - spark-network
- adminer:
- image: adminer
- restart: always
- ports:
- - 8089:8080
- networks:
- - spark-network
-
-
- spark-master:
- build:
- context: .
- dockerfile: Dockerfile.spark
- environment:
- - SPARK_MODE=master
- - SPARK_RPC_AUTHENTICATION_ENABLED=no
- - SPARK_RPC_ENCRYPTION_ENABLED=no
- - SPARK_LOCAL_STORAGE_ENCRYPTION_ENABLED=no
- - SPARK_SSL_ENABLED=no
- - SPARK_USER=spark
- - AWS_ACCESS_KEY_ID=sadf
- - AWS_ACCESS_KEY=sadf
- - AWS_SECRET_KEY=asdf
- - AWS_SECRET_ACCESS_KEY=asdf
- ports:
- - '8080:8080'
- - '7078:7077'
- networks:
- - spark-network
-
- spark-worker:
- build:
- context: .
- dockerfile: Dockerfile.spark
- environment:
- - SPARK_MODE=worker
- - SPARK_MASTER_URL=spark://spark-master:7077
- - SPARK_WORKER_MEMORY=1G
- - SPARK_WORKER_CORES=1
- - SPARK_RPC_AUTHENTICATION_ENABLED=no
- - SPARK_RPC_ENCRYPTION_ENABLED=no
- - SPARK_LOCAL_STORAGE_ENCRYPTION_ENABLED=no
- - SPARK_SSL_ENABLED=no
- - SPARK_USER=spark
- - AWS_ACCESS_KEY_ID=asdf
- - AWS_ACCESS_KEY=asdf
- - AWS_SECRET_KEY=asdf
- - AWS_SECRET_ACCESS_KEY=asdf
- networks:
- - spark-network
-
-
- python-app:
- build:
- context: .
- dockerfile: Dockerfile.app
- command: ["5"]
- environment:
- LOGGER_HOST: logstash
- LOGGER_PORT: 5044
- DB_USER: ${DB_USER}
- DB_PASSWORD: ${DB_PASSWORD}
- DB_NAME: ${DB_NAME}
- DB_HOST: db
- BUCKET_NAME: ${BUCKET_NAME}
- PREFIX_LIST: 0/, 1/
- APP_ENV: production
- networks:
- - spark-network
- depends_on:
- - db
- - elasticsearch
- - kibana
- - logstash
- - spark-master
- - spark-worker
-
-
- elasticsearch:
- image: docker.elastic.co/elasticsearch/elasticsearch:7.15.1
- container_name: elasticsearch
- ports:
- - "9200:9200"
- environment:
- - discovery.type=single-node
- networks:
- - spark-network
-
- kibana:
- image: docker.elastic.co/kibana/kibana:7.15.1
- environment:
- ELASTICSEARCH_URL: http://elasticsearch:9200
- ports:
- - "5601:5601"
- networks:
- - spark-network
-
- logstash:
- image: docker.elastic.co/logstash/logstash:7.15.1
- container_name: logstash
- volumes:
- - ./logstash:/usr/share/logstash/pipeline
- ports:
- - "5044:5044"
- environment:
- - "ES_HOST=elasticsearch"
- - "ES_PORT=9200"
- networks:
- - spark-network
-
- localstack:
- image: localstack/localstack
- ports:
- - "4566:4566"
- environment:
- - SERVICES=s3,dynamodb,sqs
- - DEFAULT_REGION=us-east-1
- - DATA_DIR=/tmp/localstack/data
- networks:
- - spark-network
-
-networks:
- spark-network:
- driver: bridge
\ No newline at end of file