
Commit

Change appli13
linogaliana committed Mar 15, 2024
1 parent cdfe3d3 commit 23a44dc
Showing 10 changed files with 1,308 additions and 605 deletions.
1 change: 1 addition & 0 deletions src/appli13/.gitignore
@@ -1,3 +1,4 @@
config.yaml
__pycache__/
data/
titanic/
4 changes: 0 additions & 4 deletions src/appli13/Dockerfile
@@ -1,15 +1,11 @@
FROM ubuntu:22.04

WORKDIR ${HOME}/titanic

# Install Python
RUN apt-get -y update && \
    apt-get install -y python3-pip

# Install project dependencies
COPY requirements.txt .
RUN pip install -r requirements.txt

COPY main.py .
COPY src ./src
CMD ["python3", "main.py"]
14 changes: 4 additions & 10 deletions src/appli13/README.md
@@ -6,17 +6,11 @@ having the following structure:

```yaml
jeton_api: ####
train_path: ####
test_path: ####
test_fraction: ####
data_path: https://minio.lab.sspcloud.fr/lgaliana/ensae-reproductibilite/data/raw/data.csv
```
## Reuse
To install the dependencies
To test this project, the following code
suffices:
```python
```bash
pip install -r requirements.txt
python main.py
```
```
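
For context, here is a minimal sketch of how a `config.yaml` with these keys might be read, assuming `import_yaml_config` (imported in `main.py` from `src/data/import_data.py`, whose implementation is not shown in this diff) is a thin wrapper around `yaml.safe_load`:

```python
import yaml


def import_yaml_config(path: str = "configuration/config.yaml") -> dict:
    """Read a YAML configuration file into a dict (hypothetical sketch)."""
    with open(path, "r", encoding="utf-8") as file:
        return yaml.safe_load(file)


# Keys absent from the file fall back to defaults via dict.get,
# mirroring the calls in main.py
config = import_yaml_config()
jeton_api = config.get("jeton_api")
data_path = config.get(
    "data_path",
    "https://minio.lab.sspcloud.fr/lgaliana/ensae-reproductibilite/data/raw/data.csv",
)
```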
3 changes: 0 additions & 3 deletions src/appli13/install.sh
@@ -1,12 +1,9 @@
#!/bin/bash

# Install Python
apt-get -y update
apt-get install -y python3-pip python3-venv

# Create empty virtual environment
python3 -m venv titanic
source titanic/bin/activate

# Install project dependencies
pip install -r requirements.txt
89 changes: 46 additions & 43 deletions src/appli13/main.py
@@ -2,67 +2,70 @@
Predicting the survival of an individual on the Titanic
"""

# ENVIRONMENT MANAGEMENT --------------------------------

import argparse
import src.data.import_data as imp
import src.features.build_features as bf
import src.models.train_evaluate as te


import pathlib
import pandas as pd

# PARAMETERS -------------------------------
from src.data.import_data import import_yaml_config, split_and_count
from src.pipeline.build_pipeline import split_train_test, create_pipeline
from src.models.train_evaluate import evaluate_model

# Command-line parameters
parser = argparse.ArgumentParser(description="Random forest parameters")
parser.add_argument(
    "--n_trees", type=int, default=20, help="Number of trees"
)
parser.add_argument("--n_trees", type=int, default=20, help="Number of trees")
args = parser.parse_args()

# YAML parameters
config = imp.import_yaml_config("configuration/config.yaml")
base_url = "https://minio.lab.sspcloud.fr/projet-formation/ensae-reproductibilite/data/raw"
API_TOKEN = config.get("jeton_api")
LOCATION_TRAIN = config.get("train_path", f"{base_url}/train.csv")
LOCATION_TEST = config.get("test_path", f"{base_url}/test.csv")
TEST_FRACTION = config.get("test_fraction", .1)
N_TREES = args.n_trees
n_trees = args.n_trees

URL_RAW = "https://minio.lab.sspcloud.fr/lgaliana/ensae-reproductibilite/data/raw/data.csv"
config = import_yaml_config("configuration/config.yaml")
jeton_api = config.get("jeton_api")
data_path = config.get("data_path", URL_RAW)
data_train_path = config.get("train_path", "data/derived/train.csv")
data_test_path = config.get("test_path", "data/derived/test.csv")

MAX_DEPTH = None
MAX_FEATURES = "sqrt"

# FEATURE ENGINEERING --------------------------------

print(LOCATION_TRAIN)
# DATA IMPORT AND EXPLORATION --------------------------------

TrainingData = imp.import_data(LOCATION_TRAIN)
TestData = imp.import_data(LOCATION_TEST)
TrainingData = pd.read_csv(data_path)

# Create a 'Title' variable
TrainingData = bf.create_variable_title(TrainingData)
TestData = bf.create_variable_title(TestData)

# Usage example:
ticket_count = split_and_count(TrainingData, "Ticket", "/")
name_count = split_and_count(TrainingData, "Name", ",")

## VARIABLE IMPUTATION ================

TrainingData = bf.fill_na_titanic(TrainingData)
TestData = bf.fill_na_titanic(TestData)
# SPLIT TRAIN/TEST --------------------------------

TrainingData = bf.label_encoder_titanic(TrainingData)
TestData = bf.label_encoder_titanic(TestData)
p = pathlib.Path("data/derived/")
p.mkdir(parents=True, exist_ok=True)

X_train, X_test, y_train, y_test = split_train_test(
    TrainingData, test_size=0.1,
    train_path=data_train_path,
    test_path=data_test_path
)


# PIPELINE ----------------------------


# Create the pipeline
pipe = create_pipeline(
    n_trees, max_depth=MAX_DEPTH, max_features=MAX_FEATURES
)

# Making a new feature hasCabin which is 1 if cabin is available else 0
TrainingData = bf.check_has_cabin(TrainingData)
TestData = bf.check_has_cabin(TestData)

TrainingData = bf.ticket_length(TrainingData)
TestData = bf.ticket_length(TestData)
# ESTIMATION AND EVALUATION ----------------------

pipe.fit(X_train, y_train)

# MODELING: RANDOM FOREST ----------------------------

model = te.random_forest_titanic(
    data=TrainingData,
    fraction_test=TEST_FRACTION,
    n_trees=N_TREES
)
# Evaluate the model
score, matrix = evaluate_model(pipe, X_test, y_test)
print(f"{score:.1%} de bonnes réponses sur les données de test pour validation")
print(20 * "-")
print("matrice de confusion")
print(matrix)
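
The refactored `main.py` delegates most of the work to helpers that the visible hunks only import. As a rough sketch of what `split_and_count`, `split_train_test`, `create_pipeline`, and `evaluate_model` plausibly look like given how they are called above (assuming scikit-learn and pandas, and a `Survived` target column; the actual implementations live in `src/` and are not shown in this diff):

```python
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline


def split_and_count(df: pd.DataFrame, column: str, separator: str) -> pd.Series:
    """Count the parts obtained by splitting each value of a column."""
    return df[column].str.split(separator).str.len()


def split_train_test(data, test_size, train_path, test_path):
    """Split the data, persist both parts to CSV, and return X/y splits."""
    train, test = train_test_split(data, test_size=test_size)
    train.to_csv(train_path, index=False)
    test.to_csv(test_path, index=False)
    # 'Survived' is the usual Titanic target; an assumption here
    return (
        train.drop("Survived", axis="columns"),
        test.drop("Survived", axis="columns"),
        train["Survived"],
        test["Survived"],
    )


def create_pipeline(n_trees, max_depth=None, max_features="sqrt") -> Pipeline:
    """Build a pipeline ending in a random forest classifier."""
    return Pipeline(
        [
            (
                "classifier",
                RandomForestClassifier(
                    n_estimators=n_trees,
                    max_depth=max_depth,
                    max_features=max_features,
                ),
            )
        ]
    )


def evaluate_model(pipe: Pipeline, X_test, y_test):
    """Return accuracy on the test set plus the confusion matrix."""
    score = pipe.score(X_test, y_test)
    matrix = confusion_matrix(y_test, pipe.predict(X_test))
    return score, matrix
```

Wrapping the estimator in a `Pipeline` means preprocessing steps can later be prepended without changing the `fit`/`score` calls in `main.py`.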
