# neptune_random_search.yaml

# Target project, written as ORGANIZATION/PROJECT.
# NOTE(review): ORGANIZATION looks like a placeholder to replace — confirm.
project: ORGANIZATION/Santander-Value-Prediction-Challenge

# Experiment name and the tags attached to it.
name: Santander-Value-Prediction-Challenge
tags:
  - solution-3

# Metric definition: the channel name and the direction of its goal.
metric:
  goal: minimize
  channel: RMSLE

# Paths listed for exclusion (presumably from the uploaded source snapshot —
# confirm against the consuming tool).
exclude:
  # Run outputs and log files
  - output
  - imgs
  - neptune.log
  - offline_job.log

  # Version-control metadata
  - .git
  - .github

  # IDE and notebook scratch files
  - .idea
  - .ipynb_checkpoints
  - Untitled.ipynb

# Flat key/value parameters for the experiment.
# Keys use a `<component>__<option>` naming pattern. Several values are
# quoted strings holding bracketed lists (e.g. '[10, 50]'); presumably the
# pipeline parses them as search ranges or choices — confirm against the
# consumer. On/off switches appear both as 0/1 integers and as booleans;
# the types are kept exactly as they were.
parameters:
  # Data
  train_filepath: YOUR/PATH/TO/train.csv
  test_filepath: YOUR/PATH/TO/test.csv
  sample_submission_filepath: YOUR/PATH/TO/sample_submission.csv
  experiment_directory: YOUR/PATH/WORKDIR

  # Kaggle
  kaggle_api: 0
  kaggle_message: 'solution-3'

  # Data preparation
  n_cv_splits: 5
  validation_size: 0.1
  shuffle: 1

  # Execution
  clean_experiment_directory_before_training: 1
  num_workers: 4
  verbose: 1

  # Data Cleaning
  drop_zero_fraction__threshold: 0.98
  variance_threshold__threshold: 0.0

  # Feature Extraction
  row_aggregations__bucket_nrs: '[1, 2]'
  truncated_svd__use: false
  truncated_svd__n_components: 50
  truncated_svd__n_iter: 10
  pca__use: false
  pca__n_components: 100
  fast_ica__use: false
  fast_ica__n_components: 15
  factor_analysis__use: true
  factor_analysis__n_components: 50
  gaussian_random_projection__use: false
  gaussian_random_projection__n_components: 50
  # NOTE(review): this key lacks the `random_` infix used by the two
  # gaussian_random_projection__* keys above — confirm which name the
  # pipeline reads before renaming anything.
  gaussian_projection__eps: 0.1
  sparse_random_projection__use: true
  sparse_random_projection__n_components: 50

  # Light GBM
  lgbm_random_search_runs: 500
  lgbm__device: cpu  # gpu or cpu
  lgbm__boosting_type: gbdt
  lgbm__objective: rmse
  lgbm__metric: rmse
  lgbm__number_boosting_rounds: 10000
  lgbm__early_stopping_rounds: 100
  lgbm__learning_rate: 0.01
  lgbm__num_leaves: '[10, 50]'
  lgbm__max_depth: '[1, 20]'
  lgbm__min_child_samples: '[1, 20]'
  lgbm__max_bin: '[180, 500]'  # at most 255 for device=gpu
  lgbm__subsample: '[0.8, 0.9, 0.99, 0.6, 0.7, "list"]'
  lgbm__subsample_freq: 1
  lgbm__colsample_bytree: 0.8
  lgbm__min_child_weight: '[1, 20]'
  lgbm__reg_lambda: '[0.0, 0.1, "uniform"]'
  lgbm__reg_alpha: '[0.0, 0.1, "uniform"]'
  lgbm__scale_pos_weight: 1
  lgbm__zero_as_missing: true

  # Postprocessing
  aggregation_method: mean