Refactoring tsvtools (#338)
* change diagnosis into group and subgroup

* change diagnosis into group and subgroup

* change diagnosis into group and subgroup

* change diagnosis into group and subgroup

* Update tsvtools tests

* Update tsvtools tests

* simplify kfold and split

* update doc

* Add docs and add some changes

* Update tests

* update tests

* update getlabels

* Update tsvtools

* update docs

* update jenkins

* update tests

* Update test tsvtools

* test

* update Jenkins

* update

* update

* update docs

* update docs

* update kfold

* add get_metadata function

* add tests

* Changes 30/09

* Update

* update

* update

* Update clinicadl/tsvtools/analysis/analysis_cli.py

Co-authored-by: mdiazmel <[email protected]>

* update

* update

* update

* update after review

* update tests

* update tests

* Update clinicadl/tsvtools/get_metadata/get_metadata.py

Co-authored-by: mdiazmel <[email protected]>

* Update clinicadl/tsvtools/get_metadata/get_metadata.py

Co-authored-by: mdiazmel <[email protected]>

* Update clinicadl/tsvtools/get_progression/get_progression_cli.py

Co-authored-by: mdiazmel <[email protected]>

* Update clinicadl/tsvtools/get_progression/get_progression_cli.py

Co-authored-by: mdiazmel <[email protected]>

* Update clinicadl/tsvtools/split/split.py

Co-authored-by: mdiazmel <[email protected]>

* Update clinicadl/tsvtools/split/split.py

Co-authored-by: mdiazmel <[email protected]>

* couple of changes before merging

Co-authored-by: mdiazmel <[email protected]>
2 people authored and ravih18 committed Oct 10, 2022
1 parent 4fbd73e commit 5b6a56a
Showing 66 changed files with 1,677 additions and 55,091 deletions.
35 changes: 19 additions & 16 deletions .jenkins/Jenkinsfile
@@ -65,21 +65,24 @@ pipeline {
'''
}
}
stage('TSVTOOL tests Linux') {
stage('tsvtools tests Linux') {
steps {
echo 'Testing tsvtool tasks...'
sh "echo 'Agent name: ${NODE_NAME}'"
sh '''
source "${CONDA_HOME}/etc/profile.d/conda.sh"
conda activate "${CONDA_ENV}"
cd $WORKSPACE/tests
poetry run pytest \
--junitxml=./test-reports/test_tsvtool_report.xml \
--verbose \
--disable-warnings \
test_tsvtool.py
conda deactivate
'''
catchError(buildResult: 'FAILURE', stageResult: 'UNSTABLE'){
echo 'Testing tsvtool tasks...'
sh "echo 'Agent name: ${NODE_NAME}'"
sh '''
source "${CONDA_HOME}/etc/profile.d/conda.sh"
conda activate "${CONDA_ENV}"
cd $WORKSPACE/tests
poetry run pytest \
--junitxml=./test-reports/test_tsvtool_report.xml \
--verbose \
--disable-warnings \
test_tsvtool.py
conda deactivate
'''
}

}
post {
always {
@@ -137,7 +140,7 @@
}
}
}
stage('Extract tests Linux') {
stage('Prepare data tests Linux') {
steps {
echo 'Testing extract task...'
sh "echo 'Agent name: ${NODE_NAME}'"
@@ -151,7 +154,7 @@
--junitxml=./test-reports/test_extract_report.xml \
--verbose \
--disable-warnings \
test_extract.py
test_prepare_data.py
conda deactivate
'''
}
4 changes: 2 additions & 2 deletions clinicadl/cmdline.py
@@ -2,10 +2,10 @@

import click

from clinicadl.extract.extract_cli import cli as extract_cli
from clinicadl.generate.generate_cli import cli as generate_cli
from clinicadl.interpret.interpret_cli import cli as interpret_cli
from clinicadl.predict.predict_cli import cli as predict_cli
from clinicadl.prepare_data.prepare_data_cli import cli as prepare_data_cli
from clinicadl.quality_check.qc_cli import cli as qc_cli
from clinicadl.random_search.random_search_cli import cli as random_search_cli
from clinicadl.train.train_cli import cli as train_cli
@@ -37,7 +37,7 @@ def cli(verbose):
cli.add_command(tsvtools_cli)
cli.add_command(train_cli)
cli.add_command(generate_cli)
cli.add_command(extract_cli)
cli.add_command(prepare_data_cli)
cli.add_command(predict_cli)
cli.add_command(interpret_cli)
cli.add_command(qc_cli)
2 changes: 1 addition & 1 deletion clinicadl/generate/generate.py
@@ -14,7 +14,7 @@
import torch
from clinica.utils.inputs import RemoteFileStructure, clinica_file_reader, fetch_file

from clinicadl.extract.extract_utils import compute_extract_json
from clinicadl.prepare_data.prepare_data_utils import compute_extract_json
from clinicadl.utils.caps_dataset.data import CapsDataset
from clinicadl.utils.maps_manager.iotools import check_and_clean, commandline_to_json
from clinicadl.utils.preprocessing import write_preprocessing
File renamed without changes.
@@ -14,7 +14,7 @@ def DeepLearningPrepareData(caps_directory, tsv_file, n_proc, parameters):
from clinicadl.utils.exceptions import ClinicaDLArgumentError
from clinicadl.utils.preprocessing import write_preprocessing

from .extract_utils import check_mask_list, compute_folder_and_file_type
from .prepare_data_utils import check_mask_list, compute_folder_and_file_type

logger = getLogger("clinicadl.extract")

@@ -71,7 +71,7 @@ def write_output_imgs(output_mode, container, subfolder):
if parameters["mode"] == "image" or not parameters["prepare_dl"]:

def prepare_image(file):
from .extract_utils import extract_images
from .prepare_data_utils import extract_images

logger.debug(f" Processing of {file}.")
container = container_from_filename(file)
@@ -85,7 +85,7 @@ def prepare_image(file):
elif parameters["prepare_dl"] and parameters["mode"] == "slice":

def prepare_slice(file):
from .extract_utils import extract_slices
from .prepare_data_utils import extract_slices

logger.debug(f" Processing of {file}.")
container = container_from_filename(file)
@@ -104,7 +104,7 @@ def prepare_slice(file):
elif parameters["prepare_dl"] and parameters["mode"] == "patch":

def prepare_patch(file):
from .extract_utils import extract_patches
from .prepare_data_utils import extract_patches

logger.debug(f" Processing of {file}.")
container = container_from_filename(file)
@@ -122,7 +122,7 @@ def prepare_patch(file):
elif parameters["prepare_dl"] and parameters["mode"] == "roi":

def prepare_roi(file):
from .extract_utils import extract_roi
from .prepare_data_utils import extract_roi

logger.debug(f" Processing of {file}.")
container = container_from_filename(file)
@@ -135,7 +135,7 @@ def prepare_roi(file):
parameters["roi_template"] = parameters["roi_custom_template"]
parameters["roi_mask_pattern"] = parameters["roi_custom_mask_pattern"]
else:
from .extract_utils import PATTERN_DICT, TEMPLATE_DICT
from .prepare_data_utils import PATTERN_DICT, TEMPLATE_DICT

parameters["roi_template"] = TEMPLATE_DICT[parameters["preprocessing"]]
parameters["roi_mask_pattern"] = PATTERN_DICT[
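Note on the hunks above: `DeepLearningPrepareData` now selects a different `extract_*` helper from `prepare_data_utils` depending on `parameters["mode"]` and `parameters["prepare_dl"]`. The following is a minimal sketch of that dispatch only, assuming clinicadl is installed; the helper and module names come from the diff, the wrapper function itself is illustrative.

```python
# Sketch only: mirrors the mode-based dispatch visible in the diff above.
# The extract_* names and module path are taken from the diff; select_extractor
# is a hypothetical wrapper for illustration.
from typing import Callable, Dict


def select_extractor(parameters: Dict) -> Callable:
    """Return the tensor-extraction helper matching the requested mode."""
    if parameters["mode"] == "image" or not parameters["prepare_dl"]:
        from clinicadl.prepare_data.prepare_data_utils import extract_images as extractor
    elif parameters["mode"] == "slice":
        from clinicadl.prepare_data.prepare_data_utils import extract_slices as extractor
    elif parameters["mode"] == "patch":
        from clinicadl.prepare_data.prepare_data_utils import extract_patches as extractor
    elif parameters["mode"] == "roi":
        from clinicadl.prepare_data.prepare_data_utils import extract_roi as extractor
    else:
        raise ValueError(f"Mode {parameters['mode']} is not supported.")
    return extractor
```

In the diff itself this selection happens inside per-file `prepare_*` closures rather than a single helper.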
@@ -4,8 +4,8 @@

from clinicadl.utils import cli_param

from .extract import DeepLearningPrepareData
from .extract_utils import get_parameters_dict
from .prepare_data import DeepLearningPrepareData
from .prepare_data_utils import get_parameters_dict


@click.command(name="image", no_args_is_help=True)
@@ -297,7 +297,7 @@ def list_commands(self, ctx):
return self.commands.keys()


@click.group(cls=RegistrationOrderGroup, name="extract", no_args_is_help=True)
@click.group(cls=RegistrationOrderGroup, name="prepare-data", no_args_is_help=True)
def cli() -> None:
"""Extract Pytorch tensors from nifti images."""
pass
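The CLI hunks above rename the `extract` group to `prepare-data` and keep a `RegistrationOrderGroup` that lists subcommands in registration order rather than alphabetically. A self-contained sketch of that pattern, assuming only `click`; the group name, the `list_commands` override, and the group docstring come from the diff, while the `image` subcommand body is a placeholder, not the real implementation.

```python
# Sketch of the click pattern used above; the image subcommand is illustrative.
import click


class RegistrationOrderGroup(click.Group):
    """List commands in the order they were registered, not alphabetically."""

    def list_commands(self, ctx):
        return self.commands.keys()


@click.group(cls=RegistrationOrderGroup, name="prepare-data", no_args_is_help=True)
def cli() -> None:
    """Extract Pytorch tensors from nifti images."""
    pass


@cli.command(name="image", no_args_is_help=True)
def image_cli() -> None:
    """Placeholder for the real `image` subcommand shown in the diff."""
    pass


if __name__ == "__main__":
    cli()
```

With this group registered on the root CLI, `clinicadl prepare-data --help` should list the extraction modes in the order they were added rather than alphabetically.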
File renamed without changes.
178 changes: 98 additions & 80 deletions clinicadl/tsvtools/analysis/analysis.py
@@ -1,39 +1,48 @@
# coding: utf-8

import os
from copy import copy
from os import path
from warnings import warn

import numpy as np
import pandas as pd

from clinicadl.utils.exceptions import ClinicaDLArgumentError
from clinicadl.utils.tsvtools_utils import (
add_demographics,
cleaning_nan_diagnoses,
find_label,
first_session,
merged_tsv_reader,
next_session,
)


def demographics_analysis(merged_tsv, formatted_data_path, results_path, diagnoses):
def demographics_analysis(merged_tsv, data_tsv, results_tsv, diagnoses):
"""
Produces a tsv file with rows corresponding to the labels defined by the diagnoses list,
and the columns being demographic statistics.
Args:
merged_tsv (str): Path to the file obtained by the command clinica iotools merge-tsv.
formatted_data_path (str): Path to the folder containing data extracted by clinicadl tsvtool getlabels.
results_path (str): Path to the output tsv file (filename included).
diagnoses (list): Labels selected for the demographic analysis.
Writes one tsv file at results_tsv containing the demographic analysis of the tsv files in data_tsv.
Parameters
----------
merged_tsv: str (path)
Path to the file obtained by the command clinica iotools merge-tsv.
data_tsv: str (path)
Path to the folder containing data extracted by clinicadl tsvtool getlabels.
results_tsv: str (path)
Path to the output tsv file (filename included).
diagnoses: list of str
Labels selected for the demographic analysis.
Returns:
writes one tsv file at results_path containing the
demographic analysis of the tsv files in formatted_data_path.
"""

merged_df = pd.read_csv(merged_tsv, sep="\t")
merged_df = merged_tsv_reader(merged_tsv)
merged_df.set_index(["participant_id", "session_id"], inplace=True)
parent_directory = path.abspath(path.join(results_path, os.pardir))
merged_df = cleaning_nan_diagnoses(merged_df)
parent_directory = path.abspath(path.join(results_tsv, os.pardir))
os.makedirs(parent_directory, exist_ok=True)

fields_dict = {
@@ -70,83 +79,92 @@ def demographics_analysis(merged_tsv, formatted_data_path, results_path, diagnos

# Need all values for mean and variance (age, MMSE and scans)
diagnosis_dict = dict.fromkeys(diagnoses)
if not path.exists(data_tsv):
print(
f"getlabels.tsv file with all sessions was not found. "
# f"Loads baseline version instead."
)
for diagnosis in diagnoses:
diagnosis_dict[diagnosis] = {"age": [], "MMSE": [], "scans": []}
diagnosis_path = path.join(formatted_data_path, diagnosis + ".tsv")
if not path.exists(diagnosis_path):
print(
f"TSV file with all sessions was not found for diagnosis {diagnosis}. "
f"Loads baseline version instead."
getlabels_df = pd.read_csv(data_tsv, sep="\t")
diagnosis_copy_df = copy(getlabels_df)
diagnosis_copy_df = diagnosis_copy_df[
diagnosis_copy_df["diagnosis"] == diagnosis
]
if not diagnosis_copy_df.empty:
diagnosis_demographics_df = add_demographics(
diagnosis_copy_df, merged_df, diagnosis
)
diagnosis_path = path.join(formatted_data_path, diagnosis + "_baseline.tsv")
diagnosis_df = pd.read_csv(diagnosis_path, sep="\t")
diagnosis_demographics_df = add_demographics(diagnosis_df, merged_df, diagnosis)
diagnosis_demographics_df.set_index(
["participant_id", "session_id"], inplace=True
)
diagnosis_df.set_index(["participant_id", "session_id"], inplace=True)

for subject, subject_df in diagnosis_df.groupby(level=0):
first_session_id = first_session(subject_df)
feature_absence = isinstance(
merged_df.loc[(subject, first_session_id), "diagnosis"], float
diagnosis_demographics_df.reset_index()
diagnosis_demographics_df.set_index(
["participant_id", "session_id"], inplace=True
)
while feature_absence:
first_session_id = next_session(subject_df, first_session_id)
diagnosis_copy_df.set_index(["participant_id", "session_id"], inplace=True)
for subject, subject_df in diagnosis_copy_df.groupby(level=0):
first_session_id = first_session(subject_df)
feature_absence = isinstance(
merged_df.loc[(subject, first_session_id), "diagnosis"], float
)
demographics_subject_df = merged_df.loc[subject]

# Extract features
results_df.loc[diagnosis, "n_subjects"] += 1
results_df.loc[diagnosis, "n_scans"] += len(subject_df)
diagnosis_dict[diagnosis]["age"].append(
merged_df.loc[(subject, first_session_id), fields_dict["age"]]
)
diagnosis_dict[diagnosis]["MMSE"].append(
merged_df.loc[(subject, first_session_id), fields_dict["MMSE"]]
)
diagnosis_dict[diagnosis]["scans"].append(len(subject_df))
sexF = (
len(
demographics_subject_df[
(demographics_subject_df[fields_dict["sex"]].isin(["F"]))
]
while feature_absence:
first_session_id = next_session(subject_df, first_session_id)

feature_absence = isinstance(
merged_df.loc[(subject, first_session_id), "diagnosis"], float
)
demographics_subject_df = merged_df.loc[subject]

# Extract features
results_df.loc[diagnosis, "n_subjects"] += 1
results_df.loc[diagnosis, "n_scans"] += len(subject_df)
diagnosis_dict[diagnosis]["age"].append(
merged_df.loc[(subject, first_session_id), fields_dict["age"]]
)
> 0
)
sexM = (
len(
demographics_subject_df[
(demographics_subject_df[fields_dict["sex"]].isin(["M"]))
]
diagnosis_dict[diagnosis]["MMSE"].append(
merged_df.loc[(subject, first_session_id), fields_dict["MMSE"]]
)
> 0
)
if sexF:
results_df.loc[diagnosis, "sexF"] += 1
elif sexM:
results_df.loc[diagnosis, "sexM"] += 1
else:
raise ValueError(
f"The field 'sex' for patient {subject} can not be determined"
diagnosis_dict[diagnosis]["scans"].append(len(subject_df))
sexF = (
len(
demographics_subject_df[
(demographics_subject_df[fields_dict["sex"]].isin(["F"]))
]
)
> 0
)

cdr = merged_df.at[(subject, first_session_id), fields_dict["CDR"]]
if cdr == 0:
results_df.loc[diagnosis, "CDR_0"] += 1
elif cdr == 0.5:
results_df.loc[diagnosis, "CDR_0.5"] += 1
elif cdr == 1:
results_df.loc[diagnosis, "CDR_1"] += 1
elif cdr == 2:
results_df.loc[diagnosis, "CDR_2"] += 1
elif cdr == 3:
results_df.loc[diagnosis, "CDR_3"] += 1
else:
warn(f"Patient {subject} has CDR {cdr}")

sexM = (
len(
demographics_subject_df[
(demographics_subject_df[fields_dict["sex"]].isin(["M"]))
]
)
> 0
)
if sexF:
results_df.loc[diagnosis, "sexF"] += 1
elif sexM:
results_df.loc[diagnosis, "sexM"] += 1
else:
raise ValueError(
f"The field 'sex' for patient {subject} can not be determined"
)

cdr = merged_df.at[(subject, first_session_id), fields_dict["CDR"]]
if cdr == 0:
results_df.loc[diagnosis, "CDR_0"] += 1
elif cdr == 0.5:
results_df.loc[diagnosis, "CDR_0.5"] += 1
elif cdr == 1:
results_df.loc[diagnosis, "CDR_1"] += 1
elif cdr == 2:
results_df.loc[diagnosis, "CDR_2"] += 1
elif cdr == 3:
results_df.loc[diagnosis, "CDR_3"] += 1
else:
tt = 3 # warn(f"Patient {subject} has CDR {cdr}")
else:
raise ClinicaDLArgumentError(
f"There is no subject with diagnosis {diagnosis}"
)
for diagnosis in diagnoses:
results_df.loc[diagnosis, "mean_age"] = np.nanmean(
diagnosis_dict[diagnosis]["age"]
@@ -185,6 +203,6 @@ def demographics_analysis(merged_tsv, formatted_data_path, results_path, diagnos
f"NaN values were found for {key} values associated to diagnosis {diagnosis}"
)

results_df.index.name = "diagnosis"
results_df.index.name = "group"

results_df.to_csv(results_path, sep="\t")
results_df.to_csv(results_tsv, sep="\t")
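Given the new signature and docstring of `demographics_analysis`, a hedged usage sketch follows; the file paths and diagnosis labels are illustrative, not taken from the repository.

```python
# Illustrative call based on the docstring above; paths and labels are made up.
from clinicadl.tsvtools.analysis.analysis import demographics_analysis

demographics_analysis(
    merged_tsv="ADNI_merged.tsv",             # produced by `clinica iotools merge-tsv`
    data_tsv="labels/labels.tsv",             # produced by `clinicadl tsvtool getlabels`
    results_tsv="analysis/demographics.tsv",  # output written by the function
    diagnoses=["CN", "AD"],                   # groups to include in the analysis
)
```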