Manage wildcards like real wildcards
EmmaRenauld committed Feb 14, 2024
1 parent af2e3de commit 37cdbb8
Showing 4 changed files with 89 additions and 91 deletions.
169 changes: 83 additions & 86 deletions dwi_ml/data/hdf5/hdf5_creation.py
@@ -1,5 +1,6 @@
 # -*- coding: utf-8 -*-
 import datetime
+import glob
 import logging
 import os
 from pathlib import Path
@@ -24,6 +25,39 @@
 from dwi_ml.data.processing.dwi.dwi import standardize_data
 
 
+def format_filelist(filenames, enforce_presence, folder=None) -> List[str]:
+    """
+    If folder is not None, it is prepended to all file names. Wildcards (*)
+    are expanded with glob; for a missing file, raises FileNotFoundError if
+    enforce_presence is True, else only logs a warning.
+    """
+    if isinstance(filenames, str):
+        filenames = [filenames]
+
+    new_files = []
+    for f in filenames:
+        if folder is not None:
+            f = str(folder.joinpath(f))
+        if '*' in f:
+            tmp = glob.glob(f)
+            if len(tmp) == 0:
+                msg = "File not found, even with the wildcard: {}".format(f)
+                if enforce_presence:
+                    raise FileNotFoundError(msg)
+                else:
+                    logging.warning(msg)
+            else:
+                new_files.extend(tmp)
+        else:
+            if not Path(f).is_file():
+                msg = "File not found: {}".format(f)
+                if enforce_presence:
+                    raise FileNotFoundError(msg)
+                else:
+                    logging.warning(msg)
+            else:
+                new_files.append(f)
+    return new_files
+
+
 def _load_and_verify_file(filename: str, subj_input_path, group_name: str,
                           group_affine, group_res):
     """
@@ -297,19 +331,9 @@ def _check_files_presence(self):
         subj_input_dir = Path(self.root_folder).joinpath(subj_id)
 
         # Find subject's files from group_config
-        for this_file in config_file_list:
-            this_file = this_file.replace('*', subj_id)
-            if this_file.endswith('/ALL'):
-                logging.debug(
-                    " Keyword 'ALL' detected; we will load all "
-                    "files in the folder '{}'"
-                    .format(this_file.replace('/ALL', '')))
-            else:
-                this_file = subj_input_dir.joinpath(this_file)
-                if not this_file.is_file():
-                    raise FileNotFoundError(
-                        "File from groups_config ({}) not found for "
-                        "subject {}!".format(this_file, subj_id))
+        config_file_list = format_filelist(config_file_list,
+                                           self.enforce_files_presence,
+                                           folder=subj_input_dir)
 
     def create_database(self):
         """
@@ -441,26 +465,25 @@ def _process_one_volume_group(self, group: str, subj_id: str,
         if isinstance(std_masks, str):
             std_masks = [std_masks]
 
-        for sub_mask in std_masks:
-            sub_mask = sub_mask.replace('*', subj_id)
+        std_masks = format_filelist(std_masks, self.enforce_files_presence,
+                                    folder=subj_input_dir)
+        for mask in std_masks:
             logging.info(" - Loading standardization mask {}"
-                         .format(sub_mask))
-            sub_mask_file = subj_input_dir.joinpath(sub_mask)
-            sub_mask_img = nib.load(sub_mask_file)
-            sub_mask_data = np.asanyarray(sub_mask_img.dataobj) > 0
+                         .format(os.path.basename(mask)))
+            sub_mask_data = nib.load(mask).get_fdata() > 0
             if std_mask is None:
                 std_mask = sub_mask_data
             else:
                 std_mask = np.logical_or(sub_mask_data, std_mask)
 
         file_list = self.groups_config[group]['files']
+        file_list = format_filelist(file_list, self.enforce_files_presence,
+                                    folder=subj_input_dir)
 
         # First file will define data dimension and affine
-        file_name = file_list[0].replace('*', subj_id)
-        first_file = subj_input_dir.joinpath(file_name)
-        logging.info(" - Processing file {}".format(file_name))
+        logging.info(" - Processing file {}"
+                     .format(os.path.basename(file_list[0])))
         group_data, group_affine, group_res, group_header = load_file_to4d(
-            first_file)
+            file_list[0])
 
         if std_option == 'per_file':
             logging.debug(' *Standardizing sub-data')
@@ -470,23 +493,24 @@
         # Other files must fit (data shape, affine, voxel size)
         # It is not a promise that data has been correctly registered, but it
         # is a minimal check.
-        for file_name in file_list[1:]:
-            file_name = file_name.replace('*', subj_id)
-            data = _load_and_verify_file(file_name, subj_input_dir, group,
-                                         group_affine, group_res)
-
-            if std_option == 'per_file':
-                logging.debug(' *Standardizing sub-data')
-                data = standardize_data(data, std_mask,
-                                        independent=False)
-
-            # Append file data to hdf group.
-            try:
-                group_data = np.append(group_data, data, axis=-1)
-            except ImportError:
-                raise ImportError(
-                    'Data file {} could not be added to data group {}. '
-                    'Wrong dimensions?'.format(file_name, group))
+        if len(file_list) > 1:
+            for file_name in file_list[1:]:
+                logging.info(" - Processing file {}"
+                             .format(os.path.basename(file_name)))
+                data = _load_and_verify_file(file_name, subj_input_dir, group,
+                                             group_affine, group_res)
+
+                if std_option == 'per_file':
+                    logging.debug(' *Standardizing sub-data')
+                    data = standardize_data(data, std_mask, independent=False)
+
+                # Append file data to hdf group.
+                try:
+                    group_data = np.append(group_data, data, axis=-1)
+                except ImportError:
+                    raise ImportError(
+                        'Data file {} could not be added to data group {}. '
+                        'Wrong dimensions?'.format(file_name, group))
 
         # Standardize data (per channel) (if not done 'per_file' yet).
         if std_option == 'independent':
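
For intuition on the appending step above: each additional file of a volume group is stacked onto the group data along the last (4th) axis. A toy example with hypothetical shapes:

    import numpy as np

    dwi = np.zeros((10, 10, 10, 32))   # e.g. a DWI volume with 32 directions
    fa = np.zeros((10, 10, 10, 1))     # e.g. a scalar map loaded as 4D
    group_data = np.append(dwi, fa, axis=-1)
    print(group_data.shape)            # (10, 10, 10, 33)
    # With a mismatched 3D shape, np.append raises ValueError.
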
@@ -590,9 +614,6 @@ def _process_one_streamline_group(
     Loads and processes a group of tractograms and merges all streamlines
     together.
 
-    Note. Wildcards will be replaced by the subject id. If the list is
-    folder/ALL, all tractograms in the folder will be used.
-
     Parameters
     ----------
     subj_dir : Path
@@ -628,41 +649,26 @@ def _process_one_streamline_group(
         final_sft = None
         output_lengths = []
 
-        for instructions in tractograms:
-            if instructions.endswith('/ALL'):
-                # instructions are to get all tractograms in given folder.
-                tractograms_dir = instructions.split('/ALL')
-                tractograms_dir = ''.join(tractograms_dir[:-1])
-                tractograms_sublist = [
-                    instructions.replace('/ALL', '/' + os.path.basename(p))
-                    for p in subj_dir.glob(tractograms_dir + '/*')]
-            else:
-                # instruction is to get one specific tractogram
-                tractograms_sublist = [instructions]
-
-            # Either a loop on "ALL" or a loop on only one file.
-            for tractogram_name in tractograms_sublist:
-                tractogram_name = tractogram_name.replace('*', subj_id)
-                tractogram_file = subj_dir.joinpath(tractogram_name)
-
-                sft = self._load_and_process_sft(
-                    tractogram_file, tractogram_name, header)
-
-                if sft is not None:
-                    # Compute euclidean lengths (rasmm space)
-                    sft.to_space(Space.RASMM)
-                    output_lengths.extend(length(sft.streamlines))
-
-                    # Sending to common space
-                    sft.to_vox()
-                    sft.to_corner()
-
-                    # Add processed tractogram to final big tractogram
-                    if final_sft is None:
-                        final_sft = sft
-                    else:
-                        final_sft = concatenate_sft([final_sft, sft],
-                                                    erase_metadata=False)
+        tractograms = format_filelist(tractograms, self.enforce_files_presence,
+                                      folder=subj_dir)
+        for tractogram_file in tractograms:
+            sft = self._load_and_process_sft(tractogram_file, header)
+
+            if sft is not None:
+                # Compute euclidean lengths (rasmm space)
+                sft.to_space(Space.RASMM)
+                output_lengths.extend(length(sft.streamlines))
+
+                # Sending to common space
+                sft.to_vox()
+                sft.to_corner()
+
+                # Add processed tractogram to final big tractogram
+                if final_sft is None:
+                    final_sft = sft
+                else:
+                    final_sft = concatenate_sft([final_sft, sft],
+                                                erase_metadata=False)
 
         if self.save_intermediate:
             output_fname = self.intermediate_folder.joinpath(
@@ -716,16 +722,7 @@ def _process_one_streamline_group(
 
         return final_sft, output_lengths, conn_matrix, conn_info
 
-    def _load_and_process_sft(self, tractogram_file, tractogram_name, header):
-        if not tractogram_file.is_file():
-            logging.debug(
-                " Skipping file {} because it was not found in this "
-                "subject's folder".format(tractogram_name))
-            # Note: if args.enforce_files_presence was set to true,
-            # this case is not possible, already checked in
-            # create_hdf5_dataset
-            return None
-
+    def _load_and_process_sft(self, tractogram_file, header):
         # Check file extension
         _, file_extension = os.path.splitext(str(tractogram_file))
         if file_extension not in ['.trk', '.tck']:
@@ -742,7 +739,7 @@ def _load_and_process_sft(tractogram_file, tractogram_name, header):
 
         # Loading tractogram and sending to wanted space
         logging.info(" - Processing tractogram {}"
-                     .format(os.path.basename(tractogram_name)))
+                     .format(os.path.basename(tractogram_file)))
         sft = load_tractogram(str(tractogram_file), header)
 
         # Resample or compress streamlines
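
A note on enforce_files_presence: format_filelist is now the single place where it is honored. A small sketch of the two failure modes, with hypothetical paths:

    from pathlib import Path

    # enforce_presence=True: a pattern that matches nothing raises.
    # format_filelist(['bundles/*.trk'], True, folder=Path('sub-02'))
    # -> FileNotFoundError: File not found, even with the wildcard:
    #    sub-02/bundles/*.trk

    # enforce_presence=False: the miss is only logged; no file is returned.
    files = format_filelist(['bundles/*.trk'], False, folder=Path('sub-02'))
    # WARNING ... File not found, even with the wildcard: sub-02/bundles/*.trk
    # files == []
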
4 changes: 3 additions & 1 deletion dwi_ml/data/io.py
@@ -1,4 +1,6 @@
 # -*- coding: utf-8 -*-
+import os
+
 import nibabel as nib
 import numpy as np

@@ -19,7 +21,7 @@ def load_file_to4d(data_file):
     voxel_size: np.array with size 3,
     header: nibabel header.
     """
-    ext = data_file.suffix
+    _, ext = os.path.splitext(data_file)
 
     if ext != '.gz' and ext != '.nii':
         raise ValueError('All data files should be nifti (.nii or .nii.gz) '
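
The reason for this one-line change: with wildcards resolved through glob, load_file_to4d may now receive plain strings rather than pathlib.Path objects, and str has no .suffix attribute. os.path.splitext accepts both (an illustration, not part of the commit):

    import os
    from pathlib import Path

    Path('dwi.nii.gz').suffix          # '.gz', but Path objects only
    os.path.splitext('dwi.nii.gz')[1]  # '.gz', works for str and Path alike
    # 'dwi.nii.gz'.suffix              # AttributeError: str has no .suffix
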
2 changes: 1 addition & 1 deletion scripts_python/dwiml_create_hdf5_dataset.py
@@ -9,7 +9,7 @@
 - How to organize your data
 - How to prepare the config file
 - How to run this script.
-https://dwi-ml.readthedocs.io/en/latest/2_B_preprocessing.html
+https://dwi-ml.readthedocs.io/en/latest/2_A_creating_the_hdf5.html
 --------------------------------------
 ** Note: The memory is a delicate question here, but checks have been made, and
5 changes: 2 additions & 3 deletions source/2_A_creating_the_hdf5.rst
@@ -87,7 +87,7 @@ To create the hdf5 file, you will need a config file such as below. HDF groups w
     }
     "bad_streamlines": {
         "type": "streamlines",
-        "files": ["bad_tractograms/ALL"] ---> Will get all trk and tck files.
+        "files": ["bad_tractograms/*"] ---> Will get all trk and tck files.
     }
     "wm_mask": {
         "type": "volume",
@@ -111,8 +111,7 @@ Each group may have a number of parameters:
 - **"files"**: The listed file(s) must exist in every subject folder inside the root repository. That is: the files must be organized correctly on your computer (except if option 'enforce_files_presence' is set to False). If more than one file is listed, they will be concatenated (on the 4th dimension for volumes, using the union of tractograms for streamlines).
-    - There is the possibility to add a wildcard (\*) that will be replaced by the subject's id while loading. Ex: anat/\*__t1.nii.gz would become anat/subjX__t1.nii.gz.
-    - For streamlines, there is the possibility to use 'ALL' to load all tractograms present in a folder.
+    - There is the possibility to add a wildcard (\*).
 
 Additional attributes for volume groups:
 """"""""""""""""""""""""""""""""""""""""
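
To illustrate the documented change: the old 'ALL' keyword needed ad hoc folder listing, whereas a real wildcard is handed straight to glob, so any pattern works, not only whole folders (hypothetical paths):

    import glob

    # Old config entry: "files": ["bad_tractograms/ALL"]   (special keyword)
    # New config entry: "files": ["bad_tractograms/*"]     (a real wildcard)
    glob.glob('sub-01/bad_tractograms/*')       # every file in the folder
    glob.glob('sub-01/bad_tractograms/*.trk')   # or only the .trk files
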
