From 37cdbb8f48a6766e190c218fb5e4d4edf7f052d5 Mon Sep 17 00:00:00 2001
From: EmmaRenauld
Date: Wed, 14 Feb 2024 12:44:05 -0500
Subject: [PATCH] Manage wildcards like real wildcards

---
 dwi_ml/data/hdf5/hdf5_creation.py           | 169 ++++++++++----------
 dwi_ml/data/io.py                           |   4 +-
 scripts_python/dwiml_create_hdf5_dataset.py |   2 +-
 source/2_A_creating_the_hdf5.rst            |   5 +-
 4 files changed, 89 insertions(+), 91 deletions(-)

diff --git a/dwi_ml/data/hdf5/hdf5_creation.py b/dwi_ml/data/hdf5/hdf5_creation.py
index f5d95eed..650c1cce 100644
--- a/dwi_ml/data/hdf5/hdf5_creation.py
+++ b/dwi_ml/data/hdf5/hdf5_creation.py
@@ -1,5 +1,6 @@
 # -*- coding: utf-8 -*-
 import datetime
+import glob
 import logging
 import os
 from pathlib import Path
@@ -24,6 +25,39 @@
 from dwi_ml.data.processing.dwi.dwi import standardize_data


+def format_filelist(filenames, enforce_presence, folder=None) -> List[str]:
+    """
+    Expands the wildcards ('*') in a list of file names and verifies that
+    all resulting files exist. If folder is not None, it will be added as
+    prefix to all file names.
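+
+    A minimal usage sketch (the root folder, subject id and matched file
+    below are hypothetical):
+
+        >>> format_filelist(['anat/*__t1.nii.gz'], enforce_presence=True,
+        ...                 folder=Path('my_root/subj1'))
+        ['my_root/subj1/anat/subj1__t1.nii.gz']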
+
+    Parameters
+    ----------
+    filenames: str or List[str]
+        The file names, possibly containing wildcards.
+    enforce_presence: bool
+        If True, raises a FileNotFoundError when a file is not found (or
+        when a wildcard matches nothing). Else, only logs a warning and
+        skips that file.
+    folder: Path, optional
+        The prefix folder (typically, the subject's folder).
+
+    Returns
+    -------
+    new_files: List[str]
+        The expanded list of existing files.
+    """
+    if isinstance(filenames, str):
+        filenames = [filenames]
+
+    new_files = []
+    for f in filenames:
+        if folder is not None:
+            f = str(folder.joinpath(f))
+        if '*' in f:
+            tmp = glob.glob(f)
+            if len(tmp) == 0:
+                msg = "File not found, even with the wildcard: {}".format(f)
+                if enforce_presence:
+                    raise FileNotFoundError(msg)
+                else:
+                    logging.warning(msg)
+            else:
+                # Keep all files matched by the wildcard.
+                new_files.extend(tmp)
+        else:
+            if not Path(f).is_file():
+                msg = "File not found: {}".format(f)
+                if enforce_presence:
+                    raise FileNotFoundError(msg)
+                else:
+                    logging.warning(msg)
+            else:
+                new_files.append(f)
+    return new_files
+
+
 def _load_and_verify_file(filename: str, subj_input_path, group_name: str,
                           group_affine, group_res):
     """
@@ -297,19 +331,9 @@ def _check_files_presence(self):
             subj_input_dir = Path(self.root_folder).joinpath(subj_id)

             # Find subject's files from group_config
-            for this_file in config_file_list:
-                this_file = this_file.replace('*', subj_id)
-                if this_file.endswith('/ALL'):
-                    logging.debug(
-                        " Keyword 'ALL' detected; we will load all "
-                        "files in the folder '{}'"
-                        .format(this_file.replace('/ALL', '')))
-                else:
-                    this_file = subj_input_dir.joinpath(this_file)
-                    if not this_file.is_file():
-                        raise FileNotFoundError(
-                            "File from groups_config ({}) not found for "
-                            "subject {}!".format(this_file, subj_id))
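+            # format_filelist is called here only for its side effect:
+            # with enforce_files_presence, it raises a FileNotFoundError
+            # for any missing file or unmatched wildcard. The expanded
+            # list is deliberately not assigned back, so config_file_list
+            # keeps its wildcards for the next subject.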
+            format_filelist(config_file_list,
+                            self.enforce_files_presence,
+                            folder=subj_input_dir)

     def create_database(self):
         """
@@ -441,26 +465,25 @@ def _process_one_volume_group(self, group: str, subj_id: str,
         if isinstance(std_masks, str):
             std_masks = [std_masks]

-        for sub_mask in std_masks:
-            sub_mask = sub_mask.replace('*', subj_id)
+        std_masks = format_filelist(std_masks, self.enforce_files_presence,
+                                    folder=subj_input_dir)
+        for mask in std_masks:
             logging.info(" - Loading standardization mask {}"
-                         .format(sub_mask))
-            sub_mask_file = subj_input_dir.joinpath(sub_mask)
-            sub_mask_img = nib.load(sub_mask_file)
-            sub_mask_data = np.asanyarray(sub_mask_img.dataobj) > 0
+                         .format(os.path.basename(mask)))
+            sub_mask_data = nib.load(mask).get_fdata() > 0
             if std_mask is None:
                 std_mask = sub_mask_data
             else:
                 std_mask = np.logical_or(sub_mask_data, std_mask)

         file_list = self.groups_config[group]['files']
+        file_list = format_filelist(file_list, self.enforce_files_presence,
+                                    folder=subj_input_dir)

         # First file will define data dimension and affine
-        file_name = file_list[0].replace('*', subj_id)
-        first_file = subj_input_dir.joinpath(file_name)
-        logging.info(" - Processing file {}".format(file_name))
+        logging.info(" - Processing file {}"
+                     .format(os.path.basename(file_list[0])))
         group_data, group_affine, group_res, group_header = load_file_to4d(
-            first_file)
+            file_list[0])

         if std_option == 'per_file':
             logging.debug(' *Standardizing sub-data')
@@ -470,23 +493,24 @@
         # Other files must fit (data shape, affine, voxel size)
         # It is not a promise that data has been correctly registered, but it
         # is a minimal check.
-        for file_name in file_list[1:]:
-            file_name = file_name.replace('*', subj_id)
-            data = _load_and_verify_file(file_name, subj_input_dir, group,
-                                         group_affine, group_res)
-
-            if std_option == 'per_file':
-                logging.debug(' *Standardizing sub-data')
-                data = standardize_data(data, std_mask,
-                                        independent=False)
-
-            # Append file data to hdf group.
-            try:
-                group_data = np.append(group_data, data, axis=-1)
-            except ImportError:
-                raise ImportError(
-                    'Data file {} could not be added to data group {}. '
-                    'Wrong dimensions?'.format(file_name, group))
+        if len(file_list) > 1:
+            for file_name in file_list[1:]:
+                logging.info(" - Processing file {}"
+                             .format(os.path.basename(file_name)))
+                data = _load_and_verify_file(file_name, subj_input_dir, group,
+                                             group_affine, group_res)
+
+                if std_option == 'per_file':
+                    logging.debug(' *Standardizing sub-data')
+                    data = standardize_data(data, std_mask, independent=False)
+
+                # Append file data to hdf group. On a shape mismatch,
+                # np.append raises a ValueError (not an ImportError).
+                try:
+                    group_data = np.append(group_data, data, axis=-1)
+                except ValueError:
+                    raise ValueError(
+                        'Data file {} could not be added to data group {}. '
+                        'Wrong dimensions?'.format(file_name, group))

         # Standardize data (per channel) (if not done 'per_file' yet).
         if std_option == 'independent':
@@ -590,9 +614,6 @@ def _process_one_streamline_group(
         Loads and processes a group of tractograms and merges all
         streamlines together.

-        Note. Wildcards will be replaced by the subject id. If the list is
-        folder/ALL, all tractograms in the folder will be used.
-
         Parameters
         ----------
         subj_dir : Path
@@ -628,41 +649,26 @@
         final_sft = None
         output_lengths = []

-        for instructions in tractograms:
-            if instructions.endswith('/ALL'):
-                # instructions are to get all tractograms in given folder.
-                tractograms_dir = instructions.split('/ALL')
-                tractograms_dir = ''.join(tractograms_dir[:-1])
-                tractograms_sublist = [
-                    instructions.replace('/ALL', '/' + os.path.basename(p))
-                    for p in subj_dir.glob(tractograms_dir + '/*')]
-            else:
-                # instruction is to get one specific tractogram
-                tractograms_sublist = [instructions]
-
-            # Either a loop on "ALL" or a loop on only one file.
-            for tractogram_name in tractograms_sublist:
-                tractogram_name = tractogram_name.replace('*', subj_id)
-                tractogram_file = subj_dir.joinpath(tractogram_name)
+        tractograms = format_filelist(tractograms, self.enforce_files_presence,
+                                      folder=subj_dir)
+        for tractogram_file in tractograms:
+            sft = self._load_and_process_sft(tractogram_file, header)

-                sft = self._load_and_process_sft(
-                    tractogram_file, tractogram_name, header)
+            if sft is not None:
+                # Compute euclidean lengths (rasmm space)
+                sft.to_space(Space.RASMM)
+                output_lengths.extend(length(sft.streamlines))

-                if sft is not None:
-                    # Compute euclidean lengths (rasmm space)
-                    sft.to_space(Space.RASMM)
-                    output_lengths.extend(length(sft.streamlines))
+                # Sending to common space
+                sft.to_vox()
+                sft.to_corner()

-                    # Sending to common space
-                    sft.to_vox()
-                    sft.to_corner()
-
-                    # Add processed tractogram to final big tractogram
-                    if final_sft is None:
-                        final_sft = sft
-                    else:
-                        final_sft = concatenate_sft([final_sft, sft],
-                                                    erase_metadata=False)
+                # Add processed tractogram to final big tractogram
+                if final_sft is None:
+                    final_sft = sft
+                else:
+                    final_sft = concatenate_sft([final_sft, sft],
+                                                erase_metadata=False)

         if self.save_intermediate:
             output_fname = self.intermediate_folder.joinpath(
@@ -716,16 +722,7 @@

         return final_sft, output_lengths, conn_matrix, conn_info

-    def _load_and_process_sft(self, tractogram_file, tractogram_name, header):
-        if not tractogram_file.is_file():
-            logging.debug(
-                " Skipping file {} because it was not found in this "
-                "subject's folder".format(tractogram_name))
-            # Note: if args.enforce_files_presence was set to true,
-            # this case is not possible, already checked in
-            # create_hdf5_dataset
-            return None
-
+    def _load_and_process_sft(self, tractogram_file, header):
         # Check file extension
         _, file_extension = os.path.splitext(str(tractogram_file))
         if file_extension not in ['.trk', '.tck']:
@@ -742,7 +739,7 @@ def _load_and_process_sft(self, tractogram_file, tractogram_name, header):

         # Loading tractogram and sending to wanted space
         logging.info(" - Processing tractogram {}"
-                     .format(os.path.basename(tractogram_name)))
+                     .format(os.path.basename(tractogram_file)))
         sft = load_tractogram(str(tractogram_file), header)

         # Resample or compress streamlines
diff --git a/dwi_ml/data/io.py b/dwi_ml/data/io.py
index df202c96..77bcc40a 100644
--- a/dwi_ml/data/io.py
+++ b/dwi_ml/data/io.py
@@ -1,4 +1,6 @@
 # -*- coding: utf-8 -*-
+import os
+
 import nibabel as nib
 import numpy as np

@@ -19,7 +21,7 @@ def load_file_to4d(data_file):
         voxel_size: np.array with size 3,
         header: nibabel header.
     """
-    ext = data_file.suffix
+    _, ext = os.path.splitext(data_file)
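+    # Note: os.path.splitext only returns the last suffix, so a '.nii.gz'
+    # file yields ext == '.gz', which the check below accepts.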

     if ext != '.gz' and ext != '.nii':
         raise ValueError('All data files should be nifti (.nii or .nii.gz) '
diff --git a/scripts_python/dwiml_create_hdf5_dataset.py b/scripts_python/dwiml_create_hdf5_dataset.py
index 61fb176c..3377bc2f 100644
--- a/scripts_python/dwiml_create_hdf5_dataset.py
+++ b/scripts_python/dwiml_create_hdf5_dataset.py
@@ -9,7 +9,7 @@
     - How to organize your data
     - How to prepare the config file
     - How to run this script.
-    https://dwi-ml.readthedocs.io/en/latest/2_B_preprocessing.html
+    https://dwi-ml.readthedocs.io/en/latest/2_A_creating_the_hdf5.html
 --------------------------------------
 ** Note: The memory is a delicate question here, but checks have been made, and
diff --git a/source/2_A_creating_the_hdf5.rst b/source/2_A_creating_the_hdf5.rst
index 86abf217..ea7e1f55 100644
--- a/source/2_A_creating_the_hdf5.rst
+++ b/source/2_A_creating_the_hdf5.rst
@@ -87,7 +87,7 @@ To create the hdf5 file, you will need a config file such as below. HDF groups w
         }
         "bad_streamlines": {
             "type": "streamlines",
-            "files": ["bad_tractograms/ALL"] ---> Will get all trk and tck files.
+            "files": ["bad_tractograms/*"] ---> Will get all trk and tck files.
         }
         "wm_mask": {
             "type": "volume",
@@ -111,8 +111,7 @@ Each group may have a number of parameters:

     - **"files"**: The listed file(s) must exist in every subject folder inside the root repository. That is: the files must be organized correctly on your computer (except if option 'enforce_files_presence is set to False). If there are more than one files, they will be concatenated (on the 4th dimension for volumes, using the union of tractograms for streamlines).
-    - There is the possibility to add a wildcard (\*) that will be replaced by the subject's id while loading. Ex: anat/\*__t1.nii.gz would become anat/subjX__t1.nii.gz.
-    - For streamlines, there is the possibility to use 'ALL' to load all tractograms present in a folder.
+    - There is the possibility to add a wildcard (\*), which will be expanded with Python's glob while loading. Ex: anat/\*__t1.nii.gz could match anat/subjX__t1.nii.gz. For streamlines, a wildcard such as bad_tractograms/\* will load all tractograms in that folder.

 Additional attributes for volume groups:
 """"""""""""""""""""""""""""""""""""""""
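
A quick way to sanity-check the new wildcard behavior from a Python shell
(the paths and subject layout below are hypothetical):

    from pathlib import Path
    from dwi_ml.data.hdf5.hdf5_creation import format_filelist

    # With enforce_presence=False, a pattern that matches nothing only
    # logs a warning; matched files are returned as plain path strings.
    files = format_filelist(['anat/*__t1.nii.gz', 'dwi/missing.nii.gz'],
                            enforce_presence=False,
                            folder=Path('my_root/subj1'))
    print(files)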