-
Notifications
You must be signed in to change notification settings - Fork 24
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
feat(4.0): Add check for conditions that would cause anndata.write to fail. #643
Changes from 10 commits
20162bd
9e73307
d85e0d5
6ddc7d5
d33494f
554d5aa
180c60b
0564a2e
7ac5d75
1b12731
738d85d
5645fb6
c4d4afd
bce67eb
221ef8c
46ac681
7a9b7a3
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -644,18 +644,30 @@ def _validate_dataframe(self, df_name: str): | |
f"Features SHOULD NOT be filtered from expression matrix." | ||
) | ||
|
||
# Check for columns that have a category defined 0 times (obs only) | ||
if df_name == "obs": | ||
for column in df.columns: | ||
col = df[column] | ||
if col.dtype != "category": | ||
continue | ||
for category in col.dtype.categories: | ||
if category not in col.values: | ||
self.warnings.append( | ||
f"Column '{column}' in dataframe '{df_name}' contains a category '{category}' with " | ||
f"zero observations. These categories will be removed when `--add-labels` flag is present." | ||
) | ||
for column_name in df.columns: | ||
column = df[column_name] | ||
if column.dtype != "category": | ||
nayib-jose-gloria marked this conversation as resolved.
Show resolved
Hide resolved
|
||
# Check for columns with mixed values. | ||
value_types = {type(x) for x in column.values} | ||
if len(value_types) != 1: | ||
self.errors.append( | ||
f"Column '{column_name}' in dataframe '{df_name}' cannot contained mixed types. Found {value_types}." | ||
Bento007 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
) | ||
else: | ||
# Check for columns that have a category defined 0 times (obs only) | ||
if df_name == "obs": | ||
for category in column.dtype.categories: | ||
if category not in column.values: | ||
self.warnings.append( | ||
f"Column '{column_name}' in dataframe '{df_name}' contains a category '{category}' with " | ||
f"zero observations. These categories will be removed when `--add-labels` flag is present." | ||
) | ||
# Check for columns that have none string catagories | ||
catagory_types = {type(x) for x in column.dtype.categories.values} | ||
if len(catagory_types) > 1 or str not in catagory_types: | ||
self.errors.append( | ||
f"Column '{column_name}' in dataframe '{df_name}' must only contain string catagories. Found {catagory_types}." | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. nit catagories -> categories There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. same w/ catagory_types var name There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. also, I think it is true that we only support str categories in our current schema definition. But are we 100% certain that will always be true? If this is an anndata constraint, mind linking me to a doc or comment specifying this? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is not documented in anndata because it is a bug. Anndata should support not string values as categories and this is fixed in anndata 0.10.0. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Gotcha. Mind adding a comment to this effect--maybe a TODO so we remember to remove this check for anndata 0.10.0 and/or if we get a non-string categorical field in schema 5 we remember to highlight this anndata 0.8.0 bug to product? |
||
) | ||
|
||
# Validate columns | ||
if "columns" in df_definition: | ||
|
@@ -1302,13 +1314,16 @@ def _deep_check(self): | |
"fixing current errors." | ||
) | ||
|
||
def validate_adata(self, h5ad_path: Union[str, bytes, os.PathLike] = None, to_memory: bool = False) -> bool: | ||
def validate_adata( | ||
self, h5ad_path: Union[str, bytes, os.PathLike] = None, to_memory: bool = False, write_check: bool = False | ||
) -> bool: | ||
""" | ||
Validates adata | ||
|
||
:params Union[str, bytes, os.PathLike] h5ad_path: path to h5ad to validate, if None it will try to validate | ||
from self.adata | ||
:param to_memory: indicate if the h5ad should be read into memory. | ||
:param write_check: indicate if write_check should be performed. | ||
|
||
:return True if successful validation, False otherwise | ||
:rtype bool | ||
|
@@ -1330,6 +1345,9 @@ def validate_adata(self, h5ad_path: Union[str, bytes, os.PathLike] = None, to_me | |
if not self.errors: | ||
self._deep_check() | ||
|
||
if write_check: | ||
self.write_check() | ||
|
||
# Print warnings if any | ||
if self.warnings: | ||
self.warnings = ["WARNING: " + i for i in self.warnings] | ||
|
@@ -1347,12 +1365,25 @@ def validate_adata(self, h5ad_path: Union[str, bytes, os.PathLike] = None, to_me | |
|
||
return self.is_valid | ||
|
||
def write_check(self): | ||
"""Check that the h5ad can be written back out.""" | ||
logger.warning("Checking that the h5ad can be written back out. This may take a while.") | ||
import tempfile | ||
|
||
with tempfile.TemporaryDirectory() as tmpdir: | ||
tmp_path = os.path.join(tmpdir, "validated.h5ad") | ||
try: | ||
self.adata.write(tmp_path) | ||
except Exception as e: | ||
self.errors.append(f"Unable to write back to h5ad: {e}") | ||
|
||
|
||
def validate( | ||
h5ad_path: Union[str, bytes, os.PathLike], | ||
add_labels_file: str = None, | ||
ignore_labels: bool = False, | ||
verbose: bool = False, | ||
write_check: bool = False, | ||
) -> (bool, list, bool): | ||
from .write_labels import AnnDataLabelAppender | ||
|
||
|
@@ -1376,8 +1407,10 @@ def validate( | |
validator = Validator( | ||
ignore_labels=ignore_labels, | ||
) | ||
to_memory = add_labels_file is not None | ||
validator.validate_adata(h5ad_path, to_memory=to_memory) | ||
to_memory = ( | ||
add_labels_file is not None or write_check | ||
) # if we're adding labels or doing a write_check, then read to memory | ||
validator.validate_adata(h5ad_path, to_memory=to_memory, write_check=write_check) | ||
logger.info(f"Validation complete in {datetime.now() - start} with status is_valid={validator.is_valid}") | ||
|
||
# Stop if validation was unsuccessful | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@jahilton I'd like to get your thoughts on this new flag before I merge.