Commit
Merge branch 'main' into BCR_tutorial
grst authored Oct 11, 2024
2 parents 3424006 + c9cda49 commit c2b8632
Showing 56 changed files with 1,030 additions and 929 deletions.
5 changes: 2 additions & 3 deletions .conda/meta.yaml
Original file line number Diff line number Diff line change
@@ -16,20 +16,19 @@ build:

requirements:
host:
- python >=3.9
- python >=3.10
- hatchling
- hatch-vcs

run:
- python >=3.9
- python >=3.10
- anndata >=0.9
- awkward >=2.1.0
- mudata >=0.2.3
- scanpy >=1.9.3
- pandas >=1.5,!=2.1.2
- numpy >=1.17.0
- scipy
- parasail-python
- scikit-learn
- python-levenshtein
- python-igraph !=0.10.0,!=0.10.1
13 changes: 6 additions & 7 deletions .github/workflows/conda.yaml
@@ -22,27 +22,26 @@ jobs:
matrix:
include:
- os: ubuntu-latest
python: "3.9"
python: "3.11"

env:
OS: ${{ matrix.os }}
PYTHON: ${{ matrix.python }}

steps:
- uses: actions/checkout@v3
- uses: actions/checkout@v4

- name: Setup Miniconda
uses: conda-incubator/setup-miniconda@v2
uses: conda-incubator/setup-miniconda@v3
with:
miniforge-variant: Mambaforge
miniforge-version: latest
mamba-version: "*"
channels: conda-forge,bioconda
channel-priority: strict
python-version: ${{ matrix.python-version }}
python-version: ${{ matrix.python }}

- name: install conda build
run: |
mamba install -y boa conda-verify
mamba install -y boa conda-verify python=${{ matrix.python }}
shell: bash

- name: build and test package
4 changes: 2 additions & 2 deletions .github/workflows/test.yaml
@@ -24,7 +24,7 @@ jobs:
matrix:
include:
- os: ubuntu-latest
python: "3.9"
python: "3.10"
- os: ubuntu-latest
python: "3.12"
- os: ubuntu-latest
@@ -52,7 +52,7 @@ jobs:
python -m pip install --upgrade pip wheel
- name: Install dependencies
run: |
pip install ${{ matrix.pip-flags }} ".[dev,test,rpack,dandelion,diversity]"
pip install ${{ matrix.pip-flags }} ".[dev,test,rpack,dandelion,diversity,parasail]"
- name: Test
env:
MPLBACKEND: agg
8 changes: 4 additions & 4 deletions .pre-commit-config.yaml
@@ -2,8 +2,8 @@ fail_fast: false
default_language_version:
python: python3
default_stages:
- commit
- push
- pre-commit
- pre-push
minimum_pre_commit_version: 2.16.0
repos:
- repo: https://github.com/pre-commit/mirrors-prettier
@@ -12,15 +12,15 @@ repos:
- id: prettier
exclude: '^\.conda'
- repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.6.1
rev: v0.6.9
hooks:
- id: ruff
types_or: [python, pyi, jupyter]
args: [--fix, --exit-non-zero-on-fix]
- id: ruff-format
types_or: [python, pyi, jupyter]
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v4.6.0
rev: v5.0.0
hooks:
- id: detect-private-key
- id: check-ast
32 changes: 27 additions & 5 deletions CHANGELOG.md
@@ -10,11 +10,33 @@ and this project adheres to [Semantic Versioning][].

## [Unreleased]

### Addition
### Additions

- Add a `mask_obs` argument to `tl.clonotype_network` that allows computing the clonotype networks on a subset of the cells ([#557](https://github.com/scverse/scirpy/pull/557)).

## v0.18.0

### Additions

- Isotypically included B cells are now labelled as `receptor_subtype="IGH+IGK/L"` instead of `ambiguous` in `tl.chain_qc` ([#537](https://github.com/scverse/scirpy/pull/537)).
- Added the `normalized_hamming` metric to `pp.ir_dist` that accounts for differences in CDR3 sequence length ([#512](https://github.com/scverse/scirpy/pull/512)).
- `tl.define_clonotype_clusters` now has an option to require J genes to match (`same_j_gene=True`) in addition to `same_v_gene`. ([#470](https://github.com/scverse/scirpy/pull/470)).

### Performance improvements

- The hamming distance has been reimplemented with numba, achieving a significant speedup ([#512](https://github.com/scverse/scirpy/pull/512)).
- Clonotype clustering has been accelerated leveraging sparse matrix operations ([#470](https://github.com/scverse/scirpy/pull/470)).

### Fixes

- Fix that `pl.clonotype_network` couldn't use non-standard obsm key ([#545](https://github.com/scverse/scirpy/pull/545)).

### Other changes

- Isotypically included B cells are now labelled as `receptor_subtype="IGH+IGK/L"` instead of `ambiguous` in `tl.chain_qc`. ([#537](https://github.com/scverse/scirpy/pull/537))
- Added the `normalized_hamming` metric to `pp.ir_dist` that accounts for differences in CDR3 sequence length. Additionally,
the hamming distance was reimplemented with numba, achieving a significant speedup ([#512](https://github.com/scverse/scirpy/pull/512)).
- Make `parasail` an optional dependency since it is hard to install it on ARM CPUs. `TCRdist` is now the
recommended default distance metric which is much faster than parasail-based pairwise sequence alignments while
providing very similar results ([#547](https://github.com/scverse/scirpy/pull/547)).
- Drop support for Python 3.9 in accordance with [SPEC0](https://scientific-python.org/specs/spec-0000/) ([#546](https://github.com/scverse/scirpy/pull/546))

## v0.17.2

@@ -40,7 +62,7 @@ and this project adheres to [Semantic Versioning][].

### Fixes

- Fix issue with detecting the number of available CPUs on MacOD ([#518](https://github.com/scverse/scirpy/pull/502))
- Fix issue with detecting the number of available CPUs on MacOS ([#518](https://github.com/scverse/scirpy/pull/502))

## v0.16.1

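The CHANGELOG above introduces a `normalized_hamming` metric that accounts for differences in CDR3 sequence length. As a rough illustration of the idea only — scirpy's actual implementation is numba-accelerated and its exact normalization may differ — a length-normalized Hamming distance can be sketched as:

```python
def normalized_hamming(seq_a: str, seq_b: str) -> float:
    """Hamming distance expressed as a percentage of sequence length.

    Plain Hamming distance is only defined for equal-length sequences,
    so unequal-length pairs are treated as maximally distant here.
    This normalization scheme is an assumption for illustration,
    not scirpy's exact formula.
    """
    if len(seq_a) != len(seq_b):
        return 100.0
    if not seq_a:
        return 0.0
    mismatches = sum(a != b for a, b in zip(seq_a, seq_b))
    return 100.0 * mismatches / len(seq_a)
```

Normalizing by length makes a single mismatch in a short CDR3 weigh more than one in a long CDR3, which is the motivation stated in the changelog entry.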
2 changes: 1 addition & 1 deletion README.md
@@ -47,7 +47,7 @@ Please refer to the [documentation][link-docs]. In particular, the

## Installation

You need to have Python 3.9 or newer installed on your system. If you don't have
You need to have Python 3.10 or newer installed on your system. If you don't have
Python installed, we recommend installing [Mambaforge](https://github.com/conda-forge/miniforge#mambaforge).

There are several alternative options to install scirpy:
388 changes: 91 additions & 297 deletions docs/tutorials/tutorial_3k_tcr.ipynb

Large diffs are not rendered by default.

283 changes: 109 additions & 174 deletions docs/tutorials/tutorial_io.ipynb

Large diffs are not rendered by default.

10 changes: 6 additions & 4 deletions pyproject.toml
@@ -7,7 +7,7 @@ name = 'scirpy'
dynamic = ["version"]
description = "Python library for single-cell adaptive immune receptor repertoire (AIRR) analysis"
readme = "README.md"
requires-python = '>= 3.9'
requires-python = '>= 3.10'
license = {file = "LICENSE"}
authors = [
{name = "Gregor Sturm"},
@@ -20,15 +20,13 @@ urls.Documentation = "https://scirpy.readthedocs.io/"
urls.Source = "https://github.com/scverse/scirpy"
urls.Home-page = "https://github.com/scverse/scirpy"
dependencies = [
'anndata>=0.9',
'anndata>=0.9,<0.10.9', # TODO remove constraint
'awkward>=2.1.0',
'mudata>=0.2.3',
'scanpy>=1.9.3',
'pandas>=1.5,!=2.1.2',
'numpy>=1.17.0',
'scipy',
# parasail 1.2.1 fails to be installed on MacOS
'parasail != 1.2.1',
'scikit-learn',
'python-levenshtein',
# 0.10.0 and 0.10.1 have the bug described in https://github.com/igraph/python-igraph/issues/570
@@ -87,6 +85,10 @@ diversity = [
rpack = [
'rectangle-packer',
]
parasail = [
# parasail 1.2.1 fails to be installed on MacOS
'parasail != 1.2.1',
]

[tool.hatch.version]
source = "vcs"
2 changes: 1 addition & 1 deletion src/scirpy/datasets/_processing_scripts/wu2020.py
@@ -92,7 +92,7 @@ def _load_adata(path):
adatas = p.map(_load_adata, mtx_paths)
p.close()

adatas, adatas_airr = zip(*adatas)
adatas, adatas_airr = zip(*adatas, strict=False)

adata = anndata.concat(adatas)

12 changes: 6 additions & 6 deletions src/scirpy/get/__init__.py
@@ -19,13 +19,13 @@
@DataHandler.inject_param_docs()
def airr(
adata: DataHandler.TYPE,
airr_variable: Union[str, Sequence[str]],
chain: Union[ChainType, Sequence[ChainType]] = ("VJ_1", "VDJ_1", "VJ_2", "VDJ_2"),
airr_variable: str | Sequence[str],
chain: ChainType | Sequence[ChainType] = ("VJ_1", "VDJ_1", "VJ_2", "VDJ_2"),
*,
airr_mod: str = "airr",
airr_key: str = "airr",
chain_idx_key: str = "chain_indices",
) -> Union[pd.DataFrame, pd.Series]:
) -> pd.DataFrame | pd.Series:
"""\
Retrieve AIRR variables for each cell, given a specific chain.
@@ -110,7 +110,7 @@ def _airr_col(


@contextmanager
def obs_context(data: Union[AnnData, MuData], temp_cols: Union[pd.DataFrame, Mapping[str, Any]]):
def obs_context(data: AnnData | MuData, temp_cols: pd.DataFrame | Mapping[str, Any]):
"""
Contextmanager that temporarily adds columns to obs.
@@ -151,8 +151,8 @@ def obs_context(data: Union[AnnData, MuData], temp_cols: Union[pd.DataFrame, Map
@DataHandler.inject_param_docs()
def airr_context(
data: DataHandler.TYPE,
airr_variable: Union[str, Sequence[str]],
chain: Union[ChainType, Sequence[ChainType]] = ("VJ_1", "VDJ_1", "VJ_2", "VDJ_2"),
airr_variable: str | Sequence[str],
chain: ChainType | Sequence[ChainType] = ("VJ_1", "VDJ_1", "VJ_2", "VDJ_2"),
*,
airr_mod: str = "airr",
airr_key: str = "airr",
2 changes: 1 addition & 1 deletion src/scirpy/io/_convert_anndata.py
@@ -87,7 +87,7 @@ def to_airr_cells(adata: DataHandler.TYPE, *, airr_mod: str = "airr", airr_key:
tmp_airr = ak.to_list(params.airr[i : i + CHUNKSIZE])
tmp_obs = params.adata.obs.iloc[i : i + CHUNKSIZE].to_dict(orient="index")

for (cell_id, row), chains in zip(tmp_obs.items(), tmp_airr):
for (cell_id, row), chains in zip(tmp_obs.items(), tmp_airr, strict=False):
tmp_cell = AirrCell(cast(str, cell_id), logger=logger)
# add cell-level metadata
tmp_cell.update(row)
22 changes: 11 additions & 11 deletions src/scirpy/io/_io.py
@@ -6,7 +6,7 @@
from collections.abc import Collection, Iterable, Sequence
from glob import iglob
from pathlib import Path
from typing import Any, Union
from typing import Any

import numpy as np
import pandas as pd
@@ -46,7 +46,7 @@ def _cdr3_from_junction(junction_aa, junction_nt):


def _read_10x_vdj_json(
path: Union[str, Path],
path: str | Path,
filtered: bool = True,
) -> Iterable[AirrCell]:
"""Read IR data from a 10x genomics `all_contig_annotations.json` file"""
@@ -148,7 +148,7 @@ def _read_10x_vdj_json(


def _read_10x_vdj_csv(
path: Union[str, Path],
path: str | Path,
filtered: bool = True,
) -> Iterable[AirrCell]:
"""Read IR data from a 10x genomics `_contig_annotations.csv` file"""
@@ -199,7 +199,7 @@


@_doc_params(doc_working_model=doc_working_model)
def read_10x_vdj(path: Union[str, Path], filtered: bool = True, include_fields: Any = None, **kwargs) -> AnnData:
def read_10x_vdj(path: str | Path, filtered: bool = True, include_fields: Any = None, **kwargs) -> AnnData:
"""\
Read :term:`AIRR` data from 10x Genomics cell-ranger output.
@@ -241,7 +241,7 @@ def read_10x_vdj(path: Union[str, Path], filtered: bool = True, include_fields:


@_doc_params(doc_working_model=doc_working_model)
def read_tracer(path: Union[str, Path], **kwargs) -> AnnData:
def read_tracer(path: str | Path, **kwargs) -> AnnData:
"""\
Read data from `TraCeR <https://github.com/Teichlab/tracer>`_ (:cite:`Stubbington2016-kh`).
@@ -351,7 +351,7 @@ def _process_chains(chains, chain_type):
cell_attributes=f"""`({",".join([f'"{x}"' for x in DEFAULT_AIRR_CELL_ATTRIBUTES])})`""",
)
def read_airr(
path: Union[str, Sequence[str], Path, Sequence[Path], pd.DataFrame, Sequence[pd.DataFrame]],
path: str | Sequence[str] | Path | Sequence[Path] | pd.DataFrame | Sequence[pd.DataFrame],
use_umi_count_col: None = None, # deprecated, kept for backwards-compatibility
infer_locus: bool = True,
cell_attributes: Collection[str] = DEFAULT_AIRR_CELL_ATTRIBUTES,
@@ -405,8 +405,8 @@ def read_airr(
airr_cells = {}
logger = _IOLogger()

if isinstance(path, (str, Path, pd.DataFrame)):
path: list[Union[str, Path, pd.DataFrame]] = [path] # type: ignore
if isinstance(path, str | Path | pd.DataFrame):
path: list[str | Path | pd.DataFrame] = [path] # type: ignore

for tmp_path_or_df in path:
if isinstance(tmp_path_or_df, pd.DataFrame):
@@ -475,7 +475,7 @@ def _infer_locus_from_gene_names(chain_dict, *, keys=("v_call", "d_call", "j_cal


@_doc_params(doc_working_model=doc_working_model)
def read_bracer(path: Union[str, Path], **kwargs) -> AnnData:
def read_bracer(path: str | Path, **kwargs) -> AnnData:
"""\
Read data from `BraCeR <https://github.com/Teichlab/bracer>`_ (:cite:`Lindeman2018`).
@@ -546,7 +546,7 @@ def read_bracer(path: Union[str, Path], **kwargs) -> AnnData:
return from_airr_cells(bcr_cells.values(), **kwargs)


def write_airr(adata: DataHandler.TYPE, filename: Union[str, Path], **kwargs) -> None:
def write_airr(adata: DataHandler.TYPE, filename: str | Path, **kwargs) -> None:
"""Export :term:`IR` data to :term:`AIRR` Rearrangement `tsv` format.
Parameters
@@ -636,7 +636,7 @@ def from_dandelion(dandelion, transfer: bool = False, to_mudata: bool = False, *


@_doc_params(doc_working_model=doc_working_model)
def read_bd_rhapsody(path: Union[str, Path], **kwargs) -> AnnData:
def read_bd_rhapsody(path: str | Path, **kwargs) -> AnnData:
"""\
Read :term:`IR` data from the BD Rhapsody Analysis Pipeline.