scverse · grst · Jan 9, 2024 · Jan 9, 2024 · Jan 9, 2024 · Jan 9, 2024
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -10,17 +10,22 @@ and this project adheres to [Semantic Versioning][].
 
 ## Unreleased
 
-### Fixes
-
--   Fix incompatibility with `adjustText` 1.0 ([#477](https://github.com/scverse/scirpy/pull/477))
-
 ### New features
 
 -   Speed up alignment distances by pre-filtering. There are two filtering strategies: A (lossless) length-based filter
     and a heuristic based on the expected penalty per mismatch. This is implemented in the `FastAlignmentDistanceCalculator`
     class which supersedes the `AlignmentDistanceCalculator` class, which is now deprecated. Using the `"alignment"` metric
     in `pp.ir_dist` now uses the `FastAlignmentDistanceCalculator` with only the lenght-based filter activated.
-    Using the `"fastalignment"` activates the heuristic, which is significantly faster, but results in some false-negatives.
+    Using the `"fastalignment"` activates the heuristic, which is significantly faster, but results in some false-negatives. ([#456](https://github.com/scverse/scirpy/pull/456))
+
+### Fixes
+
+-   In `datasets.iedb`, the `junction_aa` field now correctly contains the initial `C` and the terminal `W/F`. Previously
+    it contained what should have been in the `cdr3_aa` field. The `junction_aa` is not contained in the IEDB dump directly.
+    Instead it is calculated from the protein sequence and the corresponding start/end coordinates. Note that if you have
+    a cached `iedb.h5ad` in the `data` folder (relative to your work dir), you need to remove it to re-download the
+    new version. ([#476](https://github.com/scverse/scirpy/pull/476))
+-   Fix incompatibility with `adjustText` 1.0 ([#477](https://github.com/scverse/scirpy/pull/477))
 
 ## v0.14.0
 

diff --git a/src/scirpy/datasets/__init__.py b/src/scirpy/datasets/__init__.py
@@ -16,6 +16,7 @@
 import scanpy as sc
 from anndata import AnnData
 from mudata import MuData
+from pandas.api.types import is_string_dtype
 from scanpy import logging
 
 from scirpy.io._convert_anndata import from_airr_cells
@@ -247,6 +248,9 @@
     :cite:`iedb` is a curated database of
     T-cell receptor (TCR) sequences with known antigen specificities.
 
+    The `junction_aa` is not contained in the IEDB dump directly.
+    Instead it is calculated from the protein sequence and the corresponding start/end coordinates.
+
     Parameters
     ----------
     cached
@@ -305,20 +309,49 @@
         ]
     )
 
-    # If no curated CDR3 sequence or V/D/J gene is available, the calculated one is used.
     def replace_curated(input_1, input_2):
-        calculated_1 = input_1.replace("Curated", "Calculated")
-        iedb_df.loc[iedb_df[input_1].isna(), input_1] = iedb_df[calculated_1]
-        iedb_df[input_1] = iedb_df[input_1].str.upper()
-        calculated_2 = input_2.replace("Curated", "Calculated")
-        iedb_df.loc[iedb_df[input_2].isna(), input_2] = iedb_df[calculated_2]
-        iedb_df[input_2] = iedb_df[input_2].str.upper()
+        """If no curated CDR3 sequence or V/D/J gene is available, use the calculated one"""
+        for input in [input_1, input_2]:
+            calculated = input.replace("Curated", "Calculated")
+            iedb_df.loc[iedb_df[input].isna(), input] = iedb_df[calculated]
+            if is_string_dtype(iedb_df[input]):
+                iedb_df[input] = iedb_df[input].str.upper()
+
+    def _get_junction_aa(row, chain="Chain 1"):
+        """Compute the "junction_aa" sequence from the protein sequence based on the Start/End indices.
+
+        In the database, only CDR3 (without initial C and terminal W/F) is available.
+        """
+        try:
+            # if the start end is missing (NaN)
+            start = int(row[f"{chain} CDR3 Start Curated"])
+            end = int(row[f"{chain} CDR3 End Curated"])
+        except ValueError:
+            return pd.NA
+        protein = row[f"{chain} Protein Sequence"]
+        if pd.isnull(protein):
+            # if a start/end is specified (possibly a "curated" CDR3 is available), but no reference protein sequence given
+            return pd.NA
+
+        # +1 at each end for "junction" vs. "cdr3", start one earlier because of different indexing
+        junction = protein[start - 2 : end + 1]
+
+        # It turns out that this does not always match - but in the inspected cases, the "curated" CDR3 was
+        # incorrectly including the C/F|W.
+        # cdr3 = junction[1:-1]
+        # print(junction, cdr3, row["Chain 1 CDR3 Curated"])
+
+        return junction
 
     replace_curated("Chain 1 CDR3 Curated", "Chain 2 CDR3 Curated")
+    replace_curated("Chain 1 CDR3 Start Curated", "Chain 2 CDR3 Start Curated")
+    replace_curated("Chain 1 CDR3 End Curated", "Chain 2 CDR3 End Curated")
     replace_curated("Chain 1 Curated V Gene", "Chain 2 Curated V Gene")
     replace_curated("Chain 1 Curated D Gene", "Chain 2 Curated D Gene")
     replace_curated("Chain 1 Curated J Gene", "Chain 2 Curated J Gene")
 
+    iedb_df["Chain 1 junction_aa"] = iedb_df.apply(_get_junction_aa, axis=1, chain="Chain 1")
+    iedb_df["Chain 2 junction_aa"] = iedb_df.apply(_get_junction_aa, axis=1, chain="Chain 2")
     iedb_df["cell_id"] = iedb_df.reset_index(drop=True).index
 
     accepted_chains = ["alpha", "beta", "heavy", "light", "gamma", "delta"]
@@ -348,7 +381,8 @@
         chain1.update(
             {
                 "locus": receptor_dict[row["Chain 1 Type"]],
-                "junction_aa": row["Chain 1 CDR3 Curated"],
+                "cdr3_aa": row["Chain 1 CDR3 Curated"],
+                "junction_aa": row["Chain 1 junction_aa"],
                 "junction": None,
                 "consensus_count": None,
                 "v_call": row["Chain 1 Curated V Gene"],
@@ -360,7 +394,8 @@
         chain2.update(
             {
                 "locus": receptor_dict[row["Chain 2 Type"]],
-                "junction_aa": row["Chain 2 CDR3 Curated"],
+                "cdr3_aa": row["Chain 2 CDR3 Curated"],
+                "junction_aa": row["Chain 2 junction_aa"],
                 "junction": None,
                 "consensus_count": None,
                 "v_call": row["Chain 2 Curated V Gene"],