From 00a1940377463072e30357bc76e4203c5d7ca8cd Mon Sep 17 00:00:00 2001 From: Marcin Wojdyr Date: Thu, 16 May 2024 20:47:51 +0200 Subject: [PATCH] to_mmcif: write _entity_poly_seq.hetero; it writes y/n if the sequence was read from mmCIF _entity_poly_seq, but unknown (?) if it's based on PDB SEQRES. I haven't tested the output on PDB deposition yet. --- include/gemmi/metadata.hpp | 2 ++ src/mmcif.cpp | 4 +++- src/to_mmcif.cpp | 14 ++++++++------ 3 files changed, 13 insertions(+), 7 deletions(-) diff --git a/include/gemmi/metadata.hpp b/include/gemmi/metadata.hpp index 52b1873e..665b4f43 100644 --- a/include/gemmi/metadata.hpp +++ b/include/gemmi/metadata.hpp @@ -234,6 +234,8 @@ struct Entity { std::vector subchains; EntityType entity_type = EntityType::Unknown; PolymerType polymer_type = PolymerType::Unknown; + // In case of microheterogeneity, PDB SEQRES has only the first residue name. + bool reflects_microhetero = false; std::vector dbrefs; /// List of SIFTS Uniprot ACs referenced by SiftsUnpResidue::acc_index std::vector sifts_unp_acc; diff --git a/src/mmcif.cpp b/src/mmcif.cpp index 249560bd..e09c1040 100644 --- a/src/mmcif.cpp +++ b/src/mmcif.cpp @@ -814,6 +814,8 @@ Structure make_structure_from_block(const cif::Block& block_) { ent.polymer_type = polymer_type_from_string(poly_type); } catch (std::runtime_error&) {} } + // _entity_poly_seq is supposed to reflect heterogeneities in _atom_site. 
+ ent.reflects_microhetero = true; st.entities.push_back(ent); } @@ -825,7 +827,7 @@ Structure make_structure_from_block(const cif::Block& block_) { if (pos == (int) ent->full_sequence.size()) ent->full_sequence.push_back(row.str(2)); else if (pos >= 0 && pos < (int) ent->full_sequence.size()) - ent->full_sequence[pos] += "," + row.str(2); + cat_to(ent->full_sequence[pos], ',', row.str(2)); } cif::Table struct_ref = block.find("_struct_ref.", diff --git a/src/to_mmcif.cpp b/src/to_mmcif.cpp index 16936e6c..00d7825c 100644 --- a/src/to_mmcif.cpp +++ b/src/to_mmcif.cpp @@ -1106,23 +1106,25 @@ void update_mmcif_block(const Structure& st, cif::Block& block, MmcifOutputGroup } if (groups.entity_poly_seq) { - // SEQRES from PDB doesn't record microheterogeneity, so if the resulting - // cif has unknown("?") _entity_poly_seq.num, it cannot be trusted. cif::Loop& poly_loop = block.init_mmcif_loop("_entity_poly_seq.", - {"entity_id", "num", "mon_id"}); + {"entity_id", "num", "mon_id", "hetero"}); for (const Entity& ent : st.entities) - if (ent.entity_type == EntityType::Polymer) + if (ent.entity_type == EntityType::Polymer) { + // SEQRES from PDB doesn't record microheterogeneity. + std::string hetero_no = ent.reflects_microhetero ? "n" : "?"; for (size_t i = 0; i != ent.full_sequence.size(); ++i) { const std::string& mon_ids = ent.full_sequence[i]; std::string num = std::to_string(i+1); size_t start = 0, end; while ((end = mon_ids.find(',', start)) != std::string::npos) { poly_loop.add_row({qchain(ent.name), num, - mon_ids.substr(start, end-start)}); + mon_ids.substr(start, end-start), "y"}); start = end + 1; } - poly_loop.add_row({qchain(ent.name), num, mon_ids.substr(start)}); + poly_loop.add_row({qchain(ent.name), num, mon_ids.substr(start), + start == 0 ? hetero_no : "y"}); } + } } if (groups.atoms)