handling long monomer names: tweaks and documentation

project-gemmi · Dec 31, 2023 · 11a71c9 · 11a71c9
1 parent 5c5d360
commit 11a71c9
Show file tree

Hide file tree

Showing 9 changed files with 77 additions and 26 deletions.
diff --git a/.gitignore b/.gitignore
@@ -47,6 +47,7 @@ docs/out.ccp4
 docs/out.cif
 docs/out.pdb
 docs/output.mtz
+docs/8xfm.cif
 
 # file stored by tools/cmp-size.py
 /.size-save.txt

diff --git a/docs/mol.rst b/docs/mol.rst
@@ -593,6 +593,8 @@ it starts in column 13 even if it has a one-letter element code:
    HETATM 6495  CAX R58 A 502      17.143 -29.934   7.180  1.00 58.54           C
    HETATM 6496 CAX3 R58 A 502      16.438 -31.175   6.663  1.00 57.68           C
 
+.. _tilde_hetnam:
+
 Columns 18-20 contain the residue name (CCD code). When the PDB ran out of
 three-character codes in 2023, it started assigning codes with 5 characters,
 which no longer fit into the PDB format. The tilde-hetnam extension addresses
@@ -1969,6 +1971,45 @@ the chain names first:
 
 In C++ this functions is in ``gemmi/assembly.hpp``.
 
+Long monomer names
+~~~~~~~~~~~~~~~~~~
+
+Five-character monomer names are new.
+Until Dec 2023, monomer names were up to 3 characters (and were often
+called three-letter codes). Therefore, not all programs support
+residue names longer than three characters. In particular, the PDB
+file format does not support them. To work around this, we introduced
+the :ref:`tilde-hetnam extension <tilde_hetnam>`.
+However, the problem is not limited to the PDB format.
+Programs using the mmCIF format may also not support 5-character codes.
+
+Therefore, gemmi provides a file format independent aliasing mechanism
+with two functions:
+
+
+* ``shorten_ccd_codes()`` replaces 5-character residue names in a structure
+  with 3-character names (aliases) where the third character is ``~``,
+
+* ``restore_full_ccd_codes()`` restores the original names.
+
+When reading a PDB file with the tilde-hetnam extension,
+the long names are restored automatically. Apart from this,
+switching between long and short names requires function calls.
+
+Internally, the mapping between old and new names is stored in
+``Structure::shortened_ccd_codes``.
+
+.. doctest::
+  :skipif: not os.path.isfile('8xfm.cif')
+
+  >>> st_8xfm = gemmi.read_structure('8xfm.cif')
+  >>> st_8xfm.shorten_ccd_codes()
+  >>> st_8xfm.shortened_ccd_codes
+  [('A1LU6', 'A1~')]
+  >>> st_8xfm.restore_full_ccd_codes()
+  >>> st_8xfm.shortened_ccd_codes
+  []
+
 Bounding box
 ~~~~~~~~~~~~
 

diff --git a/include/gemmi/model.hpp b/include/gemmi/model.hpp
@@ -87,9 +87,6 @@ enum class CalcFlag : signed char {
   NotSet=0, NoHydrogen, Determined, Calculated, Dummy
 };
 
-/// helper type used for Structure::shortened_ccd_codes
-struct OldToNew { std::string old, new_; };
-
 /// options affecting how pdb file is read
 struct PdbReadOptions {
   int max_line_length = 0;
@@ -946,7 +943,7 @@ struct Structure {
   /// Minimal metadata with keys being mmcif tags: _entry.id, _cell.Z_PDB, ...
   std::map<std::string, std::string> info;
   /// Mapping of long (4+) CCD codes (residue names) to PDB-compatible ones
-  std::vector<OldToNew> shortened_ccd_codes;
+  std::vector<std::pair<std::string,std::string>> shortened_ccd_codes;
   /// original REMARK records stored if the file was read from the PDB format
   std::vector<std::string> raw_remarks;
   /// simplistic resolution value from/for REMARK 2

diff --git a/include/gemmi/pdb.hpp b/include/gemmi/pdb.hpp
@@ -439,7 +439,7 @@ Structure read_pdb_from_stream(Stream&& stream, const std::string& source,
       if (len > 71 && line[70] == ' ') {
         std::string full_code = read_string(line + 71, 8);
         if (!full_code.empty())
-          st.shortened_ccd_codes.push_back({full_code, read_string(line + 11, 3)});
+          st.shortened_ccd_codes.emplace_back(full_code, read_string(line + 11, 3));
       }
 
     } else if (is_record_type(line, "DBREF")) { // DBREF or DBREF1 or DBREF2

diff --git a/include/gemmi/polyheur.hpp b/include/gemmi/polyheur.hpp
@@ -210,8 +210,8 @@ void change_ccd_code(Structure& st, const std::string& old, const std::string& n
 GEMMI_DLL void shorten_ccd_codes(Structure& st);
 
 inline void restore_full_ccd_codes(Structure& st) {
-  for (const OldToNew& item : st.shortened_ccd_codes)
-    change_ccd_code(st, item.new_, item.old);
+  for (const auto& item : st.shortened_ccd_codes)
+    change_ccd_code(st, item.second, item.first);
   st.shortened_ccd_codes.clear();
 }
 

diff --git a/prog/h.cpp b/prog/h.cpp
@@ -142,6 +142,7 @@ int GEMMI_MAIN(int argc, char **argv) {
       std::printf("Writing coordinates to %s\n", output.c_str());
     gemmi::Ofstream os(output, &std::cout);
     if (gemmi::coor_format_from_ext_gz(output) == gemmi::CoorFormat::Pdb) {
+      shorten_ccd_codes(st);
       gemmi::write_pdb(st, os.ref());
     } else {
       if (preserve_doc) {

diff --git a/python/mol.cpp b/python/mol.cpp
@@ -164,15 +164,21 @@ void add_mol(py::module& m) {
     .def("ensure_entities", &ensure_entities)
     .def("deduplicate_entities", &deduplicate_entities)
     .def("setup_entities", &setup_entities)
+    .def("remove_waters", remove_waters<Structure>)
+    .def("remove_ligands_and_waters", remove_ligands_and_waters<Structure>)
+    .def("shorten_ccd_codes", &shorten_ccd_codes)
+    .def("restore_full_ccd_codes", &restore_full_ccd_codes)
+    .def_readwrite("shortened_ccd_codes", &Structure::shortened_ccd_codes)
+
+    // modify.hpp
     .def("remove_alternative_conformations",
          remove_alternative_conformations<Structure>)
     .def("remove_hydrogens", remove_hydrogens<Structure>)
-    .def("remove_waters", remove_waters<Structure>)
-    .def("remove_ligands_and_waters", remove_ligands_and_waters<Structure>)
     .def("store_deuterium_as_fraction", &store_deuterium_as_fraction)
     .def("standardize_crystal_frame", &standardize_crystal_frame)
     .def("assign_serial_numbers", (void (*)(Structure&, bool)) &assign_serial_numbers,
          py::arg("numbered_ter")=false)
+    // assembly.hpp
     .def("shorten_chain_names", &shorten_chain_names)
     .def("expand_ncs", &expand_ncs, py::arg("how"), py::arg("merge_dist")=0.2)
     .def("transform_to_assembly",
@@ -182,8 +188,10 @@ void add_mol(py::module& m) {
                                      keep_spacegroup, merge_dist);
     }, py::arg("assembly_name"), py::arg("how"), py::arg("keep_spacegroup")=false,
        py::arg("merge_dist")=0.2)
+    // calculate.hpp
     .def("calculate_box", &calculate_box, py::arg("margin")=0.)
     .def("calculate_fractional_box", &calculate_fractional_box, py::arg("margin")=0.)
+
     .def("clone", [](const Structure& self) { return new Structure(self); })
     .def("__repr__", [](const Structure& self) {
         return cat("<gemmi.Structure ", self.name, " with ",

diff --git a/src/polyheur.cpp b/src/polyheur.cpp
@@ -318,40 +318,43 @@ void change_ccd_code(Structure& st, const std::string& old, const std::string& n
     }
 }
 
+template <size_t I, typename T1, typename T2>
+bool in_vector_at(T1& x, std::vector<T2>& v) {
+  for (const auto& el : v)
+    if (std::get<I>(el) == x)
+      return true;
+  return false;
+}
+
 void shorten_ccd_codes(Structure& st) {
   // find all long residue names in both models and sequences
-  auto already_in = [&](const std::string& name) -> bool {
-    return in_vector_f([&](const OldToNew& x) { return x.old == name; },
-                       st.shortened_ccd_codes);
-  };
   for (Model& model : st.models)
     for (Chain& chain : model.chains)
       for (Residue& res : chain.residues)
-        if (res.name.size() > 3 && !already_in(res.name))
-          st.shortened_ccd_codes.push_back({res.name, {}});
+        if (res.name.size() > 3 && !in_vector_at<0>(res.name, st.shortened_ccd_codes))
+          st.shortened_ccd_codes.emplace_back(res.name, "");
   for (const Entity& ent : st.entities)
     for (const std::string& mon_ids : ent.full_sequence) {
       for (size_t start = 0;;) {
         size_t end = mon_ids.find(',', start);
         size_t len = std::min(end, mon_ids.size()) - start;
         if (len > 3) {
           std::string s(mon_ids, start, len);
-          if (!already_in(s))
-            st.shortened_ccd_codes.push_back({s, {}});
+          if (!in_vector_at<0>(s, st.shortened_ccd_codes))
+            st.shortened_ccd_codes.emplace_back(s, "");
         }
         if (end == std::string::npos)
           break;
         start = end + 1;
       }
     }
   // pick a new residue name and call change_ccd_code()
-  for (OldToNew& item : st.shortened_ccd_codes) {
-    std::string short_code = item.old.substr(0, 3);
-    short_code[2] = '~';
+  for (auto& old_new : st.shortened_ccd_codes) {
+    const std::string& old = old_new.first;
+    char short_code[4] = {old[0], old[1], '~', '\0'};
     // if short_code it's already used, try X[0-9]~ and then [0-9][0-9]~
     char c0 = '0', c1 = '0';
-    while (in_vector_f([&](const OldToNew& x) { return x.new_ == short_code; },
-                       st.shortened_ccd_codes)) {
+    while (in_vector_at<1>(short_code, st.shortened_ccd_codes)) {
       short_code[1] = c1++;
       if (c1 > '9') {
         short_code[0] = c0++;
@@ -360,8 +363,8 @@ void shorten_ccd_codes(Structure& st) {
           break;
       }
     }
-    item.new_ = short_code;
-    change_ccd_code(st, item.old, item.new_);
+    old_new.second = short_code;
+    change_ccd_code(st, old_new.first, old_new.second);
   }
 }
 

diff --git a/src/to_pdb.cpp b/src/to_pdb.cpp
@@ -456,8 +456,8 @@ void write_pdb(const Structure& st, std::ostream& os, PdbWriteOptions opt) {
     }
 
   // HETNAM - but it's used only for tilde-hetnam extension
-  for (const OldToNew& mapping : st.shortened_ccd_codes)
-    WRITE("HETNAM     %3s %55s %-9s\n", mapping.new_.c_str(), "", mapping.old.c_str());
+  for (const auto& mapping : st.shortened_ccd_codes)
+    WRITE("HETNAM     %3s %55s %-9s\n", mapping.second.c_str(), "", mapping.first.c_str());
 
   // HELIX
   if (!opt.minimal_file && !st.helices.empty()) {