Skip to content

Commit

Permalink
handling long monomer names: tweaks and documentation
Browse files Browse the repository at this point in the history
  • Loading branch information
wojdyr committed Dec 31, 2023
1 parent 5c5d360 commit 11a71c9
Show file tree
Hide file tree
Showing 9 changed files with 77 additions and 26 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@ docs/out.ccp4
docs/out.cif
docs/out.pdb
docs/output.mtz
docs/8xfm.cif

# file stored by tools/cmp-size.py
/.size-save.txt
Expand Down
41 changes: 41 additions & 0 deletions docs/mol.rst
Original file line number Diff line number Diff line change
Expand Up @@ -593,6 +593,8 @@ it starts in column 13 even if it has a one-letter element code:
HETATM 6495 CAX R58 A 502 17.143 -29.934 7.180 1.00 58.54 C
HETATM 6496 CAX3 R58 A 502 16.438 -31.175 6.663 1.00 57.68 C
.. _tilde_hetnam:

Columns 18-20 contain the residue name (CCD code). When the PDB ran out of
three-character codes in 2023, it started assigning codes with 5 characters,
which no longer fit into the PDB format. The tilde-hetnam extension addresses
Expand Down Expand Up @@ -1969,6 +1971,45 @@ the chain names first:

In C++ this functions is in ``gemmi/assembly.hpp``.

Long monomer names
~~~~~~~~~~~~~~~~~~

Five-character monomer names are new.
Until Dec 2023, monomer names were up to 3 characters (and were often
called three-letter codes). Therefore, not all programs support
residue names longer than three characters. In particular, the PDB
file format does not support them. To work around this, we introduced
the :ref:`tilde-hetnam extension <tilde_hetnam>`.
However, the problem is not limited to the PDB format.
Programs using the mmCIF format may also not support 5-character codes.

Therefore, gemmi provides a file format independent aliasing mechanism
with two functions:


* ``shorten_ccd_codes()`` replaces 5-character residue names in a structure
with 3-character names (aliases) where the third character is ``~``,

* ``restore_full_ccd_codes()`` restores the original names.

When reading a PDB file with the tilde-hetnam extension,
the long names are restored automatically. Apart from this,
switching between long and short names requires function calls.

Internally, the mapping between old and new names is stored in
``Structure::shortened_ccd_codes``.

.. doctest::
:skipif: not os.path.isfile('8xfm.cif')

>>> st_8xfm = gemmi.read_structure('8xfm.cif')
>>> st_8xfm.shorten_ccd_codes()
>>> st_8xfm.shortened_ccd_codes
[('A1LU6', 'A1~')]
>>> st_8xfm.restore_full_ccd_codes()
>>> st_8xfm.shortened_ccd_codes
[]

Bounding box
~~~~~~~~~~~~

Expand Down
5 changes: 1 addition & 4 deletions include/gemmi/model.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -87,9 +87,6 @@ enum class CalcFlag : signed char {
NotSet=0, NoHydrogen, Determined, Calculated, Dummy
};

/// helper type used for Structure::shortened_ccd_codes
struct OldToNew { std::string old, new_; };

/// options affecting how pdb file is read
struct PdbReadOptions {
int max_line_length = 0;
Expand Down Expand Up @@ -946,7 +943,7 @@ struct Structure {
/// Minimal metadata with keys being mmcif tags: _entry.id, _cell.Z_PDB, ...
std::map<std::string, std::string> info;
/// Mapping of long (4+) CCD codes (residue names) to PDB-compatible ones
std::vector<OldToNew> shortened_ccd_codes;
std::vector<std::pair<std::string,std::string>> shortened_ccd_codes;
/// original REMARK records stored if the file was read from the PDB format
std::vector<std::string> raw_remarks;
/// simplistic resolution value from/for REMARK 2
Expand Down
2 changes: 1 addition & 1 deletion include/gemmi/pdb.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -439,7 +439,7 @@ Structure read_pdb_from_stream(Stream&& stream, const std::string& source,
if (len > 71 && line[70] == ' ') {
std::string full_code = read_string(line + 71, 8);
if (!full_code.empty())
st.shortened_ccd_codes.push_back({full_code, read_string(line + 11, 3)});
st.shortened_ccd_codes.emplace_back(full_code, read_string(line + 11, 3));
}

} else if (is_record_type(line, "DBREF")) { // DBREF or DBREF1 or DBREF2
Expand Down
4 changes: 2 additions & 2 deletions include/gemmi/polyheur.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -210,8 +210,8 @@ void change_ccd_code(Structure& st, const std::string& old, const std::string& n
GEMMI_DLL void shorten_ccd_codes(Structure& st);

inline void restore_full_ccd_codes(Structure& st) {
for (const OldToNew& item : st.shortened_ccd_codes)
change_ccd_code(st, item.new_, item.old);
for (const auto& item : st.shortened_ccd_codes)
change_ccd_code(st, item.second, item.first);
st.shortened_ccd_codes.clear();
}

Expand Down
1 change: 1 addition & 0 deletions prog/h.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -142,6 +142,7 @@ int GEMMI_MAIN(int argc, char **argv) {
std::printf("Writing coordinates to %s\n", output.c_str());
gemmi::Ofstream os(output, &std::cout);
if (gemmi::coor_format_from_ext_gz(output) == gemmi::CoorFormat::Pdb) {
shorten_ccd_codes(st);
gemmi::write_pdb(st, os.ref());
} else {
if (preserve_doc) {
Expand Down
12 changes: 10 additions & 2 deletions python/mol.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -164,15 +164,21 @@ void add_mol(py::module& m) {
.def("ensure_entities", &ensure_entities)
.def("deduplicate_entities", &deduplicate_entities)
.def("setup_entities", &setup_entities)
.def("remove_waters", remove_waters<Structure>)
.def("remove_ligands_and_waters", remove_ligands_and_waters<Structure>)
.def("shorten_ccd_codes", &shorten_ccd_codes)
.def("restore_full_ccd_codes", &restore_full_ccd_codes)
.def_readwrite("shortened_ccd_codes", &Structure::shortened_ccd_codes)

// modify.hpp
.def("remove_alternative_conformations",
remove_alternative_conformations<Structure>)
.def("remove_hydrogens", remove_hydrogens<Structure>)
.def("remove_waters", remove_waters<Structure>)
.def("remove_ligands_and_waters", remove_ligands_and_waters<Structure>)
.def("store_deuterium_as_fraction", &store_deuterium_as_fraction)
.def("standardize_crystal_frame", &standardize_crystal_frame)
.def("assign_serial_numbers", (void (*)(Structure&, bool)) &assign_serial_numbers,
py::arg("numbered_ter")=false)
// assembly.hpp
.def("shorten_chain_names", &shorten_chain_names)
.def("expand_ncs", &expand_ncs, py::arg("how"), py::arg("merge_dist")=0.2)
.def("transform_to_assembly",
Expand All @@ -182,8 +188,10 @@ void add_mol(py::module& m) {
keep_spacegroup, merge_dist);
}, py::arg("assembly_name"), py::arg("how"), py::arg("keep_spacegroup")=false,
py::arg("merge_dist")=0.2)
// calculate.hpp
.def("calculate_box", &calculate_box, py::arg("margin")=0.)
.def("calculate_fractional_box", &calculate_fractional_box, py::arg("margin")=0.)

.def("clone", [](const Structure& self) { return new Structure(self); })
.def("__repr__", [](const Structure& self) {
return cat("<gemmi.Structure ", self.name, " with ",
Expand Down
33 changes: 18 additions & 15 deletions src/polyheur.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -318,40 +318,43 @@ void change_ccd_code(Structure& st, const std::string& old, const std::string& n
}
}

template <size_t I, typename T1, typename T2>
bool in_vector_at(T1& x, std::vector<T2>& v) {
for (const auto& el : v)
if (std::get<I>(el) == x)
return true;
return false;
}

void shorten_ccd_codes(Structure& st) {
// find all long residue names in both models and sequences
auto already_in = [&](const std::string& name) -> bool {
return in_vector_f([&](const OldToNew& x) { return x.old == name; },
st.shortened_ccd_codes);
};
for (Model& model : st.models)
for (Chain& chain : model.chains)
for (Residue& res : chain.residues)
if (res.name.size() > 3 && !already_in(res.name))
st.shortened_ccd_codes.push_back({res.name, {}});
if (res.name.size() > 3 && !in_vector_at<0>(res.name, st.shortened_ccd_codes))
st.shortened_ccd_codes.emplace_back(res.name, "");
for (const Entity& ent : st.entities)
for (const std::string& mon_ids : ent.full_sequence) {
for (size_t start = 0;;) {
size_t end = mon_ids.find(',', start);
size_t len = std::min(end, mon_ids.size()) - start;
if (len > 3) {
std::string s(mon_ids, start, len);
if (!already_in(s))
st.shortened_ccd_codes.push_back({s, {}});
if (!in_vector_at<0>(s, st.shortened_ccd_codes))
st.shortened_ccd_codes.emplace_back(s, "");
}
if (end == std::string::npos)
break;
start = end + 1;
}
}
// pick a new residue name and call change_ccd_code()
for (OldToNew& item : st.shortened_ccd_codes) {
std::string short_code = item.old.substr(0, 3);
short_code[2] = '~';
for (auto& old_new : st.shortened_ccd_codes) {
const std::string& old = old_new.first;
char short_code[4] = {old[0], old[1], '~', '\0'};
// if short_code it's already used, try X[0-9]~ and then [0-9][0-9]~
char c0 = '0', c1 = '0';
while (in_vector_f([&](const OldToNew& x) { return x.new_ == short_code; },
st.shortened_ccd_codes)) {
while (in_vector_at<1>(short_code, st.shortened_ccd_codes)) {
short_code[1] = c1++;
if (c1 > '9') {
short_code[0] = c0++;
Expand All @@ -360,8 +363,8 @@ void shorten_ccd_codes(Structure& st) {
break;
}
}
item.new_ = short_code;
change_ccd_code(st, item.old, item.new_);
old_new.second = short_code;
change_ccd_code(st, old_new.first, old_new.second);
}
}

Expand Down
4 changes: 2 additions & 2 deletions src/to_pdb.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -456,8 +456,8 @@ void write_pdb(const Structure& st, std::ostream& os, PdbWriteOptions opt) {
}

// HETNAM - but it's used only for tilde-hetnam extension
for (const OldToNew& mapping : st.shortened_ccd_codes)
WRITE("HETNAM %3s %55s %-9s\n", mapping.new_.c_str(), "", mapping.old.c_str());
for (const auto& mapping : st.shortened_ccd_codes)
WRITE("HETNAM %3s %55s %-9s\n", mapping.second.c_str(), "", mapping.first.c_str());

// HELIX
if (!opt.minimal_file && !st.helices.empty()) {
Expand Down

0 comments on commit 11a71c9

Please sign in to comment.