change the tilde-hetnam renaming: ABCDE -> ~DE instead of AB~

Because the last two characters are more likely to vary. If they are not unique, use ~00, ~01, ...
project-gemmi · Jan 11, 2024 · 0a1a488 · 0a1a488
1 parent 7607a3c
commit 0a1a488
Show file tree

Hide file tree

Showing 3 changed files with 20 additions and 18 deletions.
diff --git a/docs/mol.rst b/docs/mol.rst
@@ -598,8 +598,8 @@ it starts in column 13 even if it has a one-letter element code:
 Columns 18-20 contain the residue name (CCD code). When the PDB ran out of
 three-character codes in 2023, it started assigning codes with 5 characters,
 which no longer fit into the PDB format. The tilde-hetnam extension addresses
-this issue: long CCD code is substituted with 3 characters,
-of which the last one is a tilde (``~``);
+this issue: long CCD code is substituted with a 3-character alias
+that starts with a tilde (``~``);
 the original code is stored in columns 72-79 of the HETNAM record.
 
 Columns 23-27 contain a sequence ID. It consists of a number (columns 23-26)
@@ -1988,7 +1988,7 @@ with two functions:
 
 
 * ``shorten_ccd_codes()`` replaces 5-character residue names in a structure
-  with 3-character names (aliases) where the third character is ``~``,
+  with 3-character names (aliases) that start with ``~``,
 
 * ``restore_full_ccd_codes()`` restores the original names.
 
@@ -2010,7 +2010,7 @@ Internally, the mapping between names is stored in
   >>> st_8xfm = gemmi.read_structure('8xfm.cif')
   >>> st_8xfm.shorten_ccd_codes()
   >>> st_8xfm.shortened_ccd_codes
-  [('A1LU6', 'A1~')]
+  [('A1LU6', '~U6')]
   >>> st_8xfm.restore_full_ccd_codes()
   >>> st_8xfm.shortened_ccd_codes
   []

diff --git a/src/mmcif.cpp b/src/mmcif.cpp
@@ -901,7 +901,7 @@ Structure make_structure_from_block(const cif::Block& block_) {
     for (auto row : chem_comp_table) {
       std::string alias = row.str(0);
       std::string long_id = row.str(1);
-      if (!alias.empty() && !long_id.empty() && alias != long_id && alias.back() == '~')
+      if (alias[0] == '~' && long_id[0] != '~' && long_id[0] != '\0')
         st.shortened_ccd_codes.emplace_back(long_id, alias);
     }
     restore_full_ccd_codes(st);

diff --git a/src/polyheur.cpp b/src/polyheur.cpp
@@ -348,22 +348,24 @@ void shorten_ccd_codes(Structure& st) {
         start = end + 1;
       }
     }
-  // pick a new residue name and call change_ccd_code()
+  // the first try on renaming: ABCDE -> ~DE
   for (auto& old_new : st.shortened_ccd_codes) {
     const std::string& old = old_new.first;
-    char short_code[4] = {old[0], old[1], '~', '\0'};
-    // if short_code it's already used, try X[0-9]~ and then [0-9][0-9]~
-    char c0 = '0', c1 = '0';
-    while (in_vector_at<1>(short_code, st.shortened_ccd_codes)) {
-      short_code[1] = c1++;
-      if (c1 > '9') {
-        short_code[0] = c0++;
-        c1 = '0';
-        if (c0 > 'Z')  // shouldn't happen
-          break;
-      }
+    char short_code[4] = {'~', *(old.end()-2), *(old.end()-1), '\0'};
+    if (!in_vector_at<1>(short_code, st.shortened_ccd_codes))
+      old_new.second = short_code;
+  }
+  // pick a new residue name and call change_ccd_code()
+  int i = -1;
+  for (auto& old_new : st.shortened_ccd_codes) {
+    // If ~DE was not unique, use ~00, ~01, ...
+    // After ~99, the middle character will be punctuation or letter.
+    // After ~Z9 (430+ names), we give up and the names will be empty.
+    while (old_new.second.empty() && ++i < 'Z'*10) {
+      char short_code[4] = {'~', char('0' + i/10), char('0' + i%10), '\0'};
+      if (!in_vector_at<1>(short_code, st.shortened_ccd_codes))
+        old_new.second = short_code;
     }
-    old_new.second = short_code;
     change_ccd_code(st, old_new.first, old_new.second);
   }
 }