/src/openbabel/src/canon.cpp
Line | Count | Source |
1 | | /********************************************************************** |
2 | | canon.h - Canonical labeling. |
3 | | |
4 | | Copyright (C) 2009-2010 by Tim Vandermeersch |
5 | | Copyright (C) 2005-2006, eMolecules, Inc. (www.emolecules.com) |
6 | | Craig A. James |
7 | | |
8 | | This file is part of the Open Babel project. |
9 | | For more information, see <http://openbabel.org/> |
10 | | |
11 | | This program is free software; you can redistribute it and/or modify |
12 | | it under the terms of the GNU General Public License as published by |
13 | | the Free Software Foundation version 2 of the License. |
14 | | |
15 | | This program is distributed in the hope that it will be useful, |
16 | | but WITHOUT ANY WARRANTY; without even the implied warranty of |
17 | | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
18 | | GNU General Public License for more details. |
19 | | ***********************************************************************/ |
20 | | |
21 | | #include <openbabel/canon.h> |
22 | | #include <openbabel/graphsym.h> |
23 | | #include <openbabel/babelconfig.h> |
24 | | #include <openbabel/mol.h> |
25 | | #include <openbabel/atom.h> |
26 | | #include <openbabel/bond.h> |
27 | | #include <openbabel/obiter.h> |
28 | | #include <openbabel/obutil.h> |
29 | | #include <openbabel/elements.h> |
30 | | |
31 | | #include <openbabel/stereo/cistrans.h> |
32 | | #include <openbabel/stereo/tetrahedral.h> |
33 | | |
34 | | #include <iterator> // std::istream_iterator |
35 | | #include <cassert> |
36 | | #include <algorithm> |
37 | | #include <cmath> |
38 | | #include <limits> |
39 | | |
40 | | #include "stereo/stereoutil.h" |
41 | | |
42 | 0 | #define DEBUG 0 |
43 | | |
44 | 0 | #define MAX_IDENTITY_NODES 50 |
45 | | |
46 | | using namespace std; |
47 | | |
48 | | // debug function |
49 | | template<typename T> |
50 | | void print_vector(const std::string &label, const std::vector<T> &v) |
51 | | { |
52 | | std::cout << label << ": "; |
53 | | for (std::size_t i = 0; i < v.size(); ++i) |
54 | | if (v[i] < 10) |
55 | | std::cout << " " << v[i] << " "; |
56 | | else |
57 | | std::cout << v[i] << " "; |
58 | | |
59 | | std::cout << endl; |
60 | | } |
61 | | |
62 | | |
63 | | namespace OpenBabel { |
64 | | |
65 | | static unsigned int TotalHydrogenCount(OBAtom* atom) |
66 | 0 | { |
67 | 0 | return atom->ExplicitHydrogenCount() + atom->GetImplicitHCount(); |
68 | 0 | } |
69 | | |
70 | | inline bool CompareBondPairSecond(const std::pair<OBBond*,unsigned int> &a,const std::pair<OBBond*,unsigned int> &b) |
71 | 0 | { |
72 | 0 | return(a.second < b.second); |
73 | 0 | } |
74 | | |
75 | | /** |
76 | | * Helper function for getFragment below. |
77 | | */ |
78 | | void addNbrs(OBBitVec &fragment, OBAtom *atom, const OBBitVec &mask, const std::vector<OBBond*> &metalloceneBonds) |
79 | 0 | { |
80 | 0 | FOR_NBORS_OF_ATOM (nbr, atom) { |
81 | 0 | if (!mask.BitIsSet(nbr->GetIdx())) |
82 | 0 | continue; |
83 | | // skip visited atoms |
84 | 0 | if (fragment.BitIsSet(nbr->GetIdx())) |
85 | 0 | continue; |
86 | | // skip mettalocene bonds |
87 | 0 | if (std::find(metalloceneBonds.begin(), metalloceneBonds.end(), |
88 | 0 | atom->GetParent()->GetBond(atom, &*nbr)) != metalloceneBonds.end()) |
89 | 0 | continue; |
90 | | // add the neighbor atom to the fragment |
91 | 0 | fragment.SetBitOn(nbr->GetIdx()); |
92 | | // recurse... |
93 | 0 | addNbrs(fragment, &*nbr, mask, metalloceneBonds); |
94 | 0 | } |
95 | 0 | } |
96 | | |
97 | | /** |
98 | | * Create an OBBitVec objects with bets set for the fragment consisting of all |
99 | | * atoms for which there is a path to atom without going through skip. These |
100 | | * fragment bitvecs are indexed by atom idx (i.e. OBAtom::GetIdx()). |
101 | | */ |
102 | | OBBitVec getFragment(OBAtom *atom, const OBBitVec &mask, const std::vector<OBBond*> &metalloceneBonds = std::vector<OBBond*>()) |
103 | 0 | { |
104 | 0 | OBBitVec fragment; |
105 | 0 | fragment.SetBitOn(atom->GetIdx()); |
106 | | // start the recursion |
107 | 0 | addNbrs(fragment, atom, mask, metalloceneBonds); |
108 | 0 | return fragment; |
109 | 0 | } |
110 | | |
111 | | OBBitVec getFragment(OBAtom *atom, OBAtom *skip, const OBBitVec &mask) |
112 | 0 | { |
113 | 0 | struct getFragmentImpl |
114 | 0 | { |
115 | 0 | static void addNbrs(OBBitVec &fragment, OBAtom *atom, OBAtom *skip, const OBBitVec &mask) |
116 | 0 | { |
117 | 0 | FOR_NBORS_OF_ATOM (nbr, atom) { |
118 | | // don't pass through skip |
119 | 0 | if (nbr->GetIdx() == skip->GetIdx()) |
120 | 0 | continue; |
121 | | // skip visited atoms |
122 | 0 | if (fragment.BitIsSet(nbr->GetIdx())) |
123 | 0 | continue; |
124 | 0 | if (!mask.BitIsSet(nbr->GetIdx())) |
125 | 0 | continue; |
126 | | // add the neighbor atom to the fragment |
127 | 0 | fragment.SetBitOn(nbr->GetIdx()); |
128 | | // recurse... |
129 | 0 | addNbrs(fragment, &*nbr, skip, mask); |
130 | 0 | } |
131 | 0 | } |
132 | 0 | }; |
133 | |
|
134 | 0 | OBBitVec fragment; |
135 | 0 | fragment.SetBitOn(atom->GetIdx()); |
136 | | // start the recursion |
137 | 0 | getFragmentImpl::addNbrs(fragment, atom, skip, mask); |
138 | 0 | return fragment; |
139 | 0 | } |
140 | | |
141 | | |
142 | | |
143 | | bool isFerroceneBond(OBBond *bond) |
144 | 0 | { |
145 | 0 | if (bond->GetBondOrder() != 1) |
146 | 0 | return false; |
147 | | |
148 | 0 | OBAtom *Fe = nullptr, *C = nullptr; |
149 | |
|
150 | 0 | OBAtom *begin = bond->GetBeginAtom(); |
151 | 0 | if (begin->GetAtomicNum() == 26) |
152 | 0 | Fe = begin; |
153 | 0 | if (begin->GetAtomicNum() == 6) |
154 | 0 | C = begin; |
155 | |
|
156 | 0 | OBAtom *end = bond->GetEndAtom(); |
157 | 0 | if (end->GetAtomicNum() == 26) |
158 | 0 | Fe = end; |
159 | 0 | if (end->GetAtomicNum() == 6) |
160 | 0 | C = end; |
161 | |
|
162 | 0 | if (!Fe || !C) |
163 | 0 | return false; |
164 | | |
165 | 0 | if (Fe->GetExplicitDegree() < 10) |
166 | 0 | return false; |
167 | | |
168 | 0 | return C->HasDoubleBond() && C->IsInRing(); |
169 | 0 | } |
170 | | |
171 | | void findMetalloceneBonds(std::vector<OBBond*> &bonds, OBMol *mol, const std::vector<unsigned int> &symmetry_classes) |
172 | 0 | { |
173 | 0 | FOR_ATOMS_OF_MOL (atom, mol) { |
174 | 0 | if (!atom->IsInRingSize(3)) |
175 | 0 | continue; |
176 | 0 | std::vector<unsigned int> nbrSymClasses; |
177 | 0 | FOR_NBORS_OF_ATOM (nbr, &*atom) { |
178 | 0 | if (nbr->IsInRingSize(3)) |
179 | 0 | nbrSymClasses.push_back(symmetry_classes[nbr->GetIndex()]); |
180 | 0 | } |
181 | |
|
182 | 0 | if (nbrSymClasses.size() < 8) |
183 | 0 | continue; |
184 | | |
185 | 0 | std::sort(nbrSymClasses.begin(), nbrSymClasses.end()); |
186 | 0 | unsigned int numUnique = std::unique(nbrSymClasses.begin(), nbrSymClasses.end()) - nbrSymClasses.begin(); |
187 | 0 | if (numUnique > 1) |
188 | 0 | continue; |
189 | | |
190 | 0 | FOR_NBORS_OF_ATOM (nbr, &*atom) |
191 | 0 | bonds.push_back(mol->GetBond(&*atom, &*nbr)); |
192 | 0 | } |
193 | 0 | } |
194 | | |
195 | | |
196 | | /** |
197 | | * @page canonical_code_algorithm Canonical Coding Algorithm |
198 | | * |
199 | | * @section canonical_introduction Introduction |
200 | | * The aim of a canonical coding algorithm is to assign unique labels to |
201 | | * atoms regardless of their input order. In practise, these labels or orders |
202 | | * are atom indexes resulting from the order in which the atoms are defined |
203 | | * in an input file. Although most chemical file formats could be used to |
204 | | * store canonical ordered molecules, canonical single line notations are used |
205 | | * more often since they allow two canonical molecules to be compared using a |
206 | | * string comparison. Two well known examples are canonical smiles and inchi. |
207 | | * While the canonical smiles for the same molecule should always be the same |
208 | | * when using a specific implementation (i.e. toolkits), these canonical |
209 | | * smiles are usually not transferable between implementations. There is only |
210 | | * one inchi implementation and the canonical code, although not formally |
211 | | * specified is always the same. A typical use case for canonical codes is to |
212 | | * determine if a molecule is already in a database. |
213 | | * |
214 | | * The rest of this page documents the OpenBabel canonical coding algorithm. |
215 | | * |
216 | | * @section Topological Symmetry |
217 | | * The starting point of the canonical coding algorithm is the topological |
218 | | * symmetry or graph symmetry. This symmetry can be expressed by assigning |
219 | | * symmetry classes to atoms. Symmetric atoms have the same symmetry class. |
220 | | * The term color is often used in graph theory as a synonym for symmetry |
221 | | * classes. |
222 | | * |
223 | | * Symmetry classes are used as graph invariants to order atoms. The symmetry |
224 | | * classes can be used in two ways: |
225 | | * |
226 | | * - lowest symmetry class = lowest label (i.e. 1) |
227 | | * - highest symmetry class = lowest label |
228 | | * |
229 | | * Both methods are valid. OpenBabel uses the second method. In the following |
230 | | * sections there are more of these decisions that have to be made. The first, |
231 | | * choice is used as example to illustrate that it doesn't make any real |
232 | | * difference as long as the same choice is used every time. When trying to |
233 | | * reproduce OpenBabel canonical smiles the same choices or conventions have |
234 | | * to be used. |
235 | | * |
236 | | * The used initial graph invariant is specified below and the iteration |
237 | | * algorithm is similar to the original Morgan algorithm. The OBGraphSym class |
238 | | * should be consulted for a detailed description. |
239 | | @verbatim |
240 | | 0 1 2 3 |
241 | | 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 |
242 | | GI = D D D D D D D D D D D V V V V A R N N N N N N N B B B B C C C C |
243 | | |
244 | | GI: Graph invariant (32 bits) |
245 | | D: Graph theoretical distance (10 bits) |
246 | | V: Heavy valence (4 bits) |
247 | | A: Aromaticity (1 bit) |
248 | | R: Ring atom (1 bit) |
249 | | N: Atomic number (7 bits) |
250 | | B: Heavy bond sum (4 bits) |
251 | | C: Formal charge + 7 (4 bits) |
252 | | @endverbatim |
253 | | * |
254 | | * (FIXME: ugly smiles) |
255 | | * |
256 | | * The symmetry classes encode various attributes for each atom. Consult the |
257 | | * OBGraphSym documentation for details. (note: when trying to reproduce |
258 | | * OpenBabel canonical smiles, make sure these graph invariants are the same) |
259 | | * |
260 | | * @section canonical_labeling Labeling |
261 | | * The labeling starts by selecting the initial atom using a set of rules to |
262 | | * ensure it starts at terminal atoms. (FIXME) Label 1 is assigned to this atom |
263 | | * and the labeling continues using this atom as the current atom: |
264 | | * |
265 | | * -# Find the neighbor atoms of the current atom to label next. Already labeled |
266 | | * and atoms not in the fragment are ignored. If there are no such atoms, go |
267 | | * to 3. |
268 | | * -# Sort the neighbors by their symmetry classes. Each group of atoms with the |
269 | | * same symmetry class is now handled in sequence. If there is only one atom |
270 | | * in the group, this atom is labeled and the process continues with the next |
271 | | * group. If there are multiple (n) atoms in a group, all n! permutations of |
272 | | * the group are tried. (note: The Optimization section below mentions some |
273 | | * exceptions which affect the final result. These optimizations are part of |
274 | | * the algorithm) |
275 | | * -# When all neighbor atoms of the current atom are labeled, the next atom |
276 | | * (i.e. the atom with current_label+1 label) is considered the current atom |
277 | | * and step 1 through 3 are repeated. If there is no next atom, the labeling |
278 | | * is done. |
279 | | * |
280 | | * The algorithm just described results in a set of labellings that are candidates |
281 | | * for the canonical labeling. For each labeling, a code is generated to select |
282 | | * the canonical code and the associated labels. |
283 | | * |
284 | | * @section canonical_code Canonical Code |
285 | | * A canonical code can be anything encoding the molecular graph with all the |
286 | | * properties that should be canonicalized. The code should be invariant to the |
287 | | * original atom order. At this point, it may seem that this is similar to the |
288 | | * graph invariants from above (i.e. symmetry classes). However, generating these |
289 | | * codes allows for additional attributes to be included. A simple example are |
290 | | * isotopes. More importantly, stereochemistry is included in these codes which |
291 | | * is an attribute that does not depend an isolated atom. |
292 | | * |
293 | | * The actual code is a list of numbers (unsigned int) constructed by joining |
294 | | * smaller codes. These smaller codes usually encode specific attributes. |
295 | | * |
296 | | * @subsection canonical_from FROM code |
297 | | * When labeling the molecule as described earlier, all but the initial atom in |
298 | | * a connected fragment have a "from" atom (i.e. current atom in the algorithm). |
299 | | * The FROM code contains the label of the "from" atom, for each atom (ignoring |
300 | | * the initial atom) ordered by their assigned label. The FROM code is a |
301 | | * spanning tree containing all bonds that are not ring closures. |
302 | | * |
303 | | * @subsection canonical_closure CLOSURE code |
304 | | * The ring closure bonds are easily identified while creating the FROM code. |
305 | | * The CLOSURE code contains two labels for each closure bond. The labels are |
306 | | * sorted in the following way: |
307 | | * |
308 | | * - [1 6] < [6 1] (i.e. 1 6 is used, individual bond sorting) |
309 | | * - [1 3][1 4] < [1 4][1 3] (i.e. 1 3 1 4 is used, sorting bonds) |
310 | | * - [1 3][5 7] < [5 7][1 3] |
311 | | * |
312 | | * The square brackets are only to indicate individual bonds, the code is just |
313 | | * a single list. |
314 | | * |
315 | | * @subsection canonical_atomtype ATOM-TYPES & ISOTOPES code |
316 | | * The ATOM-TYPES code is simply the atomic number for each atom ordered by |
317 | | * labels. If the molecule contains isotopes, an additional ISOTOPES code is |
318 | | * used to correctly canonicalize these molecules. |
319 | | * |
320 | | * @subsection canonical_bondtype BOND-TYPES code |
321 | | * The BOND-TYPES code contains, for each bond, the bond order or 5 if the |
322 | | * bond is aromatic. The bonds are ordered using the order obtained during |
323 | | * the FROM code construction. The closure bonds are last, ordered as |
324 | | * described in the CLOSURE code section. |
325 | | * |
326 | | * @subsection canonical_stereo STEREO code |
327 | | * The STEREO code contains a descriptor for each stereo center. The stereo |
328 | | * centers are ordered using the central atom(s) (i.e. center for tetrahedral |
329 | | * stereochemistry, lowest label for cistrans). |
330 | | * |
331 | | * @subsection canonical_canonical Canonical Code |
332 | | * The canonical code is the greatest code (elements are compared, the first |
333 | | * difference determines order). The complete canonical code can also be seen |
334 | | * as layers. The first layer is the FROM code together with the CLOSURE code. |
335 | | * Together these codes describe the graph without node (atom) or edge (bond) |
336 | | * properties. The ATOM-TYPES code is the layer adding node properties. The |
337 | | * optional ISOTOPES code also encodes node properties. Edge properties are |
338 | | * specified in the BOND-TYPES code. The previous codes together are enough |
339 | | * to specify the constitution of a molecule. Additional layers such as the |
340 | | * STEREO code could be ignored for certain use cases. For example, if two |
341 | | * molecules have the same canonical code up to the STEREO part, they are |
342 | | * stereoisomers. |
343 | | * |
344 | | * @section canonical_fragments Disconnected Fragments |
345 | | * Disconnected fragments are handled individually and the resulting canonical |
346 | | * codes are sorted. The final canonical code for the whole molecule, are the |
347 | | * sorted canonical codes joined together. |
348 | | * |
349 | | * @section canonical_optimization Optimizations |
350 | | * This section documents optimizations to reduce the number of states which |
351 | | * affect the final result. The aim of these optimizations is to avoid the n! |
352 | | * permutations in step 2 of the labeling algorithm. Some of the optimizations |
353 | | * affect the final result and should be implemented when trying to reproduce |
354 | | * the same canonical code. |
355 | | * |
356 | | * @subsection canonical_opt1 Optimization 1 |
357 | | * If the current atom is not in a ring, it follows that the bonds to all it's |
358 | | * neighbors are not ring bonds. It also follows that there is no path between |
359 | | * two neighbor atoms with the same symmetry class without going through the |
360 | | * current atom. Therefore, these ligands (i.e. the fragments that would be |
361 | | * created by deleting the current atom) can be labeled individually. For |
362 | | * example, for an atom with 4 neighbors with the same symmetry class, the |
363 | | * number of permutations to label is reduced from 4! * 4 = 96 to 4. The times |
364 | | * 4 is because after generating the 24 permutations, the ligands are not |
365 | | * labeled yet. |
366 | | * |
367 | | * @note This optimization <b>affects the final result</b>. |
368 | | * |
369 | | * @subsection canonical_opt2 Optimization 2 (not implemented) |
370 | | * If the bond(s) to the neighbor atom(s) with the same symmetry classes is not |
371 | | * a ring bond, optimization 1 could be used. This is the more general version |
372 | | * of optimization 1. |
373 | | * |
374 | | * @note This optimization <b>affects the final result</b> but is <b>not |
375 | | * implemented</b>. |
376 | | * |
377 | | * @subsection canonical_opt3 Optimization 3 (not implemented) |
378 | | * If the current atom is a stereo center, only permutations with the highest |
379 | | * descriptor produce a greatest canonical code. This does not change the final |
380 | | * result and can be implemented without changing canonical smiles etc. |
381 | | * |
382 | | * @subsection canonical_opt4 Other optimizations |
383 | | * In essence, the canonical coding algorithm is very similar to the algorithm |
384 | | * from the well-known nauty package [1, 2]. However, the optimizations mentioned |
385 | | * above are far easier to implement, document and result in acceptable |
386 | | * performance for most structures. This enables anyone to reproduce OpenBabel |
387 | | * canonical smiles without the absolute requirement to implement the complex |
388 | | * optimizations from nauty. Since the optimizations from the nauty paper do |
389 | | * not have any affect on the final result, these can be implemented as required. |
390 | | * |
391 | | * The OpenBabel algorithm uses the molecule graph as the main object while |
392 | | * searching for a canonical code. Nauty starts from a partition of the set |
393 | | * of nodes which is refined. This is not the essential part though. The |
394 | | * similarity between the algorithms is that both algorithms share the |
395 | | * concept of a search tree. A completed canonical candidate code is a leaf |
396 | | * node in this search tree. If a second labeling is found with the same |
397 | | * canonical code, there exists a permutation mapping the first labels to |
398 | | * the second. This permutation is an automorphism that takes all attributes |
399 | | * of the canonical code into account. The found automorphism is not only |
400 | | * an automorphism of the molecule but also an isomorphism of the search tree. |
401 | | * These accidentally discovered automorphisms can be used to reduce the |
402 | | * number of states to consider in various ways. |
403 | | * |
404 | | * @subsubsection canonical_opt5 Optimization 4 |
405 | | * |
406 | | * |
407 | | * |
408 | | * |
409 | | * |
410 | | @verbatim |
411 | | [1] Brendan D. McKay, Backtrack programming and isomorphism rejection on |
412 | | ordered subsets, ARS Combinatoria, Vol. 5, 1979, pp. 65-99 |
413 | | http://cs.anu.edu.au/~bdm/papers/backtrack1978.pdf |
414 | | [2] Brendan D. McKay, Practical graph isomorphism, Congressus Numerantium, |
415 | | Vol. 30, 1981, pp. 45-87 |
416 | | http://cs.anu.edu.au/~bdm/papers/pgi.pdf |
417 | | @endverbatim |
418 | | */ |
419 | | |
420 | | struct CanonicalLabelsImpl |
421 | | { |
422 | | typedef std::vector<OBAtom*> Orbit; |
423 | | typedef std::vector<Orbit> Orbits; |
424 | | |
425 | | static void print_orbits(const Orbits &orbits) |
426 | 0 | { |
427 | 0 | for (std::size_t j = 0; j < orbits.size(); ++j) { |
428 | 0 | cout << "( "; |
429 | 0 | for (std::size_t k = 0; k < orbits[j].size(); ++k) |
430 | 0 | cout << orbits[j][k]->GetIndex() << " "; |
431 | 0 | cout << ") "; |
432 | 0 | } |
433 | 0 | } |
434 | | static void print_orbits(const std::string &label, const Orbits &orbits) |
435 | 0 | { |
436 | 0 | cout << label << ": "; |
437 | 0 | print_orbits(orbits); |
438 | 0 | cout << endl; |
439 | 0 | } |
440 | | |
441 | | |
442 | | /** |
443 | | * Structure to represent a completed labeling and it's associated canonical |
444 | | * candidate code. |
445 | | */ |
446 | | struct FullCode |
447 | | { |
448 | | /** |
449 | | * The atom labels starting from 1. All excluded atoms have label 0. |
450 | | */ |
451 | | std::vector<unsigned int> labels; |
452 | | /** |
453 | | * The canonical candidate code resulting from the @p labels. |
454 | | */ |
455 | | std::vector<unsigned short> code; |
456 | | |
457 | | /** |
458 | | * Default constructor, used to create initial empty bestCode object. |
459 | | */ |
460 | | FullCode() |
461 | 0 | { |
462 | 0 | } |
463 | | |
464 | | /** |
465 | | * Constructor specifying both the @p labels and @p from (the FROM part |
466 | | * of the canonical code). The other parts of the canonical code are |
467 | | * added using the CompleteCode function. |
468 | | */ |
469 | | FullCode(const std::vector<unsigned int> &_labels, const std::vector<unsigned short> &from) |
470 | 0 | : labels(_labels), code(from) |
471 | 0 | { |
472 | 0 | } |
473 | | |
474 | | /** |
475 | | * Compare this object's canonical candidate code to the other object's |
476 | | * code and return true if this code is greater. This method is used to |
477 | | * select the final canonical code. In other words, the canonical labels |
478 | | * for a molecule are the labels with the greatest canonical code. |
479 | | */ |
480 | | inline bool operator>(const FullCode &other) const |
481 | 0 | { |
482 | 0 | return (code > other.code); |
483 | 0 | } |
484 | | }; |
485 | | |
486 | | /** |
487 | | * Structure to represent a partial labeling and the associated FROM part |
488 | | * of the canonical code. |
489 | | */ |
490 | | struct PartialCode |
491 | | { |
492 | | /** |
493 | | * The atoms in canonical order. An atom is added when a label is assigned |
494 | | * to it. When all atoms are labeled, this vector will contain all atoms |
495 | | * in the fragment. |
496 | | */ |
497 | | std::vector<OBAtom*> atoms; |
498 | | /** |
499 | | * The bonds in canonical order. Since the atoms are labeled from another |
500 | | * atom ("from"), these are the bonds connecting the "from" atom with the |
501 | | * next labeled atom. The FROM list forms a spanning tree that does not |
502 | | * include the ring closure bonds. These ring closure bonds are added at |
503 | | * the end of this vector, in the correct order, in the CompleteCode |
504 | | * function. |
505 | | */ |
506 | | std::vector<OBBond*> bonds; |
507 | | /** |
508 | | * The FROM part of the canonical code. This is a vector containing the |
509 | | * label of the "from" atom for the atoms in canonical order. The initial |
510 | | * atom of a bonded fragment is not included in this list since there is |
511 | | * no "from" atom. |
512 | | */ |
513 | | std::vector<unsigned short> from; |
514 | | /** |
515 | | * The atom labels starting from 1. All atoms start with label 0. |
516 | | */ |
517 | | std::vector<unsigned int> labels; |
518 | | |
519 | | /** |
520 | | * Constructor specifying the number of atoms in the molecule. The @p |
521 | | * numAtoms are used to initialize the labels vector to contain label |
522 | | * 0 for all atoms. |
523 | | */ |
524 | | PartialCode(std::size_t numAtoms) |
525 | 0 | { |
526 | 0 | labels.resize(numAtoms, 0); |
527 | 0 | } |
528 | | |
529 | | /** |
530 | | * Add an atom to atoms. Used to add the initial atom. |
531 | | */ |
532 | 0 | inline void add(OBAtom *atom) { atoms.push_back(atom); } |
533 | | /** |
534 | | * Add a bond to bonds. Used to add the ring closure bonds. |
535 | | */ |
536 | 0 | inline void add(OBBond *bond) { bonds.push_back(bond); } |
537 | | /** |
538 | | * Add an atom that is labeled "from" another atom. This function adds |
539 | | * the @p atom to atoms, adds bond between @p fromAtom and @p atom to |
540 | | * bonds and updates the @p from vector. |
541 | | */ |
542 | | inline void add(OBAtom *fromAtom, OBAtom *atom) |
543 | 0 | { |
544 | 0 | from.push_back(labels[fromAtom->GetIndex()]); |
545 | 0 | atoms.push_back(atom); |
546 | 0 | bonds.push_back(atom->GetParent()->GetBond(fromAtom, atom)); |
547 | 0 | } |
548 | | |
549 | | /** |
550 | | * Compare the @p from vector with a FullCode's code and return true if |
551 | | * this code is lower. This function is used to avoid completing a |
552 | | * labeling that will never result in a greatest canonical code. |
553 | | */ |
554 | | inline bool operator<(const FullCode &other) const |
555 | 0 | { |
556 | 0 | std::size_t numFrom = std::min(from.size(), other.code.size()); |
557 | 0 | for (std::size_t i = 0; i < numFrom; ++i) { |
558 | 0 | if (from[i] > other.code[i]) |
559 | 0 | return false; |
560 | 0 | if (from[i] < other.code[i]) |
561 | 0 | return true; |
562 | 0 | } |
563 | | |
564 | 0 | return false; |
565 | 0 | } |
566 | | }; |
567 | | |
568 | | /** |
569 | | * Structure to store and compute stereo center parities. Currently for |
570 | | * both tetrahedral and cistrans stereochemistry. Precomputing these |
571 | | * StereoCenter objects is done for performance reasons since parities |
572 | | * need to be computed for each canonical candidate labeling. |
573 | | */ |
574 | | struct StereoCenter |
575 | | { |
576 | | /** |
577 | | * Use the stored nbrIndexes to access the @p labels and compute a parity |
578 | | * or descriptor. This function returns 0 or 1 for both tetrahedral and |
579 | | * cistrans stereo centers. |
580 | | */ |
581 | | int getDescriptor(const std::vector<unsigned int> &symmetry_classes, const std::vector<unsigned int> &labels) const |
582 | 0 | { |
583 | | // Unspecified stereo centers have their own descriptor. |
584 | 0 | if (nbrIndexes1.empty()) |
585 | 0 | return 2; |
586 | 0 | std::vector<unsigned long> refs1, refs2; |
587 | 0 | for (std::size_t i = 0; i < nbrIndexes1.size(); ++i) { |
588 | 0 | if (nbrIndexes1[i] < labels.size()) |
589 | 0 | refs1.push_back(labels[nbrIndexes1[i]]); |
590 | 0 | else |
591 | 0 | refs1.push_back(nbrIndexes1[i]); |
592 | 0 | } |
593 | 0 | for (std::size_t i = 0; i < nbrIndexes2.size(); ++i) |
594 | 0 | if (nbrIndexes2[i] < labels.size()) |
595 | 0 | refs2.push_back(labels[nbrIndexes2[i]]); |
596 | 0 | else |
597 | 0 | refs2.push_back(nbrIndexes2[i]); |
598 | 0 | if (indexes.size() == 2) { |
599 | 0 | bool symOrder = symmetry_classes[indexes[0]] < symmetry_classes[indexes[1]]; |
600 | 0 | bool canOrder = labels[indexes[0]] < labels[indexes[1]]; |
601 | 0 | if (symOrder != canOrder && symmetry_classes[indexes[0]] != symmetry_classes[indexes[1]]) |
602 | 0 | std::swap(refs1[0], refs1[1]); |
603 | 0 | } |
604 | 0 | return ((OBStereo::NumInversions(refs1) % 2 + OBStereo::NumInversions(refs2) % 2) % 2); |
605 | 0 | } |
606 | | /** |
607 | | * Index(es) for the stereo center. This is used to sort the stereo |
608 | | * centers by label. Tetrahedral stereo centers have one index in this |
609 | | * vector and cistrans stereocenters have both double bond atom indexes |
610 | | * in this vector. |
611 | | */ |
612 | | std::vector<unsigned int> indexes; |
613 | | /** |
614 | | * Indexes for the stereo center neighbor atoms. Tetrahedral centers have |
615 | | * all neighbor atoms in nbrIndexes1. CisTrans stereo centers store the |
616 | | * neighbor atoms for each double bond atom separately. |
617 | | */ |
618 | | std::vector<unsigned int> nbrIndexes1, nbrIndexes2; |
619 | | }; |
620 | | |
621 | | /** |
622 | | * Sort StereoCenter objects by their label. Used in combination with |
623 | | * std::sort to create STEREO code. |
624 | | */ |
625 | | struct SortStereoCenters |
626 | | { |
627 | | const std::vector<unsigned int> &labels; |
628 | 0 | SortStereoCenters(const std::vector<unsigned int> &_labels) : labels(_labels) {} |
629 | | inline unsigned int getLabel(const StereoCenter &c) const |
630 | 0 | { |
631 | 0 | switch (c.indexes.size()) { |
632 | 0 | case 2: |
633 | 0 | return std::min(labels[c.indexes[0]], labels[c.indexes[1]]); |
634 | 0 | default: |
635 | 0 | return labels[c.indexes[0]]; |
636 | 0 | } |
637 | 0 | } |
638 | | inline bool operator()(const StereoCenter &c1, const StereoCenter &c2) const |
639 | 0 | { |
640 | 0 | return (getLabel(c1) < getLabel(c2)); |
641 | 0 | } |
642 | | }; |
643 | | |
644 | | /** |
645 | | * Sort FullCode objects by their code. Used in combination with |
646 | | * std::sort order disconnected fragments. |
647 | | */ |
648 | | static bool SortCode(const FullCode &code1, const FullCode &code2) |
649 | 0 | { |
650 | 0 | return (code1.code < code2.code); |
651 | 0 | } |
652 | | |
653 | | /** |
654 | | * Sort FullCode objects with associated int in std::pair. Used to sort |
655 | | * ligands (i.e. equivalent fragments connected to a symmetric atom). |
656 | | */ |
657 | | static bool SortCode2(const std::pair<int, FullCode> &code1, const std::pair<int, FullCode> &code2) |
658 | 0 | { |
659 | 0 | return (code1.second.code > code2.second.code); |
660 | 0 | } |
661 | | |
662 | | /** |
663 | | * Sort atoms by ascending ranks (e.g. labels). |
664 | | */ |
665 | | struct SortAtomsAscending |
666 | | { |
667 | 0 | SortAtomsAscending(const std::vector<unsigned int> &_ranks) : ranks(_ranks) {} |
668 | | const std::vector<unsigned int> &ranks; |
669 | | inline bool operator()(const OBAtom *a1, const OBAtom *a2) const |
670 | 0 | { |
671 | 0 | return ranks[a1->GetIndex()] < ranks[a2->GetIndex()]; |
672 | 0 | } |
673 | | }; |
674 | | |
675 | | /** |
676 | | * Sort atoms by descending ranks (e.g. symmetry classes) |
677 | | */ |
678 | | struct SortAtomsDescending |
679 | | { |
680 | 0 | SortAtomsDescending(const std::vector<unsigned int> &_ranks) : ranks(_ranks) {} |
681 | | const std::vector<unsigned int> &ranks; |
682 | | inline bool operator()(const OBAtom *a1, const OBAtom *a2) const |
683 | 0 | { |
684 | 0 | return ranks[a1->GetIndex()] > ranks[a2->GetIndex()]; |
685 | 0 | } |
686 | | }; |
687 | | |
688 | | /** |
689 | | * Structure used while labeling a single connected fragment. All input is |
690 | | * specified using the constructor and @p code is generated as a result of |
691 | | * assigning labels. |
692 | | */ |
693 | | struct State |
694 | | { |
695 | | State(const std::vector<unsigned int> &_symmetry_classes, |
696 | | const OBBitVec &_fragment, std::vector<StereoCenter> &_stereoCenters, |
697 | | std::vector<FullCode> &_identityCodes, Orbits &_orbits, OBBitVec &_mcr, |
698 | 0 | bool _onlyOne) : symmetry_classes(_symmetry_classes), fragment(_fragment), |
699 | 0 | onlyOne(_onlyOne), stereoCenters(_stereoCenters), |
700 | 0 | code(_symmetry_classes.size()), identityCodes(_identityCodes), |
701 | 0 | backtrackDepth(0), orbits(_orbits), mcr(_mcr) |
702 | 0 | { |
703 | 0 | mcr.Clear(); |
704 | 0 | if (mcr.IsEmpty()) |
705 | 0 | for (std::size_t i = 0; i < symmetry_classes.size(); ++i) |
706 | 0 | mcr.SetBitOn(i+1); |
707 | 0 | } |
708 | | |
709 | | const std::vector<unsigned int> &symmetry_classes; |
710 | | /** |
711 | | * The connected fragment. This is a subset of the mask. |
712 | | */ |
713 | | const OBBitVec &fragment; |
714 | | const bool onlyOne; |
715 | | |
716 | | /** |
717 | | * The pre-computed stereo centers. Non-const since it needs to be |
718 | | * sorted. |
719 | | */ |
720 | | std::vector<StereoCenter> &stereoCenters; |
721 | | /** |
722 | | * The partial code with labels and FROM code. |
723 | | */ |
724 | | PartialCode code; |
725 | | |
726 | | /** |
727 | | * Identity nodes of the search tree. |
728 | | */ |
729 | | std::vector<FullCode> identityCodes; |
730 | | unsigned int backtrackDepth; |
731 | | Orbits orbits; |
732 | | OBBitVec &mcr; |
733 | | }; |
734 | | |
735 | | /** |
736 | | * Simple struct to manage timeout. |
737 | | */ |
738 | | struct Timeout |
739 | | { |
740 | 0 | Timeout(time_t _maxTime) : maxTime(_maxTime) |
741 | 0 | { |
742 | 0 | startTime = time(nullptr); |
743 | 0 | } |
744 | | time_t startTime, maxTime; |
745 | | }; |
746 | | |
747 | | |
748 | | /** |
749 | | * Given a complete labeling (@p state.code.labels) and FROM code (@p state.code.from), |
750 | | * construct a FullCode with complete code. |
751 | | */ |
752 | | static void CompleteCode(OBMol *mol, FullCode &fullcode, State &state) |
753 | 0 | { |
754 | 0 | PartialCode &code = state.code; |
755 | | |
756 | | // initialize the FullCode object with the found labels and the from code |
757 | 0 | fullcode = FullCode(code.labels, code.from); |
758 | 0 | unsigned int numClosures = 0; |
759 | | |
760 | | // |
761 | | // the RING-CLOSURE list |
762 | | // |
763 | 0 | unsigned int current_label = 1; // the second atom is always labeled from 1 |
764 | | // iterate over all atoms in the order they are labeled |
765 | | // (this ensures [1 3] < [3 1]) |
766 | 0 | for (std::size_t j = 0; j < code.atoms.size(); ++j) { |
767 | 0 | OBAtom *atom = code.atoms[j]; |
768 | | // still need to sort [1 3] and [1 4] |
769 | 0 | std::vector<std::pair<OBBond*, unsigned int> > closures; |
770 | 0 | FOR_BONDS_OF_ATOM (bond, atom) { |
771 | | // skip atoms not in the fragment |
772 | 0 | if (!state.fragment.BitIsSet(bond->GetNbrAtom(atom)->GetIdx())) |
773 | 0 | continue; |
774 | | // a closure bond is a bond not found while generating the FROM spanning tree. |
775 | 0 | if (std::find(code.bonds.begin(), code.bonds.end(), &*bond) == code.bonds.end()) { |
776 | 0 | closures.push_back(std::make_pair(&*bond, code.labels[bond->GetNbrAtom(atom)->GetIndex()])); |
777 | 0 | } |
778 | 0 | } |
779 | | |
780 | | // do the sorting: [1 3] < [1 4] |
781 | 0 | std::sort(closures.begin(), closures.end(), CompareBondPairSecond); |
782 | |
|
783 | 0 | for (std::size_t k = 0; k < closures.size(); ++k) { |
784 | | // add the closure bond to the code |
785 | 0 | fullcode.code.push_back(current_label); |
786 | 0 | fullcode.code.push_back(closures[k].second); |
787 | | // add the bond to the list (needed for BOND-TYPES below) |
788 | 0 | code.add(closures[k].first); |
789 | 0 | numClosures++; |
790 | 0 | } |
791 | |
|
792 | 0 | current_label++; |
793 | 0 | } |
794 | | |
795 | | // Isotopes are only considered if there are isotopes. |
796 | 0 | bool hasIsotope = false; |
797 | | // Charges are only considered if there is a formal charge. |
798 | 0 | bool hasCharge = false; |
799 | | |
800 | | // |
801 | | // the ATOM-TYPES list |
802 | | // |
803 | 0 | OBStereoFacade facade(mol); |
804 | 0 | for (std::size_t j = 0; j < code.atoms.size(); ++j) { |
805 | 0 | OBAtom *atom = code.atoms[j]; |
806 | 0 | if (atom->GetIsotope()) |
807 | 0 | hasIsotope = true; |
808 | 0 | if (atom->GetFormalCharge()) |
809 | 0 | hasCharge = true; |
810 | | |
811 | | // Include all hydrogens |
812 | 0 | int hydrogens_to_include = TotalHydrogenCount(atom); |
813 | |
|
814 | 0 | unsigned int c = 10000 * atom->GetSpinMultiplicity() + |
815 | 0 | 1000 * hydrogens_to_include + |
816 | 0 | atom->GetAtomicNum(); |
817 | | |
818 | | // add the atomic number to the code |
819 | 0 | fullcode.code.push_back(c); |
820 | 0 | } |
821 | | |
822 | | // |
823 | | // the (optional) ISOTOPES list |
824 | | // |
825 | 0 | if (hasIsotope) |
826 | 0 | for (std::size_t j = 0; j < code.atoms.size(); ++j) |
827 | 0 | fullcode.code.push_back(code.atoms[j]->GetIsotope()); |
828 | | |
829 | | // |
830 | | // the (optional) CHARGES list |
831 | | // |
832 | 0 | if (hasCharge) |
833 | 0 | for (std::size_t j = 0; j < code.atoms.size(); ++j) |
834 | 0 | fullcode.code.push_back(7 + code.atoms[j]->GetFormalCharge()); |
835 | | |
836 | | // |
837 | | // the BOND-TYPES list |
838 | | // |
839 | 0 | for (std::size_t j = 0; j < code.bonds.size(); ++j) { |
840 | 0 | OBBond *bond = code.bonds[j]; |
841 | 0 | if (bond->IsAromatic()) |
842 | 0 | fullcode.code.push_back(5); |
843 | 0 | else |
844 | 0 | fullcode.code.push_back(bond->GetBondOrder()); |
845 | 0 | } |
846 | | |
847 | | // the STEREO flag |
848 | 0 | if (state.stereoCenters.size()) { |
849 | | // sort the stereo centers |
850 | 0 | std::sort(state.stereoCenters.begin(), state.stereoCenters.end(), SortStereoCenters(code.labels)); |
851 | |
|
852 | 0 | for (std::size_t i = 0; i < state.stereoCenters.size(); ++i) { |
853 | 0 | bool isInFragment = false; |
854 | 0 | for (std::size_t j = 0; j < state.stereoCenters[i].indexes.size(); ++j) |
855 | 0 | if (state.fragment.BitIsSet(state.stereoCenters[i].indexes[j]+1)) |
856 | 0 | isInFragment = true; |
857 | | // ignore stereo centers not in this fragment |
858 | 0 | if (isInFragment) |
859 | 0 | fullcode.code.push_back(state.stereoCenters[i].getDescriptor(state.symmetry_classes, code.labels)); |
860 | 0 | } |
861 | 0 | } |
862 | | |
863 | | // backtrack |
864 | 0 | for (unsigned int i = 0; i < numClosures; ++i) |
865 | 0 | code.bonds.pop_back(); |
866 | 0 | } |
867 | | |
868 | | /** |
869 | | * This function implements optimization 1 as described above. |
870 | | */ |
871 | | static void LabelFragments(OBAtom *current, std::vector<OBAtom*> &nbrs, unsigned int label, Timeout &timeout, FullCode &bestCode, State &state) |
872 | 0 | { |
873 | 0 | OBMol *mol = current->GetParent(); |
874 | 0 | PartialCode &code = state.code; |
875 | | |
876 | | // Sort the neighbor atoms according to symmetry class. |
877 | 0 | std::sort(nbrs.begin(), nbrs.end(), SortAtomsDescending(state.symmetry_classes)); |
878 | | |
879 | | // Count the number of unique neighbor symmetry classes. |
880 | 0 | unsigned int numUnique = 1; |
881 | 0 | unsigned int lastSymClass = state.symmetry_classes[nbrs[0]->GetIndex()]; |
882 | 0 | for (std::size_t i = 0; i < nbrs.size(); ++i) { |
883 | 0 | unsigned int symClass = state.symmetry_classes[nbrs[i]->GetIndex()]; |
884 | 0 | if (symClass != lastSymClass) |
885 | 0 | numUnique++; |
886 | 0 | lastSymClass = state.symmetry_classes[nbrs[i]->GetIndex()]; |
887 | 0 | } |
888 | |
|
889 | 0 | if (numUnique < nbrs.size()) { |
890 | | // The canonical codes for the equivalent ligands |
891 | 0 | std::vector< std::pair<int, CanonicalLabelsImpl::FullCode> > lcodes; |
892 | |
|
893 | 0 | std::vector<unsigned int> ligandSizes; |
894 | 0 | for (std::size_t i = 0; i < nbrs.size(); ++i) { |
895 | 0 | OBBitVec ligand = getFragment(nbrs[i], current, state.fragment); |
896 | 0 | ligandSizes.push_back(ligand.CountBits()); |
897 | |
|
898 | 0 | CanonicalLabelsImpl::FullCode lbestCode; |
899 | 0 | if (ligandSizes.back() == 1) { |
900 | | // Avoid additional state creation if the neighbor is a single terminal atom. |
901 | 0 | lbestCode.code.push_back(nbrs[i]->GetAtomicNum()); |
902 | 0 | lbestCode.labels.resize(state.symmetry_classes.size(), 0); |
903 | 0 | lbestCode.labels[nbrs[i]->GetIndex()] = 1; |
904 | 0 | } else if (ligandSizes.back() == 2) { |
905 | | // Avoid additional state creation if the neighbor is a fragment of 2 atoms. |
906 | 0 | lbestCode.labels.resize(state.symmetry_classes.size(), 0); |
907 | 0 | lbestCode.code.push_back(1); // FROM |
908 | 0 | lbestCode.code.push_back(nbrs[i]->GetAtomicNum()); // ATOM-TYPES 1 |
909 | 0 | lbestCode.labels[nbrs[i]->GetIndex()] = 1; |
910 | 0 | FOR_NBORS_OF_ATOM (nbr, nbrs[i]) { |
911 | 0 | if (!state.fragment.BitIsSet(nbr->GetIdx())) |
912 | 0 | continue; |
913 | 0 | if (code.labels[nbr->GetIndex()]) |
914 | 0 | continue; |
915 | 0 | lbestCode.code.push_back(nbr->GetAtomicNum()); // ATOM-TYPES 2 |
916 | 0 | OBBond *bond = mol->GetBond(nbrs[i], &*nbr); |
917 | 0 | if (bond->IsAromatic()) |
918 | 0 | lbestCode.code.push_back(5); |
919 | 0 | else |
920 | 0 | lbestCode.code.push_back(bond->GetBondOrder()); // BOND-TYPES 1 |
921 | 0 | lbestCode.labels[nbr->GetIndex()] = 2; |
922 | 0 | } |
923 | 0 | } else { |
924 | | // Start labeling from the ligand atom. |
925 | 0 | std::vector<CanonicalLabelsImpl::FullCode> identityCodes; |
926 | 0 | Orbits orbits; |
927 | 0 | OBBitVec mcr; |
928 | 0 | State lstate(state.symmetry_classes, ligand, state.stereoCenters, identityCodes, orbits, mcr, state.onlyOne); |
929 | 0 | lstate.code.add(nbrs[i]); |
930 | 0 | lstate.code.labels[nbrs[i]->GetIndex()] = 1; |
931 | 0 | CanonicalLabelsRecursive(nbrs[i], 1, timeout, lbestCode, lstate); |
932 | 0 | } |
933 | | |
934 | | // Store the canonical code (and labels) for the ligand. |
935 | 0 | lcodes.push_back(std::make_pair(i, lbestCode)); |
936 | 0 | } |
937 | | |
938 | | // Sort the codes for the fragments. Each neighbor symmetry class is sorted separately. |
939 | 0 | unsigned int firstIndex = 0; |
940 | 0 | lastSymClass = state.symmetry_classes[nbrs[0]->GetIndex()]; |
941 | 0 | for (std::size_t i = 1; i < nbrs.size(); ++i) { |
942 | 0 | unsigned int symClass = state.symmetry_classes[nbrs[i]->GetIndex()]; |
943 | 0 | if (symClass != lastSymClass) { |
944 | 0 | std::sort(lcodes.begin() + firstIndex, lcodes.begin() + i, SortCode2); |
945 | 0 | firstIndex = i; |
946 | 0 | } |
947 | 0 | lastSymClass = state.symmetry_classes[nbrs[i]->GetIndex()]; |
948 | 0 | } |
949 | | // Make sure to sort the last set too. |
950 | 0 | std::sort(lcodes.begin() + firstIndex, lcodes.end(), SortCode2); |
951 | | |
952 | | // Label the neighbor atoms by updating code. |
953 | 0 | std::vector<OBAtom*> atoms; |
954 | 0 | unsigned int nextLbl = label + 1; |
955 | 0 | for (std::size_t l = 0; l < lcodes.size(); ++l) { |
956 | | //print_vector("LIG CODE", lcodes[l].second.code); |
957 | 0 | OBAtom *atom = nbrs[lcodes[l].first]; |
958 | 0 | code.add(current, atom); |
959 | 0 | code.labels[atom->GetIndex()] = nextLbl; |
960 | 0 | atoms.push_back(atom); |
961 | 0 | nextLbl++; |
962 | 0 | } |
963 | | |
964 | | // Convert the labels from the ligands to labels in the whole fragment. |
965 | 0 | unsigned int ligandSize = *std::max_element(ligandSizes.begin(), ligandSizes.end()); |
966 | 0 | for (unsigned int lbl = 1; lbl < ligandSize; ++lbl) { |
967 | 0 | for (std::size_t l = 0; l < lcodes.size(); ++l) { |
968 | 0 | if (lbl >= ligandSizes[lcodes[l].first]) |
969 | 0 | continue; |
970 | | |
971 | 0 | OBAtom *atom = nullptr; |
972 | 0 | for (std::size_t i = 0; i < mol->NumAtoms(); ++i) |
973 | 0 | if (lcodes[l].second.labels[i] == lbl) { |
974 | 0 | atom = mol->GetAtom(i+1); |
975 | 0 | break; |
976 | 0 | } |
977 | |
|
978 | 0 | if (!atom) |
979 | 0 | continue; |
980 | | |
981 | 0 | std::vector<OBAtom*> atomNbrs; |
982 | 0 | FOR_NBORS_OF_ATOM (nbr, atom) { |
983 | 0 | if (lcodes[l].second.labels[nbr->GetIndex()] > lbl) { |
984 | 0 | if (std::find(atoms.begin(), atoms.end(), &*nbr) != atoms.end()) |
985 | 0 | continue; |
986 | 0 | atomNbrs.push_back(&*nbr); |
987 | 0 | } |
988 | 0 | } |
989 | |
|
990 | 0 | std::sort(atomNbrs.begin(), atomNbrs.end(), SortAtomsAscending(lcodes[l].second.labels)); |
991 | |
|
992 | 0 | for (std::size_t i = 0; i < atomNbrs.size(); ++i) { |
993 | 0 | code.add(atom, atomNbrs[i]); |
994 | 0 | code.labels[atomNbrs[i]->GetIndex()] = nextLbl; |
995 | 0 | atoms.push_back(atomNbrs[i]); |
996 | 0 | nextLbl++; |
997 | 0 | } |
998 | 0 | } |
999 | 0 | } |
1000 | | |
1001 | | // Recurse... |
1002 | 0 | CanonicalLabelsRecursive(current, nextLbl - 1, timeout, bestCode, state); |
1003 | | |
1004 | | // Backtrack. |
1005 | 0 | for (std::size_t j = 0; j < atoms.size(); ++j) { |
1006 | 0 | code.atoms.pop_back(); |
1007 | 0 | code.bonds.pop_back(); |
1008 | 0 | code.from.pop_back(); |
1009 | 0 | code.labels[atoms[j]->GetIndex()] = 0; |
1010 | 0 | } |
1011 | 0 | } else { |
1012 | | // There are no duplicated symmetry classes for the neighbor atoms. |
1013 | | // Label each neighbor atom in sorted sequence. |
1014 | 0 | unsigned int lbl = label; |
1015 | 0 | for (std::size_t i = 0; i < nbrs.size(); ++i) { |
1016 | 0 | lbl++; |
1017 | 0 | code.add(current, nbrs[i]); |
1018 | 0 | code.labels[nbrs[i]->GetIndex()] = lbl; |
1019 | 0 | } |
1020 | | |
1021 | | // Recurse... |
1022 | 0 | CanonicalLabelsRecursive(current, lbl, timeout, bestCode, state); |
1023 | | |
1024 | | // Backtrack. |
1025 | 0 | for (std::size_t i = 0; i < nbrs.size(); ++i) { |
1026 | 0 | code.atoms.pop_back(); |
1027 | 0 | code.bonds.pop_back(); |
1028 | 0 | code.from.pop_back(); |
1029 | 0 | code.labels[nbrs[i]->GetIndex()] = 0; |
1030 | 0 | } |
1031 | 0 | } |
1032 | 0 | } |
1033 | | |
1034 | | static void FindOrbits(Orbits &orbits, OBMol *mol, const std::vector<unsigned int> &labels1, |
1035 | | const std::vector<unsigned int> &labels2) |
1036 | 0 | { |
1037 | 0 | Orbits newOrbits; |
1038 | 0 | std::vector<bool> visited(labels1.size(), false); |
1039 | | |
1040 | | //print_vector("labels1", labels1); |
1041 | | //print_vector("labels2", labels2); |
1042 | | |
1043 | | // Construct the orbits for automorphic permutation labels1 -> labels2 |
1044 | 0 | for (std::size_t i = 0; i < labels1.size(); ++i) { |
1045 | 0 | if (visited[i]) |
1046 | 0 | continue; |
1047 | 0 | unsigned int vi = labels1[i]; |
1048 | 0 | if (vi == labels2[i]) |
1049 | 0 | continue; |
1050 | | //cout << "( "; |
1051 | 0 | std::size_t j = i; |
1052 | 0 | unsigned int vj = labels2[j]; |
1053 | | //cout << j << " "; |
1054 | 0 | newOrbits.resize(newOrbits.size()+1); |
1055 | 0 | newOrbits.back().push_back(mol->GetAtom(j+1)); |
1056 | 0 | visited[i] = true; |
1057 | 0 | while (vi != vj) { |
1058 | 0 | for (std::size_t k = i; k < labels1.size(); ++k) { |
1059 | 0 | if (vj == labels1[k]) { |
1060 | 0 | j = k; |
1061 | 0 | vj = labels2[j]; |
1062 | | //cout << j << " "; |
1063 | 0 | newOrbits.back().push_back(mol->GetAtom(j+1)); |
1064 | 0 | visited[j] = true; |
1065 | 0 | break; |
1066 | 0 | } |
1067 | 0 | } |
1068 | 0 | } |
1069 | | //cout << ") "; |
1070 | 0 | } |
1071 | | //cout << endl; |
1072 | | |
1073 | | //print_orbits("newOrbits", newOrbits); |
1074 | | |
1075 | | // Merge the orbits with previously found orbits. |
1076 | 0 | for (std::size_t j = 0; j < newOrbits.size(); ++j) { |
1077 | 0 | bool merge = false; |
1078 | 0 | for (std::size_t k = 0; k < orbits.size(); ++k) { |
1079 | |
|
1080 | 0 | for (std::size_t l = 0; l < newOrbits[j].size(); ++l) { |
1081 | 0 | if (std::find(orbits[k].begin(), orbits[k].end(), newOrbits[j][l]) != orbits[k].end()) |
1082 | 0 | merge = true; |
1083 | 0 | } |
1084 | |
|
1085 | 0 | if (merge) { |
1086 | 0 | for (std::size_t l = 0; l < newOrbits[j].size(); ++l) |
1087 | 0 | if (std::find(orbits[k].begin(), orbits[k].end(), newOrbits[j][l]) == orbits[k].end()) |
1088 | 0 | orbits[k].push_back(newOrbits[j][l]); |
1089 | 0 | break; |
1090 | 0 | } |
1091 | 0 | } |
1092 | |
|
1093 | 0 | if (!merge) |
1094 | 0 | orbits.push_back(newOrbits[j]); |
1095 | 0 | } |
1096 | | |
1097 | | // print_orbits("orbits", orbits); |
1098 | |
|
1099 | 0 | newOrbits.clear(); |
1100 | 0 | std::vector<bool> vivisted(orbits.size(), false); |
1101 | 0 | for (std::size_t i = 0; i < orbits.size(); ++i) { |
1102 | 0 | if (visited[i]) |
1103 | 0 | continue; |
1104 | 0 | visited[i] = true; |
1105 | |
|
1106 | 0 | Orbit newOrbit = orbits[i]; |
1107 | |
|
1108 | 0 | for (std::size_t j = i; j < orbits.size(); ++j) { |
1109 | 0 | if (visited[j]) |
1110 | 0 | continue; |
1111 | | |
1112 | 0 | std::sort(newOrbit.begin(), newOrbit.end()); |
1113 | 0 | std::sort(orbits[j].begin(), orbits[j].end()); |
1114 | 0 | Orbit result; |
1115 | |
|
1116 | 0 | std::set_intersection(newOrbit.begin(), newOrbit.end(), |
1117 | 0 | orbits[j].begin(), orbits[j].end(), |
1118 | 0 | std::back_inserter(result)); |
1119 | 0 | if (result.empty()) |
1120 | 0 | continue; |
1121 | | |
1122 | 0 | visited[j] = true; |
1123 | 0 | result.clear(); |
1124 | 0 | std::set_union(newOrbit.begin(), newOrbit.end(), |
1125 | 0 | orbits[j].begin(), orbits[j].end(), |
1126 | 0 | std::back_inserter(result)); |
1127 | |
|
1128 | 0 | newOrbit = result; |
1129 | 0 | } |
1130 | |
|
1131 | 0 | newOrbits.push_back(newOrbit); |
1132 | 0 | } |
1133 | |
|
1134 | 0 | orbits = newOrbits; |
1135 | 0 | } |
1136 | | |
1137 | | /** |
1138 | | * Update the minimum cell representations (mcr). |
1139 | | */ |
1140 | | static void UpdateMcr(OBBitVec &mcr, Orbits &orbits, const std::vector<unsigned int> &bestLabels) |
1141 | 0 | { |
1142 | | //print_orbits("UpdateMcr", orbits); |
1143 | |
|
1144 | 0 | for (std::size_t i = 0; i < bestLabels.size(); ++i) |
1145 | 0 | mcr.SetBitOn(i+1); |
1146 | |
|
1147 | 0 | for (std::size_t j = 0; j < orbits.size(); ++j) { |
1148 | 0 | std::sort(orbits[j].begin(), orbits[j].end(), SortAtomsAscending(bestLabels)); |
1149 | |
|
1150 | 0 | for (std::size_t k = 0; k < orbits[j].size(); ++k) |
1151 | 0 | if (k) |
1152 | 0 | mcr.SetBitOff(orbits[j][k]->GetIdx()); |
1153 | 0 | } |
1154 | 0 | } |
1155 | | |
1156 | | |
1157 | | /** |
1158 | | * This is the recursive function implementing the labeling algorithm |
1159 | | * outlined above (steps 1-3). This function works for single connected |
1160 | | * fragments and uses @p state to hold most information. The @p bestCode |
1161 | | * is the current greatest code (if any). given the @p current atom and |
1162 | | * the last @p label, this function labels the neighbor atoms with |
1163 | | * label+1, label+2, ... |
1164 | | */ |
1165 | | static void CanonicalLabelsRecursive(OBAtom *current, unsigned int label, Timeout &timeout, FullCode &bestCode, State &state) |
1166 | 0 | { |
1167 | 0 | OBMol *mol = current->GetParent(); |
1168 | 0 | PartialCode &code = state.code; |
1169 | |
|
1170 | 0 | if (state.backtrackDepth) { |
1171 | | //std::cout << "backtrackDepth = " << state.backtrackDepth << std::endl; |
1172 | |
|
1173 | 0 | if (code.atoms.size() > state.backtrackDepth) { |
1174 | | //std::cout << "BACKTRACKING" << std::endl; |
1175 | 0 | return; |
1176 | 0 | } |
1177 | 0 | if (code.atoms.size() == state.backtrackDepth) { |
1178 | | //std::cout << "BACKTRACK DONE 1" << std::endl; |
1179 | 0 | state.backtrackDepth = 0; |
1180 | 0 | return; |
1181 | 0 | } else |
1182 | 0 | if (code.atoms.size() < state.backtrackDepth) { |
1183 | | //std::cout << "BACKTRACK DONE 2" << std::endl; |
1184 | 0 | state.backtrackDepth = 0; |
1185 | 0 | } |
1186 | 0 | } |
1187 | | |
1188 | | |
1189 | | // Check if there is a full mapping. |
1190 | 0 | if (label == state.fragment.CountBits()) { |
1191 | | // Complete the canonical code. |
1192 | 0 | FullCode fullcode; |
1193 | 0 | CompleteCode(mol, fullcode, state); |
1194 | | |
1195 | | //print_vector("TERMINAL", fullcode.code); |
1196 | | |
1197 | | // Check previously found codes to find redundant subtrees. |
1198 | 0 | for (std::size_t i = state.identityCodes.size(); i > 0; --i) |
1199 | 0 | if (fullcode.code == state.identityCodes[i-1].code) { |
1200 | | // |
1201 | | // An explicit automorphism has been found. |
1202 | | // |
1203 | 0 | std::vector<unsigned int> v1(fullcode.labels.size(), 0); |
1204 | 0 | for (std::size_t j = 0; j < fullcode.labels.size(); ++j) |
1205 | 0 | if (fullcode.labels[j]) |
1206 | 0 | v1[fullcode.labels[j]-1] = j + 1; |
1207 | |
|
1208 | 0 | std::vector<unsigned int> v2(state.identityCodes[i-1].labels.size(), 0); |
1209 | 0 | for (std::size_t j = 0; j < state.identityCodes[i-1].labels.size(); ++j) |
1210 | 0 | if (state.identityCodes[i-1].labels[j]) |
1211 | 0 | v2[state.identityCodes[i-1].labels[j]-1] = j + 1; |
1212 | |
|
1213 | 0 | state.backtrackDepth = 0; |
1214 | 0 | for (std::size_t j = 0; j < v1.size(); ++j) { |
1215 | 0 | if (v1[j] != v2[j]) { |
1216 | | // Yield backtrackDepth. |
1217 | 0 | return; |
1218 | 0 | } |
1219 | | |
1220 | 0 | if (v1[j]) |
1221 | 0 | state.backtrackDepth++; |
1222 | 0 | } |
1223 | 0 | } |
1224 | | |
1225 | 0 | if (fullcode.code == bestCode.code) { |
1226 | 0 | UpdateMcr(state.mcr, state.orbits, bestCode.labels); |
1227 | 0 | FindOrbits(state.orbits, mol, fullcode.labels, bestCode.labels); |
1228 | 0 | } else if (fullcode > bestCode) { |
1229 | | // if fullcode is greater than bestCode, we have found a new greatest code |
1230 | 0 | bestCode = fullcode; |
1231 | 0 | } |
1232 | | |
1233 | |
|
1234 | 0 | if (state.identityCodes.size() < MAX_IDENTITY_NODES) { |
1235 | 0 | state.identityCodes.push_back(FullCode()); |
1236 | 0 | state.identityCodes.back().labels.swap(fullcode.labels); |
1237 | 0 | state.identityCodes.back().code.swap(fullcode.code); |
1238 | 0 | } else { |
1239 | 0 | state.identityCodes[MAX_IDENTITY_NODES-1].labels.swap(fullcode.labels); |
1240 | 0 | state.identityCodes[MAX_IDENTITY_NODES-1].code.swap(fullcode.code); |
1241 | 0 | } |
1242 | | |
1243 | |
|
1244 | 0 | return; |
1245 | 0 | } |
1246 | | |
1247 | | // Avoid endless loops. |
1248 | 0 | if (time(nullptr) - timeout.startTime > timeout.maxTime) { |
1249 | 0 | return; |
1250 | 0 | } |
1251 | | |
1252 | | // If there is a bestCode and only one labeling is required, return. |
1253 | 0 | if (state.onlyOne && !bestCode.code.empty()) |
1254 | 0 | return; |
1255 | | |
1256 | | // Abort early if this will not lead to a greatest canonical code. The |
1257 | | // code.from vector is compared with the elements in the bestCode. |
1258 | 0 | if (code < bestCode) |
1259 | 0 | return; |
1260 | | |
1261 | | // Find the neighbors of the current atom to assign the next label(s). |
1262 | 0 | std::vector<OBAtom*> nbrs; |
1263 | 0 | std::vector<unsigned int> nbrSymClasses; |
1264 | |
|
1265 | 0 | FOR_NBORS_OF_ATOM (nbr, current) { |
1266 | | // Skip atoms not in the fragment. |
1267 | 0 | if (!state.fragment.BitIsSet(nbr->GetIdx())) |
1268 | 0 | continue; |
1269 | | // Skip already labeled atoms. |
1270 | 0 | if (code.labels[nbr->GetIndex()]) |
1271 | 0 | continue; |
1272 | | |
1273 | 0 | OBBond *bond = mol->GetBond(current, &*nbr); |
1274 | | // Ugly, but it helps... // <--- not documented! need better approach to handle this (not only ferrocene)... |
1275 | 0 | if (!isFerroceneBond(bond)) { |
1276 | 0 | nbrSymClasses.push_back(state.symmetry_classes[nbr->GetIndex()]); |
1277 | 0 | nbrs.push_back(&*nbr); |
1278 | 0 | } |
1279 | 0 | } |
1280 | |
|
1281 | 0 | if (nbrs.empty()) { |
1282 | | // If there are no neighbor atoms to label, recurse with the next |
1283 | | // current atom. |
1284 | 0 | unsigned int nextLabel = code.labels[current->GetIndex()] + 1; |
1285 | 0 | for (std::size_t i = 0; i < code.labels.size(); ++i) { |
1286 | 0 | if (code.labels[i] == nextLabel) { |
1287 | 0 | CanonicalLabelsRecursive(mol->GetAtom(i+1), label, timeout, bestCode, state); |
1288 | 0 | return; |
1289 | 0 | } |
1290 | 0 | } |
1291 | 0 | return; |
1292 | 0 | } |
1293 | | |
1294 | 0 | if (!current->IsInRing()) { |
1295 | | // Optimization to avoid generating the n! permutations. If the current |
1296 | | // atom is not in a ring, the neighbor atoms with the same symmetry |
1297 | | // are not connected and it is possible to label without considering |
1298 | | // the rest of the molecule. The found ligand labels can be sorted and |
1299 | | // the labels can be calculated using offsets. |
1300 | | // |
1301 | | // This modifies the final canonical code since it produces a different |
1302 | | // canonical order but although different, the code is still canonical. |
1303 | 0 | LabelFragments(current, nbrs, label, timeout, bestCode, state); |
1304 | 0 | } else { |
1305 | | // Create all possible labelings for the neighbors. |
1306 | | // |
1307 | | // Example: The current atom has 4 neighbors to label. |
1308 | | // |
1309 | | // neighbor atom : 1 2 3 4 |
1310 | | // symmetry classes: 3 2 2 1 |
1311 | | // |
1312 | | // The first neighbor with symmetry class 3 is added to allOrderedNbrs. |
1313 | | // |
1314 | | // allOrderedNbrs = [ [1] ] |
1315 | | // |
1316 | | // The two neighbor atoms with symmetry class 2 are added next. All |
1317 | | // permutations are added (2 in this case). |
1318 | | // |
1319 | | // allOrderedNbrs = [ [1,2,3], [1,3,2] ] |
1320 | | // |
1321 | | // The last atom is similar to the first one. |
1322 | | // |
1323 | | // allOrderedNbrs = [ [1,2,3,4], [1,3,2,4] ] |
1324 | | // |
1325 | | // Note: If there are atoms with a large number of neighbor atoms |
1326 | | // with the same symmetry class (n), this can result in a large number |
1327 | | // of permutations (n!). |
1328 | 0 | std::vector<std::vector<OBAtom*> > allOrderedNbrs(1); |
1329 | 0 | while (!nbrs.empty()) { |
1330 | | |
1331 | | // Select the next nbr atoms with highest symmetry classes. |
1332 | 0 | unsigned int maxSymClass = *std::min_element(nbrSymClasses.begin(), nbrSymClasses.end()); |
1333 | 0 | std::vector<OBAtom*> finalNbrs; |
1334 | 0 | for (std::size_t i = 0; i < nbrs.size(); ++i) { |
1335 | 0 | if (nbrSymClasses[i] == maxSymClass) |
1336 | 0 | finalNbrs.push_back(nbrs[i]); |
1337 | 0 | } |
1338 | | |
1339 | | // Remove the selected atoms from nbrs and nbrSymClasses (this could be made more efficient) |
1340 | 0 | for (std::size_t i = 0; i < finalNbrs.size(); ++i) { |
1341 | 0 | nbrs.erase(std::find(nbrs.begin(), nbrs.end(), finalNbrs[i])); |
1342 | 0 | nbrSymClasses.erase(std::find(nbrSymClasses.begin(), nbrSymClasses.end(), maxSymClass)); |
1343 | 0 | } |
1344 | | |
1345 | |
|
1346 | 0 | if (finalNbrs.size() == 1) { |
1347 | | // If there is only one atom with the same symmetry class, label it |
1348 | | // and select the next group of neighbor atoms with the same symmetry |
1349 | | // class. |
1350 | 0 | for (std::size_t i = 0; i < allOrderedNbrs.size(); ++i) |
1351 | 0 | allOrderedNbrs[i].push_back(finalNbrs[0]); |
1352 | 0 | } else { |
1353 | | // Sort the atoms lexicographically. |
1354 | 0 | std::sort(finalNbrs.begin(), finalNbrs.end()); |
1355 | | |
1356 | | // Copy the current labelings for the neighbor atoms. |
1357 | 0 | std::vector<std::vector<OBAtom*> > allOrderedNbrsCopy(allOrderedNbrs); |
1358 | | |
1359 | | // Add the first permutation for the neighbor atoms. |
1360 | | // if (state.mcr.BitIsSet(finalNbrs[0]->GetIdx())) |
1361 | 0 | for (std::size_t j = 0; j < allOrderedNbrs.size(); ++j) { |
1362 | 0 | for (std::size_t i = 0; i < finalNbrs.size(); ++i) { |
1363 | 0 | allOrderedNbrs[j].push_back(finalNbrs[i]); |
1364 | 0 | } |
1365 | 0 | } |
1366 | | |
1367 | | // Add the other permutations. |
1368 | 0 | while (std::next_permutation(finalNbrs.begin(), finalNbrs.end())) { |
1369 | 0 | if (state.mcr.BitIsSet(finalNbrs[0]->GetIdx())) |
1370 | 0 | for (std::size_t j = 0; j < allOrderedNbrsCopy.size(); ++j) { |
1371 | 0 | allOrderedNbrs.push_back(allOrderedNbrsCopy[j]); |
1372 | 0 | for (std::size_t i = 0; i < finalNbrs.size(); ++i) |
1373 | 0 | allOrderedNbrs.back().push_back(finalNbrs[i]); |
1374 | 0 | } |
1375 | 0 | } |
1376 | |
|
1377 | 0 | } // finalNbrs.size() != 1 |
1378 | 0 | } // while (!nbrs.empty()) |
1379 | |
|
1380 | 0 | if (DEBUG) { |
1381 | 0 | cout << "allOrderedNbrs:" << endl; |
1382 | 0 | for (std::size_t i = 0; i < allOrderedNbrs.size(); ++i) { |
1383 | 0 | for (std::size_t j = 0; j < allOrderedNbrs[i].size(); ++j) { |
1384 | 0 | cout << allOrderedNbrs[i][j]->GetIndex() << " "; |
1385 | 0 | } |
1386 | 0 | cout << endl; |
1387 | 0 | } |
1388 | 0 | } |
1389 | |
|
1390 | 0 | for (std::size_t i = 0; i < allOrderedNbrs.size(); ++i) { |
1391 | | // Convert the order stored in allOrderedNbrs to labels. |
1392 | 0 | unsigned int lbl = label; |
1393 | 0 | for (std::size_t j = 0; j < allOrderedNbrs[i].size(); ++j) { |
1394 | 0 | lbl++; |
1395 | 0 | code.add(current, allOrderedNbrs[i][j]); |
1396 | 0 | code.labels[allOrderedNbrs[i][j]->GetIndex()] = lbl; |
1397 | 0 | } |
1398 | | |
1399 | | // Recurse... |
1400 | 0 | CanonicalLabelsRecursive(current, lbl, timeout, bestCode, state); |
1401 | | |
1402 | | // Backtrack... |
1403 | 0 | for (std::size_t j = 0; j < allOrderedNbrs[i].size(); ++j) { |
1404 | 0 | code.atoms.pop_back(); |
1405 | 0 | code.bonds.pop_back(); |
1406 | 0 | code.from.pop_back(); |
1407 | 0 | code.labels[allOrderedNbrs[i][j]->GetIndex()] = 0; |
1408 | 0 | } |
1409 | | |
1410 | | // Optimization |
1411 | 0 | if (state.backtrackDepth) { |
1412 | 0 | if (code.atoms.size() <= state.backtrackDepth) { |
1413 | | //std::cout << "BACKTRACK DONE 3" << std::endl; |
1414 | 0 | state.backtrackDepth = 0; |
1415 | 0 | } |
1416 | 0 | } |
1417 | |
|
1418 | 0 | } |
1419 | |
|
1420 | 0 | } // current->IsInRing() |
1421 | |
|
1422 | 0 | } |
1423 | | |
1424 | | /** |
1425 | | * Select an initial atom from a fragment to assign the first label. |
1426 | | */ |
1427 | | static std::vector<OBAtom*> findStartAtoms(OBMol *mol, const OBBitVec &fragment, const std::vector<unsigned int> &symmetry_classes) |
1428 | 0 | { |
1429 | | // find the a symmetry class in the fragment using criteria |
1430 | 0 | std::vector<unsigned int> ranks; |
1431 | 0 | for (std::size_t i = 0; i < mol->NumAtoms(); ++i) { |
1432 | 0 | if (!fragment.BitIsSet(i+1)) |
1433 | 0 | continue; |
1434 | | |
1435 | 0 | OBAtom *atom = mol->GetAtom(i+1); |
1436 | 0 | unsigned int rank = 10000 * symmetry_classes[i] + |
1437 | 0 | 1000 * atom->GetSpinMultiplicity() + |
1438 | 0 | 10 * (atom->GetFormalCharge() + 7) + |
1439 | 0 | TotalHydrogenCount(atom); |
1440 | |
|
1441 | 0 | ranks.push_back(rank); |
1442 | 0 | } |
1443 | |
|
1444 | 0 | unsigned int lowestRank = *std::min_element(ranks.begin(), ranks.end()); |
1445 | |
|
1446 | 0 | std::vector<OBAtom*> result; |
1447 | 0 | for (std::size_t i = 0; i < mol->NumAtoms(); ++i) { |
1448 | 0 | if (!fragment.BitIsSet(i+1)) |
1449 | 0 | continue; |
1450 | | |
1451 | 0 | OBAtom *atom = mol->GetAtom(i+1); |
1452 | 0 | unsigned int rank = 10000 * symmetry_classes[i] + |
1453 | 0 | 1000 * atom->GetSpinMultiplicity() + |
1454 | 0 | 10 * (atom->GetFormalCharge() + 7) + |
1455 | 0 | TotalHydrogenCount(atom); |
1456 | |
|
1457 | 0 | if (rank == lowestRank) |
1458 | 0 | result.push_back(atom); |
1459 | 0 | } |
1460 | |
|
1461 | 0 | return result; |
1462 | 0 | } |
1463 | | |
1464 | | /** |
1465 | | * Compute the canonical labels for @p mol. This function handles a whole molecule with |
1466 | | * disconnected fragments. It finds a canonical code for each fragment, sorts these codes |
1467 | | * and computes labels for the molecule as a whole. |
1468 | | * |
1469 | | * This is the CanonicalLabelsImpl entry point. |
1470 | | */ |
1471 | | static void CalcCanonicalLabels(OBMol *mol, const std::vector<unsigned int> &symmetry_classes, |
1472 | | std::vector<unsigned int> &canonical_labels, const OBStereoUnitSet &stereoUnits, const OBBitVec &mask, |
1473 | | OBStereoFacade *stereoFacade, int maxSeconds, bool onlyOne = false) |
1474 | 0 | { |
1475 | | // Handle some special cases. |
1476 | 0 | if (!mol->NumAtoms()) |
1477 | 0 | return; |
1478 | 0 | if (mol->NumAtoms() == 1) { |
1479 | 0 | canonical_labels.resize(1, 1); |
1480 | 0 | return; |
1481 | 0 | } |
1482 | | |
1483 | | // Set all labels to 0. |
1484 | 0 | canonical_labels.clear(); |
1485 | 0 | canonical_labels.resize(mol->NumAtoms(), 0); |
1486 | |
|
1487 | 0 | std::vector<OBBond*> metalloceneBonds; |
1488 | 0 | findMetalloceneBonds(metalloceneBonds, mol, symmetry_classes); |
1489 | | |
1490 | | // Find the (dis)connected fragments. |
1491 | 0 | OBBitVec visited; |
1492 | 0 | std::vector<OBBitVec> fragments; |
1493 | 0 | for (std::size_t i = 0; i < mol->NumAtoms(); ++i) { |
1494 | 0 | if (!mask.BitIsSet(i+1) || visited.BitIsSet(i+1)) |
1495 | 0 | continue; |
1496 | 0 | fragments.push_back(getFragment(mol->GetAtom(i+1), mask, metalloceneBonds)); |
1497 | 0 | visited |= fragments.back(); |
1498 | 0 | } |
1499 | | |
1500 | | // Pre-compute the stereo center information. (See StereoCenter) |
1501 | 0 | std::vector<StereoCenter> stereoCenters; |
1502 | 0 | if (stereoFacade) { |
1503 | 0 | for (std::size_t i = 0; i < stereoUnits.size(); ++i) { |
1504 | 0 | const OBStereoUnit &unit = stereoUnits[i]; |
1505 | |
|
1506 | 0 | if (unit.type == OBStereo::Tetrahedral) { |
1507 | 0 | OBAtom *atom = mol->GetAtomById(unit.id); |
1508 | 0 | if (!atom) |
1509 | 0 | continue; |
1510 | | // Add the StereoCenter indexes. |
1511 | 0 | stereoCenters.resize(stereoCenters.size()+1); |
1512 | 0 | stereoCenters.back().indexes.push_back(atom->GetIndex()); |
1513 | |
|
1514 | 0 | if (!stereoFacade->HasTetrahedralStereo(unit.id)) |
1515 | 0 | continue; |
1516 | 0 | OBTetrahedralStereo::Config config = stereoFacade->GetTetrahedralStereo(unit.id)->GetConfig(); |
1517 | 0 | if (!config.specified) |
1518 | 0 | continue; |
1519 | | |
1520 | | // Add the neighbor atom indexes. |
1521 | 0 | OBAtom *from = mol->GetAtomById(config.from); |
1522 | 0 | if (from && from->GetAtomicNum() != OBElements::Hydrogen) |
1523 | 0 | stereoCenters.back().nbrIndexes1.push_back(from->GetIndex()); |
1524 | 0 | else |
1525 | 0 | stereoCenters.back().nbrIndexes1.push_back(std::numeric_limits<unsigned int>::max()); |
1526 | 0 | for (std::size_t j = 0; j < config.refs.size(); ++j) { |
1527 | 0 | OBAtom *ref = mol->GetAtomById(config.refs[j]); |
1528 | 0 | if (ref && ref->GetAtomicNum() != OBElements::Hydrogen) |
1529 | 0 | stereoCenters.back().nbrIndexes1.push_back(ref->GetIndex()); |
1530 | 0 | else |
1531 | 0 | stereoCenters.back().nbrIndexes1.push_back(std::numeric_limits<unsigned int>::max()); |
1532 | 0 | } |
1533 | 0 | } else if (unit.type == OBStereo::CisTrans) { |
1534 | 0 | OBBond *bond = mol->GetBondById(unit.id); |
1535 | 0 | if (!bond || bond->IsAromatic()) |
1536 | 0 | continue; |
1537 | 0 | OBAtom *begin = bond->GetBeginAtom(); |
1538 | 0 | OBAtom *end = bond->GetEndAtom(); |
1539 | 0 | if (!begin || !end) |
1540 | 0 | continue; |
1541 | | // Add the StereoCenter indexes. |
1542 | 0 | stereoCenters.resize(stereoCenters.size()+1); |
1543 | 0 | stereoCenters.back().indexes.push_back(begin->GetIndex()); |
1544 | 0 | stereoCenters.back().indexes.push_back(end->GetIndex()); |
1545 | |
|
1546 | 0 | if (!stereoFacade->HasCisTransStereo(unit.id)) |
1547 | 0 | continue; |
1548 | 0 | OBCisTransStereo::Config config = stereoFacade->GetCisTransStereo(unit.id)->GetConfig(); |
1549 | 0 | if (!config.specified) |
1550 | 0 | continue; |
1551 | | |
1552 | | // Add the neighbor atom indexes. |
1553 | 0 | for (std::size_t j = 0; j < config.refs.size(); ++j) { |
1554 | 0 | OBAtom *ref = mol->GetAtomById(config.refs[j]); |
1555 | 0 | unsigned int r = (ref && ref->GetAtomicNum() != OBElements::Hydrogen) ? ref->GetIndex() : std::numeric_limits<unsigned int>::max(); |
1556 | 0 | if (stereoCenters.back().nbrIndexes1.size() < 2) |
1557 | 0 | stereoCenters.back().nbrIndexes1.push_back(r); |
1558 | 0 | else |
1559 | 0 | stereoCenters.back().nbrIndexes2.push_back(r); |
1560 | 0 | } |
1561 | 0 | } |
1562 | 0 | } |
1563 | 0 | } |
1564 | | |
1565 | | // Find the canonical code for each fragment. |
1566 | 0 | std::vector<CanonicalLabelsImpl::FullCode> fcodes; |
1567 | 0 | for (std::size_t f = 0; f < fragments.size(); ++f) { |
1568 | 0 | const OBBitVec &fragment = fragments[f]; |
1569 | | |
1570 | | // Select the first atom. |
1571 | 0 | std::vector<OBAtom*> startAtoms = findStartAtoms(mol, fragment, symmetry_classes); |
1572 | |
|
1573 | 0 | CanonicalLabelsImpl::Timeout timeout(maxSeconds); |
1574 | 0 | CanonicalLabelsImpl::FullCode bestCode; |
1575 | 0 | std::vector<CanonicalLabelsImpl::FullCode> identityCodes; |
1576 | 0 | Orbits orbits; |
1577 | 0 | OBBitVec mcr; |
1578 | |
|
1579 | 0 | for (std::size_t i = 0; i < startAtoms.size(); ++i) { |
1580 | 0 | OBAtom *atom = startAtoms[i]; |
1581 | | |
1582 | | // Start labeling of the fragment. |
1583 | 0 | State state(symmetry_classes, fragment, stereoCenters, identityCodes, orbits, mcr, onlyOne); |
1584 | | //if (!state.mcr.BitIsSet(atom->GetIdx()) && atom->IsInRing()) |
1585 | | // continue; |
1586 | |
|
1587 | 0 | state.code.add(atom); |
1588 | 0 | state.code.labels[atom->GetIndex()] = 1; |
1589 | 0 | CanonicalLabelsRecursive(atom, 1, timeout, bestCode, state); |
1590 | 0 | } |
1591 | | |
1592 | | // Throw an error if the timeout is exceeded. |
1593 | 0 | if (time(nullptr) - timeout.startTime > timeout.maxTime) { |
1594 | 0 | obErrorLog.ThrowError(__FUNCTION__, "maximum time exceeded...", obError); |
1595 | 0 | } |
1596 | | |
1597 | | // Store the canonical code for the fragment. |
1598 | 0 | fcodes.push_back(bestCode); |
1599 | 0 | } |
1600 | | |
1601 | | // Sort the codes for the fragments. |
1602 | 0 | std::sort(fcodes.begin(), fcodes.end(), SortCode); |
1603 | | |
1604 | | // Construct the full labeling from the sorted fragment labels. |
1605 | 0 | unsigned int offset = 0; |
1606 | 0 | for (std::size_t f = 0; f < fcodes.size(); ++f) { |
1607 | | //print_vector("CODE", fcodes[f].code); |
1608 | | //print_vector("code_labels", fcodes[f].labels); |
1609 | 0 | if (fcodes[f].labels.size() == 0) |
1610 | 0 | continue; // defensive programming |
1611 | | |
1612 | 0 | unsigned int max_label = 0; |
1613 | 0 | for (std::size_t i = 0; i < mol->NumAtoms(); ++i) { |
1614 | 0 | if (fcodes[f].labels[i]) { |
1615 | 0 | canonical_labels[i] = fcodes[f].labels[i] + offset; |
1616 | 0 | max_label = std::max(max_label, canonical_labels[i]); |
1617 | 0 | } |
1618 | 0 | } |
1619 | 0 | offset = max_label; |
1620 | 0 | } |
1621 | |
|
1622 | 0 | } |
1623 | | |
1624 | | }; // CanonicalLabelsImpl |
1625 | | |
1626 | | /* |
1627 | | * See header for detailed description of parameters. |
1628 | | * |
1629 | | * The main purpose of this function is calling CanonicalLabelsImpl::CalcCanonicalLabels |
1630 | | * with the correct parameters regarding stereochemistry. |
1631 | | */ |
1632 | | void CanonicalLabels(OBMol *mol, const std::vector<unsigned int> &symmetry_classes, |
1633 | | std::vector<unsigned int> &canonical_labels, const OBBitVec &mask, |
1634 | | int maxSeconds, bool onlyOne) |
1635 | 0 | { |
1636 | | // make sure the mask is valid: no mask = all atoms |
1637 | 0 | OBBitVec maskCopy(mask); |
1638 | 0 | if (!maskCopy.CountBits()) |
1639 | 0 | FOR_ATOMS_OF_MOL (atom, mol) |
1640 | 0 | maskCopy.SetBitOn(atom->GetIdx()); |
1641 | |
|
1642 | 0 | if (onlyOne) { |
1643 | | // Only one labeling requested. This results in canonical labels that do not |
1644 | | // consider stereochemistry. Used for finding stereo centers with automorphisms. |
1645 | 0 | CanonicalLabelsImpl::CalcCanonicalLabels(mol, symmetry_classes, canonical_labels, OBStereoUnitSet(), maskCopy, nullptr, maxSeconds, true); |
1646 | 0 | } else { |
1647 | 0 | std::vector<OBBond*> metalloceneBonds; |
1648 | 0 | findMetalloceneBonds(metalloceneBonds, mol, symmetry_classes); |
1649 | | |
1650 | | // Check if there are specified stereo centers. |
1651 | 0 | bool hasAtLeastOneDefined = false; |
1652 | 0 | OBStereoFacade sf(mol, false); |
1653 | 0 | FOR_ATOMS_OF_MOL (atom, mol) { |
1654 | 0 | FOR_BONDS_OF_ATOM (bond, &*atom) |
1655 | 0 | if (std::find(metalloceneBonds.begin(), metalloceneBonds.end(), &*bond) != metalloceneBonds.end()) |
1656 | 0 | continue; |
1657 | 0 | if (sf.HasTetrahedralStereo(atom->GetId())) { |
1658 | 0 | if (sf.GetTetrahedralStereo(atom->GetId())->GetConfig().specified) { |
1659 | 0 | hasAtLeastOneDefined = true; |
1660 | 0 | break; |
1661 | 0 | } |
1662 | 0 | } |
1663 | 0 | } |
1664 | 0 | FOR_BONDS_OF_MOL (bond, mol) { |
1665 | 0 | if (sf.HasCisTransStereo(bond->GetId())) { |
1666 | 0 | if (sf.GetCisTransStereo(bond->GetId())->GetConfig().specified) { |
1667 | 0 | hasAtLeastOneDefined = true; |
1668 | 0 | break; |
1669 | 0 | } |
1670 | 0 | } |
1671 | 0 | } |
1672 | 0 | if (!hasAtLeastOneDefined) { |
1673 | | // If there are no specified stereo centers, we don't need to find stereogenic units. |
1674 | 0 | CanonicalLabelsImpl::CalcCanonicalLabels(mol, symmetry_classes, canonical_labels, OBStereoUnitSet(), maskCopy, nullptr, maxSeconds); |
1675 | 0 | return; |
1676 | 0 | } |
1677 | | |
1678 | | // Find the stereogenic units |
1679 | 0 | OBStereoUnitSet stereoUnits = FindStereogenicUnits(mol, symmetry_classes); |
1680 | | |
1681 | | // Mark all invalid stereo data as unspecified |
1682 | 0 | std::vector<OBGenericData*> stereoData = mol->GetAllData(OBGenericDataType::StereoData); |
1683 | 0 | std::vector<OBGenericData*>::iterator data; |
1684 | 0 | for (data = stereoData.begin(); data != stereoData.end(); ++data) { |
1685 | 0 | OBStereo::Type type = ((OBStereoBase*)*data)->GetType(); |
1686 | 0 | if (type == OBStereo::Tetrahedral) { |
1687 | 0 | OBTetrahedralStereo *ts = dynamic_cast<OBTetrahedralStereo*>(*data); |
1688 | 0 | OBTetrahedralStereo::Config config = ts->GetConfig(); |
1689 | 0 | bool valid = true; |
1690 | 0 | if (!ts->IsValid()) |
1691 | 0 | valid = false; |
1692 | 0 | OBAtom *center = mol->GetAtomById(config.center); |
1693 | 0 | if (!center) |
1694 | 0 | valid = false; |
1695 | 0 | else if (!isTetrahedral(center, stereoUnits)) |
1696 | 0 | valid = false; |
1697 | |
|
1698 | 0 | if (!valid) { |
1699 | 0 | config.specified = false; |
1700 | 0 | ts->SetConfig(config); |
1701 | 0 | } |
1702 | 0 | } |
1703 | 0 | if (type == OBStereo::CisTrans) { |
1704 | 0 | OBCisTransStereo *ct = dynamic_cast<OBCisTransStereo*>(*data); |
1705 | 0 | OBCisTransStereo::Config config = ct->GetConfig(); |
1706 | 0 | bool valid = true; |
1707 | 0 | if (!ct->IsValid()) |
1708 | 0 | valid = false; |
1709 | 0 | OBAtom *beginAtom = mol->GetAtomById(config.begin); |
1710 | 0 | OBAtom *endAtom = mol->GetAtomById(config.end); |
1711 | 0 | if (!beginAtom || !endAtom) |
1712 | 0 | valid = false; |
1713 | 0 | else { |
1714 | 0 | OBBond *bond = mol->GetBond(beginAtom, endAtom); |
1715 | 0 | if (!bond) |
1716 | 0 | valid = false; |
1717 | 0 | else if (!isCisTrans(bond, stereoUnits)) |
1718 | 0 | valid = false; |
1719 | 0 | } |
1720 | |
|
1721 | 0 | if (!valid) { |
1722 | 0 | config.specified = false; |
1723 | 0 | ct->SetConfig(config); |
1724 | 0 | } |
1725 | 0 | } |
1726 | 0 | } |
1727 | | |
1728 | | // Determine stereochemistry from coordinates if needed |
1729 | 0 | if (!mol->HasChiralityPerceived()) { |
1730 | 0 | switch (mol->GetDimension()) { |
1731 | 0 | case 2: |
1732 | 0 | mol->DeleteData(OBGenericDataType::StereoData); |
1733 | 0 | TetrahedralFrom2D(mol, stereoUnits); |
1734 | 0 | CisTransFrom2D(mol, stereoUnits); |
1735 | 0 | break; |
1736 | 0 | case 3: |
1737 | 0 | mol->DeleteData(OBGenericDataType::StereoData); |
1738 | 0 | TetrahedralFrom3D(mol, stereoUnits); |
1739 | 0 | CisTransFrom3D(mol, stereoUnits); |
1740 | 0 | break; |
1741 | 0 | default: |
1742 | 0 | TetrahedralFrom0D(mol, stereoUnits); |
1743 | 0 | CisTransFrom0D(mol, stereoUnits); |
1744 | 0 | break; |
1745 | 0 | } |
1746 | 0 | } |
1747 | | |
1748 | | // The mol->DeleteData() above deleted some of the data that the OBStereoFacade sf() |
1749 | | // is using. We have to create a new one. (CJ - fixes a bug.) |
1750 | 0 | OBStereoFacade newsf(mol, false); |
1751 | | |
1752 | | // Start the labeling process |
1753 | 0 | CanonicalLabelsImpl::CalcCanonicalLabels(mol, symmetry_classes, canonical_labels, stereoUnits, maskCopy, &newsf, maxSeconds); |
1754 | 0 | } |
1755 | | |
1756 | | // if the labeling failed, just return the identity labels to avoid craches |
1757 | 0 | if (canonical_labels.empty()) |
1758 | 0 | for (std::size_t i = 0; i < symmetry_classes.size(); ++i) |
1759 | 0 | canonical_labels.push_back(i+1); |
1760 | 0 | } |
1761 | | |
1762 | | |
1763 | | |
1764 | | /* -*-C++-*- |
1765 | | +====================================================================== |
1766 | | | |
1767 | | | !!!DEPRECATED!!! |
1768 | | | |
1769 | | | |
1770 | | | AUTHOR: Craig A. James, eMolecules, Inc. |
1771 | | | |
1772 | | | DESCRIPTION: CANONICALIZATION OF SMILES |
1773 | | | |
1774 | | | This is a specialized SMILES canonicalization algorithm. Although |
1775 | | | it can be applied in the standard fashion to a whole molecule, |
1776 | | | its real job is to generate canonical SMILES for fragments, or |
1777 | | | "subsets", of the atoms of a molecule. |
1778 | | | |
1779 | | | For example, consider the first three atoms of Oc1ccccc1. With |
1780 | | | a "normal" SMILES canonicalizer, you couldn't generate a SMILES |
1781 | | | for Occ, because it's not a valid molecule. However, this system |
1782 | | | can do exactly that, by taking both the whole molecule (which |
1783 | | | retains the aromaticity), and a "subset" bitmap that specifies |
1784 | | | which atoms are to be included in the SMILES. |
1785 | | | |
1786 | | | Canonicalization is carried out per Weininger et al (J. Chem. |
1787 | | | Inf. Comput. Sci., Vol. 29, No. 2, 1989, pp 97-101), with some |
1788 | | | modifications to handle bond symmetries not foreseen by Weininger |
1789 | | | in that paper. |
1790 | | | |
1791 | | | WARNING - KNOWN BUG: These functions make use of a bitmap vector |
1792 | | | to represent a "fragment" -- a subset of the atoms in a molecule. |
1793 | | | But this means the bonds of the fragment are implicit, not explicit, |
1794 | | | which is incorrect. For example, if you want to break one bond of |
1795 | | | cyclehexane (C1CCCCC1), all six atoms will still be there, so the |
1796 | | | "fragment" will be cyclic. This is relevant when generating fragment |
1797 | | | SMILES for ring systems where breaking a bond can reduce the number |
1798 | | | of ring without removing any atoms. We need to add a pair of bit |
1799 | | | vectors, the atoms AND the bonds, to represent a fragment. (Note |
1800 | | | that this is also an ambiguity in OpenBabel itself, which represents |
1801 | | | a ring as a set of atoms. This is only valid if the ring is a member |
1802 | | | of a SSSR.) |
1803 | | +====================================================================== |
1804 | | */ |
1805 | | |
1806 | | } // namespace OpenBabel |
1807 | | |
1808 | | //! \file canon.cpp |
1809 | | //! \brief Canonical numbering of SMILES, molecules and fragments |