Coverage Report

Created: 2026-01-10 06:44

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/openbabel/src/canon.cpp
Line
Count
Source
1
/**********************************************************************
2
canon.h - Canonical labeling.
3
4
  Copyright (C) 2009-2010 by Tim Vandermeersch
5
  Copyright (C) 2005-2006, eMolecules, Inc. (www.emolecules.com)
6
                           Craig A. James
7
8
This file is part of the Open Babel project.
9
For more information, see <http://openbabel.org/>
10
11
This program is free software; you can redistribute it and/or modify
12
it under the terms of the GNU General Public License as published by
13
the Free Software Foundation version 2 of the License.
14
15
This program is distributed in the hope that it will be useful,
16
but WITHOUT ANY WARRANTY; without even the implied warranty of
17
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18
GNU General Public License for more details.
19
***********************************************************************/
20
21
#include <openbabel/canon.h>
22
#include <openbabel/graphsym.h>
23
#include <openbabel/babelconfig.h>
24
#include <openbabel/mol.h>
25
#include <openbabel/atom.h>
26
#include <openbabel/bond.h>
27
#include <openbabel/obiter.h>
28
#include <openbabel/obutil.h>
29
#include <openbabel/elements.h>
30
31
#include <openbabel/stereo/cistrans.h>
32
#include <openbabel/stereo/tetrahedral.h>
33
34
#include <iterator> // std::istream_iterator
35
#include <cassert>
36
#include <algorithm>
37
#include <cmath>
38
#include <limits>
39
40
#include "stereo/stereoutil.h"
41
42
0
#define DEBUG 0
43
44
0
#define MAX_IDENTITY_NODES 50
45
46
using namespace std;
47
48
// debug function
49
template<typename T>
50
void print_vector(const std::string &label, const std::vector<T> &v)
51
{
52
  std::cout << label << ": ";
53
  for (std::size_t i = 0; i < v.size(); ++i)
54
    if (v[i] < 10)
55
      std::cout << " " << v[i] << " ";
56
    else
57
      std::cout << v[i] << " ";
58
59
  std::cout << endl;
60
}
61
62
63
namespace OpenBabel {
64
65
  static unsigned int TotalHydrogenCount(OBAtom* atom)
66
0
  {
67
0
    return atom->ExplicitHydrogenCount() + atom->GetImplicitHCount();
68
0
  }
69
70
  inline bool CompareBondPairSecond(const std::pair<OBBond*,unsigned int> &a,const std::pair<OBBond*,unsigned int> &b)
71
0
  {
72
0
    return(a.second < b.second);
73
0
  }
74
75
  /**
76
   * Helper function for getFragment below.
77
   */
78
  void addNbrs(OBBitVec &fragment, OBAtom *atom, const OBBitVec &mask, const std::vector<OBBond*> &metalloceneBonds)
79
0
  {
80
0
    FOR_NBORS_OF_ATOM (nbr, atom) {
81
0
      if (!mask.BitIsSet(nbr->GetIdx()))
82
0
        continue;
83
      // skip visited atoms
84
0
      if (fragment.BitIsSet(nbr->GetIdx()))
85
0
        continue;
86
      // skip mettalocene bonds
87
0
      if (std::find(metalloceneBonds.begin(), metalloceneBonds.end(),
88
0
            atom->GetParent()->GetBond(atom, &*nbr)) != metalloceneBonds.end())
89
0
        continue;
90
      // add the neighbor atom to the fragment
91
0
      fragment.SetBitOn(nbr->GetIdx());
92
      // recurse...
93
0
      addNbrs(fragment, &*nbr, mask, metalloceneBonds);
94
0
    }
95
0
  }
96
97
  /**
98
   * Create an OBBitVec objects with bets set for the fragment consisting of all
99
   * atoms for which there is a path to atom without going through skip. These
100
   * fragment bitvecs are indexed by atom idx (i.e. OBAtom::GetIdx()).
101
   */
102
  OBBitVec getFragment(OBAtom *atom, const OBBitVec &mask, const std::vector<OBBond*> &metalloceneBonds = std::vector<OBBond*>())
103
0
  {
104
0
    OBBitVec fragment;
105
0
    fragment.SetBitOn(atom->GetIdx());
106
    // start the recursion
107
0
    addNbrs(fragment, atom, mask, metalloceneBonds);
108
0
    return fragment;
109
0
  }
110
111
  OBBitVec getFragment(OBAtom *atom, OBAtom *skip, const OBBitVec &mask)
112
0
  {
113
0
    struct getFragmentImpl
114
0
    {
115
0
      static void addNbrs(OBBitVec &fragment, OBAtom *atom, OBAtom *skip, const OBBitVec &mask)
116
0
      {
117
0
        FOR_NBORS_OF_ATOM (nbr, atom) {
118
          // don't pass through skip
119
0
          if (nbr->GetIdx() == skip->GetIdx())
120
0
            continue;
121
          // skip visited atoms
122
0
          if (fragment.BitIsSet(nbr->GetIdx()))
123
0
            continue;
124
0
          if (!mask.BitIsSet(nbr->GetIdx()))
125
0
            continue;
126
          // add the neighbor atom to the fragment
127
0
          fragment.SetBitOn(nbr->GetIdx());
128
          // recurse...
129
0
          addNbrs(fragment, &*nbr, skip, mask);
130
0
        }
131
0
      }
132
0
    };
133
134
0
    OBBitVec fragment;
135
0
    fragment.SetBitOn(atom->GetIdx());
136
    // start the recursion
137
0
    getFragmentImpl::addNbrs(fragment, atom, skip, mask);
138
0
    return fragment;
139
0
  }
140
141
142
143
  bool isFerroceneBond(OBBond *bond)
144
0
  {
145
0
    if (bond->GetBondOrder() != 1)
146
0
      return false;
147
148
0
    OBAtom *Fe = nullptr, *C = nullptr;
149
150
0
    OBAtom *begin = bond->GetBeginAtom();
151
0
    if (begin->GetAtomicNum() == 26)
152
0
      Fe = begin;
153
0
    if (begin->GetAtomicNum() == 6)
154
0
      C = begin;
155
156
0
    OBAtom *end = bond->GetEndAtom();
157
0
    if (end->GetAtomicNum() == 26)
158
0
      Fe = end;
159
0
    if (end->GetAtomicNum() == 6)
160
0
      C = end;
161
162
0
    if (!Fe || !C)
163
0
      return false;
164
165
0
    if (Fe->GetExplicitDegree() < 10)
166
0
      return false;
167
168
0
    return C->HasDoubleBond() && C->IsInRing();
169
0
  }
170
171
  void findMetalloceneBonds(std::vector<OBBond*> &bonds, OBMol *mol, const std::vector<unsigned int> &symmetry_classes)
172
0
  {
173
0
    FOR_ATOMS_OF_MOL (atom, mol) {
174
0
      if (!atom->IsInRingSize(3))
175
0
        continue;
176
0
      std::vector<unsigned int> nbrSymClasses;
177
0
      FOR_NBORS_OF_ATOM (nbr, &*atom) {
178
0
        if (nbr->IsInRingSize(3))
179
0
          nbrSymClasses.push_back(symmetry_classes[nbr->GetIndex()]);
180
0
      }
181
182
0
      if (nbrSymClasses.size() < 8)
183
0
        continue;
184
185
0
      std::sort(nbrSymClasses.begin(), nbrSymClasses.end());
186
0
      unsigned int numUnique = std::unique(nbrSymClasses.begin(), nbrSymClasses.end()) - nbrSymClasses.begin();
187
0
      if (numUnique > 1)
188
0
        continue;
189
190
0
      FOR_NBORS_OF_ATOM (nbr, &*atom)
191
0
        bonds.push_back(mol->GetBond(&*atom, &*nbr));
192
0
    }
193
0
  }
194
195
196
  /**
197
   * @page canonical_code_algorithm Canonical Coding Algorithm
198
   *
199
   * @section canonical_introduction Introduction
200
   * The aim of a canonical coding algorithm is to assign unique labels to
201
   * atoms regardless of their input order. In practise, these labels or orders
202
   * are atom indexes resulting from the order in which the atoms are defined
203
   * in an input file. Although most chemical file formats could be used to
204
   * store canonical ordered molecules, canonical single line notations are used
205
   * more often since they allow two canonical molecules to be compared using a
206
   * string comparison. Two well known examples are canonical smiles and inchi.
207
   * While the canonical smiles for the same molecule should always be the same
208
   * when using a specific implementation (i.e. toolkits), these canonical
209
   * smiles are usually not transferable between implementations. There is only
210
   * one inchi implementation and the canonical code, although not formally
211
   * specified is always the same. A typical use case for canonical codes is to
212
   * determine if a molecule is already in a database.
213
   *
214
   * The rest of this page documents the OpenBabel canonical coding algorithm.
215
   *
216
   * @section Topological Symmetry
217
   * The starting point of the canonical coding algorithm is the topological
218
   * symmetry or graph symmetry. This symmetry can be expressed by assigning
219
   * symmetry classes to atoms. Symmetric atoms have the same symmetry class.
220
   * The term color is often used in graph theory as a synonym for symmetry
221
   * classes.
222
   *
223
   * Symmetry classes are used as graph invariants to order atoms. The symmetry
224
   * classes can be used in two ways:
225
   *
226
   * - lowest symmetry class = lowest label (i.e. 1)
227
   * - highest symmetry class = lowest label
228
   *
229
   * Both methods are valid. OpenBabel uses the second method. In the following
230
   * sections there are more of these decisions that have to be made. The first,
231
   * choice is used as example to illustrate that it doesn't make any real
232
   * difference as long as the same choice is used every time. When trying to
233
   * reproduce OpenBabel canonical smiles the same choices or conventions have
234
   * to be used.
235
   *
236
   * The used initial graph invariant is specified below and the iteration
237
   * algorithm is similar to the original Morgan algorithm. The OBGraphSym class
238
   * should be consulted for a detailed description.
239
     @verbatim
240
          0                   1                   2                   3
241
          1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2
242
     GI = D D D D D D D D D D D V V V V A R N N N N N N N B B B B C C C C
243
244
     GI: Graph invariant (32 bits)
245
     D: Graph theoretical distance (10 bits)
246
     V: Heavy valence (4 bits)
247
     A: Aromaticity (1 bit)
248
     R: Ring atom (1 bit)
249
     N: Atomic number (7 bits)
250
     B: Heavy bond sum (4 bits)
251
     C: Formal charge + 7 (4 bits)
252
     @endverbatim
253
   *
254
   * (FIXME: ugly smiles)
255
   *
256
   * The symmetry classes encode various attributes for each atom. Consult the
257
   * OBGraphSym documentation for details. (note: when trying to reproduce
258
   * OpenBabel canonical smiles, make sure these graph invariants are the same)
259
   *
260
   * @section canonical_labeling Labeling
261
   * The labeling starts by selecting the initial atom using a set of rules to
262
   * ensure it starts at terminal atoms. (FIXME) Label 1 is assigned to this atom
263
   * and the labeling continues using this atom as the current atom:
264
   *
265
   * -# Find the neighbor atoms of the current atom to label next. Already labeled
266
   *   and atoms not in the fragment are ignored. If there are no such atoms, go
267
   *   to 3.
268
   * -# Sort the neighbors by their symmetry classes. Each group of atoms with the
269
   *    same symmetry class is now handled in sequence. If there is only one atom
270
   *    in the group, this atom is labeled and the process continues with the next
271
   *    group. If there are multiple (n) atoms in a group, all n! permutations of
272
   *    the group are tried. (note: The Optimization section below mentions some
273
   *    exceptions which affect the final result. These optimizations are part of
274
   *    the algorithm)
275
   * -# When all neighbor atoms of the current atom are labeled, the next atom
276
   *    (i.e. the atom with current_label+1 label) is considered the current atom
277
   *    and step 1 through 3 are repeated. If there is no next atom, the labeling
278
   *    is done.
279
   *
280
   * The algorithm just described results in a set of labellings that are candidates
281
   * for the canonical labeling. For each labeling, a code is generated to select
282
   * the canonical code and the associated labels.
283
   *
284
   * @section canonical_code Canonical Code
285
   * A canonical code can be anything encoding the molecular graph with all the
286
   * properties that should be canonicalized. The code should be invariant to the
287
   * original atom order. At this point, it may seem that this is similar to the
288
   * graph invariants from above (i.e. symmetry classes). However, generating these
289
   * codes allows for additional attributes to be included. A simple example are
290
   * isotopes. More importantly, stereochemistry is included in these codes which
291
   * is an attribute that does not depend an isolated atom.
292
   *
293
   * The actual code is a list of numbers (unsigned int) constructed by joining
294
   * smaller codes. These smaller codes usually encode specific attributes.
295
   *
296
   * @subsection canonical_from FROM code
297
   * When labeling the molecule as described earlier, all but the initial atom in
298
   * a connected fragment have a "from" atom (i.e. current atom in the algorithm).
299
   * The FROM code contains the label of the "from" atom, for each atom (ignoring
300
   * the initial atom) ordered by their assigned label. The FROM code is a
301
   * spanning tree containing all bonds that are not ring closures.
302
   *
303
   * @subsection canonical_closure CLOSURE code
304
   * The ring closure bonds are easily identified while creating the FROM code.
305
   * The CLOSURE code contains two labels for each closure bond. The labels are
306
   * sorted in the following way:
307
   *
308
   * - [1 6] < [6 1] (i.e. 1 6 is used, individual bond sorting)
309
   * - [1 3][1 4] < [1 4][1 3] (i.e. 1 3 1 4 is used, sorting bonds)
310
   * - [1 3][5 7] < [5 7][1 3]
311
   *
312
   * The square brackets are only to indicate individual bonds, the code is just
313
   * a single list.
314
   *
315
   * @subsection canonical_atomtype ATOM-TYPES & ISOTOPES code
316
   * The ATOM-TYPES code is simply the atomic number for each atom ordered by
317
   * labels. If the molecule contains isotopes, an additional ISOTOPES code is
318
   * used to correctly canonicalize these molecules.
319
   *
320
   * @subsection canonical_bondtype BOND-TYPES code
321
   * The BOND-TYPES code contains, for each bond, the bond order or 5 if the
322
   * bond is aromatic. The bonds are ordered using the order obtained during
323
   * the FROM code construction. The closure bonds are last, ordered as
324
   * described in the CLOSURE code section.
325
   *
326
   * @subsection canonical_stereo STEREO code
327
   * The STEREO code contains a descriptor for each stereo center. The stereo
328
   * centers are ordered using the central atom(s) (i.e. center for tetrahedral
329
   * stereochemistry, lowest label for cistrans).
330
   *
331
   * @subsection canonical_canonical Canonical Code
332
   * The canonical code is the greatest code (elements are compared, the first
333
   * difference determines order). The complete canonical code can also be seen
334
   * as layers. The first layer is the FROM code together with the CLOSURE code.
335
   * Together these codes describe the graph without node (atom) or edge (bond)
336
   * properties. The ATOM-TYPES code is the layer adding node properties. The
337
   * optional ISOTOPES code also encodes node properties. Edge properties are
338
   * specified in the BOND-TYPES code. The previous codes together are enough
339
   * to specify the constitution of a molecule. Additional layers such as the
340
   * STEREO code could be ignored for certain use cases. For example, if two
341
   * molecules have the same canonical code up to the STEREO part, they are
342
   * stereoisomers.
343
   *
344
   * @section canonical_fragments Disconnected Fragments
345
   * Disconnected fragments are handled individually and the resulting canonical
346
   * codes are sorted. The final canonical code for the whole molecule, are the
347
   * sorted canonical codes joined together.
348
   *
349
   * @section canonical_optimization Optimizations
350
   * This section documents optimizations to reduce the number of states which
351
   * affect the final result. The aim of these optimizations is to avoid the n!
352
   * permutations in step 2 of the labeling algorithm. Some of the optimizations
353
   * affect the final result and should be implemented when trying to reproduce
354
   * the same canonical code.
355
   *
356
   * @subsection canonical_opt1 Optimization 1
357
   * If the current atom is not in a ring, it follows that the bonds to all it's
358
   * neighbors are not ring bonds. It also follows that there is no path between
359
   * two neighbor atoms with the same symmetry class without going through the
360
   * current atom. Therefore, these ligands (i.e. the fragments that would be
361
   * created by deleting the current atom) can be labeled individually. For
362
   * example, for an atom with 4 neighbors with the same symmetry class, the
363
   * number of permutations to label is reduced from 4! * 4 = 96 to 4. The times
364
   * 4 is because after generating the 24 permutations, the ligands are not
365
   * labeled yet.
366
   *
367
   * @note This optimization <b>affects the final result</b>.
368
   *
369
   * @subsection canonical_opt2 Optimization 2 (not implemented)
370
   * If the bond(s) to the neighbor atom(s) with the same symmetry classes is not
371
   * a ring bond, optimization 1 could be used. This is the more general version
372
   * of optimization 1.
373
   *
374
   * @note This optimization <b>affects the final result</b> but is <b>not
375
   * implemented</b>.
376
   *
377
   * @subsection canonical_opt3 Optimization 3 (not implemented)
378
   * If the current atom is a stereo center, only permutations with the highest
379
   * descriptor produce a greatest canonical code. This does not change the final
380
   * result and can be implemented without changing canonical smiles etc.
381
   *
382
   * @subsection canonical_opt4 Other optimizations
383
   * In essence, the canonical coding algorithm is very similar to the algorithm
384
   * from the well-known nauty package [1, 2]. However, the optimizations mentioned
385
   * above are far easier to implement, document and result in acceptable
386
   * performance for most structures. This enables anyone to reproduce OpenBabel
387
   * canonical smiles without the absolute requirement to implement the complex
388
   * optimizations from nauty. Since the optimizations from the nauty paper do
389
   * not have any affect on the final result, these can be implemented as required.
390
   *
391
   * The OpenBabel algorithm uses the molecule graph as the main object while
392
   * searching for a canonical code. Nauty starts from a partition of the set
393
   * of nodes which is refined. This is not the essential part though. The
394
   * similarity between the algorithms is that both algorithms share the
395
   * concept of a search tree. A completed canonical candidate code is a leaf
396
   * node in this search tree. If a second labeling is found with the same
397
   * canonical code, there exists a permutation mapping the first labels to
398
   * the second. This permutation is an automorphism that takes all attributes
399
   * of the canonical code into account. The found automorphism is not only
400
   * an automorphism of the molecule but also an isomorphism of the search tree.
401
   * These accidentally discovered automorphisms can be used to reduce the
402
   * number of states to consider in various ways.
403
   *
404
   * @subsubsection canonical_opt5 Optimization 4
405
   *
406
   *
407
   *
408
   *
409
   *
410
     @verbatim
411
     [1] Brendan D. McKay, Backtrack programming and isomorphism rejection on
412
         ordered subsets, ARS Combinatoria, Vol. 5, 1979, pp. 65-99
413
         http://cs.anu.edu.au/~bdm/papers/backtrack1978.pdf
414
     [2] Brendan D. McKay, Practical graph isomorphism, Congressus Numerantium,
415
         Vol. 30, 1981, pp. 45-87
416
         http://cs.anu.edu.au/~bdm/papers/pgi.pdf
417
     @endverbatim
418
   */
419
420
  struct CanonicalLabelsImpl
421
  {
422
    typedef std::vector<OBAtom*> Orbit;
423
    typedef std::vector<Orbit> Orbits;
424
425
    static void print_orbits(const Orbits &orbits)
426
0
    {
427
0
      for (std::size_t j = 0; j < orbits.size(); ++j) {
428
0
        cout << "( ";
429
0
        for (std::size_t k = 0; k < orbits[j].size(); ++k)
430
0
          cout << orbits[j][k]->GetIndex() << " ";
431
0
        cout << ") ";
432
0
      }
433
0
    }
434
    static void print_orbits(const std::string &label, const Orbits &orbits)
435
0
    {
436
0
      cout << label << ": ";
437
0
      print_orbits(orbits);
438
0
      cout << endl;
439
0
    }
440
441
442
    /**
443
     * Structure to represent a completed labeling and it's associated canonical
444
     * candidate code.
445
     */
446
    struct FullCode
447
    {
448
      /**
449
       * The atom labels starting from 1. All excluded atoms have label 0.
450
       */
451
      std::vector<unsigned int> labels;
452
      /**
453
       * The canonical candidate code resulting from the @p labels.
454
       */
455
      std::vector<unsigned short> code;
456
457
      /**
458
       * Default constructor, used to create initial empty bestCode object.
459
       */
460
      FullCode()
461
0
      {
462
0
      }
463
464
      /**
465
       * Constructor specifying both the @p labels and @p from (the FROM part
466
       * of the canonical code). The other parts of the canonical code are
467
       * added using the CompleteCode function.
468
       */
469
      FullCode(const std::vector<unsigned int> &_labels, const std::vector<unsigned short> &from)
470
0
          : labels(_labels), code(from)
471
0
      {
472
0
      }
473
474
      /**
475
       * Compare this object's canonical candidate code to the other object's
476
       * code and return true if this code is greater. This method is used to
477
       * select the final canonical code. In other words, the canonical labels
478
       * for a molecule are the labels with the greatest canonical code.
479
       */
480
      inline bool operator>(const FullCode &other) const
481
0
      {
482
0
        return (code > other.code);
483
0
      }
484
    };
485
486
    /**
487
     * Structure to represent a partial labeling and the associated FROM part
488
     * of the canonical code.
489
     */
490
    struct PartialCode
491
    {
492
      /**
493
       * The atoms in canonical order. An atom is added when a label is assigned
494
       * to it. When all atoms are labeled, this vector will contain all atoms
495
       * in the fragment.
496
       */
497
      std::vector<OBAtom*> atoms;
498
      /**
499
       * The bonds in canonical order. Since the atoms are labeled from another
500
       * atom ("from"), these are the bonds connecting the "from" atom with the
501
       * next labeled atom. The FROM list forms a spanning tree that does not
502
       * include the ring closure bonds. These ring closure bonds are added at
503
       * the end of this vector, in the correct order, in the CompleteCode
504
       * function.
505
       */
506
      std::vector<OBBond*> bonds;
507
      /**
508
       * The FROM part of the canonical code. This is a vector containing the
509
       * label of the "from" atom for the atoms in canonical order. The initial
510
       * atom of a bonded fragment is not included in this list since there is
511
       * no "from" atom.
512
       */
513
      std::vector<unsigned short> from;
514
      /**
515
       * The atom labels starting from 1. All atoms start with label 0.
516
       */
517
      std::vector<unsigned int> labels;
518
519
      /**
520
       * Constructor specifying the number of atoms in the molecule. The @p
521
       * numAtoms are used to initialize the labels vector to contain label
522
       * 0 for all atoms.
523
       */
524
      PartialCode(std::size_t numAtoms)
525
0
      {
526
0
        labels.resize(numAtoms, 0);
527
0
      }
528
529
      /**
530
       * Add an atom to atoms. Used to add the initial atom.
531
       */
532
0
      inline void add(OBAtom *atom) { atoms.push_back(atom); }
533
      /**
534
       * Add a bond to bonds. Used to add the ring closure bonds.
535
       */
536
0
      inline void add(OBBond *bond) { bonds.push_back(bond); }
537
      /**
538
       * Add an atom that is labeled "from" another atom. This function adds
539
       * the @p atom to atoms, adds bond between @p fromAtom and @p atom to
540
       * bonds and updates the @p from vector.
541
       */
542
      inline void add(OBAtom *fromAtom, OBAtom *atom)
543
0
      {
544
0
        from.push_back(labels[fromAtom->GetIndex()]);
545
0
        atoms.push_back(atom);
546
0
        bonds.push_back(atom->GetParent()->GetBond(fromAtom, atom));
547
0
      }
548
549
      /**
550
       * Compare the @p from vector with a FullCode's code and return true if
551
       * this code is lower. This function is used to avoid completing a
552
       * labeling that will never result in a greatest canonical code.
553
       */
554
      inline bool operator<(const FullCode &other) const
555
0
      {
556
0
        std::size_t numFrom = std::min(from.size(), other.code.size());
557
0
        for (std::size_t i = 0; i < numFrom; ++i) {
558
0
          if (from[i] > other.code[i])
559
0
            return false;
560
0
          if (from[i] < other.code[i])
561
0
            return true;
562
0
        }
563
564
0
        return false;
565
0
      }
566
    };
567
568
    /**
569
     * Structure to store and compute stereo center parities. Currently for
570
     * both tetrahedral and cistrans stereochemistry. Precomputing these
571
     * StereoCenter objects is done for performance reasons since parities
572
     * need to be computed for each canonical candidate labeling.
573
     */
574
    struct StereoCenter
575
    {
576
      /**
577
       * Use the stored nbrIndexes to access the @p labels and compute a parity
578
       * or descriptor. This function returns 0 or 1 for both tetrahedral and
579
       * cistrans stereo centers.
580
       */
581
      int getDescriptor(const std::vector<unsigned int> &symmetry_classes, const std::vector<unsigned int> &labels) const
582
0
      {
583
        // Unspecified stereo centers have their own descriptor.
584
0
        if (nbrIndexes1.empty())
585
0
          return 2;
586
0
        std::vector<unsigned long> refs1, refs2;
587
0
        for (std::size_t i = 0; i < nbrIndexes1.size(); ++i) {
588
0
          if (nbrIndexes1[i] < labels.size())
589
0
            refs1.push_back(labels[nbrIndexes1[i]]);
590
0
          else
591
0
            refs1.push_back(nbrIndexes1[i]);
592
0
        }
593
0
        for (std::size_t i = 0; i < nbrIndexes2.size(); ++i)
594
0
          if (nbrIndexes2[i] < labels.size())
595
0
            refs2.push_back(labels[nbrIndexes2[i]]);
596
0
          else
597
0
            refs2.push_back(nbrIndexes2[i]);
598
0
        if (indexes.size() == 2) {
599
0
          bool symOrder = symmetry_classes[indexes[0]] < symmetry_classes[indexes[1]];
600
0
          bool canOrder = labels[indexes[0]] < labels[indexes[1]];
601
0
          if (symOrder != canOrder && symmetry_classes[indexes[0]] != symmetry_classes[indexes[1]])
602
0
            std::swap(refs1[0], refs1[1]);
603
0
        }
604
0
        return ((OBStereo::NumInversions(refs1) % 2 + OBStereo::NumInversions(refs2) % 2) % 2);
605
0
      }
606
      /**
607
       * Index(es) for the stereo center. This is used to sort the stereo
608
       * centers by label. Tetrahedral stereo centers have one index in this
609
       * vector and cistrans stereocenters have both double bond atom indexes
610
       * in this vector.
611
       */
612
      std::vector<unsigned int> indexes;
613
      /**
614
       * Indexes for the stereo center neighbor atoms. Tetrahedral centers have
615
       * all neighbor atoms in nbrIndexes1. CisTrans stereo centers store the
616
       * neighbor atoms for each double bond atom separately.
617
       */
618
      std::vector<unsigned int> nbrIndexes1, nbrIndexes2;
619
    };
620
621
    /**
622
     * Sort StereoCenter objects by their label. Used in combination with
623
     * std::sort to create STEREO code.
624
     */
625
    struct SortStereoCenters
626
    {
627
      const std::vector<unsigned int> &labels;
628
0
      SortStereoCenters(const std::vector<unsigned int> &_labels) : labels(_labels) {}
629
      inline unsigned int getLabel(const StereoCenter &c) const
630
0
      {
631
0
        switch (c.indexes.size()) {
632
0
          case 2:
633
0
            return std::min(labels[c.indexes[0]], labels[c.indexes[1]]);
634
0
          default:
635
0
            return labels[c.indexes[0]];
636
0
        }
637
0
      }
638
      inline bool operator()(const StereoCenter &c1, const StereoCenter &c2) const
639
0
      {
640
0
        return (getLabel(c1) < getLabel(c2));
641
0
      }
642
    };
643
644
    /**
645
     * Sort FullCode objects by their code. Used in combination with
646
     * std::sort order disconnected fragments.
647
     */
648
    static bool SortCode(const FullCode &code1, const FullCode &code2)
649
0
    {
650
0
      return (code1.code < code2.code);
651
0
    }
652
653
    /**
654
     * Sort FullCode objects with associated int in std::pair. Used to sort
655
     * ligands (i.e. equivalent fragments connected to a symmetric atom).
656
     */
657
    static bool SortCode2(const std::pair<int, FullCode> &code1, const std::pair<int, FullCode> &code2)
658
0
    {
659
0
      return (code1.second.code > code2.second.code);
660
0
    }
661
662
    /**
663
     * Sort atoms by ascending ranks (e.g. labels).
664
     */
665
    struct SortAtomsAscending
666
    {
667
0
      SortAtomsAscending(const std::vector<unsigned int> &_ranks) : ranks(_ranks) {}
668
      const std::vector<unsigned int> &ranks;
669
      inline bool operator()(const OBAtom *a1, const OBAtom *a2) const
670
0
      {
671
0
        return ranks[a1->GetIndex()] < ranks[a2->GetIndex()];
672
0
      }
673
    };
674
675
    /**
676
     * Sort atoms by descending ranks (e.g. symmetry classes)
677
     */
678
    struct SortAtomsDescending
679
    {
680
0
      SortAtomsDescending(const std::vector<unsigned int> &_ranks) : ranks(_ranks) {}
681
      const std::vector<unsigned int> &ranks;
682
      inline bool operator()(const OBAtom *a1, const OBAtom *a2) const
683
0
      {
684
0
        return ranks[a1->GetIndex()] > ranks[a2->GetIndex()];
685
0
      }
686
    };
687
688
    /**
689
     * Structure used while labeling a single connected fragment. All input is
690
     * specified using the constructor and @p code is generated as a result of
691
     * assigning labels.
692
     */
693
    struct State
694
    {
695
      State(const std::vector<unsigned int> &_symmetry_classes,
696
            const OBBitVec &_fragment, std::vector<StereoCenter> &_stereoCenters,
697
            std::vector<FullCode> &_identityCodes, Orbits &_orbits, OBBitVec &_mcr,
698
0
            bool _onlyOne) : symmetry_classes(_symmetry_classes), fragment(_fragment),
699
0
          onlyOne(_onlyOne), stereoCenters(_stereoCenters),
700
0
          code(_symmetry_classes.size()), identityCodes(_identityCodes),
701
0
          backtrackDepth(0), orbits(_orbits), mcr(_mcr)
702
0
      {
703
0
        mcr.Clear();
704
0
        if (mcr.IsEmpty())
705
0
          for (std::size_t i = 0; i < symmetry_classes.size(); ++i)
706
0
            mcr.SetBitOn(i+1);
707
0
      }
708
709
      const std::vector<unsigned int> &symmetry_classes;
710
      /**
711
       * The connected fragment. This is a subset of the mask.
712
       */
713
      const OBBitVec &fragment;
714
      const bool onlyOne;
715
716
      /**
717
       * The pre-computed stereo centers. Non-const since it needs to be
718
       * sorted.
719
       */
720
      std::vector<StereoCenter> &stereoCenters;
721
      /**
722
       * The partial code with labels and FROM code.
723
       */
724
      PartialCode code;
725
726
      /**
727
       * Identity nodes of the search tree.
728
       */
729
      std::vector<FullCode> identityCodes;
730
      unsigned int backtrackDepth;
731
      Orbits orbits;
732
      OBBitVec &mcr;
733
    };
734
735
    /**
736
     * Simple struct to manage timeout.
737
     */
738
    struct Timeout
739
    {
740
0
      Timeout(time_t _maxTime) : maxTime(_maxTime)
741
0
      {
742
0
        startTime = time(nullptr);
743
0
      }
744
      time_t startTime, maxTime;
745
    };
746
747
748
    /**
749
     * Given a complete labeling (@p state.code.labels) and FROM code (@p state.code.from),
750
     * construct a FullCode with complete code.
751
     */
752
    static void CompleteCode(OBMol *mol, FullCode &fullcode, State &state)
753
0
    {
754
0
      PartialCode &code = state.code;
755
756
      // initialize the FullCode object with the found labels and the from code
757
0
      fullcode = FullCode(code.labels, code.from);
758
0
      unsigned int numClosures = 0;
759
760
      //
761
      // the RING-CLOSURE list
762
      //
763
0
      unsigned int current_label = 1; // the second atom is always labeled from 1
764
      // iterate over all atoms in the order they are labeled
765
      // (this ensures [1 3] < [3 1])
766
0
      for (std::size_t j = 0; j < code.atoms.size(); ++j) {
767
0
        OBAtom *atom = code.atoms[j];
768
        // still need to sort [1 3] and [1 4]
769
0
        std::vector<std::pair<OBBond*, unsigned int> > closures;
770
0
        FOR_BONDS_OF_ATOM (bond, atom) {
771
          // skip atoms not in the fragment
772
0
          if (!state.fragment.BitIsSet(bond->GetNbrAtom(atom)->GetIdx()))
773
0
            continue;
774
          // a closure bond is a bond not found while generating the FROM spanning tree.
775
0
          if (std::find(code.bonds.begin(), code.bonds.end(), &*bond) == code.bonds.end()) {
776
0
            closures.push_back(std::make_pair(&*bond, code.labels[bond->GetNbrAtom(atom)->GetIndex()]));
777
0
          }
778
0
        }
779
780
        // do the sorting: [1 3] < [1 4]
781
0
        std::sort(closures.begin(), closures.end(), CompareBondPairSecond);
782
783
0
        for (std::size_t k = 0; k < closures.size(); ++k) {
784
          // add the closure bond to the code
785
0
          fullcode.code.push_back(current_label);
786
0
          fullcode.code.push_back(closures[k].second);
787
          // add the bond to the list (needed for BOND-TYPES below)
788
0
          code.add(closures[k].first);
789
0
          numClosures++;
790
0
        }
791
792
0
        current_label++;
793
0
      }
794
795
      // Isotopes are only considered if there are isotopes.
796
0
      bool hasIsotope = false;
797
      // Charges are only considered if there is a formal charge.
798
0
      bool hasCharge = false;
799
800
      //
801
      // the ATOM-TYPES list
802
      //
803
0
      OBStereoFacade facade(mol);
804
0
      for (std::size_t j = 0; j < code.atoms.size(); ++j) {
805
0
        OBAtom *atom = code.atoms[j];
806
0
        if (atom->GetIsotope())
807
0
          hasIsotope = true;
808
0
        if (atom->GetFormalCharge())
809
0
          hasCharge = true;
810
811
        // Include all hydrogens
812
0
        int hydrogens_to_include = TotalHydrogenCount(atom);
813
814
0
        unsigned int c = 10000 * atom->GetSpinMultiplicity() +
815
0
                          1000 * hydrogens_to_include +
816
0
                                 atom->GetAtomicNum();
817
818
        // add the atomic number to the code
819
0
        fullcode.code.push_back(c);
820
0
      }
821
822
      //
823
      // the (optional) ISOTOPES list
824
      //
825
0
      if (hasIsotope)
826
0
        for (std::size_t j = 0; j < code.atoms.size(); ++j)
827
0
          fullcode.code.push_back(code.atoms[j]->GetIsotope());
828
829
      //
830
      // the (optional) CHARGES list
831
      //
832
0
      if (hasCharge)
833
0
        for (std::size_t j = 0; j < code.atoms.size(); ++j)
834
0
          fullcode.code.push_back(7 + code.atoms[j]->GetFormalCharge());
835
836
      //
837
      // the BOND-TYPES list
838
      //
839
0
      for (std::size_t j = 0; j < code.bonds.size(); ++j) {
840
0
        OBBond *bond = code.bonds[j];
841
0
        if (bond->IsAromatic())
842
0
          fullcode.code.push_back(5);
843
0
        else
844
0
          fullcode.code.push_back(bond->GetBondOrder());
845
0
      }
846
847
      // the STEREO flag
848
0
      if (state.stereoCenters.size()) {
849
        // sort the stereo centers
850
0
        std::sort(state.stereoCenters.begin(), state.stereoCenters.end(), SortStereoCenters(code.labels));
851
852
0
        for (std::size_t i = 0; i < state.stereoCenters.size(); ++i) {
853
0
          bool isInFragment = false;
854
0
          for (std::size_t j = 0; j < state.stereoCenters[i].indexes.size(); ++j)
855
0
            if (state.fragment.BitIsSet(state.stereoCenters[i].indexes[j]+1))
856
0
              isInFragment = true;
857
          // ignore stereo centers not in this fragment
858
0
          if (isInFragment)
859
0
            fullcode.code.push_back(state.stereoCenters[i].getDescriptor(state.symmetry_classes, code.labels));
860
0
        }
861
0
      }
862
863
      // backtrack
864
0
      for (unsigned int i = 0; i < numClosures; ++i)
865
0
        code.bonds.pop_back();
866
0
    }
867
868
    /**
869
     * This function implements optimization 1 as described above.
870
     */
871
    static void LabelFragments(OBAtom *current, std::vector<OBAtom*> &nbrs, unsigned int label, Timeout &timeout, FullCode &bestCode, State &state)
872
0
    {
873
0
      OBMol *mol = current->GetParent();
874
0
      PartialCode &code = state.code;
875
876
      // Sort the neighbor atoms according to symmetry class.
877
0
      std::sort(nbrs.begin(), nbrs.end(), SortAtomsDescending(state.symmetry_classes));
878
879
      // Count the number of unique neighbor symmetry classes.
880
0
      unsigned int numUnique = 1;
881
0
      unsigned int lastSymClass = state.symmetry_classes[nbrs[0]->GetIndex()];
882
0
      for (std::size_t i = 0; i < nbrs.size(); ++i) {
883
0
        unsigned int symClass = state.symmetry_classes[nbrs[i]->GetIndex()];
884
0
        if (symClass != lastSymClass)
885
0
          numUnique++;
886
0
        lastSymClass = state.symmetry_classes[nbrs[i]->GetIndex()];
887
0
      }
888
889
0
      if (numUnique < nbrs.size()) {
890
        // The canonical codes for the equivalent ligands
891
0
        std::vector< std::pair<int, CanonicalLabelsImpl::FullCode> > lcodes;
892
893
0
        std::vector<unsigned int> ligandSizes;
894
0
        for (std::size_t i = 0; i < nbrs.size(); ++i) {
895
0
          OBBitVec ligand = getFragment(nbrs[i], current, state.fragment);
896
0
          ligandSizes.push_back(ligand.CountBits());
897
898
0
          CanonicalLabelsImpl::FullCode lbestCode;
899
0
          if (ligandSizes.back() == 1) {
900
            // Avoid additional state creation if the neighbor is a single terminal atom.
901
0
            lbestCode.code.push_back(nbrs[i]->GetAtomicNum());
902
0
            lbestCode.labels.resize(state.symmetry_classes.size(), 0);
903
0
            lbestCode.labels[nbrs[i]->GetIndex()] = 1;
904
0
          } else if (ligandSizes.back() == 2) {
905
            // Avoid additional state creation if the neighbor is a fragment of 2 atoms.
906
0
            lbestCode.labels.resize(state.symmetry_classes.size(), 0);
907
0
            lbestCode.code.push_back(1); // FROM
908
0
            lbestCode.code.push_back(nbrs[i]->GetAtomicNum()); // ATOM-TYPES 1
909
0
            lbestCode.labels[nbrs[i]->GetIndex()] = 1;
910
0
            FOR_NBORS_OF_ATOM (nbr, nbrs[i]) {
911
0
              if (!state.fragment.BitIsSet(nbr->GetIdx()))
912
0
                continue;
913
0
              if (code.labels[nbr->GetIndex()])
914
0
                continue;
915
0
              lbestCode.code.push_back(nbr->GetAtomicNum()); // ATOM-TYPES 2
916
0
              OBBond *bond = mol->GetBond(nbrs[i], &*nbr);
917
0
              if (bond->IsAromatic())
918
0
                lbestCode.code.push_back(5);
919
0
              else
920
0
                lbestCode.code.push_back(bond->GetBondOrder()); // BOND-TYPES 1
921
0
              lbestCode.labels[nbr->GetIndex()] = 2;
922
0
            }
923
0
          } else {
924
            // Start labeling from the ligand atom.
925
0
            std::vector<CanonicalLabelsImpl::FullCode> identityCodes;
926
0
            Orbits orbits;
927
0
            OBBitVec mcr;
928
0
            State lstate(state.symmetry_classes, ligand, state.stereoCenters, identityCodes, orbits, mcr, state.onlyOne);
929
0
            lstate.code.add(nbrs[i]);
930
0
            lstate.code.labels[nbrs[i]->GetIndex()] = 1;
931
0
            CanonicalLabelsRecursive(nbrs[i], 1, timeout, lbestCode, lstate);
932
0
          }
933
934
          // Store the canonical code (and labels) for the ligand.
935
0
          lcodes.push_back(std::make_pair(i, lbestCode));
936
0
        }
937
938
        // Sort the codes for the fragments. Each neighbor symmetry class is sorted separately.
939
0
        unsigned int firstIndex = 0;
940
0
        lastSymClass = state.symmetry_classes[nbrs[0]->GetIndex()];
941
0
        for (std::size_t i = 1; i < nbrs.size(); ++i) {
942
0
          unsigned int symClass = state.symmetry_classes[nbrs[i]->GetIndex()];
943
0
          if (symClass != lastSymClass) {
944
0
            std::sort(lcodes.begin() + firstIndex, lcodes.begin() + i, SortCode2);
945
0
            firstIndex = i;
946
0
          }
947
0
          lastSymClass = state.symmetry_classes[nbrs[i]->GetIndex()];
948
0
        }
949
        // Make sure to sort the last set too.
950
0
        std::sort(lcodes.begin() + firstIndex, lcodes.end(), SortCode2);
951
952
        // Label the neighbor atoms by updating code.
953
0
        std::vector<OBAtom*> atoms;
954
0
        unsigned int nextLbl = label + 1;
955
0
        for (std::size_t l = 0; l < lcodes.size(); ++l) {
956
          //print_vector("LIG CODE", lcodes[l].second.code);
957
0
          OBAtom *atom = nbrs[lcodes[l].first];
958
0
          code.add(current, atom);
959
0
          code.labels[atom->GetIndex()] = nextLbl;
960
0
          atoms.push_back(atom);
961
0
          nextLbl++;
962
0
        }
963
964
        // Convert the labels from the ligands to labels in the whole fragment.
965
0
        unsigned int ligandSize = *std::max_element(ligandSizes.begin(), ligandSizes.end());
966
0
        for (unsigned int lbl = 1; lbl < ligandSize; ++lbl) {
967
0
          for (std::size_t l = 0; l < lcodes.size(); ++l) {
968
0
            if (lbl >= ligandSizes[lcodes[l].first])
969
0
              continue;
970
971
0
            OBAtom *atom = nullptr;
972
0
            for (std::size_t i = 0; i < mol->NumAtoms(); ++i)
973
0
              if (lcodes[l].second.labels[i] == lbl) {
974
0
                atom = mol->GetAtom(i+1);
975
0
                break;
976
0
              }
977
978
0
            if (!atom)
979
0
              continue;
980
981
0
            std::vector<OBAtom*> atomNbrs;
982
0
            FOR_NBORS_OF_ATOM (nbr, atom) {
983
0
              if (lcodes[l].second.labels[nbr->GetIndex()] > lbl) {
984
0
                if (std::find(atoms.begin(), atoms.end(), &*nbr) != atoms.end())
985
0
                  continue;
986
0
                atomNbrs.push_back(&*nbr);
987
0
              }
988
0
            }
989
990
0
            std::sort(atomNbrs.begin(), atomNbrs.end(), SortAtomsAscending(lcodes[l].second.labels));
991
992
0
            for (std::size_t i = 0; i < atomNbrs.size(); ++i) {
993
0
              code.add(atom, atomNbrs[i]);
994
0
              code.labels[atomNbrs[i]->GetIndex()] = nextLbl;
995
0
              atoms.push_back(atomNbrs[i]);
996
0
              nextLbl++;
997
0
            }
998
0
          }
999
0
        }
1000
1001
        // Recurse...
1002
0
        CanonicalLabelsRecursive(current, nextLbl - 1, timeout, bestCode, state);
1003
1004
        // Backtrack.
1005
0
        for (std::size_t j = 0; j < atoms.size(); ++j) {
1006
0
          code.atoms.pop_back();
1007
0
          code.bonds.pop_back();
1008
0
          code.from.pop_back();
1009
0
          code.labels[atoms[j]->GetIndex()] = 0;
1010
0
        }
1011
0
      } else {
1012
        // There are no duplicated symmetry classes for the neighbor atoms.
1013
        // Label each neighbor atom in sorted sequence.
1014
0
        unsigned int lbl = label;
1015
0
        for (std::size_t i = 0; i < nbrs.size(); ++i) {
1016
0
          lbl++;
1017
0
          code.add(current, nbrs[i]);
1018
0
          code.labels[nbrs[i]->GetIndex()] = lbl;
1019
0
        }
1020
1021
        // Recurse...
1022
0
        CanonicalLabelsRecursive(current, lbl, timeout, bestCode, state);
1023
1024
        // Backtrack.
1025
0
        for (std::size_t i = 0; i < nbrs.size(); ++i) {
1026
0
          code.atoms.pop_back();
1027
0
          code.bonds.pop_back();
1028
0
          code.from.pop_back();
1029
0
          code.labels[nbrs[i]->GetIndex()] = 0;
1030
0
        }
1031
0
      }
1032
0
    }
1033
1034
    static void FindOrbits(Orbits &orbits, OBMol *mol, const std::vector<unsigned int> &labels1,
1035
        const std::vector<unsigned int> &labels2)
1036
0
    {
1037
0
      Orbits newOrbits;
1038
0
      std::vector<bool> visited(labels1.size(), false);
1039
1040
      //print_vector("labels1", labels1);
1041
      //print_vector("labels2", labels2);
1042
1043
      // Construct the orbits for automorphic permutation labels1 -> labels2
1044
0
      for (std::size_t i = 0; i < labels1.size(); ++i) {
1045
0
        if (visited[i])
1046
0
          continue;
1047
0
        unsigned int vi = labels1[i];
1048
0
        if (vi == labels2[i])
1049
0
          continue;
1050
        //cout << "( ";
1051
0
        std::size_t j = i;
1052
0
        unsigned int vj = labels2[j];
1053
        //cout << j << " ";
1054
0
        newOrbits.resize(newOrbits.size()+1);
1055
0
        newOrbits.back().push_back(mol->GetAtom(j+1));
1056
0
        visited[i] = true;
1057
0
        while (vi != vj) {
1058
0
          for (std::size_t k = i; k < labels1.size(); ++k) {
1059
0
            if (vj == labels1[k]) {
1060
0
              j = k;
1061
0
              vj =  labels2[j];
1062
              //cout << j << " ";
1063
0
              newOrbits.back().push_back(mol->GetAtom(j+1));
1064
0
              visited[j] = true;
1065
0
              break;
1066
0
            }
1067
0
          }
1068
0
        }
1069
        //cout << ") ";
1070
0
      }
1071
      //cout << endl;
1072
1073
      //print_orbits("newOrbits", newOrbits);
1074
1075
      // Merge the orbits with previously found orbits.
1076
0
      for (std::size_t j = 0; j < newOrbits.size(); ++j) {
1077
0
        bool merge = false;
1078
0
        for (std::size_t k = 0; k < orbits.size(); ++k) {
1079
1080
0
          for (std::size_t l = 0; l < newOrbits[j].size(); ++l) {
1081
0
            if (std::find(orbits[k].begin(), orbits[k].end(), newOrbits[j][l]) != orbits[k].end())
1082
0
              merge = true;
1083
0
          }
1084
1085
0
          if (merge) {
1086
0
            for (std::size_t l = 0; l < newOrbits[j].size(); ++l)
1087
0
              if (std::find(orbits[k].begin(), orbits[k].end(), newOrbits[j][l]) == orbits[k].end())
1088
0
                orbits[k].push_back(newOrbits[j][l]);
1089
0
            break;
1090
0
          }
1091
0
        }
1092
1093
0
        if (!merge)
1094
0
          orbits.push_back(newOrbits[j]);
1095
0
      }
1096
1097
//      print_orbits("orbits", orbits);
1098
1099
0
      newOrbits.clear();
1100
0
      std::vector<bool> vivisted(orbits.size(), false);
1101
0
      for (std::size_t i = 0; i < orbits.size(); ++i) {
1102
0
        if (visited[i])
1103
0
          continue;
1104
0
        visited[i] = true;
1105
1106
0
        Orbit newOrbit = orbits[i];
1107
1108
0
        for (std::size_t j = i; j < orbits.size(); ++j) {
1109
0
          if (visited[j])
1110
0
            continue;
1111
1112
0
          std::sort(newOrbit.begin(), newOrbit.end());
1113
0
          std::sort(orbits[j].begin(), orbits[j].end());
1114
0
          Orbit result;
1115
1116
0
          std::set_intersection(newOrbit.begin(), newOrbit.end(),
1117
0
                                orbits[j].begin(), orbits[j].end(),
1118
0
                                std::back_inserter(result));
1119
0
          if (result.empty())
1120
0
            continue;
1121
1122
0
          visited[j] = true;
1123
0
          result.clear();
1124
0
          std::set_union(newOrbit.begin(), newOrbit.end(),
1125
0
                         orbits[j].begin(), orbits[j].end(),
1126
0
                         std::back_inserter(result));
1127
1128
0
          newOrbit = result;
1129
0
        }
1130
1131
0
        newOrbits.push_back(newOrbit);
1132
0
      }
1133
1134
0
      orbits = newOrbits;
1135
0
    }
1136
1137
    /**
1138
     * Update the minimum cell representations (mcr).
1139
     */
1140
    static void UpdateMcr(OBBitVec &mcr, Orbits &orbits, const std::vector<unsigned int> &bestLabels)
1141
0
    {
1142
      //print_orbits("UpdateMcr", orbits);
1143
1144
0
      for (std::size_t i = 0; i < bestLabels.size(); ++i)
1145
0
        mcr.SetBitOn(i+1);
1146
1147
0
      for (std::size_t j = 0; j < orbits.size(); ++j) {
1148
0
        std::sort(orbits[j].begin(), orbits[j].end(), SortAtomsAscending(bestLabels));
1149
1150
0
        for (std::size_t k = 0; k < orbits[j].size(); ++k)
1151
0
          if (k)
1152
0
            mcr.SetBitOff(orbits[j][k]->GetIdx());
1153
0
      }
1154
0
    }
1155
1156
1157
    /**
1158
     * This is the recursive function implementing the labeling algorithm
1159
     * outlined above (steps 1-3). This function works for single connected
1160
     * fragments and uses @p state to hold most information. The @p bestCode
1161
     * is the current greatest code (if any). given the @p current atom and
1162
     * the last @p label, this function labels the neighbor atoms with
1163
     * label+1, label+2, ...
1164
     */
1165
    static void CanonicalLabelsRecursive(OBAtom *current, unsigned int label, Timeout &timeout, FullCode &bestCode, State &state)
1166
0
    {
1167
0
      OBMol *mol = current->GetParent();
1168
0
      PartialCode &code = state.code;
1169
1170
0
      if (state.backtrackDepth) {
1171
        //std::cout << "backtrackDepth = " << state.backtrackDepth << std::endl;
1172
1173
0
        if (code.atoms.size() > state.backtrackDepth) {
1174
          //std::cout << "BACKTRACKING" << std::endl;
1175
0
          return;
1176
0
        }
1177
0
        if (code.atoms.size() == state.backtrackDepth) {
1178
          //std::cout << "BACKTRACK DONE 1" << std::endl;
1179
0
          state.backtrackDepth = 0;
1180
0
          return;
1181
0
        } else
1182
0
        if (code.atoms.size() < state.backtrackDepth) {
1183
          //std::cout << "BACKTRACK DONE 2" << std::endl;
1184
0
          state.backtrackDepth = 0;
1185
0
        }
1186
0
      }
1187
1188
1189
      // Check if there is a full mapping.
1190
0
      if (label == state.fragment.CountBits()) {
1191
        // Complete the canonical code.
1192
0
        FullCode fullcode;
1193
0
        CompleteCode(mol, fullcode, state);
1194
1195
        //print_vector("TERMINAL", fullcode.code);
1196
1197
        // Check previously found codes to find redundant subtrees.
1198
0
        for (std::size_t i = state.identityCodes.size(); i > 0; --i)
1199
0
          if (fullcode.code == state.identityCodes[i-1].code) {
1200
            //
1201
            // An explicit automorphism has been found.
1202
            //
1203
0
            std::vector<unsigned int> v1(fullcode.labels.size(), 0);
1204
0
            for (std::size_t j = 0; j < fullcode.labels.size(); ++j)
1205
0
              if (fullcode.labels[j])
1206
0
                v1[fullcode.labels[j]-1] = j + 1;
1207
1208
0
            std::vector<unsigned int> v2(state.identityCodes[i-1].labels.size(), 0);
1209
0
            for (std::size_t j = 0; j < state.identityCodes[i-1].labels.size(); ++j)
1210
0
              if (state.identityCodes[i-1].labels[j])
1211
0
                v2[state.identityCodes[i-1].labels[j]-1] = j + 1;
1212
1213
0
            state.backtrackDepth = 0;
1214
0
            for (std::size_t j = 0; j < v1.size(); ++j) {
1215
0
              if (v1[j] != v2[j]) {
1216
                // Yield backtrackDepth.
1217
0
                return;
1218
0
              }
1219
1220
0
              if (v1[j])
1221
0
                state.backtrackDepth++;
1222
0
            }
1223
0
          }
1224
1225
0
        if (fullcode.code == bestCode.code) {
1226
0
          UpdateMcr(state.mcr, state.orbits, bestCode.labels);
1227
0
          FindOrbits(state.orbits, mol, fullcode.labels, bestCode.labels);
1228
0
        } else if (fullcode > bestCode) {
1229
          // if fullcode is greater than bestCode, we have found a new greatest code
1230
0
          bestCode = fullcode;
1231
0
        }
1232
1233
1234
0
        if (state.identityCodes.size() < MAX_IDENTITY_NODES) {
1235
0
          state.identityCodes.push_back(FullCode());
1236
0
          state.identityCodes.back().labels.swap(fullcode.labels);
1237
0
          state.identityCodes.back().code.swap(fullcode.code);
1238
0
        } else {
1239
0
          state.identityCodes[MAX_IDENTITY_NODES-1].labels.swap(fullcode.labels);
1240
0
          state.identityCodes[MAX_IDENTITY_NODES-1].code.swap(fullcode.code);
1241
0
        }
1242
1243
1244
0
        return;
1245
0
      }
1246
1247
      // Avoid endless loops.
1248
0
      if (time(nullptr) - timeout.startTime > timeout.maxTime) {
1249
0
        return;
1250
0
      }
1251
1252
      // If there is a bestCode and only one labeling is required, return.
1253
0
      if (state.onlyOne && !bestCode.code.empty())
1254
0
        return;
1255
1256
      // Abort early if this will not lead to a greatest canonical code. The
1257
      // code.from vector is compared with the elements in the bestCode.
1258
0
      if (code < bestCode)
1259
0
        return;
1260
1261
      // Find the neighbors of the current atom to assign the next label(s).
1262
0
      std::vector<OBAtom*> nbrs;
1263
0
      std::vector<unsigned int> nbrSymClasses;
1264
1265
0
      FOR_NBORS_OF_ATOM (nbr, current) {
1266
        // Skip atoms not in the fragment.
1267
0
        if (!state.fragment.BitIsSet(nbr->GetIdx()))
1268
0
          continue;
1269
        // Skip already labeled atoms.
1270
0
        if (code.labels[nbr->GetIndex()])
1271
0
          continue;
1272
1273
0
        OBBond *bond = mol->GetBond(current, &*nbr);
1274
        // Ugly, but it helps...                                        // <--- not documented! need better approach to handle this (not only ferrocene)...
1275
0
        if (!isFerroceneBond(bond)) {
1276
0
          nbrSymClasses.push_back(state.symmetry_classes[nbr->GetIndex()]);
1277
0
          nbrs.push_back(&*nbr);
1278
0
        }
1279
0
      }
1280
1281
0
      if (nbrs.empty()) {
1282
        // If there are no neighbor atoms to label, recurse with the next
1283
        // current atom.
1284
0
        unsigned int nextLabel = code.labels[current->GetIndex()] + 1;
1285
0
        for (std::size_t i = 0; i < code.labels.size(); ++i) {
1286
0
          if (code.labels[i] == nextLabel) {
1287
0
            CanonicalLabelsRecursive(mol->GetAtom(i+1), label, timeout, bestCode, state);
1288
0
            return;
1289
0
          }
1290
0
        }
1291
0
        return;
1292
0
      }
1293
1294
0
      if (!current->IsInRing()) {
1295
        // Optimization to avoid generating the n! permutations. If the current
1296
        // atom is not in a ring, the neighbor atoms with the same symmetry
1297
        // are not connected and it is possible to label without considering
1298
        // the rest of the molecule. The found ligand labels can be sorted and
1299
        // the labels can be calculated using offsets.
1300
        //
1301
        // This modifies the final canonical code since it produces a different
1302
        // canonical order but although different, the code is still canonical.
1303
0
        LabelFragments(current, nbrs, label, timeout, bestCode, state);
1304
0
      } else {
1305
        // Create all possible labelings for the neighbors.
1306
        //
1307
        // Example: The current atom has 4 neighbors to label.
1308
        //
1309
        // neighbor atom   : 1 2 3 4
1310
        // symmetry classes: 3 2 2 1
1311
        //
1312
        // The first neighbor with symmetry class 3 is added to allOrderedNbrs.
1313
        //
1314
        // allOrderedNbrs = [ [1] ]
1315
        //
1316
        // The two neighbor atoms with symmetry class 2 are added next. All
1317
        // permutations are added (2 in this case).
1318
        //
1319
        // allOrderedNbrs = [ [1,2,3], [1,3,2] ]
1320
        //
1321
        // The last atom is similar to the first one.
1322
        //
1323
        // allOrderedNbrs = [ [1,2,3,4], [1,3,2,4] ]
1324
        //
1325
        // Note: If there are atoms with a large number of neighbor atoms
1326
        // with the same symmetry class (n), this can result in a large number
1327
        // of permutations (n!).
1328
0
        std::vector<std::vector<OBAtom*> > allOrderedNbrs(1);
1329
0
        while (!nbrs.empty()) {
1330
1331
          // Select the next nbr atoms with highest symmetry classes.
1332
0
          unsigned int maxSymClass = *std::min_element(nbrSymClasses.begin(), nbrSymClasses.end());
1333
0
          std::vector<OBAtom*> finalNbrs;
1334
0
          for (std::size_t i = 0; i < nbrs.size(); ++i) {
1335
0
            if (nbrSymClasses[i] == maxSymClass)
1336
0
              finalNbrs.push_back(nbrs[i]);
1337
0
          }
1338
1339
          // Remove the selected atoms from nbrs and nbrSymClasses (this could be made more efficient)
1340
0
          for (std::size_t i = 0; i < finalNbrs.size(); ++i) {
1341
0
            nbrs.erase(std::find(nbrs.begin(), nbrs.end(), finalNbrs[i]));
1342
0
            nbrSymClasses.erase(std::find(nbrSymClasses.begin(), nbrSymClasses.end(), maxSymClass));
1343
0
          }
1344
1345
1346
0
          if (finalNbrs.size() == 1) {
1347
            // If there is only one atom with the same symmetry class, label it
1348
            // and select the next group of neighbor atoms with the same symmetry
1349
            // class.
1350
0
            for (std::size_t i = 0; i < allOrderedNbrs.size(); ++i)
1351
0
              allOrderedNbrs[i].push_back(finalNbrs[0]);
1352
0
          } else {
1353
            // Sort the atoms lexicographically.
1354
0
            std::sort(finalNbrs.begin(), finalNbrs.end());
1355
1356
            // Copy the current labelings for the neighbor atoms.
1357
0
            std::vector<std::vector<OBAtom*> > allOrderedNbrsCopy(allOrderedNbrs);
1358
1359
            // Add the first permutation for the neighbor atoms.
1360
//            if (state.mcr.BitIsSet(finalNbrs[0]->GetIdx()))
1361
0
              for (std::size_t j = 0; j < allOrderedNbrs.size(); ++j) {
1362
0
                for (std::size_t i = 0; i < finalNbrs.size(); ++i) {
1363
0
                  allOrderedNbrs[j].push_back(finalNbrs[i]);
1364
0
                }
1365
0
              }
1366
1367
            // Add the other permutations.
1368
0
            while (std::next_permutation(finalNbrs.begin(), finalNbrs.end())) {
1369
0
              if (state.mcr.BitIsSet(finalNbrs[0]->GetIdx()))
1370
0
                for (std::size_t j = 0; j < allOrderedNbrsCopy.size(); ++j) {
1371
0
                  allOrderedNbrs.push_back(allOrderedNbrsCopy[j]);
1372
0
                  for (std::size_t i = 0; i < finalNbrs.size(); ++i)
1373
0
                    allOrderedNbrs.back().push_back(finalNbrs[i]);
1374
0
                }
1375
0
            }
1376
1377
0
          } // finalNbrs.size() != 1
1378
0
        } // while (!nbrs.empty())
1379
1380
0
        if (DEBUG) {
1381
0
          cout << "allOrderedNbrs:" << endl;
1382
0
          for (std::size_t i = 0; i < allOrderedNbrs.size(); ++i) {
1383
0
            for (std::size_t j = 0; j < allOrderedNbrs[i].size(); ++j) {
1384
0
              cout << allOrderedNbrs[i][j]->GetIndex() << " ";
1385
0
            }
1386
0
            cout << endl;
1387
0
          }
1388
0
        }
1389
1390
0
        for (std::size_t i = 0; i < allOrderedNbrs.size(); ++i) {
1391
          // Convert the order stored in allOrderedNbrs to labels.
1392
0
          unsigned int lbl = label;
1393
0
          for (std::size_t j = 0; j < allOrderedNbrs[i].size(); ++j) {
1394
0
            lbl++;
1395
0
            code.add(current, allOrderedNbrs[i][j]);
1396
0
            code.labels[allOrderedNbrs[i][j]->GetIndex()] = lbl;
1397
0
          }
1398
1399
          // Recurse...
1400
0
          CanonicalLabelsRecursive(current, lbl, timeout, bestCode, state);
1401
1402
          // Backtrack...
1403
0
          for (std::size_t j = 0; j < allOrderedNbrs[i].size(); ++j) {
1404
0
            code.atoms.pop_back();
1405
0
            code.bonds.pop_back();
1406
0
            code.from.pop_back();
1407
0
            code.labels[allOrderedNbrs[i][j]->GetIndex()] = 0;
1408
0
          }
1409
1410
          // Optimization
1411
0
          if (state.backtrackDepth) {
1412
0
            if (code.atoms.size() <= state.backtrackDepth) {
1413
              //std::cout << "BACKTRACK DONE 3" << std::endl;
1414
0
              state.backtrackDepth = 0;
1415
0
            }
1416
0
          }
1417
1418
0
        }
1419
1420
0
      } // current->IsInRing()
1421
1422
0
    }
1423
1424
    /**
1425
     * Select an initial atom from a fragment to assign the first label.
1426
     */
1427
    static std::vector<OBAtom*> findStartAtoms(OBMol *mol, const OBBitVec &fragment, const std::vector<unsigned int> &symmetry_classes)
1428
0
    {
1429
      // find the a symmetry class in the fragment using criteria
1430
0
      std::vector<unsigned int> ranks;
1431
0
      for (std::size_t i = 0; i < mol->NumAtoms(); ++i) {
1432
0
        if (!fragment.BitIsSet(i+1))
1433
0
          continue;
1434
1435
0
        OBAtom *atom = mol->GetAtom(i+1);
1436
0
        unsigned int rank = 10000 * symmetry_classes[i]  +
1437
0
                             1000 * atom->GetSpinMultiplicity() +
1438
0
                               10 * (atom->GetFormalCharge() + 7) +
1439
0
                                    TotalHydrogenCount(atom);
1440
1441
0
        ranks.push_back(rank);
1442
0
      }
1443
1444
0
      unsigned int lowestRank = *std::min_element(ranks.begin(), ranks.end());
1445
1446
0
      std::vector<OBAtom*> result;
1447
0
      for (std::size_t i = 0; i < mol->NumAtoms(); ++i) {
1448
0
        if (!fragment.BitIsSet(i+1))
1449
0
          continue;
1450
1451
0
        OBAtom *atom = mol->GetAtom(i+1);
1452
0
        unsigned int rank = 10000 * symmetry_classes[i]  +
1453
0
                             1000 * atom->GetSpinMultiplicity() +
1454
0
                               10 * (atom->GetFormalCharge() + 7) +
1455
0
                                    TotalHydrogenCount(atom);
1456
1457
0
        if (rank == lowestRank)
1458
0
          result.push_back(atom);
1459
0
      }
1460
1461
0
      return result;
1462
0
    }
1463
1464
    /**
1465
     * Compute the canonical labels for @p mol. This function handles a whole molecule with
1466
     * disconnected fragments. It finds a canonical code for each fragment, sorts these codes
1467
     * and computes labels for the molecule as a whole.
1468
     *
1469
     * This is the CanonicalLabelsImpl entry point.
1470
     */
1471
    static void CalcCanonicalLabels(OBMol *mol, const std::vector<unsigned int> &symmetry_classes,
1472
        std::vector<unsigned int> &canonical_labels, const OBStereoUnitSet &stereoUnits, const OBBitVec &mask,
1473
        OBStereoFacade *stereoFacade, int maxSeconds, bool onlyOne = false)
1474
0
    {
1475
      // Handle some special cases.
1476
0
      if (!mol->NumAtoms())
1477
0
        return;
1478
0
      if (mol->NumAtoms() == 1) {
1479
0
        canonical_labels.resize(1, 1);
1480
0
        return;
1481
0
      }
1482
1483
      // Set all labels to 0.
1484
0
      canonical_labels.clear();
1485
0
      canonical_labels.resize(mol->NumAtoms(), 0);
1486
1487
0
      std::vector<OBBond*> metalloceneBonds;
1488
0
      findMetalloceneBonds(metalloceneBonds, mol, symmetry_classes);
1489
1490
      // Find the (dis)connected fragments.
1491
0
      OBBitVec visited;
1492
0
      std::vector<OBBitVec> fragments;
1493
0
      for (std::size_t i = 0; i < mol->NumAtoms(); ++i) {
1494
0
        if (!mask.BitIsSet(i+1) || visited.BitIsSet(i+1))
1495
0
          continue;
1496
0
        fragments.push_back(getFragment(mol->GetAtom(i+1), mask, metalloceneBonds));
1497
0
        visited |= fragments.back();
1498
0
      }
1499
1500
      // Pre-compute the stereo center information. (See StereoCenter)
1501
0
      std::vector<StereoCenter> stereoCenters;
1502
0
      if (stereoFacade) {
1503
0
        for (std::size_t i = 0; i < stereoUnits.size(); ++i) {
1504
0
          const OBStereoUnit &unit = stereoUnits[i];
1505
1506
0
          if (unit.type == OBStereo::Tetrahedral) {
1507
0
            OBAtom *atom = mol->GetAtomById(unit.id);
1508
0
            if (!atom)
1509
0
              continue;
1510
            // Add the StereoCenter indexes.
1511
0
            stereoCenters.resize(stereoCenters.size()+1);
1512
0
            stereoCenters.back().indexes.push_back(atom->GetIndex());
1513
1514
0
            if (!stereoFacade->HasTetrahedralStereo(unit.id))
1515
0
              continue;
1516
0
            OBTetrahedralStereo::Config config = stereoFacade->GetTetrahedralStereo(unit.id)->GetConfig();
1517
0
            if (!config.specified)
1518
0
              continue;
1519
1520
            // Add the neighbor atom indexes.
1521
0
            OBAtom *from = mol->GetAtomById(config.from);
1522
0
            if (from && from->GetAtomicNum() != OBElements::Hydrogen)
1523
0
              stereoCenters.back().nbrIndexes1.push_back(from->GetIndex());
1524
0
            else
1525
0
              stereoCenters.back().nbrIndexes1.push_back(std::numeric_limits<unsigned int>::max());
1526
0
            for (std::size_t j = 0; j < config.refs.size(); ++j) {
1527
0
              OBAtom *ref = mol->GetAtomById(config.refs[j]);
1528
0
              if (ref && ref->GetAtomicNum() != OBElements::Hydrogen)
1529
0
                stereoCenters.back().nbrIndexes1.push_back(ref->GetIndex());
1530
0
              else
1531
0
                stereoCenters.back().nbrIndexes1.push_back(std::numeric_limits<unsigned int>::max());
1532
0
            }
1533
0
          } else if (unit.type == OBStereo::CisTrans) {
1534
0
            OBBond *bond = mol->GetBondById(unit.id);
1535
0
            if (!bond || bond->IsAromatic())
1536
0
              continue;
1537
0
            OBAtom *begin = bond->GetBeginAtom();
1538
0
            OBAtom *end = bond->GetEndAtom();
1539
0
            if (!begin || !end)
1540
0
              continue;
1541
            // Add the StereoCenter indexes.
1542
0
            stereoCenters.resize(stereoCenters.size()+1);
1543
0
            stereoCenters.back().indexes.push_back(begin->GetIndex());
1544
0
            stereoCenters.back().indexes.push_back(end->GetIndex());
1545
1546
0
            if (!stereoFacade->HasCisTransStereo(unit.id))
1547
0
              continue;
1548
0
            OBCisTransStereo::Config config = stereoFacade->GetCisTransStereo(unit.id)->GetConfig();
1549
0
            if (!config.specified)
1550
0
              continue;
1551
1552
            // Add the neighbor atom indexes.
1553
0
            for (std::size_t j = 0; j < config.refs.size(); ++j) {
1554
0
              OBAtom *ref = mol->GetAtomById(config.refs[j]);
1555
0
              unsigned int r = (ref && ref->GetAtomicNum() != OBElements::Hydrogen) ? ref->GetIndex() : std::numeric_limits<unsigned int>::max();
1556
0
              if (stereoCenters.back().nbrIndexes1.size() < 2)
1557
0
                stereoCenters.back().nbrIndexes1.push_back(r);
1558
0
              else
1559
0
                stereoCenters.back().nbrIndexes2.push_back(r);
1560
0
            }
1561
0
          }
1562
0
        }
1563
0
      }
1564
1565
      // Find the canonical code for each fragment.
1566
0
      std::vector<CanonicalLabelsImpl::FullCode> fcodes;
1567
0
      for (std::size_t f = 0; f < fragments.size(); ++f) {
1568
0
        const OBBitVec &fragment = fragments[f];
1569
1570
        // Select the first atom.
1571
0
        std::vector<OBAtom*> startAtoms = findStartAtoms(mol, fragment, symmetry_classes);
1572
1573
0
        CanonicalLabelsImpl::Timeout timeout(maxSeconds);
1574
0
        CanonicalLabelsImpl::FullCode bestCode;
1575
0
        std::vector<CanonicalLabelsImpl::FullCode> identityCodes;
1576
0
        Orbits orbits;
1577
0
        OBBitVec mcr;
1578
1579
0
        for (std::size_t i = 0; i < startAtoms.size(); ++i) {
1580
0
          OBAtom *atom = startAtoms[i];
1581
1582
          // Start labeling of the fragment.
1583
0
          State state(symmetry_classes, fragment, stereoCenters, identityCodes, orbits, mcr, onlyOne);
1584
          //if (!state.mcr.BitIsSet(atom->GetIdx()) && atom->IsInRing())
1585
          //  continue;
1586
1587
0
          state.code.add(atom);
1588
0
          state.code.labels[atom->GetIndex()] = 1;
1589
0
          CanonicalLabelsRecursive(atom, 1, timeout, bestCode, state);
1590
0
        }
1591
1592
        // Throw an error if the timeout is exceeded.
1593
0
        if (time(nullptr) - timeout.startTime > timeout.maxTime) {
1594
0
          obErrorLog.ThrowError(__FUNCTION__, "maximum time exceeded...", obError);
1595
0
        }
1596
1597
        // Store the canonical code for the fragment.
1598
0
        fcodes.push_back(bestCode);
1599
0
      }
1600
1601
      // Sort the codes for the fragments.
1602
0
      std::sort(fcodes.begin(), fcodes.end(), SortCode);
1603
1604
      // Construct the full labeling from the sorted fragment labels.
1605
0
      unsigned int offset = 0;
1606
0
      for (std::size_t f = 0; f < fcodes.size(); ++f) {
1607
        //print_vector("CODE", fcodes[f].code);
1608
        //print_vector("code_labels", fcodes[f].labels);
1609
0
        if (fcodes[f].labels.size() == 0)
1610
0
          continue; // defensive programming
1611
1612
0
        unsigned int max_label = 0;
1613
0
        for (std::size_t i = 0; i < mol->NumAtoms(); ++i) {
1614
0
          if (fcodes[f].labels[i]) {
1615
0
            canonical_labels[i] = fcodes[f].labels[i] + offset;
1616
0
            max_label = std::max(max_label, canonical_labels[i]);
1617
0
          }
1618
0
        }
1619
0
        offset = max_label;
1620
0
      }
1621
1622
0
    }
1623
1624
  }; // CanonicalLabelsImpl
1625
1626
  /*
1627
   * See header for detailed description of parameters.
1628
   *
1629
   * The main purpose of this function is calling CanonicalLabelsImpl::CalcCanonicalLabels
1630
   * with the correct parameters regarding stereochemistry.
1631
   */
1632
  void CanonicalLabels(OBMol *mol, const std::vector<unsigned int> &symmetry_classes,
1633
      std::vector<unsigned int> &canonical_labels, const OBBitVec &mask,
1634
      int maxSeconds, bool onlyOne)
1635
0
  {
1636
    // make sure the mask is valid: no mask = all atoms
1637
0
    OBBitVec maskCopy(mask);
1638
0
    if (!maskCopy.CountBits())
1639
0
      FOR_ATOMS_OF_MOL (atom, mol)
1640
0
        maskCopy.SetBitOn(atom->GetIdx());
1641
1642
0
    if (onlyOne) {
1643
      // Only one labeling requested. This results in canonical labels that do not
1644
      // consider stereochemistry. Used for finding stereo centers with automorphisms.
1645
0
      CanonicalLabelsImpl::CalcCanonicalLabels(mol, symmetry_classes, canonical_labels, OBStereoUnitSet(), maskCopy, nullptr, maxSeconds, true);
1646
0
    } else {
1647
0
      std::vector<OBBond*> metalloceneBonds;
1648
0
      findMetalloceneBonds(metalloceneBonds, mol, symmetry_classes);
1649
1650
      // Check if there are specified stereo centers.
1651
0
      bool hasAtLeastOneDefined = false;
1652
0
      OBStereoFacade sf(mol, false);
1653
0
      FOR_ATOMS_OF_MOL (atom, mol) {
1654
0
        FOR_BONDS_OF_ATOM (bond, &*atom)
1655
0
        if (std::find(metalloceneBonds.begin(), metalloceneBonds.end(), &*bond) != metalloceneBonds.end())
1656
0
          continue;
1657
0
        if (sf.HasTetrahedralStereo(atom->GetId())) {
1658
0
          if (sf.GetTetrahedralStereo(atom->GetId())->GetConfig().specified) {
1659
0
            hasAtLeastOneDefined = true;
1660
0
            break;
1661
0
          }
1662
0
        }
1663
0
      }
1664
0
      FOR_BONDS_OF_MOL (bond, mol) {
1665
0
        if (sf.HasCisTransStereo(bond->GetId())) {
1666
0
          if (sf.GetCisTransStereo(bond->GetId())->GetConfig().specified) {
1667
0
            hasAtLeastOneDefined = true;
1668
0
            break;
1669
0
          }
1670
0
        }
1671
0
      }
1672
0
      if (!hasAtLeastOneDefined) {
1673
        // If there are no specified stereo centers, we don't need to find stereogenic units.
1674
0
        CanonicalLabelsImpl::CalcCanonicalLabels(mol, symmetry_classes, canonical_labels, OBStereoUnitSet(), maskCopy, nullptr, maxSeconds);
1675
0
        return;
1676
0
      }
1677
1678
      // Find the stereogenic units
1679
0
      OBStereoUnitSet stereoUnits = FindStereogenicUnits(mol, symmetry_classes);
1680
1681
      // Mark all invalid stereo data as unspecified
1682
0
      std::vector<OBGenericData*> stereoData = mol->GetAllData(OBGenericDataType::StereoData);
1683
0
      std::vector<OBGenericData*>::iterator data;
1684
0
      for (data = stereoData.begin(); data != stereoData.end(); ++data) {
1685
0
        OBStereo::Type type = ((OBStereoBase*)*data)->GetType();
1686
0
        if (type == OBStereo::Tetrahedral) {
1687
0
          OBTetrahedralStereo *ts = dynamic_cast<OBTetrahedralStereo*>(*data);
1688
0
          OBTetrahedralStereo::Config config = ts->GetConfig();
1689
0
          bool valid = true;
1690
0
          if (!ts->IsValid())
1691
0
            valid = false;
1692
0
          OBAtom *center = mol->GetAtomById(config.center);
1693
0
          if (!center)
1694
0
            valid = false;
1695
0
          else if (!isTetrahedral(center, stereoUnits))
1696
0
            valid = false;
1697
1698
0
          if (!valid) {
1699
0
            config.specified = false;
1700
0
            ts->SetConfig(config);
1701
0
          }
1702
0
        }
1703
0
        if (type == OBStereo::CisTrans) {
1704
0
          OBCisTransStereo *ct = dynamic_cast<OBCisTransStereo*>(*data);
1705
0
          OBCisTransStereo::Config config = ct->GetConfig();
1706
0
          bool valid = true;
1707
0
          if (!ct->IsValid())
1708
0
            valid = false;
1709
0
          OBAtom *beginAtom = mol->GetAtomById(config.begin);
1710
0
          OBAtom *endAtom = mol->GetAtomById(config.end);
1711
0
          if (!beginAtom || !endAtom)
1712
0
            valid = false;
1713
0
          else {
1714
0
            OBBond *bond = mol->GetBond(beginAtom, endAtom);
1715
0
            if (!bond)
1716
0
              valid = false;
1717
0
            else if (!isCisTrans(bond, stereoUnits))
1718
0
              valid = false;
1719
0
          }
1720
1721
0
          if (!valid) {
1722
0
            config.specified = false;
1723
0
            ct->SetConfig(config);
1724
0
          }
1725
0
        }
1726
0
      }
1727
1728
      // Determine stereochemistry from coordinates if needed
1729
0
      if (!mol->HasChiralityPerceived()) {
1730
0
        switch (mol->GetDimension()) {
1731
0
          case 2:
1732
0
            mol->DeleteData(OBGenericDataType::StereoData);
1733
0
            TetrahedralFrom2D(mol, stereoUnits);
1734
0
            CisTransFrom2D(mol, stereoUnits);
1735
0
            break;
1736
0
          case 3:
1737
0
            mol->DeleteData(OBGenericDataType::StereoData);
1738
0
            TetrahedralFrom3D(mol, stereoUnits);
1739
0
            CisTransFrom3D(mol, stereoUnits);
1740
0
            break;
1741
0
          default:
1742
0
            TetrahedralFrom0D(mol, stereoUnits);
1743
0
            CisTransFrom0D(mol, stereoUnits);
1744
0
            break;
1745
0
        }
1746
0
      }
1747
1748
      // The mol->DeleteData() above deleted some of the data that the OBStereoFacade sf()
1749
      // is using.  We have to create a new one.  (CJ - fixes a bug.)
1750
0
      OBStereoFacade newsf(mol, false);
1751
1752
      // Start the labeling process
1753
0
      CanonicalLabelsImpl::CalcCanonicalLabels(mol, symmetry_classes, canonical_labels, stereoUnits, maskCopy, &newsf, maxSeconds);
1754
0
    }
1755
1756
    // if the labeling failed, just return the identity labels to avoid craches
1757
0
    if (canonical_labels.empty())
1758
0
      for (std::size_t i = 0; i < symmetry_classes.size(); ++i)
1759
0
        canonical_labels.push_back(i+1);
1760
0
  }
1761
1762
1763
1764
/* -*-C++-*-
1765
+======================================================================
1766
|
1767
|                       !!!DEPRECATED!!!
1768
|
1769
|
1770
| AUTHOR: Craig A. James, eMolecules, Inc.
1771
|
1772
| DESCRIPTION: CANONICALIZATION OF SMILES
1773
|
1774
|       This is a specialized SMILES canonicalization algorithm.  Although
1775
|       it can be applied in the standard fashion to a whole molecule,
1776
|       its real job is to generate canonical SMILES for fragments, or
1777
|       "subsets", of the atoms of a molecule.
1778
|
1779
|       For example, consider the first three atoms of Oc1ccccc1.  With
1780
|       a "normal" SMILES canonicalizer, you couldn't generate a SMILES
1781
|       for Occ, because it's not a valid molecule.  However, this system
1782
|       can do exactly that, by taking both the whole molecule (which
1783
|       retains the aromaticity), and a "subset" bitmap that specifies
1784
|       which atoms are to be included in the SMILES.
1785
|
1786
|       Canonicalization is carried out per Weininger et al (J. Chem.
1787
|       Inf. Comput. Sci., Vol. 29, No. 2, 1989, pp 97-101), with some
1788
|       modifications to handle bond symmetries not foreseen by Weininger
1789
|       in that paper.
1790
|
1791
|       WARNING - KNOWN BUG: These functions make use of a bitmap vector
1792
|       to represent a "fragment" -- a subset of the atoms in a molecule.
1793
|       But this means the bonds of the fragment are implicit, not explicit,
1794
|       which is incorrect.  For example, if you want to break one bond of
1795
|       cyclehexane (C1CCCCC1), all six atoms will still be there, so the
1796
|       "fragment" will be cyclic.  This is relevant when generating fragment
1797
|       SMILES for ring systems where breaking a bond can reduce the number
1798
|       of ring without removing any atoms.  We need to add a pair of bit
1799
|       vectors, the atoms AND the bonds, to represent a fragment.  (Note
1800
|       that this is also an ambiguity in OpenBabel itself, which represents
1801
|       a ring as a set of atoms. This is only valid if the ring is a member
1802
|       of a SSSR.)
1803
+======================================================================
1804
*/
1805
1806
} // namespace OpenBabel
1807
1808
//! \file canon.cpp
1809
//! \brief Canonical numbering of SMILES, molecules and fragments