Coverage Report

Created: 2026-03-31 06:50

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/rdkit/Code/GraphMol/SmilesParse/SmilesParseOps.cpp
Line
Count
Source
1
//
2
//  Copyright (C) 2001-2022 Greg Landrum and other RDKit contributors
3
//
4
//   @@ All Rights Reserved @@
5
//  This file is part of the RDKit.
6
//  The contents are covered by the terms of the BSD license
7
//  which is included in the file license.txt, found at the root
8
//  of the RDKit source tree.
9
//
10
#include <GraphMol/RDKitBase.h>
11
#include <GraphMol/RDKitQueries.h>
12
#include <GraphMol/Canon.h>
13
#include <GraphMol/Chirality.h>
14
#include "SmilesParse.h"
15
#include "SmilesParseOps.h"
16
#include <list>
17
#include <algorithm>
18
#include <boost/dynamic_bitset.hpp>
19
#include <boost/format.hpp>
20
#include <RDGeneral/RDLog.h>
21
22
namespace SmilesParseOps {
23
using namespace RDKit;
24
25
36.0k
void ClearAtomChemicalProps(RDKit::Atom *atom) {
26
36.0k
  TEST_ASSERT(atom);
27
36.0k
  atom->setIsotope(0);
28
36.0k
  atom->setFormalCharge(0);
29
36.0k
  atom->setNumExplicitHs(0);
30
36.0k
}
31
32
779k
void CheckRingClosureBranchStatus(RDKit::Atom *atom, RDKit::RWMol *mp) {
33
  // github #786 and #1652: if the ring closure comes after a branch,
34
  // the stereochem is wrong.
35
  // This function is called while closing a branch during construction of
36
  // the molecule from SMILES and corrects for what happens when parsing odd
37
  // (and arguably wrong) SMILES constructs like:
38
  //   1) [C@@](F)1(C)CCO1
39
  //   2) C1CN[C@](O)(N)1
40
  //   3) [C@](Cl)(F)1CC[C@H](F)CC1
41
  // In the first two cases the stereochemistry at the chiral atom
42
  // needs to be reversed. In the third case the stereochemistry should be
43
  // reversed when the Cl is added, but left alone when the F is added.
44
  // We recognize these situations using the index of the chiral atom
45
  // and the degree of that chiral atom at the time the ring closure
46
  // digit is encountered during parsing.
47
  // ----------
48
  // github #1972 adds these examples:
49
  //   1) [C@@]1(Cl)(F)I.Br1    (ok)
50
  //   2) [C@@](Cl)1(F)I.Br1    (reverse)
51
  //   3) [C@@](Cl)(F)1I.Br1    (ok)
52
  //   4) [C@@](Cl)(F)(I)1.Br1  (reverse)
53
779k
  PRECONDITION(atom, "bad atom");
54
779k
  PRECONDITION(mp, "bad mol");
55
779k
  if (atom->getIdx() != mp->getNumAtoms(true) - 1 &&
56
4.53k
      (atom->getDegree() == 1 ||
57
4.16k
       (atom->getDegree() == 2 && atom->getIdx() != 0) ||
58
1.83k
       (atom->getDegree() == 3 && atom->getIdx() == 0)) &&
59
2.86k
      (atom->getChiralTag() == Atom::CHI_TETRAHEDRAL_CW ||
60
2.20k
       atom->getChiralTag() == Atom::CHI_TETRAHEDRAL_CCW)) {
61
    // std::cerr << "crcbs: " << atom->getIdx() << std::endl;
62
1.10k
    atom->invertChirality();
63
1.10k
  }
64
779k
}
65
66
548
void ReportParseError(const char *message, bool throwIt) {
67
548
  PRECONDITION(message, "bad message");
68
548
  if (!throwIt) {
69
0
    BOOST_LOG(rdErrorLog) << "SMILES Parse Error: " << message << std::endl;
70
548
  } else {
71
548
    throw SmilesParseException(message);
72
548
  }
73
548
}
74
75
9.06k
void CleanupAfterParseError(RWMol *mol) {
76
9.06k
  PRECONDITION(mol, "no molecule");
77
  // blow out any partial bonds:
78
58.3k
  for (auto markI : *mol->getBondBookmarks()) {
79
58.3k
    RWMol::BOND_PTR_LIST &bonds = markI.second;
80
459k
    for (auto &bond : bonds) {
81
459k
      delete bond;
82
459k
    }
83
58.3k
  }
84
9.06k
}
85
86
namespace {
87
61.0k
bool couldBeRingClosure(int val) { return val < 100000 && val >= 0; }
88
}  // namespace
89
//
90
// set bondOrder to Bond::IONIC to skip the formation of a bond
91
//  between the fragment and the molecule
92
//
93
void AddFragToMol(RWMol *mol, RWMol *frag, Bond::BondType bondOrder,
94
0
                  Bond::BondDir bondDir) {
95
0
  PRECONDITION(mol, "no molecule");
96
0
  PRECONDITION(frag, "no fragment");
97
0
  PRECONDITION(mol->getActiveAtom(), "no active atom");
98
0
  Atom *lastAt = mol->getActiveAtom();
99
0
  int nOrigAtoms = mol->getNumAtoms();
100
0
  int nOrigBonds = mol->getNumBonds();
101
102
  //
103
  // Add the fragment's atoms and bonds to the molecule:
104
  //
105
0
  mol->insertMol(*frag);
106
107
  //
108
  // update ring-closure order information on the added atoms:
109
  //
110
0
  for (const auto atom : frag->atoms()) {
111
0
    INT_VECT tmpVect;
112
0
    if (atom->getPropIfPresent(common_properties::_RingClosures, tmpVect)) {
113
0
      for (auto &v : tmpVect) {
114
        // if the ring closure is not already a bond, don't touch it:
115
0
        if (v >= 0) {
116
0
          v += nOrigBonds;
117
0
        }
118
0
      }
119
0
      auto newAtom = mol->getAtomWithIdx(nOrigAtoms + atom->getIdx());
120
0
      newAtom->setProp(common_properties::_RingClosures, tmpVect);
121
0
    }
122
0
  }
123
124
  //
125
  //  ses up the bond between the mol and the branch
126
  //
127
0
  if (bondOrder != Bond::IONIC) {
128
    // FIX: this is not so much with the elegance...
129
0
    auto firstAt = mol->getAtomWithIdx(nOrigAtoms);
130
0
    int atomIdx1 = firstAt->getIdx();
131
0
    int atomIdx2 = lastAt->getIdx();
132
0
    if (frag->hasBondBookmark(ci_LEADING_BOND)) {
133
      // std::cout << "found it" << std::endl;
134
0
      const ROMol::BOND_PTR_LIST &leadingBonds =
135
0
          frag->getAllBondsWithBookmark(ci_LEADING_BOND);
136
0
      for (auto leadingBond : leadingBonds) {
137
        // we've already got a bond, so just set its local info
138
        // and then add it to the molecule intact (no sense doing
139
        // any extra work).
140
0
        leadingBond->setOwningMol(mol);
141
0
        leadingBond->setEndAtomIdx(leadingBond->getBeginAtomIdx() + nOrigAtoms);
142
0
        leadingBond->setBeginAtomIdx(atomIdx2);
143
0
        mol->addBond(leadingBond, true);
144
0
      }
145
0
      mol->clearBondBookmark(ci_LEADING_BOND);
146
0
    } else {
147
      // SMARTS semantics: unspecified bonds can be single or aromatic
148
0
      if (bondOrder == Bond::UNSPECIFIED) {
149
0
        auto *newB = new QueryBond(Bond::SINGLE);
150
0
        newB->setQuery(makeSingleOrAromaticBondQuery());
151
0
        newB->setOwningMol(mol);
152
0
        newB->setBeginAtomIdx(atomIdx1);
153
0
        newB->setEndAtomIdx(atomIdx2);
154
0
        newB->setProp(RDKit::common_properties::_unspecifiedOrder, 1);
155
0
        mol->addBond(newB);
156
0
        delete newB;
157
0
      } else {
158
0
        Bond::BondType bo = bondOrder;
159
0
        if (bo == Bond::DATIVEL) {
160
0
          std::swap(atomIdx1, atomIdx2);
161
0
          bo = Bond::DATIVE;
162
0
        } else if (bo == Bond::DATIVER) {
163
0
          bo = Bond::DATIVE;
164
0
        }
165
0
        int idx = mol->addBond(atomIdx2, atomIdx1, bo) - 1;
166
0
        mol->getBondWithIdx(idx)->setBondDir(bondDir);
167
0
      }
168
0
    }
169
0
  }
170
171
  //
172
  // okay, the next thing we have to worry about is the possibility
173
  // that there might be ring opening/closing in the fragment we just
174
  // dealt with e.g. for things like C1C(C1) and C1C.C1
175
  // We deal with this by copying in the bookmarks and partial bonds
176
  // that exist in the fragment
177
  //
178
0
  for (auto atIt : *frag->getAtomBookmarks()) {
179
    // don't bother even considering bookmarks outside
180
    // the range used for cycles
181
0
    if (couldBeRingClosure(atIt.first)) {
182
0
      for (auto at2 : atIt.second) {
183
0
        int newIdx = at2->getIdx() + nOrigAtoms;
184
0
        mol->setAtomBookmark(mol->getAtomWithIdx(newIdx), atIt.first);
185
0
        while (frag->hasBondBookmark(atIt.first)) {
186
0
          Bond *b = frag->getBondWithBookmark(atIt.first);
187
0
          int atomIdx1 = b->getBeginAtomIdx() + nOrigAtoms;
188
0
          b->setOwningMol(mol);
189
0
          b->setBeginAtomIdx(atomIdx1);
190
0
          mol->setBondBookmark(b, atIt.first);
191
0
          frag->clearBondBookmark(atIt.first, b);
192
0
        }
193
0
      }
194
0
    }
195
0
  }
196
197
0
  frag->clearAllAtomBookmarks();
198
0
  frag->clearAllBondBookmarks();
199
0
};
200
201
typedef std::pair<size_t, int> SIZET_PAIR;
202
typedef std::pair<int, int> INT_PAIR;
203
template <typename T>
204
bool operator<(const std::pair<T, T> &p1, const std::pair<T, T> &p2) {
205
  return p1.first < p2.first;
206
}
207
208
//
209
// Helper function to get the SMILES bond ordering around a given atom, this is
210
// required to make sure the stereo information is correct as the storage order
211
// may not match how it is SMILES due to ring closures and implicit/missing
212
// ligands.
213
//
214
unsigned int GetBondOrdering(INT_LIST &bondOrdering, const RDKit::RWMol *mol,
215
35.2k
                             const RDKit::Atom *atom) {
216
35.2k
  PRECONDITION(mol, "no mol");
217
35.2k
  PRECONDITION(atom, "no atom");
218
219
  //
220
  // The atom is marked as chiral, set the SMILES-order of the
221
  // atom's bonds.  This is easy for non-ring-closure bonds,
222
  // because the SMILES order is determined solely by the atom
223
  // indices.  Things are trickier for ring-closure bonds, which we
224
  // need to insert into the list in a particular order
225
  //
226
35.2k
  INT_VECT ringClosures;
227
35.2k
  atom->getPropIfPresent(common_properties::_RingClosures, ringClosures);
228
229
#if 0
230
  std::cerr << "CLOSURES: ";
231
  std::copy(ringClosures.begin(), ringClosures.end(),
232
            std::ostream_iterator<int>(std::cerr, " "));
233
  std::cerr << std::endl;
234
#endif
235
35.2k
  std::list<SIZET_PAIR> neighbors;
236
  // push this atom onto the list of neighbors (we'll use this
237
  // to find our place later):
238
35.2k
  neighbors.emplace_back(atom->getIdx(), -1);
239
35.2k
  std::list<size_t> bondOrder;
240
126k
  for (auto nbrIdx : boost::make_iterator_range(mol->getAtomNeighbors(atom))) {
241
126k
    const Bond *nbrBond = mol->getBondBetweenAtoms(atom->getIdx(), nbrIdx);
242
126k
    if (std::find(ringClosures.begin(), ringClosures.end(),
243
126k
                  static_cast<int>(nbrBond->getIdx())) == ringClosures.end()) {
244
106k
      neighbors.emplace_back(nbrIdx, nbrBond->getIdx());
245
106k
    }
246
126k
  }
247
  // sort the list of non-ring-closure bonds:
248
35.2k
  neighbors.sort();
249
250
  // find the location of this atom.  it pretty much has to be
251
  // first in the list, e.g for smiles like [C@](F)(Cl)(Br)I, or
252
  // second (everything else).
253
35.2k
  auto selfPos = neighbors.begin();
254
35.2k
  if (selfPos->first != atom->getIdx()) {
255
31.3k
    ++selfPos;
256
31.3k
  }
257
35.2k
  CHECK_INVARIANT(selfPos->first == atom->getIdx(), "weird atom ordering");
258
259
  // copy over the bond ids:
260
176k
  for (auto neighborIt = neighbors.begin(); neighborIt != neighbors.end();
261
141k
       ++neighborIt) {
262
141k
    if (neighborIt != selfPos) {
263
106k
      bondOrdering.push_back(rdcast<int>(neighborIt->second));
264
106k
    } else {
265
      // we are not going to add the atom itself, but we will push on
266
      // ring closure bonds at this point (if required):
267
35.2k
      bondOrdering.insert(bondOrdering.end(), ringClosures.begin(),
268
35.2k
                          ringClosures.end());
269
35.2k
    }
270
141k
  }
271
272
35.2k
  return ringClosures.size();
273
35.2k
}
274
275
35.3k
void AdjustAtomChiralityFlags(RWMol *mol) {
276
35.3k
  PRECONDITION(mol, "no molecule");
277
6.19M
  for (auto atom : mol->atoms()) {
278
6.19M
    Atom::ChiralType chiralType = atom->getChiralTag();
279
6.19M
    if (chiralType == Atom::CHI_TETRAHEDRAL_CW ||
280
6.18M
        chiralType == Atom::CHI_TETRAHEDRAL_CCW) {
281
32.9k
      INT_LIST bondOrdering;
282
32.9k
      unsigned int numClosures = GetBondOrdering(bondOrdering, mol, atom);
283
284
      // ok, we now have the SMILES ordering of the bonds, figure out the
285
      // permutation order.
286
      //
287
      //  This whole thing is necessary because the ring-closure bonds
288
      //  in the SMILES come before the bonds to the other neighbors, but
289
      //  they come after the neighbors in the molecule we build.
290
      //  A crude example:
291
      //   in F[C@](Cl)(Br)I the C-Cl bond is index 1 in both SMILES
292
      //         and as built
293
      //   in F[C@]1(Br)I.Cl1 the C-Cl bond is index 1 in the SMILES
294
      //         and index 3 as built.
295
      //
296
32.9k
      int nSwaps = atom->getPerturbationOrder(bondOrdering);
297
      // FIX: explain this one:
298
      // At least part of what's going on here for degree 3 atoms:
299
      //   - The first part: if we're at the beginning of the SMILES and have
300
      //      an explicit H, we need to add a swap.
301
      //      This is to reflect that [C@](Cl)(F)C is equivalent to Cl[C@@](F)C
302
      //      but [C@H](Cl)(F)C is fine as-is (The H-C bond is the one you look
303
      //      down).
304
      //   - The second part is more complicated and deals with situations like
305
      //      F[C@]1CCO1. In this case we otherwise end up looking like we need
306
      //      to invert the chirality, which is bogus. The chirality here needs
307
      //      to remain @ just as it does in F[C@](Cl)CCO1
308
      //   - We have to be careful with the second part to not sweep things like
309
      //      C[S@]2(=O).Cl2 into the same bin (was github #760). We detect
310
      //      those cases by looking for unsaturated atoms
311
      //
312
32.9k
      if (Canon::chiralAtomNeedsTagInversion(
313
32.9k
              *mol, atom, atom->hasProp(common_properties::_SmilesStart),
314
32.9k
              numClosures)) {
315
5.03k
        ++nSwaps;
316
5.03k
      }
317
      // std::cerr << "nswaps " << atom->getIdx() << " " << nSwaps
318
      //           << std::endl;
319
      // std::copy(bondOrdering.begin(), bondOrdering.end(),
320
      //           std::ostream_iterator<int>(std::cerr, ", "));
321
      // std::cerr << std::endl;
322
32.9k
      if (nSwaps % 2) {
323
5.18k
        atom->invertChirality();
324
5.18k
      }
325
6.16M
    } else if (chiralType == Atom::CHI_SQUAREPLANAR ||
326
6.16M
               chiralType == Atom::CHI_TRIGONALBIPYRAMIDAL ||
327
6.15M
               chiralType == Atom::CHI_OCTAHEDRAL) {
328
2.29k
      INT_LIST bonds;
329
2.29k
      GetBondOrdering(bonds, mol, atom);
330
331
2.29k
      unsigned int ref_max = Chirality::getMaxNbors(chiralType);
332
333
      // insert (-1) for hydrogens or missing ligands, where these are placed
334
      // depends on if it is the first atom or not
335
2.29k
      if (bonds.size() < ref_max) {
336
2.24k
        if (atom->hasProp(common_properties::_SmilesStart)) {
337
478
          bonds.insert(bonds.begin(), ref_max - bonds.size(), -1);
338
1.76k
        } else {
339
1.76k
          bonds.insert(++bonds.begin(), ref_max - bonds.size(), -1);
340
1.76k
        }
341
2.24k
      }
342
343
2.29k
      atom->setProp(common_properties::_chiralPermutation,
344
2.29k
                    Chirality::getChiralPermutation(atom, bonds, true));
345
2.29k
    }
346
6.19M
  }
347
35.3k
}  // namespace SmilesParseOps
348
349
Bond::BondType GetUnspecifiedBondType(const RWMol *mol, const Atom *atom1,
350
8.22M
                                      const Atom *atom2) {
351
8.22M
  PRECONDITION(mol, "no molecule");
352
8.22M
  PRECONDITION(atom1, "no atom1");
353
8.22M
  PRECONDITION(atom2, "no atom2");
354
8.22M
  Bond::BondType res;
355
8.22M
  if (atom1->getIsAromatic() && atom2->getIsAromatic()) {
356
3.64M
    res = Bond::AROMATIC;
357
4.58M
  } else {
358
4.58M
    res = Bond::SINGLE;
359
4.58M
  }
360
8.22M
  return res;
361
8.22M
}
362
35.3k
void SetUnspecifiedBondTypes(RWMol *mol) {
363
35.3k
  PRECONDITION(mol, "no molecule");
364
6.18M
  for (auto bond : mol->bonds()) {
365
6.18M
    if (bond->hasProp(RDKit::common_properties::_unspecifiedOrder)) {
366
550k
      bond->setBondType(GetUnspecifiedBondType(mol, bond->getBeginAtom(),
367
550k
                                               bond->getEndAtom()));
368
550k
      if (bond->getBondType() == Bond::AROMATIC) {
369
337k
        bond->setIsAromatic(true);
370
337k
      } else {
371
212k
        bond->setIsAromatic(false);
372
212k
      }
373
550k
    }
374
6.18M
  }
375
35.3k
}
376
377
namespace {
378
160k
void swapBondDirIfNeeded(Bond *bond1, const Bond *bond2) {
379
160k
  PRECONDITION(bond1, "bad bond1");
380
160k
  PRECONDITION(bond2, "bad bond2");
381
160k
  if (bond1->getBondDir() == Bond::NONE && bond2->getBondDir() != Bond::NONE) {
382
6.02k
    bond1->setBondDir(bond2->getBondDir());
383
6.02k
    if (bond1->getBeginAtom() != bond2->getBeginAtom()) {
384
4.82k
      switch (bond1->getBondDir()) {
385
3.60k
        case Bond::ENDDOWNRIGHT:
386
3.60k
          bond1->setBondDir(Bond::ENDUPRIGHT);
387
3.60k
          break;
388
1.21k
        case Bond::ENDUPRIGHT:
389
1.21k
          bond1->setBondDir(Bond::ENDDOWNRIGHT);
390
1.21k
          break;
391
0
        default:
392
0
          break;
393
4.82k
      }
394
4.82k
    }
395
6.02k
  }
396
160k
}
397
}  // namespace
398
399
static const std::map<int, int> permutationLimits = {
400
    {RDKit::Atom::ChiralType::CHI_TETRAHEDRAL, 2},
401
    {RDKit::Atom::ChiralType::CHI_ALLENE, 2},
402
    {RDKit::Atom::ChiralType::CHI_SQUAREPLANAR, 3},
403
    {RDKit::Atom::ChiralType::CHI_OCTAHEDRAL, 30},
404
    {RDKit::Atom::ChiralType::CHI_TRIGONALBIPYRAMIDAL, 20}};
405
406
3.69k
bool checkChiralPermutation(int chiralTag, int permutation) {
407
3.69k
  if (chiralTag > RDKit::Atom::ChiralType::CHI_OTHER &&
408
3.69k
      permutationLimits.find(chiralTag) != permutationLimits.end() &&
409
3.69k
      (permutation < 0 || permutation > permutationLimits.at(chiralTag))) {
410
7
    return false;
411
7
  }
412
3.69k
  return true;
413
3.69k
}
414
415
35.4k
void CheckChiralitySpecifications(RDKit::RWMol *mol, bool strict) {
416
35.4k
  PRECONDITION(mol, "no molecule");
417
6.19M
  for (const auto atom : mol->atoms()) {
418
6.19M
    int permutation;
419
6.19M
    if (atom->getChiralTag() > RDKit::Atom::ChiralType::CHI_OTHER &&
420
3.73k
        permutationLimits.find(atom->getChiralTag()) !=
421
3.73k
            permutationLimits.end() &&
422
3.73k
        atom->getPropIfPresent(common_properties::_chiralPermutation,
423
3.73k
                               permutation)) {
424
3.69k
      if (!checkChiralPermutation(atom->getChiralTag(), permutation)) {
425
7
        std::string error =
426
7
            (boost::format("Invalid chiral specification on atom %d") %
427
7
             atom->getIdx())
428
7
                .str();
429
7
        BOOST_LOG(rdWarningLog) << error << std::endl;
430
7
        if (strict) {
431
7
          throw SmilesParseException(error);
432
7
        }
433
7
      }
434
      // directly convert @TH1 -> @ and @TH2 -> @@
435
3.69k
      if (atom->getChiralTag() == RDKit::Atom::ChiralType::CHI_TETRAHEDRAL) {
436
1.42k
        if (permutation == 0 || permutation == 1) {
437
1.40k
          atom->setChiralTag(RDKit::Atom::ChiralType::CHI_TETRAHEDRAL_CCW);
438
1.40k
          atom->clearProp(common_properties::_chiralPermutation);
439
1.40k
        } else if (permutation == 2) {
440
17
          atom->setChiralTag(RDKit::Atom::ChiralType::CHI_TETRAHEDRAL_CW);
441
17
          atom->clearProp(common_properties::_chiralPermutation);
442
17
        }
443
1.42k
      }
444
3.69k
    }
445
6.19M
  }
446
35.4k
}
447
448
37.3k
void CloseMolRings(RWMol *mol, bool toleratePartials) {
449
  //  Here's what we want to do here:
450
  //    loop through the molecule's atom bookmarks
451
  //    for each bookmark:
452
  //       connect pairs of atoms sharing that bookmark
453
  //          left to right (in the order in which they were
454
  //          inserted into the molecule).
455
  //       whilst doing this, we have to be cognizant of the fact that
456
  //          there may well be partial bonds in the molecule which need
457
  //          to be tied in as well.  WOO HOO! IT'S A BIG MESS!
458
37.3k
  PRECONDITION(mol, "no molecule");
459
460
37.3k
  auto bookmarkIt = mol->getAtomBookmarks()->begin();
461
98.3k
  while (bookmarkIt != mol->getAtomBookmarks()->end()) {
462
61.0k
    auto &bookmark = *bookmarkIt;
463
    // don't bother even considering bookmarks outside
464
    // the range used for rings
465
61.0k
    if (couldBeRingClosure(bookmark.first)) {
466
23.6k
      RWMol::ATOM_PTR_LIST bookmarkedAtomsToRemove;
467
23.6k
      auto atomIt = bookmark.second.begin();
468
23.6k
      auto atomsEnd = bookmark.second.end();
469
184k
      while (atomIt != atomsEnd) {
470
160k
        Atom *atom1 = *atomIt;
471
160k
        ++atomIt;
472
160k
        if (!toleratePartials && atomIt == atomsEnd) {
473
341
          ReportParseError("unclosed ring");
474
160k
        } else if (atomIt != atomsEnd && *atomIt == atom1) {
475
          // make sure we don't try to connect an atom to itself
476
          // this was github #1925
477
157
          auto fmt =
478
157
              boost::format{
479
157
                  "duplicated ring closure %1% bonds atom %2% to itself"} %
480
157
              bookmark.first % atom1->getIdx();
481
157
          std::string msg = fmt.str();
482
157
          ReportParseError(msg.c_str(), true);
483
160k
        } else if (mol->getBondBetweenAtoms(atom1->getIdx(),
484
160k
                                            (*atomIt)->getIdx()) != nullptr) {
485
50
          auto fmt =
486
50
              boost::format{
487
50
                  "ring closure %1% duplicates bond between atom %2% and atom "
488
50
                  "%3%"} %
489
50
              bookmark.first % atom1->getIdx() % (*atomIt)->getIdx();
490
50
          std::string msg = fmt.str();
491
50
          ReportParseError(msg.c_str(), true);
492
160k
        } else if (atomIt != atomsEnd) {
493
          // we actually found an atom, so connect it to the first
494
160k
          Atom *atom2 = *atomIt;
495
160k
          ++atomIt;
496
497
160k
          int bondIdx = -1;
498
          // We're guaranteed two partial bonds, one for each time
499
          // the ring index was used.  We give the first specification
500
          // priority.
501
160k
          CHECK_INVARIANT(mol->hasBondBookmark(bookmark.first),
502
160k
                          "Missing bond bookmark");
503
504
          // now use the info from the partial bond:
505
          // The partial bond itself will have a proper order and
506
          // directionality (with a minor caveat documented below) and will
507
          // have its beginning atom set already:
508
160k
          RWMol::BOND_PTR_LIST bonds =
509
160k
              mol->getAllBondsWithBookmark(bookmark.first);
510
160k
          auto bondIt = bonds.begin();
511
160k
          CHECK_INVARIANT(bonds.size() >= 2, "Missing bond");
512
513
          // get pointers to the two bonds:
514
160k
          Bond *bond1 = *bondIt;
515
160k
          ++bondIt;
516
160k
          Bond *bond2 = *bondIt;
517
518
          // remove those bonds from the bookmarks:
519
160k
          mol->clearBondBookmark(bookmark.first, bond1);
520
160k
          mol->clearBondBookmark(bookmark.first, bond2);
521
522
          // Make sure the bonds have the correct starting atoms:
523
160k
          CHECK_INVARIANT(bond1->getBeginAtomIdx() == atom1->getIdx(),
524
160k
                          "bad begin atom");
525
160k
          CHECK_INVARIANT(bond2->getBeginAtomIdx() == atom2->getIdx(),
526
160k
                          "bad begin atom");
527
528
          // we use the _cxsmilesBondIdx value from the second one, if it's
529
          // there
530
160k
          if (bond2->hasProp("_cxsmilesBondIdx")) {
531
160k
            bond1->setProp("_cxsmilesBondIdx",
532
160k
                           bond2->getProp<unsigned int>("_cxsmilesBondIdx"));
533
160k
          }
534
535
160k
          Bond *matchedBond;
536
537
          // figure out which (if either) bond has a specified type, we'll
538
          // keep that one.  We also need to update the end atom index to
539
          // match FIX: daylight barfs when you give it multiple specs for the
540
          // closure
541
          //   bond, we'll just take the first one and ignore others
542
          //   NOTE: we used to do this the other way (take the last
543
          //   specification),
544
          //   but that turned out to be troublesome in odd cases like
545
          //   C1CC11CC1.
546
          // std::cerr << ">-------------" << std::endl;
547
          // std::cerr << atom1->getIdx() << "-" << atom2->getIdx() << ": "
548
          //           << bond1->getBondType() << "("
549
          //           << bond1->hasProp(common_properties::_unspecifiedOrder)
550
          //           << "):" << bond1->getBondDir() << " "
551
          //           << bond2->getBondType() << "("
552
          //           << bond2->hasProp(common_properties::_unspecifiedOrder)
553
          //           << "):" << bond2->getBondDir() << std::endl;
554
160k
          if (!bond1->hasProp(common_properties::_unspecifiedOrder)) {
555
14.4k
            matchedBond = bond1;
556
14.4k
            if (matchedBond->getBondType() == Bond::DATIVEL) {
557
1.41k
              matchedBond->setBeginAtomIdx(atom2->getIdx());
558
1.41k
              matchedBond->setEndAtomIdx(atom1->getIdx());
559
1.41k
              matchedBond->setBondType(Bond::DATIVE);
560
13.0k
            } else if (matchedBond->getBondType() == Bond::DATIVER) {
561
198
              matchedBond->setEndAtomIdx(atom2->getIdx());
562
198
              matchedBond->setBondType(Bond::DATIVE);
563
12.8k
            } else {
564
12.8k
              matchedBond->setEndAtomIdx(atom2->getIdx());
565
12.8k
            }
566
14.4k
            swapBondDirIfNeeded(bond1, bond2);
567
14.4k
            delete bond2;
568
145k
          } else {
569
145k
            matchedBond = bond2;
570
145k
            if (matchedBond->getBondType() == Bond::DATIVEL) {
571
898
              matchedBond->setBeginAtomIdx(atom1->getIdx());
572
898
              matchedBond->setEndAtomIdx(atom2->getIdx());
573
898
              matchedBond->setBondType(Bond::DATIVE);
574
144k
            } else if (matchedBond->getBondType() == Bond::DATIVER) {
575
3.24k
              matchedBond->setEndAtomIdx(atom1->getIdx());
576
3.24k
              matchedBond->setBondType(Bond::DATIVE);
577
141k
            } else {
578
141k
              matchedBond->setEndAtomIdx(atom1->getIdx());
579
141k
            }
580
145k
            swapBondDirIfNeeded(bond2, bond1);
581
145k
            delete bond1;
582
145k
          }
583
160k
          if (matchedBond->getBondType() == Bond::UNSPECIFIED &&
584
136k
              !matchedBond->hasQuery()) {
585
136k
            Bond::BondType bondT = GetUnspecifiedBondType(mol, atom1, atom2);
586
136k
            matchedBond->setBondType(bondT);
587
136k
          }
588
160k
          matchedBond->setOwningMol(mol);
589
160k
          if (matchedBond->getBondType() == Bond::AROMATIC) {
590
38.9k
            matchedBond->setIsAromatic(true);
591
38.9k
          }
592
593
          // add the bond:
594
160k
          bondIdx = mol->addBond(matchedBond, true);
595
596
          // we found a bond, so update the atom's _RingClosures
597
          // property:
598
160k
          if (bondIdx > -1) {
599
160k
            CHECK_INVARIANT(
600
160k
                atom1->hasProp(common_properties::_RingClosures) &&
601
160k
                    atom2->hasProp(common_properties::_RingClosures),
602
160k
                "somehow atom doesn't have _RingClosures property.");
603
160k
            INT_VECT closures;
604
160k
            atom1->getProp(common_properties::_RingClosures, closures);
605
160k
            auto closurePos = std::find(closures.begin(), closures.end(),
606
160k
                                        -(bookmark.first + 1));
607
160k
            CHECK_INVARIANT(closurePos != closures.end(),
608
160k
                            "could not find bookmark in atom _RingClosures");
609
160k
            *closurePos = bondIdx - 1;
610
160k
            atom1->setProp(common_properties::_RingClosures, closures);
611
612
160k
            atom2->getProp(common_properties::_RingClosures, closures);
613
160k
            closurePos = std::find(closures.begin(), closures.end(),
614
160k
                                   -(bookmark.first + 1));
615
160k
            CHECK_INVARIANT(closurePos != closures.end(),
616
160k
                            "could not find bookmark in atom _RingClosures");
617
160k
            *closurePos = bondIdx - 1;
618
160k
            atom2->setProp(common_properties::_RingClosures, closures);
619
160k
          }
620
160k
          bookmarkedAtomsToRemove.push_back(atom1);
621
160k
          bookmarkedAtomsToRemove.push_back(atom2);
622
160k
        }
623
160k
      }
624
23.6k
      int mark = bookmark.first;
625
23.6k
      ++bookmarkIt;
626
312k
      for (const auto atom : bookmarkedAtomsToRemove) {
627
312k
        mol->clearAtomBookmark(mark, atom);
628
312k
      }
629
37.3k
    } else {
630
37.3k
      ++bookmarkIt;
631
37.3k
    }
632
61.0k
  }
633
37.3k
};
634
635
27.2k
void CleanupAfterParsing(RWMol *mol) {
636
27.2k
  PRECONDITION(mol, "no molecule");
637
2.05M
  for (auto atom : mol->atoms()) {
638
2.05M
    atom->clearProp(common_properties::_RingClosures);
639
2.05M
    atom->clearProp(common_properties::_SmilesStart);
640
2.05M
    std::string label;
641
2.05M
    if (atom->getAtomicNum() == 0 &&
642
315k
        atom->getPropIfPresent(common_properties::atomLabel, label)) {
643
      // marvinsketch can output higher labels than _AP1 and _AP2, but they
644
      // aren't part of the MOL file spec so we don't treat them as attachment
645
      // points
646
13.2k
      if (label == "_AP1") {
647
295
        atom->setProp(common_properties::_fromAttachPoint, 1);
648
12.9k
      } else if (label == "_AP2") {
649
480
        atom->setProp(common_properties::_fromAttachPoint, 2);
650
480
      }
651
13.2k
    }
652
2.05M
  }
653
2.00M
  for (auto bond : mol->bonds()) {
654
2.00M
    bond->clearProp(common_properties::_unspecifiedOrder);
655
2.00M
    bond->clearProp("_cxsmilesBondIdx");
656
2.00M
  }
657
27.2k
  for (auto sg : RDKit::getSubstanceGroups(*mol)) {
658
5.26k
    sg.clearProp("_cxsmilesindex");
659
5.26k
  }
660
27.2k
  if (!Chirality::getAllowNontetrahedralChirality()) {
661
0
    bool needWarn = false;
662
0
    for (auto atom : mol->atoms()) {
663
0
      if (atom->hasProp(common_properties::_chiralPermutation)) {
664
0
        needWarn = true;
665
0
        atom->clearProp(common_properties::_chiralPermutation);
666
0
      }
667
0
      if (atom->getChiralTag() > Atom::ChiralType::CHI_OTHER) {
668
0
        needWarn = true;
669
0
        atom->setChiralTag(Atom::ChiralType::CHI_UNSPECIFIED);
670
0
      }
671
0
    }
672
0
    if (needWarn) {
673
0
      BOOST_LOG(rdWarningLog)
674
0
          << "ignoring non-tetrahedral stereo specification since setAllowNontetrahedralChirality() is false."
675
0
          << std::endl;
676
0
    }
677
0
  }
678
27.2k
}
679
680
RDKit::QueryBond *getUnspecifiedQueryBond(const RDKit::Atom *a1,
681
1.62M
                                          const RDKit::Atom *a2) {
682
1.62M
  PRECONDITION(a1, "bad atom pointer");
683
1.62M
  QueryBond *newB;
684
1.62M
  if (!a1->getIsAromatic() || (a2 && !a2->getIsAromatic())) {
685
192k
    newB = new QueryBond(Bond::SINGLE);
686
192k
    newB->setQuery(makeSingleOrAromaticBondQuery());
687
1.42M
  } else {
688
1.42M
    newB = new QueryBond(Bond::AROMATIC);
689
1.42M
    newB->setQuery(makeSingleOrAromaticBondQuery());
690
1.42M
  }
691
1.62M
  newB->setProp(RDKit::common_properties::_unspecifiedOrder, 1);
692
1.62M
  return newB;
693
1.62M
}
694
695
namespace detail {
696
697
void printSyntaxErrorMessage(std::string_view input,
698
                             std::string_view err_message,
699
                             unsigned int bad_token_position,
700
11.0k
                             std::string_view input_type) {
701
  // NOTE: If the input is very long, the pointer to the failed location
702
  // becomes less useful. We should truncate the length of the error message
703
  // to 41 chars.
704
11.0k
  static constexpr unsigned int error_size{41};
705
11.0k
  static constexpr unsigned int prefix_size{error_size / 2};
706
11.0k
  static auto truncate_input = [=](const auto &input, const unsigned int pos) {
707
0
    if ((pos >= prefix_size) && (pos + prefix_size) < input.size()) {
708
0
      return input.substr(pos - prefix_size, error_size);
709
0
    } else if (pos >= prefix_size) {
710
0
      return input.substr(pos - prefix_size);
711
0
    } else {
712
0
      return input.substr(
713
0
          0, std::min(input.size(), static_cast<size_t>(error_size)));
714
0
    }
715
0
  };
716
717
11.0k
  size_t num_dashes =
718
11.0k
      (bad_token_position >= prefix_size ? prefix_size
719
11.0k
                                         : bad_token_position - 1);
720
721
11.0k
  BOOST_LOG(rdErrorLog) << input_type << " Parse Error: " << err_message
722
0
                        << " while parsing: " << input << std::endl;
723
11.0k
  BOOST_LOG(rdErrorLog)
724
0
      << input_type << " Parse Error: check for mistakes around position "
725
0
      << bad_token_position << ":" << std::endl;
726
11.0k
  BOOST_LOG(rdErrorLog) << truncate_input(input, bad_token_position - 1)
727
0
                        << std::endl;
728
11.0k
  BOOST_LOG(rdErrorLog) << std::string(num_dashes, '~') << "^" << std::endl;
729
11.0k
}
730
}  // namespace detail
731
}  // end of namespace SmilesParseOps