/src/rdkit/Code/GraphMol/SmilesParse/SmilesParseOps.cpp
Line | Count | Source |
1 | | // |
2 | | // Copyright (C) 2001-2022 Greg Landrum and other RDKit contributors |
3 | | // |
4 | | // @@ All Rights Reserved @@ |
5 | | // This file is part of the RDKit. |
6 | | // The contents are covered by the terms of the BSD license |
7 | | // which is included in the file license.txt, found at the root |
8 | | // of the RDKit source tree. |
9 | | // |
10 | | #include <GraphMol/RDKitBase.h> |
11 | | #include <GraphMol/RDKitQueries.h> |
12 | | #include <GraphMol/Canon.h> |
13 | | #include <GraphMol/Chirality.h> |
14 | | #include "SmilesParse.h" |
15 | | #include "SmilesParseOps.h" |
16 | | #include <list> |
17 | | #include <algorithm> |
18 | | #include <boost/dynamic_bitset.hpp> |
19 | | #include <boost/format.hpp> |
20 | | #include <RDGeneral/RDLog.h> |
21 | | |
22 | | namespace SmilesParseOps { |
23 | | using namespace RDKit; |
24 | | |
25 | 36.0k | void ClearAtomChemicalProps(RDKit::Atom *atom) { |
26 | 36.0k | TEST_ASSERT(atom); |
27 | 36.0k | atom->setIsotope(0); |
28 | 36.0k | atom->setFormalCharge(0); |
29 | 36.0k | atom->setNumExplicitHs(0); |
30 | 36.0k | } |
31 | | |
32 | 779k | void CheckRingClosureBranchStatus(RDKit::Atom *atom, RDKit::RWMol *mp) { |
33 | | // github #786 and #1652: if the ring closure comes after a branch, |
34 | | // the stereochem is wrong. |
35 | | // This function is called while closing a branch during construction of |
36 | | // the molecule from SMILES and corrects for what happens when parsing odd |
37 | | // (and arguably wrong) SMILES constructs like: |
38 | | // 1) [C@@](F)1(C)CCO1 |
39 | | // 2) C1CN[C@](O)(N)1 |
40 | | // 3) [C@](Cl)(F)1CC[C@H](F)CC1 |
41 | | // In the first two cases the stereochemistry at the chiral atom |
42 | | // needs to be reversed. In the third case the stereochemistry should be |
43 | | // reversed when the Cl is added, but left alone when the F is added. |
44 | | // We recognize these situations using the index of the chiral atom |
45 | | // and the degree of that chiral atom at the time the ring closure |
46 | | // digit is encountered during parsing. |
47 | | // ---------- |
48 | | // github #1972 adds these examples: |
49 | | // 1) [C@@]1(Cl)(F)I.Br1 (ok) |
50 | | // 2) [C@@](Cl)1(F)I.Br1 (reverse) |
51 | | // 3) [C@@](Cl)(F)1I.Br1 (ok) |
52 | | // 4) [C@@](Cl)(F)(I)1.Br1 (reverse) |
53 | 779k | PRECONDITION(atom, "bad atom"); |
54 | 779k | PRECONDITION(mp, "bad mol"); |
55 | 779k | if (atom->getIdx() != mp->getNumAtoms(true) - 1 && |
56 | 4.53k | (atom->getDegree() == 1 || |
57 | 4.16k | (atom->getDegree() == 2 && atom->getIdx() != 0) || |
58 | 1.83k | (atom->getDegree() == 3 && atom->getIdx() == 0)) && |
59 | 2.86k | (atom->getChiralTag() == Atom::CHI_TETRAHEDRAL_CW || |
60 | 2.20k | atom->getChiralTag() == Atom::CHI_TETRAHEDRAL_CCW)) { |
61 | | // std::cerr << "crcbs: " << atom->getIdx() << std::endl; |
62 | 1.10k | atom->invertChirality(); |
63 | 1.10k | } |
64 | 779k | } |
65 | | |
66 | 548 | void ReportParseError(const char *message, bool throwIt) { |
67 | 548 | PRECONDITION(message, "bad message"); |
68 | 548 | if (!throwIt) { |
69 | 0 | BOOST_LOG(rdErrorLog) << "SMILES Parse Error: " << message << std::endl; |
70 | 548 | } else { |
71 | 548 | throw SmilesParseException(message); |
72 | 548 | } |
73 | 548 | } |
74 | | |
75 | 9.06k | void CleanupAfterParseError(RWMol *mol) { |
76 | 9.06k | PRECONDITION(mol, "no molecule"); |
77 | | // blow out any partial bonds: |
78 | 58.3k | for (auto markI : *mol->getBondBookmarks()) { |
79 | 58.3k | RWMol::BOND_PTR_LIST &bonds = markI.second; |
80 | 459k | for (auto &bond : bonds) { |
81 | 459k | delete bond; |
82 | 459k | } |
83 | 58.3k | } |
84 | 9.06k | } |
85 | | |
86 | | namespace { |
87 | 61.0k | bool couldBeRingClosure(int val) { return val < 100000 && val >= 0; } |
88 | | } // namespace |
89 | | // |
90 | | // set bondOrder to Bond::IONIC to skip the formation of a bond |
91 | | // between the fragment and the molecule |
92 | | // |
93 | | void AddFragToMol(RWMol *mol, RWMol *frag, Bond::BondType bondOrder, |
94 | 0 | Bond::BondDir bondDir) { |
95 | 0 | PRECONDITION(mol, "no molecule"); |
96 | 0 | PRECONDITION(frag, "no fragment"); |
97 | 0 | PRECONDITION(mol->getActiveAtom(), "no active atom"); |
98 | 0 | Atom *lastAt = mol->getActiveAtom(); |
99 | 0 | int nOrigAtoms = mol->getNumAtoms(); |
100 | 0 | int nOrigBonds = mol->getNumBonds(); |
101 | | |
102 | | // |
103 | | // Add the fragment's atoms and bonds to the molecule: |
104 | | // |
105 | 0 | mol->insertMol(*frag); |
106 | | |
107 | | // |
108 | | // update ring-closure order information on the added atoms: |
109 | | // |
110 | 0 | for (const auto atom : frag->atoms()) { |
111 | 0 | INT_VECT tmpVect; |
112 | 0 | if (atom->getPropIfPresent(common_properties::_RingClosures, tmpVect)) { |
113 | 0 | for (auto &v : tmpVect) { |
114 | | // if the ring closure is not already a bond, don't touch it: |
115 | 0 | if (v >= 0) { |
116 | 0 | v += nOrigBonds; |
117 | 0 | } |
118 | 0 | } |
119 | 0 | auto newAtom = mol->getAtomWithIdx(nOrigAtoms + atom->getIdx()); |
120 | 0 | newAtom->setProp(common_properties::_RingClosures, tmpVect); |
121 | 0 | } |
122 | 0 | } |
123 | | |
124 | | // |
125 | | // ses up the bond between the mol and the branch |
126 | | // |
127 | 0 | if (bondOrder != Bond::IONIC) { |
128 | | // FIX: this is not so much with the elegance... |
129 | 0 | auto firstAt = mol->getAtomWithIdx(nOrigAtoms); |
130 | 0 | int atomIdx1 = firstAt->getIdx(); |
131 | 0 | int atomIdx2 = lastAt->getIdx(); |
132 | 0 | if (frag->hasBondBookmark(ci_LEADING_BOND)) { |
133 | | // std::cout << "found it" << std::endl; |
134 | 0 | const ROMol::BOND_PTR_LIST &leadingBonds = |
135 | 0 | frag->getAllBondsWithBookmark(ci_LEADING_BOND); |
136 | 0 | for (auto leadingBond : leadingBonds) { |
137 | | // we've already got a bond, so just set its local info |
138 | | // and then add it to the molecule intact (no sense doing |
139 | | // any extra work). |
140 | 0 | leadingBond->setOwningMol(mol); |
141 | 0 | leadingBond->setEndAtomIdx(leadingBond->getBeginAtomIdx() + nOrigAtoms); |
142 | 0 | leadingBond->setBeginAtomIdx(atomIdx2); |
143 | 0 | mol->addBond(leadingBond, true); |
144 | 0 | } |
145 | 0 | mol->clearBondBookmark(ci_LEADING_BOND); |
146 | 0 | } else { |
147 | | // SMARTS semantics: unspecified bonds can be single or aromatic |
148 | 0 | if (bondOrder == Bond::UNSPECIFIED) { |
149 | 0 | auto *newB = new QueryBond(Bond::SINGLE); |
150 | 0 | newB->setQuery(makeSingleOrAromaticBondQuery()); |
151 | 0 | newB->setOwningMol(mol); |
152 | 0 | newB->setBeginAtomIdx(atomIdx1); |
153 | 0 | newB->setEndAtomIdx(atomIdx2); |
154 | 0 | newB->setProp(RDKit::common_properties::_unspecifiedOrder, 1); |
155 | 0 | mol->addBond(newB); |
156 | 0 | delete newB; |
157 | 0 | } else { |
158 | 0 | Bond::BondType bo = bondOrder; |
159 | 0 | if (bo == Bond::DATIVEL) { |
160 | 0 | std::swap(atomIdx1, atomIdx2); |
161 | 0 | bo = Bond::DATIVE; |
162 | 0 | } else if (bo == Bond::DATIVER) { |
163 | 0 | bo = Bond::DATIVE; |
164 | 0 | } |
165 | 0 | int idx = mol->addBond(atomIdx2, atomIdx1, bo) - 1; |
166 | 0 | mol->getBondWithIdx(idx)->setBondDir(bondDir); |
167 | 0 | } |
168 | 0 | } |
169 | 0 | } |
170 | | |
171 | | // |
172 | | // okay, the next thing we have to worry about is the possibility |
173 | | // that there might be ring opening/closing in the fragment we just |
174 | | // dealt with e.g. for things like C1C(C1) and C1C.C1 |
175 | | // We deal with this by copying in the bookmarks and partial bonds |
176 | | // that exist in the fragment |
177 | | // |
178 | 0 | for (auto atIt : *frag->getAtomBookmarks()) { |
179 | | // don't bother even considering bookmarks outside |
180 | | // the range used for cycles |
181 | 0 | if (couldBeRingClosure(atIt.first)) { |
182 | 0 | for (auto at2 : atIt.second) { |
183 | 0 | int newIdx = at2->getIdx() + nOrigAtoms; |
184 | 0 | mol->setAtomBookmark(mol->getAtomWithIdx(newIdx), atIt.first); |
185 | 0 | while (frag->hasBondBookmark(atIt.first)) { |
186 | 0 | Bond *b = frag->getBondWithBookmark(atIt.first); |
187 | 0 | int atomIdx1 = b->getBeginAtomIdx() + nOrigAtoms; |
188 | 0 | b->setOwningMol(mol); |
189 | 0 | b->setBeginAtomIdx(atomIdx1); |
190 | 0 | mol->setBondBookmark(b, atIt.first); |
191 | 0 | frag->clearBondBookmark(atIt.first, b); |
192 | 0 | } |
193 | 0 | } |
194 | 0 | } |
195 | 0 | } |
196 | |
|
197 | 0 | frag->clearAllAtomBookmarks(); |
198 | 0 | frag->clearAllBondBookmarks(); |
199 | 0 | }; |
200 | | |
201 | | typedef std::pair<size_t, int> SIZET_PAIR; |
202 | | typedef std::pair<int, int> INT_PAIR; |
203 | | template <typename T> |
204 | | bool operator<(const std::pair<T, T> &p1, const std::pair<T, T> &p2) { |
205 | | return p1.first < p2.first; |
206 | | } |
207 | | |
208 | | // |
209 | | // Helper function to get the SMILES bond ordering around a given atom, this is |
210 | | // required to make sure the stereo information is correct as the storage order |
211 | | // may not match how it is SMILES due to ring closures and implicit/missing |
212 | | // ligands. |
213 | | // |
214 | | unsigned int GetBondOrdering(INT_LIST &bondOrdering, const RDKit::RWMol *mol, |
215 | 35.2k | const RDKit::Atom *atom) { |
216 | 35.2k | PRECONDITION(mol, "no mol"); |
217 | 35.2k | PRECONDITION(atom, "no atom"); |
218 | | |
219 | | // |
220 | | // The atom is marked as chiral, set the SMILES-order of the |
221 | | // atom's bonds. This is easy for non-ring-closure bonds, |
222 | | // because the SMILES order is determined solely by the atom |
223 | | // indices. Things are trickier for ring-closure bonds, which we |
224 | | // need to insert into the list in a particular order |
225 | | // |
226 | 35.2k | INT_VECT ringClosures; |
227 | 35.2k | atom->getPropIfPresent(common_properties::_RingClosures, ringClosures); |
228 | | |
229 | | #if 0 |
230 | | std::cerr << "CLOSURES: "; |
231 | | std::copy(ringClosures.begin(), ringClosures.end(), |
232 | | std::ostream_iterator<int>(std::cerr, " ")); |
233 | | std::cerr << std::endl; |
234 | | #endif |
235 | 35.2k | std::list<SIZET_PAIR> neighbors; |
236 | | // push this atom onto the list of neighbors (we'll use this |
237 | | // to find our place later): |
238 | 35.2k | neighbors.emplace_back(atom->getIdx(), -1); |
239 | 35.2k | std::list<size_t> bondOrder; |
240 | 126k | for (auto nbrIdx : boost::make_iterator_range(mol->getAtomNeighbors(atom))) { |
241 | 126k | const Bond *nbrBond = mol->getBondBetweenAtoms(atom->getIdx(), nbrIdx); |
242 | 126k | if (std::find(ringClosures.begin(), ringClosures.end(), |
243 | 126k | static_cast<int>(nbrBond->getIdx())) == ringClosures.end()) { |
244 | 106k | neighbors.emplace_back(nbrIdx, nbrBond->getIdx()); |
245 | 106k | } |
246 | 126k | } |
247 | | // sort the list of non-ring-closure bonds: |
248 | 35.2k | neighbors.sort(); |
249 | | |
250 | | // find the location of this atom. it pretty much has to be |
251 | | // first in the list, e.g for smiles like [C@](F)(Cl)(Br)I, or |
252 | | // second (everything else). |
253 | 35.2k | auto selfPos = neighbors.begin(); |
254 | 35.2k | if (selfPos->first != atom->getIdx()) { |
255 | 31.3k | ++selfPos; |
256 | 31.3k | } |
257 | 35.2k | CHECK_INVARIANT(selfPos->first == atom->getIdx(), "weird atom ordering"); |
258 | | |
259 | | // copy over the bond ids: |
260 | 176k | for (auto neighborIt = neighbors.begin(); neighborIt != neighbors.end(); |
261 | 141k | ++neighborIt) { |
262 | 141k | if (neighborIt != selfPos) { |
263 | 106k | bondOrdering.push_back(rdcast<int>(neighborIt->second)); |
264 | 106k | } else { |
265 | | // we are not going to add the atom itself, but we will push on |
266 | | // ring closure bonds at this point (if required): |
267 | 35.2k | bondOrdering.insert(bondOrdering.end(), ringClosures.begin(), |
268 | 35.2k | ringClosures.end()); |
269 | 35.2k | } |
270 | 141k | } |
271 | | |
272 | 35.2k | return ringClosures.size(); |
273 | 35.2k | } |
274 | | |
275 | 35.3k | void AdjustAtomChiralityFlags(RWMol *mol) { |
276 | 35.3k | PRECONDITION(mol, "no molecule"); |
277 | 6.19M | for (auto atom : mol->atoms()) { |
278 | 6.19M | Atom::ChiralType chiralType = atom->getChiralTag(); |
279 | 6.19M | if (chiralType == Atom::CHI_TETRAHEDRAL_CW || |
280 | 6.18M | chiralType == Atom::CHI_TETRAHEDRAL_CCW) { |
281 | 32.9k | INT_LIST bondOrdering; |
282 | 32.9k | unsigned int numClosures = GetBondOrdering(bondOrdering, mol, atom); |
283 | | |
284 | | // ok, we now have the SMILES ordering of the bonds, figure out the |
285 | | // permutation order. |
286 | | // |
287 | | // This whole thing is necessary because the ring-closure bonds |
288 | | // in the SMILES come before the bonds to the other neighbors, but |
289 | | // they come after the neighbors in the molecule we build. |
290 | | // A crude example: |
291 | | // in F[C@](Cl)(Br)I the C-Cl bond is index 1 in both SMILES |
292 | | // and as built |
293 | | // in F[C@]1(Br)I.Cl1 the C-Cl bond is index 1 in the SMILES |
294 | | // and index 3 as built. |
295 | | // |
296 | 32.9k | int nSwaps = atom->getPerturbationOrder(bondOrdering); |
297 | | // FIX: explain this one: |
298 | | // At least part of what's going on here for degree 3 atoms: |
299 | | // - The first part: if we're at the beginning of the SMILES and have |
300 | | // an explicit H, we need to add a swap. |
301 | | // This is to reflect that [C@](Cl)(F)C is equivalent to Cl[C@@](F)C |
302 | | // but [C@H](Cl)(F)C is fine as-is (The H-C bond is the one you look |
303 | | // down). |
304 | | // - The second part is more complicated and deals with situations like |
305 | | // F[C@]1CCO1. In this case we otherwise end up looking like we need |
306 | | // to invert the chirality, which is bogus. The chirality here needs |
307 | | // to remain @ just as it does in F[C@](Cl)CCO1 |
308 | | // - We have to be careful with the second part to not sweep things like |
309 | | // C[S@]2(=O).Cl2 into the same bin (was github #760). We detect |
310 | | // those cases by looking for unsaturated atoms |
311 | | // |
312 | 32.9k | if (Canon::chiralAtomNeedsTagInversion( |
313 | 32.9k | *mol, atom, atom->hasProp(common_properties::_SmilesStart), |
314 | 32.9k | numClosures)) { |
315 | 5.03k | ++nSwaps; |
316 | 5.03k | } |
317 | | // std::cerr << "nswaps " << atom->getIdx() << " " << nSwaps |
318 | | // << std::endl; |
319 | | // std::copy(bondOrdering.begin(), bondOrdering.end(), |
320 | | // std::ostream_iterator<int>(std::cerr, ", ")); |
321 | | // std::cerr << std::endl; |
322 | 32.9k | if (nSwaps % 2) { |
323 | 5.18k | atom->invertChirality(); |
324 | 5.18k | } |
325 | 6.16M | } else if (chiralType == Atom::CHI_SQUAREPLANAR || |
326 | 6.16M | chiralType == Atom::CHI_TRIGONALBIPYRAMIDAL || |
327 | 6.15M | chiralType == Atom::CHI_OCTAHEDRAL) { |
328 | 2.29k | INT_LIST bonds; |
329 | 2.29k | GetBondOrdering(bonds, mol, atom); |
330 | | |
331 | 2.29k | unsigned int ref_max = Chirality::getMaxNbors(chiralType); |
332 | | |
333 | | // insert (-1) for hydrogens or missing ligands, where these are placed |
334 | | // depends on if it is the first atom or not |
335 | 2.29k | if (bonds.size() < ref_max) { |
336 | 2.24k | if (atom->hasProp(common_properties::_SmilesStart)) { |
337 | 478 | bonds.insert(bonds.begin(), ref_max - bonds.size(), -1); |
338 | 1.76k | } else { |
339 | 1.76k | bonds.insert(++bonds.begin(), ref_max - bonds.size(), -1); |
340 | 1.76k | } |
341 | 2.24k | } |
342 | | |
343 | 2.29k | atom->setProp(common_properties::_chiralPermutation, |
344 | 2.29k | Chirality::getChiralPermutation(atom, bonds, true)); |
345 | 2.29k | } |
346 | 6.19M | } |
347 | 35.3k | } // namespace SmilesParseOps |
348 | | |
349 | | Bond::BondType GetUnspecifiedBondType(const RWMol *mol, const Atom *atom1, |
350 | 8.22M | const Atom *atom2) { |
351 | 8.22M | PRECONDITION(mol, "no molecule"); |
352 | 8.22M | PRECONDITION(atom1, "no atom1"); |
353 | 8.22M | PRECONDITION(atom2, "no atom2"); |
354 | 8.22M | Bond::BondType res; |
355 | 8.22M | if (atom1->getIsAromatic() && atom2->getIsAromatic()) { |
356 | 3.64M | res = Bond::AROMATIC; |
357 | 4.58M | } else { |
358 | 4.58M | res = Bond::SINGLE; |
359 | 4.58M | } |
360 | 8.22M | return res; |
361 | 8.22M | } |
362 | 35.3k | void SetUnspecifiedBondTypes(RWMol *mol) { |
363 | 35.3k | PRECONDITION(mol, "no molecule"); |
364 | 6.18M | for (auto bond : mol->bonds()) { |
365 | 6.18M | if (bond->hasProp(RDKit::common_properties::_unspecifiedOrder)) { |
366 | 550k | bond->setBondType(GetUnspecifiedBondType(mol, bond->getBeginAtom(), |
367 | 550k | bond->getEndAtom())); |
368 | 550k | if (bond->getBondType() == Bond::AROMATIC) { |
369 | 337k | bond->setIsAromatic(true); |
370 | 337k | } else { |
371 | 212k | bond->setIsAromatic(false); |
372 | 212k | } |
373 | 550k | } |
374 | 6.18M | } |
375 | 35.3k | } |
376 | | |
377 | | namespace { |
378 | 160k | void swapBondDirIfNeeded(Bond *bond1, const Bond *bond2) { |
379 | 160k | PRECONDITION(bond1, "bad bond1"); |
380 | 160k | PRECONDITION(bond2, "bad bond2"); |
381 | 160k | if (bond1->getBondDir() == Bond::NONE && bond2->getBondDir() != Bond::NONE) { |
382 | 6.02k | bond1->setBondDir(bond2->getBondDir()); |
383 | 6.02k | if (bond1->getBeginAtom() != bond2->getBeginAtom()) { |
384 | 4.82k | switch (bond1->getBondDir()) { |
385 | 3.60k | case Bond::ENDDOWNRIGHT: |
386 | 3.60k | bond1->setBondDir(Bond::ENDUPRIGHT); |
387 | 3.60k | break; |
388 | 1.21k | case Bond::ENDUPRIGHT: |
389 | 1.21k | bond1->setBondDir(Bond::ENDDOWNRIGHT); |
390 | 1.21k | break; |
391 | 0 | default: |
392 | 0 | break; |
393 | 4.82k | } |
394 | 4.82k | } |
395 | 6.02k | } |
396 | 160k | } |
397 | | } // namespace |
398 | | |
399 | | static const std::map<int, int> permutationLimits = { |
400 | | {RDKit::Atom::ChiralType::CHI_TETRAHEDRAL, 2}, |
401 | | {RDKit::Atom::ChiralType::CHI_ALLENE, 2}, |
402 | | {RDKit::Atom::ChiralType::CHI_SQUAREPLANAR, 3}, |
403 | | {RDKit::Atom::ChiralType::CHI_OCTAHEDRAL, 30}, |
404 | | {RDKit::Atom::ChiralType::CHI_TRIGONALBIPYRAMIDAL, 20}}; |
405 | | |
406 | 3.69k | bool checkChiralPermutation(int chiralTag, int permutation) { |
407 | 3.69k | if (chiralTag > RDKit::Atom::ChiralType::CHI_OTHER && |
408 | 3.69k | permutationLimits.find(chiralTag) != permutationLimits.end() && |
409 | 3.69k | (permutation < 0 || permutation > permutationLimits.at(chiralTag))) { |
410 | 7 | return false; |
411 | 7 | } |
412 | 3.69k | return true; |
413 | 3.69k | } |
414 | | |
415 | 35.4k | void CheckChiralitySpecifications(RDKit::RWMol *mol, bool strict) { |
416 | 35.4k | PRECONDITION(mol, "no molecule"); |
417 | 6.19M | for (const auto atom : mol->atoms()) { |
418 | 6.19M | int permutation; |
419 | 6.19M | if (atom->getChiralTag() > RDKit::Atom::ChiralType::CHI_OTHER && |
420 | 3.73k | permutationLimits.find(atom->getChiralTag()) != |
421 | 3.73k | permutationLimits.end() && |
422 | 3.73k | atom->getPropIfPresent(common_properties::_chiralPermutation, |
423 | 3.73k | permutation)) { |
424 | 3.69k | if (!checkChiralPermutation(atom->getChiralTag(), permutation)) { |
425 | 7 | std::string error = |
426 | 7 | (boost::format("Invalid chiral specification on atom %d") % |
427 | 7 | atom->getIdx()) |
428 | 7 | .str(); |
429 | 7 | BOOST_LOG(rdWarningLog) << error << std::endl; |
430 | 7 | if (strict) { |
431 | 7 | throw SmilesParseException(error); |
432 | 7 | } |
433 | 7 | } |
434 | | // directly convert @TH1 -> @ and @TH2 -> @@ |
435 | 3.69k | if (atom->getChiralTag() == RDKit::Atom::ChiralType::CHI_TETRAHEDRAL) { |
436 | 1.42k | if (permutation == 0 || permutation == 1) { |
437 | 1.40k | atom->setChiralTag(RDKit::Atom::ChiralType::CHI_TETRAHEDRAL_CCW); |
438 | 1.40k | atom->clearProp(common_properties::_chiralPermutation); |
439 | 1.40k | } else if (permutation == 2) { |
440 | 17 | atom->setChiralTag(RDKit::Atom::ChiralType::CHI_TETRAHEDRAL_CW); |
441 | 17 | atom->clearProp(common_properties::_chiralPermutation); |
442 | 17 | } |
443 | 1.42k | } |
444 | 3.69k | } |
445 | 6.19M | } |
446 | 35.4k | } |
447 | | |
448 | 37.3k | void CloseMolRings(RWMol *mol, bool toleratePartials) { |
449 | | // Here's what we want to do here: |
450 | | // loop through the molecule's atom bookmarks |
451 | | // for each bookmark: |
452 | | // connect pairs of atoms sharing that bookmark |
453 | | // left to right (in the order in which they were |
454 | | // inserted into the molecule). |
455 | | // whilst doing this, we have to be cognizant of the fact that |
456 | | // there may well be partial bonds in the molecule which need |
457 | | // to be tied in as well. WOO HOO! IT'S A BIG MESS! |
458 | 37.3k | PRECONDITION(mol, "no molecule"); |
459 | | |
460 | 37.3k | auto bookmarkIt = mol->getAtomBookmarks()->begin(); |
461 | 98.3k | while (bookmarkIt != mol->getAtomBookmarks()->end()) { |
462 | 61.0k | auto &bookmark = *bookmarkIt; |
463 | | // don't bother even considering bookmarks outside |
464 | | // the range used for rings |
465 | 61.0k | if (couldBeRingClosure(bookmark.first)) { |
466 | 23.6k | RWMol::ATOM_PTR_LIST bookmarkedAtomsToRemove; |
467 | 23.6k | auto atomIt = bookmark.second.begin(); |
468 | 23.6k | auto atomsEnd = bookmark.second.end(); |
469 | 184k | while (atomIt != atomsEnd) { |
470 | 160k | Atom *atom1 = *atomIt; |
471 | 160k | ++atomIt; |
472 | 160k | if (!toleratePartials && atomIt == atomsEnd) { |
473 | 341 | ReportParseError("unclosed ring"); |
474 | 160k | } else if (atomIt != atomsEnd && *atomIt == atom1) { |
475 | | // make sure we don't try to connect an atom to itself |
476 | | // this was github #1925 |
477 | 157 | auto fmt = |
478 | 157 | boost::format{ |
479 | 157 | "duplicated ring closure %1% bonds atom %2% to itself"} % |
480 | 157 | bookmark.first % atom1->getIdx(); |
481 | 157 | std::string msg = fmt.str(); |
482 | 157 | ReportParseError(msg.c_str(), true); |
483 | 160k | } else if (mol->getBondBetweenAtoms(atom1->getIdx(), |
484 | 160k | (*atomIt)->getIdx()) != nullptr) { |
485 | 50 | auto fmt = |
486 | 50 | boost::format{ |
487 | 50 | "ring closure %1% duplicates bond between atom %2% and atom " |
488 | 50 | "%3%"} % |
489 | 50 | bookmark.first % atom1->getIdx() % (*atomIt)->getIdx(); |
490 | 50 | std::string msg = fmt.str(); |
491 | 50 | ReportParseError(msg.c_str(), true); |
492 | 160k | } else if (atomIt != atomsEnd) { |
493 | | // we actually found an atom, so connect it to the first |
494 | 160k | Atom *atom2 = *atomIt; |
495 | 160k | ++atomIt; |
496 | | |
497 | 160k | int bondIdx = -1; |
498 | | // We're guaranteed two partial bonds, one for each time |
499 | | // the ring index was used. We give the first specification |
500 | | // priority. |
501 | 160k | CHECK_INVARIANT(mol->hasBondBookmark(bookmark.first), |
502 | 160k | "Missing bond bookmark"); |
503 | | |
504 | | // now use the info from the partial bond: |
505 | | // The partial bond itself will have a proper order and |
506 | | // directionality (with a minor caveat documented below) and will |
507 | | // have its beginning atom set already: |
508 | 160k | RWMol::BOND_PTR_LIST bonds = |
509 | 160k | mol->getAllBondsWithBookmark(bookmark.first); |
510 | 160k | auto bondIt = bonds.begin(); |
511 | 160k | CHECK_INVARIANT(bonds.size() >= 2, "Missing bond"); |
512 | | |
513 | | // get pointers to the two bonds: |
514 | 160k | Bond *bond1 = *bondIt; |
515 | 160k | ++bondIt; |
516 | 160k | Bond *bond2 = *bondIt; |
517 | | |
518 | | // remove those bonds from the bookmarks: |
519 | 160k | mol->clearBondBookmark(bookmark.first, bond1); |
520 | 160k | mol->clearBondBookmark(bookmark.first, bond2); |
521 | | |
522 | | // Make sure the bonds have the correct starting atoms: |
523 | 160k | CHECK_INVARIANT(bond1->getBeginAtomIdx() == atom1->getIdx(), |
524 | 160k | "bad begin atom"); |
525 | 160k | CHECK_INVARIANT(bond2->getBeginAtomIdx() == atom2->getIdx(), |
526 | 160k | "bad begin atom"); |
527 | | |
528 | | // we use the _cxsmilesBondIdx value from the second one, if it's |
529 | | // there |
530 | 160k | if (bond2->hasProp("_cxsmilesBondIdx")) { |
531 | 160k | bond1->setProp("_cxsmilesBondIdx", |
532 | 160k | bond2->getProp<unsigned int>("_cxsmilesBondIdx")); |
533 | 160k | } |
534 | | |
535 | 160k | Bond *matchedBond; |
536 | | |
537 | | // figure out which (if either) bond has a specified type, we'll |
538 | | // keep that one. We also need to update the end atom index to |
539 | | // match FIX: daylight barfs when you give it multiple specs for the |
540 | | // closure |
541 | | // bond, we'll just take the first one and ignore others |
542 | | // NOTE: we used to do this the other way (take the last |
543 | | // specification), |
544 | | // but that turned out to be troublesome in odd cases like |
545 | | // C1CC11CC1. |
546 | | // std::cerr << ">-------------" << std::endl; |
547 | | // std::cerr << atom1->getIdx() << "-" << atom2->getIdx() << ": " |
548 | | // << bond1->getBondType() << "(" |
549 | | // << bond1->hasProp(common_properties::_unspecifiedOrder) |
550 | | // << "):" << bond1->getBondDir() << " " |
551 | | // << bond2->getBondType() << "(" |
552 | | // << bond2->hasProp(common_properties::_unspecifiedOrder) |
553 | | // << "):" << bond2->getBondDir() << std::endl; |
554 | 160k | if (!bond1->hasProp(common_properties::_unspecifiedOrder)) { |
555 | 14.4k | matchedBond = bond1; |
556 | 14.4k | if (matchedBond->getBondType() == Bond::DATIVEL) { |
557 | 1.41k | matchedBond->setBeginAtomIdx(atom2->getIdx()); |
558 | 1.41k | matchedBond->setEndAtomIdx(atom1->getIdx()); |
559 | 1.41k | matchedBond->setBondType(Bond::DATIVE); |
560 | 13.0k | } else if (matchedBond->getBondType() == Bond::DATIVER) { |
561 | 198 | matchedBond->setEndAtomIdx(atom2->getIdx()); |
562 | 198 | matchedBond->setBondType(Bond::DATIVE); |
563 | 12.8k | } else { |
564 | 12.8k | matchedBond->setEndAtomIdx(atom2->getIdx()); |
565 | 12.8k | } |
566 | 14.4k | swapBondDirIfNeeded(bond1, bond2); |
567 | 14.4k | delete bond2; |
568 | 145k | } else { |
569 | 145k | matchedBond = bond2; |
570 | 145k | if (matchedBond->getBondType() == Bond::DATIVEL) { |
571 | 898 | matchedBond->setBeginAtomIdx(atom1->getIdx()); |
572 | 898 | matchedBond->setEndAtomIdx(atom2->getIdx()); |
573 | 898 | matchedBond->setBondType(Bond::DATIVE); |
574 | 144k | } else if (matchedBond->getBondType() == Bond::DATIVER) { |
575 | 3.24k | matchedBond->setEndAtomIdx(atom1->getIdx()); |
576 | 3.24k | matchedBond->setBondType(Bond::DATIVE); |
577 | 141k | } else { |
578 | 141k | matchedBond->setEndAtomIdx(atom1->getIdx()); |
579 | 141k | } |
580 | 145k | swapBondDirIfNeeded(bond2, bond1); |
581 | 145k | delete bond1; |
582 | 145k | } |
583 | 160k | if (matchedBond->getBondType() == Bond::UNSPECIFIED && |
584 | 136k | !matchedBond->hasQuery()) { |
585 | 136k | Bond::BondType bondT = GetUnspecifiedBondType(mol, atom1, atom2); |
586 | 136k | matchedBond->setBondType(bondT); |
587 | 136k | } |
588 | 160k | matchedBond->setOwningMol(mol); |
589 | 160k | if (matchedBond->getBondType() == Bond::AROMATIC) { |
590 | 38.9k | matchedBond->setIsAromatic(true); |
591 | 38.9k | } |
592 | | |
593 | | // add the bond: |
594 | 160k | bondIdx = mol->addBond(matchedBond, true); |
595 | | |
596 | | // we found a bond, so update the atom's _RingClosures |
597 | | // property: |
598 | 160k | if (bondIdx > -1) { |
599 | 160k | CHECK_INVARIANT( |
600 | 160k | atom1->hasProp(common_properties::_RingClosures) && |
601 | 160k | atom2->hasProp(common_properties::_RingClosures), |
602 | 160k | "somehow atom doesn't have _RingClosures property."); |
603 | 160k | INT_VECT closures; |
604 | 160k | atom1->getProp(common_properties::_RingClosures, closures); |
605 | 160k | auto closurePos = std::find(closures.begin(), closures.end(), |
606 | 160k | -(bookmark.first + 1)); |
607 | 160k | CHECK_INVARIANT(closurePos != closures.end(), |
608 | 160k | "could not find bookmark in atom _RingClosures"); |
609 | 160k | *closurePos = bondIdx - 1; |
610 | 160k | atom1->setProp(common_properties::_RingClosures, closures); |
611 | | |
612 | 160k | atom2->getProp(common_properties::_RingClosures, closures); |
613 | 160k | closurePos = std::find(closures.begin(), closures.end(), |
614 | 160k | -(bookmark.first + 1)); |
615 | 160k | CHECK_INVARIANT(closurePos != closures.end(), |
616 | 160k | "could not find bookmark in atom _RingClosures"); |
617 | 160k | *closurePos = bondIdx - 1; |
618 | 160k | atom2->setProp(common_properties::_RingClosures, closures); |
619 | 160k | } |
620 | 160k | bookmarkedAtomsToRemove.push_back(atom1); |
621 | 160k | bookmarkedAtomsToRemove.push_back(atom2); |
622 | 160k | } |
623 | 160k | } |
624 | 23.6k | int mark = bookmark.first; |
625 | 23.6k | ++bookmarkIt; |
626 | 312k | for (const auto atom : bookmarkedAtomsToRemove) { |
627 | 312k | mol->clearAtomBookmark(mark, atom); |
628 | 312k | } |
629 | 37.3k | } else { |
630 | 37.3k | ++bookmarkIt; |
631 | 37.3k | } |
632 | 61.0k | } |
633 | 37.3k | }; |
634 | | |
635 | 27.2k | void CleanupAfterParsing(RWMol *mol) { |
636 | 27.2k | PRECONDITION(mol, "no molecule"); |
637 | 2.05M | for (auto atom : mol->atoms()) { |
638 | 2.05M | atom->clearProp(common_properties::_RingClosures); |
639 | 2.05M | atom->clearProp(common_properties::_SmilesStart); |
640 | 2.05M | std::string label; |
641 | 2.05M | if (atom->getAtomicNum() == 0 && |
642 | 315k | atom->getPropIfPresent(common_properties::atomLabel, label)) { |
643 | | // marvinsketch can output higher labels than _AP1 and _AP2, but they |
644 | | // aren't part of the MOL file spec so we don't treat them as attachment |
645 | | // points |
646 | 13.2k | if (label == "_AP1") { |
647 | 295 | atom->setProp(common_properties::_fromAttachPoint, 1); |
648 | 12.9k | } else if (label == "_AP2") { |
649 | 480 | atom->setProp(common_properties::_fromAttachPoint, 2); |
650 | 480 | } |
651 | 13.2k | } |
652 | 2.05M | } |
653 | 2.00M | for (auto bond : mol->bonds()) { |
654 | 2.00M | bond->clearProp(common_properties::_unspecifiedOrder); |
655 | 2.00M | bond->clearProp("_cxsmilesBondIdx"); |
656 | 2.00M | } |
657 | 27.2k | for (auto sg : RDKit::getSubstanceGroups(*mol)) { |
658 | 5.26k | sg.clearProp("_cxsmilesindex"); |
659 | 5.26k | } |
660 | 27.2k | if (!Chirality::getAllowNontetrahedralChirality()) { |
661 | 0 | bool needWarn = false; |
662 | 0 | for (auto atom : mol->atoms()) { |
663 | 0 | if (atom->hasProp(common_properties::_chiralPermutation)) { |
664 | 0 | needWarn = true; |
665 | 0 | atom->clearProp(common_properties::_chiralPermutation); |
666 | 0 | } |
667 | 0 | if (atom->getChiralTag() > Atom::ChiralType::CHI_OTHER) { |
668 | 0 | needWarn = true; |
669 | 0 | atom->setChiralTag(Atom::ChiralType::CHI_UNSPECIFIED); |
670 | 0 | } |
671 | 0 | } |
672 | 0 | if (needWarn) { |
673 | 0 | BOOST_LOG(rdWarningLog) |
674 | 0 | << "ignoring non-tetrahedral stereo specification since setAllowNontetrahedralChirality() is false." |
675 | 0 | << std::endl; |
676 | 0 | } |
677 | 0 | } |
678 | 27.2k | } |
679 | | |
680 | | RDKit::QueryBond *getUnspecifiedQueryBond(const RDKit::Atom *a1, |
681 | 1.62M | const RDKit::Atom *a2) { |
682 | 1.62M | PRECONDITION(a1, "bad atom pointer"); |
683 | 1.62M | QueryBond *newB; |
684 | 1.62M | if (!a1->getIsAromatic() || (a2 && !a2->getIsAromatic())) { |
685 | 192k | newB = new QueryBond(Bond::SINGLE); |
686 | 192k | newB->setQuery(makeSingleOrAromaticBondQuery()); |
687 | 1.42M | } else { |
688 | 1.42M | newB = new QueryBond(Bond::AROMATIC); |
689 | 1.42M | newB->setQuery(makeSingleOrAromaticBondQuery()); |
690 | 1.42M | } |
691 | 1.62M | newB->setProp(RDKit::common_properties::_unspecifiedOrder, 1); |
692 | 1.62M | return newB; |
693 | 1.62M | } |
694 | | |
695 | | namespace detail { |
696 | | |
697 | | void printSyntaxErrorMessage(std::string_view input, |
698 | | std::string_view err_message, |
699 | | unsigned int bad_token_position, |
700 | 11.0k | std::string_view input_type) { |
701 | | // NOTE: If the input is very long, the pointer to the failed location |
702 | | // becomes less useful. We should truncate the length of the error message |
703 | | // to 41 chars. |
704 | 11.0k | static constexpr unsigned int error_size{41}; |
705 | 11.0k | static constexpr unsigned int prefix_size{error_size / 2}; |
706 | 11.0k | static auto truncate_input = [=](const auto &input, const unsigned int pos) { |
707 | 0 | if ((pos >= prefix_size) && (pos + prefix_size) < input.size()) { |
708 | 0 | return input.substr(pos - prefix_size, error_size); |
709 | 0 | } else if (pos >= prefix_size) { |
710 | 0 | return input.substr(pos - prefix_size); |
711 | 0 | } else { |
712 | 0 | return input.substr( |
713 | 0 | 0, std::min(input.size(), static_cast<size_t>(error_size))); |
714 | 0 | } |
715 | 0 | }; |
716 | | |
717 | 11.0k | size_t num_dashes = |
718 | 11.0k | (bad_token_position >= prefix_size ? prefix_size |
719 | 11.0k | : bad_token_position - 1); |
720 | | |
721 | 11.0k | BOOST_LOG(rdErrorLog) << input_type << " Parse Error: " << err_message |
722 | 0 | << " while parsing: " << input << std::endl; |
723 | 11.0k | BOOST_LOG(rdErrorLog) |
724 | 0 | << input_type << " Parse Error: check for mistakes around position " |
725 | 0 | << bad_token_position << ":" << std::endl; |
726 | 11.0k | BOOST_LOG(rdErrorLog) << truncate_input(input, bad_token_position - 1) |
727 | 0 | << std::endl; |
728 | 11.0k | BOOST_LOG(rdErrorLog) << std::string(num_dashes, '~') << "^" << std::endl; |
729 | 11.0k | } |
730 | | } // namespace detail |
731 | | } // end of namespace SmilesParseOps |