/src/rdkit/Code/GraphMol/FileParsers/MolFileParser.cpp
Line | Count | Source |
1 | | // |
2 | | // Copyright (C) 2002-2021 Greg Landrum and other RDKit contributors |
3 | | // |
4 | | // @@ All Rights Reserved @@ |
5 | | // This file is part of the RDKit. |
6 | | // The contents are covered by the terms of the BSD license |
7 | | // which is included in the file license.txt, found at the root |
8 | | // of the RDKit source tree. |
9 | | // |
10 | | #include <RDGeneral/BoostStartInclude.h> |
11 | | #include <boost/lexical_cast.hpp> |
12 | | #include <boost/algorithm/string.hpp> |
13 | | #include <boost/tokenizer.hpp> |
14 | | #include <boost/algorithm/string/trim.hpp> |
15 | | #include <boost/format.hpp> |
16 | | #include <RDGeneral/BoostEndInclude.h> |
17 | | |
18 | | #include "FileParsers.h" |
19 | | #include "FileParserUtils.h" |
20 | | #include "MolSGroupParsing.h" |
21 | | #include <GraphMol/FileParsers/MolFileStereochem.h> |
22 | | #include <GraphMol/SmilesParse/SmilesParse.h> |
23 | | #include <GraphMol/RDKitQueries.h> |
24 | | #include <GraphMol/StereoGroup.h> |
25 | | #include <GraphMol/SubstanceGroup.h> |
26 | | #include <GraphMol/Atropisomers.h> |
27 | | #include <RDGeneral/StreamOps.h> |
28 | | #include <RDGeneral/RDLog.h> |
29 | | #include <GraphMol/GenericGroups/GenericGroups.h> |
30 | | #include <GraphMol/QueryOps.h> |
31 | | #include <GraphMol/Chirality.h> |
32 | | |
33 | | #include <fstream> |
34 | | #include <RDGeneral/FileParseException.h> |
35 | | #include <RDGeneral/BadFileException.h> |
36 | | #include <RDGeneral/LocaleSwitcher.h> |
37 | | #include <typeinfo> |
38 | | #include <exception> |
39 | | #include <charconv> |
40 | | #include <regex> |
41 | | #include <sstream> |
42 | | #include <locale> |
43 | | #include <cstdlib> |
44 | | #include <cstdio> |
45 | | #include <string_view> |
46 | | |
47 | | using namespace RDKit::SGroupParsing; |
48 | | using std::regex; |
49 | | using std::regex_match; |
50 | | using std::smatch; |
51 | | |
52 | | namespace RDKit { |
53 | | |
54 | | namespace FileParserUtils { |
55 | | |
56 | 904k | int toInt(const std::string_view input, bool acceptSpaces) { |
57 | | // don't need to worry about locale stuff here because |
58 | | // we're not going to have delimiters |
59 | | |
60 | | // sanity check on the input since strtol doesn't do it for us: |
61 | 904k | const char *txt = input.data(); |
62 | 2.57M | for (size_t i = 0u; i < input.size() && *txt != '\x00'; ++i) { |
63 | 1.72M | if ((*txt >= '0' && *txt <= '9') || (acceptSpaces && *txt == ' ') || |
64 | 1.67M | *txt == '+' || *txt == '-') { |
65 | 1.67M | ++txt; |
66 | 1.67M | } else { |
67 | 51.9k | throw boost::bad_lexical_cast(); |
68 | 51.9k | } |
69 | 1.72M | } |
70 | | // remove leading spaces |
71 | 852k | txt = input.data(); |
72 | 852k | unsigned int sz = input.size(); |
73 | 852k | if (acceptSpaces) { |
74 | 1.17M | while (*txt == ' ') { |
75 | 391k | ++txt; |
76 | 391k | --sz; |
77 | | // have we run off the end of the view? |
78 | 391k | if (sz < 1U) { |
79 | 63.7k | return 0; |
80 | 63.7k | } |
81 | 391k | } |
82 | 852k | } |
83 | 788k | int res = 0; |
84 | 788k | std::from_chars(txt, txt + sz, res); |
85 | | |
86 | 788k | return res; |
87 | 852k | } |
88 | 516k | int toInt(const std::string &input, bool acceptSpaces) { |
89 | 516k | return toInt(std::string_view(input.c_str()), acceptSpaces); |
90 | 516k | } |
91 | 501k | unsigned int toUnsigned(const std::string_view input, bool acceptSpaces) { |
92 | | // don't need to worry about locale stuff here because |
93 | | // we're not going to have delimiters |
94 | | |
95 | | // sanity check on the input since strtol doesn't do it for us: |
96 | 501k | const char *txt = input.data(); |
97 | 1.80M | for (size_t i = 0u; i < input.size() && *txt != '\x00'; ++i) { |
98 | 1.33M | if ((*txt >= '0' && *txt <= '9') || (acceptSpaces && *txt == ' ') || |
99 | 1.30M | *txt == '+') { |
100 | 1.30M | ++txt; |
101 | 1.30M | } else { |
102 | 28.7k | throw boost::bad_lexical_cast(); |
103 | 28.7k | } |
104 | 1.33M | } |
105 | | // remove leading spaces |
106 | 472k | txt = input.data(); |
107 | 472k | unsigned int sz = input.size(); |
108 | 472k | if (acceptSpaces) { |
109 | 1.02M | while (*txt == ' ') { |
110 | 556k | ++txt; |
111 | 556k | --sz; |
112 | | // have we run off the end of the view? |
113 | 556k | if (sz < 1U) { |
114 | 7.71k | return 0; |
115 | 7.71k | } |
116 | 556k | } |
117 | 472k | } |
118 | 465k | unsigned int res = 0; |
119 | 465k | std::from_chars(txt, txt + sz, res); |
120 | 465k | return res; |
121 | 472k | } |
122 | 81.4k | unsigned int toUnsigned(const std::string &input, bool acceptSpaces) { |
123 | 81.4k | return toUnsigned(std::string_view(input.c_str()), acceptSpaces); |
124 | 81.4k | } |
125 | 428k | double toDouble(const std::string_view input, bool acceptSpaces) { |
126 | | // sanity check on the input since strtol doesn't do it for us: |
127 | 428k | const char *txt = input.data(); |
128 | 4.00M | for (size_t i = 0u; i < input.size() && *txt != '\x00'; ++i) { |
129 | | // check for ',' and '.' because locale |
130 | 3.58M | if ((*txt >= '0' && *txt <= '9') || (acceptSpaces && *txt == ' ') || |
131 | 3.58M | *txt == '+' || *txt == '-' || *txt == ',' || *txt == '.') { |
132 | 3.58M | ++txt; |
133 | 3.58M | } else { |
134 | 5.34k | throw boost::bad_lexical_cast(); |
135 | 5.34k | } |
136 | 3.58M | } |
137 | | // unfortunately from_chars() with doubles didn't work on g++ until v11.1 |
138 | | // and the status with clang is hard to figure out... we remain old-school |
139 | | // remove leading spaces |
140 | 422k | double res = atof(input.data()); |
141 | 422k | return res; |
142 | 428k | } |
143 | 26.8k | double toDouble(const std::string &input, bool acceptSpaces) { |
144 | 26.8k | return toDouble(std::string_view(input.c_str()), acceptSpaces); |
145 | 26.8k | } |
146 | 78.8k | std::string getV3000Line(std::istream *inStream, unsigned int &line) { |
147 | | // FIX: technically V3K blocks are case-insensitive. We should really be |
148 | | // up-casing everything here. |
149 | 78.8k | PRECONDITION(inStream, "bad stream"); |
150 | 78.8k | std::string res; |
151 | 78.8k | ++line; |
152 | 78.8k | auto inl = getLine(inStream); |
153 | 78.8k | std::string_view tempStr = inl; |
154 | 78.8k | if (tempStr.size() < 7 || tempStr.substr(0, 7) != "M V30 ") { |
155 | 2.05k | std::ostringstream errout; |
156 | 2.05k | errout << "Line " << line << " does not start with 'M V30 '" << std::endl; |
157 | 2.05k | throw FileParseException(errout.str()); |
158 | 2.05k | } |
159 | | // FIX: do we need to handle trailing whitespace after a -? |
160 | 77.3k | while (tempStr.back() == '-') { |
161 | | // continuation character, append what we read: |
162 | 586 | res += tempStr.substr(7, tempStr.length() - 8); |
163 | | // and then read another line: |
164 | 586 | ++line; |
165 | 586 | inl = getLine(inStream); |
166 | 586 | tempStr = inl; |
167 | 586 | if (tempStr.size() < 7 || tempStr.substr(0, 7) != "M V30 ") { |
168 | 20 | std::ostringstream errout; |
169 | 20 | errout << "Line " << line << " does not start with 'M V30 '" |
170 | 20 | << std::endl; |
171 | 20 | throw FileParseException(errout.str()); |
172 | 20 | } |
173 | 586 | } |
174 | 76.7k | res += tempStr.substr(7, tempStr.length() - 7); |
175 | | |
176 | 76.7k | return res; |
177 | 76.8k | } |
178 | | |
179 | 0 | Atom *replaceAtomWithQueryAtom(RWMol *mol, Atom *atom) { |
180 | 0 | return QueryOps::replaceAtomWithQueryAtom(mol, atom); |
181 | 0 | } |
182 | | } // namespace FileParserUtils |
183 | | using RDKit::FileParserUtils::getV3000Line; |
184 | | |
185 | | namespace { |
186 | | |
//! Checks whether \c haystack begins with the first \c size characters of
//! \c needle
/*!
  \param haystack  the string to examine
  \param needle    the prefix to look for; at least \c size characters must
                   be readable
  \param size      number of characters of \c needle to compare

  \return true if \c haystack holds at least \c size characters and its first
          \c size characters equal those of \c needle
*/
bool startsWith(const std::string &haystack, const char *needle, size_t size) {
  const std::string_view wanted(needle, size);
  // substr() clamps to the available length, so a haystack that is too short
  // yields a shorter view that cannot compare equal to the prefix
  const std::string_view head = std::string_view(haystack).substr(0u, size);
  return head == wanted;
}
190 | | |
191 | | //! parse a collection block to find enhanced stereo groups |
192 | | std::string parseEnhancedStereo(std::istream *inStream, unsigned int &line, |
193 | 4.12k | RWMol *mol, bool strictParsing) { |
194 | | // Lines like (absolute, relative, racemic): |
195 | | // M V30 MDLV30/STEABS ATOMS=(2 2 3) |
196 | | // M V30 MDLV30/STEREL1 ATOMS=(1 12) |
197 | | // M V30 MDLV30/STERAC1 ATOMS=(1 12) |
198 | 4.12k | const regex stereo_label( |
199 | 4.12k | R"regex(MDLV30/STE(...)([0-9]*) +ATOMS=\(([0-9]+) +(.*)\) *)regex"); |
200 | | |
201 | 4.12k | smatch match; |
202 | 4.12k | std::vector<StereoGroup> groups; |
203 | | |
204 | | // Read the collection until the end |
205 | 4.12k | auto tempStr = getV3000Line(inStream, line); |
206 | 4.12k | boost::to_upper(tempStr); |
207 | 4.12k | unsigned abs_group_seen = 0; |
208 | 6.68k | while (!startsWith(tempStr, "END", 3)) { |
209 | | // If this line in the collection is part of a stereo group |
210 | 2.55k | if (regex_match(tempStr, match, stereo_label)) { |
211 | 0 | StereoGroupType grouptype = RDKit::StereoGroupType::STEREO_ABSOLUTE; |
212 | 0 | unsigned groupid = 0; |
213 | |
|
214 | 0 | if (match[1] == "ABS") { |
215 | 0 | grouptype = RDKit::StereoGroupType::STEREO_ABSOLUTE; |
216 | | // Warn only one per mol about multiple ABS groups |
217 | 0 | if (abs_group_seen == 1) { |
218 | 0 | std::ostringstream errout; |
219 | 0 | errout << "Seen a second ABS stereo group on line " << line |
220 | 0 | << std::endl; |
221 | 0 | if (strictParsing) { |
222 | 0 | throw FileParseException(errout.str()); |
223 | 0 | } else { |
224 | 0 | BOOST_LOG(rdWarningLog) << errout.str() << std::endl; |
225 | 0 | } |
226 | 0 | } |
227 | 0 | ++abs_group_seen; |
228 | 0 | } else if (match[1] == "REL") { |
229 | 0 | grouptype = RDKit::StereoGroupType::STEREO_OR; |
230 | 0 | groupid = FileParserUtils::toUnsigned(match[2], true); |
231 | 0 | } else if (match[1] == "RAC") { |
232 | 0 | grouptype = RDKit::StereoGroupType::STEREO_AND; |
233 | 0 | groupid = FileParserUtils::toUnsigned(match[2], true); |
234 | 0 | } else { |
235 | 0 | std::ostringstream errout; |
236 | 0 | errout << "Unrecognized stereogroup type : '" << tempStr << "' on line" |
237 | 0 | << line; |
238 | 0 | throw FileParseException(errout.str()); |
239 | 0 | } |
240 | | |
241 | 0 | const unsigned int count = FileParserUtils::toUnsigned(match[3], true); |
242 | 0 | std::vector<Atom *> atoms; |
243 | 0 | std::stringstream ss(match[4]); |
244 | 0 | unsigned int index; |
245 | 0 | for (size_t i = 0; i < count; ++i) { |
246 | 0 | ss >> index; |
247 | | // atoms are 1 indexed in molfiles |
248 | 0 | atoms.push_back(mol->getAtomWithIdx(index - 1)); |
249 | 0 | } |
250 | 0 | std::vector<Bond *> newBonds; |
251 | 0 | groups.emplace_back(grouptype, std::move(atoms), std::move(newBonds), |
252 | 0 | groupid); |
253 | 2.55k | } else { |
254 | | // skip collection types we don't know how to read. Only one documented |
255 | | // is MDLV30/HILITE |
256 | 2.55k | BOOST_LOG(rdWarningLog) << "Skipping unrecognized collection type at " |
257 | 0 | "line " |
258 | 0 | << line << ": " << tempStr << std::endl; |
259 | 2.55k | } |
260 | 2.55k | tempStr = getV3000Line(inStream, line); |
261 | 2.55k | } |
262 | | |
263 | 4.12k | if (!groups.empty()) { |
264 | 0 | mol->setStereoGroups(std::move(groups)); |
265 | 0 | } |
266 | 4.12k | tempStr = getV3000Line(inStream, line); |
267 | 4.12k | return tempStr; |
268 | 4.12k | } |
269 | | |
270 | | //************************************* |
271 | | // |
272 | | // Every effort has been made to adhere to MDL's standard |
273 | | // for mol files |
274 | | // |
275 | | //************************************* |
276 | | |
277 | | void ParseOldAtomList(RWMol *mol, const std::string_view &text, |
278 | 253 | unsigned int line) { |
279 | 253 | PRECONDITION(mol, "bad mol"); |
280 | 253 | unsigned int idx; |
281 | 253 | try { |
282 | 253 | idx = FileParserUtils::stripSpacesAndCast<unsigned int>(text.substr(0, 3)) - |
283 | 253 | 1; |
284 | 253 | } catch (boost::bad_lexical_cast &) { |
285 | 127 | std::ostringstream errout; |
286 | 127 | errout << "Cannot convert '" << text.substr(0, 3) << "' to int on line " |
287 | 127 | << line; |
288 | 127 | throw FileParseException(errout.str()); |
289 | 127 | } |
290 | | |
291 | 126 | URANGE_CHECK(idx, mol->getNumAtoms()); |
292 | 87 | QueryAtom a(*(mol->getAtomWithIdx(idx))); |
293 | | |
294 | 87 | auto *q = new ATOM_OR_QUERY; |
295 | 87 | q->setDescription("AtomOr"); |
296 | | |
297 | 87 | switch (text[4]) { |
298 | 62 | case 'T': |
299 | 62 | q->setNegation(true); |
300 | 62 | break; |
301 | 13 | case 'F': |
302 | 13 | q->setNegation(false); |
303 | 13 | break; |
304 | 12 | default: |
305 | 12 | delete q; |
306 | 12 | std::ostringstream errout; |
307 | 12 | errout << "Unrecognized atom-list query modifier: '" << text[4] |
308 | 12 | << "' on line " << line; |
309 | 12 | throw FileParseException(errout.str()); |
310 | 87 | } |
311 | | |
312 | 75 | int nQueries; |
313 | 75 | try { |
314 | 75 | nQueries = FileParserUtils::toInt(text.substr(9, 1)); |
315 | 75 | } catch (const std::out_of_range &) { |
316 | 4 | delete q; |
317 | 4 | std::ostringstream errout; |
318 | 4 | errout << "Cannot convert position 9 of '" << text << "' to int on line " |
319 | 4 | << line; |
320 | 4 | throw FileParseException(errout.str()); |
321 | 7 | } catch (boost::bad_lexical_cast &) { |
322 | 7 | delete q; |
323 | 7 | std::ostringstream errout; |
324 | 7 | errout << "Cannot convert '" << text.substr(9, 1) << "' to int on line " |
325 | 7 | << line; |
326 | 7 | throw FileParseException(errout.str()); |
327 | 7 | } |
328 | | |
329 | 64 | RANGE_CHECK(0, nQueries, 5); |
330 | 137 | for (int i = 0; i < nQueries; i++) { |
331 | 114 | int pos = 11 + i * 4; |
332 | 114 | int atNum; |
333 | 114 | try { |
334 | 114 | atNum = FileParserUtils::toInt(text.substr(pos, 3)); |
335 | 114 | } catch (const std::out_of_range &) { |
336 | 8 | delete q; |
337 | 8 | std::ostringstream errout; |
338 | 8 | errout << "Cannot convert position " << pos << " of '" << text |
339 | 8 | << "' to int on line " << line; |
340 | 8 | throw FileParseException(errout.str()); |
341 | 22 | } catch (boost::bad_lexical_cast &) { |
342 | 22 | delete q; |
343 | 22 | std::ostringstream errout; |
344 | 22 | errout << "Cannot convert '" << text.substr(pos, 3) << "' to int on line " |
345 | 22 | << line; |
346 | 22 | throw FileParseException(errout.str()); |
347 | 22 | } |
348 | 84 | RANGE_CHECK(0, atNum, 200); // goofy! |
349 | 74 | q->addChild( |
350 | 74 | QueryAtom::QUERYATOM_QUERY::CHILD_TYPE(makeAtomNumQuery(atNum))); |
351 | 74 | if (!i) { |
352 | 39 | a.setAtomicNum(atNum); |
353 | 39 | } |
354 | 74 | } |
355 | | |
356 | 23 | a.setQuery(q); |
357 | 23 | a.setProp(common_properties::_MolFileAtomQuery, 1); |
358 | | |
359 | 23 | mol->replaceAtom(idx, &a); |
360 | 23 | } |
361 | | |
362 | | void ParseChargeLine(RWMol *mol, const std::string &text, bool firstCall, |
363 | 1.69k | unsigned int line) { |
364 | 1.69k | PRECONDITION(mol, "bad mol"); |
365 | 1.69k | PRECONDITION(text.substr(0, 6) == std::string("M CHG"), "bad charge line"); |
366 | | |
367 | | // if this line is specified all the atom other than those specified |
368 | | // here should carry a charge of 0; but we should only do this once: |
369 | 1.69k | if (firstCall) { |
370 | 2.73k | for (ROMol::AtomIterator ai = mol->beginAtoms(); ai != mol->endAtoms(); |
371 | 2.45k | ++ai) { |
372 | 2.45k | (*ai)->setFormalCharge(0); |
373 | 2.45k | } |
374 | 275 | } |
375 | | |
376 | 1.69k | int ie, nent; |
377 | 1.69k | try { |
378 | 1.69k | nent = FileParserUtils::toInt(text.substr(6, 3)); |
379 | 1.69k | } catch (boost::bad_lexical_cast &) { |
380 | 21 | std::ostringstream errout; |
381 | 21 | errout << "Cannot convert '" << text.substr(6, 3) << "' to int on line " |
382 | 21 | << line; |
383 | 21 | throw FileParseException(errout.str()); |
384 | 21 | } |
385 | 1.67k | int spos = 9; |
386 | 2.87k | for (ie = 0; ie < nent; ie++) { |
387 | 1.27k | int aid, chg; |
388 | 1.27k | try { |
389 | 1.27k | aid = FileParserUtils::toInt(text.substr(spos, 4)); |
390 | 1.27k | spos += 4; |
391 | 1.27k | chg = FileParserUtils::toInt(text.substr(spos, 4)); |
392 | 1.27k | spos += 4; |
393 | 1.27k | mol->getAtomWithIdx(aid - 1)->setFormalCharge(chg); |
394 | 1.27k | } catch (boost::bad_lexical_cast &) { |
395 | 25 | std::ostringstream errout; |
396 | 25 | errout << "Cannot convert '" << text.substr(spos, 4) |
397 | 25 | << "' to int on line " << line; |
398 | 25 | throw FileParseException(errout.str()); |
399 | 25 | } |
400 | 1.27k | } |
401 | 1.67k | } |
402 | | |
403 | | void ParseRadicalLine(RWMol *mol, const std::string &text, bool firstCall, |
404 | 3.68k | unsigned int line) { |
405 | 3.68k | PRECONDITION(mol, "bad mol"); |
406 | 3.68k | PRECONDITION(text.substr(0, 6) == std::string("M RAD"), "bad charge line"); |
407 | | |
408 | | // if this line is specified all the atom other than those specified |
409 | | // here should carry a charge of 0; but we should only do this once: |
410 | 3.68k | if (firstCall) { |
411 | 524 | for (ROMol::AtomIterator ai = mol->beginAtoms(); ai != mol->endAtoms(); |
412 | 309 | ++ai) { |
413 | 309 | (*ai)->setFormalCharge(0); |
414 | 309 | } |
415 | 215 | } |
416 | | |
417 | 3.68k | int ie, nent; |
418 | 3.68k | try { |
419 | 3.68k | nent = FileParserUtils::toInt(text.substr(6, 3)); |
420 | 3.68k | } catch (boost::bad_lexical_cast &) { |
421 | 14 | std::ostringstream errout; |
422 | 14 | errout << "Cannot convert '" << text.substr(6, 3) << "' to int on line " |
423 | 14 | << line; |
424 | 14 | throw FileParseException(errout.str()); |
425 | 14 | } |
426 | 3.67k | int spos = 9; |
427 | 7.46k | for (ie = 0; ie < nent; ie++) { |
428 | 3.86k | int aid, rad; |
429 | 3.86k | std::ostringstream errout; |
430 | | |
431 | 3.86k | try { |
432 | 3.86k | aid = FileParserUtils::toInt(text.substr(spos, 4)); |
433 | 3.86k | spos += 4; |
434 | 3.86k | rad = FileParserUtils::toInt(text.substr(spos, 4)); |
435 | 3.86k | spos += 4; |
436 | | |
437 | 3.86k | switch (rad) { |
438 | 361 | case 0: |
439 | | // This shouldn't be required, but let's make sure. |
440 | 361 | mol->getAtomWithIdx(aid - 1)->setNumRadicalElectrons(0); |
441 | 361 | break; |
442 | 977 | case 1: |
443 | 977 | mol->getAtomWithIdx(aid - 1)->setNumRadicalElectrons(2); |
444 | 977 | break; |
445 | 1.13k | case 2: |
446 | 1.13k | mol->getAtomWithIdx(aid - 1)->setNumRadicalElectrons(1); |
447 | 1.13k | break; |
448 | 1.34k | case 3: |
449 | 1.34k | mol->getAtomWithIdx(aid - 1)->setNumRadicalElectrons(2); |
450 | 1.34k | break; |
451 | 7 | default: |
452 | 7 | errout << "Unrecognized radical value " << rad << " for atom " |
453 | 7 | << aid - 1 << " on line " << line << std::endl; |
454 | 7 | throw FileParseException(errout.str()); |
455 | 3.86k | } |
456 | 3.86k | } catch (boost::bad_lexical_cast &) { |
457 | 25 | std::ostringstream errout; |
458 | 25 | errout << "Cannot convert '" << text.substr(spos, 4) |
459 | 25 | << "' to int on line " << line; |
460 | 25 | throw FileParseException(errout.str()); |
461 | 25 | } |
462 | 3.86k | } |
463 | 3.67k | } |
464 | | |
465 | 4.74k | void ParsePXALine(RWMol *mol, const std::string &text, unsigned int line) { |
466 | 4.74k | PRECONDITION(mol, "bad mol"); |
467 | 4.74k | PRECONDITION(text.substr(0, 6) == "M PXA", "bad PXA line"); |
468 | 4.74k | unsigned int pos = 7; |
469 | 4.74k | try { |
470 | 4.74k | auto atIdx = |
471 | 4.74k | FileParserUtils::stripSpacesAndCast<unsigned int>(text.substr(pos, 3)); |
472 | 4.74k | pos += 3; |
473 | 4.74k | mol->getAtomWithIdx(atIdx - 1)->setProp( |
474 | 4.74k | "_MolFile_PXA", text.substr(pos, text.length() - pos)); |
475 | 4.74k | } catch (boost::bad_lexical_cast &) { |
476 | 15 | std::ostringstream errout; |
477 | 15 | errout << "Cannot convert '" << text.substr(pos, 3) << "' to int on line " |
478 | 15 | << line; |
479 | 15 | throw FileParseException(errout.str()); |
480 | 15 | } |
481 | 4.74k | } |
482 | | |
483 | 2.42k | void ParseIsotopeLine(RWMol *mol, const std::string &text, unsigned int line) { |
484 | 2.42k | PRECONDITION(mol, "bad mol"); |
485 | 2.42k | PRECONDITION(text.substr(0, 6) == std::string("M ISO"), "bad isotope line"); |
486 | | |
487 | 2.42k | unsigned int nent; |
488 | 2.42k | try { |
489 | 2.42k | nent = FileParserUtils::stripSpacesAndCast<unsigned int>(text.substr(6, 3)); |
490 | 2.42k | } catch (boost::bad_lexical_cast &) { |
491 | 8 | std::ostringstream errout; |
492 | 8 | errout << "Cannot convert '" << text.substr(6, 3) << "' to int on line " |
493 | 8 | << line; |
494 | 8 | throw FileParseException(errout.str()); |
495 | 8 | } |
496 | 2.41k | unsigned int spos = 9; |
497 | 4.75k | for (unsigned int ie = 0; ie < nent; ie++) { |
498 | 2.39k | unsigned int aid; |
499 | 2.39k | try { |
500 | 2.39k | aid = FileParserUtils::stripSpacesAndCast<unsigned int>( |
501 | 2.39k | text.substr(spos, 4)); |
502 | 2.39k | spos += 4; |
503 | 2.39k | Atom *atom = mol->getAtomWithIdx(aid - 1); |
504 | 2.39k | if (text.size() >= spos + 4 && text.substr(spos, 4) != " ") { |
505 | 1.48k | int isotope = FileParserUtils::toInt(text.substr(spos, 4)); |
506 | 1.48k | if (isotope < 0) { |
507 | 228 | BOOST_LOG(rdErrorLog) |
508 | 0 | << " atom " << aid |
509 | 0 | << " has a negative isotope value. line: " << line << std::endl; |
510 | 1.25k | } else { |
511 | 1.25k | atom->setIsotope(isotope); |
512 | 1.25k | } |
513 | 1.48k | } |
514 | 2.39k | spos += 4; |
515 | 2.39k | } catch (boost::bad_lexical_cast &) { |
516 | 34 | std::ostringstream errout; |
517 | 34 | errout << "Cannot convert '" << text.substr(spos, 4) |
518 | 34 | << "' to int on line " << line; |
519 | 34 | throw FileParseException(errout.str()); |
520 | 34 | } |
521 | 2.39k | } |
522 | 2.41k | } |
523 | | |
524 | | void ParseSubstitutionCountLine(RWMol *mol, const std::string &text, |
525 | 3.60k | unsigned int line) { |
526 | 3.60k | PRECONDITION(mol, "bad mol"); |
527 | 3.60k | PRECONDITION(text.substr(0, 6) == std::string("M SUB"), "bad SUB line"); |
528 | | |
529 | 3.60k | unsigned int nent; |
530 | 3.60k | try { |
531 | 3.60k | nent = FileParserUtils::stripSpacesAndCast<unsigned int>(text.substr(6, 3)); |
532 | 3.60k | } catch (boost::bad_lexical_cast &) { |
533 | 17 | std::ostringstream errout; |
534 | 17 | errout << "Cannot convert '" << text.substr(6, 3) << "' to int on line " |
535 | 17 | << line; |
536 | 17 | throw FileParseException(errout.str()); |
537 | 17 | } |
538 | 3.58k | unsigned int spos = 9; |
539 | 7.79k | for (unsigned int ie = 0; ie < nent; ie++) { |
540 | 4.29k | unsigned int aid; |
541 | 4.29k | int count = 0; |
542 | 4.29k | try { |
543 | 4.29k | aid = FileParserUtils::stripSpacesAndCast<unsigned int>( |
544 | 4.29k | text.substr(spos, 4)); |
545 | 4.29k | spos += 4; |
546 | 4.29k | Atom *atom = mol->getAtomWithIdx(aid - 1); |
547 | 4.29k | if (text.size() >= spos + 4 && text.substr(spos, 4) != " ") { |
548 | 3.63k | count = FileParserUtils::toInt(text.substr(spos, 4)); |
549 | 3.63k | } |
550 | 4.29k | spos += 4; |
551 | 4.29k | if (count == 0) { |
552 | 1.52k | continue; |
553 | 1.52k | } |
554 | 2.77k | ATOM_EQUALS_QUERY *q = makeAtomExplicitDegreeQuery(0); |
555 | 2.77k | switch (count) { |
556 | 582 | case -1: |
557 | 582 | q->setVal(0); |
558 | 582 | break; |
559 | 420 | case -2: |
560 | 420 | q->setVal(atom->getDegree()); |
561 | 420 | break; |
562 | 190 | case 1: |
563 | 302 | case 2: |
564 | 363 | case 3: |
565 | 445 | case 4: |
566 | 1.18k | case 5: |
567 | 1.18k | q->setVal(count); |
568 | 1.18k | break; |
569 | 489 | case 6: |
570 | 489 | BOOST_LOG(rdWarningLog) << " atom degree query with value 6 found. " |
571 | 0 | "This will not match degree >6. The MDL " |
572 | 0 | "spec says it should. line: " |
573 | 0 | << line; |
574 | 489 | q->setVal(6); |
575 | 489 | break; |
576 | 17 | default: |
577 | 17 | std::ostringstream errout; |
578 | 17 | errout << "Value " << count |
579 | 17 | << " is not supported as a degree query. line: " << line; |
580 | 17 | throw FileParseException(errout.str()); |
581 | 2.77k | } |
582 | 2.67k | if (!atom->hasQuery()) { |
583 | 104 | atom = QueryOps::replaceAtomWithQueryAtom(mol, atom); |
584 | 104 | } |
585 | 2.67k | atom->expandQuery(q, Queries::COMPOSITE_AND); |
586 | 2.67k | } catch (boost::bad_lexical_cast &) { |
587 | 44 | std::ostringstream errout; |
588 | 44 | errout << "Cannot convert '" << text.substr(spos, 4) |
589 | 44 | << "' to int on line " << line; |
590 | 44 | throw FileParseException(errout.str()); |
591 | 44 | } |
592 | 4.29k | } |
593 | 3.58k | } |
594 | | |
595 | | void ParseUnsaturationLine(RWMol *mol, const std::string &text, |
596 | 2.78k | unsigned int line) { |
597 | 2.78k | PRECONDITION(mol, "bad mol"); |
598 | 2.78k | PRECONDITION(text.substr(0, 6) == std::string("M UNS"), "bad UNS line"); |
599 | | |
600 | 2.78k | unsigned int nent; |
601 | 2.78k | try { |
602 | 2.78k | nent = FileParserUtils::stripSpacesAndCast<unsigned int>(text.substr(6, 3)); |
603 | 2.78k | } catch (boost::bad_lexical_cast &) { |
604 | 7 | std::ostringstream errout; |
605 | 7 | errout << "Cannot convert '" << text.substr(6, 3) << "' to int on line " |
606 | 7 | << line; |
607 | 7 | throw FileParseException(errout.str()); |
608 | 7 | } |
609 | 2.77k | unsigned int spos = 9; |
610 | 5.35k | for (unsigned int ie = 0; ie < nent; ie++) { |
611 | 2.68k | unsigned int aid; |
612 | 2.68k | int count = 0; |
613 | 2.68k | try { |
614 | 2.68k | aid = FileParserUtils::stripSpacesAndCast<unsigned int>( |
615 | 2.68k | text.substr(spos, 4)); |
616 | 2.68k | spos += 4; |
617 | 2.68k | Atom *atom = mol->getAtomWithIdx(aid - 1); |
618 | 2.68k | if (text.size() >= spos + 4 && text.substr(spos, 4) != " ") { |
619 | 1.39k | count = FileParserUtils::toInt(text.substr(spos, 4)); |
620 | 1.39k | } |
621 | 2.68k | spos += 4; |
622 | 2.68k | if (count == 0) { |
623 | 2.13k | continue; |
624 | 2.13k | } else if (count == 1) { |
625 | 444 | ATOM_EQUALS_QUERY *q = makeAtomUnsaturatedQuery(); |
626 | 444 | if (!atom->hasQuery()) { |
627 | 70 | atom = QueryOps::replaceAtomWithQueryAtom(mol, atom); |
628 | 70 | } |
629 | 444 | atom->expandQuery(q, Queries::COMPOSITE_AND); |
630 | 444 | } else { |
631 | 111 | std::ostringstream errout; |
632 | 111 | errout << "Value " << count |
633 | 111 | << " is not supported as an unsaturation " |
634 | 111 | "query (only 0 and 1 are allowed). " |
635 | 111 | "line: " |
636 | 111 | << line; |
637 | 111 | throw FileParseException(errout.str()); |
638 | 111 | } |
639 | 2.68k | } catch (boost::bad_lexical_cast &) { |
640 | 71 | std::ostringstream errout; |
641 | 71 | errout << "Cannot convert '" << text.substr(spos, 4) |
642 | 71 | << "' to int on line " << line; |
643 | 71 | throw FileParseException(errout.str()); |
644 | 71 | } |
645 | 2.68k | } |
646 | 2.77k | } |
647 | | |
648 | | void ParseRingBondCountLine(RWMol *mol, const std::string &text, |
649 | 9.68k | unsigned int line) { |
650 | 9.68k | PRECONDITION(mol, "bad mol"); |
651 | 9.68k | PRECONDITION(text.substr(0, 6) == std::string("M RBC"), "bad RBC line"); |
652 | | |
653 | 9.68k | unsigned int nent; |
654 | 9.68k | try { |
655 | 9.68k | nent = FileParserUtils::stripSpacesAndCast<unsigned int>(text.substr(6, 3)); |
656 | 9.68k | } catch (boost::bad_lexical_cast &) { |
657 | 14 | std::ostringstream errout; |
658 | 14 | errout << "Cannot convert '" << text.substr(6, 3) << "' to int on line " |
659 | 14 | << line; |
660 | 14 | throw FileParseException(errout.str()); |
661 | 14 | } |
662 | 9.66k | unsigned int spos = 9; |
663 | 19.6k | for (unsigned int ie = 0; ie < nent; ie++) { |
664 | 10.1k | unsigned int aid; |
665 | 10.1k | int count = 0; |
666 | 10.1k | try { |
667 | 10.1k | aid = FileParserUtils::stripSpacesAndCast<unsigned int>( |
668 | 10.1k | text.substr(spos, 4)); |
669 | 10.1k | spos += 4; |
670 | 10.1k | Atom *atom = mol->getAtomWithIdx(aid - 1); |
671 | 10.1k | if (text.size() >= spos + 4 && text.substr(spos, 4) != " ") { |
672 | 9.16k | count = FileParserUtils::toInt(text.substr(spos, 4)); |
673 | 9.16k | } |
674 | 10.1k | spos += 4; |
675 | 10.1k | if (count == 0) { |
676 | 1.55k | continue; |
677 | 1.55k | } |
678 | 8.55k | ATOM_EQUALS_QUERY *q = makeAtomRingBondCountQuery(0); |
679 | 8.55k | switch (count) { |
680 | 1.91k | case -1: |
681 | 1.91k | q->setVal(0); |
682 | 1.91k | break; |
683 | 4.10k | case -2: |
684 | 4.10k | q->setVal(0xDEADBEEF); |
685 | 4.10k | mol->setProp(common_properties::_NeedsQueryScan, 1); |
686 | 4.10k | break; |
687 | 446 | case 1: |
688 | 612 | case 2: |
689 | 1.58k | case 3: |
690 | 1.58k | q->setVal(count); |
691 | 1.58k | break; |
692 | 871 | case 4: |
693 | 871 | delete q; |
694 | 871 | q = static_cast<ATOM_EQUALS_QUERY *>(new ATOM_LESSEQUAL_QUERY); |
695 | 871 | q->setVal(4); |
696 | 871 | q->setDescription("AtomRingBondCount"); |
697 | 871 | q->setDataFunc(queryAtomRingBondCount); |
698 | 871 | break; |
699 | 25 | default: |
700 | 25 | std::ostringstream errout; |
701 | 25 | errout << "Value " << count |
702 | 25 | << " is not supported as a ring-bond count query. line: " |
703 | 25 | << line; |
704 | 25 | throw FileParseException(errout.str()); |
705 | 8.55k | } |
706 | 8.47k | if (!atom->hasQuery()) { |
707 | 173 | atom = QueryOps::replaceAtomWithQueryAtom(mol, atom); |
708 | 173 | } |
709 | 8.47k | atom->expandQuery(q, Queries::COMPOSITE_AND); |
710 | 8.47k | } catch (boost::bad_lexical_cast &) { |
711 | 31 | std::ostringstream errout; |
712 | 31 | errout << "Cannot convert '" << text.substr(spos, 4) |
713 | 31 | << "' to int on line " << line; |
714 | 31 | throw FileParseException(errout.str()); |
715 | 31 | } |
716 | 10.1k | } |
717 | 9.66k | } |
718 | | |
719 | 1.60k | void ParseZCHLine(RWMol *mol, const std::string &text, unsigned int line) { |
720 | | // part of Alex Clark's ZBO proposal |
721 | | // from JCIM 51:3149-57 (2011) |
722 | 1.60k | PRECONDITION(mol, "bad mol"); |
723 | 1.60k | PRECONDITION(text.substr(0, 6) == std::string("M ZCH"), "bad ZCH line"); |
724 | | |
725 | 1.60k | unsigned int nent; |
726 | 1.60k | try { |
727 | 1.60k | nent = FileParserUtils::stripSpacesAndCast<unsigned int>(text.substr(6, 3)); |
728 | 1.60k | } catch (boost::bad_lexical_cast &) { |
729 | 8 | std::ostringstream errout; |
730 | 8 | errout << "Cannot convert '" << text.substr(6, 3) << "' to int on line " |
731 | 8 | << line; |
732 | 8 | throw FileParseException(errout.str()); |
733 | 8 | } |
734 | 1.60k | unsigned int spos = 9; |
735 | 3.61k | for (unsigned int ie = 0; ie < nent; ie++) { |
736 | 2.11k | unsigned int aid = 0; |
737 | 2.11k | int val = 0; |
738 | 2.11k | try { |
739 | 2.11k | aid = FileParserUtils::stripSpacesAndCast<unsigned int>( |
740 | 2.11k | text.substr(spos, 4)); |
741 | 2.11k | spos += 4; |
742 | 2.11k | if (text.size() >= spos + 4 && text.substr(spos, 4) != " ") { |
743 | 1.38k | val = FileParserUtils::stripSpacesAndCast<int>(text.substr(spos, 4)); |
744 | 1.38k | } |
745 | 2.11k | if (!aid || aid > mol->getNumAtoms()) { |
746 | 45 | std::ostringstream errout; |
747 | 45 | errout << "Bad ZCH specification on line " << line; |
748 | 45 | throw FileParseException(errout.str()); |
749 | 45 | } |
750 | 2.06k | spos += 4; |
751 | 2.06k | --aid; |
752 | 2.06k | Atom *atom = mol->getAtomWithIdx(aid); |
753 | 2.06k | if (!atom) { |
754 | 0 | std::ostringstream errout; |
755 | 0 | errout << "Atom " << aid << " from ZCH specification on line " << line |
756 | 0 | << " not found"; |
757 | 0 | throw FileParseException(errout.str()); |
758 | 2.06k | } else { |
759 | 2.06k | atom->setFormalCharge(val); |
760 | 2.06k | } |
761 | 2.06k | } catch (boost::bad_lexical_cast &) { |
762 | 41 | std::ostringstream errout; |
763 | 41 | errout << "Cannot convert '" << text.substr(spos, 4) |
764 | 41 | << "' to int on line " << line; |
765 | 41 | throw FileParseException(errout.str()); |
766 | 41 | } |
767 | 2.11k | } |
768 | 1.60k | } |
769 | | |
770 | 1.68k | void ParseHYDLine(RWMol *mol, const std::string &text, unsigned int line) { |
771 | | // part of Alex Clark's ZBO proposal |
772 | | // from JCIM 51:3149-57 (2011) |
773 | 1.68k | PRECONDITION(mol, "bad mol"); |
774 | 1.68k | PRECONDITION(text.substr(0, 6) == std::string("M HYD"), "bad HYD line"); |
775 | | |
776 | 1.68k | unsigned int nent; |
777 | 1.68k | try { |
778 | 1.68k | nent = FileParserUtils::stripSpacesAndCast<unsigned int>(text.substr(6, 3)); |
779 | 1.68k | } catch (boost::bad_lexical_cast &) { |
780 | 20 | std::ostringstream errout; |
781 | 20 | errout << "Cannot convert '" << text.substr(6, 3) << "' to int on line " |
782 | 20 | << line; |
783 | 20 | throw FileParseException(errout.str()); |
784 | 20 | } |
785 | 1.66k | unsigned int spos = 9; |
786 | 4.87k | for (unsigned int ie = 0; ie < nent; ie++) { |
787 | 3.32k | unsigned int aid = 0; |
788 | 3.32k | int val = -1; |
789 | 3.32k | try { |
790 | 3.32k | aid = FileParserUtils::stripSpacesAndCast<unsigned int>( |
791 | 3.32k | text.substr(spos, 4)); |
792 | 3.32k | spos += 4; |
793 | 3.32k | if (text.size() >= spos + 4 && text.substr(spos, 4) != " ") { |
794 | 2.59k | val = FileParserUtils::stripSpacesAndCast<int>(text.substr(spos, 4)); |
795 | 2.59k | } |
796 | 3.32k | if (!aid || aid > mol->getNumAtoms()) { |
797 | 58 | std::ostringstream errout; |
798 | 58 | errout << "Bad HYD specification on line " << line; |
799 | 58 | throw FileParseException(errout.str()); |
800 | 58 | } |
801 | 3.26k | spos += 4; |
802 | 3.26k | --aid; |
803 | 3.26k | Atom *atom = mol->getAtomWithIdx(aid); |
804 | 3.26k | if (!atom) { |
805 | 0 | std::ostringstream errout; |
806 | 0 | errout << "Atom " << aid << " from HYD specification on line " << line |
807 | 0 | << " not found"; |
808 | 0 | throw FileParseException(errout.str()); |
809 | 3.26k | } else { |
810 | 3.26k | if (val >= 0) { |
811 | 2.52k | atom->setProp("_ZBO_H", true); |
812 | 2.52k | atom->setNumExplicitHs(val); |
813 | 2.52k | } |
814 | 3.26k | } |
815 | 3.26k | } catch (boost::bad_lexical_cast &) { |
816 | 43 | std::ostringstream errout; |
817 | 43 | errout << "Cannot convert '" << text.substr(spos, 4) |
818 | 43 | << "' to int on line " << line; |
819 | 43 | throw FileParseException(errout.str()); |
820 | 43 | } |
821 | 3.32k | } |
822 | 1.66k | } |
823 | | |
824 | 1.34k | void ParseZBOLine(RWMol *mol, const std::string &text, unsigned int line) { |
825 | | // part of Alex Clark's ZBO proposal |
826 | | // from JCIM 51:3149-57 (2011) |
827 | 1.34k | PRECONDITION(mol, "bad mol"); |
828 | 1.34k | PRECONDITION(text.substr(0, 6) == std::string("M ZBO"), "bad ZBO line"); |
829 | | |
830 | 1.34k | unsigned int nent; |
831 | 1.34k | try { |
832 | 1.34k | nent = FileParserUtils::stripSpacesAndCast<unsigned int>(text.substr(6, 3)); |
833 | 1.34k | } catch (boost::bad_lexical_cast &) { |
834 | 23 | std::ostringstream errout; |
835 | 23 | errout << "Cannot convert '" << text.substr(6, 3) << "' to int on line " |
836 | 23 | << line; |
837 | 23 | throw FileParseException(errout.str()); |
838 | 23 | } |
839 | 1.32k | unsigned int spos = 9; |
840 | 3.29k | for (unsigned int ie = 0; ie < nent; ie++) { |
841 | 2.08k | unsigned int bid = 0; |
842 | 2.08k | unsigned int order = 0; |
843 | 2.08k | try { |
844 | 2.08k | bid = FileParserUtils::stripSpacesAndCast<unsigned int>( |
845 | 2.08k | text.substr(spos, 4)); |
846 | 2.08k | spos += 4; |
847 | 2.08k | if (text.size() >= spos + 4 && text.substr(spos, 4) != " ") { |
848 | 993 | order = FileParserUtils::stripSpacesAndCast<unsigned int>( |
849 | 993 | text.substr(spos, 4)); |
850 | 993 | } |
851 | 2.08k | if (!bid || bid > mol->getNumBonds()) { |
852 | 36 | std::ostringstream errout; |
853 | 36 | errout << "Bad ZBO specification on line " << line; |
854 | 36 | throw FileParseException(errout.str()); |
855 | 36 | } |
856 | 2.05k | spos += 4; |
857 | 2.05k | --bid; |
858 | 2.05k | Bond *bnd = mol->getBondWithIdx(bid); |
859 | 2.05k | if (!bnd) { |
860 | 0 | std::ostringstream errout; |
861 | 0 | errout << "Bond " << bid << " from ZBO specification on line " << line |
862 | 0 | << " not found"; |
863 | 0 | throw FileParseException(errout.str()); |
864 | 2.05k | } else { |
865 | 2.05k | if (order == 0) { |
866 | 1.03k | bnd->setBondType(Bond::ZERO); |
867 | 1.03k | } else { |
868 | 1.01k | bnd->setBondType(static_cast<Bond::BondType>(order)); |
869 | 1.01k | } |
870 | 2.05k | } |
871 | 2.05k | } catch (boost::bad_lexical_cast &) { |
872 | 67 | std::ostringstream errout; |
873 | 67 | errout << "Cannot convert '" << text.substr(spos, 4) |
874 | 67 | << "' to int on line " << line; |
875 | 67 | throw FileParseException(errout.str()); |
876 | 67 | } |
877 | 2.08k | } |
878 | 1.32k | } |
879 | | |
880 | | void ParseMarvinSmartsLine(RWMol *mol, const std::string &text, |
881 | 35.8k | unsigned int line) { |
882 | 35.8k | const unsigned int atomNumStart = 10; |
883 | 35.8k | const unsigned int smartsStart = 15; |
884 | | // M MRV SMA 1 [*;A] |
885 | | // 01234567890123456789 |
886 | | // 1111111111 |
887 | 35.8k | if (text.substr(0, 10) != "M MRV SMA") { |
888 | 5.83k | return; |
889 | 5.83k | } |
890 | | |
891 | 30.0k | unsigned int idx; |
892 | 30.0k | std::string idxTxt = text.substr(atomNumStart, smartsStart - atomNumStart); |
893 | 30.0k | try { |
894 | 30.0k | idx = FileParserUtils::stripSpacesAndCast<unsigned int>(idxTxt) - 1; |
895 | 30.0k | } catch (boost::bad_lexical_cast &) { |
896 | 38 | std::ostringstream errout; |
897 | 38 | errout << "Cannot convert '" << idxTxt << "' to an atom index on line " |
898 | 38 | << line; |
899 | 38 | throw FileParseException(errout.str()); |
900 | 38 | } |
901 | | |
902 | 29.9k | URANGE_CHECK(idx, mol->getNumAtoms()); |
903 | | // Should we check the validity of the marvin line here? Should we |
904 | | // automatically |
905 | | // Add these as recursive smarts? I tend to think so... |
906 | 29.9k | std::string sma = text.substr(smartsStart); |
907 | 29.9k | Atom *at = mol->getAtomWithIdx(idx); |
908 | 29.9k | at->setProp(common_properties::MRV_SMA, sma); |
909 | 29.9k | RWMol *m = nullptr; |
910 | 29.9k | try { |
911 | 29.9k | m = SmartsToMol(sma); |
912 | 29.9k | } catch (...) { |
913 | | // Is this ever used? |
914 | 2.75k | } |
915 | | |
916 | 29.9k | if (m) { |
917 | 24.4k | QueryAtom::QUERYATOM_QUERY *query = new RecursiveStructureQuery(m); |
918 | 24.4k | if (!at->hasQuery()) { |
919 | 1.32k | QueryAtom qAt(*at); |
920 | 1.32k | int oidx = at->getIdx(); |
921 | 1.32k | mol->replaceAtom(oidx, &qAt); |
922 | 1.32k | at = mol->getAtomWithIdx(oidx); |
923 | 1.32k | } |
924 | 24.4k | at->expandQuery(query, Queries::COMPOSITE_AND); |
925 | 24.4k | at->setProp(common_properties::_MolFileAtomQuery, 1); |
926 | 24.4k | } else { |
927 | 5.48k | std::ostringstream errout; |
928 | 5.48k | errout << "Cannot parse smarts: '" << sma << "' on line " << line; |
929 | 5.48k | throw FileParseException(errout.str()); |
930 | 5.48k | } |
931 | 29.9k | } |
932 | | |
933 | | void ParseAttachPointLine(RWMol *mol, const std::string &text, |
934 | 1.55k | unsigned int line, bool strictParsing) { |
935 | 1.55k | PRECONDITION(mol, "bad mol"); |
936 | 1.55k | PRECONDITION(text.substr(0, 6) == std::string("M APO"), "bad APO line"); |
937 | | |
938 | 1.55k | unsigned int nent; |
939 | 1.55k | try { |
940 | 1.55k | nent = FileParserUtils::stripSpacesAndCast<unsigned int>(text.substr(6, 3)); |
941 | 1.55k | } catch (boost::bad_lexical_cast &) { |
942 | 13 | std::ostringstream errout; |
943 | 13 | errout << "Cannot convert '" << text.substr(6, 3) << "' to int on line " |
944 | 13 | << line; |
945 | 13 | throw FileParseException(errout.str()); |
946 | 13 | } |
947 | 1.54k | unsigned int spos = 9; |
948 | 4.14k | for (unsigned int ie = 0; ie < nent; ie++) { |
949 | 2.75k | unsigned int aid = 0; |
950 | 2.75k | int val = 0; |
951 | 2.75k | try { |
952 | 2.75k | aid = FileParserUtils::stripSpacesAndCast<unsigned int>( |
953 | 2.75k | text.substr(spos, 4)); |
954 | 2.75k | spos += 4; |
955 | 2.75k | if (text.size() >= spos + 4 && text.substr(spos, 4) != " ") { |
956 | 1.66k | val = FileParserUtils::stripSpacesAndCast<int>(text.substr(spos, 4)); |
957 | 1.66k | } |
958 | 2.75k | if (!aid || aid > mol->getNumAtoms()) { |
959 | 47 | std::ostringstream errout; |
960 | 47 | errout << "Bad APO specification on line " << line; |
961 | 47 | throw FileParseException(errout.str()); |
962 | 47 | } |
963 | 2.70k | spos += 4; |
964 | 2.70k | --aid; |
965 | 2.70k | Atom *atom = mol->getAtomWithIdx(aid); |
966 | 2.70k | if (!atom) { |
967 | 0 | std::ostringstream errout; |
968 | 0 | errout << "Atom " << aid << " from APO specification on line " << line |
969 | 0 | << " not found"; |
970 | 0 | throw FileParseException(errout.str()); |
971 | 2.70k | } else { |
972 | 2.70k | if (val < 0 || val > 3) { |
973 | 13 | std::ostringstream errout; |
974 | 13 | errout << "Value " << val << " from APO specification on line " |
975 | 13 | << line << " is invalid"; |
976 | 13 | throw FileParseException(errout.str()); |
977 | 2.69k | } else if (val) { |
978 | 1.59k | if (val == 3) { |
979 | | // this is -1 in v3k mol blocks, so use that: |
980 | 252 | val = -1; |
981 | 252 | } |
982 | 1.59k | if (atom->hasProp(common_properties::molAttachPoint)) { |
983 | 1.49k | std::ostringstream errout; |
984 | 1.49k | errout << "Multiple ATTCHPT values for atom " << atom->getIdx() + 1 |
985 | 1.49k | << " on line " << line; |
986 | 1.49k | if (strictParsing) { |
987 | 2 | throw FileParseException(errout.str()); |
988 | 1.49k | } else { |
989 | 1.49k | BOOST_LOG(rdWarningLog) << errout.str() << std::endl; |
990 | 1.49k | } |
991 | 1.49k | } else { |
992 | 98 | atom->setProp(common_properties::molAttachPoint, val); |
993 | 98 | } |
994 | 1.59k | } |
995 | 2.70k | } |
996 | 2.70k | } catch (boost::bad_lexical_cast &) { |
997 | 74 | std::ostringstream errout; |
998 | 74 | errout << "Cannot convert '" << text.substr(spos, 4) |
999 | 74 | << "' to int on line " << line; |
1000 | 74 | throw FileParseException(errout.str()); |
1001 | 74 | } |
1002 | 2.75k | } |
1003 | 1.54k | } |
1004 | | |
1005 | | // the format differs between V2000 and V3000, so we have to do a bit of |
1006 | | // translation here |
1007 | 1.31k | void ParseLinkNodeLine(RWMol *mol, const std::string &text, unsigned int line) { |
1008 | 1.31k | PRECONDITION(mol, "bad mol"); |
1009 | 1.31k | PRECONDITION(text.substr(0, 6) == std::string("M LIN"), "bad LIN line"); |
1010 | | |
1011 | 1.31k | unsigned int nent; |
1012 | 1.31k | try { |
1013 | 1.31k | nent = FileParserUtils::stripSpacesAndCast<unsigned int>(text.substr(6, 3)); |
1014 | 1.31k | } catch (boost::bad_lexical_cast &) { |
1015 | 9 | std::ostringstream errout; |
1016 | 9 | errout << "Cannot convert '" << text.substr(6, 3) << "' to int on line " |
1017 | 9 | << line; |
1018 | 9 | throw FileParseException(errout.str()); |
1019 | 9 | } |
1020 | 1.30k | std::string propVal = ""; |
1021 | 1.30k | unsigned int spos = 9; |
1022 | 3.86k | for (unsigned int ie = 0; ie < nent; ie++) { |
1023 | 2.72k | try { |
1024 | 2.72k | auto aid = FileParserUtils::stripSpacesAndCast<unsigned int>( |
1025 | 2.72k | text.substr(spos, 4)); |
1026 | 2.72k | if (!aid || aid > mol->getNumAtoms()) { |
1027 | 37 | std::ostringstream errout; |
1028 | 37 | errout << "LIN specification has bad atom idx on line " << line; |
1029 | 37 | throw FileParseException(errout.str()); |
1030 | 37 | } |
1031 | 2.68k | spos += 4; |
1032 | | |
1033 | 2.68k | if (text.size() < spos + 4 || text.substr(spos, 4) == " ") { |
1034 | 6 | std::ostringstream errout; |
1035 | 6 | errout << "LIN specification missing repeat count on line " << line; |
1036 | 6 | throw FileParseException(errout.str()); |
1037 | 6 | } |
1038 | 2.68k | auto repeatCount = FileParserUtils::stripSpacesAndCast<unsigned int>( |
1039 | 2.68k | text.substr(spos, 4)); |
1040 | 2.68k | spos += 4; |
1041 | 2.68k | if (repeatCount < 2) { |
1042 | 2 | std::ostringstream errout; |
1043 | 2 | errout << "LIN specification: repeat count must be >=2 on line " |
1044 | 2 | << line; |
1045 | 2 | throw FileParseException(errout.str()); |
1046 | 2 | } |
1047 | 2.68k | unsigned int substB = 0; |
1048 | 2.68k | unsigned int substC = 0; |
1049 | 2.68k | if (text.size() >= spos + 4 && text.substr(spos, 4) != " ") { |
1050 | 2.61k | substB = FileParserUtils::stripSpacesAndCast<unsigned int>( |
1051 | 2.61k | text.substr(spos, 4)); |
1052 | 2.61k | } |
1053 | 2.68k | spos += 4; |
1054 | 2.68k | if (text.size() >= spos + 4 && text.substr(spos, 4) != " ") { |
1055 | 2.62k | substC = FileParserUtils::stripSpacesAndCast<unsigned int>( |
1056 | 2.62k | text.substr(spos, 4)); |
1057 | 2.62k | } |
1058 | 2.68k | spos += 4; |
1059 | | |
1060 | 2.68k | if (!substB || substB > mol->getNumAtoms() || |
1061 | 2.57k | substC > mol->getNumAtoms()) { |
1062 | 37 | std::ostringstream errout; |
1063 | 37 | errout << "LIN specification has bad substituent idx on line " << line; |
1064 | 37 | throw FileParseException(errout.str()); |
1065 | 37 | } |
1066 | | |
1067 | 2.64k | boost::format formatter; |
1068 | 2.64k | if (substC) { |
1069 | 2.55k | formatter = boost::format("1 %1% 2 %2% %3% %2% %4%") % repeatCount % |
1070 | 2.55k | aid % substB % substC; |
1071 | 2.55k | } else { |
1072 | 89 | formatter = boost::format("1 %1% 1 %2% %3%") % repeatCount % aid % |
1073 | 89 | substB % substC; |
1074 | 89 | } |
1075 | 2.64k | if (!propVal.empty()) { |
1076 | 1.64k | propVal += "|"; |
1077 | 1.64k | } |
1078 | 2.64k | propVal += formatter.str(); |
1079 | 2.64k | } catch (boost::bad_lexical_cast &) { |
1080 | 76 | std::ostringstream errout; |
1081 | 76 | errout << "Cannot convert '" << text.substr(spos, 4) |
1082 | 76 | << "' to int on line " << line; |
1083 | 76 | throw FileParseException(errout.str()); |
1084 | 76 | } |
1085 | 2.55k | mol->setProp(common_properties::molFileLinkNodes, propVal); |
1086 | 2.55k | } |
1087 | 1.30k | } |
1088 | | |
1089 | | // Recursively populates queryVect with COMPOSITE_AND queries |
1090 | | // present in the input query. If the logic of the input query |
1091 | | // is more complex, it returns nullptr and empty set. |
1092 | | // The returned ptr should only be checked for not being null |
1093 | | // and not used for any other purposes, as the actual result is |
1094 | | // the queryVect |
1095 | | const QueryAtom::QUERYATOM_QUERY *getAndQueries( |
1096 | | const QueryAtom::QUERYATOM_QUERY *q, |
1097 | 200k | std::vector<const QueryAtom::QUERYATOM_QUERY *> &queryVect) { |
1098 | 200k | if (q) { |
1099 | 200k | auto qOrig = q; |
1100 | 397k | for (auto cq = qOrig->beginChildren(); cq != qOrig->endChildren(); ++cq) { |
1101 | 197k | if (q == qOrig && q->getDescription() != "AtomAnd") { |
1102 | 617 | q = nullptr; |
1103 | 617 | break; |
1104 | 617 | } |
1105 | 197k | q = getAndQueries(cq->get(), queryVect); |
1106 | 197k | } |
1107 | 200k | if (q == qOrig) { |
1108 | 101k | queryVect.push_back(q); |
1109 | 101k | } |
1110 | 200k | } |
1111 | 200k | if (!q) { |
1112 | 617 | queryVect.clear(); |
1113 | 617 | } |
1114 | 200k | return q; |
1115 | 200k | } |
1116 | | |
1117 | 3.72k | void ParseNewAtomList(RWMol *mol, const std::string &text, unsigned int line) { |
1118 | 3.72k | if (text.size() < 15) { |
1119 | 13 | std::ostringstream errout; |
1120 | 13 | errout << "Atom list line too short: '" << text << "'"; |
1121 | 13 | throw FileParseException(errout.str()); |
1122 | 13 | } |
1123 | 3.71k | PRECONDITION(mol, "bad mol"); |
1124 | 3.71k | PRECONDITION(text.substr(0, 6) == std::string("M ALS"), |
1125 | 3.71k | "bad atom list line"); |
1126 | | |
1127 | 3.71k | unsigned int idx; |
1128 | 3.71k | try { |
1129 | 3.71k | idx = FileParserUtils::stripSpacesAndCast<unsigned int>(text.substr(7, 3)) - |
1130 | 3.71k | 1; |
1131 | 3.71k | } catch (boost::bad_lexical_cast &) { |
1132 | 16 | std::ostringstream errout; |
1133 | 16 | errout << "Cannot convert '" << text.substr(7, 3) << "' to int on line " |
1134 | 16 | << line; |
1135 | 16 | throw FileParseException(errout.str()); |
1136 | 16 | } |
1137 | 3.69k | URANGE_CHECK(idx, mol->getNumAtoms()); |
1138 | | |
1139 | 3.66k | int nQueries; |
1140 | 3.66k | try { |
1141 | 3.66k | nQueries = FileParserUtils::toInt(text.substr(10, 3)); |
1142 | 3.66k | } catch (boost::bad_lexical_cast &) { |
1143 | 4 | std::ostringstream errout; |
1144 | 4 | errout << "Cannot convert '" << text.substr(10, 3) << "' to int on line " |
1145 | 4 | << line; |
1146 | 4 | throw FileParseException(errout.str()); |
1147 | 4 | } |
1148 | | |
1149 | 3.65k | if (!nQueries) { |
1150 | 252 | BOOST_LOG(rdWarningLog) << "Empty atom list: '" << text << "' on line " |
1151 | 0 | << line << "." << std::endl; |
1152 | 252 | return; |
1153 | 252 | } |
1154 | | |
1155 | 3.40k | if (nQueries < 0) { |
1156 | 5 | std::ostringstream errout; |
1157 | 5 | errout << "negative length atom list: '" << text << "' on line " << line |
1158 | 5 | << "." << std::endl; |
1159 | 5 | throw FileParseException(errout.str()); |
1160 | 5 | } |
1161 | 3.40k | QueryAtom *a = nullptr; |
1162 | 3.40k | QueryAtom *qaOrig = nullptr; |
1163 | 3.40k | QueryAtom::QUERYATOM_QUERY *qOrig = nullptr; |
1164 | 3.40k | Atom *aOrig = mol->getAtomWithIdx(idx); |
1165 | 7.48k | for (unsigned int i = 0; i < static_cast<unsigned int>(nQueries); i++) { |
1166 | 4.09k | unsigned int pos = 16 + i * 4; |
1167 | 4.09k | if (text.size() < pos + 4) { |
1168 | 8 | std::ostringstream errout; |
1169 | 8 | errout << "Atom list line too short: '" << text << "' on line " << line; |
1170 | 8 | throw FileParseException(errout.str()); |
1171 | 8 | } |
1172 | | |
1173 | 4.08k | std::string atSymb = text.substr(pos, 4); |
1174 | 4.08k | atSymb.erase(atSymb.find(' '), atSymb.size()); |
1175 | 4.08k | int atNum = PeriodicTable::getTable()->getAtomicNumber(atSymb); |
1176 | 4.08k | if (!i) { |
1177 | 3.38k | if (aOrig->hasQuery()) { |
1178 | 3.24k | qaOrig = dynamic_cast<QueryAtom *>(aOrig); |
1179 | 3.24k | if (qaOrig) { |
1180 | 3.24k | qOrig = qaOrig->getQuery(); |
1181 | 3.24k | } |
1182 | 3.24k | } |
1183 | 3.38k | a = new QueryAtom(*aOrig); |
1184 | 3.38k | a->setAtomicNum(atNum); |
1185 | 3.38k | if (!qOrig) { |
1186 | 131 | qOrig = a->getQuery()->copy(); |
1187 | 131 | } |
1188 | 3.38k | a->setQuery(makeAtomNumQuery(atNum)); |
1189 | 3.38k | } else { |
1190 | 703 | a->expandQuery(makeAtomNumQuery(atNum), Queries::COMPOSITE_OR, true); |
1191 | | // For COMPOSITE_OR query atoms, reset atomic num to 0 such that they are |
1192 | | // exported as "*" in SMILES |
1193 | 703 | a->setAtomicNum(0); |
1194 | 703 | } |
1195 | 4.08k | } |
1196 | 3.39k | ASSERT_INVARIANT(a, "no atom built"); |
1197 | 3.39k | if (qOrig) { |
1198 | 3.35k | std::vector<const QueryAtom::QUERYATOM_QUERY *> queryVect; |
1199 | 3.35k | if (getAndQueries(qOrig, queryVect)) { |
1200 | 101k | for (const auto &q : queryVect) { |
1201 | 101k | if (q->getDescription() != "AtomAtomicNum") { |
1202 | 98.6k | a->expandQuery(q->copy(), Queries::COMPOSITE_AND, true); |
1203 | 98.6k | } |
1204 | 101k | } |
1205 | 3.01k | } |
1206 | 3.35k | if (!qaOrig) { |
1207 | 117 | delete qOrig; |
1208 | 117 | } |
1209 | 3.35k | } |
1210 | 3.39k | a->setProp(common_properties::_MolFileAtomQuery, 1); |
1211 | 3.39k | switch (text[14]) { |
1212 | 736 | case 'T': |
1213 | 736 | a->getQuery()->setNegation(true); |
1214 | 736 | break; |
1215 | 2.60k | case 'F': |
1216 | 2.60k | a->getQuery()->setNegation(false); |
1217 | 2.60k | break; |
1218 | 16 | default: |
1219 | 16 | std::ostringstream errout; |
1220 | 16 | errout << "Unrecognized atom-list query modifier: '" << text[14] |
1221 | 16 | << "' on line " << line; |
1222 | 16 | delete a; |
1223 | 16 | throw FileParseException(errout.str()); |
1224 | 3.39k | } |
1225 | | |
1226 | 3.33k | mol->replaceAtom(idx, a); |
1227 | 3.33k | delete a; |
1228 | 3.33k | } |
1229 | | |
1230 | | void ParseV3000RGroups(RWMol *mol, Atom *&atom, std::string_view text, |
1231 | 22 | unsigned int line) { |
1232 | 22 | PRECONDITION(mol, "bad mol"); |
1233 | 22 | PRECONDITION(atom, "bad atom"); |
1234 | 22 | if (text[0] != '(' || text.back() != ')') { |
1235 | 2 | std::ostringstream errout; |
1236 | 2 | errout << "Bad RGROUPS specification '" << text << "' on line " << line |
1237 | 2 | << ". Missing parens."; |
1238 | 2 | throw FileParseException(errout.str()); |
1239 | 2 | } |
1240 | 20 | std::vector<std::string> splitToken; |
1241 | 20 | std::string resid = std::string(text.substr(1, text.size() - 2)); |
1242 | 20 | boost::split(splitToken, resid, boost::is_any_of(std::string(" "))); |
1243 | 20 | if (splitToken.size() < 1) { |
1244 | 0 | std::ostringstream errout; |
1245 | 0 | errout << "Bad RGROUPS specification '" << text << "' on line " << line |
1246 | 0 | << ". Missing values."; |
1247 | 0 | throw FileParseException(errout.str()); |
1248 | 0 | } |
1249 | 20 | unsigned int nRs; |
1250 | 20 | try { |
1251 | 20 | nRs = FileParserUtils::stripSpacesAndCast<unsigned int>(splitToken[0]); |
1252 | 20 | } catch (boost::bad_lexical_cast &) { |
1253 | 3 | std::ostringstream errout; |
1254 | 3 | errout << "Cannot convert '" << splitToken[0] << "' to int on line" << line; |
1255 | 3 | throw FileParseException(errout.str()); |
1256 | 3 | } |
1257 | 17 | if (splitToken.size() < nRs + 1) { |
1258 | 5 | std::ostringstream errout; |
1259 | 5 | errout << "Bad RGROUPS specification '" << text << "' on line " << line |
1260 | 5 | << ". Not enough values."; |
1261 | 5 | throw FileParseException(errout.str()); |
1262 | 5 | } |
1263 | 36 | for (unsigned int i = 0; i < nRs; ++i) { |
1264 | 34 | unsigned int rLabel; |
1265 | 34 | try { |
1266 | 34 | rLabel = |
1267 | 34 | FileParserUtils::stripSpacesAndCast<unsigned int>(splitToken[i + 1]); |
1268 | 34 | } catch (boost::bad_lexical_cast &) { |
1269 | 10 | std::ostringstream errout; |
1270 | 10 | errout << "Cannot convert '" << splitToken[i + 1] << "' to int on line" |
1271 | 10 | << line; |
1272 | 10 | throw FileParseException(errout.str()); |
1273 | 10 | } |
1274 | 24 | atom = QueryOps::replaceAtomWithQueryAtom(mol, atom); |
1275 | 24 | atom->setProp(common_properties::_MolFileRLabel, rLabel); |
1276 | 24 | std::string dLabel = "R" + std::to_string(rLabel); |
1277 | 24 | atom->setProp(common_properties::dummyLabel, dLabel); |
1278 | 24 | atom->setIsotope(rLabel); |
1279 | 24 | atom->setQuery(makeAtomNullQuery()); |
1280 | 24 | } |
1281 | 12 | } |
1282 | | |
1283 | 1.91k | void ParseRGroupLabels(RWMol *mol, const std::string &text, unsigned int line) { |
1284 | 1.91k | PRECONDITION(mol, "bad mol"); |
1285 | 1.91k | PRECONDITION(text.substr(0, 6) == std::string("M RGP"), |
1286 | 1.91k | "bad R group label line"); |
1287 | | |
1288 | 1.91k | int nLabels; |
1289 | 1.91k | try { |
1290 | 1.91k | nLabels = FileParserUtils::toInt(text.substr(6, 3)); |
1291 | 1.91k | } catch (boost::bad_lexical_cast &) { |
1292 | 25 | std::ostringstream errout; |
1293 | 25 | errout << "Cannot convert '" << text.substr(6, 3) << "' to int on line " |
1294 | 25 | << line; |
1295 | 25 | throw FileParseException(errout.str()); |
1296 | 25 | } |
1297 | | |
1298 | 4.07k | for (int i = 0; i < nLabels; i++) { |
1299 | 2.23k | int pos = 10 + i * 8; |
1300 | 2.23k | unsigned int atIdx; |
1301 | 2.23k | try { |
1302 | 2.23k | atIdx = FileParserUtils::stripSpacesAndCast<unsigned int>( |
1303 | 2.23k | text.substr(pos, 3)); |
1304 | 2.23k | } catch (boost::bad_lexical_cast &) { |
1305 | 7 | std::ostringstream errout; |
1306 | 7 | errout << "Cannot convert '" << text.substr(pos, 3) << "' to int on line " |
1307 | 7 | << line; |
1308 | 7 | throw FileParseException(errout.str()); |
1309 | 7 | } |
1310 | 2.22k | unsigned int rLabel; |
1311 | 2.22k | try { |
1312 | 2.22k | rLabel = FileParserUtils::stripSpacesAndCast<unsigned int>( |
1313 | 2.22k | text.substr(pos + 4, 3)); |
1314 | 2.22k | } catch (boost::bad_lexical_cast &) { |
1315 | 15 | std::ostringstream errout; |
1316 | 15 | errout << "Cannot convert '" << text.substr(pos + 4, 3) |
1317 | 15 | << "' to int on line " << line; |
1318 | 15 | throw FileParseException(errout.str()); |
1319 | 15 | } |
1320 | 2.20k | atIdx -= 1; |
1321 | 2.20k | if (atIdx > mol->getNumAtoms()) { |
1322 | 17 | std::ostringstream errout; |
1323 | 17 | errout << "Attempt to set R group label on nonexistent atom " << atIdx |
1324 | 17 | << " on line " << line; |
1325 | 17 | throw FileParseException(errout.str()); |
1326 | 17 | } |
1327 | 2.18k | QueryAtom qatom(*(mol->getAtomWithIdx(atIdx))); |
1328 | 2.18k | qatom.setProp(common_properties::_MolFileRLabel, rLabel); |
1329 | | |
1330 | | // set the dummy label so that this is shown correctly |
1331 | | // in other pieces of the code : |
1332 | | // (this was sf.net issue 3316600) |
1333 | 2.18k | std::string dLabel = "R" + std::to_string(rLabel); |
1334 | 2.18k | qatom.setProp(common_properties::dummyLabel, dLabel); |
1335 | | |
1336 | | // the CTFile spec (June 2005 version) technically only allows |
1337 | | // R labels up to 32. Since there are three digits, we'll accept |
1338 | | // anything: so long as it's positive and less than 1000: |
1339 | 2.18k | if (rLabel > 0 && rLabel < 999) { |
1340 | 1.07k | qatom.setIsotope(rLabel); |
1341 | 1.07k | } |
1342 | 2.18k | qatom.setQuery(makeAtomNullQuery()); |
1343 | 2.18k | mol->replaceAtom(atIdx, &qatom); |
1344 | 2.18k | } |
1345 | 1.89k | } |
1346 | | |
1347 | | void ParseAtomAlias(RWMol *mol, std::string text, const std::string &nextLine, |
1348 | 2.06k | unsigned int line) { |
1349 | 2.06k | PRECONDITION(mol, "bad mol"); |
1350 | 2.06k | PRECONDITION(text.substr(0, 2) == std::string("A "), "bad atom alias line"); |
1351 | | |
1352 | 2.04k | unsigned int idx; |
1353 | 2.04k | try { |
1354 | 2.04k | idx = FileParserUtils::stripSpacesAndCast<unsigned int>(text.substr(3, 3)) - |
1355 | 2.04k | 1; |
1356 | 2.04k | } catch (boost::bad_lexical_cast &) { |
1357 | 13 | std::ostringstream errout; |
1358 | 13 | errout << "Cannot convert '" << text.substr(3, 3) << "' to int on line " |
1359 | 13 | << line; |
1360 | 13 | throw FileParseException(errout.str()); |
1361 | 13 | } |
1362 | 2.02k | URANGE_CHECK(idx, mol->getNumAtoms()); |
1363 | 2.00k | Atom *at = mol->getAtomWithIdx(idx); |
1364 | 2.00k | at->setProp(common_properties::molFileAlias, nextLine); |
1365 | 2.00k | } |
1366 | | |
1367 | 3.15k | void ParseAtomValue(RWMol *mol, std::string text, unsigned int line) { |
1368 | 3.15k | PRECONDITION(mol, "bad mol"); |
1369 | 3.15k | PRECONDITION(text.substr(0, 2) == std::string("V "), "bad atom value line"); |
1370 | | |
1371 | 3.13k | unsigned int idx; |
1372 | 3.13k | try { |
1373 | 3.13k | idx = FileParserUtils::stripSpacesAndCast<unsigned int>(text.substr(3, 3)) - |
1374 | 3.13k | 1; |
1375 | 3.13k | } catch (boost::bad_lexical_cast &) { |
1376 | 17 | std::ostringstream errout; |
1377 | 17 | errout << "Cannot convert '" << text.substr(3, 3) << "' to int on line" |
1378 | 17 | << line; |
1379 | 17 | throw FileParseException(errout.str()); |
1380 | 17 | } |
1381 | 3.11k | URANGE_CHECK(idx, mol->getNumAtoms()); |
1382 | 3.09k | Atom *at = mol->getAtomWithIdx(idx); |
1383 | 3.09k | at->setProp(common_properties::molFileValue, |
1384 | 3.09k | text.substr(7, text.length() - 7)); |
1385 | 3.09k | } |
1386 | | |
1387 | | namespace { |
1388 | 9.70k | void setRGPProps(const std::string_view symb, Atom *res) { |
1389 | 9.70k | PRECONDITION(res, "bad atom pointer"); |
1390 | | // set the dummy label so that this is shown correctly |
1391 | | // in other pieces of the code : |
1392 | 9.70k | std::string symbc(symb); |
1393 | 9.70k | res->setProp(common_properties::dummyLabel, symbc); |
1394 | 9.70k | } |
1395 | | |
1396 | | void lookupAtomicNumber(Atom *res, const std::string &symb, |
1397 | 133k | bool strictParsing) { |
1398 | 133k | std::string tCopy(symb); |
1399 | 133k | if (symb.size() == 2 && symb[1] >= 'A' && symb[1] <= 'Z') { |
1400 | 4.99k | tCopy[1] = static_cast<char>(tolower(symb[1])); |
1401 | 4.99k | } |
1402 | 133k | try { |
1403 | 133k | res->setAtomicNum(PeriodicTable::getTable()->getAtomicNumber(tCopy)); |
1404 | 133k | } catch (const Invar::Invariant &e) { |
1405 | 54.9k | if (strictParsing || symb.empty()) { |
1406 | 83 | throw FileParseException(e.what()); |
1407 | 54.8k | } else { |
1408 | 54.8k | res->setAtomicNum(0); |
1409 | 54.8k | res->setProp(common_properties::dummyLabel, symb); |
1410 | 54.8k | } |
1411 | 54.9k | } |
1412 | 133k | } |
1413 | | |
1414 | | } // namespace |
1415 | | |
1416 | | Atom *ParseMolFileAtomLine(const std::string_view text, RDGeom::Point3D &pos, |
1417 | 133k | unsigned int line, bool strictParsing) { |
1418 | 133k | std::string symb; |
1419 | 133k | int massDiff, chg, hCount; |
1420 | | |
1421 | 133k | if ((strictParsing && text.size() < 34) || text.size() < 32) { |
1422 | 80 | std::ostringstream errout; |
1423 | 80 | errout << "Atom line too short: '" << text << "' on line " << line; |
1424 | 80 | throw FileParseException(errout.str()); |
1425 | 80 | } |
1426 | | |
1427 | 133k | try { |
1428 | 133k | pos.x = FileParserUtils::toDouble(text.substr(0, 10)); |
1429 | 133k | pos.y = FileParserUtils::toDouble(text.substr(10, 10)); |
1430 | 133k | pos.z = FileParserUtils::toDouble(text.substr(20, 10)); |
1431 | 133k | } catch (boost::bad_lexical_cast &) { |
1432 | 373 | std::ostringstream errout; |
1433 | 373 | errout << "Cannot process coordinates on line " << line; |
1434 | 373 | throw FileParseException(errout.str()); |
1435 | 373 | } |
1436 | 133k | symb = text.substr(31, 3); |
1437 | 133k | boost::trim(symb); |
1438 | | |
1439 | | // REVIEW: should we handle missing fields at the end of the line? |
1440 | 133k | massDiff = 0; |
1441 | 133k | if (text.size() >= 36 && text.substr(34, 2) != " 0") { |
1442 | 46.9k | try { |
1443 | 46.9k | massDiff = FileParserUtils::toInt(text.substr(34, 2), true); |
1444 | 46.9k | } catch (boost::bad_lexical_cast &) { |
1445 | 46 | std::ostringstream errout; |
1446 | 46 | errout << "Cannot convert '" << text.substr(34, 2) << "' to int on line " |
1447 | 46 | << line; |
1448 | 46 | throw FileParseException(errout.str()); |
1449 | 46 | } |
1450 | 46.9k | } |
1451 | 133k | chg = 0; |
1452 | 133k | if (text.size() >= 39 && text.substr(36, 3) != " 0") { |
1453 | 44.0k | try { |
1454 | 44.0k | chg = FileParserUtils::toInt(text.substr(36, 3), true); |
1455 | 44.0k | } catch (boost::bad_lexical_cast &) { |
1456 | 32 | std::ostringstream errout; |
1457 | 32 | errout << "Cannot convert '" << text.substr(36, 3) << "' to int on line " |
1458 | 32 | << line; |
1459 | 32 | throw FileParseException(errout.str()); |
1460 | 32 | } |
1461 | 44.0k | } |
1462 | 133k | hCount = 0; |
1463 | 133k | if (text.size() >= 45 && text.substr(42, 3) != " 0") { |
1464 | 36.6k | try { |
1465 | 36.6k | hCount = FileParserUtils::toInt(text.substr(42, 3), true); |
1466 | 36.6k | } catch (boost::bad_lexical_cast &) { |
1467 | 33 | std::ostringstream errout; |
1468 | 33 | errout << "Cannot convert '" << text.substr(42, 3) << "' to int on line " |
1469 | 33 | << line; |
1470 | 33 | throw FileParseException(errout.str()); |
1471 | 33 | } |
1472 | 36.6k | } |
1473 | 133k | std::unique_ptr<Atom> res(new Atom); |
1474 | 133k | bool isComplexQueryName = |
1475 | 133k | std::find(complexQueries.begin(), complexQueries.end(), symb) != |
1476 | 133k | complexQueries.end(); |
1477 | 133k | if (isComplexQueryName || symb == "L" || symb == "*" || symb == "LP" || |
1478 | 128k | symb == "R" || symb == "R#" || |
1479 | 126k | (symb[0] == 'R' && symb >= "R0" && symb <= "R99")) { |
1480 | 7.51k | if (isComplexQueryName || symb == "*" || symb == "R") { |
1481 | 4.23k | auto *query = new QueryAtom(0); |
1482 | 4.23k | if (symb == "*" || symb == "R") { |
1483 | | // according to the MDL spec, these match anything |
1484 | 1.46k | query->setQuery(makeAtomNullQuery()); |
1485 | 2.77k | } else if (isComplexQueryName) { |
1486 | 2.77k | convertComplexNameToQuery(query, symb); |
1487 | 2.77k | } |
1488 | 4.23k | res.reset(query); |
1489 | | // queries have no implicit Hs: |
1490 | 4.23k | res->setNoImplicit(true); |
1491 | 4.23k | } else { |
1492 | 3.27k | res->setAtomicNum(0); |
1493 | 3.27k | } |
1494 | 7.51k | if (massDiff == 0 && symb[0] == 'R') { |
1495 | 3.26k | if (symb.length() > 1 && symb >= "R0" && symb <= "R99") { |
1496 | 606 | std::string rlabel = ""; |
1497 | 606 | rlabel = symb.substr(1, symb.length() - 1); |
1498 | 606 | int rnumber; |
1499 | 606 | try { |
1500 | 606 | rnumber = boost::lexical_cast<int>(rlabel); |
1501 | 606 | } catch (boost::bad_lexical_cast &) { |
1502 | 61 | rnumber = -1; |
1503 | 61 | } |
1504 | 606 | if (rnumber >= 0) { |
1505 | 545 | res->setIsotope(rnumber); |
1506 | 545 | } |
1507 | 606 | } |
1508 | 3.26k | } |
1509 | 7.51k | if (symb[0] == 'R') { |
1510 | | // we used to skip R# here because that really should be handled by an |
1511 | | // RGP spec, but that turned out to not be permissive enough... <sigh> |
1512 | 3.32k | setRGPProps(symb, res.get()); |
1513 | 3.32k | } |
1514 | 125k | } else if (symb == "D") { // mol blocks support "D" and "T" as shorthand... |
1515 | | // handle that. |
1516 | 246 | res->setAtomicNum(1); |
1517 | 246 | res->setIsotope(2); |
1518 | 125k | } else if (symb == "T") { // mol blocks support "D" and "T" as shorthand... |
1519 | | // handle that. |
1520 | 95 | res->setAtomicNum(1); |
1521 | 95 | res->setIsotope(3); |
1522 | 125k | } else if (symb == "Pol" || symb == "Mod") { |
1523 | 80 | res->setAtomicNum(0); |
1524 | 80 | res->setProp(common_properties::dummyLabel, symb); |
1525 | 125k | } else if (GenericGroups::genericMatchers.find(symb) != |
1526 | 125k | GenericGroups::genericMatchers.end()) { |
1527 | 344 | res.reset(new QueryAtom(0)); |
1528 | 344 | res->setProp(common_properties::atomLabel, std::string(symb)); |
1529 | 124k | } else { |
1530 | 124k | lookupAtomicNumber(res.get(), symb, strictParsing); |
1531 | 124k | } |
1532 | | |
1533 | | // res->setPos(pX,pY,pZ); |
1534 | 133k | if (chg != 0) { |
1535 | 15.2k | res->setFormalCharge(4 - chg); |
1536 | 15.2k | } |
1537 | | |
1538 | 133k | if (hCount >= 1) { |
1539 | 10.0k | if (!res->hasQuery()) { |
1540 | 9.54k | auto qatom = new QueryAtom(*res); |
1541 | 9.54k | res.reset(qatom); |
1542 | 9.54k | } |
1543 | 10.0k | res->setNoImplicit(true); |
1544 | 10.0k | if (hCount > 1) { |
1545 | 8.13k | ATOM_EQUALS_QUERY *oq = makeAtomImplicitHCountQuery(hCount - 1); |
1546 | 8.13k | auto nq = makeAtomSimpleQuery<ATOM_LESSEQUAL_QUERY>( |
1547 | 8.13k | hCount - 1, oq->getDataFunc(), |
1548 | 8.13k | std::string("less_") + oq->getDescription()); |
1549 | 8.13k | res->expandQuery(nq); |
1550 | 8.13k | delete oq; |
1551 | 8.13k | } else { |
1552 | 1.87k | res->expandQuery(makeAtomImplicitHCountQuery(0)); |
1553 | 1.87k | } |
1554 | 10.0k | } |
1555 | | |
1556 | 133k | if (massDiff != 0) { |
1557 | 11.3k | int defIso = |
1558 | 11.3k | PeriodicTable::getTable()->getMostCommonIsotope(res->getAtomicNum()); |
1559 | 11.3k | int dIso = defIso + massDiff; |
1560 | 11.3k | if (dIso < 0) { |
1561 | 469 | BOOST_LOG(rdWarningLog) |
1562 | 0 | << " atom " << res->getIdx() |
1563 | 0 | << " has a negative isotope offset. line: " << line << std::endl; |
1564 | 469 | } |
1565 | 11.3k | res->setIsotope(dIso); |
1566 | 11.3k | } |
1567 | | |
1568 | 133k | if (text.size() >= 42 && text.substr(39, 3) != " 0") { |
1569 | 37.3k | int parity = 0; |
1570 | 37.3k | try { |
1571 | 37.3k | parity = FileParserUtils::toInt(text.substr(39, 3), true); |
1572 | 37.3k | } catch (boost::bad_lexical_cast &) { |
1573 | 25 | std::ostringstream errout; |
1574 | 25 | errout << "Cannot convert '" << text.substr(39, 3) << "' to int on line " |
1575 | 25 | << line; |
1576 | 25 | throw FileParseException(errout.str()); |
1577 | 25 | } |
1578 | 37.3k | res->setProp(common_properties::molParity, parity); |
1579 | 37.3k | } |
1580 | | |
1581 | 133k | if (text.size() >= 48 && text.substr(45, 3) != " 0") { |
1582 | 28.5k | int stereoCare = 0; |
1583 | 28.5k | try { |
1584 | 28.5k | stereoCare = FileParserUtils::toInt(text.substr(45, 3), true); |
1585 | 28.5k | } catch (boost::bad_lexical_cast &) { |
1586 | 26 | std::ostringstream errout; |
1587 | 26 | errout << "Cannot convert '" << text.substr(45, 3) << "' to int on line " |
1588 | 26 | << line; |
1589 | 26 | throw FileParseException(errout.str()); |
1590 | 26 | } |
1591 | 28.5k | res->setProp(common_properties::molStereoCare, stereoCare); |
1592 | 28.5k | } |
1593 | 132k | if (text.size() >= 51 && text.substr(48, 3) != " 0") { |
1594 | 24.5k | int totValence = 0; |
1595 | 24.5k | try { |
1596 | 24.5k | totValence = FileParserUtils::toInt(text.substr(48, 3), true); |
1597 | 24.5k | } catch (boost::bad_lexical_cast &) { |
1598 | 23 | std::ostringstream errout; |
1599 | 23 | errout << "Cannot convert '" << text.substr(48, 3) << "' to int on line " |
1600 | 23 | << line; |
1601 | 23 | throw FileParseException(errout.str()); |
1602 | 23 | } |
1603 | 24.5k | if (totValence != 0) { |
1604 | | // only set if it's a non-default value |
1605 | 6.28k | res->setProp(common_properties::molTotValence, totValence); |
1606 | 6.28k | } |
1607 | 24.5k | } |
1608 | 132k | if (text.size() >= 57 && text.substr(54, 3) != " 0") { |
1609 | 20.3k | int rxnRole = 0; |
1610 | 20.3k | try { |
1611 | 20.3k | rxnRole = FileParserUtils::toInt(text.substr(54, 3), true); |
1612 | 20.3k | } catch (boost::bad_lexical_cast &) { |
1613 | 27 | std::ostringstream errout; |
1614 | 27 | errout << "Cannot convert '" << text.substr(54, 3) << "' to int on line " |
1615 | 27 | << line; |
1616 | 27 | throw FileParseException(errout.str()); |
1617 | 27 | } |
1618 | 20.3k | if (rxnRole != 0) { |
1619 | | // only set if it's a non-default value |
1620 | 3.82k | res->setProp(common_properties::molRxnRole, rxnRole); |
1621 | 3.82k | } |
1622 | 20.3k | } |
1623 | 132k | if (text.size() >= 60 && text.substr(57, 3) != " 0") { |
1624 | 21.6k | int rxnComponent = 0; |
1625 | 21.6k | try { |
1626 | 21.6k | rxnComponent = FileParserUtils::toInt(text.substr(57, 3), true); |
1627 | 21.6k | } catch (boost::bad_lexical_cast &) { |
1628 | 23 | std::ostringstream errout; |
1629 | 23 | errout << "Cannot convert '" << text.substr(57, 3) << "' to int on line " |
1630 | 23 | << line; |
1631 | 23 | throw FileParseException(errout.str()); |
1632 | 23 | } |
1633 | 21.5k | if (rxnComponent != 0) { |
1634 | | // only set if it's a non-default value |
1635 | 4.82k | res->setProp(common_properties::molRxnComponent, rxnComponent); |
1636 | 4.82k | } |
1637 | 21.5k | } |
1638 | 132k | if (text.size() >= 63 && text.substr(60, 3) != " 0") { |
1639 | 19.7k | int atomMapNumber = 0; |
1640 | 19.7k | try { |
1641 | 19.7k | atomMapNumber = FileParserUtils::toInt(text.substr(60, 3), true); |
1642 | 19.7k | } catch (boost::bad_lexical_cast &) { |
1643 | 23 | std::ostringstream errout; |
1644 | 23 | errout << "Cannot convert '" << text.substr(60, 3) << "' to int on line " |
1645 | 23 | << line; |
1646 | 23 | throw FileParseException(errout.str()); |
1647 | 23 | } |
1648 | 19.7k | res->setProp(common_properties::molAtomMapNumber, atomMapNumber); |
1649 | 19.7k | } |
1650 | 132k | if (text.size() >= 66 && text.substr(63, 3) != " 0") { |
1651 | 17.3k | int inversionFlag = 0; |
1652 | 17.3k | try { |
1653 | 17.3k | inversionFlag = FileParserUtils::toInt(text.substr(63, 3), true); |
1654 | 17.3k | } catch (boost::bad_lexical_cast &) { |
1655 | 37 | std::ostringstream errout; |
1656 | 37 | errout << "Cannot convert '" << text.substr(63, 3) << "' to int on line " |
1657 | 37 | << line; |
1658 | 37 | throw FileParseException(errout.str()); |
1659 | 37 | } |
1660 | 17.3k | res->setProp(common_properties::molInversionFlag, inversionFlag); |
1661 | 17.3k | } |
1662 | 132k | if (text.size() >= 69 && text.substr(66, 3) != " 0") { |
1663 | 14.4k | int exactChangeFlag = 0; |
1664 | 14.4k | try { |
1665 | 14.4k | exactChangeFlag = FileParserUtils::toInt(text.substr(66, 3), true); |
1666 | 14.4k | } catch (boost::bad_lexical_cast &) { |
1667 | 21 | std::ostringstream errout; |
1668 | 21 | errout << "Cannot convert '" << text.substr(66, 3) << "' to int on line " |
1669 | 21 | << line; |
1670 | 21 | throw FileParseException(errout.str()); |
1671 | 21 | } |
1672 | 14.4k | res->setProp(common_properties::molRxnExactChange, exactChangeFlag); |
1673 | 14.4k | } |
1674 | 132k | return res.release(); |
1675 | 132k | } |
1676 | | |
1677 | 122k | Bond *ParseMolFileBondLine(const std::string_view text, unsigned int line) { |
1678 | 122k | unsigned int idx1, idx2, bType, stereo; |
1679 | 122k | int spos = 0; |
1680 | | |
1681 | 122k | if (text.size() < 9) { |
1682 | 17 | std::ostringstream errout; |
1683 | 17 | errout << "Bond line too short: '" << text << "' on line " << line; |
1684 | 17 | throw FileParseException(errout.str()); |
1685 | 17 | } |
1686 | | |
1687 | 122k | try { |
1688 | 122k | idx1 = FileParserUtils::toUnsigned(text.substr(spos, 3)); |
1689 | 122k | spos += 3; |
1690 | 122k | idx2 = FileParserUtils::toUnsigned(text.substr(spos, 3)); |
1691 | 122k | spos += 3; |
1692 | 122k | bType = FileParserUtils::toUnsigned(text.substr(spos, 3)); |
1693 | 122k | } catch (boost::bad_lexical_cast &) { |
1694 | 29 | std::ostringstream errout; |
1695 | 29 | errout << "Cannot convert '" << text.substr(spos, 3) << "' to int on line " |
1696 | 29 | << line; |
1697 | 29 | throw FileParseException(errout.str()); |
1698 | 29 | } |
1699 | | |
1700 | | // adjust the numbering |
1701 | 122k | idx1--; |
1702 | 122k | idx2--; |
1703 | | |
1704 | 122k | Bond::BondType type; |
1705 | 122k | Bond *res = nullptr; |
1706 | 122k | switch (bType) { |
1707 | 65.3k | case 1: |
1708 | 65.3k | type = Bond::SINGLE; |
1709 | 65.3k | res = new Bond; |
1710 | 65.3k | break; |
1711 | 14.6k | case 2: |
1712 | 14.6k | type = Bond::DOUBLE; |
1713 | 14.6k | res = new Bond; |
1714 | 14.6k | break; |
1715 | 1.68k | case 3: |
1716 | 1.68k | type = Bond::TRIPLE; |
1717 | 1.68k | res = new Bond; |
1718 | 1.68k | break; |
1719 | 4.55k | case 4: |
1720 | 4.55k | type = Bond::AROMATIC; |
1721 | 4.55k | res = new Bond; |
1722 | 4.55k | break; |
1723 | 1.46k | case 9: |
1724 | 1.46k | type = Bond::DATIVE; |
1725 | 1.46k | res = new Bond; |
1726 | 1.46k | break; |
1727 | 19.9k | case 0: |
1728 | 19.9k | type = Bond::UNSPECIFIED; |
1729 | 19.9k | res = new Bond; |
1730 | 19.9k | BOOST_LOG(rdWarningLog) |
1731 | 0 | << "bond with order 0 found on line " << line |
1732 | 0 | << ". This is not part of the MDL specification." << std::endl; |
1733 | 19.9k | break; |
1734 | 14.7k | default: |
1735 | 14.7k | type = Bond::UNSPECIFIED; |
1736 | | // it's a query bond of some type |
1737 | 14.7k | res = new QueryBond; |
1738 | 14.7k | if (bType == 8) { |
1739 | 904 | BOND_NULL_QUERY *q; |
1740 | 904 | q = makeBondNullQuery(); |
1741 | 904 | res->setQuery(q); |
1742 | 13.8k | } else if (bType == 5) { |
1743 | 1.50k | res->setQuery(makeSingleOrDoubleBondQuery()); |
1744 | 1.50k | res->setProp(common_properties::_MolFileBondQuery, 1); |
1745 | 12.3k | } else if (bType == 6) { |
1746 | 1.52k | res->setQuery(makeSingleOrAromaticBondQuery()); |
1747 | 1.52k | res->setProp(common_properties::_MolFileBondQuery, 1); |
1748 | 10.8k | } else if (bType == 7) { |
1749 | 1.40k | res->setQuery(makeDoubleOrAromaticBondQuery()); |
1750 | 1.40k | res->setProp(common_properties::_MolFileBondQuery, 1); |
1751 | 9.44k | } else { |
1752 | 9.44k | BOND_NULL_QUERY *q; |
1753 | 9.44k | q = makeBondNullQuery(); |
1754 | 9.44k | res->setQuery(q); |
1755 | 9.44k | BOOST_LOG(rdWarningLog) |
1756 | 0 | << "unrecognized query bond type, " << bType << ", found on line " |
1757 | 0 | << line << ". Using an \"any\" query." << std::endl; |
1758 | 9.44k | } |
1759 | 14.7k | break; |
1760 | 122k | } |
1761 | 122k | res->setBeginAtomIdx(idx1); |
1762 | 122k | res->setEndAtomIdx(idx2); |
1763 | 122k | res->setBondType(type); |
1764 | 122k | res->setProp(common_properties::_MolFileBondType, bType); |
1765 | | |
1766 | 122k | if (text.size() >= 12 && text.substr(9, 3) != " 0") { |
1767 | 52.4k | try { |
1768 | 52.4k | stereo = FileParserUtils::toUnsigned(text.substr(9, 3)); |
1769 | 52.4k | switch (stereo) { |
1770 | 6.84k | case 0: |
1771 | 6.84k | res->setBondDir(Bond::NONE); |
1772 | 6.84k | break; |
1773 | 14.8k | case 1: |
1774 | 14.8k | res->setBondDir(Bond::BEGINWEDGE); |
1775 | 14.8k | break; |
1776 | 4.28k | case 6: |
1777 | 4.28k | res->setBondDir(Bond::BEGINDASH); |
1778 | 4.28k | break; |
1779 | 567 | case 3: // "either" double bond |
1780 | 567 | res->setBondDir(Bond::EITHERDOUBLE); |
1781 | 567 | res->setStereo(Bond::STEREOANY); |
1782 | 567 | break; |
1783 | 1.07k | case 4: // "either" single bond |
1784 | 1.07k | res->setBondDir(Bond::UNKNOWN); |
1785 | 1.07k | break; |
1786 | 52.4k | } |
1787 | 33.2k | res->setProp(common_properties::_MolFileBondStereo, stereo); |
1788 | 33.2k | } catch (boost::bad_lexical_cast &) { |
1789 | 19.2k | ; |
1790 | 19.2k | } |
1791 | 52.4k | } |
1792 | 122k | if (text.size() >= 18 && text.substr(15, 3) != " 0") { |
1793 | 9.25k | try { |
1794 | 9.25k | int topology = FileParserUtils::toInt(text.substr(15, 3)); |
1795 | 9.25k | if (topology) { |
1796 | 1.69k | if (!res->hasQuery()) { |
1797 | 1.06k | auto *qBond = new QueryBond(*res); |
1798 | 1.06k | delete res; |
1799 | 1.06k | res = qBond; |
1800 | 1.06k | } |
1801 | 1.69k | BOND_EQUALS_QUERY *q = makeBondIsInRingQuery(); |
1802 | 1.69k | switch (topology) { |
1803 | 1.11k | case 1: |
1804 | 1.11k | break; |
1805 | 552 | case 2: |
1806 | 552 | q->setNegation(true); |
1807 | 552 | break; |
1808 | 28 | default: |
1809 | 28 | std::ostringstream errout; |
1810 | 28 | errout << "Unrecognized bond topology specifier: " << topology |
1811 | 28 | << " on line " << line; |
1812 | 28 | throw FileParseException(errout.str()); |
1813 | 1.69k | } |
1814 | 1.66k | res->expandQuery(q); |
1815 | 1.66k | } |
1816 | 9.25k | } catch (boost::bad_lexical_cast &) { |
1817 | 4.43k | ; |
1818 | 4.43k | } |
1819 | 9.25k | } |
1820 | 122k | if (text.size() >= 21 && text.substr(18, 3) != " 0") { |
1821 | 6.75k | try { |
1822 | 6.75k | int reactStatus = FileParserUtils::toInt(text.substr(18, 3)); |
1823 | 6.75k | res->setProp("molReactStatus", reactStatus); |
1824 | 6.75k | } catch (boost::bad_lexical_cast &) { |
1825 | 3.91k | ; |
1826 | 3.91k | } |
1827 | 6.75k | } |
1828 | 122k | return res; |
1829 | 122k | } // namespace |
1830 | | |
1831 | | void ParseMolBlockAtoms(std::istream *inStream, unsigned int &line, |
1832 | | unsigned int nAtoms, RWMol *mol, Conformer *conf, |
1833 | 14.8k | bool strictParsing) { |
1834 | 14.8k | PRECONDITION(inStream, "bad stream"); |
1835 | 14.8k | PRECONDITION(mol, "bad molecule"); |
1836 | 14.8k | PRECONDITION(conf, "bad conformer"); |
1837 | 148k | for (unsigned int i = 1; i <= nAtoms; ++i) { |
1838 | 134k | ++line; |
1839 | 134k | std::string tempStr = getLine(inStream); |
1840 | 134k | if (inStream->eof()) { |
1841 | 436 | throw FileParseException("EOF hit while reading atoms"); |
1842 | 436 | } |
1843 | 133k | RDGeom::Point3D pos; |
1844 | 133k | Atom *atom = ParseMolFileAtomLine(tempStr, pos, line, strictParsing); |
1845 | 133k | unsigned int aid = mol->addAtom(atom, false, true); |
1846 | 133k | conf->setAtomPos(aid, pos); |
1847 | 133k | mol->setAtomBookmark(atom, i); |
1848 | 133k | } |
1849 | 14.8k | } |
1850 | | |
1851 | | void ParseMolBlockBonds(std::istream *inStream, unsigned int &line, |
1852 | | unsigned int nBonds, RWMol *mol, |
1853 | 17.2k | bool &chiralityPossible) { |
1854 | 17.2k | PRECONDITION(inStream, "bad stream"); |
1855 | 17.2k | PRECONDITION(mol, "bad molecule"); |
1856 | 139k | for (unsigned int i = 1; i <= nBonds; ++i) { |
1857 | 122k | ++line; |
1858 | 122k | std::string tempStr = getLine(inStream); |
1859 | 122k | if (inStream->eof()) { |
1860 | 55 | throw FileParseException("EOF hit while reading bonds"); |
1861 | 55 | } |
1862 | 122k | Bond *bond = ParseMolFileBondLine(tempStr, line); |
1863 | | // if we got an aromatic bond set the flag on the bond and the connected |
1864 | | // atoms |
1865 | 122k | if (bond->getBondType() == Bond::AROMATIC) { |
1866 | 4.55k | bond->setIsAromatic(true); |
1867 | 4.55k | } |
1868 | | // if the bond might have chirality info associated with it, set a flag: |
1869 | 122k | if (bond->getBondDir() != Bond::NONE && |
1870 | 20.7k | bond->getBondDir() != Bond::UNKNOWN) { |
1871 | 19.6k | chiralityPossible = true; |
1872 | 19.6k | } |
1873 | | // v2k has no way to set stereoCare on bonds, so set the property if both |
1874 | | // the beginning and end atoms have it set: |
1875 | 122k | int care1 = 0; |
1876 | 122k | int care2 = 0; |
1877 | 122k | if (!bond->hasProp(common_properties::molStereoCare) && |
1878 | 122k | mol->getAtomWithIdx(bond->getBeginAtomIdx()) |
1879 | 122k | ->getPropIfPresent(common_properties::molStereoCare, care1) && |
1880 | 27.1k | mol->getAtomWithIdx(bond->getEndAtomIdx()) |
1881 | 27.1k | ->getPropIfPresent(common_properties::molStereoCare, care2)) { |
1882 | 9.18k | if (care1 && care2) { |
1883 | 367 | bond->setProp(common_properties::molStereoCare, 1); |
1884 | 367 | } |
1885 | 9.18k | } |
1886 | 122k | mol->addBond(bond, true); |
1887 | 122k | mol->setBondBookmark(bond, i); |
1888 | 122k | } |
1889 | 17.2k | } |
1890 | | |
1891 | | bool checkAttachmentPointsAreValid( |
1892 | 3.06k | const RWMol *mol, std::pair<const int, SubstanceGroup> &sgroup) { |
1893 | 3.06k | bool res = true; |
1894 | 3.06k | int nAtoms = static_cast<int>(mol->getNumAtoms()); |
1895 | 3.06k | std::vector<SubstanceGroup::AttachPoint> &attachPoints = |
1896 | 3.06k | sgroup.second.getAttachPoints(); |
1897 | 6.74k | for (auto &attachPoint : attachPoints) { |
1898 | 6.74k | if (attachPoint.lvIdx == nAtoms) { |
1899 | 555 | const std::vector<unsigned int> &bonds = sgroup.second.getBonds(); |
1900 | 555 | if (bonds.size() == 1) { |
1901 | 304 | const auto bond = mol->getBondWithIdx(bonds.front()); |
1902 | 304 | if (bond->getBeginAtomIdx() == attachPoint.aIdx || |
1903 | 283 | bond->getEndAtomIdx() == attachPoint.aIdx) { |
1904 | 283 | attachPoint.lvIdx = bond->getOtherAtomIdx(attachPoint.aIdx); |
1905 | 283 | } |
1906 | 304 | } |
1907 | 555 | } |
1908 | 6.74k | if (attachPoint.lvIdx == nAtoms) { |
1909 | 272 | BOOST_LOG(rdWarningLog) |
1910 | 0 | << "Could not infer missing lvIdx on malformed SAP line for SGroup " |
1911 | 0 | << sgroup.first << std::endl; |
1912 | 272 | res = false; |
1913 | 272 | } |
1914 | 6.74k | } |
1915 | 3.06k | return res; |
1916 | 3.06k | } |
1917 | | |
1918 | | bool ParseMolBlockProperties(std::istream *inStream, unsigned int &line, |
1919 | 16.9k | RWMol *mol, bool strictParsing) { |
1920 | 16.9k | PRECONDITION(inStream, "bad stream"); |
1921 | 16.9k | PRECONDITION(mol, "bad molecule"); |
1922 | | // older mol files can have an atom list block here |
1923 | 16.9k | std::string tempStr = getLine(inStream); |
1924 | 16.9k | ++line; |
1925 | | // there is apparently some software out there that puts a |
1926 | | // blank line in mol blocks before the "M END". If we aren't |
1927 | | // doing strict parsing, deal with that here. |
1928 | 16.9k | if (!tempStr.size()) { |
1929 | 872 | if (!strictParsing) { |
1930 | 861 | tempStr = getLine(inStream); |
1931 | 861 | ++line; |
1932 | 861 | } else { |
1933 | 11 | std::ostringstream errout; |
1934 | 11 | errout << "Problems encountered parsing Mol data, unexpected blank line " |
1935 | 11 | "found at line " |
1936 | 11 | << line; |
1937 | 11 | throw FileParseException(errout.str()); |
1938 | 11 | } |
1939 | 16.0k | } else { |
1940 | 16.0k | if (tempStr[0] != 'M' && tempStr[0] != 'A' && tempStr[0] != 'V' && |
1941 | 585 | tempStr[0] != 'G' && tempStr[0] != 'S') { |
1942 | 253 | ParseOldAtomList(mol, std::string_view(tempStr.c_str()), line); |
1943 | 253 | } |
1944 | 16.0k | } |
1945 | | |
1946 | 16.9k | IDX_TO_SGROUP_MAP sGroupMap; |
1947 | 16.9k | IDX_TO_STR_VECT_MAP dataFieldsMap; |
1948 | 16.9k | bool fileComplete = false; |
1949 | 16.9k | bool firstChargeLine = true; |
1950 | 16.9k | unsigned int SCDcounter = 0; |
1951 | 16.9k | unsigned int lastDataSGroup = 0; |
1952 | 16.9k | std::ostringstream currentDataField; |
1953 | 16.9k | std::string lineBeg = tempStr.substr(0, 6); |
1954 | 702k | while (!inStream->eof() && !inStream->fail() && lineBeg != "M END" && |
1955 | 685k | tempStr.substr(0, 4) != "$$$$") { |
1956 | 685k | if (tempStr[0] == 'A') { |
1957 | 2.06k | line++; |
1958 | 2.06k | std::string nextLine = getLine(inStream); |
1959 | 2.06k | if (lineBeg != "M END") { |
1960 | 2.06k | ParseAtomAlias(mol, tempStr, nextLine, line); |
1961 | 2.06k | } |
1962 | 683k | } else if (tempStr[0] == 'G') { |
1963 | 631 | BOOST_LOG(rdWarningLog) |
1964 | 0 | << " deprecated group abbreviation ignored on line " << line |
1965 | 0 | << std::endl; |
1966 | | // we need to skip the next line, which holds the abbreviation: |
1967 | 631 | line++; |
1968 | 631 | tempStr = getLine(inStream); |
1969 | 682k | } else if (tempStr[0] == 'V') { |
1970 | 3.15k | ParseAtomValue(mol, tempStr, line); |
1971 | 679k | } else if (lineBeg == "S SKP") { |
1972 | 1.25k | int nToSkip = FileParserUtils::toInt(tempStr.substr(6, 3)); |
1973 | 1.25k | if (nToSkip < 0) { |
1974 | 11 | std::ostringstream errout; |
1975 | 11 | errout << "negative skip value " << nToSkip << " on line " << line; |
1976 | 11 | throw FileParseException(errout.str()); |
1977 | 11 | } |
1978 | 10.2k | for (unsigned int i = 0; i < static_cast<unsigned int>(nToSkip); ++i) { |
1979 | 9.05k | ++line; |
1980 | 9.05k | tempStr = getLine(inStream); |
1981 | 9.05k | } |
1982 | 678k | } else if (lineBeg == "M ALS") { |
1983 | 3.72k | ParseNewAtomList(mol, tempStr, line); |
1984 | 674k | } else if (lineBeg == "M ISO") { |
1985 | 2.42k | ParseIsotopeLine(mol, tempStr, line); |
1986 | 671k | } else if (lineBeg == "M RGP") { |
1987 | 1.91k | ParseRGroupLabels(mol, tempStr, line); |
1988 | 669k | } else if (lineBeg == "M RBC") { |
1989 | 9.68k | ParseRingBondCountLine(mol, tempStr, line); |
1990 | 660k | } else if (lineBeg == "M SUB") { |
1991 | 3.60k | ParseSubstitutionCountLine(mol, tempStr, line); |
1992 | 656k | } else if (lineBeg == "M UNS") { |
1993 | 2.78k | ParseUnsaturationLine(mol, tempStr, line); |
1994 | 653k | } else if (lineBeg == "M CHG") { |
1995 | 1.69k | ParseChargeLine(mol, tempStr, firstChargeLine, line); |
1996 | 1.69k | firstChargeLine = false; |
1997 | 652k | } else if (lineBeg == "M RAD") { |
1998 | 3.68k | ParseRadicalLine(mol, tempStr, firstChargeLine, line); |
1999 | 3.68k | firstChargeLine = false; |
2000 | 648k | } else if (lineBeg == "M PXA") { |
2001 | 4.74k | ParsePXALine(mol, tempStr, line); |
2002 | | |
2003 | | /* SGroup parsing start */ |
2004 | 643k | } else if (lineBeg == "M STY") { |
2005 | 34.4k | ParseSGroupV2000STYLine(sGroupMap, mol, tempStr, line, strictParsing); |
2006 | 609k | } else if (lineBeg == "M SST") { |
2007 | 4.48k | ParseSGroupV2000SSTLine(sGroupMap, mol, tempStr, line, strictParsing); |
2008 | 604k | } else if (lineBeg == "M SLB") { |
2009 | 4.02k | ParseSGroupV2000SLBLine(sGroupMap, mol, tempStr, line, strictParsing); |
2010 | 600k | } else if (lineBeg == "M SCN") { |
2011 | 8.16k | ParseSGroupV2000SCNLine(sGroupMap, mol, tempStr, line, strictParsing); |
2012 | 592k | } else if (lineBeg == "M SDS") { |
2013 | 2.70k | ParseSGroupV2000SDSLine(sGroupMap, mol, tempStr, line, strictParsing); |
2014 | 589k | } else if (lineBeg == "M SAL" || lineBeg == "M SBL" || |
2015 | 564k | lineBeg == "M SPA") { |
2016 | 28.3k | ParseSGroupV2000VectorDataLine(sGroupMap, mol, tempStr, line, |
2017 | 28.3k | strictParsing); |
2018 | 561k | } else if (lineBeg == "M SMT") { |
2019 | 12.1k | ParseSGroupV2000SMTLine(sGroupMap, mol, tempStr, line, strictParsing); |
2020 | 549k | } else if (lineBeg == "M SDI") { |
2021 | 22.1k | ParseSGroupV2000SDILine(sGroupMap, mol, tempStr, line, strictParsing); |
2022 | 527k | } else if (lineBeg == "M CRS") { |
2023 | 2 | std::ostringstream errout; |
2024 | 2 | errout << "Unsupported SGroup subtype '" << lineBeg << "' on line " |
2025 | 2 | << line; |
2026 | 2 | throw FileParseException(errout.str()); |
2027 | 527k | } else if (lineBeg == "M SBV") { |
2028 | 7.49k | ParseSGroupV2000SBVLine(sGroupMap, mol, tempStr, line, strictParsing); |
2029 | 519k | } else if (lineBeg == "M SDT") { |
2030 | 24.2k | ParseSGroupV2000SDTLine(sGroupMap, mol, tempStr, line, strictParsing); |
2031 | 495k | } else if (lineBeg == "M SDD") { |
2032 | 2.75k | ParseSGroupV2000SDDLine(sGroupMap, mol, tempStr, line, strictParsing); |
2033 | 492k | } else if (lineBeg == "M SCD" || lineBeg == "M SED") { |
2034 | 54.9k | ParseSGroupV2000SCDSEDLine(sGroupMap, dataFieldsMap, mol, tempStr, line, |
2035 | 54.9k | strictParsing, SCDcounter, lastDataSGroup, |
2036 | 54.9k | currentDataField); |
2037 | 437k | } else if (lineBeg == "M SPL") { |
2038 | 6.36k | ParseSGroupV2000SPLLine(sGroupMap, mol, tempStr, line, strictParsing); |
2039 | 431k | } else if (lineBeg == "M SNC") { |
2040 | 5.07k | ParseSGroupV2000SNCLine(sGroupMap, mol, tempStr, line, strictParsing); |
2041 | 426k | } else if (lineBeg == "M SAP") { |
2042 | 17.3k | ParseSGroupV2000SAPLine(sGroupMap, mol, tempStr, line, strictParsing); |
2043 | 408k | } else if (lineBeg == "M SCL") { |
2044 | 4.74k | ParseSGroupV2000SCLLine(sGroupMap, mol, tempStr, line, strictParsing); |
2045 | 404k | } else if (lineBeg == "M SBT") { |
2046 | 5.35k | ParseSGroupV2000SBTLine(sGroupMap, mol, tempStr, line, strictParsing); |
2047 | | |
2048 | | /* SGroup parsing end */ |
2049 | 398k | } else if (lineBeg == "M ZBO") { |
2050 | 1.34k | ParseZBOLine(mol, tempStr, line); |
2051 | 397k | } else if (lineBeg == "M ZCH") { |
2052 | 1.60k | ParseZCHLine(mol, tempStr, line); |
2053 | 395k | } else if (lineBeg == "M HYD") { |
2054 | 1.68k | ParseHYDLine(mol, tempStr, line); |
2055 | 394k | } else if (lineBeg == "M MRV") { |
2056 | 35.8k | ParseMarvinSmartsLine(mol, tempStr, line); |
2057 | 358k | } else if (lineBeg == "M APO") { |
2058 | 1.55k | ParseAttachPointLine(mol, tempStr, line, strictParsing); |
2059 | 356k | } else if (lineBeg == "M LIN") { |
2060 | 1.31k | ParseLinkNodeLine(mol, tempStr, line); |
2061 | 1.31k | } |
2062 | 685k | line++; |
2063 | 685k | tempStr = getLine(inStream); |
2064 | 685k | lineBeg = tempStr.substr(0, 6); |
2065 | 685k | } |
2066 | 16.9k | if (tempStr[0] == 'M' && tempStr.substr(0, 6) == "M END") { |
2067 | | // All went well, make final updates to SGroups, and add them to Mol |
2068 | 4.88k | for (auto &sgroup : sGroupMap) { |
2069 | 3.21k | if (sgroup.second.getIsValid()) { |
2070 | 3.06k | sgroup.second.setProp("DATAFIELDS", dataFieldsMap[sgroup.first]); |
2071 | 3.06k | sgroup.second.setIsValid(checkAttachmentPointsAreValid(mol, sgroup)); |
2072 | 3.06k | } |
2073 | 3.21k | if (sgroup.second.getIsValid()) { |
2074 | 3.04k | addSubstanceGroup(*mol, sgroup.second); |
2075 | 3.04k | } else { |
2076 | 167 | std::ostringstream errout; |
2077 | 167 | errout << "SGroup " << sgroup.first << " is invalid"; |
2078 | 167 | if (strictParsing) { |
2079 | 0 | throw FileParseException(errout.str()); |
2080 | 167 | } else { |
2081 | 167 | BOOST_LOG(rdWarningLog) |
2082 | 0 | << errout.str() << " and will be ignored" << std::endl; |
2083 | 167 | } |
2084 | 167 | } |
2085 | 3.21k | } |
2086 | | |
2087 | 4.88k | fileComplete = true; |
2088 | 4.88k | } |
2089 | 16.9k | return fileComplete; |
2090 | 16.9k | } |
2091 | | |
2092 | | Atom *ParseV3000AtomSymbol(std::string_view token, unsigned int &line, |
2093 | 15.8k | bool strictParsing) { |
2094 | 15.8k | bool negate = false; |
2095 | 15.8k | token = FileParserUtils::strip(token); |
2096 | 15.8k | if (token.size() > 3 && (token[0] == 'N' || token[0] == 'n') && |
2097 | 1.66k | (token[1] == 'O' || token[1] == 'o') && |
2098 | 753 | (token[2] == 'T' || token[2] == 't')) { |
2099 | 9 | negate = true; |
2100 | 9 | token = token.substr(3, token.size() - 3); |
2101 | 9 | token = FileParserUtils::strip(token); |
2102 | 9 | } |
2103 | | |
2104 | 15.8k | std::unique_ptr<Atom> res; |
2105 | 15.8k | if (token[0] == '[') { |
2106 | | // atom list: |
2107 | 127 | if (token.back() != ']') { |
2108 | 14 | std::ostringstream errout; |
2109 | 14 | errout << "Bad atom token '" << token << "' on line: " << line; |
2110 | 14 | throw FileParseException(errout.str()); |
2111 | 14 | } |
2112 | 113 | token = token.substr(1, token.size() - 2); |
2113 | | |
2114 | 113 | std::vector<std::string> splitToken; |
2115 | 113 | boost::split(splitToken, token, boost::is_any_of(",")); |
2116 | | |
2117 | 113 | for (std::vector<std::string>::const_iterator stIt = splitToken.begin(); |
2118 | 2.64k | stIt != splitToken.end(); ++stIt) { |
2119 | 2.53k | std::string_view stoken = *stIt; |
2120 | 2.53k | std::string atSymb(FileParserUtils::strip(stoken)); |
2121 | 2.53k | if (atSymb.empty()) { |
2122 | 1.72k | continue; |
2123 | 1.72k | } |
2124 | 813 | if (atSymb.size() == 2 && atSymb[1] >= 'A' && atSymb[1] <= 'Z') { |
2125 | 7 | atSymb[1] = static_cast<char>(tolower(atSymb[1])); |
2126 | 7 | } |
2127 | | |
2128 | 813 | int atNum = PeriodicTable::getTable()->getAtomicNumber(atSymb); |
2129 | 813 | if (!res) { |
2130 | 41 | res.reset(new QueryAtom(atNum)); |
2131 | 772 | } else { |
2132 | 772 | res->expandQuery(makeAtomNumQuery(atNum), Queries::COMPOSITE_OR, true); |
2133 | 772 | } |
2134 | | // we want the atomic number of the query itself to always be zero |
2135 | | // this was Github #8820 and #8823 |
2136 | 813 | res->setAtomicNum(0); |
2137 | 813 | } |
2138 | 113 | res->getQuery()->setNegation(negate); |
2139 | 15.7k | } else { |
2140 | 15.7k | if (negate) { |
2141 | 9 | std::ostringstream errout; |
2142 | 9 | errout << "NOT tokens only supported for atom lists. line " << line; |
2143 | 9 | throw FileParseException(errout.str()); |
2144 | 9 | } |
2145 | | // it's a normal CTAB atom symbol: |
2146 | | // NOTE: "R" and "R0"-"R99" are not in the v3K CTAB spec, but we're going to |
2147 | | // support them anyway |
2148 | 15.7k | bool isComplexQueryName = |
2149 | 15.7k | std::find(complexQueries.begin(), complexQueries.end(), token) != |
2150 | 15.7k | complexQueries.end(); |
2151 | 15.7k | if (isComplexQueryName || token == "R" || |
2152 | 15.3k | (token[0] == 'R' && token >= "R0" && token <= "R99") || token == "R#" || |
2153 | 8.95k | token == "*") { |
2154 | 6.84k | if (isComplexQueryName || token == "*") { |
2155 | 470 | res.reset(new QueryAtom(0)); |
2156 | 470 | if (token == "*") { |
2157 | | // according to the MDL spec, these match anything |
2158 | 71 | res->setQuery(makeAtomNullQuery()); |
2159 | 399 | } else if (isComplexQueryName) { |
2160 | 399 | convertComplexNameToQuery(res.get(), token); |
2161 | 399 | } |
2162 | | // queries have no implicit Hs: |
2163 | 470 | res->setNoImplicit(true); |
2164 | 6.37k | } else { |
2165 | 6.37k | res.reset(new Atom(1)); |
2166 | 6.37k | res->setAtomicNum(0); |
2167 | 6.37k | } |
2168 | 6.84k | if (token[0] == 'R' && token >= "R0" && token <= "R99") { |
2169 | 6.29k | auto rlabel = token.substr(1, token.length() - 1); |
2170 | 6.29k | int rnumber; |
2171 | 6.29k | try { |
2172 | 6.29k | rnumber = boost::lexical_cast<int>(rlabel); |
2173 | 6.29k | } catch (boost::bad_lexical_cast &) { |
2174 | 4.13k | rnumber = -1; |
2175 | 4.13k | } |
2176 | 6.29k | if (rnumber >= 0) { |
2177 | 2.15k | res->setIsotope(rnumber); |
2178 | 2.15k | } |
2179 | 6.29k | } |
2180 | 6.84k | if (token[0] == 'R') { |
2181 | | // we used to skip R# here because that really should be handled by an |
2182 | | // RGP spec, but that turned out to not be permissive enough... <sigh> |
2183 | 6.37k | setRGPProps(token, res.get()); |
2184 | 6.37k | } |
2185 | 8.88k | } else if (token == "D") { // mol blocks support "D" and "T" as |
2186 | | // shorthand... handle that. |
2187 | 78 | res.reset(new Atom(1)); |
2188 | 78 | res->setIsotope(2); |
2189 | 8.80k | } else if (token == "T") { // mol blocks support "D" and "T" as |
2190 | | // shorthand... handle that. |
2191 | 2 | res.reset(new Atom(1)); |
2192 | 2 | res->setIsotope(3); |
2193 | 8.80k | } else if (token == "Pol" || token == "Mod") { |
2194 | 26 | res.reset(new Atom(0)); |
2195 | 26 | res->setProp(common_properties::dummyLabel, std::string(token)); |
2196 | 8.77k | } else if (GenericGroups::genericMatchers.find(std::string(token)) != |
2197 | 8.77k | GenericGroups::genericMatchers.end()) { |
2198 | 45 | res.reset(new QueryAtom(0)); |
2199 | 45 | res->setProp(common_properties::atomLabel, std::string(token)); |
2200 | 8.73k | } else { |
2201 | 8.73k | std::string tcopy(token); |
2202 | 8.73k | res.reset(new Atom(0)); |
2203 | 8.73k | lookupAtomicNumber(res.get(), tcopy, strictParsing); |
2204 | 8.73k | } |
2205 | 15.7k | } |
2206 | | |
2207 | 15.8k | POSTCONDITION(res, "no atom built"); |
2208 | 15.8k | return res.release(); |
2209 | 15.8k | } |
2210 | | |
2211 | | bool splitAssignToken(std::string_view token, std::string &prop, |
2212 | 205k | std::string_view &val) { |
2213 | 205k | auto equalsLoc = token.find("="); |
2214 | 205k | if (equalsLoc == token.npos || equalsLoc != token.rfind("=")) { |
2215 | 941 | return false; |
2216 | 941 | } |
2217 | 204k | prop = token.substr(0, equalsLoc); |
2218 | 204k | boost::to_upper(prop); |
2219 | 204k | val = token.substr(equalsLoc + 1); |
2220 | 204k | return true; |
2221 | 205k | } |
2222 | | |
2223 | | template <class T> |
2224 | | void ParseV3000AtomProps(RWMol *mol, Atom *&atom, typename T::iterator &token, |
2225 | | const T &tokens, unsigned int &line, |
2226 | 15.2k | bool strictParsing) { |
2227 | 15.2k | PRECONDITION(mol, "bad molecule"); |
2228 | 15.2k | PRECONDITION(atom, "bad atom"); |
2229 | 15.2k | std::ostringstream errout; |
2230 | 131k | while (token != tokens.end()) { |
2231 | 116k | std::string prop; |
2232 | 116k | std::string_view val; |
2233 | 116k | if (!splitAssignToken(*token, prop, val)) { |
2234 | 465 | errout << "Invalid atom property: '" << *token << "' for atom " |
2235 | 465 | << atom->getIdx() + 1 << " on line " << line << std::endl; |
2236 | 465 | throw FileParseException(errout.str()); |
2237 | 465 | } |
2238 | | |
2239 | 116k | if (prop == "CHG") { |
2240 | 487 | auto charge = FileParserUtils::toInt(val); |
2241 | 487 | if (!atom->hasQuery()) { |
2242 | 408 | atom->setFormalCharge(charge); |
2243 | 408 | } else { |
2244 | 79 | atom->expandQuery(makeAtomFormalChargeQuery(charge)); |
2245 | 79 | } |
2246 | 115k | } else if (prop == "RAD") { |
2247 | | // FIX handle queries here |
2248 | 1.28k | switch (FileParserUtils::toInt(val)) { |
2249 | 116 | case 0: |
2250 | 116 | break; |
2251 | 12 | case 1: |
2252 | 12 | atom->setNumRadicalElectrons(2); |
2253 | 12 | break; |
2254 | 87 | case 2: |
2255 | 87 | atom->setNumRadicalElectrons(1); |
2256 | 87 | break; |
2257 | 1.05k | case 3: |
2258 | 1.05k | atom->setNumRadicalElectrons(2); |
2259 | 1.05k | break; |
2260 | 7 | default: |
2261 | 7 | errout << "Unrecognized RAD value " << val << " for atom " |
2262 | 7 | << atom->getIdx() + 1 << " on line " << line << std::endl; |
2263 | 7 | throw FileParseException(errout.str()); |
2264 | 1.28k | } |
2265 | 114k | } else if (prop == "MASS") { |
2266 | | // the documentation for V3000 CTABs says that this should contain the |
2267 | | // "absolute atomic weight" (whatever that means). |
2268 | | // Online examples seem to have integer (isotope) values and Marvin |
2269 | | // won't even read something that has a float. We'll go with the int |
2270 | 2.16k | int v; |
2271 | 2.16k | double dv; |
2272 | 2.16k | try { |
2273 | 2.16k | v = FileParserUtils::toInt(val); |
2274 | 2.16k | } catch (boost::bad_lexical_cast &) { |
2275 | 1.35k | try { |
2276 | 1.35k | dv = FileParserUtils::toDouble(val); |
2277 | 1.35k | v = static_cast<int>(floor(dv)); |
2278 | 1.35k | } catch (boost::bad_lexical_cast &) { |
2279 | 17 | v = -1; |
2280 | 17 | } |
2281 | 1.35k | } |
2282 | 2.16k | if (v < 0) { |
2283 | 20 | errout << "Bad value for MASS :" << val << " for atom " |
2284 | 20 | << atom->getIdx() + 1 << " on line " << line << std::endl; |
2285 | 20 | throw FileParseException(errout.str()); |
2286 | 2.14k | } else { |
2287 | 2.14k | if (!atom->hasQuery()) { |
2288 | 1.64k | atom->setIsotope(v); |
2289 | 1.64k | } else { |
2290 | 497 | atom->expandQuery(makeAtomIsotopeQuery(v)); |
2291 | 497 | } |
2292 | 2.14k | } |
2293 | 112k | } else if (prop == "CFG") { |
2294 | 9.25k | auto cfg = FileParserUtils::toInt(val); |
2295 | 9.25k | switch (cfg) { |
2296 | 2.78k | case 0: |
2297 | 2.78k | break; |
2298 | 1.09k | case 1: |
2299 | 5.33k | case 2: |
2300 | 6.44k | case 3: |
2301 | 6.44k | atom->setProp(common_properties::molParity, cfg); |
2302 | 6.44k | break; |
2303 | 13 | default: |
2304 | 13 | errout << "Unrecognized CFG value : " << val << " for atom " |
2305 | 13 | << atom->getIdx() + 1 << " on line " << line << std::endl; |
2306 | 13 | throw FileParseException(errout.str()); |
2307 | 9.25k | } |
2308 | 103k | } else if (prop == "HCOUNT") { |
2309 | 3.22k | if (val != "0") { |
2310 | 3.12k | auto hcount = FileParserUtils::toInt(val); |
2311 | 3.12k | if (!atom->hasQuery()) { |
2312 | 113 | atom = QueryOps::replaceAtomWithQueryAtom(mol, atom); |
2313 | 113 | } |
2314 | 3.12k | if (hcount == -1) { |
2315 | 630 | hcount = 0; |
2316 | 630 | } |
2317 | 3.12k | if (hcount > 0) { |
2318 | 1.59k | ATOM_EQUALS_QUERY *oq = makeAtomImplicitHCountQuery(hcount); |
2319 | 1.59k | auto nq = makeAtomSimpleQuery<ATOM_LESSEQUAL_QUERY>( |
2320 | 1.59k | hcount, oq->getDataFunc(), |
2321 | 1.59k | std::string("less_") + oq->getDescription()); |
2322 | 1.59k | atom->expandQuery(nq); |
2323 | 1.59k | delete oq; |
2324 | 1.59k | } else { |
2325 | 1.52k | atom->expandQuery(makeAtomImplicitHCountQuery(0)); |
2326 | 1.52k | } |
2327 | 3.12k | } |
2328 | 99.8k | } else if (prop == "UNSAT") { |
2329 | 6.85k | if (val == "1") { |
2330 | 4.95k | if (!atom->hasQuery()) { |
2331 | 589 | atom = QueryOps::replaceAtomWithQueryAtom(mol, atom); |
2332 | 589 | } |
2333 | 4.95k | atom->expandQuery(makeAtomUnsaturatedQuery()); |
2334 | 4.95k | } |
2335 | 93.0k | } else if (prop == "RBCNT") { |
2336 | 8.33k | if (val != "0") { |
2337 | 8.23k | auto rbcount = FileParserUtils::toInt(val); |
2338 | 8.23k | if (!atom->hasQuery()) { |
2339 | 1.50k | atom = QueryOps::replaceAtomWithQueryAtom(mol, atom); |
2340 | 1.50k | } |
2341 | 8.23k | atom->setProp(common_properties::molRingBondCount, rbcount); |
2342 | 8.23k | if (rbcount == -1) { |
2343 | 835 | rbcount = 0; |
2344 | 7.40k | } else if (rbcount == -2) { |
2345 | | // Ring bonds can only be counted during post processing |
2346 | 51 | mol->setProp(common_properties::_NeedsQueryScan, 1); |
2347 | 51 | rbcount = 0xDEADBEEF; |
2348 | 7.35k | } else if (rbcount > 4) { |
2349 | 142 | rbcount = 4; |
2350 | 142 | } |
2351 | 8.23k | atom->expandQuery(makeAtomRingBondCountQuery(rbcount)); |
2352 | 8.23k | } |
2353 | 84.6k | } else if (prop == "VAL") { |
2354 | 1.42k | if (val != "0") { |
2355 | 1.23k | auto totval = FileParserUtils::toInt(val); |
2356 | 1.23k | atom->setProp(common_properties::molTotValence, totval); |
2357 | 1.23k | } |
2358 | 83.2k | } else if (prop == "RGROUPS") { |
2359 | 22 | ParseV3000RGroups(mol, atom, val, line); |
2360 | | // FIX |
2361 | 83.2k | } else if (prop == "STBOX") { |
2362 | 6.37k | if (val != "0") { |
2363 | 4.02k | auto ival = FileParserUtils::toInt(val); |
2364 | 4.02k | atom->setProp(common_properties::molStereoCare, ival); |
2365 | 4.02k | } |
2366 | 76.8k | } else if (prop == "SUBST") { |
2367 | 1.39k | if (val != "0") { |
2368 | 1.11k | auto ival = FileParserUtils::toInt(val); |
2369 | 1.11k | atom->setProp(common_properties::molSubstCount, ival); |
2370 | 1.11k | } |
2371 | 75.4k | } else if (prop == "EXACHG") { |
2372 | 1.55k | if (val != "0") { |
2373 | 1.20k | auto ival = FileParserUtils::toInt(val); |
2374 | 1.20k | atom->setProp(common_properties::molRxnExactChange, ival); |
2375 | 1.20k | } |
2376 | 73.8k | } else if (prop == "INVRET") { |
2377 | 3.40k | if (val != "0") { |
2378 | 2.73k | auto ival = FileParserUtils::toInt(val); |
2379 | 2.73k | atom->setProp(common_properties::molInversionFlag, ival); |
2380 | 2.73k | } |
2381 | 70.4k | } else if (prop == "ATTCHPT") { |
2382 | 11.8k | if (val != "0") { |
2383 | 11.4k | auto ival = FileParserUtils::toInt(val); |
2384 | 11.4k | if (atom->hasProp(common_properties::molAttachPoint)) { |
2385 | 10.7k | errout << "Multiple ATTCHPT values for atom " << atom->getIdx() + 1 |
2386 | 10.7k | << " on line " << line; |
2387 | 10.7k | if (strictParsing) { |
2388 | 2 | throw FileParseException(errout.str()); |
2389 | 10.7k | } else { |
2390 | 10.7k | BOOST_LOG(rdWarningLog) << errout.str() << std::endl; |
2391 | 10.7k | errout.str(std::string()); |
2392 | 10.7k | } |
2393 | 10.7k | } else { |
2394 | 654 | atom->setProp(common_properties::molAttachPoint, ival); |
2395 | 654 | } |
2396 | 11.4k | } |
2397 | 58.6k | } else if (prop == "ATTCHORD") { |
2398 | | // there are two kinds of ATTCHORD |
2399 | | // one is for template instances and looks like this: ATTCHORD=(4 1 Al 3 |
2400 | | // Br) |
2401 | | |
2402 | 1.34k | if (val.substr(0, 1) == "(") { |
2403 | | // this is a template instance |
2404 | | |
2405 | 13 | val = val.substr(1, val.size() - 2); |
2406 | 13 | std::vector<std::string> splitToken; |
2407 | 13 | boost::split(splitToken, val, boost::is_any_of(" \t")); |
2408 | | |
2409 | 13 | unsigned int itemCount = 0; |
2410 | 13 | if (splitToken.size() > 0) { |
2411 | 13 | itemCount = FileParserUtils::toInt(splitToken[0]); |
2412 | 13 | } |
2413 | | |
2414 | 13 | if (itemCount == 0 || itemCount % 2 != 0 || |
2415 | 8 | splitToken.size() != itemCount + 1) { |
2416 | 8 | errout << "Invalid ATTCHORD value: '" << val << "' for atom " |
2417 | 8 | << atom->getIdx() + 1 << " on line " << line << std::endl; |
2418 | 8 | throw FileParseException(errout.str()); |
2419 | 8 | } |
2420 | 5 | std::vector<std::pair<unsigned int, std::string>> attchOrds; |
2421 | 5 | for (unsigned int i = 1; i < itemCount; i += 2) { |
2422 | 0 | unsigned int idx = FileParserUtils::toInt(splitToken[i]); |
2423 | | // check for uniqueness |
2424 | 0 | for (const auto &[aidx, lbl] : attchOrds) { |
2425 | 0 | if (idx == aidx + 1 || splitToken[i + 1] == lbl) { |
2426 | 0 | errout << "Invalid ATTCHORD value: '" << val << "' for atom " |
2427 | 0 | << atom->getIdx() + 1 << " on line " << line << std::endl; |
2428 | |
|
2429 | 0 | throw FileParseException(errout.str()); |
2430 | 0 | } |
2431 | 0 | } |
2432 | 0 | attchOrds.emplace_back(idx - 1, splitToken[i + 1]); |
2433 | 0 | } |
2434 | 5 | atom->setProp(common_properties::molAttachOrderTemplate, attchOrds); |
2435 | 1.32k | } else { |
2436 | | // this is a normal ATTCHORD |
2437 | 1.32k | auto ival = FileParserUtils::toInt(val); |
2438 | 1.32k | atom->setProp(common_properties::molAttachOrder, ival); |
2439 | 1.32k | } |
2440 | 57.3k | } else if (prop == "CLASS") { |
2441 | 1.24k | atom->setProp(common_properties::molAtomClass, std::string(val)); |
2442 | 56.0k | } else if (prop == "SEQID") { |
2443 | 1.25k | if (val != "0") { |
2444 | 1.06k | auto ival = FileParserUtils::toInt(val); |
2445 | 1.06k | atom->setProp(common_properties::molAtomSeqId, ival); |
2446 | 1.06k | } |
2447 | 54.8k | } else if (prop == "SEQNAME") { |
2448 | 33 | if (val != "") { |
2449 | 5 | atom->setProp(common_properties::molAtomSeqName, std::string(val)); |
2450 | 5 | } |
2451 | 33 | } |
2452 | 116k | ++token; |
2453 | 116k | } |
2454 | 15.2k | } |
2455 | | |
// Splits one (already joined and trimmed) V3000 line into tokens.
//
// Tokens are separated by runs of spaces/tabs, except that whitespace inside
// a parenthesised list or inside double quotes does not split. The views
// stored in \c tokens point into \c line, so \c line's storage must outlive
// them.
//
// Quoting rules:
//  - a token that starts with a quote ends at the matching quote and is
//    returned without the surrounding quotes;
//  - a quote that opens in the middle of a token (e.g. KEY="a b") only
//    protects the enclosed whitespace: the token is returned whole, quotes
//    included, since a view cannot drop characters from its middle;
//  - two consecutive quotes are skipped as a unit and never open or close a
//    quoted section;
//  - quotes inside parentheses are ordinary characters.
//
// \param line    text to tokenize
// \param tokens  cleared, then filled with the tokens in order
void tokenizeV3000Line(std::string_view line,
                       std::vector<std::string_view> &tokens) {
  tokens.clear();
  bool inQuotes = false;
  unsigned int parenDepth = 0;
  std::size_t start = 0;       // first character of the token being built
  std::size_t quoteStart = 0;  // position of the quote that set inQuotes
  std::size_t pos = 0;
  while (pos < line.size()) {
    const char c = line[pos];
    if (c == ' ' || c == '\t') {
      if (start == pos) {
        // separator before any token character: just skip it
        ++start;
      } else if (!inQuotes && parenDepth == 0) {
        tokens.push_back(line.substr(start, pos - start));
        start = pos + 1;
      }
      // otherwise the whitespace is protected and belongs to the token
      ++pos;
    } else if (c == ')' && parenDepth > 0) {
      --parenDepth;
      ++pos;
    } else if (c == '(' && !inQuotes) {
      ++parenDepth;
      ++pos;
    } else if (c == '"' && parenDepth == 0) {
      if (pos + 1 < line.size() && line[pos + 1] == '"') {
        // doubled quote: keep both characters in the token
        pos += 2;
      } else if (!inQuotes) {
        inQuotes = true;
        quoteStart = pos;
        ++pos;
      } else {
        if (quoteStart == start) {
          // the whole token was quoted: emit it without the quotes
          tokens.push_back(line.substr(start + 1, pos - start - 1));
          start = pos + 1;
        }
        // A quote that opened mid-token just stops protecting whitespace;
        // the token keeps growing until the next separator. Slicing from
        // start + 1 here would drop the token's first character.
        inQuotes = false;
        ++pos;
      }
    } else {
      ++pos;
    }
  }
  if (start != pos) {
    // trailing token (also covers an unterminated quote or parenthesis)
    tokens.push_back(line.substr(start));
  }
}
2507 | | |
2508 | | bool calculate3dFlag(const RWMol &mol, const Conformer &conf, |
2509 | 17.1k | bool chiralityPossible) { |
2510 | 17.1k | int marked3d = 0; |
2511 | 17.1k | if (mol.getPropIfPresent(common_properties::_3DConf, marked3d)) { |
2512 | 393 | mol.clearProp(common_properties::_3DConf); |
2513 | 393 | } |
2514 | | |
2515 | 17.1k | bool nonzeroZ = hasNonZeroZCoords(conf); |
2516 | | |
2517 | 17.1k | if (!nonzeroZ && marked3d == 1) { |
2518 | | // If we have no Z coordinates, mark the structure 2D if we see any |
2519 | | // 2D stereo markers, or stay as 3D if |
2520 | 38 | if (chiralityPossible) { |
2521 | 32 | BOOST_LOG(rdWarningLog) |
2522 | 0 | << "Warning: molecule is tagged as 3D, but all Z coords are zero and 2D stereo " |
2523 | 0 | "markers have been found, marking the mol as 2D." |
2524 | 0 | << std::endl; |
2525 | 32 | return false; |
2526 | 32 | } |
2527 | 6 | return true; |
2528 | 17.0k | } else if (marked3d == 0 && nonzeroZ) { |
2529 | 3.14k | BOOST_LOG(rdWarningLog) |
2530 | 0 | << "Warning: molecule is tagged as 2D, but at least one Z coordinate is not zero. " |
2531 | 0 | "Marking the mol as 3D." |
2532 | 0 | << std::endl; |
2533 | 3.14k | return true; |
2534 | 3.14k | } |
2535 | | |
2536 | 13.9k | return nonzeroZ; |
2537 | 17.1k | } |
2538 | | |
2539 | | void ParseV3000AtomBlock(std::istream *inStream, unsigned int &line, |
2540 | | unsigned int nAtoms, RWMol *mol, Conformer *conf, |
2541 | 1.77k | bool strictParsing, bool expectMacroAtoms) { |
2542 | 1.77k | PRECONDITION(inStream, "bad stream"); |
2543 | 1.77k | PRECONDITION(nAtoms > 0, "bad atom count"); |
2544 | 1.77k | PRECONDITION(mol, "bad molecule"); |
2545 | 1.77k | PRECONDITION(conf, "bad conformer"); |
2546 | 1.77k | std::vector<std::string> splitLine; |
2547 | | |
2548 | 1.77k | auto inl = getV3000Line(inStream, line); |
2549 | 1.77k | std::string_view tempStr = inl; |
2550 | 1.77k | if (tempStr.length() < 10 || tempStr.substr(0, 10) != "BEGIN ATOM") { |
2551 | 13 | std::ostringstream errout; |
2552 | 13 | errout << "BEGIN ATOM line not found on line " << line; |
2553 | 13 | throw FileParseException(errout.str()); |
2554 | 13 | } |
2555 | 17.5k | for (unsigned int i = 0; i < nAtoms; ++i) { |
2556 | 16.2k | inl = getV3000Line(inStream, line); |
2557 | 16.2k | tempStr = inl; |
2558 | 16.2k | auto trimmed = FileParserUtils::strip(tempStr); |
2559 | | |
2560 | 16.2k | std::vector<std::string_view> tokens; |
2561 | 16.2k | std::vector<std::string_view>::iterator token; |
2562 | | |
2563 | 16.2k | tokenizeV3000Line(trimmed, tokens); |
2564 | 16.2k | token = tokens.begin(); |
2565 | | |
2566 | 16.2k | if (token == tokens.end()) { |
2567 | 6 | std::ostringstream errout; |
2568 | 6 | errout << "Bad atom line : '" << tempStr << "' on line" << line; |
2569 | 6 | throw FileParseException(errout.str()); |
2570 | 6 | } |
2571 | 16.2k | unsigned int molIdx = 0; |
2572 | 16.2k | std::from_chars(token->data(), token->data() + token->size(), molIdx); |
2573 | | |
2574 | | // start with the symbol: |
2575 | 16.2k | ++token; |
2576 | 16.2k | if (token == tokens.end()) { |
2577 | 46 | std::ostringstream errout; |
2578 | 46 | errout << "Bad atom line : '" << tempStr << "' on line " << line; |
2579 | 46 | throw FileParseException(errout.str()); |
2580 | 46 | } |
2581 | | |
2582 | | // before we parse the symbol, we need to know if the atom has a class attr. |
2583 | | // if it does, it is a macro atom reference, and we do not need to parse the |
2584 | | // symbol. (the single letter codes can be the same as element sysmbols or |
2585 | | // special query names) |
2586 | | |
2587 | 16.1k | auto isMacroAtom = false; |
2588 | 16.1k | if (expectMacroAtoms) { |
2589 | 0 | auto lookAheadToken = token + 1; |
2590 | 0 | while (lookAheadToken != tokens.end()) { |
2591 | 0 | std::string prop; |
2592 | 0 | std::string_view val; |
2593 | 0 | if (splitAssignToken(*lookAheadToken, prop, val) && prop == "CLASS") { |
2594 | 0 | isMacroAtom = true; |
2595 | 0 | break; |
2596 | 0 | } |
2597 | 0 | ++lookAheadToken; |
2598 | 0 | } |
2599 | 0 | } |
2600 | | |
2601 | 16.1k | Atom *atom = nullptr; |
2602 | 16.1k | if (isMacroAtom) { |
2603 | 0 | atom = new Atom(0); |
2604 | 0 | atom->setAtomicNum(0); |
2605 | 0 | std::string tcopy(*token); |
2606 | 0 | atom->setProp(common_properties::dummyLabel, tcopy); |
2607 | 16.1k | } else { |
2608 | 16.1k | atom = ParseV3000AtomSymbol(*token, line, strictParsing); |
2609 | 16.1k | } |
2610 | | |
2611 | | // now the position; |
2612 | 16.1k | RDGeom::Point3D pos; |
2613 | 16.1k | ++token; |
2614 | 16.1k | if (token == tokens.end()) { |
2615 | 165 | delete atom; |
2616 | 165 | std::ostringstream errout; |
2617 | 165 | errout << "Bad atom line : '" << tempStr << "' on line " << line; |
2618 | 165 | throw FileParseException(errout.str()); |
2619 | 165 | } |
2620 | | |
2621 | 16.0k | pos.x = atof(std::string(*token).c_str()); |
2622 | 16.0k | ++token; |
2623 | 16.0k | if (token == tokens.end()) { |
2624 | 85 | delete atom; |
2625 | 85 | std::ostringstream errout; |
2626 | 85 | errout << "Bad atom line : '" << tempStr << "' on line " << line; |
2627 | 85 | throw FileParseException(errout.str()); |
2628 | 85 | } |
2629 | 15.9k | pos.y = atof(std::string(*token).c_str()); |
2630 | 15.9k | ++token; |
2631 | 15.9k | if (token == tokens.end()) { |
2632 | 88 | delete atom; |
2633 | 88 | std::ostringstream errout; |
2634 | 88 | errout << "Bad atom line : '" << tempStr << "' on line " << line; |
2635 | 88 | throw FileParseException(errout.str()); |
2636 | 88 | } |
2637 | 15.8k | pos.z = atof(std::string(*token).c_str()); |
2638 | | // the map number: |
2639 | 15.8k | ++token; |
2640 | 15.8k | if (token == tokens.end()) { |
2641 | 70 | delete atom; |
2642 | 70 | std::ostringstream errout; |
2643 | 70 | errout << "Bad atom line : '" << tempStr << "' on line " << line; |
2644 | 70 | throw FileParseException(errout.str()); |
2645 | 70 | } |
2646 | 15.7k | int mapNum = atoi(std::string(*token).c_str()); |
2647 | 15.7k | if (mapNum > 0) { |
2648 | 821 | atom->setProp(common_properties::molAtomMapNumber, mapNum); |
2649 | 821 | } |
2650 | 15.7k | ++token; |
2651 | | |
2652 | 15.7k | unsigned int aid = mol->addAtom(atom, false, true); |
2653 | | |
2654 | | // additional properties this may change the atom, |
2655 | | // so be careful with it: |
2656 | 15.7k | ParseV3000AtomProps(mol, atom, token, tokens, line, strictParsing); |
2657 | | |
2658 | 15.7k | mol->setAtomBookmark(atom, molIdx); |
2659 | 15.7k | conf->setAtomPos(aid, pos); |
2660 | 15.7k | } |
2661 | 1.30k | inl = getV3000Line(inStream, line); |
2662 | 1.30k | tempStr = inl; |
2663 | 1.30k | if (tempStr.length() < 8 || tempStr.substr(0, 8) != "END ATOM") { |
2664 | 14 | std::ostringstream errout; |
2665 | 14 | errout << "END ATOM line not found on line " << line; |
2666 | 14 | throw FileParseException(errout.str()); |
2667 | 14 | } |
2668 | 1.30k | } |
2669 | | |
2670 | | void ParseV3000BondBlock(std::istream *inStream, unsigned int &line, |
2671 | | unsigned int nBonds, RWMol *mol, |
2672 | 1.00k | bool &chiralityPossible) { |
2673 | 1.00k | PRECONDITION(inStream, "bad stream"); |
2674 | 1.00k | PRECONDITION(nBonds > 0, "bad bond count"); |
2675 | 1.00k | PRECONDITION(mol, "bad molecule"); |
2676 | | |
2677 | 1.00k | auto inl = getV3000Line(inStream, line); |
2678 | 1.00k | std::string_view tempStr = inl; |
2679 | 1.00k | if (tempStr.length() < 10 || tempStr.substr(0, 10) != "BEGIN BOND") { |
2680 | 18 | throw FileParseException("BEGIN BOND line not found"); |
2681 | 18 | } |
2682 | 1.31k | for (unsigned int i = 0; i < nBonds; ++i) { |
2683 | 969 | inl = getV3000Line(inStream, line); |
2684 | 969 | tempStr = inl; |
2685 | 969 | tempStr = FileParserUtils::strip(tempStr); |
2686 | 969 | std::vector<std::string_view> splitLine; |
2687 | 969 | tokenizeV3000Line(tempStr, splitLine); |
2688 | 969 | if (splitLine.size() < 4) { |
2689 | 49 | std::ostringstream errout; |
2690 | 49 | errout << "bond line " << line << " is too short"; |
2691 | 49 | throw FileParseException(errout.str()); |
2692 | 49 | } |
2693 | 920 | Bond *bond; |
2694 | 920 | unsigned int bondIdx = 0; |
2695 | 920 | std::from_chars(splitLine[0].data(), |
2696 | 920 | splitLine[0].data() + splitLine[0].size(), bondIdx); |
2697 | 920 | unsigned int bType = 0; |
2698 | 920 | std::from_chars(splitLine[1].data(), |
2699 | 920 | splitLine[1].data() + splitLine[1].size(), bType); |
2700 | 920 | unsigned int a1Idx = 0; |
2701 | 920 | std::from_chars(splitLine[2].data(), |
2702 | 920 | splitLine[2].data() + splitLine[2].size(), a1Idx); |
2703 | 920 | unsigned int a2Idx = 0; |
2704 | 920 | std::from_chars(splitLine[3].data(), |
2705 | 920 | splitLine[3].data() + splitLine[3].size(), a2Idx); |
2706 | | |
2707 | 920 | switch (bType) { |
2708 | 19 | case 1: |
2709 | 19 | bond = new Bond(Bond::SINGLE); |
2710 | 19 | break; |
2711 | 36 | case 2: |
2712 | 36 | bond = new Bond(Bond::DOUBLE); |
2713 | 36 | break; |
2714 | 7 | case 3: |
2715 | 7 | bond = new Bond(Bond::TRIPLE); |
2716 | 7 | break; |
2717 | 31 | case 4: |
2718 | 31 | bond = new Bond(Bond::AROMATIC); |
2719 | 31 | bond->setIsAromatic(true); |
2720 | 31 | break; |
2721 | 1 | case 9: |
2722 | 1 | bond = new Bond(Bond::DATIVE); |
2723 | 1 | break; |
2724 | 3 | case 10: |
2725 | 3 | bond = new Bond(Bond::HYDROGEN); |
2726 | 3 | break; |
2727 | 686 | case 0: |
2728 | 686 | bond = new Bond(Bond::UNSPECIFIED); |
2729 | 686 | BOOST_LOG(rdWarningLog) |
2730 | 0 | << "bond with order 0 found on line " << line |
2731 | 0 | << ". This is not part of the MDL specification." << std::endl; |
2732 | 686 | break; |
2733 | 133 | default: |
2734 | | // it's a query bond of some type |
2735 | 133 | bond = new QueryBond; |
2736 | 133 | if (bType == 8) { |
2737 | 1 | BOND_NULL_QUERY *q; |
2738 | 1 | q = makeBondNullQuery(); |
2739 | 1 | bond->setQuery(q); |
2740 | 132 | } else if (bType == 5) { |
2741 | 3 | bond->setQuery(makeSingleOrDoubleBondQuery()); |
2742 | 3 | bond->setProp(common_properties::_MolFileBondQuery, 1); |
2743 | 129 | } else if (bType == 6) { |
2744 | 6 | bond->setQuery(makeSingleOrAromaticBondQuery()); |
2745 | 6 | bond->setProp(common_properties::_MolFileBondQuery, 1); |
2746 | 123 | } else if (bType == 7) { |
2747 | 3 | bond->setQuery(makeDoubleOrAromaticBondQuery()); |
2748 | 3 | bond->setProp(common_properties::_MolFileBondQuery, 1); |
2749 | 120 | } else { |
2750 | 120 | BOND_NULL_QUERY *q; |
2751 | 120 | q = makeBondNullQuery(); |
2752 | 120 | bond->setQuery(q); |
2753 | 120 | BOOST_LOG(rdWarningLog) |
2754 | 0 | << "unrecognized query bond type, " << bType << ", found on line " |
2755 | 0 | << line << ". Using an \"any\" query." << std::endl; |
2756 | 120 | } |
2757 | 133 | break; |
2758 | 920 | } |
2759 | 916 | bond->setProp(common_properties::_MolFileBondType, bType); |
2760 | | |
2761 | | // additional bond properties: |
2762 | 916 | unsigned int lPos = 4; |
2763 | 916 | std::ostringstream errout; |
2764 | 89.4k | while (lPos < splitLine.size()) { |
2765 | 89.1k | std::string prop; |
2766 | 89.1k | std::string_view val; |
2767 | 89.1k | if (!splitAssignToken(splitLine[lPos], prop, val)) { |
2768 | 476 | errout << "bad bond property '" << splitLine[lPos] << "' on line " |
2769 | 476 | << line; |
2770 | 476 | throw FileParseException(errout.str()); |
2771 | 476 | } |
2772 | 88.6k | if (prop == "CFG") { |
2773 | 17.8k | unsigned int cfg = 0; |
2774 | 17.8k | std::from_chars(val.data(), val.data() + val.size(), cfg); |
2775 | 17.8k | switch (cfg) { |
2776 | 13.4k | case 0: |
2777 | 13.4k | break; |
2778 | 1.67k | case 1: |
2779 | 1.67k | bond->setBondDir(Bond::BEGINWEDGE); |
2780 | 1.67k | chiralityPossible = true; |
2781 | 1.67k | break; |
2782 | 2.29k | case 2: |
2783 | 2.29k | if (bType == 1) { |
2784 | 49 | bond->setBondDir(Bond::UNKNOWN); |
2785 | 2.25k | } else if (bType == 2) { |
2786 | 249 | bond->setBondDir(Bond::EITHERDOUBLE); |
2787 | 249 | bond->setStereo(Bond::STEREOANY); |
2788 | 249 | } |
2789 | 2.29k | break; |
2790 | 292 | case 3: |
2791 | 292 | bond->setBondDir(Bond::BEGINDASH); |
2792 | 292 | chiralityPossible = true; |
2793 | 292 | break; |
2794 | 95 | default: |
2795 | 95 | errout << "bad bond CFG " << val << "' on line " << line; |
2796 | 95 | throw FileParseException(errout.str()); |
2797 | 17.8k | } |
2798 | 17.7k | bond->setProp(common_properties::_MolFileBondCfg, cfg); |
2799 | 70.8k | } else if (prop == "TOPO") { |
2800 | 1.38k | if (val != "0") { |
2801 | 1.06k | if (!bond->hasQuery()) { |
2802 | 40 | auto *qBond = new QueryBond(*bond); |
2803 | 40 | delete bond; |
2804 | 40 | bond = qBond; |
2805 | 40 | } |
2806 | 1.06k | BOND_EQUALS_QUERY *q = makeBondIsInRingQuery(); |
2807 | 1.06k | if (val == "1") { |
2808 | | // nothing |
2809 | 1.04k | } else if (val == "2") { |
2810 | 1.02k | q->setNegation(true); |
2811 | 1.02k | } else { |
2812 | 15 | errout << "bad bond TOPO " << val << "' on line " << line; |
2813 | 15 | throw FileParseException(errout.str()); |
2814 | 15 | } |
2815 | 1.05k | bond->expandQuery(q); |
2816 | 1.05k | } |
2817 | 69.4k | } else if (prop == "RXCTR") { |
2818 | 11.6k | int reactStatus = FileParserUtils::toInt(val); |
2819 | 11.6k | bond->setProp(common_properties::molReactStatus, reactStatus); |
2820 | 57.8k | } else if (prop == "STBOX") { |
2821 | 12.9k | bond->setProp(common_properties::molStereoCare, std::string(val)); |
2822 | 44.8k | } else if (prop == "ENDPTS") { |
2823 | 3.29k | bond->setProp(common_properties::_MolFileBondEndPts, std::string(val)); |
2824 | 41.5k | } else if (prop == "ATTACH") { |
2825 | 774 | bond->setProp(common_properties::_MolFileBondAttach, std::string(val)); |
2826 | 774 | } |
2827 | 88.5k | ++lPos; |
2828 | 88.5k | } |
2829 | | |
2830 | 330 | bond->setBeginAtomIdx(mol->getAtomWithBookmark(a1Idx)->getIdx()); |
2831 | 330 | bond->setEndAtomIdx(mol->getAtomWithBookmark(a2Idx)->getIdx()); |
2832 | 330 | mol->addBond(bond, true); |
2833 | 330 | mol->setBondBookmark(bond, bondIdx); |
2834 | | |
2835 | | // set the stereoCare property on the bond if it's not set already and |
2836 | | // both the beginning and end atoms have it set: |
2837 | 330 | int care1 = 0; |
2838 | 330 | int care2 = 0; |
2839 | 330 | if (!bond->hasProp(common_properties::molStereoCare) && |
2840 | 0 | mol->getAtomWithIdx(bond->getBeginAtomIdx()) |
2841 | 0 | ->getPropIfPresent(common_properties::molStereoCare, care1) && |
2842 | 0 | mol->getAtomWithIdx(bond->getEndAtomIdx()) |
2843 | 0 | ->getPropIfPresent(common_properties::molStereoCare, care2)) { |
2844 | 0 | if (care1 == care2) { |
2845 | 0 | bond->setProp(common_properties::molStereoCare, care1); |
2846 | 0 | } |
2847 | 0 | } |
2848 | 330 | } |
2849 | 344 | inl = getV3000Line(inStream, line); |
2850 | 344 | tempStr = inl; |
2851 | 344 | if (tempStr.length() < 8 || tempStr.substr(0, 8) != "END BOND") { |
2852 | 0 | std::ostringstream errout; |
2853 | 0 | errout << "END BOND line not found at line " << line; |
2854 | 0 | throw FileParseException(errout.str()); |
2855 | 0 | } |
2856 | 344 | } |
2857 | | // The documentation about MRV_COORDINATE_BOND_TYPE in |
2858 | | // https://docs.chemaxon.com/display/docs/chemaxon-specific-information-in-mdl-mol-files.md |
2859 | | // seems to be wrong: it says the only data field in this group contains the |
2860 | | // index for the coordinate atom. But behavior in Marvin Sketch seems to |
2861 | | // indicate that it references the bond index instead (see |
2862 | | // https://github.com/rdkit/rdkit/issues/4473) |
2863 | | |
2864 | 16 | void processMrvCoordinateBond(RWMol &mol, const SubstanceGroup &sg) { |
2865 | 16 | std::vector<std::string> dataFields; |
2866 | 16 | if (sg.getPropIfPresent("DATAFIELDS", dataFields)) { |
2867 | 16 | if (dataFields.empty()) { |
2868 | 1 | BOOST_LOG(rdWarningLog) |
2869 | 0 | << "ignoring MRV_COORDINATE_BOND_TYPE SGroup without data fields." |
2870 | 0 | << std::endl; |
2871 | 1 | return; |
2872 | 1 | } |
2873 | | |
2874 | 15 | auto coordinate_bond_idx = |
2875 | 15 | FileParserUtils::toUnsigned(dataFields[0], true) - 1; |
2876 | | |
2877 | 15 | if (dataFields.size() > 1) { |
2878 | 2 | BOOST_LOG(rdWarningLog) << "ignoring extra data fields in " |
2879 | 0 | "MRV_COORDINATE_BOND_TYPE SGroup for bond " |
2880 | 0 | << coordinate_bond_idx << '.' << std::endl; |
2881 | 2 | } |
2882 | | |
2883 | 15 | Bond *old_bond = nullptr; |
2884 | 15 | try { |
2885 | 15 | old_bond = mol.getBondWithIdx(coordinate_bond_idx); |
2886 | 15 | } catch (const Invar::Invariant &) { |
2887 | 5 | BOOST_LOG(rdWarningLog) |
2888 | 0 | << "molecule does not contain a bond matching the " |
2889 | 0 | "MRV_COORDINATE_BOND_TYPE SGroup for bond " |
2890 | 0 | << coordinate_bond_idx << ", ignoring." << std::endl; |
2891 | 5 | return; |
2892 | 5 | } |
2893 | | |
2894 | 5 | if (!old_bond || old_bond->getBondType() != Bond::BondType::UNSPECIFIED) { |
2895 | 1 | BOOST_LOG(rdWarningLog) |
2896 | 0 | << "MRV_COORDINATE_BOND_TYPE SGroup with value " |
2897 | 0 | << coordinate_bond_idx |
2898 | 0 | << " does not reference a query bond, ignoring." << std::endl; |
2899 | 1 | return; |
2900 | 1 | } |
2901 | | |
2902 | 4 | Bond new_bond(Bond::BondType::DATIVE); |
2903 | 4 | auto preserveProps = true; |
2904 | 4 | auto keepSGroups = true; |
2905 | 4 | mol.replaceBond(coordinate_bond_idx, &new_bond, preserveProps, keepSGroups); |
2906 | 4 | } |
2907 | 16 | } |
2908 | | |
2909 | 119 | void processSMARTSQ(RWMol &mol, const SubstanceGroup &sg) { |
2910 | 119 | std::string field; |
2911 | 119 | if (sg.getPropIfPresent("QUERYOP", field) && field != "=") { |
2912 | 36 | BOOST_LOG(rdWarningLog) << "unrecognized QUERYOP '" << field |
2913 | 0 | << "' for SMARTSQ. Query ignored." << std::endl; |
2914 | 36 | return; |
2915 | 36 | } |
2916 | 83 | std::vector<std::string> dataFields; |
2917 | 83 | if (!sg.getPropIfPresent("DATAFIELDS", dataFields) || dataFields.empty()) { |
2918 | 5 | BOOST_LOG(rdWarningLog) |
2919 | 0 | << "empty FIELDDATA for SMARTSQ. Query ignored." << std::endl; |
2920 | 5 | return; |
2921 | 5 | } |
2922 | 78 | if (dataFields.size() > 1) { |
2923 | 15 | BOOST_LOG(rdWarningLog) |
2924 | 0 | << "multiple FIELDDATA values for SMARTSQ. Taking the first." |
2925 | 0 | << std::endl; |
2926 | 15 | } |
2927 | 78 | const std::string &sma = dataFields[0]; |
2928 | 78 | if (sma.empty()) { |
2929 | 2 | BOOST_LOG(rdWarningLog) |
2930 | 0 | << "Skipping empty SMARTS value for SMARTSQ." << std::endl; |
2931 | 2 | return; |
2932 | 2 | } |
2933 | | |
2934 | 4.58k | for (auto aidx : sg.getAtoms()) { |
2935 | 4.58k | auto at = mol.getAtomWithIdx(aidx); |
2936 | | |
2937 | 4.58k | std::unique_ptr<RWMol> m; |
2938 | 4.58k | try { |
2939 | 4.58k | m.reset(SmartsToMol(sma)); |
2940 | 4.58k | } catch (...) { |
2941 | | // Is this ever used? |
2942 | 1 | } |
2943 | | |
2944 | 4.58k | if (!m || !m->getNumAtoms()) { |
2945 | 3 | BOOST_LOG(rdWarningLog) |
2946 | 0 | << "SMARTS for SMARTSQ '" << sma |
2947 | 0 | << "' could not be parsed or has no atoms. Ignoring it." << std::endl; |
2948 | 3 | return; |
2949 | 3 | } |
2950 | | |
2951 | 4.58k | if (!at->hasQuery()) { |
2952 | 73 | QueryAtom qAt(*at); |
2953 | 73 | int oidx = at->getIdx(); |
2954 | 73 | mol.replaceAtom(oidx, &qAt); |
2955 | 73 | at = mol.getAtomWithIdx(oidx); |
2956 | 73 | } |
2957 | 4.58k | QueryAtom::QUERYATOM_QUERY *query = nullptr; |
2958 | 4.58k | if (m->getNumAtoms() == 1) { |
2959 | 2.51k | query = m->getAtomWithIdx(0)->getQuery()->copy(); |
2960 | 2.51k | } else { |
2961 | 2.06k | query = new RecursiveStructureQuery(m.release()); |
2962 | 2.06k | } |
2963 | 4.58k | at->setQuery(query); |
2964 | 4.58k | at->setProp(common_properties::MRV_SMA, sma); |
2965 | 4.58k | at->setProp(common_properties::_MolFileAtomQuery, 1); |
2966 | 4.58k | } |
2967 | 76 | } |
2968 | | |
2969 | 185 | void processMrvImplicitH(RWMol &mol, const SubstanceGroup &sg) { |
2970 | 185 | std::vector<std::string> dataFields; |
2971 | 185 | if (sg.getPropIfPresent("DATAFIELDS", dataFields)) { |
2972 | 8.79k | for (const auto &df : dataFields) { |
2973 | 8.79k | if (df.substr(0, 6) == "IMPL_H") { |
2974 | 3.36k | auto val = FileParserUtils::toInt(df.substr(6)); |
2975 | 28.5k | for (auto atIdx : sg.getAtoms()) { |
2976 | 28.5k | if (atIdx < mol.getNumAtoms()) { |
2977 | | // if the atom has aromatic bonds to it, then set the explicit |
2978 | | // value, otherwise skip it. |
2979 | 28.5k | auto atom = mol.getAtomWithIdx(atIdx); |
2980 | 28.5k | bool hasAromaticBonds = false; |
2981 | 28.5k | for (auto bndI : |
2982 | 58.7k | boost::make_iterator_range(mol.getAtomBonds(atom))) { |
2983 | 58.7k | auto bnd = (mol)[bndI]; |
2984 | 58.7k | if (bnd->getIsAromatic() || |
2985 | 58.3k | bnd->getBondType() == Bond::AROMATIC) { |
2986 | 368 | hasAromaticBonds = true; |
2987 | 368 | break; |
2988 | 368 | } |
2989 | 58.7k | } |
2990 | 28.5k | if (hasAromaticBonds) { |
2991 | 368 | atom->setNumExplicitHs(val); |
2992 | 28.2k | } else { |
2993 | 28.2k | BOOST_LOG(rdWarningLog) |
2994 | 0 | << "MRV_IMPLICIT_H SGroup on atom without aromatic " |
2995 | 0 | "bonds, " |
2996 | 0 | << atIdx << ", ignored." << std::endl; |
2997 | 28.2k | } |
2998 | 28.5k | } else { |
2999 | 0 | BOOST_LOG(rdWarningLog) |
3000 | 0 | << "bad atom index, " << atIdx |
3001 | 0 | << ", found in MRV_IMPLICIT_H SGroup. Ignoring it." |
3002 | 0 | << std::endl; |
3003 | 0 | } |
3004 | 28.5k | } |
3005 | 3.36k | } |
3006 | 8.79k | } |
3007 | 185 | } |
3008 | 185 | } |
3009 | | |
3010 | 11 | void processZBO(RWMol &mol, const SubstanceGroup &sg) { |
3011 | 84 | for (auto bidx : sg.getBonds()) { |
3012 | 84 | auto bond = mol.getBondWithIdx(bidx); |
3013 | 84 | bond->setBondType(Bond::BondType::ZERO); |
3014 | 84 | } |
3015 | 11 | } |
3016 | | |
3017 | 193 | void processZCH(RWMol &mol, const SubstanceGroup &sg) { |
3018 | 193 | RDUNUSED_PARAM(mol); |
3019 | 193 | std::vector<std::string> dataFields; |
3020 | 193 | if (sg.getPropIfPresent("DATAFIELDS", dataFields)) { |
3021 | 193 | if (dataFields.empty()) { |
3022 | 3 | BOOST_LOG(rdWarningLog) |
3023 | 0 | << "ignoring ZCHG SGroup without data fields." << std::endl; |
3024 | 3 | return; |
3025 | 3 | } |
3026 | 6.41k | for (const auto &df : dataFields) { |
3027 | 6.41k | std::string trimmed = boost::trim_copy(df); |
3028 | 6.41k | std::vector<std::string> splitLine; |
3029 | 6.41k | boost::split(splitLine, trimmed, boost::is_any_of(";"), |
3030 | 6.41k | boost::token_compress_off); |
3031 | 6.41k | const auto &aids = sg.getAtoms(); |
3032 | 6.41k | if (splitLine.size() < aids.size()) { |
3033 | 419 | BOOST_LOG(rdWarningLog) |
3034 | 0 | << "DATAFIELDS in ZCH SGroup is shorter than the number of atoms in the SGroup. Ignoring it." |
3035 | 0 | << std::endl; |
3036 | 419 | continue; |
3037 | 419 | } |
3038 | 16.4k | for (auto i = 0u; i < aids.size(); ++i) { |
3039 | 10.4k | auto aid = aids[i]; |
3040 | 10.4k | auto atom = mol.getAtomWithIdx(aid); |
3041 | 10.4k | auto val = 0; |
3042 | 10.4k | if (!splitLine[i].empty()) { |
3043 | 10.0k | val = FileParserUtils::toInt(splitLine[i]); |
3044 | 10.0k | } |
3045 | 10.4k | atom->setFormalCharge(val); |
3046 | 10.4k | } |
3047 | 5.99k | } |
3048 | 190 | } |
3049 | 193 | } |
3050 | 67 | void processHYD(RWMol &mol, const SubstanceGroup &sg) { |
3051 | 67 | std::vector<std::string> dataFields; |
3052 | 67 | if (sg.getPropIfPresent("DATAFIELDS", dataFields)) { |
3053 | 67 | if (dataFields.empty()) { |
3054 | 1 | BOOST_LOG(rdWarningLog) |
3055 | 0 | << "ignoring HYD SGroup without data fields." << std::endl; |
3056 | 1 | return; |
3057 | 1 | } |
3058 | 3.37k | for (const auto &df : dataFields) { |
3059 | 3.37k | std::string trimmed = boost::trim_copy(df); |
3060 | 3.37k | std::vector<std::string> splitLine; |
3061 | 3.37k | boost::split(splitLine, trimmed, boost::is_any_of(";"), |
3062 | 3.37k | boost::token_compress_off); |
3063 | 3.37k | const auto &aids = sg.getAtoms(); |
3064 | 3.37k | if (splitLine.size() < aids.size()) { |
3065 | 1.62k | BOOST_LOG(rdWarningLog) |
3066 | 0 | << "DATAFIELDS in HYD SGroup is shorter than the number of atoms in the SGroup. Ignoring it." |
3067 | 0 | << std::endl; |
3068 | 1.62k | continue; |
3069 | 1.62k | } |
3070 | 3.17k | for (auto i = 0u; i < aids.size(); ++i) { |
3071 | 1.41k | auto aid = aids[i]; |
3072 | 1.41k | auto atom = mol.getAtomWithIdx(aid); |
3073 | 1.41k | auto val = 0; |
3074 | 1.41k | if (!splitLine[i].empty()) { |
3075 | 1.18k | val = FileParserUtils::toInt(splitLine[i]); |
3076 | 1.18k | } |
3077 | 1.41k | atom->setProp("_ZBO_H", true); |
3078 | 1.41k | atom->setNumExplicitHs(val); |
3079 | 1.41k | } |
3080 | 1.75k | } |
3081 | 66 | } |
3082 | 67 | } |
3083 | | |
3084 | | // process (and remove) SGroups which modify the structure |
3085 | | // and which we can unambiguously apply |
3086 | 4.88k | void processSGroups(RWMol *mol) { |
3087 | 4.88k | std::vector<unsigned int> sgsToRemove; |
3088 | 4.88k | unsigned int sgIdx = 0; |
3089 | 4.88k | for (auto &sg : getSubstanceGroups(*mol)) { |
3090 | 3.09k | if (sg.getProp<std::string>("TYPE") == "DAT") { |
3091 | 964 | std::string field; |
3092 | 964 | if (sg.getPropIfPresent("FIELDNAME", field)) { |
3093 | 727 | if (field == "MRV_COORDINATE_BOND_TYPE") { |
3094 | | // V2000 support for coordinate bonds |
3095 | 16 | processMrvCoordinateBond(*mol, sg); |
3096 | 16 | sgsToRemove.push_back(sgIdx); |
3097 | 16 | continue; |
3098 | 711 | } else if (field == "MRV_IMPLICIT_H") { |
3099 | | // CXN extension to specify implicit Hs, used for aromatic rings |
3100 | 185 | processMrvImplicitH(*mol, sg); |
3101 | 185 | sgsToRemove.push_back(sgIdx); |
3102 | 185 | continue; |
3103 | 526 | } else if (field == "ZBO") { |
3104 | | // RDKit extension for zero-order bonds |
3105 | 11 | processZBO(*mol, sg); |
3106 | 11 | sgsToRemove.push_back(sgIdx); |
3107 | 11 | continue; |
3108 | 515 | } else if (field == "ZCH") { |
3109 | | // RDKit extension for charge on atoms involved in zero-order bonds |
3110 | 193 | processZCH(*mol, sg); |
3111 | 193 | sgsToRemove.push_back(sgIdx); |
3112 | 193 | continue; |
3113 | 322 | } else if (field == "HYD") { |
3114 | | // RDKit extension for hydrogen-count on atoms involved in |
3115 | | // zero-order bonds |
3116 | 67 | processHYD(*mol, sg); |
3117 | 67 | sgsToRemove.push_back(sgIdx); |
3118 | 67 | continue; |
3119 | 67 | } |
3120 | 727 | } |
3121 | 492 | if (sg.getPropIfPresent("QUERYTYPE", field) && |
3122 | 181 | (field == "SMARTSQ" || field == "SQ")) { |
3123 | 119 | processSMARTSQ(*mol, sg); |
3124 | 119 | sgsToRemove.push_back(sgIdx); |
3125 | 119 | continue; |
3126 | 119 | } |
3127 | 492 | } |
3128 | 2.50k | ++sgIdx; |
3129 | 2.50k | } |
3130 | | // now remove the S groups we processed, we saved indices so do this in |
3131 | | // backwards |
3132 | 4.88k | auto &sgs = getSubstanceGroups(*mol); |
3133 | 5.39k | for (auto it = sgsToRemove.rbegin(); it != sgsToRemove.rend(); ++it) { |
3134 | 508 | sgs.erase(sgs.begin() + *it); |
3135 | 508 | } |
3136 | 4.88k | } |
3137 | | |
3138 | 4.88k | void ProcessMolProps(RWMol *mol) { |
3139 | 4.88k | PRECONDITION(mol, "no molecule"); |
3140 | | // we have to loop the ugly way because we may need to actually replace an |
3141 | | // atom |
3142 | 115k | for (unsigned int aidx = 0; aidx < mol->getNumAtoms(); ++aidx) { |
3143 | 110k | auto atom = mol->getAtomWithIdx(aidx); |
3144 | 110k | int ival = 0; |
3145 | 110k | if (atom->getPropIfPresent(common_properties::molSubstCount, ival) && |
3146 | 0 | ival != 0) { |
3147 | 0 | if (!atom->hasQuery()) { |
3148 | 0 | atom = QueryOps::replaceAtomWithQueryAtom(mol, atom); |
3149 | 0 | } |
3150 | 0 | bool gtQuery = false; |
3151 | 0 | if (ival == -1) { |
3152 | 0 | ival = 0; |
3153 | 0 | } else if (ival == -2) { |
3154 | | // as drawn |
3155 | 0 | ival = atom->getDegree(); |
3156 | 0 | } else if (ival >= 6) { |
3157 | | // 6 or more |
3158 | 0 | gtQuery = true; |
3159 | 0 | } |
3160 | 0 | if (!gtQuery) { |
3161 | 0 | atom->expandQuery(makeAtomExplicitDegreeQuery(ival)); |
3162 | 0 | } else { |
3163 | | // create a temp query the normal way so that we can be sure to get |
3164 | | // the description right |
3165 | 0 | std::unique_ptr<ATOM_EQUALS_QUERY> tmp{ |
3166 | 0 | makeAtomExplicitDegreeQuery(ival)}; |
3167 | 0 | atom->expandQuery(makeAtomSimpleQuery<ATOM_LESSEQUAL_QUERY>( |
3168 | 0 | ival, tmp->getDataFunc(), |
3169 | 0 | std::string("less_") + tmp->getDescription())); |
3170 | 0 | } |
3171 | 0 | } |
3172 | 110k | if (atom->getPropIfPresent(common_properties::molTotValence, ival) && |
3173 | 3.54k | ival != 0 && !atom->hasProp("_ZBO_H")) { |
3174 | 3.54k | atom->setNoImplicit(true); |
3175 | 3.54k | if (ival == 15 // V2000 |
3176 | 3.53k | || ival == -1 // v3000 |
3177 | 3.54k | ) { |
3178 | 23 | atom->setNumExplicitHs(0); |
3179 | 3.52k | } else { |
3180 | 3.52k | if (static_cast<int>(atom->getValence(Atom::ValenceType::EXPLICIT)) > |
3181 | 3.52k | ival) { |
3182 | 470 | BOOST_LOG(rdWarningLog) |
3183 | 0 | << "atom " << atom->getIdx() << " has specified valence (" << ival |
3184 | 0 | << ") smaller than the drawn valence " |
3185 | 0 | << atom->getValence(Atom::ValenceType::EXPLICIT) << "." |
3186 | 0 | << std::endl; |
3187 | 470 | atom->setNumExplicitHs(0); |
3188 | 3.05k | } else { |
3189 | 3.05k | atom->setNumExplicitHs(ival - |
3190 | 3.05k | atom->getValence(Atom::ValenceType::EXPLICIT)); |
3191 | 3.05k | } |
3192 | 3.52k | } |
3193 | 3.54k | } |
3194 | 110k | atom->clearProp(common_properties::molTotValence); |
3195 | 110k | } |
3196 | 4.88k | processSGroups(mol); |
3197 | 4.88k | } |
3198 | | |
3199 | | } // namespace |
3200 | | namespace FileParserUtils { |
3201 | | bool ParseV3000CTAB(std::istream *inStream, unsigned int &line, RWMol *mol, |
3202 | | Conformer *&conf, bool &chiralityPossible, |
3203 | | unsigned int &nAtoms, unsigned int &nBonds, |
3204 | | bool strictParsing, bool expectMEND, |
3205 | 4.74k | bool expectMacroAtoms) { |
3206 | 4.74k | PRECONDITION(inStream, "bad stream"); |
3207 | 4.74k | PRECONDITION(mol, "bad molecule"); |
3208 | | |
3209 | 4.74k | std::string tempStr; |
3210 | 4.74k | std::vector<std::string> splitLine; |
3211 | | |
3212 | 4.74k | bool fileComplete = false; |
3213 | | |
3214 | 4.74k | tempStr = getV3000Line(inStream, line); |
3215 | 4.74k | boost::to_upper(tempStr); |
3216 | 4.74k | if (tempStr.length() < 10 || tempStr.substr(0, 10) != "BEGIN CTAB") { |
3217 | 55 | std::ostringstream errout; |
3218 | 55 | errout << "BEGIN CTAB line not found on line " << line; |
3219 | 55 | throw FileParseException(errout.str()); |
3220 | 55 | } |
3221 | | |
3222 | 4.68k | tempStr = getV3000Line(inStream, line); |
3223 | 4.68k | boost::to_upper(tempStr); |
3224 | 4.68k | if (tempStr.size() < 8 || tempStr.substr(0, 7) != "COUNTS ") { |
3225 | 16 | std::ostringstream errout; |
3226 | 16 | errout << "Bad counts line : '" << tempStr << "' on line " << line; |
3227 | 16 | throw FileParseException(errout.str()); |
3228 | 16 | } |
3229 | 4.67k | std::string trimmed = |
3230 | 4.67k | boost::trim_copy(tempStr.substr(7, tempStr.length() - 7)); |
3231 | 4.67k | boost::split(splitLine, trimmed, boost::is_any_of(" \t"), |
3232 | 4.67k | boost::token_compress_on); |
3233 | 4.67k | if (splitLine.size() < 2) { |
3234 | 6 | std::ostringstream errout; |
3235 | 6 | errout << "Bad counts line : '" << tempStr << "' on line " << line; |
3236 | 6 | throw FileParseException(errout.str()); |
3237 | 6 | } |
3238 | | |
3239 | 4.66k | nAtoms = FileParserUtils::toUnsigned(splitLine[0]); |
3240 | 4.66k | nBonds = FileParserUtils::toUnsigned(splitLine[1]); |
3241 | 4.66k | conf = new Conformer(nAtoms); |
3242 | | |
3243 | 4.66k | unsigned int nSgroups = 0, n3DConstraints = 0, chiralFlag = 0; |
3244 | | |
3245 | 4.66k | if (splitLine.size() > 2) { |
3246 | 2.17k | nSgroups = FileParserUtils::toUnsigned(splitLine[2]); |
3247 | 2.17k | } |
3248 | 4.66k | if (splitLine.size() > 3) { |
3249 | 1.21k | n3DConstraints = FileParserUtils::toUnsigned(splitLine[3]); |
3250 | 1.21k | } |
3251 | 4.66k | if (splitLine.size() > 4) { |
3252 | 185 | chiralFlag = FileParserUtils::toUnsigned(splitLine[4]); |
3253 | 185 | } |
3254 | | |
3255 | 4.66k | mol->setProp(common_properties::_MolFileChiralFlag, chiralFlag); |
3256 | | |
3257 | 4.66k | if (nAtoms) { |
3258 | 1.77k | ParseV3000AtomBlock(inStream, line, nAtoms, mol, conf, strictParsing, |
3259 | 1.77k | expectMacroAtoms); |
3260 | 1.77k | } |
3261 | 4.66k | if (nBonds) { |
3262 | 1.00k | ParseV3000BondBlock(inStream, line, nBonds, mol, chiralityPossible); |
3263 | 1.00k | } |
3264 | | |
3265 | 4.66k | tempStr = getV3000Line(inStream, line); |
3266 | | // do link nodes: |
3267 | 4.66k | boost::to_upper(tempStr); |
3268 | 6.44k | while (tempStr.length() > 8 && tempStr.substr(0, 8) == "LINKNODE") { |
3269 | 1.77k | boost::to_upper(tempStr); |
3270 | | // if the line has nothing on it we just ignore it |
3271 | 1.77k | if (tempStr.size() > 9) { |
3272 | 1.72k | std::string existing = ""; |
3273 | 1.72k | if (mol->getPropIfPresent(common_properties::molFileLinkNodes, |
3274 | 1.72k | existing)) { |
3275 | 1.65k | existing += "|"; |
3276 | 1.65k | } |
3277 | 1.72k | existing += tempStr.substr(9); // skip the "LINKNODE " |
3278 | 1.72k | mol->setProp(common_properties::molFileLinkNodes, existing); |
3279 | 1.72k | } |
3280 | 1.77k | tempStr = getV3000Line(inStream, line); |
3281 | 1.77k | } |
3282 | | |
3283 | 4.66k | bool sgroupFound = false; |
3284 | 4.66k | bool obj3dFound = false; |
3285 | 4.66k | boost::to_upper(tempStr); |
3286 | 13.0k | while (tempStr.length() > 5 && tempStr.substr(0, 5) == "BEGIN") { |
3287 | 8.41k | if (tempStr.length() >= 12 && tempStr.substr(0, 12) == "BEGIN SGROUP") { |
3288 | 1.43k | if (sgroupFound) { |
3289 | 6 | std::ostringstream errout; |
3290 | 6 | errout << "BEGIN SGROUP found more than once on line " << line; |
3291 | 6 | throw FileParseException(errout.str()); |
3292 | | |
3293 | 1.42k | } else if (!nSgroups) { |
3294 | 58 | std::ostringstream errout; |
3295 | 58 | errout << "BEGIN SGROUP found but Sgroups NOT expected on line " |
3296 | 58 | << line; |
3297 | 58 | if (strictParsing) { |
3298 | 1 | throw FileParseException(errout.str()); |
3299 | 57 | } else { |
3300 | 57 | BOOST_LOG(rdWarningLog) << errout.str() << std::endl; |
3301 | | // Prepare to read a lot of sgroups |
3302 | 57 | nSgroups = std::numeric_limits<unsigned int>::max(); |
3303 | 57 | } |
3304 | 58 | } |
3305 | 1.42k | sgroupFound = true; |
3306 | 1.42k | tempStr = |
3307 | 1.42k | ParseV3000SGroupsBlock(inStream, line, nSgroups, mol, strictParsing); |
3308 | 1.42k | boost::to_upper(tempStr); |
3309 | 1.42k | if (tempStr.length() < 10 || tempStr.substr(0, 10) != "END SGROUP") { |
3310 | 87 | std::ostringstream errout; |
3311 | 87 | errout << "END SGROUP line not found on line " << line; |
3312 | 87 | if (strictParsing) { |
3313 | 2 | throw FileParseException(errout.str()); |
3314 | 85 | } else { |
3315 | 85 | BOOST_LOG(rdWarningLog) << errout.str() << std::endl; |
3316 | 85 | } |
3317 | 1.33k | } else { |
3318 | 1.33k | tempStr = getV3000Line(inStream, line); |
3319 | 1.33k | boost::to_upper(tempStr); |
3320 | 1.33k | } |
3321 | | |
3322 | 6.98k | } else if (tempStr.length() >= 15 && |
3323 | 5.68k | tempStr.substr(6, 10) == "COLLECTION") { |
3324 | 4.12k | tempStr = parseEnhancedStereo(inStream, line, mol, strictParsing); |
3325 | 4.12k | boost::to_upper(tempStr); |
3326 | 4.12k | } else if (tempStr.length() >= 11 && |
3327 | 2.48k | tempStr.substr(0, 11) == "BEGIN OBJ3D") { |
3328 | 51 | if (obj3dFound) { |
3329 | 1 | std::ostringstream errout; |
3330 | 1 | errout << "BEGIN OBJ3D found more than once on line " << line; |
3331 | 1 | throw FileParseException(errout.str()); |
3332 | 1 | } |
3333 | 50 | if (!n3DConstraints) { |
3334 | 30 | std::ostringstream errout; |
3335 | 30 | errout << "BEGIN OBJ3D found but 3n3DConstraints NOT expected on line " |
3336 | 30 | << line; |
3337 | 30 | if (strictParsing) { |
3338 | 1 | throw FileParseException(errout.str()); |
3339 | 29 | } else { |
3340 | 29 | BOOST_LOG(rdWarningLog) << errout.str() << std::endl; |
3341 | 29 | } |
3342 | 30 | } |
3343 | 49 | BOOST_LOG(rdWarningLog) |
3344 | 0 | << "3D constraint information in mol block ignored at line " << line |
3345 | 0 | << std::endl; |
3346 | 49 | obj3dFound = true; |
3347 | 451 | for (unsigned int i = 0; i < n3DConstraints; ++i) { |
3348 | 402 | tempStr = getV3000Line(inStream, line); |
3349 | 402 | } |
3350 | 49 | tempStr = getV3000Line(inStream, line); |
3351 | 49 | boost::to_upper(tempStr); |
3352 | 49 | if (tempStr.length() < 9 || tempStr.substr(0, 9) != "END OBJ3D") { |
3353 | 31 | std::ostringstream errout; |
3354 | 31 | errout << "END OBJ3D line not found on line " << line; |
3355 | 31 | if (strictParsing) { |
3356 | 0 | throw FileParseException(errout.str()); |
3357 | 31 | } else { |
3358 | 31 | BOOST_LOG(rdWarningLog) << errout.str() << std::endl; |
3359 | 31 | } |
3360 | 31 | } |
3361 | 49 | tempStr = getV3000Line(inStream, line); |
3362 | 49 | boost::to_upper(tempStr); |
3363 | 2.80k | } else { |
3364 | | // skip blocks we don't know how to read |
3365 | 2.80k | BOOST_LOG(rdWarningLog) << "skipping block at line " << line << ": '" |
3366 | 0 | << tempStr << "'" << std::endl; |
3367 | 8.05k | while (tempStr.length() < 3 || tempStr.substr(0, 3) != "END") { |
3368 | 5.25k | tempStr = getV3000Line(inStream, line); |
3369 | 5.25k | } |
3370 | 2.80k | tempStr = getV3000Line(inStream, line); |
3371 | 2.80k | boost::to_upper(tempStr); |
3372 | 2.80k | } |
3373 | 8.41k | } |
3374 | | |
3375 | 4.65k | if (nSgroups && !sgroupFound) { |
3376 | 52 | std::ostringstream errout; |
3377 | 52 | errout << "BEGIN SGROUP line not found on line " << line; |
3378 | 52 | if (strictParsing) { |
3379 | 9 | throw FileParseException(errout.str()); |
3380 | 43 | } else { |
3381 | 43 | BOOST_LOG(rdWarningLog) << errout.str() << std::endl; |
3382 | 43 | } |
3383 | 52 | } |
3384 | | |
3385 | 4.64k | if (n3DConstraints && !obj3dFound) { |
3386 | 42 | std::ostringstream errout; |
3387 | 42 | errout << "BEGIN OBJ3D line not found on line " << line; |
3388 | 42 | if (strictParsing) { |
3389 | 4 | throw FileParseException(errout.str()); |
3390 | 38 | } else { |
3391 | 38 | BOOST_LOG(rdWarningLog) << errout.str() << std::endl; |
3392 | 38 | } |
3393 | 42 | } |
3394 | | |
3395 | 4.64k | boost::to_upper(tempStr); |
3396 | 4.64k | if (tempStr.length() < 8 || tempStr.substr(0, 8) != "END CTAB") { |
3397 | 208 | if (strictParsing) { |
3398 | 44 | throw FileParseException("END CTAB line not found"); |
3399 | 164 | } else { |
3400 | 164 | BOOST_LOG(rdWarningLog) << "END CTAB line not found." << std::endl; |
3401 | 164 | } |
3402 | 208 | } |
3403 | | |
3404 | 4.59k | if (expectMEND) { |
3405 | 165 | tempStr = getLine(inStream); |
3406 | 165 | ++line; |
3407 | 165 | if (tempStr[0] == 'M' && tempStr.substr(0, 6) == "M END") { |
3408 | 21 | fileComplete = true; |
3409 | 21 | } |
3410 | 4.43k | } else { |
3411 | 4.43k | fileComplete = true; |
3412 | 4.43k | } |
3413 | | |
3414 | 4.59k | auto is3d = calculate3dFlag(*mol, *conf, chiralityPossible); |
3415 | 4.59k | conf->set3D(is3d); |
3416 | 4.59k | mol->addConformer(conf, true); |
3417 | 4.59k | conf = nullptr; |
3418 | | |
3419 | 4.59k | return fileComplete; |
3420 | 4.64k | } |
3421 | | |
3422 | | bool ParseV2000CTAB(std::istream *inStream, unsigned int &line, RWMol *mol, |
3423 | | Conformer *&conf, bool &chiralityPossible, |
3424 | | unsigned int &nAtoms, unsigned int &nBonds, |
3425 | 18.4k | bool strictParsing) { |
3426 | 18.4k | conf = new Conformer(nAtoms); |
3427 | | |
3428 | 18.4k | if (nAtoms == 0) { |
3429 | 3.66k | conf->set3D(false); |
3430 | 14.8k | } else { |
3431 | 14.8k | ParseMolBlockAtoms(inStream, line, nAtoms, mol, conf, strictParsing); |
3432 | 14.8k | } |
3433 | 18.4k | ParseMolBlockBonds(inStream, line, nBonds, mol, chiralityPossible); |
3434 | | |
3435 | 18.4k | auto is3d = calculate3dFlag(*mol, *conf, chiralityPossible); |
3436 | 18.4k | conf->set3D(is3d); |
3437 | 18.4k | mol->addConformer(conf, true); |
3438 | 18.4k | conf = nullptr; |
3439 | | |
3440 | 18.4k | bool fileComplete = |
3441 | 18.4k | ParseMolBlockProperties(inStream, line, mol, strictParsing); |
3442 | 18.4k | return fileComplete; |
3443 | 18.4k | } |
3444 | | |
3445 | | void finishMolProcessing( |
3446 | | RWMol *res, bool chiralityPossible, |
3447 | 4.90k | const RDKit::v2::FileParsers::MolFileParserParams ¶ms) { |
3448 | 4.90k | if (!res) { |
3449 | 0 | return; |
3450 | 0 | } |
3451 | 4.90k | res->clearAllAtomBookmarks(); |
3452 | 4.90k | res->clearAllBondBookmarks(); |
3453 | | |
3454 | 4.90k | if (params.expandAttachmentPoints) { |
3455 | 0 | MolOps::expandAttachmentPoints(*res); |
3456 | 0 | } |
3457 | | |
3458 | | // calculate explicit valence on each atom: |
3459 | 110k | for (auto atom : res->atoms()) { |
3460 | 110k | atom->calcExplicitValence(false); |
3461 | 110k | } |
3462 | | |
3463 | | // postprocess mol file flags |
3464 | 4.90k | ProcessMolProps(res); |
3465 | | |
3466 | | // update the chirality and stereo-chemistry |
3467 | | // |
3468 | | // NOTE: we detect the stereochemistry before sanitizing/removing |
3469 | | // hydrogens because the removal of H atoms may actually remove |
3470 | | // the wedged bond from the molecule. This wipes out the only |
3471 | | // sign that chirality ever existed and makes us sad... so first |
3472 | | // perceive chirality, then remove the Hs and sanitize. |
3473 | | // |
3474 | 4.90k | const Conformer &conf = res->getConformer(); |
3475 | 4.90k | if (chiralityPossible || conf.is3D()) { |
3476 | 3.51k | if (!conf.is3D()) { |
3477 | 418 | bool replaceExistingTags = true; |
3478 | 418 | MolOps::assignChiralTypesFromBondDirs(*res, conf.getId(), |
3479 | 418 | replaceExistingTags); |
3480 | 3.09k | } else { |
3481 | 3.09k | res->updatePropertyCache(false); |
3482 | 3.09k | MolOps::assignChiralTypesFrom3D(*res, conf.getId(), true); |
3483 | 3.09k | } |
3484 | 3.51k | } |
3485 | | |
3486 | 4.90k | Atropisomers::detectAtropisomerChirality(*res, &conf); |
3487 | | |
3488 | | // now that atom stereochem has been perceived, the wedging |
3489 | | // information is no longer needed, so we clear |
3490 | | // single bond dir flags: |
3491 | 4.90k | MolOps::clearSingleBondDirFlags(*res); |
3492 | | |
3493 | 4.90k | if (params.sanitize) { |
3494 | 4.09k | if (params.removeHs) { |
3495 | | // Bond stereo detection must happen before H removal, or |
3496 | | // else we might be removing stereogenic H atoms in double |
3497 | | // bonds (e.g. imines). But before we run stereo detection, |
3498 | | // we need to run mol cleanup so don't have trouble with |
3499 | | // e.g. nitro groups. Sadly, this a;; means we will find |
3500 | | // run both cleanup and ring finding twice (a fast find |
3501 | | // rings in bond stereo detection, and another in |
3502 | | // sanitization's SSSR symmetrization). |
3503 | 2.50k | unsigned int failedOp = 0; |
3504 | 2.50k | MolOps::sanitizeMol(*res, failedOp, MolOps::SANITIZE_CLEANUP); |
3505 | 2.50k | MolOps::detectBondStereochemistry(*res); |
3506 | 2.50k | MolOps::removeHs(*res); |
3507 | 2.50k | } else { |
3508 | 1.59k | MolOps::sanitizeMol(*res); |
3509 | 1.59k | MolOps::detectBondStereochemistry(*res); |
3510 | 1.59k | } |
3511 | | |
3512 | 4.09k | MolOps::assignStereochemistry(*res, true, true, true); |
3513 | 4.09k | } else { |
3514 | 805 | MolOps::detectBondStereochemistry(*res); |
3515 | 805 | } |
3516 | | |
3517 | 4.90k | if (res->hasProp(common_properties::_NeedsQueryScan)) { |
3518 | 79 | res->clearProp(common_properties::_NeedsQueryScan); |
3519 | 79 | QueryOps::completeMolQueries(res); |
3520 | 79 | } |
3521 | 4.90k | } |
3522 | | } // namespace FileParserUtils |
3523 | | |
3524 | | namespace v2 { |
3525 | | namespace FileParsers { |
3526 | | //------------------------------------------------ |
3527 | | // |
3528 | | // Read a molecule from a stream |
3529 | | // |
3530 | | //------------------------------------------------ |
3531 | | std::unique_ptr<RWMol> MolFromMolDataStream(std::istream &inStream, |
3532 | | unsigned int &line, |
3533 | 23.4k | const MolFileParserParams ¶ms) { |
3534 | 23.4k | std::string tempStr; |
3535 | 23.4k | bool fileComplete = false; |
3536 | 23.4k | bool chiralityPossible = false; |
3537 | 23.4k | Utils::LocaleSwitcher ls; |
3538 | | // mol name |
3539 | 23.4k | line++; |
3540 | 23.4k | tempStr = getLine(inStream); |
3541 | 23.4k | if (inStream.eof()) { |
3542 | 42 | return nullptr; |
3543 | 42 | } |
3544 | 23.4k | auto res = std::make_unique<RWMol>(); |
3545 | 23.4k | res->setProp(common_properties::_Name, tempStr); |
3546 | | |
3547 | | // info |
3548 | 23.4k | line++; |
3549 | 23.4k | tempStr = getLine(inStream); |
3550 | 23.4k | res->setProp("_MolFileInfo", tempStr); |
3551 | 23.4k | if (tempStr.length() >= 22) { |
3552 | 2.66k | std::string dimLabel = tempStr.substr(20, 2); |
3553 | | // Unless labelled as 3D we assume 2D |
3554 | 2.66k | if (dimLabel == "3d" || dimLabel == "3D") { |
3555 | 418 | res->setProp(common_properties::_3DConf, 1); |
3556 | 418 | } |
3557 | 2.66k | } |
3558 | | // comments |
3559 | 23.4k | line++; |
3560 | 23.4k | tempStr = getLine(inStream); |
3561 | 23.4k | res->setProp("_MolFileComments", tempStr); |
3562 | | |
3563 | 23.4k | unsigned int nAtoms = 0, nBonds = 0, nLists = 0, chiralFlag = 0, nsText = 0, |
3564 | 23.4k | nRxnComponents = 0; |
3565 | 23.4k | int nReactants = 0, nProducts = 0, nIntermediates = 0; |
3566 | 23.4k | (void)nLists; // read from the file but unused |
3567 | 23.4k | (void)nsText; |
3568 | 23.4k | (void)nRxnComponents; |
3569 | 23.4k | (void)nReactants; |
3570 | 23.4k | (void)nProducts; |
3571 | 23.4k | (void)nIntermediates; |
3572 | | // counts line, this is where we really get started |
3573 | 23.4k | line++; |
3574 | 23.4k | tempStr = getLine(inStream); |
3575 | | |
3576 | 23.4k | if (tempStr.size() < 6) { |
3577 | 101 | if (res) { |
3578 | 101 | res = nullptr; |
3579 | 101 | } |
3580 | 101 | std::ostringstream errout; |
3581 | 101 | errout << "Counts line too short: '" << tempStr << "' on line" << line; |
3582 | 101 | throw FileParseException(errout.str()); |
3583 | 101 | } |
3584 | | |
3585 | 23.3k | unsigned int spos = 0; |
3586 | | // this needs to go into a try block because if the lexical_cast throws an |
3587 | | // exception we want to catch throw a different exception |
3588 | 23.3k | try { |
3589 | 23.3k | nAtoms = FileParserUtils::toUnsigned(tempStr.substr(spos, 3), true); |
3590 | 23.3k | spos = 3; |
3591 | 23.3k | nBonds = FileParserUtils::toUnsigned(tempStr.substr(spos, 3), true); |
3592 | 23.3k | spos = 6; |
3593 | 23.3k | } catch (boost::bad_lexical_cast &) { |
3594 | 61 | std::ostringstream errout; |
3595 | 61 | errout << "Cannot convert '" << tempStr.substr(spos, 3) |
3596 | 61 | << "' to unsigned int on line " << line; |
3597 | 61 | throw FileParseException(errout.str()); |
3598 | 61 | } |
3599 | 23.2k | try { |
3600 | 23.2k | spos = 6; |
3601 | 23.2k | if (tempStr.size() >= 9) { |
3602 | 11.0k | nLists = FileParserUtils::toUnsigned(tempStr.substr(spos, 3), true); |
3603 | 11.0k | } |
3604 | | |
3605 | 23.2k | spos = 12; |
3606 | 23.2k | if (tempStr.size() >= spos + 3) { |
3607 | 3.45k | chiralFlag = FileParserUtils::toUnsigned(tempStr.substr(spos, 3), true); |
3608 | 3.45k | } |
3609 | | |
3610 | 23.2k | spos = 15; |
3611 | 23.2k | if (tempStr.size() >= spos + 3) { |
3612 | 2.11k | nsText = FileParserUtils::toUnsigned(tempStr.substr(spos, 3), true); |
3613 | 2.11k | } |
3614 | | |
3615 | 23.2k | spos = 18; |
3616 | 23.2k | if (tempStr.size() >= spos + 3) { |
3617 | 1.47k | nRxnComponents = |
3618 | 1.47k | FileParserUtils::toUnsigned(tempStr.substr(spos, 3), true); |
3619 | 1.47k | } |
3620 | | |
3621 | 23.2k | spos = 21; |
3622 | 23.2k | if (tempStr.size() >= spos + 3) { |
3623 | 1.38k | nReactants = FileParserUtils::toUnsigned(tempStr.substr(spos, 3), true); |
3624 | 1.38k | } |
3625 | | |
3626 | 23.2k | spos = 24; |
3627 | 23.2k | if (tempStr.size() >= spos + 3) { |
3628 | 1.25k | nProducts = FileParserUtils::toUnsigned(tempStr.substr(spos, 3), true); |
3629 | 1.25k | } |
3630 | | |
3631 | 23.2k | spos = 27; |
3632 | 23.2k | if (tempStr.size() >= spos + 3) { |
3633 | 1.19k | nIntermediates = |
3634 | 1.19k | FileParserUtils::toUnsigned(tempStr.substr(spos, 3), true); |
3635 | 1.19k | } |
3636 | | |
3637 | 23.2k | } catch (boost::bad_lexical_cast &) { |
3638 | | // some SD files (such as some from NCI) lack all the extra information |
3639 | | // on the header line, so ignore problems parsing there. |
3640 | 9.34k | } |
3641 | | |
3642 | 23.2k | unsigned int ctabVersion = 2000; |
3643 | 23.2k | if (tempStr.size() > 35) { |
3644 | 7.34k | if (tempStr.size() < 39 || tempStr[34] != 'V') { |
3645 | 1.10k | std::ostringstream errout; |
3646 | 1.10k | errout << "CTAB version string invalid at line " << line; |
3647 | 1.10k | if (params.strictParsing) { |
3648 | 40 | throw FileParseException(errout.str()); |
3649 | 1.06k | } else { |
3650 | 1.06k | BOOST_LOG(rdWarningLog) << errout.str() << std::endl; |
3651 | 1.06k | } |
3652 | 6.23k | } else if (tempStr.substr(34, 5) == "V3000") { |
3653 | 4.74k | ctabVersion = 3000; |
3654 | 4.74k | } else if (tempStr.substr(34, 5) != "V2000") { |
3655 | 503 | std::ostringstream errout; |
3656 | 503 | errout << "Unsupported CTAB version: '" << tempStr.substr(34, 5) |
3657 | 503 | << "' at line " << line; |
3658 | 503 | if (params.strictParsing) { |
3659 | 2 | throw FileParseException(errout.str()); |
3660 | 501 | } else { |
3661 | 501 | BOOST_LOG(rdWarningLog) << errout.str() << std::endl; |
3662 | 501 | } |
3663 | 992 | } else if (params.parsingSCSRMol) { |
3664 | 0 | std::ostringstream errout; |
3665 | 0 | errout << "SCSR Mol files is not V3000 at line" << line; |
3666 | 0 | throw FileParseException(errout.str()); |
3667 | 0 | } |
3668 | 7.34k | } |
3669 | | |
3670 | 23.2k | res->setProp(common_properties::_MolFileChiralFlag, chiralFlag); |
3671 | | |
3672 | 23.2k | Conformer *conf = nullptr; |
3673 | 23.2k | try { |
3674 | 23.2k | if (ctabVersion == 2000) { |
3675 | 18.4k | fileComplete = FileParserUtils::ParseV2000CTAB( |
3676 | 18.4k | &inStream, line, res.get(), conf, chiralityPossible, nAtoms, nBonds, |
3677 | 18.4k | params.strictParsing); |
3678 | 18.4k | } else { |
3679 | 4.74k | if (nAtoms != 0 || nBonds != 0) { |
3680 | 894 | std::ostringstream errout; |
3681 | 894 | errout << "V3000 mol blocks should have 0s in the initial counts line. " |
3682 | 894 | "(line: " |
3683 | 894 | << line << ")"; |
3684 | 894 | if (params.strictParsing) { |
3685 | 1 | throw FileParseException(errout.str()); |
3686 | 893 | } else { |
3687 | 893 | BOOST_LOG(rdWarningLog) << errout.str() << std::endl; |
3688 | 893 | } |
3689 | 894 | } |
3690 | | |
3691 | 4.74k | auto expectMEND = true; |
3692 | 4.74k | auto expectMacroAtoms = false; |
3693 | 4.74k | if (params.parsingSCSRMol) { |
3694 | 0 | expectMEND = false; |
3695 | 0 | expectMacroAtoms = true; |
3696 | 0 | } |
3697 | | |
3698 | 4.74k | fileComplete = FileParserUtils::ParseV3000CTAB( |
3699 | 4.74k | &inStream, line, res.get(), conf, chiralityPossible, nAtoms, nBonds, |
3700 | 4.74k | params.strictParsing, expectMEND, expectMacroAtoms); |
3701 | 4.74k | } |
3702 | 23.2k | } catch (MolFileUnhandledFeatureException &e) { |
3703 | | // unhandled mol file feature, show an error |
3704 | 58 | res.reset(); |
3705 | 58 | delete conf; |
3706 | 58 | conf = nullptr; |
3707 | 58 | BOOST_LOG(rdErrorLog) << " Unhandled CTAB feature: '" << e.what() |
3708 | 0 | << "'. Molecule skipped." << std::endl; |
3709 | | |
3710 | 58 | if (!inStream.eof()) { |
3711 | 54 | tempStr = getLine(inStream); |
3712 | 54 | } |
3713 | 58 | ++line; |
3714 | 1.30k | while (!inStream.eof() && !inStream.fail() && |
3715 | 1.25k | tempStr.substr(0, 6) != "M END" && tempStr.substr(0, 4) != "$$$$") { |
3716 | 1.25k | tempStr = getLine(inStream); |
3717 | 1.25k | ++line; |
3718 | 1.25k | } |
3719 | 58 | fileComplete = !inStream.eof() || tempStr.substr(0, 6) == "M END" || |
3720 | 54 | tempStr.substr(0, 4) == "$$$$"; |
3721 | 12.3k | } catch (FileParseException &e) { |
3722 | | // catch our exceptions and throw them back after cleanup |
3723 | 12.3k | delete conf; |
3724 | 12.3k | conf = nullptr; |
3725 | 12.3k | throw e; |
3726 | 12.3k | } |
3727 | | |
3728 | 9.42k | if (!fileComplete) { |
3729 | 4.52k | delete conf; |
3730 | 4.52k | conf = nullptr; |
3731 | 4.52k | std::ostringstream errout; |
3732 | 4.52k | errout |
3733 | 4.52k | << "Problems encountered parsing Mol data, M END missing around line " |
3734 | 4.52k | << line; |
3735 | 4.52k | throw FileParseException(errout.str()); |
3736 | 4.52k | } |
3737 | | |
3738 | 4.90k | if (res) { |
3739 | 4.90k | FileParserUtils::finishMolProcessing(res.get(), chiralityPossible, params); |
3740 | 4.90k | } |
3741 | 4.90k | return res; |
3742 | 9.42k | } |
3743 | | |
3744 | | //------------------------------------------------ |
3745 | | // |
3746 | | // Read a molecule from a string |
3747 | | // |
3748 | | //------------------------------------------------ |
3749 | | std::unique_ptr<RWMol> MolFromMolBlock(const std::string &molBlock, |
3750 | 0 | const MolFileParserParams ¶ms) { |
3751 | 0 | std::istringstream inStream(molBlock); |
3752 | 0 | unsigned int line = 0; |
3753 | 0 | return MolFromMolDataStream(inStream, line, params); |
3754 | 0 | } |
3755 | | |
3756 | | //------------------------------------------------ |
3757 | | // |
3758 | | // Read a molecule from a file |
3759 | | // |
3760 | | //------------------------------------------------ |
3761 | | std::unique_ptr<RWMol> MolFromMolFile(const std::string &fName, |
3762 | 0 | const MolFileParserParams ¶ms) { |
3763 | 0 | std::ifstream inStream(fName.c_str()); |
3764 | 0 | if (!inStream || (inStream.bad())) { |
3765 | 0 | std::ostringstream errout; |
3766 | 0 | errout << "Bad input file " << fName; |
3767 | 0 | throw BadFileException(errout.str()); |
3768 | 0 | } |
3769 | 0 | if (!inStream.eof()) { |
3770 | 0 | unsigned int line = 0; |
3771 | 0 | return MolFromMolDataStream(inStream, line, params); |
3772 | 0 | } else { |
3773 | 0 | return std::unique_ptr<RWMol>(); |
3774 | 0 | } |
3775 | 0 | } |
3776 | | } // namespace FileParsers |
3777 | | } // namespace v2 |
3778 | | } // namespace RDKit |