Coverage Report

Created: 2026-04-10 07:02

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/rdkit/Code/GraphMol/FileParsers/MolFileParser.cpp
Line
Count
Source
1
//
2
//  Copyright (C) 2002-2021 Greg Landrum and other RDKit contributors
3
//
4
//   @@ All Rights Reserved @@
5
//  This file is part of the RDKit.
6
//  The contents are covered by the terms of the BSD license
7
//  which is included in the file license.txt, found at the root
8
//  of the RDKit source tree.
9
//
10
#include <RDGeneral/BoostStartInclude.h>
11
#include <boost/lexical_cast.hpp>
12
#include <boost/algorithm/string.hpp>
13
#include <boost/tokenizer.hpp>
14
#include <boost/algorithm/string/trim.hpp>
15
#include <boost/format.hpp>
16
#include <RDGeneral/BoostEndInclude.h>
17
18
#include "FileParsers.h"
19
#include "FileParserUtils.h"
20
#include "MolSGroupParsing.h"
21
#include <GraphMol/FileParsers/MolFileStereochem.h>
22
#include <GraphMol/SmilesParse/SmilesParse.h>
23
#include <GraphMol/RDKitQueries.h>
24
#include <GraphMol/StereoGroup.h>
25
#include <GraphMol/SubstanceGroup.h>
26
#include <GraphMol/Atropisomers.h>
27
#include <RDGeneral/StreamOps.h>
28
#include <RDGeneral/RDLog.h>
29
#include <GraphMol/GenericGroups/GenericGroups.h>
30
#include <GraphMol/QueryOps.h>
31
#include <GraphMol/Chirality.h>
32
33
#include <fstream>
34
#include <RDGeneral/FileParseException.h>
35
#include <RDGeneral/BadFileException.h>
36
#include <RDGeneral/LocaleSwitcher.h>
37
#include <typeinfo>
38
#include <exception>
39
#include <charconv>
40
#include <regex>
41
#include <sstream>
42
#include <locale>
43
#include <cstdlib>
44
#include <cstdio>
45
#include <string_view>
46
47
using namespace RDKit::SGroupParsing;
48
using std::regex;
49
using std::regex_match;
50
using std::smatch;
51
52
namespace RDKit {
53
54
namespace FileParserUtils {
55
56
904k
int toInt(const std::string_view input, bool acceptSpaces) {
57
  // don't need to worry about locale stuff here because
58
  // we're not going to have delimiters
59
60
  // sanity check on the input since strtol doesn't do it for us:
61
904k
  const char *txt = input.data();
62
2.57M
  for (size_t i = 0u; i < input.size() && *txt != '\x00'; ++i) {
63
1.72M
    if ((*txt >= '0' && *txt <= '9') || (acceptSpaces && *txt == ' ') ||
64
1.67M
        *txt == '+' || *txt == '-') {
65
1.67M
      ++txt;
66
1.67M
    } else {
67
51.9k
      throw boost::bad_lexical_cast();
68
51.9k
    }
69
1.72M
  }
70
  // remove leading spaces
71
852k
  txt = input.data();
72
852k
  unsigned int sz = input.size();
73
852k
  if (acceptSpaces) {
74
1.17M
    while (*txt == ' ') {
75
391k
      ++txt;
76
391k
      --sz;
77
      // have we run off the end of the view?
78
391k
      if (sz < 1U) {
79
63.7k
        return 0;
80
63.7k
      }
81
391k
    }
82
852k
  }
83
788k
  int res = 0;
84
788k
  std::from_chars(txt, txt + sz, res);
85
86
788k
  return res;
87
852k
}
88
516k
int toInt(const std::string &input, bool acceptSpaces) {
89
516k
  return toInt(std::string_view(input.c_str()), acceptSpaces);
90
516k
}
91
501k
unsigned int toUnsigned(const std::string_view input, bool acceptSpaces) {
92
  // don't need to worry about locale stuff here because
93
  // we're not going to have delimiters
94
95
  // sanity check on the input since strtol doesn't do it for us:
96
501k
  const char *txt = input.data();
97
1.80M
  for (size_t i = 0u; i < input.size() && *txt != '\x00'; ++i) {
98
1.33M
    if ((*txt >= '0' && *txt <= '9') || (acceptSpaces && *txt == ' ') ||
99
1.30M
        *txt == '+') {
100
1.30M
      ++txt;
101
1.30M
    } else {
102
28.7k
      throw boost::bad_lexical_cast();
103
28.7k
    }
104
1.33M
  }
105
  // remove leading spaces
106
472k
  txt = input.data();
107
472k
  unsigned int sz = input.size();
108
472k
  if (acceptSpaces) {
109
1.02M
    while (*txt == ' ') {
110
556k
      ++txt;
111
556k
      --sz;
112
      // have we run off the end of the view?
113
556k
      if (sz < 1U) {
114
7.71k
        return 0;
115
7.71k
      }
116
556k
    }
117
472k
  }
118
465k
  unsigned int res = 0;
119
465k
  std::from_chars(txt, txt + sz, res);
120
465k
  return res;
121
472k
}
122
81.4k
unsigned int toUnsigned(const std::string &input, bool acceptSpaces) {
123
81.4k
  return toUnsigned(std::string_view(input.c_str()), acceptSpaces);
124
81.4k
}
125
428k
double toDouble(const std::string_view input, bool acceptSpaces) {
126
  // sanity check on the input since strtol doesn't do it for us:
127
428k
  const char *txt = input.data();
128
4.00M
  for (size_t i = 0u; i < input.size() && *txt != '\x00'; ++i) {
129
    // check for ',' and '.' because locale
130
3.58M
    if ((*txt >= '0' && *txt <= '9') || (acceptSpaces && *txt == ' ') ||
131
3.58M
        *txt == '+' || *txt == '-' || *txt == ',' || *txt == '.') {
132
3.58M
      ++txt;
133
3.58M
    } else {
134
5.34k
      throw boost::bad_lexical_cast();
135
5.34k
    }
136
3.58M
  }
137
  // unfortunately from_chars() with doubles didn't work on g++ until v11.1
138
  // and the status with clang is hard to figure out... we remain old-school
139
  // remove leading spaces
140
422k
  double res = atof(input.data());
141
422k
  return res;
142
428k
}
143
26.8k
double toDouble(const std::string &input, bool acceptSpaces) {
144
26.8k
  return toDouble(std::string_view(input.c_str()), acceptSpaces);
145
26.8k
}
146
78.8k
std::string getV3000Line(std::istream *inStream, unsigned int &line) {
147
  // FIX: technically V3K blocks are case-insensitive. We should really be
148
  // up-casing everything here.
149
78.8k
  PRECONDITION(inStream, "bad stream");
150
78.8k
  std::string res;
151
78.8k
  ++line;
152
78.8k
  auto inl = getLine(inStream);
153
78.8k
  std::string_view tempStr = inl;
154
78.8k
  if (tempStr.size() < 7 || tempStr.substr(0, 7) != "M  V30 ") {
155
2.05k
    std::ostringstream errout;
156
2.05k
    errout << "Line " << line << " does not start with 'M  V30 '" << std::endl;
157
2.05k
    throw FileParseException(errout.str());
158
2.05k
  }
159
  // FIX: do we need to handle trailing whitespace after a -?
160
77.3k
  while (tempStr.back() == '-') {
161
    // continuation character, append what we read:
162
586
    res += tempStr.substr(7, tempStr.length() - 8);
163
    // and then read another line:
164
586
    ++line;
165
586
    inl = getLine(inStream);
166
586
    tempStr = inl;
167
586
    if (tempStr.size() < 7 || tempStr.substr(0, 7) != "M  V30 ") {
168
20
      std::ostringstream errout;
169
20
      errout << "Line " << line << " does not start with 'M  V30 '"
170
20
             << std::endl;
171
20
      throw FileParseException(errout.str());
172
20
    }
173
586
  }
174
76.7k
  res += tempStr.substr(7, tempStr.length() - 7);
175
176
76.7k
  return res;
177
76.8k
}
178
179
0
Atom *replaceAtomWithQueryAtom(RWMol *mol, Atom *atom) {
180
0
  return QueryOps::replaceAtomWithQueryAtom(mol, atom);
181
0
}
182
}  // namespace FileParserUtils
183
using RDKit::FileParserUtils::getV3000Line;
184
185
namespace {
186
187
6.58k
bool startsWith(const std::string &haystack, const char *needle, size_t size) {
188
6.58k
  return haystack.compare(0u, size, needle, size) == 0;
189
6.58k
}
190
191
//! parse a collection block to find enhanced stereo groups
192
std::string parseEnhancedStereo(std::istream *inStream, unsigned int &line,
193
4.12k
                                RWMol *mol, bool strictParsing) {
194
  // Lines like (absolute, relative, racemic):
195
  // M  V30 MDLV30/STEABS ATOMS=(2 2 3)
196
  // M  V30 MDLV30/STEREL1 ATOMS=(1 12)
197
  // M  V30 MDLV30/STERAC1 ATOMS=(1 12)
198
4.12k
  const regex stereo_label(
199
4.12k
      R"regex(MDLV30/STE(...)([0-9]*) +ATOMS=\(([0-9]+) +(.*)\) *)regex");
200
201
4.12k
  smatch match;
202
4.12k
  std::vector<StereoGroup> groups;
203
204
  // Read the collection until the end
205
4.12k
  auto tempStr = getV3000Line(inStream, line);
206
4.12k
  boost::to_upper(tempStr);
207
4.12k
  unsigned abs_group_seen = 0;
208
6.68k
  while (!startsWith(tempStr, "END", 3)) {
209
    // If this line in the collection is part of a stereo group
210
2.55k
    if (regex_match(tempStr, match, stereo_label)) {
211
0
      StereoGroupType grouptype = RDKit::StereoGroupType::STEREO_ABSOLUTE;
212
0
      unsigned groupid = 0;
213
214
0
      if (match[1] == "ABS") {
215
0
        grouptype = RDKit::StereoGroupType::STEREO_ABSOLUTE;
216
        // Warn only one per mol about multiple ABS groups
217
0
        if (abs_group_seen == 1) {
218
0
          std::ostringstream errout;
219
0
          errout << "Seen a second ABS stereo group on line " << line
220
0
                 << std::endl;
221
0
          if (strictParsing) {
222
0
            throw FileParseException(errout.str());
223
0
          } else {
224
0
            BOOST_LOG(rdWarningLog) << errout.str() << std::endl;
225
0
          }
226
0
        }
227
0
        ++abs_group_seen;
228
0
      } else if (match[1] == "REL") {
229
0
        grouptype = RDKit::StereoGroupType::STEREO_OR;
230
0
        groupid = FileParserUtils::toUnsigned(match[2], true);
231
0
      } else if (match[1] == "RAC") {
232
0
        grouptype = RDKit::StereoGroupType::STEREO_AND;
233
0
        groupid = FileParserUtils::toUnsigned(match[2], true);
234
0
      } else {
235
0
        std::ostringstream errout;
236
0
        errout << "Unrecognized stereogroup type : '" << tempStr << "' on line"
237
0
               << line;
238
0
        throw FileParseException(errout.str());
239
0
      }
240
241
0
      const unsigned int count = FileParserUtils::toUnsigned(match[3], true);
242
0
      std::vector<Atom *> atoms;
243
0
      std::stringstream ss(match[4]);
244
0
      unsigned int index;
245
0
      for (size_t i = 0; i < count; ++i) {
246
0
        ss >> index;
247
        // atoms are 1 indexed in molfiles
248
0
        atoms.push_back(mol->getAtomWithIdx(index - 1));
249
0
      }
250
0
      std::vector<Bond *> newBonds;
251
0
      groups.emplace_back(grouptype, std::move(atoms), std::move(newBonds),
252
0
                          groupid);
253
2.55k
    } else {
254
      // skip collection types we don't know how to read. Only one documented
255
      // is MDLV30/HILITE
256
2.55k
      BOOST_LOG(rdWarningLog) << "Skipping unrecognized collection type at "
257
0
                                 "line "
258
0
                              << line << ": " << tempStr << std::endl;
259
2.55k
    }
260
2.55k
    tempStr = getV3000Line(inStream, line);
261
2.55k
  }
262
263
4.12k
  if (!groups.empty()) {
264
0
    mol->setStereoGroups(std::move(groups));
265
0
  }
266
4.12k
  tempStr = getV3000Line(inStream, line);
267
4.12k
  return tempStr;
268
4.12k
}
269
270
//*************************************
271
//
272
// Every effort has been made to adhere to MDL's standard
273
// for mol files
274
//
275
//*************************************
276
277
void ParseOldAtomList(RWMol *mol, const std::string_view &text,
278
253
                      unsigned int line) {
279
253
  PRECONDITION(mol, "bad mol");
280
253
  unsigned int idx;
281
253
  try {
282
253
    idx = FileParserUtils::stripSpacesAndCast<unsigned int>(text.substr(0, 3)) -
283
253
          1;
284
253
  } catch (boost::bad_lexical_cast &) {
285
127
    std::ostringstream errout;
286
127
    errout << "Cannot convert '" << text.substr(0, 3) << "' to int on line "
287
127
           << line;
288
127
    throw FileParseException(errout.str());
289
127
  }
290
291
126
  URANGE_CHECK(idx, mol->getNumAtoms());
292
87
  QueryAtom a(*(mol->getAtomWithIdx(idx)));
293
294
87
  auto *q = new ATOM_OR_QUERY;
295
87
  q->setDescription("AtomOr");
296
297
87
  switch (text[4]) {
298
62
    case 'T':
299
62
      q->setNegation(true);
300
62
      break;
301
13
    case 'F':
302
13
      q->setNegation(false);
303
13
      break;
304
12
    default:
305
12
      delete q;
306
12
      std::ostringstream errout;
307
12
      errout << "Unrecognized atom-list query modifier: '" << text[4]
308
12
             << "' on line " << line;
309
12
      throw FileParseException(errout.str());
310
87
  }
311
312
75
  int nQueries;
313
75
  try {
314
75
    nQueries = FileParserUtils::toInt(text.substr(9, 1));
315
75
  } catch (const std::out_of_range &) {
316
4
    delete q;
317
4
    std::ostringstream errout;
318
4
    errout << "Cannot convert position 9 of '" << text << "' to int on line "
319
4
           << line;
320
4
    throw FileParseException(errout.str());
321
7
  } catch (boost::bad_lexical_cast &) {
322
7
    delete q;
323
7
    std::ostringstream errout;
324
7
    errout << "Cannot convert '" << text.substr(9, 1) << "' to int on line "
325
7
           << line;
326
7
    throw FileParseException(errout.str());
327
7
  }
328
329
64
  RANGE_CHECK(0, nQueries, 5);
330
137
  for (int i = 0; i < nQueries; i++) {
331
114
    int pos = 11 + i * 4;
332
114
    int atNum;
333
114
    try {
334
114
      atNum = FileParserUtils::toInt(text.substr(pos, 3));
335
114
    } catch (const std::out_of_range &) {
336
8
      delete q;
337
8
      std::ostringstream errout;
338
8
      errout << "Cannot convert position " << pos << " of '" << text
339
8
             << "' to int on line " << line;
340
8
      throw FileParseException(errout.str());
341
22
    } catch (boost::bad_lexical_cast &) {
342
22
      delete q;
343
22
      std::ostringstream errout;
344
22
      errout << "Cannot convert '" << text.substr(pos, 3) << "' to int on line "
345
22
             << line;
346
22
      throw FileParseException(errout.str());
347
22
    }
348
84
    RANGE_CHECK(0, atNum, 200);  // goofy!
349
74
    q->addChild(
350
74
        QueryAtom::QUERYATOM_QUERY::CHILD_TYPE(makeAtomNumQuery(atNum)));
351
74
    if (!i) {
352
39
      a.setAtomicNum(atNum);
353
39
    }
354
74
  }
355
356
23
  a.setQuery(q);
357
23
  a.setProp(common_properties::_MolFileAtomQuery, 1);
358
359
23
  mol->replaceAtom(idx, &a);
360
23
}
361
362
void ParseChargeLine(RWMol *mol, const std::string &text, bool firstCall,
363
1.69k
                     unsigned int line) {
364
1.69k
  PRECONDITION(mol, "bad mol");
365
1.69k
  PRECONDITION(text.substr(0, 6) == std::string("M  CHG"), "bad charge line");
366
367
  // if this line is specified all the atom other than those specified
368
  // here should carry a charge of 0; but we should only do this once:
369
1.69k
  if (firstCall) {
370
2.73k
    for (ROMol::AtomIterator ai = mol->beginAtoms(); ai != mol->endAtoms();
371
2.45k
         ++ai) {
372
2.45k
      (*ai)->setFormalCharge(0);
373
2.45k
    }
374
275
  }
375
376
1.69k
  int ie, nent;
377
1.69k
  try {
378
1.69k
    nent = FileParserUtils::toInt(text.substr(6, 3));
379
1.69k
  } catch (boost::bad_lexical_cast &) {
380
21
    std::ostringstream errout;
381
21
    errout << "Cannot convert '" << text.substr(6, 3) << "' to int on line "
382
21
           << line;
383
21
    throw FileParseException(errout.str());
384
21
  }
385
1.67k
  int spos = 9;
386
2.87k
  for (ie = 0; ie < nent; ie++) {
387
1.27k
    int aid, chg;
388
1.27k
    try {
389
1.27k
      aid = FileParserUtils::toInt(text.substr(spos, 4));
390
1.27k
      spos += 4;
391
1.27k
      chg = FileParserUtils::toInt(text.substr(spos, 4));
392
1.27k
      spos += 4;
393
1.27k
      mol->getAtomWithIdx(aid - 1)->setFormalCharge(chg);
394
1.27k
    } catch (boost::bad_lexical_cast &) {
395
25
      std::ostringstream errout;
396
25
      errout << "Cannot convert '" << text.substr(spos, 4)
397
25
             << "' to int on line " << line;
398
25
      throw FileParseException(errout.str());
399
25
    }
400
1.27k
  }
401
1.67k
}
402
403
void ParseRadicalLine(RWMol *mol, const std::string &text, bool firstCall,
404
3.68k
                      unsigned int line) {
405
3.68k
  PRECONDITION(mol, "bad mol");
406
3.68k
  PRECONDITION(text.substr(0, 6) == std::string("M  RAD"), "bad charge line");
407
408
  // if this line is specified all the atom other than those specified
409
  // here should carry a charge of 0; but we should only do this once:
410
3.68k
  if (firstCall) {
411
524
    for (ROMol::AtomIterator ai = mol->beginAtoms(); ai != mol->endAtoms();
412
309
         ++ai) {
413
309
      (*ai)->setFormalCharge(0);
414
309
    }
415
215
  }
416
417
3.68k
  int ie, nent;
418
3.68k
  try {
419
3.68k
    nent = FileParserUtils::toInt(text.substr(6, 3));
420
3.68k
  } catch (boost::bad_lexical_cast &) {
421
14
    std::ostringstream errout;
422
14
    errout << "Cannot convert '" << text.substr(6, 3) << "' to int on line "
423
14
           << line;
424
14
    throw FileParseException(errout.str());
425
14
  }
426
3.67k
  int spos = 9;
427
7.46k
  for (ie = 0; ie < nent; ie++) {
428
3.86k
    int aid, rad;
429
3.86k
    std::ostringstream errout;
430
431
3.86k
    try {
432
3.86k
      aid = FileParserUtils::toInt(text.substr(spos, 4));
433
3.86k
      spos += 4;
434
3.86k
      rad = FileParserUtils::toInt(text.substr(spos, 4));
435
3.86k
      spos += 4;
436
437
3.86k
      switch (rad) {
438
361
        case 0:
439
          // This shouldn't be required, but let's make sure.
440
361
          mol->getAtomWithIdx(aid - 1)->setNumRadicalElectrons(0);
441
361
          break;
442
977
        case 1:
443
977
          mol->getAtomWithIdx(aid - 1)->setNumRadicalElectrons(2);
444
977
          break;
445
1.13k
        case 2:
446
1.13k
          mol->getAtomWithIdx(aid - 1)->setNumRadicalElectrons(1);
447
1.13k
          break;
448
1.34k
        case 3:
449
1.34k
          mol->getAtomWithIdx(aid - 1)->setNumRadicalElectrons(2);
450
1.34k
          break;
451
7
        default:
452
7
          errout << "Unrecognized radical value " << rad << " for atom "
453
7
                 << aid - 1 << " on line " << line << std::endl;
454
7
          throw FileParseException(errout.str());
455
3.86k
      }
456
3.86k
    } catch (boost::bad_lexical_cast &) {
457
25
      std::ostringstream errout;
458
25
      errout << "Cannot convert '" << text.substr(spos, 4)
459
25
             << "' to int on line " << line;
460
25
      throw FileParseException(errout.str());
461
25
    }
462
3.86k
  }
463
3.67k
}
464
465
4.74k
void ParsePXALine(RWMol *mol, const std::string &text, unsigned int line) {
466
4.74k
  PRECONDITION(mol, "bad mol");
467
4.74k
  PRECONDITION(text.substr(0, 6) == "M  PXA", "bad PXA line");
468
4.74k
  unsigned int pos = 7;
469
4.74k
  try {
470
4.74k
    auto atIdx =
471
4.74k
        FileParserUtils::stripSpacesAndCast<unsigned int>(text.substr(pos, 3));
472
4.74k
    pos += 3;
473
4.74k
    mol->getAtomWithIdx(atIdx - 1)->setProp(
474
4.74k
        "_MolFile_PXA", text.substr(pos, text.length() - pos));
475
4.74k
  } catch (boost::bad_lexical_cast &) {
476
15
    std::ostringstream errout;
477
15
    errout << "Cannot convert '" << text.substr(pos, 3) << "' to int on line "
478
15
           << line;
479
15
    throw FileParseException(errout.str());
480
15
  }
481
4.74k
}
482
483
2.42k
void ParseIsotopeLine(RWMol *mol, const std::string &text, unsigned int line) {
484
2.42k
  PRECONDITION(mol, "bad mol");
485
2.42k
  PRECONDITION(text.substr(0, 6) == std::string("M  ISO"), "bad isotope line");
486
487
2.42k
  unsigned int nent;
488
2.42k
  try {
489
2.42k
    nent = FileParserUtils::stripSpacesAndCast<unsigned int>(text.substr(6, 3));
490
2.42k
  } catch (boost::bad_lexical_cast &) {
491
8
    std::ostringstream errout;
492
8
    errout << "Cannot convert '" << text.substr(6, 3) << "' to int on line "
493
8
           << line;
494
8
    throw FileParseException(errout.str());
495
8
  }
496
2.41k
  unsigned int spos = 9;
497
4.75k
  for (unsigned int ie = 0; ie < nent; ie++) {
498
2.39k
    unsigned int aid;
499
2.39k
    try {
500
2.39k
      aid = FileParserUtils::stripSpacesAndCast<unsigned int>(
501
2.39k
          text.substr(spos, 4));
502
2.39k
      spos += 4;
503
2.39k
      Atom *atom = mol->getAtomWithIdx(aid - 1);
504
2.39k
      if (text.size() >= spos + 4 && text.substr(spos, 4) != "    ") {
505
1.48k
        int isotope = FileParserUtils::toInt(text.substr(spos, 4));
506
1.48k
        if (isotope < 0) {
507
228
          BOOST_LOG(rdErrorLog)
508
0
              << " atom " << aid
509
0
              << " has a negative isotope value. line:  " << line << std::endl;
510
1.25k
        } else {
511
1.25k
          atom->setIsotope(isotope);
512
1.25k
        }
513
1.48k
      }
514
2.39k
      spos += 4;
515
2.39k
    } catch (boost::bad_lexical_cast &) {
516
34
      std::ostringstream errout;
517
34
      errout << "Cannot convert '" << text.substr(spos, 4)
518
34
             << "' to int on line " << line;
519
34
      throw FileParseException(errout.str());
520
34
    }
521
2.39k
  }
522
2.41k
}
523
524
void ParseSubstitutionCountLine(RWMol *mol, const std::string &text,
525
3.60k
                                unsigned int line) {
526
3.60k
  PRECONDITION(mol, "bad mol");
527
3.60k
  PRECONDITION(text.substr(0, 6) == std::string("M  SUB"), "bad SUB line");
528
529
3.60k
  unsigned int nent;
530
3.60k
  try {
531
3.60k
    nent = FileParserUtils::stripSpacesAndCast<unsigned int>(text.substr(6, 3));
532
3.60k
  } catch (boost::bad_lexical_cast &) {
533
17
    std::ostringstream errout;
534
17
    errout << "Cannot convert '" << text.substr(6, 3) << "' to int on line "
535
17
           << line;
536
17
    throw FileParseException(errout.str());
537
17
  }
538
3.58k
  unsigned int spos = 9;
539
7.79k
  for (unsigned int ie = 0; ie < nent; ie++) {
540
4.29k
    unsigned int aid;
541
4.29k
    int count = 0;
542
4.29k
    try {
543
4.29k
      aid = FileParserUtils::stripSpacesAndCast<unsigned int>(
544
4.29k
          text.substr(spos, 4));
545
4.29k
      spos += 4;
546
4.29k
      Atom *atom = mol->getAtomWithIdx(aid - 1);
547
4.29k
      if (text.size() >= spos + 4 && text.substr(spos, 4) != "    ") {
548
3.63k
        count = FileParserUtils::toInt(text.substr(spos, 4));
549
3.63k
      }
550
4.29k
      spos += 4;
551
4.29k
      if (count == 0) {
552
1.52k
        continue;
553
1.52k
      }
554
2.77k
      ATOM_EQUALS_QUERY *q = makeAtomExplicitDegreeQuery(0);
555
2.77k
      switch (count) {
556
582
        case -1:
557
582
          q->setVal(0);
558
582
          break;
559
420
        case -2:
560
420
          q->setVal(atom->getDegree());
561
420
          break;
562
190
        case 1:
563
302
        case 2:
564
363
        case 3:
565
445
        case 4:
566
1.18k
        case 5:
567
1.18k
          q->setVal(count);
568
1.18k
          break;
569
489
        case 6:
570
489
          BOOST_LOG(rdWarningLog) << " atom degree query with value 6 found. "
571
0
                                     "This will not match degree >6. The MDL "
572
0
                                     "spec says it should.  line: "
573
0
                                  << line;
574
489
          q->setVal(6);
575
489
          break;
576
17
        default:
577
17
          std::ostringstream errout;
578
17
          errout << "Value " << count
579
17
                 << " is not supported as a degree query. line: " << line;
580
17
          throw FileParseException(errout.str());
581
2.77k
      }
582
2.67k
      if (!atom->hasQuery()) {
583
104
        atom = QueryOps::replaceAtomWithQueryAtom(mol, atom);
584
104
      }
585
2.67k
      atom->expandQuery(q, Queries::COMPOSITE_AND);
586
2.67k
    } catch (boost::bad_lexical_cast &) {
587
44
      std::ostringstream errout;
588
44
      errout << "Cannot convert '" << text.substr(spos, 4)
589
44
             << "' to int on line " << line;
590
44
      throw FileParseException(errout.str());
591
44
    }
592
4.29k
  }
593
3.58k
}
594
595
void ParseUnsaturationLine(RWMol *mol, const std::string &text,
596
2.78k
                           unsigned int line) {
597
2.78k
  PRECONDITION(mol, "bad mol");
598
2.78k
  PRECONDITION(text.substr(0, 6) == std::string("M  UNS"), "bad UNS line");
599
600
2.78k
  unsigned int nent;
601
2.78k
  try {
602
2.78k
    nent = FileParserUtils::stripSpacesAndCast<unsigned int>(text.substr(6, 3));
603
2.78k
  } catch (boost::bad_lexical_cast &) {
604
7
    std::ostringstream errout;
605
7
    errout << "Cannot convert '" << text.substr(6, 3) << "' to int on line "
606
7
           << line;
607
7
    throw FileParseException(errout.str());
608
7
  }
609
2.77k
  unsigned int spos = 9;
610
5.35k
  for (unsigned int ie = 0; ie < nent; ie++) {
611
2.68k
    unsigned int aid;
612
2.68k
    int count = 0;
613
2.68k
    try {
614
2.68k
      aid = FileParserUtils::stripSpacesAndCast<unsigned int>(
615
2.68k
          text.substr(spos, 4));
616
2.68k
      spos += 4;
617
2.68k
      Atom *atom = mol->getAtomWithIdx(aid - 1);
618
2.68k
      if (text.size() >= spos + 4 && text.substr(spos, 4) != "    ") {
619
1.39k
        count = FileParserUtils::toInt(text.substr(spos, 4));
620
1.39k
      }
621
2.68k
      spos += 4;
622
2.68k
      if (count == 0) {
623
2.13k
        continue;
624
2.13k
      } else if (count == 1) {
625
444
        ATOM_EQUALS_QUERY *q = makeAtomUnsaturatedQuery();
626
444
        if (!atom->hasQuery()) {
627
70
          atom = QueryOps::replaceAtomWithQueryAtom(mol, atom);
628
70
        }
629
444
        atom->expandQuery(q, Queries::COMPOSITE_AND);
630
444
      } else {
631
111
        std::ostringstream errout;
632
111
        errout << "Value " << count
633
111
               << " is not supported as an unsaturation "
634
111
                  "query (only 0 and 1 are allowed). "
635
111
                  "line: "
636
111
               << line;
637
111
        throw FileParseException(errout.str());
638
111
      }
639
2.68k
    } catch (boost::bad_lexical_cast &) {
640
71
      std::ostringstream errout;
641
71
      errout << "Cannot convert '" << text.substr(spos, 4)
642
71
             << "' to int on line " << line;
643
71
      throw FileParseException(errout.str());
644
71
    }
645
2.68k
  }
646
2.77k
}
647
648
void ParseRingBondCountLine(RWMol *mol, const std::string &text,
649
9.68k
                            unsigned int line) {
650
9.68k
  PRECONDITION(mol, "bad mol");
651
9.68k
  PRECONDITION(text.substr(0, 6) == std::string("M  RBC"), "bad RBC line");
652
653
9.68k
  unsigned int nent;
654
9.68k
  try {
655
9.68k
    nent = FileParserUtils::stripSpacesAndCast<unsigned int>(text.substr(6, 3));
656
9.68k
  } catch (boost::bad_lexical_cast &) {
657
14
    std::ostringstream errout;
658
14
    errout << "Cannot convert '" << text.substr(6, 3) << "' to int on line "
659
14
           << line;
660
14
    throw FileParseException(errout.str());
661
14
  }
662
9.66k
  unsigned int spos = 9;
663
19.6k
  for (unsigned int ie = 0; ie < nent; ie++) {
664
10.1k
    unsigned int aid;
665
10.1k
    int count = 0;
666
10.1k
    try {
667
10.1k
      aid = FileParserUtils::stripSpacesAndCast<unsigned int>(
668
10.1k
          text.substr(spos, 4));
669
10.1k
      spos += 4;
670
10.1k
      Atom *atom = mol->getAtomWithIdx(aid - 1);
671
10.1k
      if (text.size() >= spos + 4 && text.substr(spos, 4) != "    ") {
672
9.16k
        count = FileParserUtils::toInt(text.substr(spos, 4));
673
9.16k
      }
674
10.1k
      spos += 4;
675
10.1k
      if (count == 0) {
676
1.55k
        continue;
677
1.55k
      }
678
8.55k
      ATOM_EQUALS_QUERY *q = makeAtomRingBondCountQuery(0);
679
8.55k
      switch (count) {
680
1.91k
        case -1:
681
1.91k
          q->setVal(0);
682
1.91k
          break;
683
4.10k
        case -2:
684
4.10k
          q->setVal(0xDEADBEEF);
685
4.10k
          mol->setProp(common_properties::_NeedsQueryScan, 1);
686
4.10k
          break;
687
446
        case 1:
688
612
        case 2:
689
1.58k
        case 3:
690
1.58k
          q->setVal(count);
691
1.58k
          break;
692
871
        case 4:
693
871
          delete q;
694
871
          q = static_cast<ATOM_EQUALS_QUERY *>(new ATOM_LESSEQUAL_QUERY);
695
871
          q->setVal(4);
696
871
          q->setDescription("AtomRingBondCount");
697
871
          q->setDataFunc(queryAtomRingBondCount);
698
871
          break;
699
25
        default:
700
25
          std::ostringstream errout;
701
25
          errout << "Value " << count
702
25
                 << " is not supported as a ring-bond count query. line: "
703
25
                 << line;
704
25
          throw FileParseException(errout.str());
705
8.55k
      }
706
8.47k
      if (!atom->hasQuery()) {
707
173
        atom = QueryOps::replaceAtomWithQueryAtom(mol, atom);
708
173
      }
709
8.47k
      atom->expandQuery(q, Queries::COMPOSITE_AND);
710
8.47k
    } catch (boost::bad_lexical_cast &) {
711
31
      std::ostringstream errout;
712
31
      errout << "Cannot convert '" << text.substr(spos, 4)
713
31
             << "' to int on line " << line;
714
31
      throw FileParseException(errout.str());
715
31
    }
716
10.1k
  }
717
9.66k
}
718
719
1.60k
void ParseZCHLine(RWMol *mol, const std::string &text, unsigned int line) {
720
  // part of Alex Clark's ZBO proposal
721
  // from JCIM 51:3149-57 (2011)
722
1.60k
  PRECONDITION(mol, "bad mol");
723
1.60k
  PRECONDITION(text.substr(0, 6) == std::string("M  ZCH"), "bad ZCH line");
724
725
1.60k
  unsigned int nent;
726
1.60k
  try {
727
1.60k
    nent = FileParserUtils::stripSpacesAndCast<unsigned int>(text.substr(6, 3));
728
1.60k
  } catch (boost::bad_lexical_cast &) {
729
8
    std::ostringstream errout;
730
8
    errout << "Cannot convert '" << text.substr(6, 3) << "' to int on line "
731
8
           << line;
732
8
    throw FileParseException(errout.str());
733
8
  }
734
1.60k
  unsigned int spos = 9;
735
3.61k
  for (unsigned int ie = 0; ie < nent; ie++) {
736
2.11k
    unsigned int aid = 0;
737
2.11k
    int val = 0;
738
2.11k
    try {
739
2.11k
      aid = FileParserUtils::stripSpacesAndCast<unsigned int>(
740
2.11k
          text.substr(spos, 4));
741
2.11k
      spos += 4;
742
2.11k
      if (text.size() >= spos + 4 && text.substr(spos, 4) != "    ") {
743
1.38k
        val = FileParserUtils::stripSpacesAndCast<int>(text.substr(spos, 4));
744
1.38k
      }
745
2.11k
      if (!aid || aid > mol->getNumAtoms()) {
746
45
        std::ostringstream errout;
747
45
        errout << "Bad ZCH specification on line " << line;
748
45
        throw FileParseException(errout.str());
749
45
      }
750
2.06k
      spos += 4;
751
2.06k
      --aid;
752
2.06k
      Atom *atom = mol->getAtomWithIdx(aid);
753
2.06k
      if (!atom) {
754
0
        std::ostringstream errout;
755
0
        errout << "Atom " << aid << " from ZCH specification on line " << line
756
0
               << " not found";
757
0
        throw FileParseException(errout.str());
758
2.06k
      } else {
759
2.06k
        atom->setFormalCharge(val);
760
2.06k
      }
761
2.06k
    } catch (boost::bad_lexical_cast &) {
762
41
      std::ostringstream errout;
763
41
      errout << "Cannot convert '" << text.substr(spos, 4)
764
41
             << "' to int on line " << line;
765
41
      throw FileParseException(errout.str());
766
41
    }
767
2.11k
  }
768
1.60k
}
769
770
1.68k
void ParseHYDLine(RWMol *mol, const std::string &text, unsigned int line) {
771
  // part of Alex Clark's ZBO proposal
772
  // from JCIM 51:3149-57 (2011)
773
1.68k
  PRECONDITION(mol, "bad mol");
774
1.68k
  PRECONDITION(text.substr(0, 6) == std::string("M  HYD"), "bad HYD line");
775
776
1.68k
  unsigned int nent;
777
1.68k
  try {
778
1.68k
    nent = FileParserUtils::stripSpacesAndCast<unsigned int>(text.substr(6, 3));
779
1.68k
  } catch (boost::bad_lexical_cast &) {
780
20
    std::ostringstream errout;
781
20
    errout << "Cannot convert '" << text.substr(6, 3) << "' to int on line "
782
20
           << line;
783
20
    throw FileParseException(errout.str());
784
20
  }
785
1.66k
  unsigned int spos = 9;
786
4.87k
  for (unsigned int ie = 0; ie < nent; ie++) {
787
3.32k
    unsigned int aid = 0;
788
3.32k
    int val = -1;
789
3.32k
    try {
790
3.32k
      aid = FileParserUtils::stripSpacesAndCast<unsigned int>(
791
3.32k
          text.substr(spos, 4));
792
3.32k
      spos += 4;
793
3.32k
      if (text.size() >= spos + 4 && text.substr(spos, 4) != "    ") {
794
2.59k
        val = FileParserUtils::stripSpacesAndCast<int>(text.substr(spos, 4));
795
2.59k
      }
796
3.32k
      if (!aid || aid > mol->getNumAtoms()) {
797
58
        std::ostringstream errout;
798
58
        errout << "Bad HYD specification on line " << line;
799
58
        throw FileParseException(errout.str());
800
58
      }
801
3.26k
      spos += 4;
802
3.26k
      --aid;
803
3.26k
      Atom *atom = mol->getAtomWithIdx(aid);
804
3.26k
      if (!atom) {
805
0
        std::ostringstream errout;
806
0
        errout << "Atom " << aid << " from HYD specification on line " << line
807
0
               << " not found";
808
0
        throw FileParseException(errout.str());
809
3.26k
      } else {
810
3.26k
        if (val >= 0) {
811
2.52k
          atom->setProp("_ZBO_H", true);
812
2.52k
          atom->setNumExplicitHs(val);
813
2.52k
        }
814
3.26k
      }
815
3.26k
    } catch (boost::bad_lexical_cast &) {
816
43
      std::ostringstream errout;
817
43
      errout << "Cannot convert '" << text.substr(spos, 4)
818
43
             << "' to int on line " << line;
819
43
      throw FileParseException(errout.str());
820
43
    }
821
3.32k
  }
822
1.66k
}
823
824
1.34k
void ParseZBOLine(RWMol *mol, const std::string &text, unsigned int line) {
825
  // part of Alex Clark's ZBO proposal
826
  // from JCIM 51:3149-57 (2011)
827
1.34k
  PRECONDITION(mol, "bad mol");
828
1.34k
  PRECONDITION(text.substr(0, 6) == std::string("M  ZBO"), "bad ZBO line");
829
830
1.34k
  unsigned int nent;
831
1.34k
  try {
832
1.34k
    nent = FileParserUtils::stripSpacesAndCast<unsigned int>(text.substr(6, 3));
833
1.34k
  } catch (boost::bad_lexical_cast &) {
834
23
    std::ostringstream errout;
835
23
    errout << "Cannot convert '" << text.substr(6, 3) << "' to int on line "
836
23
           << line;
837
23
    throw FileParseException(errout.str());
838
23
  }
839
1.32k
  unsigned int spos = 9;
840
3.29k
  for (unsigned int ie = 0; ie < nent; ie++) {
841
2.08k
    unsigned int bid = 0;
842
2.08k
    unsigned int order = 0;
843
2.08k
    try {
844
2.08k
      bid = FileParserUtils::stripSpacesAndCast<unsigned int>(
845
2.08k
          text.substr(spos, 4));
846
2.08k
      spos += 4;
847
2.08k
      if (text.size() >= spos + 4 && text.substr(spos, 4) != "    ") {
848
993
        order = FileParserUtils::stripSpacesAndCast<unsigned int>(
849
993
            text.substr(spos, 4));
850
993
      }
851
2.08k
      if (!bid || bid > mol->getNumBonds()) {
852
36
        std::ostringstream errout;
853
36
        errout << "Bad ZBO specification on line " << line;
854
36
        throw FileParseException(errout.str());
855
36
      }
856
2.05k
      spos += 4;
857
2.05k
      --bid;
858
2.05k
      Bond *bnd = mol->getBondWithIdx(bid);
859
2.05k
      if (!bnd) {
860
0
        std::ostringstream errout;
861
0
        errout << "Bond " << bid << " from ZBO specification on line " << line
862
0
               << " not found";
863
0
        throw FileParseException(errout.str());
864
2.05k
      } else {
865
2.05k
        if (order == 0) {
866
1.03k
          bnd->setBondType(Bond::ZERO);
867
1.03k
        } else {
868
1.01k
          bnd->setBondType(static_cast<Bond::BondType>(order));
869
1.01k
        }
870
2.05k
      }
871
2.05k
    } catch (boost::bad_lexical_cast &) {
872
67
      std::ostringstream errout;
873
67
      errout << "Cannot convert '" << text.substr(spos, 4)
874
67
             << "' to int on line " << line;
875
67
      throw FileParseException(errout.str());
876
67
    }
877
2.08k
  }
878
1.32k
}
879
880
void ParseMarvinSmartsLine(RWMol *mol, const std::string &text,
881
35.8k
                           unsigned int line) {
882
35.8k
  const unsigned int atomNumStart = 10;
883
35.8k
  const unsigned int smartsStart = 15;
884
  // M  MRV SMA   1 [*;A]
885
  // 01234567890123456789
886
  //           1111111111
887
35.8k
  if (text.substr(0, 10) != "M  MRV SMA") {
888
5.83k
    return;
889
5.83k
  }
890
891
30.0k
  unsigned int idx;
892
30.0k
  std::string idxTxt = text.substr(atomNumStart, smartsStart - atomNumStart);
893
30.0k
  try {
894
30.0k
    idx = FileParserUtils::stripSpacesAndCast<unsigned int>(idxTxt) - 1;
895
30.0k
  } catch (boost::bad_lexical_cast &) {
896
38
    std::ostringstream errout;
897
38
    errout << "Cannot convert '" << idxTxt << "' to an atom index on line "
898
38
           << line;
899
38
    throw FileParseException(errout.str());
900
38
  }
901
902
29.9k
  URANGE_CHECK(idx, mol->getNumAtoms());
903
  // Should we check the validity of the marvin line here?  Should we
904
  // automatically
905
  //   Add these as recursive smarts?  I tend to think so...
906
29.9k
  std::string sma = text.substr(smartsStart);
907
29.9k
  Atom *at = mol->getAtomWithIdx(idx);
908
29.9k
  at->setProp(common_properties::MRV_SMA, sma);
909
29.9k
  RWMol *m = nullptr;
910
29.9k
  try {
911
29.9k
    m = SmartsToMol(sma);
912
29.9k
  } catch (...) {
913
    // Is this ever used?
914
2.75k
  }
915
916
29.9k
  if (m) {
917
24.4k
    QueryAtom::QUERYATOM_QUERY *query = new RecursiveStructureQuery(m);
918
24.4k
    if (!at->hasQuery()) {
919
1.32k
      QueryAtom qAt(*at);
920
1.32k
      int oidx = at->getIdx();
921
1.32k
      mol->replaceAtom(oidx, &qAt);
922
1.32k
      at = mol->getAtomWithIdx(oidx);
923
1.32k
    }
924
24.4k
    at->expandQuery(query, Queries::COMPOSITE_AND);
925
24.4k
    at->setProp(common_properties::_MolFileAtomQuery, 1);
926
24.4k
  } else {
927
5.48k
    std::ostringstream errout;
928
5.48k
    errout << "Cannot parse smarts: '" << sma << "' on line " << line;
929
5.48k
    throw FileParseException(errout.str());
930
5.48k
  }
931
29.9k
}
932
933
void ParseAttachPointLine(RWMol *mol, const std::string &text,
934
1.55k
                          unsigned int line, bool strictParsing) {
935
1.55k
  PRECONDITION(mol, "bad mol");
936
1.55k
  PRECONDITION(text.substr(0, 6) == std::string("M  APO"), "bad APO line");
937
938
1.55k
  unsigned int nent;
939
1.55k
  try {
940
1.55k
    nent = FileParserUtils::stripSpacesAndCast<unsigned int>(text.substr(6, 3));
941
1.55k
  } catch (boost::bad_lexical_cast &) {
942
13
    std::ostringstream errout;
943
13
    errout << "Cannot convert '" << text.substr(6, 3) << "' to int on line "
944
13
           << line;
945
13
    throw FileParseException(errout.str());
946
13
  }
947
1.54k
  unsigned int spos = 9;
948
4.14k
  for (unsigned int ie = 0; ie < nent; ie++) {
949
2.75k
    unsigned int aid = 0;
950
2.75k
    int val = 0;
951
2.75k
    try {
952
2.75k
      aid = FileParserUtils::stripSpacesAndCast<unsigned int>(
953
2.75k
          text.substr(spos, 4));
954
2.75k
      spos += 4;
955
2.75k
      if (text.size() >= spos + 4 && text.substr(spos, 4) != "    ") {
956
1.66k
        val = FileParserUtils::stripSpacesAndCast<int>(text.substr(spos, 4));
957
1.66k
      }
958
2.75k
      if (!aid || aid > mol->getNumAtoms()) {
959
47
        std::ostringstream errout;
960
47
        errout << "Bad APO specification on line " << line;
961
47
        throw FileParseException(errout.str());
962
47
      }
963
2.70k
      spos += 4;
964
2.70k
      --aid;
965
2.70k
      Atom *atom = mol->getAtomWithIdx(aid);
966
2.70k
      if (!atom) {
967
0
        std::ostringstream errout;
968
0
        errout << "Atom " << aid << " from APO specification on line " << line
969
0
               << " not found";
970
0
        throw FileParseException(errout.str());
971
2.70k
      } else {
972
2.70k
        if (val < 0 || val > 3) {
973
13
          std::ostringstream errout;
974
13
          errout << "Value " << val << " from APO specification on line "
975
13
                 << line << " is invalid";
976
13
          throw FileParseException(errout.str());
977
2.69k
        } else if (val) {
978
1.59k
          if (val == 3) {
979
            // this is -1 in v3k mol blocks, so use that:
980
252
            val = -1;
981
252
          }
982
1.59k
          if (atom->hasProp(common_properties::molAttachPoint)) {
983
1.49k
            std::ostringstream errout;
984
1.49k
            errout << "Multiple ATTCHPT values for atom " << atom->getIdx() + 1
985
1.49k
                   << " on line " << line;
986
1.49k
            if (strictParsing) {
987
2
              throw FileParseException(errout.str());
988
1.49k
            } else {
989
1.49k
              BOOST_LOG(rdWarningLog) << errout.str() << std::endl;
990
1.49k
            }
991
1.49k
          } else {
992
98
            atom->setProp(common_properties::molAttachPoint, val);
993
98
          }
994
1.59k
        }
995
2.70k
      }
996
2.70k
    } catch (boost::bad_lexical_cast &) {
997
74
      std::ostringstream errout;
998
74
      errout << "Cannot convert '" << text.substr(spos, 4)
999
74
             << "' to int on line " << line;
1000
74
      throw FileParseException(errout.str());
1001
74
    }
1002
2.75k
  }
1003
1.54k
}
1004
1005
// the format differs between V2000 and V3000, so we have to do a bit of
1006
// translation here
1007
1.31k
void ParseLinkNodeLine(RWMol *mol, const std::string &text, unsigned int line) {
1008
1.31k
  PRECONDITION(mol, "bad mol");
1009
1.31k
  PRECONDITION(text.substr(0, 6) == std::string("M  LIN"), "bad LIN line");
1010
1011
1.31k
  unsigned int nent;
1012
1.31k
  try {
1013
1.31k
    nent = FileParserUtils::stripSpacesAndCast<unsigned int>(text.substr(6, 3));
1014
1.31k
  } catch (boost::bad_lexical_cast &) {
1015
9
    std::ostringstream errout;
1016
9
    errout << "Cannot convert '" << text.substr(6, 3) << "' to int on line "
1017
9
           << line;
1018
9
    throw FileParseException(errout.str());
1019
9
  }
1020
1.30k
  std::string propVal = "";
1021
1.30k
  unsigned int spos = 9;
1022
3.86k
  for (unsigned int ie = 0; ie < nent; ie++) {
1023
2.72k
    try {
1024
2.72k
      auto aid = FileParserUtils::stripSpacesAndCast<unsigned int>(
1025
2.72k
          text.substr(spos, 4));
1026
2.72k
      if (!aid || aid > mol->getNumAtoms()) {
1027
37
        std::ostringstream errout;
1028
37
        errout << "LIN specification has bad atom idx on line " << line;
1029
37
        throw FileParseException(errout.str());
1030
37
      }
1031
2.68k
      spos += 4;
1032
1033
2.68k
      if (text.size() < spos + 4 || text.substr(spos, 4) == "    ") {
1034
6
        std::ostringstream errout;
1035
6
        errout << "LIN specification missing repeat count on line " << line;
1036
6
        throw FileParseException(errout.str());
1037
6
      }
1038
2.68k
      auto repeatCount = FileParserUtils::stripSpacesAndCast<unsigned int>(
1039
2.68k
          text.substr(spos, 4));
1040
2.68k
      spos += 4;
1041
2.68k
      if (repeatCount < 2) {
1042
2
        std::ostringstream errout;
1043
2
        errout << "LIN specification: repeat count must be >=2 on line "
1044
2
               << line;
1045
2
        throw FileParseException(errout.str());
1046
2
      }
1047
2.68k
      unsigned int substB = 0;
1048
2.68k
      unsigned int substC = 0;
1049
2.68k
      if (text.size() >= spos + 4 && text.substr(spos, 4) != "    ") {
1050
2.61k
        substB = FileParserUtils::stripSpacesAndCast<unsigned int>(
1051
2.61k
            text.substr(spos, 4));
1052
2.61k
      }
1053
2.68k
      spos += 4;
1054
2.68k
      if (text.size() >= spos + 4 && text.substr(spos, 4) != "    ") {
1055
2.62k
        substC = FileParserUtils::stripSpacesAndCast<unsigned int>(
1056
2.62k
            text.substr(spos, 4));
1057
2.62k
      }
1058
2.68k
      spos += 4;
1059
1060
2.68k
      if (!substB || substB > mol->getNumAtoms() ||
1061
2.57k
          substC > mol->getNumAtoms()) {
1062
37
        std::ostringstream errout;
1063
37
        errout << "LIN specification has bad substituent idx on line " << line;
1064
37
        throw FileParseException(errout.str());
1065
37
      }
1066
1067
2.64k
      boost::format formatter;
1068
2.64k
      if (substC) {
1069
2.55k
        formatter = boost::format("1 %1% 2 %2% %3% %2% %4%") % repeatCount %
1070
2.55k
                    aid % substB % substC;
1071
2.55k
      } else {
1072
89
        formatter = boost::format("1 %1% 1 %2% %3%") % repeatCount % aid %
1073
89
                    substB % substC;
1074
89
      }
1075
2.64k
      if (!propVal.empty()) {
1076
1.64k
        propVal += "|";
1077
1.64k
      }
1078
2.64k
      propVal += formatter.str();
1079
2.64k
    } catch (boost::bad_lexical_cast &) {
1080
76
      std::ostringstream errout;
1081
76
      errout << "Cannot convert '" << text.substr(spos, 4)
1082
76
             << "' to int on line " << line;
1083
76
      throw FileParseException(errout.str());
1084
76
    }
1085
2.55k
    mol->setProp(common_properties::molFileLinkNodes, propVal);
1086
2.55k
  }
1087
1.30k
}
1088
1089
// Recursively populates queryVect with COMPOSITE_AND queries
1090
// present in the input query. If the logic of the input query
1091
// is more complex, it returns nullptr and empty set.
1092
// The returned ptr should only be checked for not being null
1093
// and not used for any other purposes, as the actual result is
1094
// the queryVect
1095
const QueryAtom::QUERYATOM_QUERY *getAndQueries(
1096
    const QueryAtom::QUERYATOM_QUERY *q,
1097
200k
    std::vector<const QueryAtom::QUERYATOM_QUERY *> &queryVect) {
1098
200k
  if (q) {
1099
200k
    auto qOrig = q;
1100
397k
    for (auto cq = qOrig->beginChildren(); cq != qOrig->endChildren(); ++cq) {
1101
197k
      if (q == qOrig && q->getDescription() != "AtomAnd") {
1102
617
        q = nullptr;
1103
617
        break;
1104
617
      }
1105
197k
      q = getAndQueries(cq->get(), queryVect);
1106
197k
    }
1107
200k
    if (q == qOrig) {
1108
101k
      queryVect.push_back(q);
1109
101k
    }
1110
200k
  }
1111
200k
  if (!q) {
1112
617
    queryVect.clear();
1113
617
  }
1114
200k
  return q;
1115
200k
}
1116
1117
3.72k
void ParseNewAtomList(RWMol *mol, const std::string &text, unsigned int line) {
1118
3.72k
  if (text.size() < 15) {
1119
13
    std::ostringstream errout;
1120
13
    errout << "Atom list line too short: '" << text << "'";
1121
13
    throw FileParseException(errout.str());
1122
13
  }
1123
3.71k
  PRECONDITION(mol, "bad mol");
1124
3.71k
  PRECONDITION(text.substr(0, 6) == std::string("M  ALS"),
1125
3.71k
               "bad atom list line");
1126
1127
3.71k
  unsigned int idx;
1128
3.71k
  try {
1129
3.71k
    idx = FileParserUtils::stripSpacesAndCast<unsigned int>(text.substr(7, 3)) -
1130
3.71k
          1;
1131
3.71k
  } catch (boost::bad_lexical_cast &) {
1132
16
    std::ostringstream errout;
1133
16
    errout << "Cannot convert '" << text.substr(7, 3) << "' to int on line "
1134
16
           << line;
1135
16
    throw FileParseException(errout.str());
1136
16
  }
1137
3.69k
  URANGE_CHECK(idx, mol->getNumAtoms());
1138
1139
3.66k
  int nQueries;
1140
3.66k
  try {
1141
3.66k
    nQueries = FileParserUtils::toInt(text.substr(10, 3));
1142
3.66k
  } catch (boost::bad_lexical_cast &) {
1143
4
    std::ostringstream errout;
1144
4
    errout << "Cannot convert '" << text.substr(10, 3) << "' to int on line "
1145
4
           << line;
1146
4
    throw FileParseException(errout.str());
1147
4
  }
1148
1149
3.65k
  if (!nQueries) {
1150
252
    BOOST_LOG(rdWarningLog) << "Empty atom list: '" << text << "' on line "
1151
0
                            << line << "." << std::endl;
1152
252
    return;
1153
252
  }
1154
1155
3.40k
  if (nQueries < 0) {
1156
5
    std::ostringstream errout;
1157
5
    errout << "negative length atom list: '" << text << "' on line " << line
1158
5
           << "." << std::endl;
1159
5
    throw FileParseException(errout.str());
1160
5
  }
1161
3.40k
  QueryAtom *a = nullptr;
1162
3.40k
  QueryAtom *qaOrig = nullptr;
1163
3.40k
  QueryAtom::QUERYATOM_QUERY *qOrig = nullptr;
1164
3.40k
  Atom *aOrig = mol->getAtomWithIdx(idx);
1165
7.48k
  for (unsigned int i = 0; i < static_cast<unsigned int>(nQueries); i++) {
1166
4.09k
    unsigned int pos = 16 + i * 4;
1167
4.09k
    if (text.size() < pos + 4) {
1168
8
      std::ostringstream errout;
1169
8
      errout << "Atom list line too short: '" << text << "' on line " << line;
1170
8
      throw FileParseException(errout.str());
1171
8
    }
1172
1173
4.08k
    std::string atSymb = text.substr(pos, 4);
1174
4.08k
    atSymb.erase(atSymb.find(' '), atSymb.size());
1175
4.08k
    int atNum = PeriodicTable::getTable()->getAtomicNumber(atSymb);
1176
4.08k
    if (!i) {
1177
3.38k
      if (aOrig->hasQuery()) {
1178
3.24k
        qaOrig = dynamic_cast<QueryAtom *>(aOrig);
1179
3.24k
        if (qaOrig) {
1180
3.24k
          qOrig = qaOrig->getQuery();
1181
3.24k
        }
1182
3.24k
      }
1183
3.38k
      a = new QueryAtom(*aOrig);
1184
3.38k
      a->setAtomicNum(atNum);
1185
3.38k
      if (!qOrig) {
1186
131
        qOrig = a->getQuery()->copy();
1187
131
      }
1188
3.38k
      a->setQuery(makeAtomNumQuery(atNum));
1189
3.38k
    } else {
1190
703
      a->expandQuery(makeAtomNumQuery(atNum), Queries::COMPOSITE_OR, true);
1191
      // For COMPOSITE_OR query atoms, reset atomic num to 0 such that they are
1192
      // exported as "*" in SMILES
1193
703
      a->setAtomicNum(0);
1194
703
    }
1195
4.08k
  }
1196
3.39k
  ASSERT_INVARIANT(a, "no atom built");
1197
3.39k
  if (qOrig) {
1198
3.35k
    std::vector<const QueryAtom::QUERYATOM_QUERY *> queryVect;
1199
3.35k
    if (getAndQueries(qOrig, queryVect)) {
1200
101k
      for (const auto &q : queryVect) {
1201
101k
        if (q->getDescription() != "AtomAtomicNum") {
1202
98.6k
          a->expandQuery(q->copy(), Queries::COMPOSITE_AND, true);
1203
98.6k
        }
1204
101k
      }
1205
3.01k
    }
1206
3.35k
    if (!qaOrig) {
1207
117
      delete qOrig;
1208
117
    }
1209
3.35k
  }
1210
3.39k
  a->setProp(common_properties::_MolFileAtomQuery, 1);
1211
3.39k
  switch (text[14]) {
1212
736
    case 'T':
1213
736
      a->getQuery()->setNegation(true);
1214
736
      break;
1215
2.60k
    case 'F':
1216
2.60k
      a->getQuery()->setNegation(false);
1217
2.60k
      break;
1218
16
    default:
1219
16
      std::ostringstream errout;
1220
16
      errout << "Unrecognized atom-list query modifier: '" << text[14]
1221
16
             << "' on line " << line;
1222
16
      delete a;
1223
16
      throw FileParseException(errout.str());
1224
3.39k
  }
1225
1226
3.33k
  mol->replaceAtom(idx, a);
1227
3.33k
  delete a;
1228
3.33k
}
1229
1230
void ParseV3000RGroups(RWMol *mol, Atom *&atom, std::string_view text,
1231
22
                       unsigned int line) {
1232
22
  PRECONDITION(mol, "bad mol");
1233
22
  PRECONDITION(atom, "bad atom");
1234
22
  if (text[0] != '(' || text.back() != ')') {
1235
2
    std::ostringstream errout;
1236
2
    errout << "Bad RGROUPS specification '" << text << "' on line " << line
1237
2
           << ". Missing parens.";
1238
2
    throw FileParseException(errout.str());
1239
2
  }
1240
20
  std::vector<std::string> splitToken;
1241
20
  std::string resid = std::string(text.substr(1, text.size() - 2));
1242
20
  boost::split(splitToken, resid, boost::is_any_of(std::string(" ")));
1243
20
  if (splitToken.size() < 1) {
1244
0
    std::ostringstream errout;
1245
0
    errout << "Bad RGROUPS specification '" << text << "' on line " << line
1246
0
           << ". Missing values.";
1247
0
    throw FileParseException(errout.str());
1248
0
  }
1249
20
  unsigned int nRs;
1250
20
  try {
1251
20
    nRs = FileParserUtils::stripSpacesAndCast<unsigned int>(splitToken[0]);
1252
20
  } catch (boost::bad_lexical_cast &) {
1253
3
    std::ostringstream errout;
1254
3
    errout << "Cannot convert '" << splitToken[0] << "' to int on line" << line;
1255
3
    throw FileParseException(errout.str());
1256
3
  }
1257
17
  if (splitToken.size() < nRs + 1) {
1258
5
    std::ostringstream errout;
1259
5
    errout << "Bad RGROUPS specification '" << text << "' on line " << line
1260
5
           << ". Not enough values.";
1261
5
    throw FileParseException(errout.str());
1262
5
  }
1263
36
  for (unsigned int i = 0; i < nRs; ++i) {
1264
34
    unsigned int rLabel;
1265
34
    try {
1266
34
      rLabel =
1267
34
          FileParserUtils::stripSpacesAndCast<unsigned int>(splitToken[i + 1]);
1268
34
    } catch (boost::bad_lexical_cast &) {
1269
10
      std::ostringstream errout;
1270
10
      errout << "Cannot convert '" << splitToken[i + 1] << "' to int on line"
1271
10
             << line;
1272
10
      throw FileParseException(errout.str());
1273
10
    }
1274
24
    atom = QueryOps::replaceAtomWithQueryAtom(mol, atom);
1275
24
    atom->setProp(common_properties::_MolFileRLabel, rLabel);
1276
24
    std::string dLabel = "R" + std::to_string(rLabel);
1277
24
    atom->setProp(common_properties::dummyLabel, dLabel);
1278
24
    atom->setIsotope(rLabel);
1279
24
    atom->setQuery(makeAtomNullQuery());
1280
24
  }
1281
12
}
1282
1283
1.91k
void ParseRGroupLabels(RWMol *mol, const std::string &text, unsigned int line) {
1284
1.91k
  PRECONDITION(mol, "bad mol");
1285
1.91k
  PRECONDITION(text.substr(0, 6) == std::string("M  RGP"),
1286
1.91k
               "bad R group label line");
1287
1288
1.91k
  int nLabels;
1289
1.91k
  try {
1290
1.91k
    nLabels = FileParserUtils::toInt(text.substr(6, 3));
1291
1.91k
  } catch (boost::bad_lexical_cast &) {
1292
25
    std::ostringstream errout;
1293
25
    errout << "Cannot convert '" << text.substr(6, 3) << "' to int on line "
1294
25
           << line;
1295
25
    throw FileParseException(errout.str());
1296
25
  }
1297
1298
4.07k
  for (int i = 0; i < nLabels; i++) {
1299
2.23k
    int pos = 10 + i * 8;
1300
2.23k
    unsigned int atIdx;
1301
2.23k
    try {
1302
2.23k
      atIdx = FileParserUtils::stripSpacesAndCast<unsigned int>(
1303
2.23k
          text.substr(pos, 3));
1304
2.23k
    } catch (boost::bad_lexical_cast &) {
1305
7
      std::ostringstream errout;
1306
7
      errout << "Cannot convert '" << text.substr(pos, 3) << "' to int on line "
1307
7
             << line;
1308
7
      throw FileParseException(errout.str());
1309
7
    }
1310
2.22k
    unsigned int rLabel;
1311
2.22k
    try {
1312
2.22k
      rLabel = FileParserUtils::stripSpacesAndCast<unsigned int>(
1313
2.22k
          text.substr(pos + 4, 3));
1314
2.22k
    } catch (boost::bad_lexical_cast &) {
1315
15
      std::ostringstream errout;
1316
15
      errout << "Cannot convert '" << text.substr(pos + 4, 3)
1317
15
             << "' to int on line " << line;
1318
15
      throw FileParseException(errout.str());
1319
15
    }
1320
2.20k
    atIdx -= 1;
1321
2.20k
    if (atIdx > mol->getNumAtoms()) {
1322
17
      std::ostringstream errout;
1323
17
      errout << "Attempt to set R group label on nonexistent atom " << atIdx
1324
17
             << " on line " << line;
1325
17
      throw FileParseException(errout.str());
1326
17
    }
1327
2.18k
    QueryAtom qatom(*(mol->getAtomWithIdx(atIdx)));
1328
2.18k
    qatom.setProp(common_properties::_MolFileRLabel, rLabel);
1329
1330
    // set the dummy label so that this is shown correctly
1331
    // in other pieces of the code :
1332
    // (this was sf.net issue 3316600)
1333
2.18k
    std::string dLabel = "R" + std::to_string(rLabel);
1334
2.18k
    qatom.setProp(common_properties::dummyLabel, dLabel);
1335
1336
    // the CTFile spec (June 2005 version) technically only allows
1337
    // R labels up to 32. Since there are three digits, we'll accept
1338
    // anything: so long as it's positive and less than 1000:
1339
2.18k
    if (rLabel > 0 && rLabel < 999) {
1340
1.07k
      qatom.setIsotope(rLabel);
1341
1.07k
    }
1342
2.18k
    qatom.setQuery(makeAtomNullQuery());
1343
2.18k
    mol->replaceAtom(atIdx, &qatom);
1344
2.18k
  }
1345
1.89k
}
1346
1347
void ParseAtomAlias(RWMol *mol, std::string text, const std::string &nextLine,
1348
2.06k
                    unsigned int line) {
1349
2.06k
  PRECONDITION(mol, "bad mol");
1350
2.06k
  PRECONDITION(text.substr(0, 2) == std::string("A "), "bad atom alias line");
1351
1352
2.04k
  unsigned int idx;
1353
2.04k
  try {
1354
2.04k
    idx = FileParserUtils::stripSpacesAndCast<unsigned int>(text.substr(3, 3)) -
1355
2.04k
          1;
1356
2.04k
  } catch (boost::bad_lexical_cast &) {
1357
13
    std::ostringstream errout;
1358
13
    errout << "Cannot convert '" << text.substr(3, 3) << "' to int on line "
1359
13
           << line;
1360
13
    throw FileParseException(errout.str());
1361
13
  }
1362
2.02k
  URANGE_CHECK(idx, mol->getNumAtoms());
1363
2.00k
  Atom *at = mol->getAtomWithIdx(idx);
1364
2.00k
  at->setProp(common_properties::molFileAlias, nextLine);
1365
2.00k
}
1366
1367
3.15k
void ParseAtomValue(RWMol *mol, std::string text, unsigned int line) {
1368
3.15k
  PRECONDITION(mol, "bad mol");
1369
3.15k
  PRECONDITION(text.substr(0, 2) == std::string("V "), "bad atom value line");
1370
1371
3.13k
  unsigned int idx;
1372
3.13k
  try {
1373
3.13k
    idx = FileParserUtils::stripSpacesAndCast<unsigned int>(text.substr(3, 3)) -
1374
3.13k
          1;
1375
3.13k
  } catch (boost::bad_lexical_cast &) {
1376
17
    std::ostringstream errout;
1377
17
    errout << "Cannot convert '" << text.substr(3, 3) << "' to int on line"
1378
17
           << line;
1379
17
    throw FileParseException(errout.str());
1380
17
  }
1381
3.11k
  URANGE_CHECK(idx, mol->getNumAtoms());
1382
3.09k
  Atom *at = mol->getAtomWithIdx(idx);
1383
3.09k
  at->setProp(common_properties::molFileValue,
1384
3.09k
              text.substr(7, text.length() - 7));
1385
3.09k
}
1386
1387
namespace {
1388
9.70k
void setRGPProps(const std::string_view symb, Atom *res) {
1389
9.70k
  PRECONDITION(res, "bad atom pointer");
1390
  // set the dummy label so that this is shown correctly
1391
  // in other pieces of the code :
1392
9.70k
  std::string symbc(symb);
1393
9.70k
  res->setProp(common_properties::dummyLabel, symbc);
1394
9.70k
}
1395
1396
void lookupAtomicNumber(Atom *res, const std::string &symb,
1397
133k
                        bool strictParsing) {
1398
133k
  std::string tCopy(symb);
1399
133k
  if (symb.size() == 2 && symb[1] >= 'A' && symb[1] <= 'Z') {
1400
4.99k
    tCopy[1] = static_cast<char>(tolower(symb[1]));
1401
4.99k
  }
1402
133k
  try {
1403
133k
    res->setAtomicNum(PeriodicTable::getTable()->getAtomicNumber(tCopy));
1404
133k
  } catch (const Invar::Invariant &e) {
1405
54.9k
    if (strictParsing || symb.empty()) {
1406
83
      throw FileParseException(e.what());
1407
54.8k
    } else {
1408
54.8k
      res->setAtomicNum(0);
1409
54.8k
      res->setProp(common_properties::dummyLabel, symb);
1410
54.8k
    }
1411
54.9k
  }
1412
133k
}
1413
1414
}  // namespace
1415
1416
Atom *ParseMolFileAtomLine(const std::string_view text, RDGeom::Point3D &pos,
1417
133k
                           unsigned int line, bool strictParsing) {
1418
133k
  std::string symb;
1419
133k
  int massDiff, chg, hCount;
1420
1421
133k
  if ((strictParsing && text.size() < 34) || text.size() < 32) {
1422
80
    std::ostringstream errout;
1423
80
    errout << "Atom line too short: '" << text << "' on line " << line;
1424
80
    throw FileParseException(errout.str());
1425
80
  }
1426
1427
133k
  try {
1428
133k
    pos.x = FileParserUtils::toDouble(text.substr(0, 10));
1429
133k
    pos.y = FileParserUtils::toDouble(text.substr(10, 10));
1430
133k
    pos.z = FileParserUtils::toDouble(text.substr(20, 10));
1431
133k
  } catch (boost::bad_lexical_cast &) {
1432
373
    std::ostringstream errout;
1433
373
    errout << "Cannot process coordinates on line " << line;
1434
373
    throw FileParseException(errout.str());
1435
373
  }
1436
133k
  symb = text.substr(31, 3);
1437
133k
  boost::trim(symb);
1438
1439
  // REVIEW: should we handle missing fields at the end of the line?
1440
133k
  massDiff = 0;
1441
133k
  if (text.size() >= 36 && text.substr(34, 2) != " 0") {
1442
46.9k
    try {
1443
46.9k
      massDiff = FileParserUtils::toInt(text.substr(34, 2), true);
1444
46.9k
    } catch (boost::bad_lexical_cast &) {
1445
46
      std::ostringstream errout;
1446
46
      errout << "Cannot convert '" << text.substr(34, 2) << "' to int on line "
1447
46
             << line;
1448
46
      throw FileParseException(errout.str());
1449
46
    }
1450
46.9k
  }
1451
133k
  chg = 0;
1452
133k
  if (text.size() >= 39 && text.substr(36, 3) != "  0") {
1453
44.0k
    try {
1454
44.0k
      chg = FileParserUtils::toInt(text.substr(36, 3), true);
1455
44.0k
    } catch (boost::bad_lexical_cast &) {
1456
32
      std::ostringstream errout;
1457
32
      errout << "Cannot convert '" << text.substr(36, 3) << "' to int on line "
1458
32
             << line;
1459
32
      throw FileParseException(errout.str());
1460
32
    }
1461
44.0k
  }
1462
133k
  hCount = 0;
1463
133k
  if (text.size() >= 45 && text.substr(42, 3) != "  0") {
1464
36.6k
    try {
1465
36.6k
      hCount = FileParserUtils::toInt(text.substr(42, 3), true);
1466
36.6k
    } catch (boost::bad_lexical_cast &) {
1467
33
      std::ostringstream errout;
1468
33
      errout << "Cannot convert '" << text.substr(42, 3) << "' to int on line "
1469
33
             << line;
1470
33
      throw FileParseException(errout.str());
1471
33
    }
1472
36.6k
  }
1473
133k
  std::unique_ptr<Atom> res(new Atom);
1474
133k
  bool isComplexQueryName =
1475
133k
      std::find(complexQueries.begin(), complexQueries.end(), symb) !=
1476
133k
      complexQueries.end();
1477
133k
  if (isComplexQueryName || symb == "L" || symb == "*" || symb == "LP" ||
1478
128k
      symb == "R" || symb == "R#" ||
1479
126k
      (symb[0] == 'R' && symb >= "R0" && symb <= "R99")) {
1480
7.51k
    if (isComplexQueryName || symb == "*" || symb == "R") {
1481
4.23k
      auto *query = new QueryAtom(0);
1482
4.23k
      if (symb == "*" || symb == "R") {
1483
        // according to the MDL spec, these match anything
1484
1.46k
        query->setQuery(makeAtomNullQuery());
1485
2.77k
      } else if (isComplexQueryName) {
1486
2.77k
        convertComplexNameToQuery(query, symb);
1487
2.77k
      }
1488
4.23k
      res.reset(query);
1489
      // queries have no implicit Hs:
1490
4.23k
      res->setNoImplicit(true);
1491
4.23k
    } else {
1492
3.27k
      res->setAtomicNum(0);
1493
3.27k
    }
1494
7.51k
    if (massDiff == 0 && symb[0] == 'R') {
1495
3.26k
      if (symb.length() > 1 && symb >= "R0" && symb <= "R99") {
1496
606
        std::string rlabel = "";
1497
606
        rlabel = symb.substr(1, symb.length() - 1);
1498
606
        int rnumber;
1499
606
        try {
1500
606
          rnumber = boost::lexical_cast<int>(rlabel);
1501
606
        } catch (boost::bad_lexical_cast &) {
1502
61
          rnumber = -1;
1503
61
        }
1504
606
        if (rnumber >= 0) {
1505
545
          res->setIsotope(rnumber);
1506
545
        }
1507
606
      }
1508
3.26k
    }
1509
7.51k
    if (symb[0] == 'R') {
1510
      // we used to skip R# here because that really should be handled by an
1511
      // RGP spec, but that turned out to not be permissive enough... <sigh>
1512
3.32k
      setRGPProps(symb, res.get());
1513
3.32k
    }
1514
125k
  } else if (symb == "D") {  // mol blocks support "D" and "T" as shorthand...
1515
                             // handle that.
1516
246
    res->setAtomicNum(1);
1517
246
    res->setIsotope(2);
1518
125k
  } else if (symb == "T") {  // mol blocks support "D" and "T" as shorthand...
1519
                             // handle that.
1520
95
    res->setAtomicNum(1);
1521
95
    res->setIsotope(3);
1522
125k
  } else if (symb == "Pol" || symb == "Mod") {
1523
80
    res->setAtomicNum(0);
1524
80
    res->setProp(common_properties::dummyLabel, symb);
1525
125k
  } else if (GenericGroups::genericMatchers.find(symb) !=
1526
125k
             GenericGroups::genericMatchers.end()) {
1527
344
    res.reset(new QueryAtom(0));
1528
344
    res->setProp(common_properties::atomLabel, std::string(symb));
1529
124k
  } else {
1530
124k
    lookupAtomicNumber(res.get(), symb, strictParsing);
1531
124k
  }
1532
1533
  // res->setPos(pX,pY,pZ);
1534
133k
  if (chg != 0) {
1535
15.2k
    res->setFormalCharge(4 - chg);
1536
15.2k
  }
1537
1538
133k
  if (hCount >= 1) {
1539
10.0k
    if (!res->hasQuery()) {
1540
9.54k
      auto qatom = new QueryAtom(*res);
1541
9.54k
      res.reset(qatom);
1542
9.54k
    }
1543
10.0k
    res->setNoImplicit(true);
1544
10.0k
    if (hCount > 1) {
1545
8.13k
      ATOM_EQUALS_QUERY *oq = makeAtomImplicitHCountQuery(hCount - 1);
1546
8.13k
      auto nq = makeAtomSimpleQuery<ATOM_LESSEQUAL_QUERY>(
1547
8.13k
          hCount - 1, oq->getDataFunc(),
1548
8.13k
          std::string("less_") + oq->getDescription());
1549
8.13k
      res->expandQuery(nq);
1550
8.13k
      delete oq;
1551
8.13k
    } else {
1552
1.87k
      res->expandQuery(makeAtomImplicitHCountQuery(0));
1553
1.87k
    }
1554
10.0k
  }
1555
1556
133k
  if (massDiff != 0) {
1557
11.3k
    int defIso =
1558
11.3k
        PeriodicTable::getTable()->getMostCommonIsotope(res->getAtomicNum());
1559
11.3k
    int dIso = defIso + massDiff;
1560
11.3k
    if (dIso < 0) {
1561
469
      BOOST_LOG(rdWarningLog)
1562
0
          << " atom " << res->getIdx()
1563
0
          << " has a negative isotope offset. line:  " << line << std::endl;
1564
469
    }
1565
11.3k
    res->setIsotope(dIso);
1566
11.3k
  }
1567
1568
133k
  if (text.size() >= 42 && text.substr(39, 3) != "  0") {
1569
37.3k
    int parity = 0;
1570
37.3k
    try {
1571
37.3k
      parity = FileParserUtils::toInt(text.substr(39, 3), true);
1572
37.3k
    } catch (boost::bad_lexical_cast &) {
1573
25
      std::ostringstream errout;
1574
25
      errout << "Cannot convert '" << text.substr(39, 3) << "' to int on line "
1575
25
             << line;
1576
25
      throw FileParseException(errout.str());
1577
25
    }
1578
37.3k
    res->setProp(common_properties::molParity, parity);
1579
37.3k
  }
1580
1581
133k
  if (text.size() >= 48 && text.substr(45, 3) != "  0") {
1582
28.5k
    int stereoCare = 0;
1583
28.5k
    try {
1584
28.5k
      stereoCare = FileParserUtils::toInt(text.substr(45, 3), true);
1585
28.5k
    } catch (boost::bad_lexical_cast &) {
1586
26
      std::ostringstream errout;
1587
26
      errout << "Cannot convert '" << text.substr(45, 3) << "' to int on line "
1588
26
             << line;
1589
26
      throw FileParseException(errout.str());
1590
26
    }
1591
28.5k
    res->setProp(common_properties::molStereoCare, stereoCare);
1592
28.5k
  }
1593
132k
  if (text.size() >= 51 && text.substr(48, 3) != "  0") {
1594
24.5k
    int totValence = 0;
1595
24.5k
    try {
1596
24.5k
      totValence = FileParserUtils::toInt(text.substr(48, 3), true);
1597
24.5k
    } catch (boost::bad_lexical_cast &) {
1598
23
      std::ostringstream errout;
1599
23
      errout << "Cannot convert '" << text.substr(48, 3) << "' to int on line "
1600
23
             << line;
1601
23
      throw FileParseException(errout.str());
1602
23
    }
1603
24.5k
    if (totValence != 0) {
1604
      // only set if it's a non-default value
1605
6.28k
      res->setProp(common_properties::molTotValence, totValence);
1606
6.28k
    }
1607
24.5k
  }
1608
132k
  if (text.size() >= 57 && text.substr(54, 3) != "  0") {
1609
20.3k
    int rxnRole = 0;
1610
20.3k
    try {
1611
20.3k
      rxnRole = FileParserUtils::toInt(text.substr(54, 3), true);
1612
20.3k
    } catch (boost::bad_lexical_cast &) {
1613
27
      std::ostringstream errout;
1614
27
      errout << "Cannot convert '" << text.substr(54, 3) << "' to int on line "
1615
27
             << line;
1616
27
      throw FileParseException(errout.str());
1617
27
    }
1618
20.3k
    if (rxnRole != 0) {
1619
      // only set if it's a non-default value
1620
3.82k
      res->setProp(common_properties::molRxnRole, rxnRole);
1621
3.82k
    }
1622
20.3k
  }
1623
132k
  if (text.size() >= 60 && text.substr(57, 3) != "  0") {
1624
21.6k
    int rxnComponent = 0;
1625
21.6k
    try {
1626
21.6k
      rxnComponent = FileParserUtils::toInt(text.substr(57, 3), true);
1627
21.6k
    } catch (boost::bad_lexical_cast &) {
1628
23
      std::ostringstream errout;
1629
23
      errout << "Cannot convert '" << text.substr(57, 3) << "' to int on line "
1630
23
             << line;
1631
23
      throw FileParseException(errout.str());
1632
23
    }
1633
21.5k
    if (rxnComponent != 0) {
1634
      // only set if it's a non-default value
1635
4.82k
      res->setProp(common_properties::molRxnComponent, rxnComponent);
1636
4.82k
    }
1637
21.5k
  }
1638
132k
  if (text.size() >= 63 && text.substr(60, 3) != "  0") {
1639
19.7k
    int atomMapNumber = 0;
1640
19.7k
    try {
1641
19.7k
      atomMapNumber = FileParserUtils::toInt(text.substr(60, 3), true);
1642
19.7k
    } catch (boost::bad_lexical_cast &) {
1643
23
      std::ostringstream errout;
1644
23
      errout << "Cannot convert '" << text.substr(60, 3) << "' to int on line "
1645
23
             << line;
1646
23
      throw FileParseException(errout.str());
1647
23
    }
1648
19.7k
    res->setProp(common_properties::molAtomMapNumber, atomMapNumber);
1649
19.7k
  }
1650
132k
  if (text.size() >= 66 && text.substr(63, 3) != "  0") {
1651
17.3k
    int inversionFlag = 0;
1652
17.3k
    try {
1653
17.3k
      inversionFlag = FileParserUtils::toInt(text.substr(63, 3), true);
1654
17.3k
    } catch (boost::bad_lexical_cast &) {
1655
37
      std::ostringstream errout;
1656
37
      errout << "Cannot convert '" << text.substr(63, 3) << "' to int on line "
1657
37
             << line;
1658
37
      throw FileParseException(errout.str());
1659
37
    }
1660
17.3k
    res->setProp(common_properties::molInversionFlag, inversionFlag);
1661
17.3k
  }
1662
132k
  if (text.size() >= 69 && text.substr(66, 3) != "  0") {
1663
14.4k
    int exactChangeFlag = 0;
1664
14.4k
    try {
1665
14.4k
      exactChangeFlag = FileParserUtils::toInt(text.substr(66, 3), true);
1666
14.4k
    } catch (boost::bad_lexical_cast &) {
1667
21
      std::ostringstream errout;
1668
21
      errout << "Cannot convert '" << text.substr(66, 3) << "' to int on line "
1669
21
             << line;
1670
21
      throw FileParseException(errout.str());
1671
21
    }
1672
14.4k
    res->setProp(common_properties::molRxnExactChange, exactChangeFlag);
1673
14.4k
  }
1674
132k
  return res.release();
1675
132k
}
1676
1677
122k
Bond *ParseMolFileBondLine(const std::string_view text, unsigned int line) {
1678
122k
  unsigned int idx1, idx2, bType, stereo;
1679
122k
  int spos = 0;
1680
1681
122k
  if (text.size() < 9) {
1682
17
    std::ostringstream errout;
1683
17
    errout << "Bond line too short: '" << text << "' on line " << line;
1684
17
    throw FileParseException(errout.str());
1685
17
  }
1686
1687
122k
  try {
1688
122k
    idx1 = FileParserUtils::toUnsigned(text.substr(spos, 3));
1689
122k
    spos += 3;
1690
122k
    idx2 = FileParserUtils::toUnsigned(text.substr(spos, 3));
1691
122k
    spos += 3;
1692
122k
    bType = FileParserUtils::toUnsigned(text.substr(spos, 3));
1693
122k
  } catch (boost::bad_lexical_cast &) {
1694
29
    std::ostringstream errout;
1695
29
    errout << "Cannot convert '" << text.substr(spos, 3) << "' to int on line "
1696
29
           << line;
1697
29
    throw FileParseException(errout.str());
1698
29
  }
1699
1700
  // adjust the numbering
1701
122k
  idx1--;
1702
122k
  idx2--;
1703
1704
122k
  Bond::BondType type;
1705
122k
  Bond *res = nullptr;
1706
122k
  switch (bType) {
1707
65.3k
    case 1:
1708
65.3k
      type = Bond::SINGLE;
1709
65.3k
      res = new Bond;
1710
65.3k
      break;
1711
14.6k
    case 2:
1712
14.6k
      type = Bond::DOUBLE;
1713
14.6k
      res = new Bond;
1714
14.6k
      break;
1715
1.68k
    case 3:
1716
1.68k
      type = Bond::TRIPLE;
1717
1.68k
      res = new Bond;
1718
1.68k
      break;
1719
4.55k
    case 4:
1720
4.55k
      type = Bond::AROMATIC;
1721
4.55k
      res = new Bond;
1722
4.55k
      break;
1723
1.46k
    case 9:
1724
1.46k
      type = Bond::DATIVE;
1725
1.46k
      res = new Bond;
1726
1.46k
      break;
1727
19.9k
    case 0:
1728
19.9k
      type = Bond::UNSPECIFIED;
1729
19.9k
      res = new Bond;
1730
19.9k
      BOOST_LOG(rdWarningLog)
1731
0
          << "bond with order 0 found on line " << line
1732
0
          << ". This is not part of the MDL specification." << std::endl;
1733
19.9k
      break;
1734
14.7k
    default:
1735
14.7k
      type = Bond::UNSPECIFIED;
1736
      // it's a query bond of some type
1737
14.7k
      res = new QueryBond;
1738
14.7k
      if (bType == 8) {
1739
904
        BOND_NULL_QUERY *q;
1740
904
        q = makeBondNullQuery();
1741
904
        res->setQuery(q);
1742
13.8k
      } else if (bType == 5) {
1743
1.50k
        res->setQuery(makeSingleOrDoubleBondQuery());
1744
1.50k
        res->setProp(common_properties::_MolFileBondQuery, 1);
1745
12.3k
      } else if (bType == 6) {
1746
1.52k
        res->setQuery(makeSingleOrAromaticBondQuery());
1747
1.52k
        res->setProp(common_properties::_MolFileBondQuery, 1);
1748
10.8k
      } else if (bType == 7) {
1749
1.40k
        res->setQuery(makeDoubleOrAromaticBondQuery());
1750
1.40k
        res->setProp(common_properties::_MolFileBondQuery, 1);
1751
9.44k
      } else {
1752
9.44k
        BOND_NULL_QUERY *q;
1753
9.44k
        q = makeBondNullQuery();
1754
9.44k
        res->setQuery(q);
1755
9.44k
        BOOST_LOG(rdWarningLog)
1756
0
            << "unrecognized query bond type, " << bType << ", found on line "
1757
0
            << line << ". Using an \"any\" query." << std::endl;
1758
9.44k
      }
1759
14.7k
      break;
1760
122k
  }
1761
122k
  res->setBeginAtomIdx(idx1);
1762
122k
  res->setEndAtomIdx(idx2);
1763
122k
  res->setBondType(type);
1764
122k
  res->setProp(common_properties::_MolFileBondType, bType);
1765
1766
122k
  if (text.size() >= 12 && text.substr(9, 3) != "  0") {
1767
52.4k
    try {
1768
52.4k
      stereo = FileParserUtils::toUnsigned(text.substr(9, 3));
1769
52.4k
      switch (stereo) {
1770
6.84k
        case 0:
1771
6.84k
          res->setBondDir(Bond::NONE);
1772
6.84k
          break;
1773
14.8k
        case 1:
1774
14.8k
          res->setBondDir(Bond::BEGINWEDGE);
1775
14.8k
          break;
1776
4.28k
        case 6:
1777
4.28k
          res->setBondDir(Bond::BEGINDASH);
1778
4.28k
          break;
1779
567
        case 3:  // "either" double bond
1780
567
          res->setBondDir(Bond::EITHERDOUBLE);
1781
567
          res->setStereo(Bond::STEREOANY);
1782
567
          break;
1783
1.07k
        case 4:  // "either" single bond
1784
1.07k
          res->setBondDir(Bond::UNKNOWN);
1785
1.07k
          break;
1786
52.4k
      }
1787
33.2k
      res->setProp(common_properties::_MolFileBondStereo, stereo);
1788
33.2k
    } catch (boost::bad_lexical_cast &) {
1789
19.2k
      ;
1790
19.2k
    }
1791
52.4k
  }
1792
122k
  if (text.size() >= 18 && text.substr(15, 3) != "  0") {
1793
9.25k
    try {
1794
9.25k
      int topology = FileParserUtils::toInt(text.substr(15, 3));
1795
9.25k
      if (topology) {
1796
1.69k
        if (!res->hasQuery()) {
1797
1.06k
          auto *qBond = new QueryBond(*res);
1798
1.06k
          delete res;
1799
1.06k
          res = qBond;
1800
1.06k
        }
1801
1.69k
        BOND_EQUALS_QUERY *q = makeBondIsInRingQuery();
1802
1.69k
        switch (topology) {
1803
1.11k
          case 1:
1804
1.11k
            break;
1805
552
          case 2:
1806
552
            q->setNegation(true);
1807
552
            break;
1808
28
          default:
1809
28
            std::ostringstream errout;
1810
28
            errout << "Unrecognized bond topology specifier: " << topology
1811
28
                   << " on line " << line;
1812
28
            throw FileParseException(errout.str());
1813
1.69k
        }
1814
1.66k
        res->expandQuery(q);
1815
1.66k
      }
1816
9.25k
    } catch (boost::bad_lexical_cast &) {
1817
4.43k
      ;
1818
4.43k
    }
1819
9.25k
  }
1820
122k
  if (text.size() >= 21 && text.substr(18, 3) != "  0") {
1821
6.75k
    try {
1822
6.75k
      int reactStatus = FileParserUtils::toInt(text.substr(18, 3));
1823
6.75k
      res->setProp("molReactStatus", reactStatus);
1824
6.75k
    } catch (boost::bad_lexical_cast &) {
1825
3.91k
      ;
1826
3.91k
    }
1827
6.75k
  }
1828
122k
  return res;
1829
122k
}  // namespace
1830
1831
void ParseMolBlockAtoms(std::istream *inStream, unsigned int &line,
1832
                        unsigned int nAtoms, RWMol *mol, Conformer *conf,
1833
14.8k
                        bool strictParsing) {
1834
14.8k
  PRECONDITION(inStream, "bad stream");
1835
14.8k
  PRECONDITION(mol, "bad molecule");
1836
14.8k
  PRECONDITION(conf, "bad conformer");
1837
148k
  for (unsigned int i = 1; i <= nAtoms; ++i) {
1838
134k
    ++line;
1839
134k
    std::string tempStr = getLine(inStream);
1840
134k
    if (inStream->eof()) {
1841
436
      throw FileParseException("EOF hit while reading atoms");
1842
436
    }
1843
133k
    RDGeom::Point3D pos;
1844
133k
    Atom *atom = ParseMolFileAtomLine(tempStr, pos, line, strictParsing);
1845
133k
    unsigned int aid = mol->addAtom(atom, false, true);
1846
133k
    conf->setAtomPos(aid, pos);
1847
133k
    mol->setAtomBookmark(atom, i);
1848
133k
  }
1849
14.8k
}
1850
1851
void ParseMolBlockBonds(std::istream *inStream, unsigned int &line,
1852
                        unsigned int nBonds, RWMol *mol,
1853
17.2k
                        bool &chiralityPossible) {
1854
17.2k
  PRECONDITION(inStream, "bad stream");
1855
17.2k
  PRECONDITION(mol, "bad molecule");
1856
139k
  for (unsigned int i = 1; i <= nBonds; ++i) {
1857
122k
    ++line;
1858
122k
    std::string tempStr = getLine(inStream);
1859
122k
    if (inStream->eof()) {
1860
55
      throw FileParseException("EOF hit while reading bonds");
1861
55
    }
1862
122k
    Bond *bond = ParseMolFileBondLine(tempStr, line);
1863
    // if we got an aromatic bond set the flag on the bond and the connected
1864
    // atoms
1865
122k
    if (bond->getBondType() == Bond::AROMATIC) {
1866
4.55k
      bond->setIsAromatic(true);
1867
4.55k
    }
1868
    // if the bond might have chirality info associated with it, set a flag:
1869
122k
    if (bond->getBondDir() != Bond::NONE &&
1870
20.7k
        bond->getBondDir() != Bond::UNKNOWN) {
1871
19.6k
      chiralityPossible = true;
1872
19.6k
    }
1873
    // v2k has no way to set stereoCare on bonds, so set the property if both
1874
    // the beginning and end atoms have it set:
1875
122k
    int care1 = 0;
1876
122k
    int care2 = 0;
1877
122k
    if (!bond->hasProp(common_properties::molStereoCare) &&
1878
122k
        mol->getAtomWithIdx(bond->getBeginAtomIdx())
1879
122k
            ->getPropIfPresent(common_properties::molStereoCare, care1) &&
1880
27.1k
        mol->getAtomWithIdx(bond->getEndAtomIdx())
1881
27.1k
            ->getPropIfPresent(common_properties::molStereoCare, care2)) {
1882
9.18k
      if (care1 && care2) {
1883
367
        bond->setProp(common_properties::molStereoCare, 1);
1884
367
      }
1885
9.18k
    }
1886
122k
    mol->addBond(bond, true);
1887
122k
    mol->setBondBookmark(bond, i);
1888
122k
  }
1889
17.2k
}
1890
1891
bool checkAttachmentPointsAreValid(
1892
3.06k
    const RWMol *mol, std::pair<const int, SubstanceGroup> &sgroup) {
1893
3.06k
  bool res = true;
1894
3.06k
  int nAtoms = static_cast<int>(mol->getNumAtoms());
1895
3.06k
  std::vector<SubstanceGroup::AttachPoint> &attachPoints =
1896
3.06k
      sgroup.second.getAttachPoints();
1897
6.74k
  for (auto &attachPoint : attachPoints) {
1898
6.74k
    if (attachPoint.lvIdx == nAtoms) {
1899
555
      const std::vector<unsigned int> &bonds = sgroup.second.getBonds();
1900
555
      if (bonds.size() == 1) {
1901
304
        const auto bond = mol->getBondWithIdx(bonds.front());
1902
304
        if (bond->getBeginAtomIdx() == attachPoint.aIdx ||
1903
283
            bond->getEndAtomIdx() == attachPoint.aIdx) {
1904
283
          attachPoint.lvIdx = bond->getOtherAtomIdx(attachPoint.aIdx);
1905
283
        }
1906
304
      }
1907
555
    }
1908
6.74k
    if (attachPoint.lvIdx == nAtoms) {
1909
272
      BOOST_LOG(rdWarningLog)
1910
0
          << "Could not infer missing lvIdx on malformed SAP line for SGroup "
1911
0
          << sgroup.first << std::endl;
1912
272
      res = false;
1913
272
    }
1914
6.74k
  }
1915
3.06k
  return res;
1916
3.06k
}
1917
1918
bool ParseMolBlockProperties(std::istream *inStream, unsigned int &line,
1919
16.9k
                             RWMol *mol, bool strictParsing) {
1920
16.9k
  PRECONDITION(inStream, "bad stream");
1921
16.9k
  PRECONDITION(mol, "bad molecule");
1922
  // older mol files can have an atom list block here
1923
16.9k
  std::string tempStr = getLine(inStream);
1924
16.9k
  ++line;
1925
  // there is apparently some software out there that puts a
1926
  // blank line in mol blocks before the "M  END". If we aren't
1927
  // doing strict parsing, deal with that here.
1928
16.9k
  if (!tempStr.size()) {
1929
872
    if (!strictParsing) {
1930
861
      tempStr = getLine(inStream);
1931
861
      ++line;
1932
861
    } else {
1933
11
      std::ostringstream errout;
1934
11
      errout << "Problems encountered parsing Mol data, unexpected blank line "
1935
11
                "found at line "
1936
11
             << line;
1937
11
      throw FileParseException(errout.str());
1938
11
    }
1939
16.0k
  } else {
1940
16.0k
    if (tempStr[0] != 'M' && tempStr[0] != 'A' && tempStr[0] != 'V' &&
1941
585
        tempStr[0] != 'G' && tempStr[0] != 'S') {
1942
253
      ParseOldAtomList(mol, std::string_view(tempStr.c_str()), line);
1943
253
    }
1944
16.0k
  }
1945
1946
16.9k
  IDX_TO_SGROUP_MAP sGroupMap;
1947
16.9k
  IDX_TO_STR_VECT_MAP dataFieldsMap;
1948
16.9k
  bool fileComplete = false;
1949
16.9k
  bool firstChargeLine = true;
1950
16.9k
  unsigned int SCDcounter = 0;
1951
16.9k
  unsigned int lastDataSGroup = 0;
1952
16.9k
  std::ostringstream currentDataField;
1953
16.9k
  std::string lineBeg = tempStr.substr(0, 6);
1954
702k
  while (!inStream->eof() && !inStream->fail() && lineBeg != "M  END" &&
1955
685k
         tempStr.substr(0, 4) != "$$$$") {
1956
685k
    if (tempStr[0] == 'A') {
1957
2.06k
      line++;
1958
2.06k
      std::string nextLine = getLine(inStream);
1959
2.06k
      if (lineBeg != "M  END") {
1960
2.06k
        ParseAtomAlias(mol, tempStr, nextLine, line);
1961
2.06k
      }
1962
683k
    } else if (tempStr[0] == 'G') {
1963
631
      BOOST_LOG(rdWarningLog)
1964
0
          << " deprecated group abbreviation ignored on line " << line
1965
0
          << std::endl;
1966
      // we need to skip the next line, which holds the abbreviation:
1967
631
      line++;
1968
631
      tempStr = getLine(inStream);
1969
682k
    } else if (tempStr[0] == 'V') {
1970
3.15k
      ParseAtomValue(mol, tempStr, line);
1971
679k
    } else if (lineBeg == "S  SKP") {
1972
1.25k
      int nToSkip = FileParserUtils::toInt(tempStr.substr(6, 3));
1973
1.25k
      if (nToSkip < 0) {
1974
11
        std::ostringstream errout;
1975
11
        errout << "negative skip value " << nToSkip << " on line " << line;
1976
11
        throw FileParseException(errout.str());
1977
11
      }
1978
10.2k
      for (unsigned int i = 0; i < static_cast<unsigned int>(nToSkip); ++i) {
1979
9.05k
        ++line;
1980
9.05k
        tempStr = getLine(inStream);
1981
9.05k
      }
1982
678k
    } else if (lineBeg == "M  ALS") {
1983
3.72k
      ParseNewAtomList(mol, tempStr, line);
1984
674k
    } else if (lineBeg == "M  ISO") {
1985
2.42k
      ParseIsotopeLine(mol, tempStr, line);
1986
671k
    } else if (lineBeg == "M  RGP") {
1987
1.91k
      ParseRGroupLabels(mol, tempStr, line);
1988
669k
    } else if (lineBeg == "M  RBC") {
1989
9.68k
      ParseRingBondCountLine(mol, tempStr, line);
1990
660k
    } else if (lineBeg == "M  SUB") {
1991
3.60k
      ParseSubstitutionCountLine(mol, tempStr, line);
1992
656k
    } else if (lineBeg == "M  UNS") {
1993
2.78k
      ParseUnsaturationLine(mol, tempStr, line);
1994
653k
    } else if (lineBeg == "M  CHG") {
1995
1.69k
      ParseChargeLine(mol, tempStr, firstChargeLine, line);
1996
1.69k
      firstChargeLine = false;
1997
652k
    } else if (lineBeg == "M  RAD") {
1998
3.68k
      ParseRadicalLine(mol, tempStr, firstChargeLine, line);
1999
3.68k
      firstChargeLine = false;
2000
648k
    } else if (lineBeg == "M  PXA") {
2001
4.74k
      ParsePXALine(mol, tempStr, line);
2002
2003
      /* SGroup parsing start */
2004
643k
    } else if (lineBeg == "M  STY") {
2005
34.4k
      ParseSGroupV2000STYLine(sGroupMap, mol, tempStr, line, strictParsing);
2006
609k
    } else if (lineBeg == "M  SST") {
2007
4.48k
      ParseSGroupV2000SSTLine(sGroupMap, mol, tempStr, line, strictParsing);
2008
604k
    } else if (lineBeg == "M  SLB") {
2009
4.02k
      ParseSGroupV2000SLBLine(sGroupMap, mol, tempStr, line, strictParsing);
2010
600k
    } else if (lineBeg == "M  SCN") {
2011
8.16k
      ParseSGroupV2000SCNLine(sGroupMap, mol, tempStr, line, strictParsing);
2012
592k
    } else if (lineBeg == "M  SDS") {
2013
2.70k
      ParseSGroupV2000SDSLine(sGroupMap, mol, tempStr, line, strictParsing);
2014
589k
    } else if (lineBeg == "M  SAL" || lineBeg == "M  SBL" ||
2015
564k
               lineBeg == "M  SPA") {
2016
28.3k
      ParseSGroupV2000VectorDataLine(sGroupMap, mol, tempStr, line,
2017
28.3k
                                     strictParsing);
2018
561k
    } else if (lineBeg == "M  SMT") {
2019
12.1k
      ParseSGroupV2000SMTLine(sGroupMap, mol, tempStr, line, strictParsing);
2020
549k
    } else if (lineBeg == "M  SDI") {
2021
22.1k
      ParseSGroupV2000SDILine(sGroupMap, mol, tempStr, line, strictParsing);
2022
527k
    } else if (lineBeg == "M  CRS") {
2023
2
      std::ostringstream errout;
2024
2
      errout << "Unsupported SGroup subtype '" << lineBeg << "' on line "
2025
2
             << line;
2026
2
      throw FileParseException(errout.str());
2027
527k
    } else if (lineBeg == "M  SBV") {
2028
7.49k
      ParseSGroupV2000SBVLine(sGroupMap, mol, tempStr, line, strictParsing);
2029
519k
    } else if (lineBeg == "M  SDT") {
2030
24.2k
      ParseSGroupV2000SDTLine(sGroupMap, mol, tempStr, line, strictParsing);
2031
495k
    } else if (lineBeg == "M  SDD") {
2032
2.75k
      ParseSGroupV2000SDDLine(sGroupMap, mol, tempStr, line, strictParsing);
2033
492k
    } else if (lineBeg == "M  SCD" || lineBeg == "M  SED") {
2034
54.9k
      ParseSGroupV2000SCDSEDLine(sGroupMap, dataFieldsMap, mol, tempStr, line,
2035
54.9k
                                 strictParsing, SCDcounter, lastDataSGroup,
2036
54.9k
                                 currentDataField);
2037
437k
    } else if (lineBeg == "M  SPL") {
2038
6.36k
      ParseSGroupV2000SPLLine(sGroupMap, mol, tempStr, line, strictParsing);
2039
431k
    } else if (lineBeg == "M  SNC") {
2040
5.07k
      ParseSGroupV2000SNCLine(sGroupMap, mol, tempStr, line, strictParsing);
2041
426k
    } else if (lineBeg == "M  SAP") {
2042
17.3k
      ParseSGroupV2000SAPLine(sGroupMap, mol, tempStr, line, strictParsing);
2043
408k
    } else if (lineBeg == "M  SCL") {
2044
4.74k
      ParseSGroupV2000SCLLine(sGroupMap, mol, tempStr, line, strictParsing);
2045
404k
    } else if (lineBeg == "M  SBT") {
2046
5.35k
      ParseSGroupV2000SBTLine(sGroupMap, mol, tempStr, line, strictParsing);
2047
2048
      /* SGroup parsing end */
2049
398k
    } else if (lineBeg == "M  ZBO") {
2050
1.34k
      ParseZBOLine(mol, tempStr, line);
2051
397k
    } else if (lineBeg == "M  ZCH") {
2052
1.60k
      ParseZCHLine(mol, tempStr, line);
2053
395k
    } else if (lineBeg == "M  HYD") {
2054
1.68k
      ParseHYDLine(mol, tempStr, line);
2055
394k
    } else if (lineBeg == "M  MRV") {
2056
35.8k
      ParseMarvinSmartsLine(mol, tempStr, line);
2057
358k
    } else if (lineBeg == "M  APO") {
2058
1.55k
      ParseAttachPointLine(mol, tempStr, line, strictParsing);
2059
356k
    } else if (lineBeg == "M  LIN") {
2060
1.31k
      ParseLinkNodeLine(mol, tempStr, line);
2061
1.31k
    }
2062
685k
    line++;
2063
685k
    tempStr = getLine(inStream);
2064
685k
    lineBeg = tempStr.substr(0, 6);
2065
685k
  }
2066
16.9k
  if (tempStr[0] == 'M' && tempStr.substr(0, 6) == "M  END") {
2067
    // All went well, make final updates to SGroups, and add them to Mol
2068
4.88k
    for (auto &sgroup : sGroupMap) {
2069
3.21k
      if (sgroup.second.getIsValid()) {
2070
3.06k
        sgroup.second.setProp("DATAFIELDS", dataFieldsMap[sgroup.first]);
2071
3.06k
        sgroup.second.setIsValid(checkAttachmentPointsAreValid(mol, sgroup));
2072
3.06k
      }
2073
3.21k
      if (sgroup.second.getIsValid()) {
2074
3.04k
        addSubstanceGroup(*mol, sgroup.second);
2075
3.04k
      } else {
2076
167
        std::ostringstream errout;
2077
167
        errout << "SGroup " << sgroup.first << " is invalid";
2078
167
        if (strictParsing) {
2079
0
          throw FileParseException(errout.str());
2080
167
        } else {
2081
167
          BOOST_LOG(rdWarningLog)
2082
0
              << errout.str() << " and will be ignored" << std::endl;
2083
167
        }
2084
167
      }
2085
3.21k
    }
2086
2087
4.88k
    fileComplete = true;
2088
4.88k
  }
2089
16.9k
  return fileComplete;
2090
16.9k
}
2091
2092
Atom *ParseV3000AtomSymbol(std::string_view token, unsigned int &line,
2093
15.8k
                           bool strictParsing) {
2094
15.8k
  bool negate = false;
2095
15.8k
  token = FileParserUtils::strip(token);
2096
15.8k
  if (token.size() > 3 && (token[0] == 'N' || token[0] == 'n') &&
2097
1.66k
      (token[1] == 'O' || token[1] == 'o') &&
2098
753
      (token[2] == 'T' || token[2] == 't')) {
2099
9
    negate = true;
2100
9
    token = token.substr(3, token.size() - 3);
2101
9
    token = FileParserUtils::strip(token);
2102
9
  }
2103
2104
15.8k
  std::unique_ptr<Atom> res;
2105
15.8k
  if (token[0] == '[') {
2106
    // atom list:
2107
127
    if (token.back() != ']') {
2108
14
      std::ostringstream errout;
2109
14
      errout << "Bad atom token '" << token << "' on line: " << line;
2110
14
      throw FileParseException(errout.str());
2111
14
    }
2112
113
    token = token.substr(1, token.size() - 2);
2113
2114
113
    std::vector<std::string> splitToken;
2115
113
    boost::split(splitToken, token, boost::is_any_of(","));
2116
2117
113
    for (std::vector<std::string>::const_iterator stIt = splitToken.begin();
2118
2.64k
         stIt != splitToken.end(); ++stIt) {
2119
2.53k
      std::string_view stoken = *stIt;
2120
2.53k
      std::string atSymb(FileParserUtils::strip(stoken));
2121
2.53k
      if (atSymb.empty()) {
2122
1.72k
        continue;
2123
1.72k
      }
2124
813
      if (atSymb.size() == 2 && atSymb[1] >= 'A' && atSymb[1] <= 'Z') {
2125
7
        atSymb[1] = static_cast<char>(tolower(atSymb[1]));
2126
7
      }
2127
2128
813
      int atNum = PeriodicTable::getTable()->getAtomicNumber(atSymb);
2129
813
      if (!res) {
2130
41
        res.reset(new QueryAtom(atNum));
2131
772
      } else {
2132
772
        res->expandQuery(makeAtomNumQuery(atNum), Queries::COMPOSITE_OR, true);
2133
772
      }
2134
      // we want the atomic number of the query itself to always be zero
2135
      // this was Github #8820 and #8823
2136
813
      res->setAtomicNum(0);
2137
813
    }
2138
113
    res->getQuery()->setNegation(negate);
2139
15.7k
  } else {
2140
15.7k
    if (negate) {
2141
9
      std::ostringstream errout;
2142
9
      errout << "NOT tokens only supported for atom lists. line " << line;
2143
9
      throw FileParseException(errout.str());
2144
9
    }
2145
    // it's a normal CTAB atom symbol:
2146
    // NOTE: "R" and "R0"-"R99" are not in the v3K CTAB spec, but we're going to
2147
    // support them anyway
2148
15.7k
    bool isComplexQueryName =
2149
15.7k
        std::find(complexQueries.begin(), complexQueries.end(), token) !=
2150
15.7k
        complexQueries.end();
2151
15.7k
    if (isComplexQueryName || token == "R" ||
2152
15.3k
        (token[0] == 'R' && token >= "R0" && token <= "R99") || token == "R#" ||
2153
8.95k
        token == "*") {
2154
6.84k
      if (isComplexQueryName || token == "*") {
2155
470
        res.reset(new QueryAtom(0));
2156
470
        if (token == "*") {
2157
          // according to the MDL spec, these match anything
2158
71
          res->setQuery(makeAtomNullQuery());
2159
399
        } else if (isComplexQueryName) {
2160
399
          convertComplexNameToQuery(res.get(), token);
2161
399
        }
2162
        // queries have no implicit Hs:
2163
470
        res->setNoImplicit(true);
2164
6.37k
      } else {
2165
6.37k
        res.reset(new Atom(1));
2166
6.37k
        res->setAtomicNum(0);
2167
6.37k
      }
2168
6.84k
      if (token[0] == 'R' && token >= "R0" && token <= "R99") {
2169
6.29k
        auto rlabel = token.substr(1, token.length() - 1);
2170
6.29k
        int rnumber;
2171
6.29k
        try {
2172
6.29k
          rnumber = boost::lexical_cast<int>(rlabel);
2173
6.29k
        } catch (boost::bad_lexical_cast &) {
2174
4.13k
          rnumber = -1;
2175
4.13k
        }
2176
6.29k
        if (rnumber >= 0) {
2177
2.15k
          res->setIsotope(rnumber);
2178
2.15k
        }
2179
6.29k
      }
2180
6.84k
      if (token[0] == 'R') {
2181
        // we used to skip R# here because that really should be handled by an
2182
        // RGP spec, but that turned out to not be permissive enough... <sigh>
2183
6.37k
        setRGPProps(token, res.get());
2184
6.37k
      }
2185
8.88k
    } else if (token == "D") {  // mol blocks support "D" and "T" as
2186
                                // shorthand... handle that.
2187
78
      res.reset(new Atom(1));
2188
78
      res->setIsotope(2);
2189
8.80k
    } else if (token == "T") {  // mol blocks support "D" and "T" as
2190
                                // shorthand... handle that.
2191
2
      res.reset(new Atom(1));
2192
2
      res->setIsotope(3);
2193
8.80k
    } else if (token == "Pol" || token == "Mod") {
2194
26
      res.reset(new Atom(0));
2195
26
      res->setProp(common_properties::dummyLabel, std::string(token));
2196
8.77k
    } else if (GenericGroups::genericMatchers.find(std::string(token)) !=
2197
8.77k
               GenericGroups::genericMatchers.end()) {
2198
45
      res.reset(new QueryAtom(0));
2199
45
      res->setProp(common_properties::atomLabel, std::string(token));
2200
8.73k
    } else {
2201
8.73k
      std::string tcopy(token);
2202
8.73k
      res.reset(new Atom(0));
2203
8.73k
      lookupAtomicNumber(res.get(), tcopy, strictParsing);
2204
8.73k
    }
2205
15.7k
  }
2206
2207
15.8k
  POSTCONDITION(res, "no atom built");
2208
15.8k
  return res.release();
2209
15.8k
}
2210
2211
bool splitAssignToken(std::string_view token, std::string &prop,
2212
205k
                      std::string_view &val) {
2213
205k
  auto equalsLoc = token.find("=");
2214
205k
  if (equalsLoc == token.npos || equalsLoc != token.rfind("=")) {
2215
941
    return false;
2216
941
  }
2217
204k
  prop = token.substr(0, equalsLoc);
2218
204k
  boost::to_upper(prop);
2219
204k
  val = token.substr(equalsLoc + 1);
2220
204k
  return true;
2221
205k
}
2222
2223
template <class T>
2224
void ParseV3000AtomProps(RWMol *mol, Atom *&atom, typename T::iterator &token,
2225
                         const T &tokens, unsigned int &line,
2226
15.2k
                         bool strictParsing) {
2227
15.2k
  PRECONDITION(mol, "bad molecule");
2228
15.2k
  PRECONDITION(atom, "bad atom");
2229
15.2k
  std::ostringstream errout;
2230
131k
  while (token != tokens.end()) {
2231
116k
    std::string prop;
2232
116k
    std::string_view val;
2233
116k
    if (!splitAssignToken(*token, prop, val)) {
2234
465
      errout << "Invalid atom property: '" << *token << "' for atom "
2235
465
             << atom->getIdx() + 1 << " on line " << line << std::endl;
2236
465
      throw FileParseException(errout.str());
2237
465
    }
2238
2239
116k
    if (prop == "CHG") {
2240
487
      auto charge = FileParserUtils::toInt(val);
2241
487
      if (!atom->hasQuery()) {
2242
408
        atom->setFormalCharge(charge);
2243
408
      } else {
2244
79
        atom->expandQuery(makeAtomFormalChargeQuery(charge));
2245
79
      }
2246
115k
    } else if (prop == "RAD") {
2247
      // FIX handle queries here
2248
1.28k
      switch (FileParserUtils::toInt(val)) {
2249
116
        case 0:
2250
116
          break;
2251
12
        case 1:
2252
12
          atom->setNumRadicalElectrons(2);
2253
12
          break;
2254
87
        case 2:
2255
87
          atom->setNumRadicalElectrons(1);
2256
87
          break;
2257
1.05k
        case 3:
2258
1.05k
          atom->setNumRadicalElectrons(2);
2259
1.05k
          break;
2260
7
        default:
2261
7
          errout << "Unrecognized RAD value " << val << " for atom "
2262
7
                 << atom->getIdx() + 1 << " on line " << line << std::endl;
2263
7
          throw FileParseException(errout.str());
2264
1.28k
      }
2265
114k
    } else if (prop == "MASS") {
2266
      // the documentation for V3000 CTABs says that this should contain the
2267
      // "absolute atomic weight" (whatever that means).
2268
      // Online examples seem to have integer (isotope) values and Marvin
2269
      // won't even read something that has a float. We'll go with the int
2270
2.16k
      int v;
2271
2.16k
      double dv;
2272
2.16k
      try {
2273
2.16k
        v = FileParserUtils::toInt(val);
2274
2.16k
      } catch (boost::bad_lexical_cast &) {
2275
1.35k
        try {
2276
1.35k
          dv = FileParserUtils::toDouble(val);
2277
1.35k
          v = static_cast<int>(floor(dv));
2278
1.35k
        } catch (boost::bad_lexical_cast &) {
2279
17
          v = -1;
2280
17
        }
2281
1.35k
      }
2282
2.16k
      if (v < 0) {
2283
20
        errout << "Bad value for MASS :" << val << " for atom "
2284
20
               << atom->getIdx() + 1 << " on line " << line << std::endl;
2285
20
        throw FileParseException(errout.str());
2286
2.14k
      } else {
2287
2.14k
        if (!atom->hasQuery()) {
2288
1.64k
          atom->setIsotope(v);
2289
1.64k
        } else {
2290
497
          atom->expandQuery(makeAtomIsotopeQuery(v));
2291
497
        }
2292
2.14k
      }
2293
112k
    } else if (prop == "CFG") {
2294
9.25k
      auto cfg = FileParserUtils::toInt(val);
2295
9.25k
      switch (cfg) {
2296
2.78k
        case 0:
2297
2.78k
          break;
2298
1.09k
        case 1:
2299
5.33k
        case 2:
2300
6.44k
        case 3:
2301
6.44k
          atom->setProp(common_properties::molParity, cfg);
2302
6.44k
          break;
2303
13
        default:
2304
13
          errout << "Unrecognized CFG value : " << val << " for atom "
2305
13
                 << atom->getIdx() + 1 << " on line " << line << std::endl;
2306
13
          throw FileParseException(errout.str());
2307
9.25k
      }
2308
103k
    } else if (prop == "HCOUNT") {
2309
3.22k
      if (val != "0") {
2310
3.12k
        auto hcount = FileParserUtils::toInt(val);
2311
3.12k
        if (!atom->hasQuery()) {
2312
113
          atom = QueryOps::replaceAtomWithQueryAtom(mol, atom);
2313
113
        }
2314
3.12k
        if (hcount == -1) {
2315
630
          hcount = 0;
2316
630
        }
2317
3.12k
        if (hcount > 0) {
2318
1.59k
          ATOM_EQUALS_QUERY *oq = makeAtomImplicitHCountQuery(hcount);
2319
1.59k
          auto nq = makeAtomSimpleQuery<ATOM_LESSEQUAL_QUERY>(
2320
1.59k
              hcount, oq->getDataFunc(),
2321
1.59k
              std::string("less_") + oq->getDescription());
2322
1.59k
          atom->expandQuery(nq);
2323
1.59k
          delete oq;
2324
1.59k
        } else {
2325
1.52k
          atom->expandQuery(makeAtomImplicitHCountQuery(0));
2326
1.52k
        }
2327
3.12k
      }
2328
99.8k
    } else if (prop == "UNSAT") {
2329
6.85k
      if (val == "1") {
2330
4.95k
        if (!atom->hasQuery()) {
2331
589
          atom = QueryOps::replaceAtomWithQueryAtom(mol, atom);
2332
589
        }
2333
4.95k
        atom->expandQuery(makeAtomUnsaturatedQuery());
2334
4.95k
      }
2335
93.0k
    } else if (prop == "RBCNT") {
2336
8.33k
      if (val != "0") {
2337
8.23k
        auto rbcount = FileParserUtils::toInt(val);
2338
8.23k
        if (!atom->hasQuery()) {
2339
1.50k
          atom = QueryOps::replaceAtomWithQueryAtom(mol, atom);
2340
1.50k
        }
2341
8.23k
        atom->setProp(common_properties::molRingBondCount, rbcount);
2342
8.23k
        if (rbcount == -1) {
2343
835
          rbcount = 0;
2344
7.40k
        } else if (rbcount == -2) {
2345
          // Ring bonds can only be counted during post processing
2346
51
          mol->setProp(common_properties::_NeedsQueryScan, 1);
2347
51
          rbcount = 0xDEADBEEF;
2348
7.35k
        } else if (rbcount > 4) {
2349
142
          rbcount = 4;
2350
142
        }
2351
8.23k
        atom->expandQuery(makeAtomRingBondCountQuery(rbcount));
2352
8.23k
      }
2353
84.6k
    } else if (prop == "VAL") {
2354
1.42k
      if (val != "0") {
2355
1.23k
        auto totval = FileParserUtils::toInt(val);
2356
1.23k
        atom->setProp(common_properties::molTotValence, totval);
2357
1.23k
      }
2358
83.2k
    } else if (prop == "RGROUPS") {
2359
22
      ParseV3000RGroups(mol, atom, val, line);
2360
      // FIX
2361
83.2k
    } else if (prop == "STBOX") {
2362
6.37k
      if (val != "0") {
2363
4.02k
        auto ival = FileParserUtils::toInt(val);
2364
4.02k
        atom->setProp(common_properties::molStereoCare, ival);
2365
4.02k
      }
2366
76.8k
    } else if (prop == "SUBST") {
2367
1.39k
      if (val != "0") {
2368
1.11k
        auto ival = FileParserUtils::toInt(val);
2369
1.11k
        atom->setProp(common_properties::molSubstCount, ival);
2370
1.11k
      }
2371
75.4k
    } else if (prop == "EXACHG") {
2372
1.55k
      if (val != "0") {
2373
1.20k
        auto ival = FileParserUtils::toInt(val);
2374
1.20k
        atom->setProp(common_properties::molRxnExactChange, ival);
2375
1.20k
      }
2376
73.8k
    } else if (prop == "INVRET") {
2377
3.40k
      if (val != "0") {
2378
2.73k
        auto ival = FileParserUtils::toInt(val);
2379
2.73k
        atom->setProp(common_properties::molInversionFlag, ival);
2380
2.73k
      }
2381
70.4k
    } else if (prop == "ATTCHPT") {
2382
11.8k
      if (val != "0") {
2383
11.4k
        auto ival = FileParserUtils::toInt(val);
2384
11.4k
        if (atom->hasProp(common_properties::molAttachPoint)) {
2385
10.7k
          errout << "Multiple ATTCHPT values for atom " << atom->getIdx() + 1
2386
10.7k
                 << " on line " << line;
2387
10.7k
          if (strictParsing) {
2388
2
            throw FileParseException(errout.str());
2389
10.7k
          } else {
2390
10.7k
            BOOST_LOG(rdWarningLog) << errout.str() << std::endl;
2391
10.7k
            errout.str(std::string());
2392
10.7k
          }
2393
10.7k
        } else {
2394
654
          atom->setProp(common_properties::molAttachPoint, ival);
2395
654
        }
2396
11.4k
      }
2397
58.6k
    } else if (prop == "ATTCHORD") {
2398
      // there are two kinds of ATTCHORD
2399
      // one is for template instances and looks like this: ATTCHORD=(4 1 Al 3
2400
      // Br)
2401
2402
1.34k
      if (val.substr(0, 1) == "(") {
2403
        // this is a template instance
2404
2405
13
        val = val.substr(1, val.size() - 2);
2406
13
        std::vector<std::string> splitToken;
2407
13
        boost::split(splitToken, val, boost::is_any_of(" \t"));
2408
2409
13
        unsigned int itemCount = 0;
2410
13
        if (splitToken.size() > 0) {
2411
13
          itemCount = FileParserUtils::toInt(splitToken[0]);
2412
13
        }
2413
2414
13
        if (itemCount == 0 || itemCount % 2 != 0 ||
2415
8
            splitToken.size() != itemCount + 1) {
2416
8
          errout << "Invalid ATTCHORD value: '" << val << "' for atom "
2417
8
                 << atom->getIdx() + 1 << " on line " << line << std::endl;
2418
8
          throw FileParseException(errout.str());
2419
8
        }
2420
5
        std::vector<std::pair<unsigned int, std::string>> attchOrds;
2421
5
        for (unsigned int i = 1; i < itemCount; i += 2) {
2422
0
          unsigned int idx = FileParserUtils::toInt(splitToken[i]);
2423
          // check for uniqueness
2424
0
          for (const auto &[aidx, lbl] : attchOrds) {
2425
0
            if (idx == aidx + 1 || splitToken[i + 1] == lbl) {
2426
0
              errout << "Invalid ATTCHORD value: '" << val << "' for atom "
2427
0
                     << atom->getIdx() + 1 << " on line " << line << std::endl;
2428
2429
0
              throw FileParseException(errout.str());
2430
0
            }
2431
0
          }
2432
0
          attchOrds.emplace_back(idx - 1, splitToken[i + 1]);
2433
0
        }
2434
5
        atom->setProp(common_properties::molAttachOrderTemplate, attchOrds);
2435
1.32k
      } else {
2436
        // this is a normal ATTCHORD
2437
1.32k
        auto ival = FileParserUtils::toInt(val);
2438
1.32k
        atom->setProp(common_properties::molAttachOrder, ival);
2439
1.32k
      }
2440
57.3k
    } else if (prop == "CLASS") {
2441
1.24k
      atom->setProp(common_properties::molAtomClass, std::string(val));
2442
56.0k
    } else if (prop == "SEQID") {
2443
1.25k
      if (val != "0") {
2444
1.06k
        auto ival = FileParserUtils::toInt(val);
2445
1.06k
        atom->setProp(common_properties::molAtomSeqId, ival);
2446
1.06k
      }
2447
54.8k
    } else if (prop == "SEQNAME") {
2448
33
      if (val != "") {
2449
5
        atom->setProp(common_properties::molAtomSeqName, std::string(val));
2450
5
      }
2451
33
    }
2452
116k
    ++token;
2453
116k
  }
2454
15.2k
}
2455
2456
void tokenizeV3000Line(std::string_view line,
2457
16.8k
                       std::vector<std::string_view> &tokens) {
2458
16.8k
  tokens.clear();
2459
16.8k
  bool inQuotes = false;
2460
16.8k
  unsigned int parenDepth = 0;
2461
16.8k
  unsigned int start = 0;
2462
16.8k
  unsigned int pos = 0;
2463
33.3M
  while (pos < line.size()) {
2464
33.3M
    if (line[pos] == ' ' || line[pos] == '\t') {
2465
2.21M
      if (start == pos) {
2466
116k
        ++start;
2467
116k
        ++pos;
2468
2.10M
      } else if (!inQuotes && parenDepth == 0) {
2469
559k
        tokens.push_back(line.substr(start, pos - start));
2470
559k
        ++pos;
2471
559k
        start = pos;
2472
1.54M
      } else {
2473
1.54M
        ++pos;
2474
1.54M
      }
2475
31.1M
    } else if (line[pos] == ')' && parenDepth > 0) {
2476
15.0k
      --parenDepth;
2477
15.0k
      ++pos;
2478
31.1M
    } else if (line[pos] == '(' && !inQuotes) {
2479
355k
      ++parenDepth;
2480
355k
      ++pos;
2481
30.7M
    } else if (line[pos] == '"' && parenDepth == 0) {
2482
394k
      if (pos + 1 < line.size() && line[pos + 1] == '"') {
2483
8.59k
        pos += 2;
2484
386k
      } else if (inQuotes) {
2485
        // don't push on the quotes themselves
2486
190k
        tokens.push_back(line.substr(start + 1, pos - start - 1));
2487
190k
        ++pos;
2488
190k
        start = pos;
2489
190k
        inQuotes = false;
2490
195k
      } else {
2491
195k
        ++pos;
2492
195k
        inQuotes = true;
2493
195k
      }
2494
30.3M
    } else {
2495
30.3M
      ++pos;
2496
30.3M
    }
2497
33.3M
  }
2498
16.8k
  if (start != pos) {
2499
16.3k
    tokens.push_back(line.substr(start, line.size() - start));
2500
16.3k
  }
2501
#if 0
2502
      std::cerr<<"tokens: ";
2503
      std::copy(tokens.begin(),tokens.end(),std::ostream_iterator<std::string>(std::cerr,"|"));
2504
      std::cerr<<std::endl;
2505
#endif
2506
16.8k
}
2507
2508
bool calculate3dFlag(const RWMol &mol, const Conformer &conf,
2509
17.1k
                     bool chiralityPossible) {
2510
17.1k
  int marked3d = 0;
2511
17.1k
  if (mol.getPropIfPresent(common_properties::_3DConf, marked3d)) {
2512
393
    mol.clearProp(common_properties::_3DConf);
2513
393
  }
2514
2515
17.1k
  bool nonzeroZ = hasNonZeroZCoords(conf);
2516
2517
17.1k
  if (!nonzeroZ && marked3d == 1) {
2518
    // If we have no Z coordinates, mark the structure 2D if we see any
2519
    // 2D stereo markers, or stay as 3D if
2520
38
    if (chiralityPossible) {
2521
32
      BOOST_LOG(rdWarningLog)
2522
0
          << "Warning: molecule is tagged as 3D, but all Z coords are zero and 2D stereo "
2523
0
             "markers have been found, marking the mol as 2D."
2524
0
          << std::endl;
2525
32
      return false;
2526
32
    }
2527
6
    return true;
2528
17.0k
  } else if (marked3d == 0 && nonzeroZ) {
2529
3.14k
    BOOST_LOG(rdWarningLog)
2530
0
        << "Warning: molecule is tagged as 2D, but at least one Z coordinate is not zero. "
2531
0
           "Marking the mol as 3D."
2532
0
        << std::endl;
2533
3.14k
    return true;
2534
3.14k
  }
2535
2536
13.9k
  return nonzeroZ;
2537
17.1k
}
2538
2539
void ParseV3000AtomBlock(std::istream *inStream, unsigned int &line,
2540
                         unsigned int nAtoms, RWMol *mol, Conformer *conf,
2541
1.77k
                         bool strictParsing, bool expectMacroAtoms) {
2542
1.77k
  PRECONDITION(inStream, "bad stream");
2543
1.77k
  PRECONDITION(nAtoms > 0, "bad atom count");
2544
1.77k
  PRECONDITION(mol, "bad molecule");
2545
1.77k
  PRECONDITION(conf, "bad conformer");
2546
1.77k
  std::vector<std::string> splitLine;
2547
2548
1.77k
  auto inl = getV3000Line(inStream, line);
2549
1.77k
  std::string_view tempStr = inl;
2550
1.77k
  if (tempStr.length() < 10 || tempStr.substr(0, 10) != "BEGIN ATOM") {
2551
13
    std::ostringstream errout;
2552
13
    errout << "BEGIN ATOM line not found on line " << line;
2553
13
    throw FileParseException(errout.str());
2554
13
  }
2555
17.5k
  for (unsigned int i = 0; i < nAtoms; ++i) {
2556
16.2k
    inl = getV3000Line(inStream, line);
2557
16.2k
    tempStr = inl;
2558
16.2k
    auto trimmed = FileParserUtils::strip(tempStr);
2559
2560
16.2k
    std::vector<std::string_view> tokens;
2561
16.2k
    std::vector<std::string_view>::iterator token;
2562
2563
16.2k
    tokenizeV3000Line(trimmed, tokens);
2564
16.2k
    token = tokens.begin();
2565
2566
16.2k
    if (token == tokens.end()) {
2567
6
      std::ostringstream errout;
2568
6
      errout << "Bad atom line : '" << tempStr << "' on line" << line;
2569
6
      throw FileParseException(errout.str());
2570
6
    }
2571
16.2k
    unsigned int molIdx = 0;
2572
16.2k
    std::from_chars(token->data(), token->data() + token->size(), molIdx);
2573
2574
    // start with the symbol:
2575
16.2k
    ++token;
2576
16.2k
    if (token == tokens.end()) {
2577
46
      std::ostringstream errout;
2578
46
      errout << "Bad atom line : '" << tempStr << "' on line " << line;
2579
46
      throw FileParseException(errout.str());
2580
46
    }
2581
2582
    // before we parse the symbol, we need to know if the atom has a class attr.
2583
    // if it does, it is a macro atom reference, and we do not need to parse the
2584
    // symbol.  (the single letter codes can be the same as element sysmbols or
2585
    // special query names)
2586
2587
16.1k
    auto isMacroAtom = false;
2588
16.1k
    if (expectMacroAtoms) {
2589
0
      auto lookAheadToken = token + 1;
2590
0
      while (lookAheadToken != tokens.end()) {
2591
0
        std::string prop;
2592
0
        std::string_view val;
2593
0
        if (splitAssignToken(*lookAheadToken, prop, val) && prop == "CLASS") {
2594
0
          isMacroAtom = true;
2595
0
          break;
2596
0
        }
2597
0
        ++lookAheadToken;
2598
0
      }
2599
0
    }
2600
2601
16.1k
    Atom *atom = nullptr;
2602
16.1k
    if (isMacroAtom) {
2603
0
      atom = new Atom(0);
2604
0
      atom->setAtomicNum(0);
2605
0
      std::string tcopy(*token);
2606
0
      atom->setProp(common_properties::dummyLabel, tcopy);
2607
16.1k
    } else {
2608
16.1k
      atom = ParseV3000AtomSymbol(*token, line, strictParsing);
2609
16.1k
    }
2610
2611
    // now the position;
2612
16.1k
    RDGeom::Point3D pos;
2613
16.1k
    ++token;
2614
16.1k
    if (token == tokens.end()) {
2615
165
      delete atom;
2616
165
      std::ostringstream errout;
2617
165
      errout << "Bad atom line : '" << tempStr << "' on line " << line;
2618
165
      throw FileParseException(errout.str());
2619
165
    }
2620
2621
16.0k
    pos.x = atof(std::string(*token).c_str());
2622
16.0k
    ++token;
2623
16.0k
    if (token == tokens.end()) {
2624
85
      delete atom;
2625
85
      std::ostringstream errout;
2626
85
      errout << "Bad atom line : '" << tempStr << "' on line " << line;
2627
85
      throw FileParseException(errout.str());
2628
85
    }
2629
15.9k
    pos.y = atof(std::string(*token).c_str());
2630
15.9k
    ++token;
2631
15.9k
    if (token == tokens.end()) {
2632
88
      delete atom;
2633
88
      std::ostringstream errout;
2634
88
      errout << "Bad atom line : '" << tempStr << "' on line " << line;
2635
88
      throw FileParseException(errout.str());
2636
88
    }
2637
15.8k
    pos.z = atof(std::string(*token).c_str());
2638
    // the map number:
2639
15.8k
    ++token;
2640
15.8k
    if (token == tokens.end()) {
2641
70
      delete atom;
2642
70
      std::ostringstream errout;
2643
70
      errout << "Bad atom line : '" << tempStr << "' on line " << line;
2644
70
      throw FileParseException(errout.str());
2645
70
    }
2646
15.7k
    int mapNum = atoi(std::string(*token).c_str());
2647
15.7k
    if (mapNum > 0) {
2648
821
      atom->setProp(common_properties::molAtomMapNumber, mapNum);
2649
821
    }
2650
15.7k
    ++token;
2651
2652
15.7k
    unsigned int aid = mol->addAtom(atom, false, true);
2653
2654
    // additional properties this may change the atom,
2655
    // so be careful with it:
2656
15.7k
    ParseV3000AtomProps(mol, atom, token, tokens, line, strictParsing);
2657
2658
15.7k
    mol->setAtomBookmark(atom, molIdx);
2659
15.7k
    conf->setAtomPos(aid, pos);
2660
15.7k
  }
2661
1.30k
  inl = getV3000Line(inStream, line);
2662
1.30k
  tempStr = inl;
2663
1.30k
  if (tempStr.length() < 8 || tempStr.substr(0, 8) != "END ATOM") {
2664
14
    std::ostringstream errout;
2665
14
    errout << "END ATOM line not found on line " << line;
2666
14
    throw FileParseException(errout.str());
2667
14
  }
2668
1.30k
}
2669
2670
void ParseV3000BondBlock(std::istream *inStream, unsigned int &line,
2671
                         unsigned int nBonds, RWMol *mol,
2672
1.00k
                         bool &chiralityPossible) {
2673
1.00k
  PRECONDITION(inStream, "bad stream");
2674
1.00k
  PRECONDITION(nBonds > 0, "bad bond count");
2675
1.00k
  PRECONDITION(mol, "bad molecule");
2676
2677
1.00k
  auto inl = getV3000Line(inStream, line);
2678
1.00k
  std::string_view tempStr = inl;
2679
1.00k
  if (tempStr.length() < 10 || tempStr.substr(0, 10) != "BEGIN BOND") {
2680
18
    throw FileParseException("BEGIN BOND line not found");
2681
18
  }
2682
1.31k
  for (unsigned int i = 0; i < nBonds; ++i) {
2683
969
    inl = getV3000Line(inStream, line);
2684
969
    tempStr = inl;
2685
969
    tempStr = FileParserUtils::strip(tempStr);
2686
969
    std::vector<std::string_view> splitLine;
2687
969
    tokenizeV3000Line(tempStr, splitLine);
2688
969
    if (splitLine.size() < 4) {
2689
49
      std::ostringstream errout;
2690
49
      errout << "bond line " << line << " is too short";
2691
49
      throw FileParseException(errout.str());
2692
49
    }
2693
920
    Bond *bond;
2694
920
    unsigned int bondIdx = 0;
2695
920
    std::from_chars(splitLine[0].data(),
2696
920
                    splitLine[0].data() + splitLine[0].size(), bondIdx);
2697
920
    unsigned int bType = 0;
2698
920
    std::from_chars(splitLine[1].data(),
2699
920
                    splitLine[1].data() + splitLine[1].size(), bType);
2700
920
    unsigned int a1Idx = 0;
2701
920
    std::from_chars(splitLine[2].data(),
2702
920
                    splitLine[2].data() + splitLine[2].size(), a1Idx);
2703
920
    unsigned int a2Idx = 0;
2704
920
    std::from_chars(splitLine[3].data(),
2705
920
                    splitLine[3].data() + splitLine[3].size(), a2Idx);
2706
2707
920
    switch (bType) {
2708
19
      case 1:
2709
19
        bond = new Bond(Bond::SINGLE);
2710
19
        break;
2711
36
      case 2:
2712
36
        bond = new Bond(Bond::DOUBLE);
2713
36
        break;
2714
7
      case 3:
2715
7
        bond = new Bond(Bond::TRIPLE);
2716
7
        break;
2717
31
      case 4:
2718
31
        bond = new Bond(Bond::AROMATIC);
2719
31
        bond->setIsAromatic(true);
2720
31
        break;
2721
1
      case 9:
2722
1
        bond = new Bond(Bond::DATIVE);
2723
1
        break;
2724
3
      case 10:
2725
3
        bond = new Bond(Bond::HYDROGEN);
2726
3
        break;
2727
686
      case 0:
2728
686
        bond = new Bond(Bond::UNSPECIFIED);
2729
686
        BOOST_LOG(rdWarningLog)
2730
0
            << "bond with order 0 found on line " << line
2731
0
            << ". This is not part of the MDL specification." << std::endl;
2732
686
        break;
2733
133
      default:
2734
        // it's a query bond of some type
2735
133
        bond = new QueryBond;
2736
133
        if (bType == 8) {
2737
1
          BOND_NULL_QUERY *q;
2738
1
          q = makeBondNullQuery();
2739
1
          bond->setQuery(q);
2740
132
        } else if (bType == 5) {
2741
3
          bond->setQuery(makeSingleOrDoubleBondQuery());
2742
3
          bond->setProp(common_properties::_MolFileBondQuery, 1);
2743
129
        } else if (bType == 6) {
2744
6
          bond->setQuery(makeSingleOrAromaticBondQuery());
2745
6
          bond->setProp(common_properties::_MolFileBondQuery, 1);
2746
123
        } else if (bType == 7) {
2747
3
          bond->setQuery(makeDoubleOrAromaticBondQuery());
2748
3
          bond->setProp(common_properties::_MolFileBondQuery, 1);
2749
120
        } else {
2750
120
          BOND_NULL_QUERY *q;
2751
120
          q = makeBondNullQuery();
2752
120
          bond->setQuery(q);
2753
120
          BOOST_LOG(rdWarningLog)
2754
0
              << "unrecognized query bond type, " << bType << ", found on line "
2755
0
              << line << ". Using an \"any\" query." << std::endl;
2756
120
        }
2757
133
        break;
2758
920
    }
2759
916
    bond->setProp(common_properties::_MolFileBondType, bType);
2760
2761
    // additional bond properties:
2762
916
    unsigned int lPos = 4;
2763
916
    std::ostringstream errout;
2764
89.4k
    while (lPos < splitLine.size()) {
2765
89.1k
      std::string prop;
2766
89.1k
      std::string_view val;
2767
89.1k
      if (!splitAssignToken(splitLine[lPos], prop, val)) {
2768
476
        errout << "bad bond property '" << splitLine[lPos] << "' on line "
2769
476
               << line;
2770
476
        throw FileParseException(errout.str());
2771
476
      }
2772
88.6k
      if (prop == "CFG") {
2773
17.8k
        unsigned int cfg = 0;
2774
17.8k
        std::from_chars(val.data(), val.data() + val.size(), cfg);
2775
17.8k
        switch (cfg) {
2776
13.4k
          case 0:
2777
13.4k
            break;
2778
1.67k
          case 1:
2779
1.67k
            bond->setBondDir(Bond::BEGINWEDGE);
2780
1.67k
            chiralityPossible = true;
2781
1.67k
            break;
2782
2.29k
          case 2:
2783
2.29k
            if (bType == 1) {
2784
49
              bond->setBondDir(Bond::UNKNOWN);
2785
2.25k
            } else if (bType == 2) {
2786
249
              bond->setBondDir(Bond::EITHERDOUBLE);
2787
249
              bond->setStereo(Bond::STEREOANY);
2788
249
            }
2789
2.29k
            break;
2790
292
          case 3:
2791
292
            bond->setBondDir(Bond::BEGINDASH);
2792
292
            chiralityPossible = true;
2793
292
            break;
2794
95
          default:
2795
95
            errout << "bad bond CFG " << val << "' on line " << line;
2796
95
            throw FileParseException(errout.str());
2797
17.8k
        }
2798
17.7k
        bond->setProp(common_properties::_MolFileBondCfg, cfg);
2799
70.8k
      } else if (prop == "TOPO") {
2800
1.38k
        if (val != "0") {
2801
1.06k
          if (!bond->hasQuery()) {
2802
40
            auto *qBond = new QueryBond(*bond);
2803
40
            delete bond;
2804
40
            bond = qBond;
2805
40
          }
2806
1.06k
          BOND_EQUALS_QUERY *q = makeBondIsInRingQuery();
2807
1.06k
          if (val == "1") {
2808
            // nothing
2809
1.04k
          } else if (val == "2") {
2810
1.02k
            q->setNegation(true);
2811
1.02k
          } else {
2812
15
            errout << "bad bond TOPO " << val << "' on line " << line;
2813
15
            throw FileParseException(errout.str());
2814
15
          }
2815
1.05k
          bond->expandQuery(q);
2816
1.05k
        }
2817
69.4k
      } else if (prop == "RXCTR") {
2818
11.6k
        int reactStatus = FileParserUtils::toInt(val);
2819
11.6k
        bond->setProp(common_properties::molReactStatus, reactStatus);
2820
57.8k
      } else if (prop == "STBOX") {
2821
12.9k
        bond->setProp(common_properties::molStereoCare, std::string(val));
2822
44.8k
      } else if (prop == "ENDPTS") {
2823
3.29k
        bond->setProp(common_properties::_MolFileBondEndPts, std::string(val));
2824
41.5k
      } else if (prop == "ATTACH") {
2825
774
        bond->setProp(common_properties::_MolFileBondAttach, std::string(val));
2826
774
      }
2827
88.5k
      ++lPos;
2828
88.5k
    }
2829
2830
330
    bond->setBeginAtomIdx(mol->getAtomWithBookmark(a1Idx)->getIdx());
2831
330
    bond->setEndAtomIdx(mol->getAtomWithBookmark(a2Idx)->getIdx());
2832
330
    mol->addBond(bond, true);
2833
330
    mol->setBondBookmark(bond, bondIdx);
2834
2835
    // set the stereoCare property on the bond if it's not set already and
2836
    // both the beginning and end atoms have it set:
2837
330
    int care1 = 0;
2838
330
    int care2 = 0;
2839
330
    if (!bond->hasProp(common_properties::molStereoCare) &&
2840
0
        mol->getAtomWithIdx(bond->getBeginAtomIdx())
2841
0
            ->getPropIfPresent(common_properties::molStereoCare, care1) &&
2842
0
        mol->getAtomWithIdx(bond->getEndAtomIdx())
2843
0
            ->getPropIfPresent(common_properties::molStereoCare, care2)) {
2844
0
      if (care1 == care2) {
2845
0
        bond->setProp(common_properties::molStereoCare, care1);
2846
0
      }
2847
0
    }
2848
330
  }
2849
344
  inl = getV3000Line(inStream, line);
2850
344
  tempStr = inl;
2851
344
  if (tempStr.length() < 8 || tempStr.substr(0, 8) != "END BOND") {
2852
0
    std::ostringstream errout;
2853
0
    errout << "END BOND line not found at line " << line;
2854
0
    throw FileParseException(errout.str());
2855
0
  }
2856
344
}
2857
// The documentation about MRV_COORDINATE_BOND_TYPE in
2858
// https://docs.chemaxon.com/display/docs/chemaxon-specific-information-in-mdl-mol-files.md
2859
// seems to be wrong: it says the only data field in this group contains the
2860
// index for the coordinate atom. But behavior in Marvin Sketch seems to
2861
// indicate that it references the bond index instead (see
2862
// https://github.com/rdkit/rdkit/issues/4473)
2863
2864
16
void processMrvCoordinateBond(RWMol &mol, const SubstanceGroup &sg) {
2865
16
  std::vector<std::string> dataFields;
2866
16
  if (sg.getPropIfPresent("DATAFIELDS", dataFields)) {
2867
16
    if (dataFields.empty()) {
2868
1
      BOOST_LOG(rdWarningLog)
2869
0
          << "ignoring MRV_COORDINATE_BOND_TYPE SGroup without data fields."
2870
0
          << std::endl;
2871
1
      return;
2872
1
    }
2873
2874
15
    auto coordinate_bond_idx =
2875
15
        FileParserUtils::toUnsigned(dataFields[0], true) - 1;
2876
2877
15
    if (dataFields.size() > 1) {
2878
2
      BOOST_LOG(rdWarningLog) << "ignoring extra data fields in "
2879
0
                                 "MRV_COORDINATE_BOND_TYPE SGroup for bond "
2880
0
                              << coordinate_bond_idx << '.' << std::endl;
2881
2
    }
2882
2883
15
    Bond *old_bond = nullptr;
2884
15
    try {
2885
15
      old_bond = mol.getBondWithIdx(coordinate_bond_idx);
2886
15
    } catch (const Invar::Invariant &) {
2887
5
      BOOST_LOG(rdWarningLog)
2888
0
          << "molecule does not contain a bond matching the "
2889
0
             "MRV_COORDINATE_BOND_TYPE SGroup for bond "
2890
0
          << coordinate_bond_idx << ", ignoring." << std::endl;
2891
5
      return;
2892
5
    }
2893
2894
5
    if (!old_bond || old_bond->getBondType() != Bond::BondType::UNSPECIFIED) {
2895
1
      BOOST_LOG(rdWarningLog)
2896
0
          << "MRV_COORDINATE_BOND_TYPE SGroup with value "
2897
0
          << coordinate_bond_idx
2898
0
          << " does not reference a query bond, ignoring." << std::endl;
2899
1
      return;
2900
1
    }
2901
2902
4
    Bond new_bond(Bond::BondType::DATIVE);
2903
4
    auto preserveProps = true;
2904
4
    auto keepSGroups = true;
2905
4
    mol.replaceBond(coordinate_bond_idx, &new_bond, preserveProps, keepSGroups);
2906
4
  }
2907
16
}
2908
2909
119
void processSMARTSQ(RWMol &mol, const SubstanceGroup &sg) {
2910
119
  std::string field;
2911
119
  if (sg.getPropIfPresent("QUERYOP", field) && field != "=") {
2912
36
    BOOST_LOG(rdWarningLog) << "unrecognized QUERYOP '" << field
2913
0
                            << "' for SMARTSQ. Query ignored." << std::endl;
2914
36
    return;
2915
36
  }
2916
83
  std::vector<std::string> dataFields;
2917
83
  if (!sg.getPropIfPresent("DATAFIELDS", dataFields) || dataFields.empty()) {
2918
5
    BOOST_LOG(rdWarningLog)
2919
0
        << "empty FIELDDATA for SMARTSQ. Query ignored." << std::endl;
2920
5
    return;
2921
5
  }
2922
78
  if (dataFields.size() > 1) {
2923
15
    BOOST_LOG(rdWarningLog)
2924
0
        << "multiple FIELDDATA values for SMARTSQ. Taking the first."
2925
0
        << std::endl;
2926
15
  }
2927
78
  const std::string &sma = dataFields[0];
2928
78
  if (sma.empty()) {
2929
2
    BOOST_LOG(rdWarningLog)
2930
0
        << "Skipping empty SMARTS value for SMARTSQ." << std::endl;
2931
2
    return;
2932
2
  }
2933
2934
4.58k
  for (auto aidx : sg.getAtoms()) {
2935
4.58k
    auto at = mol.getAtomWithIdx(aidx);
2936
2937
4.58k
    std::unique_ptr<RWMol> m;
2938
4.58k
    try {
2939
4.58k
      m.reset(SmartsToMol(sma));
2940
4.58k
    } catch (...) {
2941
      // Is this ever used?
2942
1
    }
2943
2944
4.58k
    if (!m || !m->getNumAtoms()) {
2945
3
      BOOST_LOG(rdWarningLog)
2946
0
          << "SMARTS for SMARTSQ '" << sma
2947
0
          << "' could not be parsed or has no atoms. Ignoring it." << std::endl;
2948
3
      return;
2949
3
    }
2950
2951
4.58k
    if (!at->hasQuery()) {
2952
73
      QueryAtom qAt(*at);
2953
73
      int oidx = at->getIdx();
2954
73
      mol.replaceAtom(oidx, &qAt);
2955
73
      at = mol.getAtomWithIdx(oidx);
2956
73
    }
2957
4.58k
    QueryAtom::QUERYATOM_QUERY *query = nullptr;
2958
4.58k
    if (m->getNumAtoms() == 1) {
2959
2.51k
      query = m->getAtomWithIdx(0)->getQuery()->copy();
2960
2.51k
    } else {
2961
2.06k
      query = new RecursiveStructureQuery(m.release());
2962
2.06k
    }
2963
4.58k
    at->setQuery(query);
2964
4.58k
    at->setProp(common_properties::MRV_SMA, sma);
2965
4.58k
    at->setProp(common_properties::_MolFileAtomQuery, 1);
2966
4.58k
  }
2967
76
}
2968
2969
185
void processMrvImplicitH(RWMol &mol, const SubstanceGroup &sg) {
2970
185
  std::vector<std::string> dataFields;
2971
185
  if (sg.getPropIfPresent("DATAFIELDS", dataFields)) {
2972
8.79k
    for (const auto &df : dataFields) {
2973
8.79k
      if (df.substr(0, 6) == "IMPL_H") {
2974
3.36k
        auto val = FileParserUtils::toInt(df.substr(6));
2975
28.5k
        for (auto atIdx : sg.getAtoms()) {
2976
28.5k
          if (atIdx < mol.getNumAtoms()) {
2977
            // if the atom has aromatic bonds to it, then set the explicit
2978
            // value, otherwise skip it.
2979
28.5k
            auto atom = mol.getAtomWithIdx(atIdx);
2980
28.5k
            bool hasAromaticBonds = false;
2981
28.5k
            for (auto bndI :
2982
58.7k
                 boost::make_iterator_range(mol.getAtomBonds(atom))) {
2983
58.7k
              auto bnd = (mol)[bndI];
2984
58.7k
              if (bnd->getIsAromatic() ||
2985
58.3k
                  bnd->getBondType() == Bond::AROMATIC) {
2986
368
                hasAromaticBonds = true;
2987
368
                break;
2988
368
              }
2989
58.7k
            }
2990
28.5k
            if (hasAromaticBonds) {
2991
368
              atom->setNumExplicitHs(val);
2992
28.2k
            } else {
2993
28.2k
              BOOST_LOG(rdWarningLog)
2994
0
                  << "MRV_IMPLICIT_H SGroup on atom without aromatic "
2995
0
                     "bonds, "
2996
0
                  << atIdx << ", ignored." << std::endl;
2997
28.2k
            }
2998
28.5k
          } else {
2999
0
            BOOST_LOG(rdWarningLog)
3000
0
                << "bad atom index, " << atIdx
3001
0
                << ", found in MRV_IMPLICIT_H SGroup. Ignoring it."
3002
0
                << std::endl;
3003
0
          }
3004
28.5k
        }
3005
3.36k
      }
3006
8.79k
    }
3007
185
  }
3008
185
}
3009
3010
11
void processZBO(RWMol &mol, const SubstanceGroup &sg) {
3011
84
  for (auto bidx : sg.getBonds()) {
3012
84
    auto bond = mol.getBondWithIdx(bidx);
3013
84
    bond->setBondType(Bond::BondType::ZERO);
3014
84
  }
3015
11
}
3016
3017
193
void processZCH(RWMol &mol, const SubstanceGroup &sg) {
3018
193
  RDUNUSED_PARAM(mol);
3019
193
  std::vector<std::string> dataFields;
3020
193
  if (sg.getPropIfPresent("DATAFIELDS", dataFields)) {
3021
193
    if (dataFields.empty()) {
3022
3
      BOOST_LOG(rdWarningLog)
3023
0
          << "ignoring ZCHG SGroup without data fields." << std::endl;
3024
3
      return;
3025
3
    }
3026
6.41k
    for (const auto &df : dataFields) {
3027
6.41k
      std::string trimmed = boost::trim_copy(df);
3028
6.41k
      std::vector<std::string> splitLine;
3029
6.41k
      boost::split(splitLine, trimmed, boost::is_any_of(";"),
3030
6.41k
                   boost::token_compress_off);
3031
6.41k
      const auto &aids = sg.getAtoms();
3032
6.41k
      if (splitLine.size() < aids.size()) {
3033
419
        BOOST_LOG(rdWarningLog)
3034
0
            << "DATAFIELDS in ZCH SGroup is shorter than the number of atoms in the SGroup. Ignoring it."
3035
0
            << std::endl;
3036
419
        continue;
3037
419
      }
3038
16.4k
      for (auto i = 0u; i < aids.size(); ++i) {
3039
10.4k
        auto aid = aids[i];
3040
10.4k
        auto atom = mol.getAtomWithIdx(aid);
3041
10.4k
        auto val = 0;
3042
10.4k
        if (!splitLine[i].empty()) {
3043
10.0k
          val = FileParserUtils::toInt(splitLine[i]);
3044
10.0k
        }
3045
10.4k
        atom->setFormalCharge(val);
3046
10.4k
      }
3047
5.99k
    }
3048
190
  }
3049
193
}
3050
67
void processHYD(RWMol &mol, const SubstanceGroup &sg) {
3051
67
  std::vector<std::string> dataFields;
3052
67
  if (sg.getPropIfPresent("DATAFIELDS", dataFields)) {
3053
67
    if (dataFields.empty()) {
3054
1
      BOOST_LOG(rdWarningLog)
3055
0
          << "ignoring HYD SGroup without data fields." << std::endl;
3056
1
      return;
3057
1
    }
3058
3.37k
    for (const auto &df : dataFields) {
3059
3.37k
      std::string trimmed = boost::trim_copy(df);
3060
3.37k
      std::vector<std::string> splitLine;
3061
3.37k
      boost::split(splitLine, trimmed, boost::is_any_of(";"),
3062
3.37k
                   boost::token_compress_off);
3063
3.37k
      const auto &aids = sg.getAtoms();
3064
3.37k
      if (splitLine.size() < aids.size()) {
3065
1.62k
        BOOST_LOG(rdWarningLog)
3066
0
            << "DATAFIELDS in HYD SGroup is shorter than the number of atoms in the SGroup. Ignoring it."
3067
0
            << std::endl;
3068
1.62k
        continue;
3069
1.62k
      }
3070
3.17k
      for (auto i = 0u; i < aids.size(); ++i) {
3071
1.41k
        auto aid = aids[i];
3072
1.41k
        auto atom = mol.getAtomWithIdx(aid);
3073
1.41k
        auto val = 0;
3074
1.41k
        if (!splitLine[i].empty()) {
3075
1.18k
          val = FileParserUtils::toInt(splitLine[i]);
3076
1.18k
        }
3077
1.41k
        atom->setProp("_ZBO_H", true);
3078
1.41k
        atom->setNumExplicitHs(val);
3079
1.41k
      }
3080
1.75k
    }
3081
66
  }
3082
67
}
3083
3084
// process (and remove) SGroups which modify the structure
3085
// and which we can unambiguously apply
3086
4.88k
void processSGroups(RWMol *mol) {
3087
4.88k
  std::vector<unsigned int> sgsToRemove;
3088
4.88k
  unsigned int sgIdx = 0;
3089
4.88k
  for (auto &sg : getSubstanceGroups(*mol)) {
3090
3.09k
    if (sg.getProp<std::string>("TYPE") == "DAT") {
3091
964
      std::string field;
3092
964
      if (sg.getPropIfPresent("FIELDNAME", field)) {
3093
727
        if (field == "MRV_COORDINATE_BOND_TYPE") {
3094
          // V2000 support for coordinate bonds
3095
16
          processMrvCoordinateBond(*mol, sg);
3096
16
          sgsToRemove.push_back(sgIdx);
3097
16
          continue;
3098
711
        } else if (field == "MRV_IMPLICIT_H") {
3099
          // CXN extension to specify implicit Hs, used for aromatic rings
3100
185
          processMrvImplicitH(*mol, sg);
3101
185
          sgsToRemove.push_back(sgIdx);
3102
185
          continue;
3103
526
        } else if (field == "ZBO") {
3104
          // RDKit extension for zero-order bonds
3105
11
          processZBO(*mol, sg);
3106
11
          sgsToRemove.push_back(sgIdx);
3107
11
          continue;
3108
515
        } else if (field == "ZCH") {
3109
          // RDKit extension for charge on atoms involved in zero-order bonds
3110
193
          processZCH(*mol, sg);
3111
193
          sgsToRemove.push_back(sgIdx);
3112
193
          continue;
3113
322
        } else if (field == "HYD") {
3114
          // RDKit extension for hydrogen-count on atoms involved in
3115
          // zero-order bonds
3116
67
          processHYD(*mol, sg);
3117
67
          sgsToRemove.push_back(sgIdx);
3118
67
          continue;
3119
67
        }
3120
727
      }
3121
492
      if (sg.getPropIfPresent("QUERYTYPE", field) &&
3122
181
          (field == "SMARTSQ" || field == "SQ")) {
3123
119
        processSMARTSQ(*mol, sg);
3124
119
        sgsToRemove.push_back(sgIdx);
3125
119
        continue;
3126
119
      }
3127
492
    }
3128
2.50k
    ++sgIdx;
3129
2.50k
  }
3130
  // now remove the S groups we processed, we saved indices so do this in
3131
  // backwards
3132
4.88k
  auto &sgs = getSubstanceGroups(*mol);
3133
5.39k
  for (auto it = sgsToRemove.rbegin(); it != sgsToRemove.rend(); ++it) {
3134
508
    sgs.erase(sgs.begin() + *it);
3135
508
  }
3136
4.88k
}
3137
3138
4.88k
void ProcessMolProps(RWMol *mol) {
3139
4.88k
  PRECONDITION(mol, "no molecule");
3140
  // we have to loop the ugly way because we may need to actually replace an
3141
  // atom
3142
115k
  for (unsigned int aidx = 0; aidx < mol->getNumAtoms(); ++aidx) {
3143
110k
    auto atom = mol->getAtomWithIdx(aidx);
3144
110k
    int ival = 0;
3145
110k
    if (atom->getPropIfPresent(common_properties::molSubstCount, ival) &&
3146
0
        ival != 0) {
3147
0
      if (!atom->hasQuery()) {
3148
0
        atom = QueryOps::replaceAtomWithQueryAtom(mol, atom);
3149
0
      }
3150
0
      bool gtQuery = false;
3151
0
      if (ival == -1) {
3152
0
        ival = 0;
3153
0
      } else if (ival == -2) {
3154
        // as drawn
3155
0
        ival = atom->getDegree();
3156
0
      } else if (ival >= 6) {
3157
        // 6 or more
3158
0
        gtQuery = true;
3159
0
      }
3160
0
      if (!gtQuery) {
3161
0
        atom->expandQuery(makeAtomExplicitDegreeQuery(ival));
3162
0
      } else {
3163
        // create a temp query the normal way so that we can be sure to get
3164
        // the description right
3165
0
        std::unique_ptr<ATOM_EQUALS_QUERY> tmp{
3166
0
            makeAtomExplicitDegreeQuery(ival)};
3167
0
        atom->expandQuery(makeAtomSimpleQuery<ATOM_LESSEQUAL_QUERY>(
3168
0
            ival, tmp->getDataFunc(),
3169
0
            std::string("less_") + tmp->getDescription()));
3170
0
      }
3171
0
    }
3172
110k
    if (atom->getPropIfPresent(common_properties::molTotValence, ival) &&
3173
3.54k
        ival != 0 && !atom->hasProp("_ZBO_H")) {
3174
3.54k
      atom->setNoImplicit(true);
3175
3.54k
      if (ival == 15     // V2000
3176
3.53k
          || ival == -1  // v3000
3177
3.54k
      ) {
3178
23
        atom->setNumExplicitHs(0);
3179
3.52k
      } else {
3180
3.52k
        if (static_cast<int>(atom->getValence(Atom::ValenceType::EXPLICIT)) >
3181
3.52k
            ival) {
3182
470
          BOOST_LOG(rdWarningLog)
3183
0
              << "atom " << atom->getIdx() << " has specified valence (" << ival
3184
0
              << ") smaller than the drawn valence "
3185
0
              << atom->getValence(Atom::ValenceType::EXPLICIT) << "."
3186
0
              << std::endl;
3187
470
          atom->setNumExplicitHs(0);
3188
3.05k
        } else {
3189
3.05k
          atom->setNumExplicitHs(ival -
3190
3.05k
                                 atom->getValence(Atom::ValenceType::EXPLICIT));
3191
3.05k
        }
3192
3.52k
      }
3193
3.54k
    }
3194
110k
    atom->clearProp(common_properties::molTotValence);
3195
110k
  }
3196
4.88k
  processSGroups(mol);
3197
4.88k
}
3198
3199
}  // namespace
3200
namespace FileParserUtils {
3201
bool ParseV3000CTAB(std::istream *inStream, unsigned int &line, RWMol *mol,
3202
                    Conformer *&conf, bool &chiralityPossible,
3203
                    unsigned int &nAtoms, unsigned int &nBonds,
3204
                    bool strictParsing, bool expectMEND,
3205
4.74k
                    bool expectMacroAtoms) {
3206
4.74k
  PRECONDITION(inStream, "bad stream");
3207
4.74k
  PRECONDITION(mol, "bad molecule");
3208
3209
4.74k
  std::string tempStr;
3210
4.74k
  std::vector<std::string> splitLine;
3211
3212
4.74k
  bool fileComplete = false;
3213
3214
4.74k
  tempStr = getV3000Line(inStream, line);
3215
4.74k
  boost::to_upper(tempStr);
3216
4.74k
  if (tempStr.length() < 10 || tempStr.substr(0, 10) != "BEGIN CTAB") {
3217
55
    std::ostringstream errout;
3218
55
    errout << "BEGIN CTAB line not found on line " << line;
3219
55
    throw FileParseException(errout.str());
3220
55
  }
3221
3222
4.68k
  tempStr = getV3000Line(inStream, line);
3223
4.68k
  boost::to_upper(tempStr);
3224
4.68k
  if (tempStr.size() < 8 || tempStr.substr(0, 7) != "COUNTS ") {
3225
16
    std::ostringstream errout;
3226
16
    errout << "Bad counts line : '" << tempStr << "' on line " << line;
3227
16
    throw FileParseException(errout.str());
3228
16
  }
3229
4.67k
  std::string trimmed =
3230
4.67k
      boost::trim_copy(tempStr.substr(7, tempStr.length() - 7));
3231
4.67k
  boost::split(splitLine, trimmed, boost::is_any_of(" \t"),
3232
4.67k
               boost::token_compress_on);
3233
4.67k
  if (splitLine.size() < 2) {
3234
6
    std::ostringstream errout;
3235
6
    errout << "Bad counts line : '" << tempStr << "' on line " << line;
3236
6
    throw FileParseException(errout.str());
3237
6
  }
3238
3239
4.66k
  nAtoms = FileParserUtils::toUnsigned(splitLine[0]);
3240
4.66k
  nBonds = FileParserUtils::toUnsigned(splitLine[1]);
3241
4.66k
  conf = new Conformer(nAtoms);
3242
3243
4.66k
  unsigned int nSgroups = 0, n3DConstraints = 0, chiralFlag = 0;
3244
3245
4.66k
  if (splitLine.size() > 2) {
3246
2.17k
    nSgroups = FileParserUtils::toUnsigned(splitLine[2]);
3247
2.17k
  }
3248
4.66k
  if (splitLine.size() > 3) {
3249
1.21k
    n3DConstraints = FileParserUtils::toUnsigned(splitLine[3]);
3250
1.21k
  }
3251
4.66k
  if (splitLine.size() > 4) {
3252
185
    chiralFlag = FileParserUtils::toUnsigned(splitLine[4]);
3253
185
  }
3254
3255
4.66k
  mol->setProp(common_properties::_MolFileChiralFlag, chiralFlag);
3256
3257
4.66k
  if (nAtoms) {
3258
1.77k
    ParseV3000AtomBlock(inStream, line, nAtoms, mol, conf, strictParsing,
3259
1.77k
                        expectMacroAtoms);
3260
1.77k
  }
3261
4.66k
  if (nBonds) {
3262
1.00k
    ParseV3000BondBlock(inStream, line, nBonds, mol, chiralityPossible);
3263
1.00k
  }
3264
3265
4.66k
  tempStr = getV3000Line(inStream, line);
3266
  // do link nodes:
3267
4.66k
  boost::to_upper(tempStr);
3268
6.44k
  while (tempStr.length() > 8 && tempStr.substr(0, 8) == "LINKNODE") {
3269
1.77k
    boost::to_upper(tempStr);
3270
    // if the line has nothing on it we just ignore it
3271
1.77k
    if (tempStr.size() > 9) {
3272
1.72k
      std::string existing = "";
3273
1.72k
      if (mol->getPropIfPresent(common_properties::molFileLinkNodes,
3274
1.72k
                                existing)) {
3275
1.65k
        existing += "|";
3276
1.65k
      }
3277
1.72k
      existing += tempStr.substr(9);  // skip the "LINKNODE "
3278
1.72k
      mol->setProp(common_properties::molFileLinkNodes, existing);
3279
1.72k
    }
3280
1.77k
    tempStr = getV3000Line(inStream, line);
3281
1.77k
  }
3282
3283
4.66k
  bool sgroupFound = false;
3284
4.66k
  bool obj3dFound = false;
3285
4.66k
  boost::to_upper(tempStr);
3286
13.0k
  while (tempStr.length() > 5 && tempStr.substr(0, 5) == "BEGIN") {
3287
8.41k
    if (tempStr.length() >= 12 && tempStr.substr(0, 12) == "BEGIN SGROUP") {
3288
1.43k
      if (sgroupFound) {
3289
6
        std::ostringstream errout;
3290
6
        errout << "BEGIN SGROUP found more than once on line " << line;
3291
6
        throw FileParseException(errout.str());
3292
3293
1.42k
      } else if (!nSgroups) {
3294
58
        std::ostringstream errout;
3295
58
        errout << "BEGIN SGROUP  found but Sgroups NOT expected on line "
3296
58
               << line;
3297
58
        if (strictParsing) {
3298
1
          throw FileParseException(errout.str());
3299
57
        } else {
3300
57
          BOOST_LOG(rdWarningLog) << errout.str() << std::endl;
3301
          // Prepare to read a lot of sgroups
3302
57
          nSgroups = std::numeric_limits<unsigned int>::max();
3303
57
        }
3304
58
      }
3305
1.42k
      sgroupFound = true;
3306
1.42k
      tempStr =
3307
1.42k
          ParseV3000SGroupsBlock(inStream, line, nSgroups, mol, strictParsing);
3308
1.42k
      boost::to_upper(tempStr);
3309
1.42k
      if (tempStr.length() < 10 || tempStr.substr(0, 10) != "END SGROUP") {
3310
87
        std::ostringstream errout;
3311
87
        errout << "END SGROUP line not found on line " << line;
3312
87
        if (strictParsing) {
3313
2
          throw FileParseException(errout.str());
3314
85
        } else {
3315
85
          BOOST_LOG(rdWarningLog) << errout.str() << std::endl;
3316
85
        }
3317
1.33k
      } else {
3318
1.33k
        tempStr = getV3000Line(inStream, line);
3319
1.33k
        boost::to_upper(tempStr);
3320
1.33k
      }
3321
3322
6.98k
    } else if (tempStr.length() >= 15 &&
3323
5.68k
               tempStr.substr(6, 10) == "COLLECTION") {
3324
4.12k
      tempStr = parseEnhancedStereo(inStream, line, mol, strictParsing);
3325
4.12k
      boost::to_upper(tempStr);
3326
4.12k
    } else if (tempStr.length() >= 11 &&
3327
2.48k
               tempStr.substr(0, 11) == "BEGIN OBJ3D") {
3328
51
      if (obj3dFound) {
3329
1
        std::ostringstream errout;
3330
1
        errout << "BEGIN OBJ3D found more than once on line " << line;
3331
1
        throw FileParseException(errout.str());
3332
1
      }
3333
50
      if (!n3DConstraints) {
3334
30
        std::ostringstream errout;
3335
30
        errout << "BEGIN OBJ3D found but 3n3DConstraints NOT expected on line "
3336
30
               << line;
3337
30
        if (strictParsing) {
3338
1
          throw FileParseException(errout.str());
3339
29
        } else {
3340
29
          BOOST_LOG(rdWarningLog) << errout.str() << std::endl;
3341
29
        }
3342
30
      }
3343
49
      BOOST_LOG(rdWarningLog)
3344
0
          << "3D constraint information in mol block ignored at line " << line
3345
0
          << std::endl;
3346
49
      obj3dFound = true;
3347
451
      for (unsigned int i = 0; i < n3DConstraints; ++i) {
3348
402
        tempStr = getV3000Line(inStream, line);
3349
402
      }
3350
49
      tempStr = getV3000Line(inStream, line);
3351
49
      boost::to_upper(tempStr);
3352
49
      if (tempStr.length() < 9 || tempStr.substr(0, 9) != "END OBJ3D") {
3353
31
        std::ostringstream errout;
3354
31
        errout << "END OBJ3D line not found on line " << line;
3355
31
        if (strictParsing) {
3356
0
          throw FileParseException(errout.str());
3357
31
        } else {
3358
31
          BOOST_LOG(rdWarningLog) << errout.str() << std::endl;
3359
31
        }
3360
31
      }
3361
49
      tempStr = getV3000Line(inStream, line);
3362
49
      boost::to_upper(tempStr);
3363
2.80k
    } else {
3364
      // skip blocks we don't know how to read
3365
2.80k
      BOOST_LOG(rdWarningLog) << "skipping block at line " << line << ": '"
3366
0
                              << tempStr << "'" << std::endl;
3367
8.05k
      while (tempStr.length() < 3 || tempStr.substr(0, 3) != "END") {
3368
5.25k
        tempStr = getV3000Line(inStream, line);
3369
5.25k
      }
3370
2.80k
      tempStr = getV3000Line(inStream, line);
3371
2.80k
      boost::to_upper(tempStr);
3372
2.80k
    }
3373
8.41k
  }
3374
3375
4.65k
  if (nSgroups && !sgroupFound) {
3376
52
    std::ostringstream errout;
3377
52
    errout << "BEGIN SGROUP line not found on line " << line;
3378
52
    if (strictParsing) {
3379
9
      throw FileParseException(errout.str());
3380
43
    } else {
3381
43
      BOOST_LOG(rdWarningLog) << errout.str() << std::endl;
3382
43
    }
3383
52
  }
3384
3385
4.64k
  if (n3DConstraints && !obj3dFound) {
3386
42
    std::ostringstream errout;
3387
42
    errout << "BEGIN OBJ3D line not found on line " << line;
3388
42
    if (strictParsing) {
3389
4
      throw FileParseException(errout.str());
3390
38
    } else {
3391
38
      BOOST_LOG(rdWarningLog) << errout.str() << std::endl;
3392
38
    }
3393
42
  }
3394
3395
4.64k
  boost::to_upper(tempStr);
3396
4.64k
  if (tempStr.length() < 8 || tempStr.substr(0, 8) != "END CTAB") {
3397
208
    if (strictParsing) {
3398
44
      throw FileParseException("END CTAB line not found");
3399
164
    } else {
3400
164
      BOOST_LOG(rdWarningLog) << "END CTAB line not found." << std::endl;
3401
164
    }
3402
208
  }
3403
3404
4.59k
  if (expectMEND) {
3405
165
    tempStr = getLine(inStream);
3406
165
    ++line;
3407
165
    if (tempStr[0] == 'M' && tempStr.substr(0, 6) == "M  END") {
3408
21
      fileComplete = true;
3409
21
    }
3410
4.43k
  } else {
3411
4.43k
    fileComplete = true;
3412
4.43k
  }
3413
3414
4.59k
  auto is3d = calculate3dFlag(*mol, *conf, chiralityPossible);
3415
4.59k
  conf->set3D(is3d);
3416
4.59k
  mol->addConformer(conf, true);
3417
4.59k
  conf = nullptr;
3418
3419
4.59k
  return fileComplete;
3420
4.64k
}
3421
3422
bool ParseV2000CTAB(std::istream *inStream, unsigned int &line, RWMol *mol,
3423
                    Conformer *&conf, bool &chiralityPossible,
3424
                    unsigned int &nAtoms, unsigned int &nBonds,
3425
18.4k
                    bool strictParsing) {
3426
18.4k
  conf = new Conformer(nAtoms);
3427
3428
18.4k
  if (nAtoms == 0) {
3429
3.66k
    conf->set3D(false);
3430
14.8k
  } else {
3431
14.8k
    ParseMolBlockAtoms(inStream, line, nAtoms, mol, conf, strictParsing);
3432
14.8k
  }
3433
18.4k
  ParseMolBlockBonds(inStream, line, nBonds, mol, chiralityPossible);
3434
3435
18.4k
  auto is3d = calculate3dFlag(*mol, *conf, chiralityPossible);
3436
18.4k
  conf->set3D(is3d);
3437
18.4k
  mol->addConformer(conf, true);
3438
18.4k
  conf = nullptr;
3439
3440
18.4k
  bool fileComplete =
3441
18.4k
      ParseMolBlockProperties(inStream, line, mol, strictParsing);
3442
18.4k
  return fileComplete;
3443
18.4k
}
3444
3445
void finishMolProcessing(
3446
    RWMol *res, bool chiralityPossible,
3447
4.90k
    const RDKit::v2::FileParsers::MolFileParserParams &params) {
3448
4.90k
  if (!res) {
3449
0
    return;
3450
0
  }
3451
4.90k
  res->clearAllAtomBookmarks();
3452
4.90k
  res->clearAllBondBookmarks();
3453
3454
4.90k
  if (params.expandAttachmentPoints) {
3455
0
    MolOps::expandAttachmentPoints(*res);
3456
0
  }
3457
3458
  // calculate explicit valence on each atom:
3459
110k
  for (auto atom : res->atoms()) {
3460
110k
    atom->calcExplicitValence(false);
3461
110k
  }
3462
3463
  // postprocess mol file flags
3464
4.90k
  ProcessMolProps(res);
3465
3466
  // update the chirality and stereo-chemistry
3467
  //
3468
  // NOTE: we detect the stereochemistry before sanitizing/removing
3469
  // hydrogens because the removal of H atoms may actually remove
3470
  // the wedged bond from the molecule.  This wipes out the only
3471
  // sign that chirality ever existed and makes us sad... so first
3472
  // perceive chirality, then remove the Hs and sanitize.
3473
  //
3474
4.90k
  const Conformer &conf = res->getConformer();
3475
4.90k
  if (chiralityPossible || conf.is3D()) {
3476
3.51k
    if (!conf.is3D()) {
3477
418
      bool replaceExistingTags = true;
3478
418
      MolOps::assignChiralTypesFromBondDirs(*res, conf.getId(),
3479
418
                                            replaceExistingTags);
3480
3.09k
    } else {
3481
3.09k
      res->updatePropertyCache(false);
3482
3.09k
      MolOps::assignChiralTypesFrom3D(*res, conf.getId(), true);
3483
3.09k
    }
3484
3.51k
  }
3485
3486
4.90k
  Atropisomers::detectAtropisomerChirality(*res, &conf);
3487
3488
  // now that atom stereochem has been perceived, the wedging
3489
  // information is no longer needed, so we clear
3490
  // single bond dir flags:
3491
4.90k
  MolOps::clearSingleBondDirFlags(*res);
3492
3493
4.90k
  if (params.sanitize) {
3494
4.09k
    if (params.removeHs) {
3495
      // Bond stereo detection must happen before H removal, or
3496
      // else we might be removing stereogenic H atoms in double
3497
      // bonds (e.g. imines). But before we run stereo detection,
3498
      // we need to run mol cleanup so don't have trouble with
3499
      // e.g. nitro groups. Sadly, this a;; means we will find
3500
      // run both cleanup and ring finding twice (a fast find
3501
      // rings in bond stereo detection, and another in
3502
      // sanitization's SSSR symmetrization).
3503
2.50k
      unsigned int failedOp = 0;
3504
2.50k
      MolOps::sanitizeMol(*res, failedOp, MolOps::SANITIZE_CLEANUP);
3505
2.50k
      MolOps::detectBondStereochemistry(*res);
3506
2.50k
      MolOps::removeHs(*res);
3507
2.50k
    } else {
3508
1.59k
      MolOps::sanitizeMol(*res);
3509
1.59k
      MolOps::detectBondStereochemistry(*res);
3510
1.59k
    }
3511
3512
4.09k
    MolOps::assignStereochemistry(*res, true, true, true);
3513
4.09k
  } else {
3514
805
    MolOps::detectBondStereochemistry(*res);
3515
805
  }
3516
3517
4.90k
  if (res->hasProp(common_properties::_NeedsQueryScan)) {
3518
79
    res->clearProp(common_properties::_NeedsQueryScan);
3519
79
    QueryOps::completeMolQueries(res);
3520
79
  }
3521
4.90k
}
3522
}  // namespace FileParserUtils
3523
3524
namespace v2 {
3525
namespace FileParsers {
3526
//------------------------------------------------
3527
//
3528
//  Read a molecule from a stream
3529
//
3530
//------------------------------------------------
3531
std::unique_ptr<RWMol> MolFromMolDataStream(std::istream &inStream,
3532
                                            unsigned int &line,
3533
23.4k
                                            const MolFileParserParams &params) {
3534
23.4k
  std::string tempStr;
3535
23.4k
  bool fileComplete = false;
3536
23.4k
  bool chiralityPossible = false;
3537
23.4k
  Utils::LocaleSwitcher ls;
3538
  // mol name
3539
23.4k
  line++;
3540
23.4k
  tempStr = getLine(inStream);
3541
23.4k
  if (inStream.eof()) {
3542
42
    return nullptr;
3543
42
  }
3544
23.4k
  auto res = std::make_unique<RWMol>();
3545
23.4k
  res->setProp(common_properties::_Name, tempStr);
3546
3547
  // info
3548
23.4k
  line++;
3549
23.4k
  tempStr = getLine(inStream);
3550
23.4k
  res->setProp("_MolFileInfo", tempStr);
3551
23.4k
  if (tempStr.length() >= 22) {
3552
2.66k
    std::string dimLabel = tempStr.substr(20, 2);
3553
    // Unless labelled as 3D we assume 2D
3554
2.66k
    if (dimLabel == "3d" || dimLabel == "3D") {
3555
418
      res->setProp(common_properties::_3DConf, 1);
3556
418
    }
3557
2.66k
  }
3558
  // comments
3559
23.4k
  line++;
3560
23.4k
  tempStr = getLine(inStream);
3561
23.4k
  res->setProp("_MolFileComments", tempStr);
3562
3563
23.4k
  unsigned int nAtoms = 0, nBonds = 0, nLists = 0, chiralFlag = 0, nsText = 0,
3564
23.4k
               nRxnComponents = 0;
3565
23.4k
  int nReactants = 0, nProducts = 0, nIntermediates = 0;
3566
23.4k
  (void)nLists;  // read from the file but unused
3567
23.4k
  (void)nsText;
3568
23.4k
  (void)nRxnComponents;
3569
23.4k
  (void)nReactants;
3570
23.4k
  (void)nProducts;
3571
23.4k
  (void)nIntermediates;
3572
  // counts line, this is where we really get started
3573
23.4k
  line++;
3574
23.4k
  tempStr = getLine(inStream);
3575
3576
23.4k
  if (tempStr.size() < 6) {
3577
101
    if (res) {
3578
101
      res = nullptr;
3579
101
    }
3580
101
    std::ostringstream errout;
3581
101
    errout << "Counts line too short: '" << tempStr << "' on line" << line;
3582
101
    throw FileParseException(errout.str());
3583
101
  }
3584
3585
23.3k
  unsigned int spos = 0;
3586
  // this needs to go into a try block because if the lexical_cast throws an
3587
  // exception we want to catch throw a different exception
3588
23.3k
  try {
3589
23.3k
    nAtoms = FileParserUtils::toUnsigned(tempStr.substr(spos, 3), true);
3590
23.3k
    spos = 3;
3591
23.3k
    nBonds = FileParserUtils::toUnsigned(tempStr.substr(spos, 3), true);
3592
23.3k
    spos = 6;
3593
23.3k
  } catch (boost::bad_lexical_cast &) {
3594
61
    std::ostringstream errout;
3595
61
    errout << "Cannot convert '" << tempStr.substr(spos, 3)
3596
61
           << "' to unsigned int on line " << line;
3597
61
    throw FileParseException(errout.str());
3598
61
  }
3599
23.2k
  try {
3600
23.2k
    spos = 6;
3601
23.2k
    if (tempStr.size() >= 9) {
3602
11.0k
      nLists = FileParserUtils::toUnsigned(tempStr.substr(spos, 3), true);
3603
11.0k
    }
3604
3605
23.2k
    spos = 12;
3606
23.2k
    if (tempStr.size() >= spos + 3) {
3607
3.45k
      chiralFlag = FileParserUtils::toUnsigned(tempStr.substr(spos, 3), true);
3608
3.45k
    }
3609
3610
23.2k
    spos = 15;
3611
23.2k
    if (tempStr.size() >= spos + 3) {
3612
2.11k
      nsText = FileParserUtils::toUnsigned(tempStr.substr(spos, 3), true);
3613
2.11k
    }
3614
3615
23.2k
    spos = 18;
3616
23.2k
    if (tempStr.size() >= spos + 3) {
3617
1.47k
      nRxnComponents =
3618
1.47k
          FileParserUtils::toUnsigned(tempStr.substr(spos, 3), true);
3619
1.47k
    }
3620
3621
23.2k
    spos = 21;
3622
23.2k
    if (tempStr.size() >= spos + 3) {
3623
1.38k
      nReactants = FileParserUtils::toUnsigned(tempStr.substr(spos, 3), true);
3624
1.38k
    }
3625
3626
23.2k
    spos = 24;
3627
23.2k
    if (tempStr.size() >= spos + 3) {
3628
1.25k
      nProducts = FileParserUtils::toUnsigned(tempStr.substr(spos, 3), true);
3629
1.25k
    }
3630
3631
23.2k
    spos = 27;
3632
23.2k
    if (tempStr.size() >= spos + 3) {
3633
1.19k
      nIntermediates =
3634
1.19k
          FileParserUtils::toUnsigned(tempStr.substr(spos, 3), true);
3635
1.19k
    }
3636
3637
23.2k
  } catch (boost::bad_lexical_cast &) {
3638
    // some SD files (such as some from NCI) lack all the extra information
3639
    // on the header line, so ignore problems parsing there.
3640
9.34k
  }
3641
3642
23.2k
  unsigned int ctabVersion = 2000;
3643
23.2k
  if (tempStr.size() > 35) {
3644
7.34k
    if (tempStr.size() < 39 || tempStr[34] != 'V') {
3645
1.10k
      std::ostringstream errout;
3646
1.10k
      errout << "CTAB version string invalid at line " << line;
3647
1.10k
      if (params.strictParsing) {
3648
40
        throw FileParseException(errout.str());
3649
1.06k
      } else {
3650
1.06k
        BOOST_LOG(rdWarningLog) << errout.str() << std::endl;
3651
1.06k
      }
3652
6.23k
    } else if (tempStr.substr(34, 5) == "V3000") {
3653
4.74k
      ctabVersion = 3000;
3654
4.74k
    } else if (tempStr.substr(34, 5) != "V2000") {
3655
503
      std::ostringstream errout;
3656
503
      errout << "Unsupported CTAB version: '" << tempStr.substr(34, 5)
3657
503
             << "' at line " << line;
3658
503
      if (params.strictParsing) {
3659
2
        throw FileParseException(errout.str());
3660
501
      } else {
3661
501
        BOOST_LOG(rdWarningLog) << errout.str() << std::endl;
3662
501
      }
3663
992
    } else if (params.parsingSCSRMol) {
3664
0
      std::ostringstream errout;
3665
0
      errout << "SCSR Mol files is not V3000 at line" << line;
3666
0
      throw FileParseException(errout.str());
3667
0
    }
3668
7.34k
  }
3669
3670
23.2k
  res->setProp(common_properties::_MolFileChiralFlag, chiralFlag);
3671
3672
23.2k
  Conformer *conf = nullptr;
3673
23.2k
  try {
3674
23.2k
    if (ctabVersion == 2000) {
3675
18.4k
      fileComplete = FileParserUtils::ParseV2000CTAB(
3676
18.4k
          &inStream, line, res.get(), conf, chiralityPossible, nAtoms, nBonds,
3677
18.4k
          params.strictParsing);
3678
18.4k
    } else {
3679
4.74k
      if (nAtoms != 0 || nBonds != 0) {
3680
894
        std::ostringstream errout;
3681
894
        errout << "V3000 mol blocks should have 0s in the initial counts line. "
3682
894
                  "(line: "
3683
894
               << line << ")";
3684
894
        if (params.strictParsing) {
3685
1
          throw FileParseException(errout.str());
3686
893
        } else {
3687
893
          BOOST_LOG(rdWarningLog) << errout.str() << std::endl;
3688
893
        }
3689
894
      }
3690
3691
4.74k
      auto expectMEND = true;
3692
4.74k
      auto expectMacroAtoms = false;
3693
4.74k
      if (params.parsingSCSRMol) {
3694
0
        expectMEND = false;
3695
0
        expectMacroAtoms = true;
3696
0
      }
3697
3698
4.74k
      fileComplete = FileParserUtils::ParseV3000CTAB(
3699
4.74k
          &inStream, line, res.get(), conf, chiralityPossible, nAtoms, nBonds,
3700
4.74k
          params.strictParsing, expectMEND, expectMacroAtoms);
3701
4.74k
    }
3702
23.2k
  } catch (MolFileUnhandledFeatureException &e) {
3703
    // unhandled mol file feature, show an error
3704
58
    res.reset();
3705
58
    delete conf;
3706
58
    conf = nullptr;
3707
58
    BOOST_LOG(rdErrorLog) << " Unhandled CTAB feature: '" << e.what()
3708
0
                          << "'. Molecule skipped." << std::endl;
3709
3710
58
    if (!inStream.eof()) {
3711
54
      tempStr = getLine(inStream);
3712
54
    }
3713
58
    ++line;
3714
1.30k
    while (!inStream.eof() && !inStream.fail() &&
3715
1.25k
           tempStr.substr(0, 6) != "M  END" && tempStr.substr(0, 4) != "$$$$") {
3716
1.25k
      tempStr = getLine(inStream);
3717
1.25k
      ++line;
3718
1.25k
    }
3719
58
    fileComplete = !inStream.eof() || tempStr.substr(0, 6) == "M  END" ||
3720
54
                   tempStr.substr(0, 4) == "$$$$";
3721
12.3k
  } catch (FileParseException &e) {
3722
    // catch our exceptions and throw them back after cleanup
3723
12.3k
    delete conf;
3724
12.3k
    conf = nullptr;
3725
12.3k
    throw e;
3726
12.3k
  }
3727
3728
9.42k
  if (!fileComplete) {
3729
4.52k
    delete conf;
3730
4.52k
    conf = nullptr;
3731
4.52k
    std::ostringstream errout;
3732
4.52k
    errout
3733
4.52k
        << "Problems encountered parsing Mol data, M  END missing around line "
3734
4.52k
        << line;
3735
4.52k
    throw FileParseException(errout.str());
3736
4.52k
  }
3737
3738
4.90k
  if (res) {
3739
4.90k
    FileParserUtils::finishMolProcessing(res.get(), chiralityPossible, params);
3740
4.90k
  }
3741
4.90k
  return res;
3742
9.42k
}
3743
3744
//------------------------------------------------
3745
//
3746
//  Read a molecule from a string
3747
//
3748
//------------------------------------------------
3749
std::unique_ptr<RWMol> MolFromMolBlock(const std::string &molBlock,
3750
0
                                       const MolFileParserParams &params) {
3751
0
  std::istringstream inStream(molBlock);
3752
0
  unsigned int line = 0;
3753
0
  return MolFromMolDataStream(inStream, line, params);
3754
0
}
3755
3756
//------------------------------------------------
3757
//
3758
//  Read a molecule from a file
3759
//
3760
//------------------------------------------------
3761
std::unique_ptr<RWMol> MolFromMolFile(const std::string &fName,
3762
0
                                      const MolFileParserParams &params) {
3763
0
  std::ifstream inStream(fName.c_str());
3764
0
  if (!inStream || (inStream.bad())) {
3765
0
    std::ostringstream errout;
3766
0
    errout << "Bad input file " << fName;
3767
0
    throw BadFileException(errout.str());
3768
0
  }
3769
0
  if (!inStream.eof()) {
3770
0
    unsigned int line = 0;
3771
0
    return MolFromMolDataStream(inStream, line, params);
3772
0
  } else {
3773
0
    return std::unique_ptr<RWMol>();
3774
0
  }
3775
0
}
3776
}  // namespace FileParsers
3777
}  // namespace v2
3778
}  // namespace RDKit