Coverage Report

Created: 2026-06-23 06:55

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/rdkit/Code/GraphMol/FileParsers/FileParsers.h
Line
Count
Source
1
//
2
//  Copyright (C) 2002-2024 Greg Landrum and other RDKit contributors
3
//
4
//   @@ All Rights Reserved @@
5
//  This file is part of the RDKit.
6
//  The contents are covered by the terms of the BSD license
7
//  which is included in the file license.txt, found at the root
8
//  of the RDKit source tree.
9
//
10
#include <RDGeneral/export.h>
11
#ifndef RD_FILEPARSERS_H
12
#define RD_FILEPARSERS_H
13
14
#include <RDGeneral/types.h>
15
#include <GraphMol/RDKitBase.h>
16
#include <GraphMol/FileParsers/FileWriters.h>
17
#include "CDXMLParser.h"
18
#include <string>
19
#include <string_view>
20
#include <vector>
21
#include <exception>
22
23
#include <boost/shared_ptr.hpp>
24
25
namespace RDKit {
26
27
RDKIT_FILEPARSERS_EXPORT std::string strip(const std::string &orig);
28
29
namespace v2 {
30
namespace FileParsers {
31
class RDKIT_FILEPARSERS_EXPORT MolFileUnhandledFeatureException
32
    : public std::exception {
33
 public:
34
  //! construct from a C-string error message (the text is copied)
35
0
  explicit MolFileUnhandledFeatureException(const char *msg) : _msg(msg) {}
36
  //! construct from a std::string error message (the text is copied)
37
  explicit MolFileUnhandledFeatureException(const std::string msg)
38
48
      : _msg(msg) {}
39
  //! get the error message; the pointer refers to the string stored in
  //! this exception object
40
0
  const char *what() const noexcept override { return _msg.c_str(); }
41
48
  ~MolFileUnhandledFeatureException() noexcept override = default;
42
43
 private:
44
  std::string _msg;
45
};
46
47
struct RDKIT_FILEPARSERS_EXPORT MolFileParserParams {
48
  bool sanitize = true;      /**< sanitize the molecule after building it */
49
  bool removeHs = true;      /**< remove Hs after constructing the molecule */
50
  bool strictParsing = true; /**< if set to false, the parser is more lax about
51
                                correctness of the contents. */
52
  bool expandAttachmentPoints =
53
      false; /**< toggle conversion of attachment points into dummy atoms */
54
  bool parsingSCSRMol = false; /**< if true, we are parsing a SCSR mol file */
55
};
56
enum class SCSRTemplateNames {
57
  AsEntered,     //!< use the name of the template as entered in the SCSR Mol
58
  UseFirstName,  //!< use the first name in the template
59
                 //!< def (for AA, the 3 letter code)
60
  UseSecondName  //!< use the second name in the template def
61
                 //!< (for AA, the 1 letter code)
62
};
63
64
enum class SCSRBaseHbondOptions {
65
  Ignore,     //!< do not include base Hbonds in expanded output
66
  UseSapAll,  //!< use all hbonds defined in SAPs;
67
              //!< can be more than one per base
68
  UseSapOne,  //!< use only one SAP hbond per base.
69
              //!< If multiple SAPs are defined, use the first
70
              //!< even if it is not the best
71
              //!< (this just maintains the relationship between
72
              //!< the two base pairs)
73
  Auto        //!< for bases that are C,G,A,T,U,In (and
74
              //!< derivatives) use the standard Watson-Crick
75
              //!< Hbonding.  No SAPs need to be defined, and if
76
              //!< defined, they are ignored.
77
};
78
79
struct RDKIT_FILEPARSERS_EXPORT MolFromSCSRParams {
80
  bool includeLeavingGroups =
81
      true; /**< when true, leaving groups on atoms that are not exo-bonded are
82
                retained.  When false, no leaving groups are retained */
83
  SCSRTemplateNames scsrTemplateNames = SCSRTemplateNames::AsEntered; /**< template naming option (see SCSRTemplateNames); defaults to AsEntered */
84
85
  SCSRBaseHbondOptions scsrBaseHbondOptions = SCSRBaseHbondOptions::UseSapAll; /**< base Hbond option (see SCSRBaseHbondOptions); defaults to UseSapAll */
86
};
87
RDKIT_FILEPARSERS_EXPORT std::unique_ptr<RWMol> MolFromMolDataStream(
88
    std::istream &inStream, unsigned int &line,
89
    const MolFileParserParams &params = MolFileParserParams());
90
RDKIT_FILEPARSERS_EXPORT std::unique_ptr<RWMol> MolFromMolBlock(
91
    const std::string &molBlock,
92
    const MolFileParserParams &params = MolFileParserParams());
93
RDKIT_FILEPARSERS_EXPORT std::unique_ptr<RWMol> MolFromMolFile(
94
    const std::string &fName,
95
    const MolFileParserParams &params = MolFileParserParams());
96
97
RDKIT_FILEPARSERS_EXPORT std::unique_ptr<RDKit::RWMol> MolFromSCSRDataStream(
98
    std::istream &inStream, unsigned int &line,
99
    const MolFileParserParams &molFileParserParams = MolFileParserParams(),
100
    const MolFromSCSRParams &molFromSCSRParams = MolFromSCSRParams());
101
RDKIT_FILEPARSERS_EXPORT std::unique_ptr<RDKit::RWMol> MolFromSCSRBlock(
102
    const std::string &molBlock,
103
    const MolFileParserParams &molFileParserParams = MolFileParserParams(),
104
    const MolFromSCSRParams &molFromSCSRParams = MolFromSCSRParams());
105
RDKIT_FILEPARSERS_EXPORT std::unique_ptr<RDKit::RWMol> MolFromSCSRFile(
106
    const std::string &fName,
107
    const MolFileParserParams &molFileParserParams = MolFileParserParams(),
108
    const MolFromSCSRParams &molFromSCSRParams = MolFromSCSRParams());
109
110
}  // namespace FileParsers
111
}  // namespace v2
112
113
inline namespace v1 {
114
using RDKit::v2::FileParsers::MolFileUnhandledFeatureException;
115
//-----
116
// mol files
117
//-----
118
// \brief construct a molecule from MDL mol data in a stream
119
/*!
120
 *   \param inStream - stream containing the data
121
 *   \param line     - current line number (used for error reporting)
122
 *   \param sanitize - toggles sanitization and stereochemistry
123
 *                     perception of the molecule
124
 *   \param removeHs - toggles removal of Hs from the molecule. H removal
125
 *                     is only done if the molecule is sanitized
126
 *   \param line     - current line number (used for error reporting)
127
 *   \param strictParsing - if set to false, the parser is more lax about
128
 * correctness of the contents.
129
 *
130
 */
131
inline RWMol *MolDataStreamToMol(std::istream *inStream, unsigned int &line,
132
                                 bool sanitize = true, bool removeHs = true,
133
19.8k
                                 bool strictParsing = true) {
134
19.8k
  v2::FileParsers::MolFileParserParams ps;
135
19.8k
  ps.sanitize = sanitize;
136
19.8k
  ps.removeHs = removeHs;
137
19.8k
  ps.strictParsing = strictParsing;
138
19.8k
  return v2::FileParsers::MolFromMolDataStream(*inStream, line, ps).release();
139
19.8k
};
140
// \overload
141
inline RWMol *MolDataStreamToMol(std::istream &inStream, unsigned int &line,
142
                                 bool sanitize = true, bool removeHs = true,
143
0
                                 bool strictParsing = true) {
144
0
  return MolDataStreamToMol(&inStream, line, sanitize, removeHs, strictParsing);
145
0
};
146
// \brief construct a molecule from an MDL mol block
147
/*!
148
 *   \param molBlock - string containing the mol block
149
 *   \param sanitize - toggles sanitization and stereochemistry
150
 *                     perception of the molecule
151
 *   \param removeHs - toggles removal of Hs from the molecule. H removal
152
 *                     is only done if the molecule is sanitized
153
 *   \param strictParsing - if set to false, the parser is more lax about
154
 * correctness of the contents.
155
 */
156
inline RWMol *MolBlockToMol(const std::string &molBlock, bool sanitize = true,
157
0
                            bool removeHs = true, bool strictParsing = true) {
158
0
  v2::FileParsers::MolFileParserParams ps;
159
0
  ps.sanitize = sanitize;
160
0
  ps.removeHs = removeHs;
161
0
  ps.strictParsing = strictParsing;
162
0
  return v2::FileParsers::MolFromMolBlock(molBlock, ps).release();
163
0
};
164
165
// \brief construct a molecule from an MDL mol file
166
/*!
167
 *   \param fName    - string containing the file name
168
 *   \param sanitize - toggles sanitization and stereochemistry
169
 *                     perception of the molecule
170
 *   \param removeHs - toggles removal of Hs from the molecule. H removal
171
 *                     is only done if the molecule is sanitized
172
 *   \param strictParsing - if set to false, the parser is more lax about
173
 * correctness of the contents.
174
 */
175
inline RWMol *MolFileToMol(const std::string &fName, bool sanitize = true,
176
0
                           bool removeHs = true, bool strictParsing = true) {
177
0
  v2::FileParsers::MolFileParserParams ps;
178
0
  ps.sanitize = sanitize;
179
0
  ps.removeHs = removeHs;
180
0
  ps.strictParsing = strictParsing;
181
0
  return v2::FileParsers::MolFromMolFile(fName, ps).release();
182
0
};
183
}  // namespace v1
184
185
//-----
186
//  TPL handling:
187
//-----
188
189
namespace v2 {
190
namespace FileParsers {
191
struct RDKIT_FILEPARSERS_EXPORT TPLParserParams {
192
  bool sanitize = true; /**< sanitize the molecule after building it */
193
  bool skipFirstConf =
194
      false; /**< if set to true, the first conformer will be skipped */
195
};
196
RDKIT_FILEPARSERS_EXPORT std::unique_ptr<RWMol> MolFromTPLDataStream(
197
    std::istream &inStream, unsigned int &line,
198
    const TPLParserParams &params = TPLParserParams());
199
RDKIT_FILEPARSERS_EXPORT std::unique_ptr<RWMol> MolFromTPLFile(
200
    const std::string &fName,
201
    const TPLParserParams &params = TPLParserParams());
202
203
}  // namespace FileParsers
204
}  // namespace v2
205
206
inline namespace v1 {
207
//! \brief translate TPL data (BioCad format) into a multi-conf molecule
208
/*!
209
  \param inStream:      the stream from which to read
210
  \param line:          used to track the line number of errors
211
  \param sanitize:      toggles sanitization and stereochemistry
212
                        perception of the molecule
213
  \param skipFirstConf: according to the TPL format description, the atomic
214
                        coords in the atom-information block describe the first
215
                        conformation and the first conf block describes second
216
                        conformation. The CombiCode, on the other hand, writes
217
                        the first conformation data both to the atom-information
218
                        block and to the first conf block. We want to be able to
219
                        read CombiCode-style tpls, so we'll allow this
220
                        mis-feature
221
                        to be parsed when this flag is set.
222
*/
223
inline RWMol *TPLDataStreamToMol(std::istream *inStream, unsigned int &line,
224
                                 bool sanitize = true,
225
0
                                 bool skipFirstConf = false) {
226
0
  v2::FileParsers::TPLParserParams ps;
227
0
  ps.sanitize = sanitize;
228
0
  ps.skipFirstConf = skipFirstConf;
229
0
  return v2::FileParsers::MolFromTPLDataStream(*inStream, line, ps).release();
230
0
}
231
232
//! \brief construct a multi-conf molecule from a TPL (BioCad format) file
233
/*!
234
  \param fName:         the name of the file from which to read
235
  \param sanitize:      toggles sanitization and stereochemistry
236
                        perception of the molecule
237
  \param skipFirstConf: according to the TPL format description, the atomic
238
                        coords in the atom-information block describe the first
239
                        conformation and the first conf block describes second
240
                        conformation. The CombiCode, on the other hand, writes
241
                        the first conformation data both to the atom-information
242
                        block and to the first conf block. We want to be able to
243
                        read CombiCode-style tpls, so we'll allow this
244
                        mis-feature
245
                        to be parsed when this flag is set.
246
*/
247
inline RWMol *TPLFileToMol(const std::string &fName, bool sanitize = true,
248
0
                           bool skipFirstConf = false) {
249
0
  v2::FileParsers::TPLParserParams ps;
250
0
  ps.sanitize = sanitize;
251
0
  ps.skipFirstConf = skipFirstConf;
252
0
  return v2::FileParsers::MolFromTPLFile(fName, ps).release();
253
0
}
254
}  // namespace v1
255
256
namespace v2 {
257
namespace FileParsers {
258
259
//-----
260
//  MOL2 handling
261
//-----
262
263
typedef enum {
264
  CORINA = 0  //!< supports output from Corina and some dbtranslate output
265
} Mol2Type;
266
267
struct Mol2ParserParams {
268
  bool sanitize = true; /**< sanitize the molecule after building it */
269
  bool removeHs = true; /**< remove Hs after constructing the molecule */
270
  Mol2Type variant = Mol2Type::CORINA; /**< the atom type definitions to use */
271
  bool cleanupSubstructures =
272
      true; /**< toggles recognition and cleanup of common substructures */
273
};
274
275
RDKIT_FILEPARSERS_EXPORT std::unique_ptr<RWMol> MolFromMol2DataStream(
276
    std::istream &inStream,
277
    const Mol2ParserParams &params = Mol2ParserParams());
278
RDKIT_FILEPARSERS_EXPORT std::unique_ptr<RWMol> MolFromMol2Block(
279
    const std::string &molBlock,
280
    const Mol2ParserParams &params = Mol2ParserParams());
281
RDKIT_FILEPARSERS_EXPORT std::unique_ptr<RWMol> MolFromMol2File(
282
    const std::string &fName,
283
    const Mol2ParserParams &params = Mol2ParserParams());
284
285
}  // namespace FileParsers
286
}  // namespace v2
287
288
inline namespace v1 {
289
using RDKit::v2::FileParsers::Mol2Type;
290
291
// \brief construct a molecule from a Tripos mol2 file
292
/*!
293
 *
294
 *   \param fName    - string containing the file name
295
 *   \param sanitize - toggles sanitization of the molecule
296
 *   \param removeHs - toggles removal of Hs from the molecule. H removal
297
 *                     is only done if the molecule is sanitized
298
 *   \param variant  - the atom type definitions to use
299
 *   \param cleanupSubstructures - toggles recognition and cleanup of common
300
 *                                 substructures
301
 */
302
inline RWMol *Mol2FileToMol(const std::string &fName, bool sanitize = true,
303
                            bool removeHs = true,
304
                            Mol2Type variant = Mol2Type::CORINA,
305
0
                            bool cleanupSubstructures = true) {
306
0
  v2::FileParsers::Mol2ParserParams ps;
307
0
  ps.sanitize = sanitize;
308
0
  ps.removeHs = removeHs;
309
0
  ps.variant = variant;
310
0
  ps.cleanupSubstructures = cleanupSubstructures;
311
0
  return v2::FileParsers::MolFromMol2File(fName, ps).release();
312
0
}
313
314
// \brief construct a molecule from Tripos mol2 data in a stream
315
/*!
316
 *   \param inStream - stream containing the data
317
 *   \param sanitize - toggles sanitization of the molecule
318
 *   \param removeHs - toggles removal of Hs from the molecule. H removal
319
 *                     is only done if the molecule is sanitized
320
 *   \param variant  - the atom type definitions to use
321
 *   \param cleanupSubstructures - toggles recognition and cleanup of common
322
 *                                 substructures
323
 */
324
inline RWMol *Mol2DataStreamToMol(std::istream &inStream, bool sanitize = true,
325
                                  bool removeHs = true,
326
                                  Mol2Type variant = Mol2Type::CORINA,
327
0
                                  bool cleanupSubstructures = true) {
328
0
  v2::FileParsers::Mol2ParserParams ps;
329
0
  ps.sanitize = sanitize;
330
0
  ps.removeHs = removeHs;
331
0
  ps.variant = variant;
332
0
  ps.cleanupSubstructures = cleanupSubstructures;
333
0
  return v2::FileParsers::MolFromMol2DataStream(inStream, ps).release();
334
0
}
335
// \overload
336
inline RWMol *Mol2DataStreamToMol(std::istream *inStream, bool sanitize = true,
337
                                  bool removeHs = true,
338
                                  Mol2Type variant = Mol2Type::CORINA,
339
0
                                  bool cleanupSubstructures = true) {
340
0
  return Mol2DataStreamToMol(*inStream, sanitize, removeHs, variant,
341
0
                             cleanupSubstructures);
342
0
}
343
344
// \brief construct a molecule from a Tripos mol2 block
345
/*!
346
 *   \param molBlock - string containing the mol block
347
 *   \param sanitize - toggles sanitization of the molecule
348
 *   \param removeHs - toggles removal of Hs from the molecule. H removal
349
 *                     is only done if the molecule is sanitized
350
 *   \param variant  - the atom type definitions to use
351
 *   \param cleanupSubstructures - toggles recognition and cleanup of common
352
 *                                 substructures
353
 */
354
inline RWMol *Mol2BlockToMol(const std::string &molBlock, bool sanitize = true,
355
                             bool removeHs = true,
356
                             Mol2Type variant = Mol2Type::CORINA,
357
0
                             bool cleanupSubstructures = true) {
358
0
  v2::FileParsers::Mol2ParserParams ps;
359
0
  ps.sanitize = sanitize;
360
0
  ps.removeHs = removeHs;
361
0
  ps.variant = variant;
362
0
  ps.cleanupSubstructures = cleanupSubstructures;
363
0
  return v2::FileParsers::MolFromMol2Block(molBlock, ps).release();
364
0
}
365
}  // namespace v1
366
367
namespace v2 {
368
namespace FileParsers {
369
370
RDKIT_FILEPARSERS_EXPORT std::unique_ptr<RWMol> MolFromXYZDataStream(
371
    std::istream &inStream);
372
// \brief construct a molecule from an xyz block
373
/*!
374
 *   \param xyzBlock    - string containing the xyz block
375
 */
376
RDKIT_FILEPARSERS_EXPORT std::unique_ptr<RWMol> MolFromXYZBlock(
377
    const std::string &xyzBlock);
378
// \brief construct a molecule from an xyz file
379
/*!
380
 *   \param fName    - string containing the file name
381
 */
382
RDKIT_FILEPARSERS_EXPORT std::unique_ptr<RWMol> MolFromXYZFile(
383
    const std::string &fName);
384
}  // namespace FileParsers
385
}  // namespace v2
386
inline namespace v1 {
387
0
inline RWMol *XYZDataStreamToMol(std::istream &inStream) {
388
0
  return v2::FileParsers::MolFromXYZDataStream(inStream).release();
389
0
}
390
// \brief construct a molecule from an xyz block
391
/*!
392
 *   \param xyzBlock    - string containing the xyz block
393
 */
394
0
inline RWMol *XYZBlockToMol(const std::string &xyzBlock) {
395
0
  return v2::FileParsers::MolFromXYZBlock(xyzBlock).release();
396
0
}
397
// \brief construct a molecule from an xyz file
398
/*!
399
 *   \param fName    - string containing the file name
400
 */
401
0
inline RWMol *XYZFileToMol(const std::string &fName) {
402
0
  return v2::FileParsers::MolFromXYZFile(fName).release();
403
0
}
404
405
}  // namespace v1
406
407
namespace v2 {
408
namespace FileParsers {
409
struct RDKIT_FILEPARSERS_EXPORT PDBParserParams {
410
  bool sanitize = true; /**< sanitize the molecule after building it */
411
  bool removeHs = true; /**< remove Hs after constructing the molecule */
412
  bool proximityBonding = true; /**< if set to true, proximity bonding will be
413
                                   performed */
414
  unsigned int flavor = 0;      /**< flavor to use */
415
};
416
417
RDKIT_FILEPARSERS_EXPORT std::unique_ptr<RWMol> MolFromPDBDataStream(
418
    std::istream &inStream, const PDBParserParams &params = PDBParserParams());
419
RDKIT_FILEPARSERS_EXPORT std::unique_ptr<RWMol> MolFromPDBFile(
420
    const std::string &fname,
421
    const PDBParserParams &params = PDBParserParams());
422
RDKIT_FILEPARSERS_EXPORT std::unique_ptr<RWMol> MolFromPDBBlock(
423
    const std::string &str, const PDBParserParams &params = PDBParserParams());
424
}  // namespace FileParsers
425
}  // namespace v2
426
427
inline namespace v1 {
428
using RDKit::v2::FileParsers::PDBParserParams;
429
inline RWMol *PDBBlockToMol(const std::string &str, bool sanitize = true,
430
                            bool removeHs = true, unsigned int flavor = 0,
431
0
                            bool proximityBonding = true) {
432
0
  v2::FileParsers::PDBParserParams ps;
433
0
  ps.sanitize = sanitize;
434
0
  ps.removeHs = removeHs;
435
0
  ps.flavor = flavor;
436
0
  ps.proximityBonding = proximityBonding;
437
0
  return v2::FileParsers::MolFromPDBBlock(str, ps).release();
438
0
}
439
inline RWMol *PDBBlockToMol(const char *str, bool sanitize = true,
440
                            bool removeHs = true, unsigned int flavor = 0,
441
0
                            bool proximityBonding = true) {
442
0
  return PDBBlockToMol(std::string(str), sanitize, removeHs, flavor,
443
0
                       proximityBonding);
444
0
}
445
inline RWMol *PDBFileToMol(const std::string &fname, bool sanitize = true,
446
                           bool removeHs = true, unsigned int flavor = 0,
447
0
                           bool proximityBonding = true) {
448
0
  v2::FileParsers::PDBParserParams ps;
449
0
  ps.sanitize = sanitize;
450
0
  ps.removeHs = removeHs;
451
0
  ps.flavor = flavor;
452
0
  ps.proximityBonding = proximityBonding;
453
0
  return v2::FileParsers::MolFromPDBFile(fname, ps).release();
454
0
}
455
inline RWMol *PDBDataStreamToMol(std::istream &inStream, bool sanitize = true,
456
                                 bool removeHs = true, unsigned int flavor = 0,
457
0
                                 bool proximityBonding = true) {
458
0
  v2::FileParsers::PDBParserParams ps;
459
0
  ps.sanitize = sanitize;
460
0
  ps.removeHs = removeHs;
461
0
  ps.flavor = flavor;
462
0
  ps.proximityBonding = proximityBonding;
463
0
  return v2::FileParsers::MolFromPDBDataStream(inStream, ps).release();
464
0
}
465
inline RWMol *PDBDataStreamToMol(std::istream *inStream, bool sanitize = true,
466
                                 bool removeHs = true, unsigned int flavor = 0,
467
0
                                 bool proximityBonding = true) {
468
0
  return PDBDataStreamToMol(*inStream, sanitize, removeHs, flavor,
469
0
                            proximityBonding);
470
0
}
471
}  // namespace v1
472
473
// \brief reads a molecule from the metadata in an RDKit-generated SVG file
474
/*!
475
 *   \param svg      - string containing the SVG
476
 *   \param sanitize - toggles sanitization of the molecule
477
 *   \param removeHs - toggles removal of Hs from the molecule. H removal
478
 *                     is only done if the molecule is sanitized
479
 *
480
 *   **NOTE** This functionality should be considered beta.
481
 */
482
RDKIT_FILEPARSERS_EXPORT RWMol *RDKitSVGToMol(const std::string &svg,
483
                                              bool sanitize = true,
484
                                              bool removeHs = true);
485
/*! \overload
486
 */
487
RDKIT_FILEPARSERS_EXPORT RWMol *RDKitSVGToMol(std::istream *instream,
488
                                              bool sanitize = true,
489
                                              bool removeHs = true);
490
491
inline std::unique_ptr<RDKit::RWMol> operator""_ctab(const char *text,
492
0
                                                     size_t len) {
493
0
  std::string data(text, len);
494
0
  try {
495
0
    return v2::FileParsers::MolFromMolBlock(data);
496
0
  } catch (const RDKit::MolSanitizeException &) {
497
0
    return nullptr;
498
0
  }
499
0
}
500
inline std::unique_ptr<RDKit::RWMol> operator""_mol2(const char *text,
501
0
                                                     size_t len) {
502
0
  std::string data(text, len);
503
0
  try {
504
0
    return v2::FileParsers::MolFromMol2Block(data);
505
0
  } catch (const RDKit::MolSanitizeException &) {
506
0
    return nullptr;
507
0
  }
508
0
}
509
510
inline std::unique_ptr<RDKit::RWMol> operator""_pdb(const char *text,
511
0
                                                    size_t len) {
512
0
  std::string data(text, len);
513
0
  try {
514
0
    return v2::FileParsers::MolFromPDBBlock(data);
515
0
  } catch (const RDKit::MolSanitizeException &) {
516
0
    return nullptr;
517
0
  }
518
0
}
519
520
}  // namespace RDKit
521
522
#endif