/src/rdkit/Code/GraphMol/SmilesParse/SmilesWrite.h
Line | Count | Source |
1 | | // |
2 | | // Copyright (C) 2002-2021 Greg Landrum and other RDKit contributors |
3 | | // |
4 | | // @@ All Rights Reserved @@ |
5 | | // This file is part of the RDKit. |
6 | | // The contents are covered by the terms of the BSD license |
7 | | // which is included in the file license.txt, found at the root |
8 | | // of the RDKit source tree. |
9 | | // |
10 | | #include <RDGeneral/export.h> |
11 | | #ifndef RD_SMILESWRITE_H_012020 |
12 | | #define RD_SMILESWRITE_H_012020 |
13 | | |
14 | | #include <string> |
15 | | #include <vector> |
16 | | #include <memory> |
17 | | #include <cstdint> |
18 | | #include <limits> |
19 | | #include <RDGeneral/BetterEnums.h> |
20 | | |
21 | | #include <boost/shared_ptr.hpp> |
22 | | |
23 | | namespace RDKit { |
24 | | class Atom; /* forward declarations: this header only names these types through pointers, references and shared_ptr */ |
25 | | class Bond; |
26 | | class ROMol; |
27 | | |
28 | | typedef std::vector<boost::shared_ptr<ROMol>> MOL_SPTR_VECT; /* vector of shared pointers to molecules */ |
29 | | |
30 | | struct RDKIT_SMILESPARSE_EXPORT SmilesWriteParams { /* options controlling SMILES generation; each member's initializer is its default */ |
31 | | bool doIsomericSmiles = |
32 | | true; /**< include stereochemistry and isotope information */ |
33 | | bool doKekule = false; /**< kekulize the molecule before generating the SMILES |
34 | | and output single/double bonds. NOTE that the output |
35 | | is not canonical and that this will throw an |
36 | | exception if the molecule cannot be kekulized. */ |
37 | | bool canonical = true; /**< generate canonical SMILES */ |
38 | | bool cleanStereo = true; /**< clean up stereo */ |
39 | | bool allBondsExplicit = false; /**< include symbols for all bonds */ |
40 | | bool allHsExplicit = false; /**< provide hydrogen counts for every atom */ |
41 | | bool doRandom = false; /**< randomize the output order. The resulting SMILES |
42 | | is not canonical and the value of the canonical |
43 | | parameter will be ignored. */ |
44 | | int rootedAtAtom = -1; /**< make sure the SMILES starts at the specified |
45 | | atom. The resulting SMILES is not canonical and |
46 | | the value of the canonical parameter will be |
47 | | ignored. */ |
48 | | bool includeDativeBonds = |
49 | | true; /**< include the RDKit extension for dative bonds. Otherwise dative |
50 | | bonds will be written as single bonds. */ |
51 | | bool ignoreAtomMapNumbers = false; /**< If true, ignores any atom map numbers |
52 | | when canonicalizing the molecule */ |
53 | | }; |
54 | | |
55 | | namespace SmilesWrite { |
56 | | |
57 | | BETTER_ENUM(CXSmilesFields, uint32_t, // clang-format off |
58 | | CX_NONE = 0, /* bit flags: each named field below occupies its own bit */ |
59 | | CX_ATOM_LABELS = 1 << 0, |
60 | | CX_MOLFILE_VALUES = 1 << 1, |
61 | | CX_COORDS = 1 << 2, |
62 | | CX_RADICALS = 1 << 3, |
63 | | CX_ATOM_PROPS = 1 << 4, |
64 | | CX_LINKNODES = 1 << 5, |
65 | | CX_ENHANCEDSTEREO = 1 << 6, |
66 | | CX_SGROUPS = 1 << 7, |
67 | | CX_POLYMER = 1 << 8, |
68 | | CX_BOND_CFG = 1 << 9, |
69 | | CX_BOND_ATROPISOMER = 1 << 10, |
70 | | CX_COORDINATE_BONDS = 1 << 11, |
71 | | CX_HYDROGEN_BONDS = 1 << 12, |
72 | | CX_ZERO_BONDS = 1 << 13, |
73 | | CX_ALL = 0x7fffffff, /* the low 31 bits set, which includes every flag above */ |
74 | | CX_ALL_BUT_COORDS = CX_ALL ^ CX_COORDS /* CX_ALL with the CX_COORDS bit cleared */ |
75 | | ); |
76 | | |
77 | | //! \brief returns the CXSMILES extension data for a molecule; \c flags is a bit mask of CXSmilesFields values (default CX_ALL) |
78 | | RDKIT_SMILESPARSE_EXPORT std::string getCXExtensions( |
79 | | const ROMol &mol, std::uint32_t flags = CXSmilesFields::CX_ALL); |
80 | | |
81 | | //! \brief returns the CXSMILES extension data for a vector of molecules; note that \c flags has no default in this overload |
82 | | RDKIT_SMILESPARSE_EXPORT std::string getCXExtensions( |
83 | | const std::vector<ROMol *> &mols, std::uint32_t flags); |
84 | | |
85 | | //! \brief returns true if the atomic number is in the SMILES organic subset |
86 | | RDKIT_SMILESPARSE_EXPORT bool inOrganicSubset(int atomicNumber); |
87 | | |
88 | | //! \brief returns the SMILES for an atom |
89 | | /*! |
90 | | \param atom : the atom to work with |
91 | | \param ps : the parameters controlling the SMILES generation |
92 | | */ |
93 | | RDKIT_SMILESPARSE_EXPORT std::string GetAtomSmiles(const Atom *atom, |
94 | | const SmilesWriteParams &ps); |
95 | | |
96 | | //! \brief returns the SMILES for an atom (convenience overload: packs its flags into a SmilesWriteParams and forwards) |
97 | | /*! |
98 | | \param atom : the atom to work with |
99 | | \param doKekule : we're doing kekulized smiles (e.g. don't use |
100 | | lower case for the atom label) |
101 | | \param bondIn : the bond we came into the atom on (unused; the parameter is unnamed in the signature) |
102 | | \param allHsExplicit : if true, hydrogen counts will be provided for every |
103 | | atom. |
104 | | \param isomericSmiles : if true, isomeric SMILES will be generated |
105 | | */ |
106 | | inline std::string GetAtomSmiles(const Atom *atom, bool doKekule = false, |
107 | | const Bond * = nullptr, |
108 | | bool allHsExplicit = false, |
109 | 0 | bool isomericSmiles = true) { |
110 | 0 | // the Bond argument is unnamed, so there is nothing to mark as unused |
111 | 0 | SmilesWriteParams ps; |
112 | 0 | ps.doIsomericSmiles = isomericSmiles; |
113 | 0 | ps.doKekule = doKekule; |
114 | 0 | ps.allHsExplicit = allHsExplicit; |
115 | 0 | return GetAtomSmiles(atom, ps); |
116 | 0 | } |
117 | | |
118 | | //! \brief returns the SMILES for a bond |
119 | | /*! |
120 | | \param bond : the bond to work with |
121 | | \param ps : the parameters controlling the SMILES generation |
122 | | \param atomToLeftIdx : the index of the atom preceding \c bond |
123 | | in the SMILES [default -1] |
124 | | */ |
125 | | RDKIT_SMILESPARSE_EXPORT std::string GetBondSmiles(const Bond *bond, |
126 | | const SmilesWriteParams &ps, |
127 | | int atomToLeftIdx = -1); |
128 | | //! \brief returns the SMILES for a bond (convenience overload: packs its flags into a SmilesWriteParams and forwards) |
129 | | /*! |
130 | | \param bond : the bond to work with |
131 | | \param atomToLeftIdx : the index of the atom preceding \c bond |
132 | | in the SMILES [default -1] |
133 | | \param doKekule : we're doing kekulized smiles (e.g. write out |
134 | | bond orders for aromatic bonds) |
135 | | \param allBondsExplicit : if true, symbols will be included for all bonds. |
136 | | */ |
137 | | inline std::string GetBondSmiles(const Bond *bond, int atomToLeftIdx = -1, |
138 | | bool doKekule = false, |
139 | 0 | bool allBondsExplicit = false) { |
140 | 0 | SmilesWriteParams ps; |
141 | 0 | ps.doKekule = doKekule; |
142 | 0 | ps.allBondsExplicit = allBondsExplicit; |
143 | 0 | ps.doIsomericSmiles = false; /* this overload always turns doIsomericSmiles off; it has no argument for it */ |
144 | 0 | return GetBondSmiles(bond, ps, atomToLeftIdx); |
145 | 0 | } |
146 | | |
147 | | namespace detail { /* NOTE(review): presumably implementation helpers rather than public API - confirm */ |
148 | | RDKIT_SMILESPARSE_EXPORT std::string MolToSmiles( |
149 | | const ROMol &mol, const SmilesWriteParams &params, bool doingCXSmiles, bool includeStereoGroups = true); |
150 | | } |
151 | | |
152 | | } // namespace SmilesWrite |
153 | | |
154 | | //! \brief returns SMILES for a molecule as controlled by \c params (canonical with the default parameters) |
155 | | RDKIT_SMILESPARSE_EXPORT std::string MolToSmiles( |
156 | | const ROMol &mol, const SmilesWriteParams &params); |
157 | | |
158 | | //! \brief returns SMILES for a molecule, canonical by default |
159 | | /*! |
160 | | \param mol : the molecule in question. |
161 | | \param doIsomericSmiles : include stereochemistry and isotope information |
162 | | in the SMILES |
163 | | |
164 | | \param doKekule : do Kekule smiles (i.e. don't use aromatic bonds) NOTE that |
165 | | this will throw an exception if the molecule cannot be kekulized. |
166 | | |
167 | | \param rootedAtAtom : make sure the SMILES starts at the specified atom. |
168 | | The resulting SMILES is not, of course, canonical. |
169 | | \param canonical : if false, no attempt will be made to canonicalize the |
170 | | SMILES |
171 | | \param allBondsExplicit : if true, symbols will be included for all bonds. |
172 | | \param allHsExplicit : if true, hydrogen counts will be provided for every |
173 | | atom. |
174 | | \param doRandom : if true, the first atom in the SMILES string will be |
175 | | selected at random and the SMILES string will not be canonical |
176 | | \param ignoreAtomMapNumbers : if true, ignores any atom map numbers when |
177 | | canonicalizing the molecule |
178 | | */ |
179 | | inline std::string MolToSmiles(const ROMol &mol, bool doIsomericSmiles = true, |
180 | | bool doKekule = false, int rootedAtAtom = -1, |
181 | | bool canonical = true, |
182 | | bool allBondsExplicit = false, |
183 | | bool allHsExplicit = false, |
184 | | bool doRandom = false, |
185 | 0 | bool ignoreAtomMapNumbers = false) { |
186 | 0 | SmilesWriteParams ps; /* every other SmilesWriteParams member keeps its default */ |
187 | 0 | ps.doIsomericSmiles = doIsomericSmiles; |
188 | 0 | ps.doKekule = doKekule; |
189 | 0 | ps.rootedAtAtom = rootedAtAtom; |
190 | 0 | ps.canonical = canonical; |
191 | 0 | ps.allBondsExplicit = allBondsExplicit; |
192 | 0 | ps.allHsExplicit = allHsExplicit; |
193 | 0 | ps.doRandom = doRandom; |
194 | 0 | ps.ignoreAtomMapNumbers = ignoreAtomMapNumbers; |
195 | 0 | return MolToSmiles(mol, ps); |
196 | 0 | } |
197 | | |
198 | | //! \brief returns a vector of random SMILES for a molecule (may contain |
199 | | //! duplicates) |
200 | | /*! |
201 | | \param mol : the molecule in question. |
202 | | \param numSmiles : the number of SMILES to return |
203 | | \param randomSeed : if >0, will be used to seed the random number generator [default 0] |
204 | | \param doIsomericSmiles : include stereochemistry and isotope information |
205 | | in the SMILES [default true] |
206 | | \param doKekule : do Kekule smiles (i.e. don't use aromatic bonds) [default false] |
207 | | \param allBondsExplicit : if true, symbols will be included for all bonds. [default false] |
208 | | \param allHsExplicit : if true, hydrogen counts will be provided for every |
209 | | atom. [default false] |
210 | | */ |
211 | | RDKIT_SMILESPARSE_EXPORT std::vector<std::string> MolToRandomSmilesVect( |
212 | | const ROMol &mol, unsigned int numSmiles, unsigned int randomSeed = 0, |
213 | | bool doIsomericSmiles = true, bool doKekule = false, |
214 | | bool allBondsExplicit = false, bool allHsExplicit = false); |
215 | | |
216 | | //! \brief returns SMILES for part of a molecule as controlled by \c params (canonical with the default parameters) |
217 | | RDKIT_SMILESPARSE_EXPORT std::string MolFragmentToSmiles( |
218 | | const ROMol &mol, const SmilesWriteParams &params, |
219 | | const std::vector<int> &atomsToUse, |
220 | | const std::vector<int> *bondsToUse = nullptr, |
221 | | const std::vector<std::string> *atomSymbols = nullptr, |
222 | | const std::vector<std::string> *bondSymbols = nullptr); |
223 | | |
224 | | //! \brief returns canonical SMILES for part of a molecule (convenience overload: packs its flags into a SmilesWriteParams and forwards) |
225 | | /*! |
226 | | \param mol : the molecule in question. |
227 | | \param atomsToUse : indices of the atoms in the fragment |
228 | | \param bondsToUse : indices of the bonds in the fragment. If this is not |
229 | | provided, |
230 | | all bonds between the atoms in atomsToUse will be included |
231 | | \param atomSymbols : symbols to use for the atoms in the output SMILES |
232 | | \param bondSymbols : symbols to use for the bonds in the output SMILES |
233 | | \param doIsomericSmiles : include stereochemistry and isotope information |
234 | | in the SMILES |
235 | | \param doKekule : do Kekule smiles (i.e. don't use aromatic bonds) |
236 | | \param rootedAtAtom : make sure the SMILES starts at the specified atom. |
237 | | The resulting SMILES is not, of course, canonical. |
238 | | \param canonical : if false, no attempt will be made to canonicalize the |
239 | | SMILES |
240 | | \param allBondsExplicit : if true, symbols will be included for all bonds. |
241 | | \param allHsExplicit : if true, hydrogen counts will be provided for every |
242 | | atom. |
243 | | \note this overload does not take a \c doRandom argument, so |
244 | | SmilesWriteParams::doRandom keeps its default (false). |
245 | | |
246 | | \b NOTE: the bondSymbols are *not* currently used in the canonicalization. |
247 | | |
248 | | */ |
249 | | inline std::string MolFragmentToSmiles( |
250 | | const ROMol &mol, const std::vector<int> &atomsToUse, |
251 | | const std::vector<int> *bondsToUse = nullptr, |
252 | | const std::vector<std::string> *atomSymbols = nullptr, |
253 | | const std::vector<std::string> *bondSymbols = nullptr, |
254 | | bool doIsomericSmiles = true, bool doKekule = false, int rootedAtAtom = -1, |
255 | | bool canonical = true, bool allBondsExplicit = false, |
256 | 0 | bool allHsExplicit = false) { |
257 | 0 | SmilesWriteParams ps; |
258 | 0 | ps.doIsomericSmiles = doIsomericSmiles; |
259 | 0 | ps.doKekule = doKekule; |
260 | 0 | ps.rootedAtAtom = rootedAtAtom; |
261 | 0 | ps.canonical = canonical; |
262 | 0 | ps.allBondsExplicit = allBondsExplicit; |
263 | 0 | ps.allHsExplicit = allHsExplicit; |
264 | 0 | return MolFragmentToSmiles(mol, ps, atomsToUse, bondsToUse, atomSymbols, |
265 | 0 | bondSymbols); |
266 | 0 | } |
267 | | |
268 | | BETTER_ENUM(RestoreBondDirOption, unsigned int, |
269 | | RestoreBondDirOptionTrue = 0, //!< DO restore bond dirs |
270 | | RestoreBondDirOptionClear = 1 //!< clear all bond dir information |
271 | | ); |
272 | | |
273 | | //! \brief returns canonical CXSMILES for a molecule |
274 | | RDKIT_SMILESPARSE_EXPORT std::string MolToCXSmiles( |
275 | | const ROMol &mol, const SmilesWriteParams &ps, |
276 | | std::uint32_t flags = SmilesWrite::CXSmilesFields::CX_ALL, |
277 | | RestoreBondDirOption restoreBondDirs = |
278 | | RestoreBondDirOption::RestoreBondDirOptionClear); |
279 | | |
280 | | //! \brief returns canonical CXSMILES for a molecule (convenience overload: packs its flags into a SmilesWriteParams and forwards) |
281 | | /*! |
282 | | \param mol : the molecule in question. |
283 | | \param doIsomericSmiles : include stereochemistry and isotope information |
284 | | in the SMILES |
285 | | \param doKekule : do Kekule smiles (i.e. don't use aromatic bonds) |
286 | | \param rootedAtAtom : make sure the SMILES starts at the specified atom. |
287 | | The resulting SMILES is not, of course, canonical. |
288 | | \param canonical : if false, no attempt will be made to canonicalize the |
289 | | SMILES |
290 | | \param allBondsExplicit : if true, symbols will be included for all bonds. |
291 | | \param allHsExplicit : if true, hydrogen counts will be provided for every |
292 | | atom. |
293 | | \param doRandom : generate a randomized smiles string by randomly choosing |
294 | | the priority to follow in the DFS traversal. [default false] |
295 | | */ |
296 | | inline std::string MolToCXSmiles(const ROMol &mol, bool doIsomericSmiles = true, |
297 | | bool doKekule = false, int rootedAtAtom = -1, |
298 | | bool canonical = true, |
299 | | bool allBondsExplicit = false, |
300 | | bool allHsExplicit = false, |
301 | 0 | bool doRandom = false) { |
302 | 0 | SmilesWriteParams ps; |
303 | 0 | ps.doIsomericSmiles = doIsomericSmiles; |
304 | 0 | ps.doKekule = doKekule; |
305 | 0 | ps.rootedAtAtom = rootedAtAtom; |
306 | 0 | ps.canonical = canonical; |
307 | 0 | ps.allBondsExplicit = allBondsExplicit; |
308 | 0 | ps.allHsExplicit = allHsExplicit; |
309 | 0 | ps.doRandom = doRandom; |
310 | 0 | return MolToCXSmiles(mol, ps, SmilesWrite::CXSmilesFields::CX_ALL); /* restoreBondDirs is left at its default */ |
311 | 0 | } |
312 | | |
313 | | //! \brief returns CXSMILES for part of a molecule as controlled by \c params (canonical with the default parameters) |
314 | | RDKIT_SMILESPARSE_EXPORT std::string MolFragmentToCXSmiles( |
315 | | const ROMol &mol, const SmilesWriteParams &params, |
316 | | const std::vector<int> &atomsToUse, |
317 | | const std::vector<int> *bondsToUse = nullptr, |
318 | | const std::vector<std::string> *atomSymbols = nullptr, |
319 | | const std::vector<std::string> *bondSymbols = nullptr); |
320 | | |
321 | | //! \brief returns canonical CXSMILES for part of a molecule (convenience overload: packs its flags into a SmilesWriteParams and forwards) |
322 | | /*! |
323 | | \param mol : the molecule in question. |
324 | | \param atomsToUse : indices of the atoms in the fragment |
325 | | \param bondsToUse : indices of the bonds in the fragment. If this is not |
326 | | provided, |
327 | | all bonds between the atoms in atomsToUse will be included |
328 | | \param atomSymbols : symbols to use for the atoms in the output SMILES |
329 | | \param bondSymbols : symbols to use for the bonds in the output SMILES |
330 | | \param doIsomericSmiles : include stereochemistry and isotope information |
331 | | in the SMILES [default true] |
332 | | \param doKekule : do Kekule smiles (i.e. don't use aromatic bonds) [default false] |
333 | | \param rootedAtAtom : make sure the SMILES starts at the specified atom. |
334 | | The resulting SMILES is not, of course, canonical. [default -1] |
335 | | \param canonical : if false, no attempt will be made to canonicalize the |
336 | | SMILES [default true] |
337 | | \param allBondsExplicit : if true, symbols will be included for all bonds. [default false] |
338 | | \param allHsExplicit : if true, hydrogen counts will be provided for every |
339 | | atom. [default false] |
340 | | |
341 | | \b NOTE: the bondSymbols are *not* currently used in the canonicalization. |
342 | | |
343 | | */ |
344 | | inline std::string MolFragmentToCXSmiles( |
345 | | const ROMol &mol, const std::vector<int> &atomsToUse, |
346 | | const std::vector<int> *bondsToUse = nullptr, |
347 | | const std::vector<std::string> *atomSymbols = nullptr, |
348 | | const std::vector<std::string> *bondSymbols = nullptr, |
349 | | bool doIsomericSmiles = true, bool doKekule = false, int rootedAtAtom = -1, |
350 | | bool canonical = true, bool allBondsExplicit = false, |
351 | 0 | bool allHsExplicit = false) { |
352 | 0 | SmilesWriteParams ps; |
353 | 0 | ps.doIsomericSmiles = doIsomericSmiles; |
354 | 0 | ps.doKekule = doKekule; |
355 | 0 | ps.rootedAtAtom = rootedAtAtom; |
356 | 0 | ps.canonical = canonical; |
357 | 0 | ps.allBondsExplicit = allBondsExplicit; |
358 | 0 | ps.allHsExplicit = allHsExplicit; |
359 | 0 | return MolFragmentToCXSmiles(mol, ps, atomsToUse, bondsToUse, atomSymbols, |
360 | 0 | bondSymbols); |
361 | 0 | } |
362 | | |
363 | | } // namespace RDKit |
364 | | #endif |