/src/openbabel/src/fingerprints/fingerecfp.cpp
Line | Count | Source (jump to first uncovered line) |
1 | | /********************************************************************** |
2 | | Copyright (C) 2016 by NextMove Software |
3 | | |
4 | | This program is free software; you can redistribute it and/or modify |
5 | | it under the terms of the GNU General Public License as published by |
6 | | the Free Software Foundation version 2 of the License. |
7 | | |
8 | | This program is distributed in the hope that it will be useful, |
9 | | but WITHOUT ANY WARRANTY; without even the implied warranty of |
10 | | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
11 | | GNU General Public License for more details. |
12 | | ***********************************************************************/ |
13 | | |
14 | | #include <openbabel/babelconfig.h> |
15 | | #include <openbabel/oberror.h> |
16 | | #include <openbabel/mol.h> |
17 | | #include <openbabel/atom.h> |
18 | | #include <openbabel/bond.h> |
19 | | #include <openbabel/obiter.h> |
20 | | #include <openbabel/fingerprint.h> |
21 | | #include <openbabel/obiter.h> |
22 | | #include <openbabel/elements.h> |
23 | | |
24 | | #include <vector> |
25 | | #include <algorithm> |
26 | | |
27 | | using namespace std; |
28 | | namespace OpenBabel |
29 | | { |
30 | | |
31 | | /// \brief Fingerprint based on ECFP |
32 | | class fingerprintECFP : public OBFingerprint |
33 | | { |
34 | | public: |
35 | | fingerprintECFP(const char* ID, bool IsDefault=false, |
36 | | unsigned int radius=4, bool keepdups = true) |
37 | 12 | : OBFingerprint(ID, IsDefault), |
38 | 12 | _radius(radius), _keepdups(keepdups), _flags(0){}; |
39 | | |
40 | | const char* Description() override |
41 | 0 | { |
42 | | // Important! The second line is used by some output formats (e.g. FPS) |
43 | | // to determine the default size |
44 | 0 | return "Extended-Connectivity Fingerprints (ECFPs)\n" |
45 | 0 | "4096 bits.\n" |
46 | 0 | "Circular topological fingerprints of specified radius\n"; |
47 | 0 | } |
48 | | |
49 | | //Calculates the fingerprint |
50 | | bool GetFingerprint(OBBase* pOb, vector<unsigned int>&fp, int nbits=0) override; |
51 | | |
52 | | /// \returns fragment info unless SetFlags(OBFingerprint::FPT_NOINFO) has been called before GetFingerprint() called. |
53 | | std::string DescribeBits(const std::vector<unsigned int> fp, bool bSet=true) override |
54 | 0 | { return _ss.str(); } |
55 | | |
56 | 0 | unsigned int Flags() override { return _flags; } |
57 | 0 | void SetFlags(unsigned int f) override { _flags=f; } |
58 | | |
59 | | private: |
60 | | std::vector<unsigned int> v; |
61 | | |
62 | | stringstream _ss; |
63 | | unsigned int _radius; |
64 | | bool _keepdups; |
65 | | unsigned int _flags; |
66 | | }; |
67 | | |
68 | | //*********************************************** |
69 | | //Make the global instances |
70 | | fingerprintECFP theECFP0("ECFP0",false, 0, true); |
71 | | fingerprintECFP theECFP2("ECFP2",false, 1, true); |
72 | | fingerprintECFP theECFP4("ECFP4",false, 2, true); |
73 | | fingerprintECFP theECFP6("ECFP6",false, 3, true); |
74 | | fingerprintECFP theECFP8("ECFP8",false, 4, true); |
75 | | fingerprintECFP theECFP10("ECFP10",false, 5, true); |
76 | | //*********************************************** |
77 | | |
78 | 0 | #define mix32(a,b,c) \ |
79 | 0 | { \ |
80 | 0 | a -= b; a -= c; a ^= (c>>13); \ |
81 | 0 | b -= c; b -= a; b ^= (a<<8); \ |
82 | 0 | c -= a; c -= b; c ^= (b>>13); \ |
83 | 0 | a -= b; a -= c; a ^= (c>>12); \ |
84 | 0 | b -= c; b -= a; b ^= (a<<16); \ |
85 | 0 | c -= a; c -= b; c ^= (b>>5); \ |
86 | 0 | a -= b; a -= c; a ^= (c>>3); \ |
87 | 0 | b -= c; b -= a; b ^= (a<<10); \ |
88 | 0 | c -= a; c -= b; c ^= (b>>15); \ |
89 | 0 | } |
90 | | |
91 | | static unsigned int ECFPHash(unsigned char *ptr, unsigned int length) |
92 | 0 | { |
93 | 0 | unsigned int a = 0; |
94 | 0 | unsigned int b = 0; |
95 | 0 | unsigned int c = 0; |
96 | |
|
97 | 0 | unsigned int len = length; |
98 | 0 | while (len >= 12) { |
99 | 0 | a += ptr[0] + ((unsigned int)ptr[1]<<8) |
100 | 0 | + ((unsigned int)ptr[2]<<16) |
101 | 0 | + ((unsigned int)ptr[3]<<24); |
102 | 0 | b += ptr[4] + ((unsigned int)ptr[5]<<8) |
103 | 0 | + ((unsigned int)ptr[6]<<16) |
104 | 0 | + ((unsigned int)ptr[7]<<24); |
105 | 0 | c += ptr[8] + ((unsigned int)ptr[9]<<8) |
106 | 0 | + ((unsigned int)ptr[10]<<16) |
107 | 0 | + ((unsigned int)ptr[11]<<24); |
108 | 0 | mix32(a,b,c); |
109 | 0 | ptr += 12; |
110 | 0 | len -= 12; |
111 | 0 | } |
112 | |
|
113 | 0 | c += length; |
114 | 0 | switch (len) { |
115 | 0 | case 11: c += ((unsigned int)ptr[10]<<24); |
116 | 0 | case 10: c += ((unsigned int)ptr[9]<<16); |
117 | 0 | case 9: c += ((unsigned int)ptr[8]<<8); |
118 | 0 | case 8: b += ((unsigned int)ptr[7]<<24); |
119 | 0 | case 7: b += ((unsigned int)ptr[6]<<16); |
120 | 0 | case 6: b += ((unsigned int)ptr[5]<<8); |
121 | 0 | case 5: b += ptr[4]; |
122 | 0 | case 4: a += ((unsigned int)ptr[3]<<24); |
123 | 0 | case 3: a += ((unsigned int)ptr[2]<<16); |
124 | 0 | case 2: a += ((unsigned int)ptr[1]<<8); |
125 | 0 | case 1: a += ptr[0]; |
126 | 0 | } |
127 | 0 | mix32(a,b,c); |
128 | 0 | return c; |
129 | 0 | } |
130 | | |
131 | | |
132 | | static unsigned int ECFPHash(std::vector<unsigned int> &v) |
133 | 0 | { |
134 | 0 | unsigned int len = (unsigned int)v.size(); |
135 | 0 | unsigned int a = 0; |
136 | 0 | unsigned int b = 0; |
137 | 0 | unsigned int c = 0; |
138 | 0 | unsigned int i = 0; |
139 | |
|
140 | 0 | while (i+2 < len) { |
141 | 0 | a += v[i]; |
142 | 0 | b += v[i+1]; |
143 | 0 | c += v[i+2]; |
144 | 0 | mix32(a,b,c); |
145 | 0 | i += 3; |
146 | 0 | } |
147 | 0 | c += len; |
148 | 0 | switch (len - i) { |
149 | 0 | case 2: b += v[i+1]; |
150 | 0 | case 1: a += v[i]; |
151 | 0 | } |
152 | 0 | mix32(a,b,c); |
153 | 0 | return c; |
154 | 0 | } |
155 | | |
156 | | |
157 | | struct AtomInfo { |
158 | | unsigned int e[5]; // hashcodes for ecfp0, 2, 4... This will segfault at ECFP12 |
159 | | std::vector<unsigned int> b[4]; // for duplicate removal as described in paper? |
160 | | }; |
161 | | |
162 | | struct NborInfo { |
163 | | unsigned int order; |
164 | | unsigned int idx; |
165 | | |
166 | 0 | NborInfo(unsigned int o, unsigned int i) : order(o), idx(i) {} |
167 | | bool operator < (const NborInfo &x) const |
168 | 0 | { |
169 | 0 | if (order != x.order) |
170 | 0 | return order < x.order; |
171 | 0 | return idx < x.idx; |
172 | 0 | } |
173 | | }; |
174 | | |
175 | | static void ECFPPass(OpenBabel::OBMol &mol, |
176 | | AtomInfo *ainfo, unsigned int pass) |
177 | 0 | { |
178 | 0 | FOR_ATOMS_OF_MOL(atom, mol) { |
179 | 0 | if (atom->GetAtomicNum() == OBElements::Hydrogen) |
180 | 0 | continue; |
181 | 0 | OpenBabel::OBAtom* aptr = &(*atom); |
182 | 0 | unsigned int idx = aptr->GetIdx(); |
183 | 0 | AtomInfo *ptr = &ainfo[idx]; |
184 | |
|
185 | 0 | std::vector<NborInfo> nbrs; |
186 | 0 | FOR_BONDS_OF_ATOM(bptr, aptr) { |
187 | 0 | OpenBabel::OBAtom* nptr = bptr->GetNbrAtom(aptr); |
188 | 0 | if (nptr->GetAtomicNum() == OBElements::Hydrogen) |
189 | 0 | continue; |
190 | 0 | unsigned int order; |
191 | 0 | if (!bptr->IsAromatic()) { |
192 | 0 | switch (bptr->GetBondOrder()) { |
193 | 0 | case 2: order = 2; break; |
194 | 0 | case 3: order = 3; break; |
195 | 0 | default: order = 1; |
196 | 0 | } |
197 | 0 | } else order = 4; |
198 | | |
199 | 0 | unsigned int nidx = nptr->GetIdx(); |
200 | |
|
201 | 0 | nbrs.push_back(NborInfo(order,ainfo[nidx].e[pass-1])); |
202 | | // for duplicate removal as described in paper (?) |
203 | 0 | if (pass == 1) |
204 | 0 | ptr->b[pass-1].push_back(bptr->GetIdx()); |
205 | 0 | } |
206 | 0 | std::sort(nbrs.begin(),nbrs.end()); |
207 | |
|
208 | 0 | std::vector<unsigned int> vint; |
209 | 0 | vint.push_back(pass); |
210 | 0 | vint.push_back(ptr->e[pass-1]); |
211 | 0 | std::vector<NborInfo>::const_iterator ni; |
212 | 0 | for (ni=nbrs.begin(); ni!=nbrs.end(); ++ni) { |
213 | 0 | vint.push_back(ni->order); |
214 | 0 | vint.push_back(ni->idx); |
215 | 0 | } |
216 | 0 | ptr->e[pass] = ECFPHash(vint); |
217 | 0 | } |
218 | 0 | } |
219 | | |
220 | | static void ECFPFirstPass(OpenBabel::OBMol &mol, |
221 | | AtomInfo *ainfo) |
222 | 0 | { |
223 | 0 | unsigned char buffer[8]; |
224 | | |
225 | | /* First Pass: ECFP_0 */ |
226 | 0 | FOR_ATOMS_OF_MOL(atom, mol) { |
227 | 0 | if (atom->GetAtomicNum() == OBElements::Hydrogen) |
228 | 0 | continue; |
229 | 0 | OpenBabel::OBAtom* aptr = &(*atom); |
230 | 0 | unsigned int idx = aptr->GetIdx(); |
231 | 0 | buffer[0] = aptr->GetHvyDegree(); // degree of heavy atom connections |
232 | 0 | buffer[1] = aptr->GetExplicitValence() - aptr->ExplicitHydrogenCount(); // valence of heavy atom connections |
233 | 0 | buffer[2] = aptr->GetAtomicNum(); |
234 | 0 | buffer[3] = (unsigned char)aptr->GetIsotope(); |
235 | 0 | buffer[4] = (unsigned char)aptr->GetFormalCharge(); |
236 | 0 | buffer[5] = (unsigned char)(aptr->ExplicitHydrogenCount() + aptr->GetImplicitHCount()); |
237 | 0 | buffer[6] = aptr->IsInRing() ? 1 : 0; |
238 | 0 | buffer[7] = 0; // aptr->IsAromatic() ? 1 : 0; |
239 | 0 | ainfo[idx].e[0] = ECFPHash(buffer,8); |
240 | 0 | } |
241 | 0 | } |
242 | | |
243 | | bool fingerprintECFP::GetFingerprint(OBBase* pOb, vector<unsigned int>&fp, int nbits) |
244 | 0 | { |
245 | 0 | OBMol* pmol = dynamic_cast<OBMol*>(pOb); |
246 | 0 | if(!pmol) return false; |
247 | | |
248 | | // default fingeprint size |
249 | 0 | if (nbits <= 0) |
250 | 0 | nbits = 4096; |
251 | |
|
252 | 0 | fp.resize(0); // clear without deallocating memory |
253 | 0 | fp.resize(nbits/Getbitsperint()); |
254 | | |
255 | 0 | _ss.str(""); |
256 | |
|
257 | 0 | unsigned int pass; |
258 | |
|
259 | 0 | unsigned int count = pmol->NumAtoms(); |
260 | 0 | if (count == 0) return true; |
261 | | |
262 | | // Access this using the Atom::Idx() |
263 | 0 | AtomInfo *ainfo = new AtomInfo[count+1]; |
264 | |
|
265 | 0 | ECFPFirstPass(*pmol,ainfo); |
266 | 0 | for (pass=1; pass<= _radius; pass++) |
267 | 0 | ECFPPass(*pmol,ainfo,pass); |
268 | | |
269 | | // Duplicate removal - this is a simplified version of what's in the paper |
270 | 0 | FOR_ATOMS_OF_MOL(atom, pmol) { |
271 | 0 | if (atom->GetAtomicNum() == OBElements::Hydrogen) |
272 | 0 | continue; |
273 | 0 | unsigned int idx = atom->GetIdx(); |
274 | 0 | for (pass=0; pass <= _radius; pass++) { |
275 | 0 | unsigned int bit = (ainfo[idx].e[pass] % nbits) & 0x7fffffff; |
276 | 0 | SetBit(fp, bit); |
277 | 0 | } |
278 | 0 | } |
279 | |
|
280 | 0 | delete[] ainfo; |
281 | |
|
282 | 0 | return true; |
283 | 0 | } |
284 | | |
285 | | } //namespace OpenBabel |
286 | | |
287 | | //! \file fingerecfp.cpp |
288 | | //! \brief ECFP fingerprint definition and implementation |