Coverage Report

Created: 2025-08-24 07:05

/src/openbabel/src/fingerprints/fingerecfp.cpp
Line
Count
Source (jump to first uncovered line)
1
/**********************************************************************
2
Copyright (C) 2016 by NextMove Software
3
4
This program is free software; you can redistribute it and/or modify
5
it under the terms of the GNU General Public License as published by
6
the Free Software Foundation version 2 of the License.
7
8
This program is distributed in the hope that it will be useful,
9
but WITHOUT ANY WARRANTY; without even the implied warranty of
10
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
11
GNU General Public License for more details.
12
***********************************************************************/
13
14
#include <openbabel/babelconfig.h>
15
#include <openbabel/oberror.h>
16
#include <openbabel/mol.h>
17
#include <openbabel/atom.h>
18
#include <openbabel/bond.h>
19
#include <openbabel/obiter.h>
20
#include <openbabel/fingerprint.h>
21
#include <openbabel/obiter.h>
22
#include <openbabel/elements.h>
23
24
#include <vector>
25
#include <algorithm>
26
27
using namespace std;
28
namespace OpenBabel
29
{
30
31
/// \brief Fingerprint based on ECFP
32
class fingerprintECFP : public OBFingerprint
33
{
34
public:
35
  fingerprintECFP(const char* ID, bool IsDefault=false,
36
                  unsigned int radius=4, bool keepdups = true)
37
12
    : OBFingerprint(ID, IsDefault),
38
12
      _radius(radius), _keepdups(keepdups), _flags(0){};
39
40
  const char* Description() override
41
0
  {
42
          // Important! The second line is used by some output formats (e.g. FPS)
43
    // to determine the default size
44
0
    return "Extended-Connectivity Fingerprints (ECFPs)\n"
45
0
                 "4096 bits.\n"
46
0
                 "Circular topological fingerprints of specified radius\n";
47
0
  }
48
49
  //Calculates the fingerprint
50
  bool GetFingerprint(OBBase* pOb, vector<unsigned int>&fp, int nbits=0) override;
51
52
  /// \returns fragment info unless SetFlags(OBFingerprint::FPT_NOINFO) has been called before GetFingerprint() called.
53
  std::string DescribeBits(const std::vector<unsigned int> fp, bool bSet=true) override
54
0
  { return _ss.str(); }
55
56
0
  unsigned int Flags() override { return _flags; }
57
0
  void SetFlags(unsigned int f) override { _flags=f; }
58
59
private:
60
  std::vector<unsigned int> v;
61
62
  stringstream _ss;
63
  unsigned int _radius;
64
  bool _keepdups;
65
  unsigned int _flags;
66
};
67
68
//***********************************************
69
//Make the global instances
70
  fingerprintECFP theECFP0("ECFP0",false, 0, true);
71
  fingerprintECFP theECFP2("ECFP2",false, 1, true);
72
  fingerprintECFP theECFP4("ECFP4",false, 2, true);
73
  fingerprintECFP theECFP6("ECFP6",false, 3, true);
74
  fingerprintECFP theECFP8("ECFP8",false, 4, true);
75
  fingerprintECFP theECFP10("ECFP10",false, 5, true);
76
//***********************************************
77
78
0
#define mix32(a,b,c) \
79
0
{ \
80
0
  a -= b; a -= c; a ^= (c>>13); \
81
0
  b -= c; b -= a; b ^= (a<<8); \
82
0
  c -= a; c -= b; c ^= (b>>13); \
83
0
  a -= b; a -= c; a ^= (c>>12);  \
84
0
  b -= c; b -= a; b ^= (a<<16); \
85
0
  c -= a; c -= b; c ^= (b>>5); \
86
0
  a -= b; a -= c; a ^= (c>>3);  \
87
0
  b -= c; b -= a; b ^= (a<<10); \
88
0
  c -= a; c -= b; c ^= (b>>15); \
89
0
}
90
91
static unsigned int ECFPHash(unsigned char *ptr, unsigned int length)
92
0
{
93
0
  unsigned int a = 0;
94
0
  unsigned int b = 0;
95
0
  unsigned int c = 0;
96
97
0
  unsigned int len = length;
98
0
  while (len >= 12) {
99
0
    a += ptr[0] + ((unsigned int)ptr[1]<<8)
100
0
                + ((unsigned int)ptr[2]<<16)
101
0
                + ((unsigned int)ptr[3]<<24);
102
0
    b += ptr[4] + ((unsigned int)ptr[5]<<8)
103
0
                + ((unsigned int)ptr[6]<<16)
104
0
                + ((unsigned int)ptr[7]<<24);
105
0
    c += ptr[8] + ((unsigned int)ptr[9]<<8)
106
0
                + ((unsigned int)ptr[10]<<16)
107
0
                + ((unsigned int)ptr[11]<<24);
108
0
    mix32(a,b,c);
109
0
    ptr += 12;
110
0
    len -= 12;
111
0
  }
112
113
0
  c += length;
114
0
  switch (len) {
115
0
  case 11: c += ((unsigned int)ptr[10]<<24);
116
0
  case 10: c += ((unsigned int)ptr[9]<<16);
117
0
  case  9: c += ((unsigned int)ptr[8]<<8);
118
0
  case  8: b += ((unsigned int)ptr[7]<<24);
119
0
  case  7: b += ((unsigned int)ptr[6]<<16);
120
0
  case  6: b += ((unsigned int)ptr[5]<<8);
121
0
  case  5: b += ptr[4];
122
0
  case  4: a += ((unsigned int)ptr[3]<<24);
123
0
  case  3: a += ((unsigned int)ptr[2]<<16);
124
0
  case  2: a += ((unsigned int)ptr[1]<<8);
125
0
  case  1: a += ptr[0];
126
0
  }
127
0
  mix32(a,b,c);
128
0
  return c;
129
0
}
130
131
132
static unsigned int ECFPHash(std::vector<unsigned int> &v)
133
0
{
134
0
  unsigned int len = (unsigned int)v.size();
135
0
  unsigned int a = 0;
136
0
  unsigned int b = 0;
137
0
  unsigned int c = 0;
138
0
  unsigned int i = 0;
139
140
0
  while (i+2 < len) {
141
0
    a += v[i];
142
0
    b += v[i+1];
143
0
    c += v[i+2];
144
0
    mix32(a,b,c);
145
0
    i += 3;
146
0
  }
147
0
  c += len;
148
0
  switch (len - i) {
149
0
  case 2:  b += v[i+1];
150
0
  case 1:  a += v[i];
151
0
  }
152
0
  mix32(a,b,c);
153
0
  return c;
154
0
}
155
156
157
struct AtomInfo {
158
  unsigned int e[5]; // hashcodes for ecfp0, 2, 4... This will segfault at ECFP12
159
  std::vector<unsigned int> b[4]; // for duplicate removal as described in paper?
160
};
161
162
struct NborInfo {
163
  unsigned int order;
164
  unsigned int idx;
165
166
0
  NborInfo(unsigned int o, unsigned int i) : order(o), idx(i) {}
167
  bool operator < (const NborInfo &x) const
168
0
  {
169
0
    if (order != x.order)
170
0
      return order < x.order;
171
0
    return idx < x.idx;
172
0
  }
173
};
174
175
static void ECFPPass(OpenBabel::OBMol &mol,
176
                     AtomInfo *ainfo, unsigned int pass)
177
0
{
178
0
  FOR_ATOMS_OF_MOL(atom, mol) {
179
0
    if (atom->GetAtomicNum() == OBElements::Hydrogen)
180
0
      continue;
181
0
    OpenBabel::OBAtom* aptr = &(*atom);
182
0
    unsigned int idx = aptr->GetIdx();
183
0
    AtomInfo *ptr = &ainfo[idx];
184
185
0
    std::vector<NborInfo> nbrs;
186
0
    FOR_BONDS_OF_ATOM(bptr, aptr) {
187
0
      OpenBabel::OBAtom* nptr = bptr->GetNbrAtom(aptr);
188
0
      if (nptr->GetAtomicNum() == OBElements::Hydrogen)
189
0
        continue;
190
0
      unsigned int order;
191
0
      if (!bptr->IsAromatic()) {
192
0
        switch (bptr->GetBondOrder()) {
193
0
        case 2:  order = 2;  break;
194
0
        case 3:  order = 3;  break;
195
0
        default: order = 1;
196
0
        }
197
0
      } else order = 4;
198
199
0
      unsigned int nidx = nptr->GetIdx();
200
201
0
      nbrs.push_back(NborInfo(order,ainfo[nidx].e[pass-1]));
202
      // for duplicate removal as described in paper (?)
203
0
      if (pass == 1)
204
0
        ptr->b[pass-1].push_back(bptr->GetIdx());
205
0
    }
206
0
    std::sort(nbrs.begin(),nbrs.end());
207
208
0
    std::vector<unsigned int> vint;
209
0
    vint.push_back(pass);
210
0
    vint.push_back(ptr->e[pass-1]);
211
0
    std::vector<NborInfo>::const_iterator ni;
212
0
    for (ni=nbrs.begin(); ni!=nbrs.end(); ++ni) {
213
0
      vint.push_back(ni->order);
214
0
      vint.push_back(ni->idx);
215
0
    }
216
0
    ptr->e[pass] = ECFPHash(vint);
217
0
  }
218
0
}
219
220
static void ECFPFirstPass(OpenBabel::OBMol &mol,
221
                          AtomInfo *ainfo)
222
0
{
223
0
  unsigned char buffer[8];
224
225
  /* First Pass: ECFP_0 */
226
0
  FOR_ATOMS_OF_MOL(atom, mol) {
227
0
    if (atom->GetAtomicNum() == OBElements::Hydrogen)
228
0
      continue;
229
0
    OpenBabel::OBAtom* aptr = &(*atom);
230
0
    unsigned int idx = aptr->GetIdx();
231
0
    buffer[0] = aptr->GetHvyDegree(); // degree of heavy atom connections
232
0
    buffer[1] = aptr->GetExplicitValence() - aptr->ExplicitHydrogenCount(); // valence of heavy atom connections
233
0
    buffer[2] = aptr->GetAtomicNum();
234
0
    buffer[3] = (unsigned char)aptr->GetIsotope();
235
0
    buffer[4] = (unsigned char)aptr->GetFormalCharge();
236
0
    buffer[5] = (unsigned char)(aptr->ExplicitHydrogenCount() + aptr->GetImplicitHCount());
237
0
    buffer[6] = aptr->IsInRing() ? 1 : 0;
238
0
    buffer[7] = 0;  // aptr->IsAromatic() ? 1 : 0;
239
0
    ainfo[idx].e[0] = ECFPHash(buffer,8);
240
0
  }
241
0
}
242
243
bool fingerprintECFP::GetFingerprint(OBBase* pOb, vector<unsigned int>&fp, int nbits)
244
0
{
245
0
  OBMol* pmol = dynamic_cast<OBMol*>(pOb);
246
0
  if(!pmol) return false;
247
  
248
  // default fingeprint size
249
0
  if (nbits <= 0)
250
0
    nbits = 4096;
251
252
0
  fp.resize(0); // clear without deallocating memory
253
0
  fp.resize(nbits/Getbitsperint());
254
  
255
0
  _ss.str("");
256
257
0
  unsigned int pass;
258
259
0
  unsigned int count = pmol->NumAtoms();
260
0
  if (count == 0) return true;
261
262
  // Access this using the Atom::Idx()
263
0
  AtomInfo *ainfo = new AtomInfo[count+1];
264
265
0
  ECFPFirstPass(*pmol,ainfo);
266
0
  for (pass=1; pass<= _radius; pass++)
267
0
    ECFPPass(*pmol,ainfo,pass);
268
269
  // Duplicate removal - this is a simplified version of what's in the paper
270
0
  FOR_ATOMS_OF_MOL(atom, pmol) {
271
0
    if (atom->GetAtomicNum() == OBElements::Hydrogen)
272
0
      continue;    
273
0
    unsigned int idx = atom->GetIdx();
274
0
    for (pass=0; pass <= _radius; pass++) {
275
0
      unsigned int bit = (ainfo[idx].e[pass] % nbits) & 0x7fffffff; 
276
0
      SetBit(fp, bit);
277
0
    }
278
0
  }
279
280
0
  delete[] ainfo;
281
282
0
  return true;
283
0
}
284
285
} //namespace OpenBabel
286
287
//! \file fingerecfp.cpp
288
//! \brief ECFP fingerprint definition and implementation