Coverage Report

Created: 2025-07-13 06:44

/src/openbabel/src/fingerprints/finger3.cpp
Line
Count
Source (jump to first uncovered line)
1
/**********************************************************************
2
finger3.cpp: Fingerprints based on list of SMARTS patterns
3
Copyright (C) 2005 Chris Morley
4
5
This file is part of the Open Babel project.
6
For more information, see <http://openbabel.org/>
7
8
This program is free software; you can redistribute it and/or modify
9
it under the terms of the GNU General Public License as published by
10
the Free Software Foundation version 2 of the License.
11
12
This program is distributed in the hope that it will be useful,
13
but WITHOUT ANY WARRANTY; without even the implied warranty of
14
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15
GNU General Public License for more details.
16
***********************************************************************/
17
18
#include <openbabel/babelconfig.h>
19
#include <openbabel/mol.h>
20
#include <openbabel/parsmart.h>
21
#include <openbabel/oberror.h>
22
#include <sstream>
23
#include <fstream>
24
#include <map>
25
#include <string>
26
27
#include <openbabel/fingerprint.h>
28
29
using namespace std;
30
namespace OpenBabel
31
{
32
/// \brief Fingerprint based on list of SMARTS patterns
33
class PatternFP  : public OBFingerprint
34
{
35
private:
36
  struct pattern
37
  {
38
    string smartsstring;
39
    OBSmartsPattern obsmarts;
40
    string description;
41
    int numbits;
42
    int numoccurrences;
43
    int bitindex;
44
  };
45
  vector<pattern> _pats;
46
  int _bitcount;
47
  string _version;
48
49
protected:
50
  string _patternsfile;
51
52
public:
53
  PatternFP(const char* ID, const char* filename=nullptr,
54
4
      bool IsDefault=false) : OBFingerprint(ID, IsDefault)
55
4
  {
56
4
    if (filename == nullptr)
57
2
      _patternsfile="patterns.txt";
58
2
    else
59
2
      _patternsfile = filename;
60
4
  }
61
62
/////////////////////////////////////////////////////////////////////////////
63
  const char* Description() override
64
0
  {
65
0
    static string desc;
66
    //Read patterns file if it has not been done already,
67
    //because we need _bitcount and _version updated
68
69
    // _bitcount and _version are available only after the datafile has been parsed.
70
    // This is a burden on normal operation (Description() gets called on startup from OBDefine),
71
    // so the secondline is present only after the fingerprint has been used.
72
    // the
73
0
    string secondline;
74
0
    if(!_pats.empty())
75
0
      secondline = "\n" + toString(_bitcount) + " bits. Datafile version = " +  _version;
76
0
    desc = "SMARTS patterns specified in the file " + _patternsfile
77
0
      + secondline
78
0
      + "\nPatternFP is definable";
79
0
    return (desc.c_str());
80
0
  }
81
82
//////////////////////////////////////////////////////////////////////////////
83
  //Each bit represents a single substructure
84
0
  unsigned int Flags() override { return FPT_UNIQUEBITS; }
85
86
///////////////////////////////////////////////////////////////////////////////
87
  PatternFP* MakeInstance(const std::vector<std::string>& textlines) override
88
0
  {
89
0
    return new PatternFP(textlines[1].c_str(),textlines[2].c_str());
90
0
  }
91
92
////////////////////////////////////////////////////////////////////////////////
93
  bool GetFingerprint(OBBase* pOb, vector<unsigned int>&fp, int foldbits) override
94
0
  {
95
0
    OBMol* pmol = dynamic_cast<OBMol*>(pOb);
96
0
    if(!pmol)
97
0
      return false;
98
99
    //This fingerprint is constructed from a molecule with no explicit hydrogens.
100
0
    pmol->DeleteHydrogens();
101
102
0
    unsigned int n;
103
    //Read patterns file if it has not been done already
104
0
    if(_pats.empty())
105
0
      ReadPatternFile(_version);
106
107
    //Make fp size the smallest power of two to contain the patterns
108
0
    n=Getbitsperint();
109
0
    while(n < _bitcount)
110
0
      n*=2;
111
0
    fp.resize(n/Getbitsperint());
112
113
0
    n=0; //bit position
114
0
    vector<pattern>::iterator ppat;
115
0
    for(ppat=_pats.begin();ppat!=_pats.end();++ppat)
116
0
    {
117
0
      if(ppat->numbits //ignore pattern if numbits==0
118
0
        && ppat->obsmarts.Match(*pmol, ppat->numoccurrences==0))//do single match if all that's needed
119
0
      {
120
        /* Set bits in the fingerprint depending on the number of matches in the molecule
121
           and the parameters, numbits and numoccurrences, in the pattern.
122
           The pattern will set or clear numbits bits in the fingerprint.
123
           They will be in numoccurrences+1 groups, each containing an approximately
124
           equal number of bits.
125
           The first group of bits will be set if numMatches > numoccurrences;
126
           The second group will be set if numMatches > numoccurrences - 1;
127
           and so on.
128
           So with a pattern with numbits = 4 and numoccurrences = 2,
129
           the groups would be 1, 1, and 2 bits.
130
           A molecule with
131
              1 match to the pattern would give 0011
132
              2 matches to the pattern would give 0111
133
              3 or more matches to the pattern would give 1111
134
        */
135
0
        int numMatches = ppat->obsmarts.GetUMapList().size();
136
0
        int num =  ppat->numbits, div = ppat->numoccurrences+1, ngrp;
137
138
0
        int i = n;
139
0
        while(num)
140
0
        {
141
0
          ngrp = (num -1)/div-- +1; //rounds up
142
0
          num -= ngrp;
143
0
          while(ngrp--)
144
0
            if (numMatches > div) {
145
0
              SetBit(fp,i);
146
0
            }
147
0
          i++;
148
0
        }
149
0
      }
150
0
      n += ppat->numbits;
151
0
    }
152
153
0
    if(foldbits)
154
0
      Fold(fp, foldbits);
155
0
    return true;
156
0
  }
157
158
  /////////////////////////////////////////////////////////////////////
159
  bool ReadPatternFile(string& ver)
160
0
  {
161
    //Reads three types of file. See below
162
0
    ifstream ifs;
163
0
    stringstream errorMsg;
164
165
0
    if (OpenDatafile(ifs, _patternsfile).length() == 0)
166
0
    {
167
0
      errorMsg << "Cannot open " << _patternsfile << endl;
168
0
      obErrorLog.ThrowError(__FUNCTION__, errorMsg.str(), obError);
169
0
      return false;
170
0
    }
171
172
0
    string line;
173
0
    if(!getline(ifs, line)) //first line
174
0
      return false;
175
0
    bool smartsfirst = (Trim(line)=="#Comments after SMARTS");
176
177
0
    _bitcount=0;
178
0
    bool indata=false;
179
0
    do
180
0
    {
181
0
      if(Trim(line).size()>0 && line[0]!='#')
182
0
      {
183
0
        pattern p;
184
0
        p.numbits=1; p.numoccurrences=0; //default values
185
0
        p.bitindex = _bitcount;
186
0
        istringstream ss(line);
187
0
        indata = true;
188
0
        if(smartsfirst)
189
0
        {
190
0
          if(isdigit(line[0]))
191
0
          {
192
0
            if(!ParseRDKitFormat(ss, p))
193
0
              continue;
194
0
          }
195
0
          else
196
            //Original format, which looks like:
197
            //  SMARTS description
198
0
            ss >> p.smartsstring >> p.description;
199
0
        }
200
0
        else
201
0
        {
202
          // Christian Laggner's format:
203
          //  description: SMARTS [occurrences [numbits]]
204
0
          getline(ss, p.description, ':');
205
0
          ss >> p.smartsstring;
206
0
          ss >> p.numoccurrences >> p.numbits;
207
0
        }
208
209
0
        if(!p.obsmarts.Init(p.smartsstring))
210
0
        {
211
0
          obErrorLog.ThrowError(__FUNCTION__,
212
0
            "Faulty SMARTS: " + p.description + ' ' + p.smartsstring, obError);
213
0
          continue;
214
0
        }
215
0
        _pats.push_back(p);
216
0
        _bitcount += p.numbits;
217
0
      }
218
0
      else if(!indata)
219
0
      {
220
        //Find version number
221
0
        string::size_type pos = line.find("Version");
222
0
        if(pos!=string::npos)
223
0
          pos+=8;
224
0
        else if(line.find("Extracted from RDKit")!=string::npos)
225
0
        {
226
0
          pos=20;
227
0
          while((pos=line.find('r',pos))!=string::npos)
228
0
            if(isdigit(line[++pos]))
229
0
              break;
230
0
        }
231
0
        if(pos!=string::npos)
232
0
        {
233
0
          ver=line.substr(pos) + ' ';//space fixes bug in while() when number at end of line
234
0
          pos=1;
235
0
          while(isdigit(ver[++pos]));
236
0
          ver.erase(pos);
237
0
        }
238
0
      }
239
0
    }while(getline(ifs,line));
240
241
0
    if (ifs)
242
0
      ifs.close();
243
0
    return true;
244
0
  }
245
246
///////////////////////////////////////////////////////////////////////////////
247
  string DescribeBits(const vector<unsigned int> fp, bool bSet=true) override
248
0
  {
249
    //checkmol-type output with tab separated functional group names
250
0
    stringstream ss;
251
0
    vector<pattern>::iterator ppat;
252
0
    for(ppat=_pats.begin();ppat!=_pats.end();++ppat)
253
0
    {
254
0
      int n = ppat->bitindex;
255
0
      int num =  ppat->numbits, div = ppat->numoccurrences+1, ngrp;
256
0
      while(num) //for each group of bits
257
0
      {
258
0
        ngrp = (num + div - 1) / div; //rounds up
259
0
        div -= 1;
260
0
        num -= ngrp;
261
0
        if(GetBit(fp, n) == bSet)
262
0
        {
263
0
          ss << ppat->description;
264
0
          if(div>0)
265
0
            ss << '*' << div+1;
266
0
          ss << '\t' ;
267
0
          break; //ignore the bits signifying a smaller number of occurrences
268
0
        }
269
0
        n += ngrp;
270
0
      }
271
0
    }
272
0
    ss << endl;
273
0
    return ss.str();
274
0
  }
275
276
///////////////////////////////////////////////////////////////////////////////////
277
  bool ParseRDKitFormat(istringstream& ss, pattern& p)
278
0
  {
279
    //rdkit format, e.g.
280
    //  14:('[S,s]-[S,s]',0), # S-S
281
0
    const int dum = 20; //an arbitrary number in case delimiters in ignore statements not found
282
0
    string number, comment;
283
0
    getline(ss, number, ':');
284
0
    ss.ignore(dum, '\'');
285
0
    getline(ss, p.smartsstring, '\'');
286
0
    if(p.smartsstring[0]=='?') //ignore patterns with SMARTS '?'
287
0
      p.smartsstring="[999]";//this seems to match nothing;  was return false;
288
0
    ss.ignore(dum,',');
289
0
    ss >> p.numoccurrences;
290
0
    ss.ignore(dum,'#');
291
0
    getline(ss, comment);
292
293
    //description is number + edited commment
294
0
    Trim(comment);
295
0
    string::size_type pos;
296
0
    pos = comment.find("FIX");
297
0
    if(pos==string::npos)
298
0
      pos = comment.find("*NOTE*");
299
0
    if(pos!=string::npos)
300
0
      comment.erase(pos);
301
0
    p.description = number + ": " + comment;
302
0
    return true;
303
0
  }
304
305
306
}; //class PatternFP
307
308
//***********************************************
309
//Make a global instance
310
// Global "FP3" instance. No filename is given, so the constructor selects
// the default datafile, patterns.txt.
PatternFP FP3PatternFP("FP3");
311
// Global "FP4" instance, which takes its patterns from SMARTS_InteLigand.txt.
PatternFP FP4PatternFP("FP4", "SMARTS_InteLigand.txt");
312
//***********************************************
313
314
/*! \class PatternFP
315
A bit is set when there is a match to one of a list
316
of SMARTS patterns in the datafile, which is specified in the constructor.
317
If no filename is given, the default filename is patterns.txt.
318
Fingerprints can be made by declaring a global variable, as in:
319
320
PatternFP FP4PatternFP("FP4", "SMARTS_InteLigand.txt");
321
322
Alternatively, an entry in plugindefines.txt like:
323
324
PatternFP
325
MACCS          #ID of this fingerprint type
326
MACCS.txt      #File containing the SMARTS patterns
327
328
defines a fingerprint without the need to recompile.
329
330
Three file formats are supported:
331
 - the preferred format (e.g. SMARTS_InteLigand.txt in FP4)
332
 - the original format (patterns.txt has an incomplete set of SMARTS patterns)
333
 - a format made by extracting from an RDKit file (MACCS.txt)
334
The last two require the first line to be:
335
#Comments after SMARTS
336
337
Lines starting with # are ignored.
338
For the preferred format each line is of the form:
339
description: SMARTS [occurrences [numbits]]
340
A bit is set in the fingerprint for each SMARTS pattern matched.
341
The optional integer parameters refine this behaviour; the most obvious uses are:
342
 - if <occurrences> is present and greater than its default value of 0, the bit
343
   is set only if the number of matches to the pattern is greater than <occurrences>.
344
 - if <occurrences> is 0 and <numbits> is greater than its default value of 1, then
345
   the fingerprint has <numbits> bits set if there is a match. This gives greater weight
346
   to the pattern for use in similarity measures like Tanimoto.
347
 - if the parameters are n-1 and n and the number of matches is m,
348
   a bit is set for each of the conditions m>=n, m>=n-1, ... , m>=1 that holds.
349
   This can be used to distinguish structures with many similar atoms like n-alkanes.
350
The use of other values for the parameters, which can be any positive integer, can give
351
other analogous behaviours. If numbits is 0 the pattern is ignored.
352
*/
353
354
}//namespace
355
356
//! \file finger3.cpp
357
//! \brief fingerprints based on list of SMARTS patterns