Coverage Report

Created: 2025-08-24 07:05

/src/openbabel/src/fingerprint.cpp
Line
Count
Source (jump to first uncovered line)
1
/**********************************************************************
2
fingerprint.cpp - Implementation of fingerpring base class and fastsearching
3
4
Copyright (C) 2005 by Chris Morley
5
6
This file is part of the Open Babel project.
7
For more information, see <http://openbabel.org/>
8
9
This program is free software; you can redistribute it and/or modify
10
it under the terms of the GNU General Public License as published by
11
the Free Software Foundation version 2 of the License.
12
13
This program is distributed in the hope that it will be useful,
14
but WITHOUT ANY WARRANTY; without even the implied warranty of
15
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16
GNU General Public License for more details.
17
***********************************************************************/
18
19
#include <openbabel/babelconfig.h>
20
21
#include <vector>
22
#include <algorithm>
23
#include <iosfwd>
24
#include <cstring>
25
#include <fstream>
26
27
#include <openbabel/fingerprint.h>
28
#include <openbabel/oberror.h>
29
30
using namespace std;
31
namespace OpenBabel
32
{
33
#if defined(__CYGWIN__) || defined(__MINGW32__)
34
  // macro to implement static OBPlugin::PluginMapType& Map()
35
  PLUGIN_CPP_FILE(OBFingerprint)
36
#endif
37
38
  const unsigned int OBFingerprint::bitsperint = 8 * sizeof(unsigned int);
39
40
  void OBFingerprint::SetBit(vector<unsigned int>& vec, const unsigned int n)
41
0
  {
42
0
    vec[n/Getbitsperint()] |= (1 << (n % Getbitsperint()));
43
0
  }
44
45
  bool OBFingerprint::GetBit(const vector<unsigned int>& vec, const unsigned int n)
46
0
  {
47
0
    unsigned int word =vec[n/Getbitsperint()];
48
0
    return (word &= (1 << (n % Getbitsperint())))!=0;
49
0
  }
50
51
  ////////////////////////////////////////
52
  void OBFingerprint::Fold(vector<unsigned int>& vec, unsigned int nbits)
53
0
  {
54
0
    if(nbits<Getbitsperint())
55
0
    {
56
0
      stringstream ss;
57
0
      ss << "Can't fold to less than " << Getbitsperint() << "bits";
58
0
      obErrorLog.ThrowError(__FUNCTION__, ss.str(), obError);
59
0
      return;
60
0
    }
61
    // "folding" to a larger # of bits
62
0
    if (nbits > vec.size()*Getbitsperint()) {
63
0
      vec.resize(nbits/Getbitsperint(), 0);
64
0
    }
65
0
    else {
66
      // normal folding to smaller vector sizes
67
0
      while(vec.size()*Getbitsperint()/2 >= nbits)
68
0
        vec.erase(transform(vec.begin(),vec.begin()+vec.size()/2,
69
0
                            vec.begin()+vec.size()/2, vec.begin(), bit_or()), vec.end());
70
0
    }
71
0
  }
72
73
  ////////////////////////////////////////
74
/*  bool OBFingerprint::GetNextFPrt(std::string& id, OBFingerprint*& pFPrt)
75
  {
76
    Fptpos iter;
77
    if(id.empty())
78
      iter=FPtsMap().begin();
79
    else
80
      {
81
        iter=FPtsMap().find(id);
82
        if(iter!=FPtsMap().end())
83
          ++iter;
84
      }
85
    if(iter==FPtsMap().end())
86
      return false;
87
    id    = iter->first;
88
    pFPrt = iter->second;
89
    return true;
90
  }
91
92
  OBFingerprint* OBFingerprint::FindFingerprint(const string& ID)
93
  {
94
    if(ID.empty())
95
      return _pDefault;
96
    Fptpos iter = FPtsMap().find(ID);
97
    if(iter==FPtsMap().end())
98
      return NULL;
99
    else
100
      return iter->second;
101
  }
102
*/
103
  double OBFingerprint::Tanimoto(const vector<unsigned int>& vec1, const vector<unsigned int>& vec2)
104
0
  {
105
    //Independent of sizeof(unsigned int)
106
0
    if(vec1.size()!=vec2.size())
107
0
      return -1; //different number of bits
108
0
    int andbits=0, orbits=0;
109
0
    for (unsigned i=0;i<vec1.size();++i)
110
0
      {
111
0
        int andfp = vec1[i] & vec2[i];
112
0
        int orfp = vec1[i] | vec2[i];
113
        //Count bits
114
      /* GCC 3.4 supports a "population count" builtin, which on many targets is
115
         implemented with a single instruction.  There is a fallback definition
116
         in libgcc in case a target does not have one, which should be just as
117
         good as the static function below.  */
118
0
#if __GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 4)
119
0
        andbits += __builtin_popcount(andfp);
120
0
        orbits += __builtin_popcount(orfp);
121
#else
122
        for(;andfp;andfp=andfp<<1)
123
          if(andfp<0) ++andbits;
124
        for(;orfp;orfp=orfp<<1)
125
          if(orfp<0) ++orbits;
126
#endif
127
0
      }
128
0
    if(orbits==0)
129
0
      return 0.0;
130
0
    return((double)andbits/(double)orbits);
131
0
  }
132
133
  //*****************************************************************
134
  bool FastSearch::Find(OBBase* pOb, vector<unsigned long>& SeekPositions,
135
                        unsigned int MaxCandidates)
136
0
  {
137
    ///Finds chemical objects in datafilename (which must previously have been indexed)
138
    ///that have all the same bits set in their fingerprints as in the fingerprint of
139
    ///a pattern object. (Usually a substructure search in molecules.)
140
    ///The type of fingerprint and its degree of folding does not have to be specified
141
    ///here because the values in the index file are used.
142
    ///The positions of the candidate matching molecules in the original datafile are returned.
143
144
0
    vector<unsigned int> vecwords;
145
0
    _pFP->GetFingerprint(pOb,vecwords, _index.header.words * OBFingerprint::Getbitsperint());
146
147
0
    vector<unsigned int>candidates; //indices of matches from fingerprint screen
148
0
    candidates.reserve(MaxCandidates);
149
150
0
    unsigned int dataSize = _index.header.nEntries;
151
    //  GetFingerprint(mol, vecwords, _index.header.words, _index.header.fptype);
152
153
0
    unsigned int words = _index.header.words;
154
0
    unsigned int* nextp = &_index.fptdata[0];
155
0
    unsigned int* ppat0 = &vecwords[0];
156
0
    unsigned int* p;
157
0
    unsigned int* ppat;
158
0
    unsigned int i;
159
0
    for(i=0;i<dataSize; ++i) //speed critical section
160
0
      {
161
0
        p=nextp;
162
0
        nextp += words;
163
0
        ppat=ppat0;
164
0
        bool ppat_has_additional_bits = false;
165
0
        while(p<nextp)
166
0
          {
167
0
            if ((*ppat & *p) ^ *ppat) { // any bits in ppat that are not in p?
168
0
              ppat_has_additional_bits = true;
169
0
              break;
170
0
            }
171
0
            p++;
172
0
            ppat++;
173
0
          }
174
0
        if(!ppat_has_additional_bits)
175
0
          {
176
0
            candidates.push_back(i);
177
0
            if(candidates.size()>=MaxCandidates)
178
0
              break;
179
0
          }
180
0
      }
181
182
0
    if(i<_index.header.nEntries) //premature end to search
183
0
      {
184
0
        stringstream errorMsg;
185
0
        errorMsg << "Stopped looking after " << i << " molecules." << endl;
186
0
        obErrorLog.ThrowError(__FUNCTION__, errorMsg.str(), obWarning);
187
0
      }
188
189
0
    vector<unsigned int>::iterator itr;
190
0
    for(itr=candidates.begin();itr!=candidates.end();++itr)
191
0
      {
192
0
        SeekPositions.push_back(_index.seekdata[*itr]);
193
0
      }
194
0
    return true;
195
0
  }
196
197
////////////////////////////////////////////////////////////
198
 bool FastSearch::FindMatch(OBBase* pOb, vector<unsigned long>& SeekPositions,
199
                            unsigned int MaxCandidates)
200
0
{
201
//Similar to FastSearch::Find() except that successful candidates have all bits the same as the target
202
0
  vector<unsigned int> vecwords;
203
0
  _pFP->GetFingerprint(pOb,vecwords, _index.header.words * OBFingerprint::Getbitsperint());
204
205
0
  vector<unsigned int>candidates; //indices of matches from fingerprint screen
206
207
0
  unsigned int dataSize = _index.header.nEntries;
208
0
  unsigned int words = _index.header.words;
209
0
  unsigned int* nextp = &_index.fptdata[0]; // start of next FP in index
210
0
  unsigned int* ppat0 = &vecwords[0];       // start of target FP
211
0
  unsigned int* p;                          // current position in index
212
0
  unsigned int* ppat;                       // current position in target FP
213
0
  unsigned int i; // need address of this, can't be register
214
0
  for(i=0;i<dataSize; ++i) //speed critical section
215
0
  {
216
0
    p=nextp;
217
0
    nextp += words;
218
0
    ppat=ppat0;
219
220
0
    while((*p++ == *ppat++ ) && (p<nextp));
221
222
0
    if(p==nextp)
223
0
    {
224
0
      candidates.push_back(i);
225
0
      if(candidates.size()>=MaxCandidates)
226
0
        break;
227
0
    }
228
0
  }
229
230
0
  vector<unsigned int>::iterator itr;
231
0
  for(itr=candidates.begin();itr!=candidates.end();++itr)
232
0
    {
233
0
      SeekPositions.push_back(_index.seekdata[*itr]);
234
0
    }
235
0
  return true;
236
0
}
237
238
  /////////////////////////////////////////////////////////
239
  bool FastSearch::FindSimilar(OBBase* pOb, multimap<double, unsigned long>& SeekposMap,
240
                               double MinTani, double MaxTani)
241
0
  {
242
0
    vector<unsigned int> targetfp;
243
0
    _pFP->GetFingerprint(pOb,targetfp, _index.header.words * OBFingerprint::Getbitsperint());
244
245
0
    unsigned int words = _index.header.words;
246
0
    unsigned int dataSize = _index.header.nEntries;
247
0
    unsigned int* nextp = &_index.fptdata[0];
248
0
    unsigned int* p;
249
0
    unsigned int i;
250
0
    for(i=0;i<dataSize; ++i) //speed critical section
251
0
      {
252
0
        p=nextp;
253
0
        nextp += words;
254
0
        double tani = OBFingerprint::Tanimoto(targetfp,p);
255
0
        if(tani>MinTani && tani < MaxTani)
256
0
          SeekposMap.insert(pair<const double, unsigned long>(tani,_index.seekdata[i]));
257
0
      }
258
0
    return true;
259
0
  }
260
261
  /////////////////////////////////////////////////////////
262
  bool FastSearch::FindSimilar(OBBase* pOb, multimap<double, unsigned long>& SeekposMap,
263
                               int nCandidates)
264
0
  {
265
    ///If nCandidates is zero or omitted the original size of the multimap is used
266
0
    if(nCandidates)
267
0
      {
268
        //initialise the multimap with nCandidate zero entries
269
0
        SeekposMap.clear();
270
0
        int i;
271
0
        for(i=0;i<nCandidates;++i)
272
0
          SeekposMap.insert(pair<const double, unsigned long>(0,0));
273
0
      }
274
0
    else if(SeekposMap.size()==0)
275
0
      return false;
276
277
0
    vector<unsigned int> targetfp;
278
0
    _pFP->GetFingerprint(pOb,targetfp, _index.header.words * OBFingerprint::Getbitsperint());
279
280
0
    unsigned int words = _index.header.words;
281
0
    unsigned int dataSize = _index.header.nEntries;
282
0
    unsigned int* nextp = &_index.fptdata[0];
283
0
    unsigned int* p;
284
0
    unsigned int i;
285
0
    for(i=0;i<dataSize; ++i) //speed critical section
286
0
      {
287
0
        p=nextp;
288
0
        nextp += words;
289
0
        double tani = OBFingerprint::Tanimoto(targetfp,p);
290
0
        if(tani>SeekposMap.begin()->first)
291
0
          {
292
0
            SeekposMap.insert(pair<const double, unsigned long>(tani,_index.seekdata[i]));
293
0
            SeekposMap.erase(SeekposMap.begin());
294
0
          }
295
0
      }
296
0
    return true;
297
0
  }
298
299
  /////////////////////////////////////////////////////////
300
  string FastSearch::ReadIndex(istream* pIndexstream)
301
0
  {
302
    //Reads fs index from istream into member variables
303
0
    _index.Read(pIndexstream);
304
305
0
    _pFP = _index.CheckFP();
306
0
    if(!_pFP)
307
0
      *(_index.header.datafilename) = '\0';
308
309
0
    return _index.header.datafilename; //will be empty on error
310
0
  }
311
312
  //////////////////////////////////////////////////////////
313
  string FastSearch::ReadIndexFile(string IndexFilename)
314
0
  {
315
0
    ifstream ifs(IndexFilename.c_str(),ios::binary);
316
0
    if(ifs)
317
0
      return ReadIndex(&ifs);
318
0
    else
319
0
    {
320
0
      string dum;
321
0
      return dum;
322
0
    }
323
0
  }
324
325
  //////////////////////////////////////////////////////////
326
  bool FptIndex::Read(istream* pIndexstream)
327
0
  {
328
//    pIndexstream->read((char*)&(header), sizeof(FptIndexHeader));
329
//    pIndexstream->seekg(header.headerlength);//allows header length to be changed
330
331
0
    if(!ReadHeader(pIndexstream))
332
0
      {
333
0
        *(header.datafilename) = '\0';
334
0
        return false;
335
0
      }
336
337
0
    unsigned long nwords = header.nEntries * header.words;
338
0
    fptdata.resize(nwords);
339
0
    seekdata.resize(header.nEntries);
340
341
0
    pIndexstream->read((char*)&(fptdata[0]), sizeof(unsigned int) * nwords);
342
0
    if(header.seek64) 
343
0
      {
344
0
      pIndexstream->read((char*)&(seekdata[0]), sizeof(unsigned long) * header.nEntries);
345
0
      }
346
0
    else 
347
0
      { //legacy format
348
0
   vector<unsigned int> tmp(header.nEntries);
349
0
         pIndexstream->read((char*)&(tmp[0]), sizeof(unsigned int) * header.nEntries);
350
0
   std::copy(tmp.begin(),tmp.end(),seekdata.begin());
351
0
      }
352
353
0
    if(pIndexstream->fail())
354
0
      {
355
0
        *(header.datafilename) = '\0';
356
0
        return false;
357
0
      }
358
0
    return true;
359
0
  }
360
361
  //////////////////////////////////////////////////////////
362
  bool FptIndex::ReadHeader(istream* pIndexstream)
363
0
  {
364
0
    pIndexstream->read( (char*)&header.headerlength, sizeof(unsigned) );
365
0
    pIndexstream->read( (char*)&header.nEntries,     sizeof(unsigned) );
366
0
    pIndexstream->read( (char*)&header.words,        sizeof(unsigned) );
367
0
    pIndexstream->read( (char*)&header.fpid,         sizeof(header.fpid) );
368
0
    pIndexstream->read( (char*)&header.seek64,       sizeof(header.seek64) );
369
0
    pIndexstream->read( (char*)&header.datafilename, sizeof(header.datafilename) );
370
0
    return !pIndexstream->fail();
371
0
 }
372
373
  //////////////////////////////////////////////////////////
374
  OBFingerprint* FptIndex::CheckFP()
375
0
  {
376
    //check that fingerprint type is available
377
0
    OBFingerprint* pFP = OBFingerprint::FindFingerprint(header.fpid);
378
0
    if(!pFP)
379
0
      {
380
0
        stringstream errorMsg;
381
0
        errorMsg << "Index has Fingerprints of type '" << header.fpid
382
0
                 << " which is not currently loaded." << endl;
383
0
        obErrorLog.ThrowError(__FUNCTION__, errorMsg.str(), obError);
384
0
      }
385
0
    return pFP; //NULL if not available
386
0
  }
387
388
  //*******************************************************
389
  FastSearchIndexer::FastSearchIndexer(string& datafilename, ostream* os,
390
                                       std::string& fpid, int FptBits, int nmols)
391
0
  {
392
    ///Starts indexing process
393
0
    _indexstream = os;
394
0
    _nbits=FptBits;
395
0
    _pindex= new FptIndex;
396
0
    _pindex->header.headerlength = 3*sizeof(unsigned)+sizeof(_pindex->header.fpid)
397
0
                                    +sizeof(_pindex->header.datafilename);
398
0
    strncpy(_pindex->header.fpid,fpid.c_str(),15);
399
0
    _pindex->header.fpid[14]='\0'; //ensure fpid is terminated at 14 characters.
400
0
    _pindex->header.seek64 = 1;
401
0
    strncpy(_pindex->header.datafilename, datafilename.c_str(), 255);
402
403
    //just a hint to reserve size of vectors; definitive value set in destructor
404
0
    _pindex->header.nEntries = nmols;
405
406
    //check that fingerprint type is available
407
0
    _pFP = _pindex->CheckFP();
408
0
    if(fpid.empty()) // add id of default FP
409
0
      strcpy(_pindex->header.fpid, _pFP->GetID());
410
411
    //Save a small amount of time by not generating info (FP2 currently)
412
0
    _pFP->SetFlags(_pFP->Flags() | OBFingerprint::FPT_NOINFO);
413
0
  }
414
415
  /////////////////////////////////////////////////////////////
416
  FastSearchIndexer::FastSearchIndexer(FptIndex* pindex, std::ostream* os, int nmols)
417
0
  {
418
    //nmols is new total number of molecules
419
0
    _indexstream = os;
420
0
    _pindex = pindex;
421
0
    _nbits  = _pindex->header.words * OBFingerprint::Getbitsperint();
422
423
    //just a hint to reserve size of vectors; definitive value set in destructor
424
0
    _pindex->header.nEntries = nmols;
425
426
    //check that fingerprint type is available
427
0
    _pFP = _pindex->CheckFP();
428
0
  }
429
430
  /////////////////////////////////////////////////////////////
431
  FastSearchIndexer::~FastSearchIndexer()
432
0
  {
433
    ///Saves index file
434
0
    FptIndexHeader& hdr = _pindex->header;
435
0
    hdr.nEntries = _pindex->seekdata.size();
436
    //Write header
437
    //_indexstream->write((const char*)&hdr, sizeof(FptIndexHeader));
438
0
    _indexstream->write( (const char*)&hdr.headerlength, sizeof(unsigned) );
439
0
    _indexstream->write( (const char*)&hdr.nEntries,     sizeof(unsigned) );
440
0
    _indexstream->write( (const char*)&hdr.words,        sizeof(unsigned) );
441
0
    _indexstream->write( (const char*)&hdr.fpid,         sizeof(hdr.fpid) );
442
0
    _indexstream->write( (const char*)&hdr.seek64,         sizeof(hdr.seek64) );
443
0
    _indexstream->write( (const char*)&hdr.datafilename, sizeof(hdr.datafilename) );
444
445
0
    _indexstream->write((const char*)&_pindex->fptdata[0], _pindex->fptdata.size()*sizeof(unsigned int));
446
0
    _indexstream->write((const char*)&_pindex->seekdata[0], _pindex->seekdata.size()*sizeof(unsigned long));
447
0
    if(!_indexstream)
448
0
      obErrorLog.ThrowError(__FUNCTION__,
449
0
                            "Difficulty writing index", obWarning);
450
0
    delete _pindex;
451
452
0
    _pFP->SetFlags(_pFP->Flags() & ~OBFingerprint::FPT_NOINFO); //Clear
453
0
  }
454
455
  ///////////////////////////////////////////////////////////////
456
  bool FastSearchIndexer::Add(OBBase* pOb, std::streampos seekpos)
457
0
  {
458
    ///Adds a fingerprint
459
460
0
    vector<unsigned int> vecwords;
461
0
    if(!_pFP)
462
0
      return false;
463
0
    if(_pFP->GetFingerprint(pOb, vecwords, _nbits))
464
0
      {
465
0
        _pindex->header.words = vecwords.size(); //Use size as returned from fingerprint
466
0
        if(_pindex->fptdata.empty() && _pindex->header.nEntries!=0)
467
0
        {
468
          //Reserve size of vectors at start to avoid multiple realloction and copying later.
469
          //Done here rather than in constructor because needs the size of the fingerprint.
470
0
          _pindex->fptdata.reserve(_pindex->header.nEntries * _pindex->header.words);
471
0
          _pindex->seekdata.reserve(_pindex->header.nEntries);
472
0
        }
473
0
        for(unsigned int i=0;i<_pindex->header.words;++i)
474
0
          _pindex->fptdata.push_back(vecwords[i]);
475
0
        _pindex->seekdata.push_back(seekpos);
476
0
        return true;
477
0
      }
478
0
    obErrorLog.ThrowError(__FUNCTION__, "Failed to make a fingerprint", obWarning);
479
0
    return false;
480
0
  }
481
482
  /*!
483
    \class OBFingerprint fingerprint.h <openbabel/fingerprint.h>
484
    These fingerprints are condensed representation of molecules (or other objects)
485
    as a list of boolean values (actually bits in a vector<unsigned>) with length
486
    of a power of 2. The main motivation is for fast searching of data sources
487
    containing large numbers of molecules (up to several million). Open Babel
488
    provides some routines which can search text files containing lists of molecules
489
    in any format. See the documentation on the class FastSearch.
490
491
    There are descriptions of molecular fingerprints at <br>
492
    http://www.daylight.com/dayhtml/doc/theory/theory.finger.html) and <br>
493
    http://www.mesaac.com/Fingerprint.htm <br>
494
    Many methods of preparing fingerprints have been described, but the type supported
495
    currently in OpenBabel has each bit representing a substructure (or other
496
    molecular property). If a substructure is present in the molecule, then a
497
    particular bit is set to 1. But because the hashing method may also map other
498
    substructures to the same bit, a match does not guarantee that a particular
499
    substructure is present; there may be false positives.  However, with proper design,
500
    a large fraction of irrelevant molecules in a data set can be eliminated in a
501
    fast search with boolean methods on the fingerprints.
502
    It then becomes feasible to make a definitive substructure search by
503
    conventional methods on this reduced list even if it is slow.
504
505
    OpenBabel provides a framework for applying new types of fingerprints without
506
    changing any existing code. They are derived from OBFingerprint and the
507
    source file is just compiled with the rest of OpenBabel. Alternatively,
508
    they can be separately compiled as a DLL or shared library and discovered
509
    when OpenBabel runs.
510
511
    For more on these specific implementations of fingerprints in Open
512
    Babel, please take a look at the developer's wiki:
513
    http://openbabel.org/wiki/Fingerprints
514
515
    Fingerprints derived from this abstract base class OBFingerprint can be for any
516
    object derived from OBBase (not just for OBMol).
517
    Each derived class provides an ID as a string and OBFingerprint keeps a map of
518
    these to provides a pointer to the class when requested in FindFingerprint.
519
520
    <h4>-- To define a fingerprint type --</h4>
521
    The classes derived form OBFingerprint are required to provide
522
    a GetFingerprint() routine and a Description() routine
523
    \code
524
    class MyFpType : OBFingerprint
525
    {
526
       MyFpType(const char* id) : OBFingerprint(id){};
527
528
       virtual bool GetFingerprint(OBBase* pOb, vector<unsigned int>& fp, int nbits)
529
       {
530
          //Convert pOb to the required type, usually OBMol
531
          OBMol* pmol = dynamic_cast<OBMol*>(pOb);
532
          fp.resize(required_number_of_words);
533
          ...
534
          use SetBit(fp,n); to set the nth bit
535
536
          if(nbits)
537
             Fold(fp, nbits);
538
       }
539
540
       virtual const char* Description(){ return "Some descriptive text";}
541
       ...
542
    };
543
    \endcode
544
545
    Declare a global instance with the ID you will use in -f options to specify
546
    its use.
547
    \code
548
    MyFpType theMyFpType("myfpID");
549
    \endcode
550
551
    <h4>-- To obtain a fingerprint --</h4>
552
    \code
553
    OBMol mol;
554
    ...
555
    vector<unsigned int> fp;
556
    OBFingerprint::GetDefault()->GetFingerprint(&mol, fp); //gets default size of fingerprint
557
    \endcode
558
    or
559
    \code
560
    vector<unsigned int> fp;
561
    OBFingerPrint* pFP = OBFingerprint::FindFingerprint("myfpID");
562
    ...and maybe...
563
    pFP->GetFingerprint(&mol,fp, 128); //fold down to 128bits if was originally larger
564
    \endcode
565
566
    <h4>-- To print a list of available fingerprint types --</h4>
567
    \code
568
    std::string id;
569
    OBFingerPrint* pFPrt=NULL;
570
    while(OBFingerprint::GetNextFPrt(id, pFPrt))
571
    {
572
       cout << id << " -- " << pFPrt->Description() << endl;
573
    }
574
    \endcode
575
576
    Fingerprints are handled as vector<unsigned int> so that the number of bits
577
    in this vector and their order will be platform and compiler
578
    dependent, because of size of int types and endian differences.
579
    Use fingerprints (and fastsearch indexes containing them) only
580
    for comparing with other fingerprints prepared on the same machine.
581
582
    The FingerprintFormat class is an output format which displays fingerprints
583
    as hexadecimal. When multiple molecules are supplied it will calculate the
584
    Tanimoto coefficient from the first molecule to each of the others. It also
585
    shows whether the first molecule is a possible substructure to all the others,
586
    i.e. whether all the bits set in the fingerprint for the first molecule are
587
    set in the fingerprint of the others. To display hexadecimal information when
588
    multiple molecules are provided it is necessay to use the -xh option.
589
590
    To see a list of available format types, type obabel -F on the command line.
591
    The -xF option of the FingerprintFormat class also provides this output, but due
592
    to a quirk in the way the program works, it is necessary to have a valid input
593
    molecule for this option to work.
594
  */
595
596
  /*! \class FastSearch fingerprint.h <openbabel/fingerprint.h>
597
    The FastSearch class searches an index to a datafile containing a list of molecules
598
    (or other objects) prepared by FastSearchIndexer.
599
600
    OpenBabel can also search files for molecules containing a substructure specified
601
    by a SMARTS string, using OBSmartsPattern or from the command line:
602
    \code
603
    obabel datafile.xxx -O outfile.yyy -sSMARTS
604
    \endcode
605
    But when the data file contains more than about 10,000 molecules this becomes
606
    rather too slow to be used interactively. To do it more quickly, an index
607
    of the molecules containing their fingerprints (see OBFingerprint) is prepared using
608
    FastSearchIndexer. The indexing process may take a long time but only has to
609
    be done once. The index can be searched very quickly with FastSearch. Because
610
    of the nature of fingerprints a match to a bit does not guarantee
611
    the presence of a particular substructure or other molecular property, so that
612
    a definitive answer may require a subsequent search of the (much reduced) list
613
    of candidates by another method (like OBSmartsPattern).
614
615
    Note that the index files are not portable. They need to be prepared on the
616
    computer that will access them.
617
618
    <h4>Using FastSearch and FastSearchIndexer in a program</h4>
619
620
    The index has two tables:
621
    - an array of fingerprints of the molecules
622
    - an array of the seek positions in the datasource file of all the molecules
623
624
    <h4>To prepare an fastsearch index file:</h4>
625
    - Open an ostream to the index file.
626
    - Make a FastSearchIndexer object on the heap or the stack, passing in as parameters:
627
    - the datafile name, the indexfile stream,
628
    - the id of the fingerprint type to be used,
629
    -  the number of bits the fingerprint is to be folded down to, If it is to be left
630
    unfolded, set fpid to 0 or do not specify it.
631
    .
632
    - For each molecule, call Add() with its pointer and its position in the datafile.<br>
633
    Currently the std::streampos value is implicitly cast to unsigned int so that
634
    for 32bit machines the datafile has to be no longer than about 2Gbyte.
635
    - The index file is written when the FastSearchIndexer object is deleted or goes out
636
    of scope.
637
638
    <h4>To search in a fastsearch index file</h4>
639
640
    - Open a std::istream to the indexfile (in binary mode on some systems)
641
    - Make a FastSearch object, read the index and open the datafile whose
642
    name it provides
643
    \code
644
    ifstream ifs(indexname,ios::binary);
645
    FastSearch fs;
646
    string datafilename = fs.ReadIndex(&ifs);
647
    if(datafilename.empty()
648
       return false;
649
650
    ifstream datastream(datafilename);
651
    if(!datastream)
652
       return false;
653
    \endcode
654
655
    <strong>To do a search for molecules which have all the substructure bits the
656
    OBMol object, patternMol</strong>
657
    \code
658
    vector<unsigned int>& SeekPositions;
659
    if(!fs.Find(patternMol, SeekPositions, MaxCandidates))
660
      for(itr=SeekPositions.begin();itr!=SeekPositions.end();++itr)
661
      {
662
         datastream.seekg(*itr);
663
         ... read the candidate molecule
664
         and subject to more rigorous test if necessary
665
      }
666
    \endcode
667
668
    <strong>To do a similarity search based on the Tanimoto coefficient</strong>
669
    This is defined as: <br>
670
    <em>Number of bits set in (patternFP & targetFP)/Number of bits in (patternFP | targetFP)</em><br>
671
    where the boolean operations between the fingerprints are bitwise<br>
672
    The Tanimoto coefficient has no absolute meaning and depends on
673
    the design of the fingerprint.
674
    \code
675
    multimap<double, unsigned int> SeekposMap;
676
    // to find n best molecules
677
    fs.FindSimilar(&patternMol, SeekposMap, n);
678
    ...or
679
    // to find molecules with Tanimoto coefficient > MinTani
680
    fs.FindSimilar(&patternMol, SeekposMap, MinTani);
681
682
    multimap<double, unsigned int>::reverse_iterator itr;
683
    for(itr=SeekposMap.rbegin();itr!=SeekposMap.rend();++itr)
684
    {
685
       datastream.seekg(itr->second);
686
       // ... read the candidate molecule
687
       double tani = itr->first;
688
    }
689
    \endcode
690
691
    The FastSearchFormat class facilitates the use of these routine from the
692
    command line or other front end program. For instance:
693
694
    <strong>Prepare an index:</strong>
695
    \code
696
    obabel datafile.xxx -O index.fs
697
    \endcode
698
    With options you can specify:
699
    - which type of fingerprint to be used, e.g. -xfFP2,
700
    - whether it is folded to a specified number of bits, e.g. -xn128
701
    (which should be a power of 2)
702
    - whether to pre-select the molecules which are indexed:
703
    - by structure e.g only ethers and esters, -sCOC
704
    - by excluding molecules with bezene rings, -vc1ccccc1
705
    - by position in the datafile e.g. only mols 10 to 90, -f10 -l90
706
    .
707
    <strong>Substructure search</strong> in a previously prepared index file
708
    \code
709
    obabel index.fs -O outfile.yyy -sSMILES
710
    \endcode
711
    The datafile can also be used as the input file, provided the input format is specified as fs
712
    \code
713
    obabel datafile.xxx -O outfile.yyy -sSMILES -ifs
714
    \endcode
715
    A "slow" search not using fingerprints would be done on the same data by omitting -ifs.
716
    A "slow" search can use SMARTS strings, but the fast search is restricted
717
    to the subset which is valid SMILES.
718
719
    With the -S option, the target can be specified as a molecule in a file of any format
720
    \code
721
    obabel datafile.xxx -O outfile.yyy -Spattern_mol.zzz -ifs
722
    \endcode
723
    These searches have two stages: a fingerprint search which produces
724
    a number of candidate molecules and a definitive search which selects
725
    from these using SMARTS pattern matching. The maximum number of candidates
726
    is 4000 by default but you can change this with an option
727
    e.g. -al 8000  (Note that you need the space between l and the number.)
728
    If the fingerprint search reaches the maximum number it will not
729
    look further and will tell you at what stage it stopped.
730
731
    <strong>Similarity search</strong> in a previously prepared index file<br>
732
    This rather done (rather than a substructure search) if the -at option is used,
733
    \code
734
    obabel datafile.xxx -O outfile.yyy -sSMILES -ifs -at0.7
735
    \endcode
736
    for instance
737
    - -at0.7 will recover all molecules with Tanimoto greater than 0.7
738
    - -at15 (no decimal point) will recover the 15 molecules with largest coefficients.
739
    - -aa will add the Tanimoto coefficient to the titles of the output molecules.
740
741
    All stages, the indexing, the interpretation of the SMILES string in the -s option,
742
    the file in the -S option and the final SMARTS filter convert to OBMol and apply
743
    ConvertDativeBonds(). This ensures thatforms such as[N+]([O-])=O  and N(=O)=O
744
    are recognized as chemically identical.
745
  */
746
747
}//Openbabel
748
749
//! \file fingerprint.cpp
750
//! \brief Definitions for OBFingerprint base class and fastsearch classes