/src/openbabel/src/fingerprint.cpp
Line | Count | Source (jump to first uncovered line) |
1 | | /********************************************************************** |
2 | | fingerprint.cpp - Implementation of fingerpring base class and fastsearching |
3 | | |
4 | | Copyright (C) 2005 by Chris Morley |
5 | | |
6 | | This file is part of the Open Babel project. |
7 | | For more information, see <http://openbabel.org/> |
8 | | |
9 | | This program is free software; you can redistribute it and/or modify |
10 | | it under the terms of the GNU General Public License as published by |
11 | | the Free Software Foundation version 2 of the License. |
12 | | |
13 | | This program is distributed in the hope that it will be useful, |
14 | | but WITHOUT ANY WARRANTY; without even the implied warranty of |
15 | | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
16 | | GNU General Public License for more details. |
17 | | ***********************************************************************/ |
18 | | |
19 | | #include <openbabel/babelconfig.h> |
20 | | |
21 | | #include <vector> |
22 | | #include <algorithm> |
23 | | #include <iosfwd> |
24 | | #include <cstring> |
25 | | #include <fstream> |
26 | | |
27 | | #include <openbabel/fingerprint.h> |
28 | | #include <openbabel/oberror.h> |
29 | | |
30 | | using namespace std; |
31 | | namespace OpenBabel |
32 | | { |
33 | | #if defined(__CYGWIN__) || defined(__MINGW32__) |
34 | | // macro to implement static OBPlugin::PluginMapType& Map() |
35 | | PLUGIN_CPP_FILE(OBFingerprint) |
36 | | #endif |
37 | | |
38 | | const unsigned int OBFingerprint::bitsperint = 8 * sizeof(unsigned int); |
39 | | |
40 | | void OBFingerprint::SetBit(vector<unsigned int>& vec, const unsigned int n) |
41 | 0 | { |
42 | 0 | vec[n/Getbitsperint()] |= (1 << (n % Getbitsperint())); |
43 | 0 | } |
44 | | |
45 | | bool OBFingerprint::GetBit(const vector<unsigned int>& vec, const unsigned int n) |
46 | 0 | { |
47 | 0 | unsigned int word =vec[n/Getbitsperint()]; |
48 | 0 | return (word &= (1 << (n % Getbitsperint())))!=0; |
49 | 0 | } |
50 | | |
51 | | //////////////////////////////////////// |
52 | | void OBFingerprint::Fold(vector<unsigned int>& vec, unsigned int nbits) |
53 | 0 | { |
54 | 0 | if(nbits<Getbitsperint()) |
55 | 0 | { |
56 | 0 | stringstream ss; |
57 | 0 | ss << "Can't fold to less than " << Getbitsperint() << "bits"; |
58 | 0 | obErrorLog.ThrowError(__FUNCTION__, ss.str(), obError); |
59 | 0 | return; |
60 | 0 | } |
61 | | // "folding" to a larger # of bits |
62 | 0 | if (nbits > vec.size()*Getbitsperint()) { |
63 | 0 | vec.resize(nbits/Getbitsperint(), 0); |
64 | 0 | } |
65 | 0 | else { |
66 | | // normal folding to smaller vector sizes |
67 | 0 | while(vec.size()*Getbitsperint()/2 >= nbits) |
68 | 0 | vec.erase(transform(vec.begin(),vec.begin()+vec.size()/2, |
69 | 0 | vec.begin()+vec.size()/2, vec.begin(), bit_or()), vec.end()); |
70 | 0 | } |
71 | 0 | } |
72 | | |
73 | | //////////////////////////////////////// |
74 | | /* bool OBFingerprint::GetNextFPrt(std::string& id, OBFingerprint*& pFPrt) |
75 | | { |
76 | | Fptpos iter; |
77 | | if(id.empty()) |
78 | | iter=FPtsMap().begin(); |
79 | | else |
80 | | { |
81 | | iter=FPtsMap().find(id); |
82 | | if(iter!=FPtsMap().end()) |
83 | | ++iter; |
84 | | } |
85 | | if(iter==FPtsMap().end()) |
86 | | return false; |
87 | | id = iter->first; |
88 | | pFPrt = iter->second; |
89 | | return true; |
90 | | } |
91 | | |
92 | | OBFingerprint* OBFingerprint::FindFingerprint(const string& ID) |
93 | | { |
94 | | if(ID.empty()) |
95 | | return _pDefault; |
96 | | Fptpos iter = FPtsMap().find(ID); |
97 | | if(iter==FPtsMap().end()) |
98 | | return NULL; |
99 | | else |
100 | | return iter->second; |
101 | | } |
102 | | */ |
103 | | double OBFingerprint::Tanimoto(const vector<unsigned int>& vec1, const vector<unsigned int>& vec2) |
104 | 0 | { |
105 | | //Independent of sizeof(unsigned int) |
106 | 0 | if(vec1.size()!=vec2.size()) |
107 | 0 | return -1; //different number of bits |
108 | 0 | int andbits=0, orbits=0; |
109 | 0 | for (unsigned i=0;i<vec1.size();++i) |
110 | 0 | { |
111 | 0 | int andfp = vec1[i] & vec2[i]; |
112 | 0 | int orfp = vec1[i] | vec2[i]; |
113 | | //Count bits |
114 | | /* GCC 3.4 supports a "population count" builtin, which on many targets is |
115 | | implemented with a single instruction. There is a fallback definition |
116 | | in libgcc in case a target does not have one, which should be just as |
117 | | good as the static function below. */ |
118 | 0 | #if __GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 4) |
119 | 0 | andbits += __builtin_popcount(andfp); |
120 | 0 | orbits += __builtin_popcount(orfp); |
121 | | #else |
122 | | for(;andfp;andfp=andfp<<1) |
123 | | if(andfp<0) ++andbits; |
124 | | for(;orfp;orfp=orfp<<1) |
125 | | if(orfp<0) ++orbits; |
126 | | #endif |
127 | 0 | } |
128 | 0 | if(orbits==0) |
129 | 0 | return 0.0; |
130 | 0 | return((double)andbits/(double)orbits); |
131 | 0 | } |
132 | | |
133 | | //***************************************************************** |
134 | | bool FastSearch::Find(OBBase* pOb, vector<unsigned long>& SeekPositions, |
135 | | unsigned int MaxCandidates) |
136 | 0 | { |
137 | | ///Finds chemical objects in datafilename (which must previously have been indexed) |
138 | | ///that have all the same bits set in their fingerprints as in the fingerprint of |
139 | | ///a pattern object. (Usually a substructure search in molecules.) |
140 | | ///The type of fingerprint and its degree of folding does not have to be specified |
141 | | ///here because the values in the index file are used. |
142 | | ///The positions of the candidate matching molecules in the original datafile are returned. |
143 | |
|
144 | 0 | vector<unsigned int> vecwords; |
145 | 0 | _pFP->GetFingerprint(pOb,vecwords, _index.header.words * OBFingerprint::Getbitsperint()); |
146 | |
|
147 | 0 | vector<unsigned int>candidates; //indices of matches from fingerprint screen |
148 | 0 | candidates.reserve(MaxCandidates); |
149 | |
|
150 | 0 | unsigned int dataSize = _index.header.nEntries; |
151 | | // GetFingerprint(mol, vecwords, _index.header.words, _index.header.fptype); |
152 | |
|
153 | 0 | unsigned int words = _index.header.words; |
154 | 0 | unsigned int* nextp = &_index.fptdata[0]; |
155 | 0 | unsigned int* ppat0 = &vecwords[0]; |
156 | 0 | unsigned int* p; |
157 | 0 | unsigned int* ppat; |
158 | 0 | unsigned int i; |
159 | 0 | for(i=0;i<dataSize; ++i) //speed critical section |
160 | 0 | { |
161 | 0 | p=nextp; |
162 | 0 | nextp += words; |
163 | 0 | ppat=ppat0; |
164 | 0 | bool ppat_has_additional_bits = false; |
165 | 0 | while(p<nextp) |
166 | 0 | { |
167 | 0 | if ((*ppat & *p) ^ *ppat) { // any bits in ppat that are not in p? |
168 | 0 | ppat_has_additional_bits = true; |
169 | 0 | break; |
170 | 0 | } |
171 | 0 | p++; |
172 | 0 | ppat++; |
173 | 0 | } |
174 | 0 | if(!ppat_has_additional_bits) |
175 | 0 | { |
176 | 0 | candidates.push_back(i); |
177 | 0 | if(candidates.size()>=MaxCandidates) |
178 | 0 | break; |
179 | 0 | } |
180 | 0 | } |
181 | |
|
182 | 0 | if(i<_index.header.nEntries) //premature end to search |
183 | 0 | { |
184 | 0 | stringstream errorMsg; |
185 | 0 | errorMsg << "Stopped looking after " << i << " molecules." << endl; |
186 | 0 | obErrorLog.ThrowError(__FUNCTION__, errorMsg.str(), obWarning); |
187 | 0 | } |
188 | |
|
189 | 0 | vector<unsigned int>::iterator itr; |
190 | 0 | for(itr=candidates.begin();itr!=candidates.end();++itr) |
191 | 0 | { |
192 | 0 | SeekPositions.push_back(_index.seekdata[*itr]); |
193 | 0 | } |
194 | 0 | return true; |
195 | 0 | } |
196 | | |
197 | | //////////////////////////////////////////////////////////// |
198 | | bool FastSearch::FindMatch(OBBase* pOb, vector<unsigned long>& SeekPositions, |
199 | | unsigned int MaxCandidates) |
200 | 0 | { |
201 | | //Similar to FastSearch::Find() except that successful candidates have all bits the same as the target |
202 | 0 | vector<unsigned int> vecwords; |
203 | 0 | _pFP->GetFingerprint(pOb,vecwords, _index.header.words * OBFingerprint::Getbitsperint()); |
204 | |
|
205 | 0 | vector<unsigned int>candidates; //indices of matches from fingerprint screen |
206 | |
|
207 | 0 | unsigned int dataSize = _index.header.nEntries; |
208 | 0 | unsigned int words = _index.header.words; |
209 | 0 | unsigned int* nextp = &_index.fptdata[0]; // start of next FP in index |
210 | 0 | unsigned int* ppat0 = &vecwords[0]; // start of target FP |
211 | 0 | unsigned int* p; // current position in index |
212 | 0 | unsigned int* ppat; // current position in target FP |
213 | 0 | unsigned int i; // need address of this, can't be register |
214 | 0 | for(i=0;i<dataSize; ++i) //speed critical section |
215 | 0 | { |
216 | 0 | p=nextp; |
217 | 0 | nextp += words; |
218 | 0 | ppat=ppat0; |
219 | |
|
220 | 0 | while((*p++ == *ppat++ ) && (p<nextp)); |
221 | |
|
222 | 0 | if(p==nextp) |
223 | 0 | { |
224 | 0 | candidates.push_back(i); |
225 | 0 | if(candidates.size()>=MaxCandidates) |
226 | 0 | break; |
227 | 0 | } |
228 | 0 | } |
229 | |
|
230 | 0 | vector<unsigned int>::iterator itr; |
231 | 0 | for(itr=candidates.begin();itr!=candidates.end();++itr) |
232 | 0 | { |
233 | 0 | SeekPositions.push_back(_index.seekdata[*itr]); |
234 | 0 | } |
235 | 0 | return true; |
236 | 0 | } |
237 | | |
238 | | ///////////////////////////////////////////////////////// |
239 | | bool FastSearch::FindSimilar(OBBase* pOb, multimap<double, unsigned long>& SeekposMap, |
240 | | double MinTani, double MaxTani) |
241 | 0 | { |
242 | 0 | vector<unsigned int> targetfp; |
243 | 0 | _pFP->GetFingerprint(pOb,targetfp, _index.header.words * OBFingerprint::Getbitsperint()); |
244 | |
|
245 | 0 | unsigned int words = _index.header.words; |
246 | 0 | unsigned int dataSize = _index.header.nEntries; |
247 | 0 | unsigned int* nextp = &_index.fptdata[0]; |
248 | 0 | unsigned int* p; |
249 | 0 | unsigned int i; |
250 | 0 | for(i=0;i<dataSize; ++i) //speed critical section |
251 | 0 | { |
252 | 0 | p=nextp; |
253 | 0 | nextp += words; |
254 | 0 | double tani = OBFingerprint::Tanimoto(targetfp,p); |
255 | 0 | if(tani>MinTani && tani < MaxTani) |
256 | 0 | SeekposMap.insert(pair<const double, unsigned long>(tani,_index.seekdata[i])); |
257 | 0 | } |
258 | 0 | return true; |
259 | 0 | } |
260 | | |
261 | | ///////////////////////////////////////////////////////// |
262 | | bool FastSearch::FindSimilar(OBBase* pOb, multimap<double, unsigned long>& SeekposMap, |
263 | | int nCandidates) |
264 | 0 | { |
265 | | ///If nCandidates is zero or omitted the original size of the multimap is used |
266 | 0 | if(nCandidates) |
267 | 0 | { |
268 | | //initialise the multimap with nCandidate zero entries |
269 | 0 | SeekposMap.clear(); |
270 | 0 | int i; |
271 | 0 | for(i=0;i<nCandidates;++i) |
272 | 0 | SeekposMap.insert(pair<const double, unsigned long>(0,0)); |
273 | 0 | } |
274 | 0 | else if(SeekposMap.size()==0) |
275 | 0 | return false; |
276 | | |
277 | 0 | vector<unsigned int> targetfp; |
278 | 0 | _pFP->GetFingerprint(pOb,targetfp, _index.header.words * OBFingerprint::Getbitsperint()); |
279 | |
|
280 | 0 | unsigned int words = _index.header.words; |
281 | 0 | unsigned int dataSize = _index.header.nEntries; |
282 | 0 | unsigned int* nextp = &_index.fptdata[0]; |
283 | 0 | unsigned int* p; |
284 | 0 | unsigned int i; |
285 | 0 | for(i=0;i<dataSize; ++i) //speed critical section |
286 | 0 | { |
287 | 0 | p=nextp; |
288 | 0 | nextp += words; |
289 | 0 | double tani = OBFingerprint::Tanimoto(targetfp,p); |
290 | 0 | if(tani>SeekposMap.begin()->first) |
291 | 0 | { |
292 | 0 | SeekposMap.insert(pair<const double, unsigned long>(tani,_index.seekdata[i])); |
293 | 0 | SeekposMap.erase(SeekposMap.begin()); |
294 | 0 | } |
295 | 0 | } |
296 | 0 | return true; |
297 | 0 | } |
298 | | |
299 | | ///////////////////////////////////////////////////////// |
300 | | string FastSearch::ReadIndex(istream* pIndexstream) |
301 | 0 | { |
302 | | //Reads fs index from istream into member variables |
303 | 0 | _index.Read(pIndexstream); |
304 | |
|
305 | 0 | _pFP = _index.CheckFP(); |
306 | 0 | if(!_pFP) |
307 | 0 | *(_index.header.datafilename) = '\0'; |
308 | |
|
309 | 0 | return _index.header.datafilename; //will be empty on error |
310 | 0 | } |
311 | | |
312 | | ////////////////////////////////////////////////////////// |
313 | | string FastSearch::ReadIndexFile(string IndexFilename) |
314 | 0 | { |
315 | 0 | ifstream ifs(IndexFilename.c_str(),ios::binary); |
316 | 0 | if(ifs) |
317 | 0 | return ReadIndex(&ifs); |
318 | 0 | else |
319 | 0 | { |
320 | 0 | string dum; |
321 | 0 | return dum; |
322 | 0 | } |
323 | 0 | } |
324 | | |
325 | | ////////////////////////////////////////////////////////// |
326 | | bool FptIndex::Read(istream* pIndexstream) |
327 | 0 | { |
328 | | // pIndexstream->read((char*)&(header), sizeof(FptIndexHeader)); |
329 | | // pIndexstream->seekg(header.headerlength);//allows header length to be changed |
330 | |
|
331 | 0 | if(!ReadHeader(pIndexstream)) |
332 | 0 | { |
333 | 0 | *(header.datafilename) = '\0'; |
334 | 0 | return false; |
335 | 0 | } |
336 | | |
337 | 0 | unsigned long nwords = header.nEntries * header.words; |
338 | 0 | fptdata.resize(nwords); |
339 | 0 | seekdata.resize(header.nEntries); |
340 | |
|
341 | 0 | pIndexstream->read((char*)&(fptdata[0]), sizeof(unsigned int) * nwords); |
342 | 0 | if(header.seek64) |
343 | 0 | { |
344 | 0 | pIndexstream->read((char*)&(seekdata[0]), sizeof(unsigned long) * header.nEntries); |
345 | 0 | } |
346 | 0 | else |
347 | 0 | { //legacy format |
348 | 0 | vector<unsigned int> tmp(header.nEntries); |
349 | 0 | pIndexstream->read((char*)&(tmp[0]), sizeof(unsigned int) * header.nEntries); |
350 | 0 | std::copy(tmp.begin(),tmp.end(),seekdata.begin()); |
351 | 0 | } |
352 | |
|
353 | 0 | if(pIndexstream->fail()) |
354 | 0 | { |
355 | 0 | *(header.datafilename) = '\0'; |
356 | 0 | return false; |
357 | 0 | } |
358 | 0 | return true; |
359 | 0 | } |
360 | | |
361 | | ////////////////////////////////////////////////////////// |
362 | | bool FptIndex::ReadHeader(istream* pIndexstream) |
363 | 0 | { |
364 | 0 | pIndexstream->read( (char*)&header.headerlength, sizeof(unsigned) ); |
365 | 0 | pIndexstream->read( (char*)&header.nEntries, sizeof(unsigned) ); |
366 | 0 | pIndexstream->read( (char*)&header.words, sizeof(unsigned) ); |
367 | 0 | pIndexstream->read( (char*)&header.fpid, sizeof(header.fpid) ); |
368 | 0 | pIndexstream->read( (char*)&header.seek64, sizeof(header.seek64) ); |
369 | 0 | pIndexstream->read( (char*)&header.datafilename, sizeof(header.datafilename) ); |
370 | 0 | return !pIndexstream->fail(); |
371 | 0 | } |
372 | | |
373 | | ////////////////////////////////////////////////////////// |
374 | | OBFingerprint* FptIndex::CheckFP() |
375 | 0 | { |
376 | | //check that fingerprint type is available |
377 | 0 | OBFingerprint* pFP = OBFingerprint::FindFingerprint(header.fpid); |
378 | 0 | if(!pFP) |
379 | 0 | { |
380 | 0 | stringstream errorMsg; |
381 | 0 | errorMsg << "Index has Fingerprints of type '" << header.fpid |
382 | 0 | << " which is not currently loaded." << endl; |
383 | 0 | obErrorLog.ThrowError(__FUNCTION__, errorMsg.str(), obError); |
384 | 0 | } |
385 | 0 | return pFP; //NULL if not available |
386 | 0 | } |
387 | | |
388 | | //******************************************************* |
389 | | FastSearchIndexer::FastSearchIndexer(string& datafilename, ostream* os, |
390 | | std::string& fpid, int FptBits, int nmols) |
391 | 0 | { |
392 | | ///Starts indexing process |
393 | 0 | _indexstream = os; |
394 | 0 | _nbits=FptBits; |
395 | 0 | _pindex= new FptIndex; |
396 | 0 | _pindex->header.headerlength = 3*sizeof(unsigned)+sizeof(_pindex->header.fpid) |
397 | 0 | +sizeof(_pindex->header.datafilename); |
398 | 0 | strncpy(_pindex->header.fpid,fpid.c_str(),15); |
399 | 0 | _pindex->header.fpid[14]='\0'; //ensure fpid is terminated at 14 characters. |
400 | 0 | _pindex->header.seek64 = 1; |
401 | 0 | strncpy(_pindex->header.datafilename, datafilename.c_str(), 255); |
402 | | |
403 | | //just a hint to reserve size of vectors; definitive value set in destructor |
404 | 0 | _pindex->header.nEntries = nmols; |
405 | | |
406 | | //check that fingerprint type is available |
407 | 0 | _pFP = _pindex->CheckFP(); |
408 | 0 | if(fpid.empty()) // add id of default FP |
409 | 0 | strcpy(_pindex->header.fpid, _pFP->GetID()); |
410 | | |
411 | | //Save a small amount of time by not generating info (FP2 currently) |
412 | 0 | _pFP->SetFlags(_pFP->Flags() | OBFingerprint::FPT_NOINFO); |
413 | 0 | } |
414 | | |
415 | | ///////////////////////////////////////////////////////////// |
416 | | FastSearchIndexer::FastSearchIndexer(FptIndex* pindex, std::ostream* os, int nmols) |
417 | 0 | { |
418 | | //nmols is new total number of molecules |
419 | 0 | _indexstream = os; |
420 | 0 | _pindex = pindex; |
421 | 0 | _nbits = _pindex->header.words * OBFingerprint::Getbitsperint(); |
422 | | |
423 | | //just a hint to reserve size of vectors; definitive value set in destructor |
424 | 0 | _pindex->header.nEntries = nmols; |
425 | | |
426 | | //check that fingerprint type is available |
427 | 0 | _pFP = _pindex->CheckFP(); |
428 | 0 | } |
429 | | |
430 | | ///////////////////////////////////////////////////////////// |
431 | | FastSearchIndexer::~FastSearchIndexer() |
432 | 0 | { |
433 | | ///Saves index file |
434 | 0 | FptIndexHeader& hdr = _pindex->header; |
435 | 0 | hdr.nEntries = _pindex->seekdata.size(); |
436 | | //Write header |
437 | | //_indexstream->write((const char*)&hdr, sizeof(FptIndexHeader)); |
438 | 0 | _indexstream->write( (const char*)&hdr.headerlength, sizeof(unsigned) ); |
439 | 0 | _indexstream->write( (const char*)&hdr.nEntries, sizeof(unsigned) ); |
440 | 0 | _indexstream->write( (const char*)&hdr.words, sizeof(unsigned) ); |
441 | 0 | _indexstream->write( (const char*)&hdr.fpid, sizeof(hdr.fpid) ); |
442 | 0 | _indexstream->write( (const char*)&hdr.seek64, sizeof(hdr.seek64) ); |
443 | 0 | _indexstream->write( (const char*)&hdr.datafilename, sizeof(hdr.datafilename) ); |
444 | |
|
445 | 0 | _indexstream->write((const char*)&_pindex->fptdata[0], _pindex->fptdata.size()*sizeof(unsigned int)); |
446 | 0 | _indexstream->write((const char*)&_pindex->seekdata[0], _pindex->seekdata.size()*sizeof(unsigned long)); |
447 | 0 | if(!_indexstream) |
448 | 0 | obErrorLog.ThrowError(__FUNCTION__, |
449 | 0 | "Difficulty writing index", obWarning); |
450 | 0 | delete _pindex; |
451 | |
|
452 | 0 | _pFP->SetFlags(_pFP->Flags() & ~OBFingerprint::FPT_NOINFO); //Clear |
453 | 0 | } |
454 | | |
455 | | /////////////////////////////////////////////////////////////// |
456 | | bool FastSearchIndexer::Add(OBBase* pOb, std::streampos seekpos) |
457 | 0 | { |
458 | | ///Adds a fingerprint |
459 | |
|
460 | 0 | vector<unsigned int> vecwords; |
461 | 0 | if(!_pFP) |
462 | 0 | return false; |
463 | 0 | if(_pFP->GetFingerprint(pOb, vecwords, _nbits)) |
464 | 0 | { |
465 | 0 | _pindex->header.words = vecwords.size(); //Use size as returned from fingerprint |
466 | 0 | if(_pindex->fptdata.empty() && _pindex->header.nEntries!=0) |
467 | 0 | { |
468 | | //Reserve size of vectors at start to avoid multiple realloction and copying later. |
469 | | //Done here rather than in constructor because needs the size of the fingerprint. |
470 | 0 | _pindex->fptdata.reserve(_pindex->header.nEntries * _pindex->header.words); |
471 | 0 | _pindex->seekdata.reserve(_pindex->header.nEntries); |
472 | 0 | } |
473 | 0 | for(unsigned int i=0;i<_pindex->header.words;++i) |
474 | 0 | _pindex->fptdata.push_back(vecwords[i]); |
475 | 0 | _pindex->seekdata.push_back(seekpos); |
476 | 0 | return true; |
477 | 0 | } |
478 | 0 | obErrorLog.ThrowError(__FUNCTION__, "Failed to make a fingerprint", obWarning); |
479 | 0 | return false; |
480 | 0 | } |
481 | | |
482 | | /*! |
483 | | \class OBFingerprint fingerprint.h <openbabel/fingerprint.h> |
484 | | These fingerprints are condensed representation of molecules (or other objects) |
485 | | as a list of boolean values (actually bits in a vector<unsigned>) with length |
486 | | of a power of 2. The main motivation is for fast searching of data sources |
487 | | containing large numbers of molecules (up to several million). Open Babel |
488 | | provides some routines which can search text files containing lists of molecules |
489 | | in any format. See the documentation on the class FastSearch. |
490 | | |
491 | | There are descriptions of molecular fingerprints at <br> |
492 | | http://www.daylight.com/dayhtml/doc/theory/theory.finger.html) and <br> |
493 | | http://www.mesaac.com/Fingerprint.htm <br> |
494 | | Many methods of preparing fingerprints have been described, but the type supported |
495 | | currently in OpenBabel has each bit representing a substructure (or other |
496 | | molecular property). If a substructure is present in the molecule, then a |
497 | | particular bit is set to 1. But because the hashing method may also map other |
498 | | substructures to the same bit, a match does not guarantee that a particular |
499 | | substructure is present; there may be false positives. However, with proper design, |
500 | | a large fraction of irrelevant molecules in a data set can be eliminated in a |
501 | | fast search with boolean methods on the fingerprints. |
502 | | It then becomes feasible to make a definitive substructure search by |
503 | | conventional methods on this reduced list even if it is slow. |
504 | | |
505 | | OpenBabel provides a framework for applying new types of fingerprints without |
506 | | changing any existing code. They are derived from OBFingerprint and the |
507 | | source file is just compiled with the rest of OpenBabel. Alternatively, |
508 | | they can be separately compiled as a DLL or shared library and discovered |
509 | | when OpenBabel runs. |
510 | | |
511 | | For more on these specific implementations of fingerprints in Open |
512 | | Babel, please take a look at the developer's wiki: |
513 | | http://openbabel.org/wiki/Fingerprints |
514 | | |
515 | | Fingerprints derived from this abstract base class OBFingerprint can be for any |
516 | | object derived from OBBase (not just for OBMol). |
517 | | Each derived class provides an ID as a string and OBFingerprint keeps a map of |
518 | | these to provides a pointer to the class when requested in FindFingerprint. |
519 | | |
520 | | <h4>-- To define a fingerprint type --</h4> |
521 | | The classes derived form OBFingerprint are required to provide |
522 | | a GetFingerprint() routine and a Description() routine |
523 | | \code |
524 | | class MyFpType : OBFingerprint |
525 | | { |
526 | | MyFpType(const char* id) : OBFingerprint(id){}; |
527 | | |
528 | | virtual bool GetFingerprint(OBBase* pOb, vector<unsigned int>& fp, int nbits) |
529 | | { |
530 | | //Convert pOb to the required type, usually OBMol |
531 | | OBMol* pmol = dynamic_cast<OBMol*>(pOb); |
532 | | fp.resize(required_number_of_words); |
533 | | ... |
534 | | use SetBit(fp,n); to set the nth bit |
535 | | |
536 | | if(nbits) |
537 | | Fold(fp, nbits); |
538 | | } |
539 | | |
540 | | virtual const char* Description(){ return "Some descriptive text";} |
541 | | ... |
542 | | }; |
543 | | \endcode |
544 | | |
545 | | Declare a global instance with the ID you will use in -f options to specify |
546 | | its use. |
547 | | \code |
548 | | MyFpType theMyFpType("myfpID"); |
549 | | \endcode |
550 | | |
551 | | <h4>-- To obtain a fingerprint --</h4> |
552 | | \code |
553 | | OBMol mol; |
554 | | ... |
555 | | vector<unsigned int> fp; |
556 | | OBFingerprint::GetDefault()->GetFingerprint(&mol, fp); //gets default size of fingerprint |
557 | | \endcode |
558 | | or |
559 | | \code |
560 | | vector<unsigned int> fp; |
561 | | OBFingerPrint* pFP = OBFingerprint::FindFingerprint("myfpID"); |
562 | | ...and maybe... |
563 | | pFP->GetFingerprint(&mol,fp, 128); //fold down to 128bits if was originally larger |
564 | | \endcode |
565 | | |
566 | | <h4>-- To print a list of available fingerprint types --</h4> |
567 | | \code |
568 | | std::string id; |
569 | | OBFingerPrint* pFPrt=NULL; |
570 | | while(OBFingerprint::GetNextFPrt(id, pFPrt)) |
571 | | { |
572 | | cout << id << " -- " << pFPrt->Description() << endl; |
573 | | } |
574 | | \endcode |
575 | | |
576 | | Fingerprints are handled as vector<unsigned int> so that the number of bits |
577 | | in this vector and their order will be platform and compiler |
578 | | dependent, because of size of int types and endian differences. |
579 | | Use fingerprints (and fastsearch indexes containing them) only |
580 | | for comparing with other fingerprints prepared on the same machine. |
581 | | |
582 | | The FingerprintFormat class is an output format which displays fingerprints |
583 | | as hexadecimal. When multiple molecules are supplied it will calculate the |
584 | | Tanimoto coefficient from the first molecule to each of the others. It also |
585 | | shows whether the first molecule is a possible substructure to all the others, |
586 | | i.e. whether all the bits set in the fingerprint for the first molecule are |
587 | | set in the fingerprint of the others. To display hexadecimal information when |
588 | | multiple molecules are provided it is necessay to use the -xh option. |
589 | | |
590 | | To see a list of available format types, type obabel -F on the command line. |
591 | | The -xF option of the FingerprintFormat class also provides this output, but due |
592 | | to a quirk in the way the program works, it is necessary to have a valid input |
593 | | molecule for this option to work. |
594 | | */ |
595 | | |
596 | | /*! \class FastSearch fingerprint.h <openbabel/fingerprint.h> |
597 | | The FastSearch class searches an index to a datafile containing a list of molecules |
598 | | (or other objects) prepared by FastSearchIndexer. |
599 | | |
600 | | OpenBabel can also search files for molecules containing a substructure specified |
601 | | by a SMARTS string, using OBSmartsPattern or from the command line: |
602 | | \code |
603 | | obabel datafile.xxx -O outfile.yyy -sSMARTS |
604 | | \endcode |
605 | | But when the data file contains more than about 10,000 molecules this becomes |
606 | | rather too slow to be used interactively. To do it more quickly, an index |
607 | | of the molecules containing their fingerprints (see OBFingerprint) is prepared using |
608 | | FastSearchIndexer. The indexing process may take a long time but only has to |
609 | | be done once. The index can be searched very quickly with FastSearch. Because |
610 | | of the nature of fingerprints a match to a bit does not guarantee |
611 | | the presence of a particular substructure or other molecular property, so that |
612 | | a definitive answer may require a subsequent search of the (much reduced) list |
613 | | of candidates by another method (like OBSmartsPattern). |
614 | | |
615 | | Note that the index files are not portable. They need to be prepared on the |
616 | | computer that will access them. |
617 | | |
618 | | <h4>Using FastSearch and FastSearchIndexer in a program</h4> |
619 | | |
620 | | The index has two tables: |
621 | | - an array of fingerprints of the molecules |
622 | | - an array of the seek positions in the datasource file of all the molecules |
623 | | |
624 | | <h4>To prepare an fastsearch index file:</h4> |
625 | | - Open an ostream to the index file. |
626 | | - Make a FastSearchIndexer object on the heap or the stack, passing in as parameters: |
627 | | - the datafile name, the indexfile stream, |
628 | | - the id of the fingerprint type to be used, |
629 | | - the number of bits the fingerprint is to be folded down to, If it is to be left |
630 | | unfolded, set fpid to 0 or do not specify it. |
631 | | . |
632 | | - For each molecule, call Add() with its pointer and its position in the datafile.<br> |
633 | | Currently the std::streampos value is implicitly cast to unsigned int so that |
634 | | for 32bit machines the datafile has to be no longer than about 2Gbyte. |
635 | | - The index file is written when the FastSearchIndexer object is deleted or goes out |
636 | | of scope. |
637 | | |
638 | | <h4>To search in a fastsearch index file</h4> |
639 | | |
640 | | - Open a std::istream to the indexfile (in binary mode on some systems) |
641 | | - Make a FastSearch object, read the index and open the datafile whose |
642 | | name it provides |
643 | | \code |
644 | | ifstream ifs(indexname,ios::binary); |
645 | | FastSearch fs; |
646 | | string datafilename = fs.ReadIndex(&ifs); |
647 | | if(datafilename.empty() |
648 | | return false; |
649 | | |
650 | | ifstream datastream(datafilename); |
651 | | if(!datastream) |
652 | | return false; |
653 | | \endcode |
654 | | |
655 | | <strong>To do a search for molecules which have all the substructure bits the |
656 | | OBMol object, patternMol</strong> |
657 | | \code |
658 | | vector<unsigned int>& SeekPositions; |
659 | | if(!fs.Find(patternMol, SeekPositions, MaxCandidates)) |
660 | | for(itr=SeekPositions.begin();itr!=SeekPositions.end();++itr) |
661 | | { |
662 | | datastream.seekg(*itr); |
663 | | ... read the candidate molecule |
664 | | and subject to more rigorous test if necessary |
665 | | } |
666 | | \endcode |
667 | | |
668 | | <strong>To do a similarity search based on the Tanimoto coefficient</strong> |
669 | | This is defined as: <br> |
670 | | <em>Number of bits set in (patternFP & targetFP)/Number of bits in (patternFP | targetFP)</em><br> |
671 | | where the boolean operations between the fingerprints are bitwise<br> |
672 | | The Tanimoto coefficient has no absolute meaning and depends on |
673 | | the design of the fingerprint. |
674 | | \code |
675 | | multimap<double, unsigned int> SeekposMap; |
676 | | // to find n best molecules |
677 | | fs.FindSimilar(&patternMol, SeekposMap, n); |
678 | | ...or |
679 | | // to find molecules with Tanimoto coefficient > MinTani |
680 | | fs.FindSimilar(&patternMol, SeekposMap, MinTani); |
681 | | |
682 | | multimap<double, unsigned int>::reverse_iterator itr; |
683 | | for(itr=SeekposMap.rbegin();itr!=SeekposMap.rend();++itr) |
684 | | { |
685 | | datastream.seekg(itr->second); |
686 | | // ... read the candidate molecule |
687 | | double tani = itr->first; |
688 | | } |
689 | | \endcode |
690 | | |
691 | | The FastSearchFormat class facilitates the use of these routine from the |
692 | | command line or other front end program. For instance: |
693 | | |
694 | | <strong>Prepare an index:</strong> |
695 | | \code |
696 | | obabel datafile.xxx -O index.fs |
697 | | \endcode |
698 | | With options you can specify: |
699 | | - which type of fingerprint to be used, e.g. -xfFP2, |
700 | | - whether it is folded to a specified number of bits, e.g. -xn128 |
701 | | (which should be a power of 2) |
702 | | - whether to pre-select the molecules which are indexed: |
703 | | - by structure e.g only ethers and esters, -sCOC |
704 | | - by excluding molecules with bezene rings, -vc1ccccc1 |
705 | | - by position in the datafile e.g. only mols 10 to 90, -f10 -l90 |
706 | | . |
707 | | <strong>Substructure search</strong> in a previously prepared index file |
708 | | \code |
709 | | obabel index.fs -O outfile.yyy -sSMILES |
710 | | \endcode |
711 | | The datafile can also be used as the input file, provided the input format is specified as fs |
712 | | \code |
713 | | obabel datafile.xxx -O outfile.yyy -sSMILES -ifs |
714 | | \endcode |
715 | | A "slow" search not using fingerprints would be done on the same data by omitting -ifs. |
716 | | A "slow" search can use SMARTS strings, but the fast search is restricted |
717 | | to the subset which is valid SMILES. |
718 | | |
719 | | With the -S option, the target can be specified as a molecule in a file of any format |
720 | | \code |
721 | | obabel datafile.xxx -O outfile.yyy -Spattern_mol.zzz -ifs |
722 | | \endcode |
723 | | These searches have two stages: a fingerprint search which produces |
724 | | a number of candidate molecules and a definitive search which selects |
725 | | from these using SMARTS pattern matching. The maximum number of candidates |
726 | | is 4000 by default but you can change this with an option |
727 | | e.g. -al 8000 (Note that you need the space between l and the number.) |
728 | | If the fingerprint search reaches the maximum number it will not |
729 | | look further and will tell you at what stage it stopped. |
730 | | |
731 | | <strong>Similarity search</strong> in a previously prepared index file<br> |
732 | | This rather done (rather than a substructure search) if the -at option is used, |
733 | | \code |
734 | | obabel datafile.xxx -O outfile.yyy -sSMILES -ifs -at0.7 |
735 | | \endcode |
736 | | for instance |
737 | | - -at0.7 will recover all molecules with Tanimoto greater than 0.7 |
738 | | - -at15 (no decimal point) will recover the 15 molecules with largest coefficients. |
739 | | - -aa will add the Tanimoto coefficient to the titles of the output molecules. |
740 | | |
741 | | All stages, the indexing, the interpretation of the SMILES string in the -s option, |
742 | | the file in the -S option and the final SMARTS filter convert to OBMol and apply |
743 | | ConvertDativeBonds(). This ensures thatforms such as[N+]([O-])=O and N(=O)=O |
744 | | are recognized as chemically identical. |
745 | | */ |
746 | | |
747 | | }//Openbabel |
748 | | |
749 | | //! \file fingerprint.cpp |
750 | | //! \brief Definitions for OBFingerprint base class and fastsearch classes |