/src/openbabel/include/openbabel/fingerprint.h
Line | Count | Source |
1 | | /********************************************************************** |
2 | | fingerprint.h - Base class for fingerprints and fast searching |
3 | | |
4 | | Copyright (C) 2005 by Chris Morley |
5 | | |
6 | | This file is part of the Open Babel project. |
7 | | For more information, see <http://openbabel.org/> |
8 | | |
9 | | This program is free software; you can redistribute it and/or modify |
10 | | it under the terms of the GNU General Public License as published by |
11 | | the Free Software Foundation version 2 of the License. |
12 | | |
13 | | This program is distributed in the hope that it will be useful, |
14 | | but WITHOUT ANY WARRANTY; without even the implied warranty of |
15 | | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
16 | | GNU General Public License for more details. |
17 | | ***********************************************************************/ |
18 | | |
19 | | #ifndef OB_FINGERPRINT_H |
20 | | #define OB_FINGERPRINT_H |
21 | | |
22 | | #include <list> |
23 | | #include <map> |
24 | | #include <set> |
25 | | #include <vector> |
26 | | #include <string> |
27 | | |
28 | | #include <openbabel/plugin.h> |
29 | | |
30 | | #ifndef OBFPRT |
31 | | #define OBFPRT |
32 | | #endif |
33 | | |
34 | | namespace OpenBabel |
35 | | { |
36 | | class OBBase; //Forward declaration; used only as pointer. |
37 | | |
38 | | /// \brief The base class for fingerprints |
39 | | class OBFPRT OBFingerprint : public OBPlugin |
40 | | { |
41 | | //see end of cpp file for detailed documentation |
42 | | |
43 | | MAKE_PLUGIN(OBFingerprint) |
44 | | |
45 | | const char* TypeID() override |
46 | 18 | { |
47 | 18 | return "fingerprints"; |
48 | 18 | } |
49 | | |
50 | | //Rest of OBFingerprints declarations |
51 | | public: |
52 | | |
53 | 0 | virtual ~OBFingerprint(){} |
54 | | |
55 | | /// Sets the nth bit |
56 | | void SetBit(std::vector<unsigned int>& vec, const unsigned int n); |
57 | | |
58 | | ///return true if the nth bit is set; |
59 | | bool GetBit(const std::vector<unsigned int>& vec, const unsigned int n); |
60 | | |
61 | | /// Repeatedly ORs the top half with the bottom half until no smaller than nbits |
62 | | void Fold(std::vector<unsigned int>& vec, unsigned int nbits); |
63 | | |
64 | | /// \return fingerprint in vector, which may be resized, folded to nbits (if nbits!=0) |
65 | | virtual bool GetFingerprint(OBBase* pOb, std::vector<unsigned int>& fp, int nbits=0)=0; |
66 | | |
67 | | /// Optional flags |
68 | | enum FptFlag{FPT_UNIQUEBITS=1, FPT_NOINFO=2}; |
69 | 0 | virtual unsigned int Flags() { return 0;}; |
70 | | //// \since version 2.3 |
71 | 0 | virtual void SetFlags(unsigned int){} |
72 | | |
73 | | /// \return a description of each bit that is set (or unset, if bSet=false) |
74 | | /// \since version 2.2 |
75 | | virtual std::string DescribeBits(const std::vector<unsigned int> /* fp */, |
76 | | bool /* bSet */ =true) |
77 | 0 | { |
78 | 0 | std::string txt(""); |
79 | 0 | return txt; |
80 | 0 | } |
81 | | |
82 | | /// \return the Tanimoto coefficient between two vectors (vector<unsigned int>& SeekPositions) |
83 | | static double Tanimoto(const std::vector<unsigned int>& vec1, const std::vector<unsigned int>& vec2); |
84 | | |
85 | | /// Inline version of Tanimoto() taking a pointer for the second vector |
86 | | static double Tanimoto(const std::vector<unsigned int>& vec1, const unsigned int* p2) |
87 | 0 | { |
88 | | ///If used for two vectors, vec1 and vec2, call as Tanimoto(vec1, &vec2[0]); |
89 | 0 | int andbits=0, orbits=0; |
90 | 0 | unsigned int i; |
91 | 0 | for (i=0;i<vec1.size();++i) |
92 | 0 | { |
93 | 0 | int andfp = vec1[i] & p2[i]; |
94 | 0 | int orfp = vec1[i] | p2[i]; |
95 | | // Count bits |
96 | | /* GCC 3.4 supports a "population count" builtin, which on many targets is |
97 | | implemented with a single instruction. There is a fallback definition |
98 | | in libgcc in case a target does not have one, which should be just as |
99 | | good as the static function below. */ |
100 | 0 | #if __GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 4) |
101 | 0 | andbits += __builtin_popcount(andfp); |
102 | 0 | orbits += __builtin_popcount(orfp); |
103 | | #else |
104 | | for(;andfp;andfp=andfp<<1) |
105 | | if(andfp<0) ++andbits; |
106 | | for(;orfp;orfp=orfp<<1) |
107 | | if(orfp<0) ++orbits; |
108 | | #endif |
109 | 0 | } |
110 | 0 | return((double)andbits/(double)orbits); |
111 | 0 | }; |
112 | | |
113 | 17 | static unsigned int Getbitsperint(){ return bitsperint; } |
114 | | |
115 | | private: |
116 | | ///Function object to set bits |
117 | | struct bit_or |
118 | | { |
119 | | unsigned int operator()(const unsigned int a, const unsigned int b) |
120 | 0 | { |
121 | 0 | return a | b; |
122 | 0 | } |
123 | | }; |
124 | | |
125 | | |
126 | | public: |
127 | | /// \return a pointer to a fingerprint (the default if ID is empty), or NULL if not available |
128 | | ///For backward compatibility; a synonym of OBFingerprint::FindType |
129 | 3 | static OBFingerprint* FindFingerprint(const char* ID){ return FindType(ID);} |
130 | | |
131 | | private: |
132 | | static const unsigned int bitsperint;// = 8 * sizeof(unsigned int); |
133 | | }; |
134 | | |
135 | | //Fast search routines |
136 | | /// \struct FptIndexHeader fingerprint.h <openbabel/fingerprint.h> |
137 | | /// \brief Header for fastsearch index file |
138 | | struct OBFPRT FptIndexHeader |
139 | | { |
140 | | unsigned int headerlength;///<offset to data: sizeof(FptIndexHeader) |
141 | | unsigned int nEntries; ///<number of fingerprints |
142 | | unsigned int words; ///<number 32bit words per fingerprint |
143 | | char fpid[15]; ///<ID of the fingerprint type |
144 | | char seek64; //if true, seek data consists of 64bit long values (only zero in legacy indices) |
145 | | char datafilename[256]; ///<the data that this is an index to |
146 | | }; |
147 | | |
148 | | /// \struct FptIndex fingerprint.h <openbabel/fingerprint.h> |
149 | | /// \brief Structure of fastsearch index files |
150 | | struct OBFPRT FptIndex |
151 | | { |
152 | | FptIndexHeader header; |
153 | | std::vector<unsigned int> fptdata; |
154 | | std::vector<unsigned long> seekdata; |
155 | | bool Read(std::istream* pIndexstream); |
156 | | bool ReadIndex(std::istream* pIndexstream); |
157 | | bool ReadHeader(std::istream* pIndexstream); |
158 | | |
159 | | /// \return A pointer to FP used or NULL and an error message |
160 | | OBFingerprint* CheckFP(); |
161 | | }; |
162 | | |
163 | | /// \class FastSearch fingerprint.h <openbabel/fingerprint.h> |
164 | | /// \brief Class to search fingerprint index files |
165 | | class OBFPRT FastSearch |
166 | | { |
167 | | //see end of cpp file for detailed documentation |
168 | | public: |
169 | | /// \brief Loads an index from a file and returns the name of the datafile |
170 | | std::string ReadIndexFile(std::string IndexFilename); |
171 | | std::string ReadIndex(std::istream* pIndexstream); |
172 | | |
173 | 0 | virtual ~FastSearch(){}; |
174 | | |
175 | | /// \brief Does substructure search and returns vector of the file positions of matches |
176 | | bool Find(OBBase* pOb, std::vector<unsigned long>& SeekPositions, unsigned int MaxCandidates); |
177 | | |
178 | | /// \brief Similar to Find() but all bits of matching fingerprints have to be the same |
179 | | /// \since version 2.1 |
180 | | bool FindMatch(OBBase* pOb, std::vector<unsigned long>& SeekPositions, |
181 | | unsigned int MaxCandidates); |
182 | | |
183 | | /// \return A multimap containing objects whose Tanimoto coefficients with the target |
184 | | /// is greater than the value specified. |
185 | | bool FindSimilar(OBBase* pOb, std::multimap<double, unsigned long>& SeekposMap, |
186 | | double MinTani, double MaxTani = 1.1 ); |
187 | | |
188 | | /// \return A multimap containing the nCandidates objects with largest Tanimoto |
189 | | /// coefficients with the target. |
190 | | bool FindSimilar(OBBase* pOb, std::multimap<double, unsigned long>& SeekposMap, |
191 | | int nCandidates=0); |
192 | | |
193 | | /// \return a pointer to the fingerprint type used to constuct the index |
194 | 0 | OBFingerprint* GetFingerprint() const{ return _pFP;}; |
195 | | |
196 | | /// \return a pointer to the index header containing size info etc. |
197 | 0 | const FptIndexHeader& GetIndexHeader() const{ return _index.header;}; |
198 | | |
199 | | private: |
200 | | FptIndex _index; |
201 | | OBFingerprint* _pFP; |
202 | | }; |
203 | | |
204 | | /// \class FastSearchIndexer fingerprint.h <openbabel/fingerprint.h> |
205 | | /// \brief Class to prepare fingerprint index files See FastSearch class for details |
206 | | class OBFPRT FastSearchIndexer |
207 | | { |
208 | | //see end of cpp file for detailed documentation |
209 | | public: |
210 | | ///\brief Constructor with a new index |
211 | | FastSearchIndexer(std::string& datafilename, std::ostream* os, std::string& fpid, |
212 | | int FptBits=0, int nmols=0); |
213 | | |
214 | | ///\brief Constructor using existing index |
215 | | FastSearchIndexer(FptIndex* pindex, std::ostream* os, int nmols=0); |
216 | | |
217 | | ~FastSearchIndexer(); |
218 | | |
219 | | ///\brief Called for each object |
220 | | bool Add(OBBase* pOb, std::streampos seekpos); |
221 | | |
222 | | private: |
223 | | std::ostream* _indexstream; |
224 | | FptIndex* _pindex; |
225 | | OBFingerprint* _pFP; |
226 | | int _nbits; |
227 | | }; |
228 | | |
229 | | } //namespace OpenBabel |
230 | | #endif |
231 | | |
232 | | //! \file fingerprint.h |
233 | | //! \brief Declaration of OBFingerprint base class and fastsearch classes |