/src/openbabel/src/fingerprints/finger3.cpp
Line | Count | Source (jump to first uncovered line) |
1 | | /********************************************************************** |
2 | | finger3.cpp: Fingerprints based on list of SMARTS patterns |
3 | | Copyright (C) 2005 Chris Morley |
4 | | |
5 | | This file is part of the Open Babel project. |
6 | | For more information, see <http://openbabel.org/> |
7 | | |
8 | | This program is free software; you can redistribute it and/or modify |
9 | | it under the terms of the GNU General Public License as published by |
10 | | the Free Software Foundation version 2 of the License. |
11 | | |
12 | | This program is distributed in the hope that it will be useful, |
13 | | but WITHOUT ANY WARRANTY; without even the implied warranty of |
14 | | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
15 | | GNU General Public License for more details. |
16 | | ***********************************************************************/ |
17 | | |
18 | | #include <openbabel/babelconfig.h> |
19 | | #include <openbabel/mol.h> |
20 | | #include <openbabel/parsmart.h> |
21 | | #include <openbabel/oberror.h> |
22 | | #include <sstream> |
23 | | #include <fstream> |
24 | | #include <map> |
25 | | #include <string> |
26 | | |
27 | | #include <openbabel/fingerprint.h> |
28 | | |
29 | | using namespace std; |
30 | | namespace OpenBabel |
31 | | { |
32 | | /// \brief Fingerprint based on list of SMARTS patterns |
33 | | class PatternFP : public OBFingerprint |
34 | | { |
35 | | private: |
36 | | struct pattern |
37 | | { |
38 | | string smartsstring; |
39 | | OBSmartsPattern obsmarts; |
40 | | string description; |
41 | | int numbits; |
42 | | int numoccurrences; |
43 | | int bitindex; |
44 | | }; |
45 | | vector<pattern> _pats; |
46 | | int _bitcount; |
47 | | string _version; |
48 | | |
49 | | protected: |
50 | | string _patternsfile; |
51 | | |
52 | | public: |
53 | | PatternFP(const char* ID, const char* filename=nullptr, |
54 | 4 | bool IsDefault=false) : OBFingerprint(ID, IsDefault) |
55 | 4 | { |
56 | 4 | if (filename == nullptr) |
57 | 2 | _patternsfile="patterns.txt"; |
58 | 2 | else |
59 | 2 | _patternsfile = filename; |
60 | 4 | } |
61 | | |
62 | | ///////////////////////////////////////////////////////////////////////////// |
63 | | const char* Description() override |
64 | 0 | { |
65 | 0 | static string desc; |
66 | | //Read patterns file if it has not been done already, |
67 | | //because we need _bitcount and _version updated |
68 | | |
69 | | // _bitcount and _version are available only after the datafile has been parsed. |
70 | | // This is a burden on normal operation (Description() gets called on startup from OBDefine), |
71 | | // so the secondline is present only after the fingerprint has been used. |
72 | | // the |
73 | 0 | string secondline; |
74 | 0 | if(!_pats.empty()) |
75 | 0 | secondline = "\n" + toString(_bitcount) + " bits. Datafile version = " + _version; |
76 | 0 | desc = "SMARTS patterns specified in the file " + _patternsfile |
77 | 0 | + secondline |
78 | 0 | + "\nPatternFP is definable"; |
79 | 0 | return (desc.c_str()); |
80 | 0 | } |
81 | | |
82 | | ////////////////////////////////////////////////////////////////////////////// |
83 | | //Each bit represents a single substructure |
84 | 0 | unsigned int Flags() override { return FPT_UNIQUEBITS; } |
85 | | |
86 | | /////////////////////////////////////////////////////////////////////////////// |
87 | | PatternFP* MakeInstance(const std::vector<std::string>& textlines) override |
88 | 0 | { |
89 | 0 | return new PatternFP(textlines[1].c_str(),textlines[2].c_str()); |
90 | 0 | } |
91 | | |
92 | | //////////////////////////////////////////////////////////////////////////////// |
93 | | bool GetFingerprint(OBBase* pOb, vector<unsigned int>&fp, int foldbits) override |
94 | 0 | { |
95 | 0 | OBMol* pmol = dynamic_cast<OBMol*>(pOb); |
96 | 0 | if(!pmol) |
97 | 0 | return false; |
98 | | |
99 | | //This fingerprint is constructed from a molecule with no explicit hydrogens. |
100 | 0 | pmol->DeleteHydrogens(); |
101 | |
|
102 | 0 | unsigned int n; |
103 | | //Read patterns file if it has not been done already |
104 | 0 | if(_pats.empty()) |
105 | 0 | ReadPatternFile(_version); |
106 | | |
107 | | //Make fp size the smallest power of two to contain the patterns |
108 | 0 | n=Getbitsperint(); |
109 | 0 | while(n < _bitcount) |
110 | 0 | n*=2; |
111 | 0 | fp.resize(n/Getbitsperint()); |
112 | |
|
113 | 0 | n=0; //bit position |
114 | 0 | vector<pattern>::iterator ppat; |
115 | 0 | for(ppat=_pats.begin();ppat!=_pats.end();++ppat) |
116 | 0 | { |
117 | 0 | if(ppat->numbits //ignore pattern if numbits==0 |
118 | 0 | && ppat->obsmarts.Match(*pmol, ppat->numoccurrences==0))//do single match if all that's needed |
119 | 0 | { |
120 | | /* Set bits in the fingerprint depending on the number of matches in the molecule |
121 | | and the parameters, numbits and numoccurrences, in the pattern. |
122 | | The pattern will set or clear numbits bits in the fingerprint. |
123 | | They will be in numoccurrences+1 groups, each containing an approximately |
124 | | equal number of bits. |
125 | | The first group of bits will be set if numMatches > numoccurrences; |
126 | | The second group will be set if numMatches > numoccurrences - 1; |
127 | | and so on. |
128 | | So with a pattern with numbits = 4 and numoccurrences = 2, |
129 | | the groups would be 1, 1, and 2 bits. |
130 | | A molecule with |
131 | | 1 match to the pattern would give 0011 |
132 | | 2 matches to the pattern would give 0111 |
133 | | 3 or more matches to the pattern would give 1111 |
134 | | */ |
135 | 0 | int numMatches = ppat->obsmarts.GetUMapList().size(); |
136 | 0 | int num = ppat->numbits, div = ppat->numoccurrences+1, ngrp; |
137 | |
|
138 | 0 | int i = n; |
139 | 0 | while(num) |
140 | 0 | { |
141 | 0 | ngrp = (num -1)/div-- +1; //rounds up |
142 | 0 | num -= ngrp; |
143 | 0 | while(ngrp--) |
144 | 0 | if (numMatches > div) { |
145 | 0 | SetBit(fp,i); |
146 | 0 | } |
147 | 0 | i++; |
148 | 0 | } |
149 | 0 | } |
150 | 0 | n += ppat->numbits; |
151 | 0 | } |
152 | |
|
153 | 0 | if(foldbits) |
154 | 0 | Fold(fp, foldbits); |
155 | 0 | return true; |
156 | 0 | } |
157 | | |
158 | | ///////////////////////////////////////////////////////////////////// |
159 | | bool ReadPatternFile(string& ver) |
160 | 0 | { |
161 | | //Reads three types of file. See below |
162 | 0 | ifstream ifs; |
163 | 0 | stringstream errorMsg; |
164 | |
|
165 | 0 | if (OpenDatafile(ifs, _patternsfile).length() == 0) |
166 | 0 | { |
167 | 0 | errorMsg << "Cannot open " << _patternsfile << endl; |
168 | 0 | obErrorLog.ThrowError(__FUNCTION__, errorMsg.str(), obError); |
169 | 0 | return false; |
170 | 0 | } |
171 | | |
172 | 0 | string line; |
173 | 0 | if(!getline(ifs, line)) //first line |
174 | 0 | return false; |
175 | 0 | bool smartsfirst = (Trim(line)=="#Comments after SMARTS"); |
176 | |
|
177 | 0 | _bitcount=0; |
178 | 0 | bool indata=false; |
179 | 0 | do |
180 | 0 | { |
181 | 0 | if(Trim(line).size()>0 && line[0]!='#') |
182 | 0 | { |
183 | 0 | pattern p; |
184 | 0 | p.numbits=1; p.numoccurrences=0; //default values |
185 | 0 | p.bitindex = _bitcount; |
186 | 0 | istringstream ss(line); |
187 | 0 | indata = true; |
188 | 0 | if(smartsfirst) |
189 | 0 | { |
190 | 0 | if(isdigit(line[0])) |
191 | 0 | { |
192 | 0 | if(!ParseRDKitFormat(ss, p)) |
193 | 0 | continue; |
194 | 0 | } |
195 | 0 | else |
196 | | //Original format, which looks like: |
197 | | // SMARTS description |
198 | 0 | ss >> p.smartsstring >> p.description; |
199 | 0 | } |
200 | 0 | else |
201 | 0 | { |
202 | | // Christian Laggner's format: |
203 | | // description: SMARTS [occurrences [numbits]] |
204 | 0 | getline(ss, p.description, ':'); |
205 | 0 | ss >> p.smartsstring; |
206 | 0 | ss >> p.numoccurrences >> p.numbits; |
207 | 0 | } |
208 | | |
209 | 0 | if(!p.obsmarts.Init(p.smartsstring)) |
210 | 0 | { |
211 | 0 | obErrorLog.ThrowError(__FUNCTION__, |
212 | 0 | "Faulty SMARTS: " + p.description + ' ' + p.smartsstring, obError); |
213 | 0 | continue; |
214 | 0 | } |
215 | 0 | _pats.push_back(p); |
216 | 0 | _bitcount += p.numbits; |
217 | 0 | } |
218 | 0 | else if(!indata) |
219 | 0 | { |
220 | | //Find version number |
221 | 0 | string::size_type pos = line.find("Version"); |
222 | 0 | if(pos!=string::npos) |
223 | 0 | pos+=8; |
224 | 0 | else if(line.find("Extracted from RDKit")!=string::npos) |
225 | 0 | { |
226 | 0 | pos=20; |
227 | 0 | while((pos=line.find('r',pos))!=string::npos) |
228 | 0 | if(isdigit(line[++pos])) |
229 | 0 | break; |
230 | 0 | } |
231 | 0 | if(pos!=string::npos) |
232 | 0 | { |
233 | 0 | ver=line.substr(pos) + ' ';//space fixes bug in while() when number at end of line |
234 | 0 | pos=1; |
235 | 0 | while(isdigit(ver[++pos])); |
236 | 0 | ver.erase(pos); |
237 | 0 | } |
238 | 0 | } |
239 | 0 | }while(getline(ifs,line)); |
240 | | |
241 | 0 | if (ifs) |
242 | 0 | ifs.close(); |
243 | 0 | return true; |
244 | 0 | } |
245 | | |
246 | | /////////////////////////////////////////////////////////////////////////////// |
247 | | string DescribeBits(const vector<unsigned int> fp, bool bSet=true) override |
248 | 0 | { |
249 | | //checkmol-type output with tab separated functional group names |
250 | 0 | stringstream ss; |
251 | 0 | vector<pattern>::iterator ppat; |
252 | 0 | for(ppat=_pats.begin();ppat!=_pats.end();++ppat) |
253 | 0 | { |
254 | 0 | int n = ppat->bitindex; |
255 | 0 | int num = ppat->numbits, div = ppat->numoccurrences+1, ngrp; |
256 | 0 | while(num) //for each group of bits |
257 | 0 | { |
258 | 0 | ngrp = (num + div - 1) / div; //rounds up |
259 | 0 | div -= 1; |
260 | 0 | num -= ngrp; |
261 | 0 | if(GetBit(fp, n) == bSet) |
262 | 0 | { |
263 | 0 | ss << ppat->description; |
264 | 0 | if(div>0) |
265 | 0 | ss << '*' << div+1; |
266 | 0 | ss << '\t' ; |
267 | 0 | break; //ignore the bits signifying a smaller number of occurrences |
268 | 0 | } |
269 | 0 | n += ngrp; |
270 | 0 | } |
271 | 0 | } |
272 | 0 | ss << endl; |
273 | 0 | return ss.str(); |
274 | 0 | } |
275 | | |
276 | | /////////////////////////////////////////////////////////////////////////////////// |
277 | | bool ParseRDKitFormat(istringstream& ss, pattern& p) |
278 | 0 | { |
279 | | //rdkit format, e.g. |
280 | | // 14:('[S,s]-[S,s]',0), # S-S |
281 | 0 | const int dum = 20; //an arbitrary number in case delimiters in ignore statements not found |
282 | 0 | string number, comment; |
283 | 0 | getline(ss, number, ':'); |
284 | 0 | ss.ignore(dum, '\''); |
285 | 0 | getline(ss, p.smartsstring, '\''); |
286 | 0 | if(p.smartsstring[0]=='?') //ignore patterns with SMARTS '?' |
287 | 0 | p.smartsstring="[999]";//this seems to match nothing; was return false; |
288 | 0 | ss.ignore(dum,','); |
289 | 0 | ss >> p.numoccurrences; |
290 | 0 | ss.ignore(dum,'#'); |
291 | 0 | getline(ss, comment); |
292 | | |
293 | | //description is number + edited commment |
294 | 0 | Trim(comment); |
295 | 0 | string::size_type pos; |
296 | 0 | pos = comment.find("FIX"); |
297 | 0 | if(pos==string::npos) |
298 | 0 | pos = comment.find("*NOTE*"); |
299 | 0 | if(pos!=string::npos) |
300 | 0 | comment.erase(pos); |
301 | 0 | p.description = number + ": " + comment; |
302 | 0 | return true; |
303 | 0 | } |
304 | | |
305 | | |
306 | | }; //class PatternFP |
307 | | |
308 | | //*********************************************** |
309 | | //Make a global instance |
310 | | PatternFP FP3PatternFP("FP3"); |
311 | | PatternFP FP4PatternFP("FP4", "SMARTS_InteLigand.txt"); |
312 | | //*********************************************** |
313 | | |
314 | | /*! \class PatternFP |
315 | | A bit is set when there is a match to one of a list |
316 | | of SMARTS patterns in the datafile, which is specified in the constructor. |
317 | | If no filename is given, the default filename is patterns.txt. |
318 | | Fingerprints can be made by declaring a global variable, as in: |
319 | | |
320 | | PatternFP FP4PatternFP("FP4", "SMARTS_InteLigand.txt"); |
321 | | |
322 | | Alternatively, an entry in plugindefines.txt like: |
323 | | |
324 | | PatternFP |
325 | | MACCS #ID of this fingerprint type |
326 | | MACCS.txt #File containing the SMARTS patterns |
327 | | |
328 | | defines a fingerprint without the need to recompile. |
329 | | |
330 | | Three file formats are supported: |
331 | | - the preferred format (e.g. SMARTS_InteLigand.txt in FP4) |
332 | | - the original format (patterns.txt has an incomplete set of SMARTS patterns) |
333 | | - a format made by extracting from an RDKit file (MACCS.txt) |
334 | | The last two require the first line to be: |
335 | | #Comments after SMARTS |
336 | | |
337 | | Lines starting with # are ignored. |
338 | | For the preferred format each line is of the form: |
339 | | description: SMARTS [occurrences [numbits]] |
340 | | A bit is set in the fingerprint for each SMARTS pattern matched. |
341 | | The optional integer parameters refine this behaviour; the most obvious uses are: |
342 | | - if <occurrences> is present and greater than its default value of 0, the bit |
343 | | is set only if the number of matches to the pattern is greater than <occurrences>. |
344 | | - if <occurrences> is 0 and <numbits> is greater than its default value of 1, then |
345 | | the fingerprint has <numbits> bits set if there is a match. This gives greater weight |
346 | | to the pattern for use in similarity measures like Tanimoto. |
347 | | - if the parameters are n-1 and n and the number of matches is m, |
348 | | a bit is set for each of the conditions m>=n, m>=n-1, ... , m>=1 |
349 | | This can be used to distinguish structures with many similar atoms like n-alkanes. |
350 | | The use of other values for the parameters, which can be any positive integer, can give |
351 | | other analogous behaviours. If numbits is 0 the pattern is ignored. |
352 | | */ |
353 | | |
354 | | }//namespace |
355 | | |
356 | | //! \file finger3.cpp |
357 | | //! \brief fingerprints based on list of SMARTS patterns |