/src/openbabel/src/formats/fastsearchformat.cpp
Line | Count | Source |
1 | | /********************************************************************** |
2 | | fastsearchformat.cpp: Preparation and searching of fingerprint-based index files |
3 | | Copyright (C) 2005-2006 by Chris Morley |
4 | | |
5 | | This program is free software; you can redistribute it and/or modify |
6 | | it under the terms of the GNU General Public License as published by |
7 | | the Free Software Foundation version 2 of the License. |
8 | | |
9 | | This program is distributed in the hope that it will be useful, |
10 | | but WITHOUT ANY WARRANTY; without even the implied warranty of |
11 | | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
12 | | GNU General Public License for more details. |
13 | | ***********************************************************************/ |
14 | | #include <openbabel/babelconfig.h> |
15 | | |
16 | | #include <sstream> |
17 | | #include <iostream> |
18 | | #include <fstream> |
19 | | |
20 | | #include <openbabel/mol.h> |
21 | | #include <openbabel/obconversion.h> |
22 | | #include <openbabel/fingerprint.h> |
23 | | #include <openbabel/op.h> |
24 | | #include <openbabel/elements.h> |
25 | | #include <openbabel/bond.h> |
26 | | #include <openbabel/obutil.h> |
27 | | #include <cstdlib> |
28 | | #include <algorithm> |
29 | | |
30 | | |
31 | | using namespace std; |
32 | | namespace OpenBabel { |
33 | | |
34 | | /// \brief Prepares and searches of fingerprint-based index files. See FastSearch class for details |
35 | | class FastSearchFormat : public OBFormat |
36 | | { |
37 | | public: |
38 | | //Register this format type ID |
39 | 6 | FastSearchFormat() : fsi(nullptr) |
40 | 6 | { |
41 | 6 | OBConversion::RegisterFormat("fs",this); |
42 | | //Specify the number of option taken by options |
43 | 6 | OBConversion::RegisterOptionParam("S", this, 1, OBConversion::GENOPTIONS); |
44 | 6 | OBConversion::RegisterOptionParam("S", this, 1, OBConversion::INOPTIONS); |
45 | 6 | OBConversion::RegisterOptionParam("f", this, 1); |
46 | 6 | OBConversion::RegisterOptionParam("N", this, 1); |
47 | 6 | OBConversion::RegisterOptionParam("u", this, 0); |
48 | 6 | OBConversion::RegisterOptionParam("t", this, 1, OBConversion::INOPTIONS); |
49 | 6 | OBConversion::RegisterOptionParam("l", this, 1, OBConversion::INOPTIONS); |
50 | 6 | OBConversion::RegisterOptionParam("a", this, 0, OBConversion::INOPTIONS); |
51 | 6 | OBConversion::RegisterOptionParam("e", this, 0, OBConversion::INOPTIONS); |
52 | 6 | } |
53 | | |
54 | | const char* Description() override // required |
55 | 7 | { return |
56 | 7 | "Fastsearch format\n" |
57 | 7 | "Fingerprint-aided substructure and similarity searching\n\n" |
58 | | |
59 | 7 | "Writing to the fs format makes an index of a multi-molecule datafile::\n\n" |
60 | 7 | " obabel dataset.sdf -ofs\n\n" |
61 | 7 | "This prepares an index :file:`dataset.fs` with default parameters, and is slow\n" |
62 | 7 | "(~30 minutes for a 250,000 molecule file).\n\n" |
63 | | |
64 | 7 | "However, when reading from the fs format searches are much faster, a few seconds,\n" |
65 | 7 | "and so can be done interactively.\n\n" |
66 | 7 | "The search target is the parameter of the ``-s`` option and can be\n" |
67 | 7 | "slightly extended SMILES (with ``[#n]`` atoms and ``~`` bonds) or\n" |
68 | 7 | "the name of a file containing a molecule.\n\n" |
69 | | |
70 | 7 | "Several types of searches are possible:\n\n" |
71 | 7 | "- Identical molecule::\n\n" |
72 | 7 | " obabel index.fs -O outfile.yyy -s SMILES exact\n\n" |
73 | 7 | "- Substructure::\n\n" |
74 | 7 | " obabel index.fs -O outfile.yyy -s SMILES or\n" |
75 | 7 | " obabel index.fs -O outfile.yyy -s filename.xxx\n\n" |
76 | 7 | " where ``xxx`` is a format id known to OpenBabel, e.g. sdf\n" |
77 | 7 | "- Molecular similarity based on Tanimoto coefficient::\n\n" |
78 | 7 | " obabel index.fs -O outfile.yyy -at15 -sSMILES # best 15 molecules\n" |
79 | 7 | " obabel index.fs -O outfile.yyy -at0.7 -sSMILES # Tanimoto >0.7\n" |
80 | 7 | " obabel index.fs -O outfile.yyy -at0.7,0.9 -sSMILES\n" |
81 | 7 | " # Tanimoto >0.7 && Tanimoto < 0.9\n\n" |
82 | 7 | "The datafile plus the ``-ifs`` option can be used instead of the index file.\n\n" |
83 | 7 | "NOTE on 32-bit systems the datafile MUST NOT be larger than 4GB.\n\n" |
84 | 7 | "Dative bonds like -[N+][O-](=O) are indexed as -N(=O)(=O), and when searching\n" |
85 | 7 | "the target molecule should be in the second form.\n\n" |
86 | | |
87 | 7 | ".. seealso::\n\n" |
88 | | |
89 | 7 | " :ref:`fingerprints`\n\n" |
90 | | |
91 | 7 | "Write Options (when making index) e.g. -xfFP3\n" |
92 | 7 | " f# Fingerprint type\n" |
93 | 7 | " If not specified, the default fingerprint (currently FP2) is used\n" |
94 | 7 | " N# Fold fingerprint to # bits\n" |
95 | 7 | " u Update an existing index\n\n" |
96 | | |
97 | 7 | "Read Options (when searching) e.g. -at0.7\n" |
98 | 7 | " t# Do similarity search:#mols or # as min Tanimoto\n" |
99 | 7 | " a Add Tanimoto coeff to title in similarity search\n" |
100 | 7 | " l# Maximum number of candidates. Default<4000>\n" |
101 | 7 | " e Exact match\n" |
102 | 7 | " Alternative to using exact in ``-s`` parameter, see above\n" |
103 | 7 | " n No further SMARTS filtering after fingerprint phase\n\n" |
104 | 7 | ; |
105 | 7 | } |
106 | | |
107 | 33 | unsigned int Flags() override { return READBINARY | READONEONLY | WRITEBINARY; } |
108 | | |
109 | | public: |
110 | | bool ReadChemObject(OBConversion* pConv) override; |
111 | | bool WriteChemObject(OBConversion* pConv) override; |
112 | | |
113 | | private: |
114 | | bool ObtainTarget(OBConversion* pConv, std::vector<OBMol>& patternMols, const std::string& indexname); |
115 | | void AddPattern(vector<OBMol>& patternMols, OBMol patternMol, int idx); |
116 | | |
117 | | private: |
118 | | ///big data structure which will remain in memory after it is loaded |
119 | | //until the program ends. |
120 | | FastSearch fs; |
121 | | FastSearchIndexer* fsi; |
122 | | streampos LastSeekpos; //used during update |
123 | | OBStopwatch sw; //used when preparing index |
124 | | int nmols; //number mols in data file |
125 | | }; |
126 | | |
127 | | /////////////////////////////////////////////////////////////// |
128 | | //Make an instance of the format class |
129 | | FastSearchFormat theFastSearchFormat; |
130 | | |
131 | | /////////////////////////////////////////////////////////////// |
132 | | bool FastSearchFormat::ReadChemObject(OBConversion* pConv) |
133 | 2 | { |
134 | | //Searches index file for structural matches |
135 | | //This function is called only once per search |
136 | | |
137 | 2 | std::string auditMsg = "OpenBabel::Read fastsearch index "; |
138 | 2 | std::string description(Description()); |
139 | 2 | auditMsg += description.substr(0,description.find('\n')); |
140 | 2 | obErrorLog.ThrowError(__FUNCTION__, |
141 | 2 | auditMsg, |
142 | 2 | obAuditMsg); |
143 | | |
144 | | //Derive index name |
145 | 2 | string indexname = pConv->GetInFilename(); |
146 | 2 | string::size_type pos=indexname.find_last_of('.'); |
147 | 2 | if(pos!=string::npos) |
148 | 0 | { |
149 | 0 | indexname.erase(pos); |
150 | 0 | indexname += ".fs"; |
151 | 0 | } |
152 | | |
153 | | //Have to open input stream again because needs to be in binary mode |
154 | 2 | ifstream ifs; |
155 | 2 | stringstream errorMsg; |
156 | 2 | if(!indexname.empty()) |
157 | 0 | ifs.open(indexname.c_str(),ios::binary); |
158 | 2 | if(!ifs) |
159 | 0 | { |
160 | 0 | errorMsg << "Couldn't open " << indexname << endl; |
161 | 0 | obErrorLog.ThrowError(__FUNCTION__, errorMsg.str(), obError); |
162 | 0 | return false; |
163 | 0 | } |
164 | | |
165 | 2 | string datafilename = fs.ReadIndex(&ifs); |
166 | 2 | if(datafilename.empty()) |
167 | 2 | { |
168 | 2 | errorMsg << "Difficulty reading from index " << indexname << endl; |
169 | 2 | obErrorLog.ThrowError(__FUNCTION__, errorMsg.str(), obError); |
170 | 2 | return false; |
171 | 2 | } |
172 | | |
173 | 0 | vector<OBMol> patternMols; |
174 | 0 | if(!ObtainTarget(pConv, patternMols, indexname)) |
175 | 0 | return false; |
176 | | |
177 | 0 | bool exactmatch = pConv->IsOption("e", OBConversion::INOPTIONS) != nullptr; // -ae option |
178 | | |
179 | | //Open the datafile and put it in pConv |
180 | | //datafile name derived from index file probably won't have a file path |
181 | | //but indexname may. Derive a full datafile name |
182 | 0 | string path; |
183 | 0 | pos = indexname.find_last_of("/\\"); |
184 | 0 | if(pos==string::npos) |
185 | 0 | path = datafilename; |
186 | 0 | else |
187 | 0 | path = indexname.substr(0,pos+1) + datafilename; |
188 | |
|
189 | 0 | ifstream datastream(path.c_str()); |
190 | 0 | if(!datastream) |
191 | 0 | { |
192 | 0 | errorMsg << "Difficulty opening " << path << endl; |
193 | 0 | obErrorLog.ThrowError(__FUNCTION__, errorMsg.str(), obError); |
194 | 0 | return false; |
195 | 0 | } |
196 | 0 | pConv->SetInStream(&datastream); |
197 | | |
198 | | //Input format is currently fs; set it appropriately |
199 | 0 | bool isgzip = false; |
200 | 0 | if(!pConv->SetInAndOutFormats(pConv->FormatFromExt(datafilename.c_str(), isgzip), pConv->GetOutFormat())) |
201 | 0 | return false; |
202 | | |
203 | 0 | if (isgzip) |
204 | 0 | { |
205 | 0 | errorMsg << "Index datafile must not be in gzip format: " << path << endl; |
206 | 0 | obErrorLog.ThrowError(__FUNCTION__, errorMsg.str(), obError); |
207 | 0 | return false; |
208 | 0 | } |
209 | | // If target has dative bonds like -[N+](=O)[O-] convert it to the uncharged form |
210 | | // (-N(=O)=O and add uncharged form to vector of mols which are sent to |
211 | | // the -s (SMARTS)filter. |
212 | | // Also check whether the target has dative bonds in the uncharged form and supply |
213 | | // the charged form to the -s filter. |
214 | | // Together with the automatic conversion to the uncharged form when the fs index is made, |
215 | | // this ensures that both forms are found however they occur in the datafile or the taget. |
216 | 0 | vector<OBBase*> extraSMARTSMols; |
217 | 0 | vector<OBMol>extraUnchargedMols; |
218 | 0 | for(unsigned i=0;i<patternMols.size();++i) |
219 | 0 | { |
220 | 0 | if(patternMols[i].ConvertDativeBonds()) |
221 | 0 | extraSMARTSMols.push_back(&patternMols[i]); |
222 | 0 | else |
223 | 0 | { |
224 | | // If target has uncharged dative bonds, still use it for fastsearching, |
225 | | // but add the charged form for -s filter. |
226 | 0 | extraUnchargedMols.push_back(patternMols[i]); |
227 | 0 | if(extraUnchargedMols.back().MakeDativeBonds()) |
228 | 0 | extraSMARTSMols.push_back(&extraUnchargedMols.back()); |
229 | 0 | } |
230 | 0 | } |
231 | 0 | OBOp* sFilter = OBOp::FindType("s"); |
232 | 0 | if(sFilter) |
233 | 0 | sFilter->ProcessVec(extraSMARTSMols); |
234 | | |
235 | | //Now do searching |
236 | 0 | const char* p = pConv->IsOption("t",OBConversion::INOPTIONS); |
237 | 0 | if(p) |
238 | 0 | { |
239 | | //Do a similarity search |
240 | 0 | multimap<double, unsigned long> SeekposMap; |
241 | 0 | string txt=p; |
242 | 0 | if(txt.find('.')==string::npos) |
243 | 0 | { |
244 | | //Finds n molecules with largest Tanimoto |
245 | 0 | int n = atoi(p); |
246 | 0 | fs.FindSimilar(&patternMols[0], SeekposMap, n); |
247 | 0 | } |
248 | 0 | else |
249 | 0 | { |
250 | | //Finds molecules with Tanimoto > MinTani |
251 | 0 | double MaxTani = 1.1; |
252 | 0 | size_t pos = txt.find(','); |
253 | 0 | if( pos != string::npos ) { |
254 | 0 | MaxTani = atof( txt.substr( pos + 1 ).c_str() ); |
255 | 0 | } |
256 | 0 | double MinTani = atof( txt.substr( 0, pos ).c_str() ); |
257 | 0 | fs.FindSimilar(&patternMols[0], SeekposMap, MinTani, MaxTani); |
258 | 0 | } |
259 | | |
260 | | //Don't want to filter through SMARTS filter |
261 | 0 | pConv->RemoveOption("s", OBConversion::GENOPTIONS); |
262 | | //also because op names are case independent |
263 | 0 | pConv->RemoveOption("S", OBConversion::GENOPTIONS); |
264 | |
|
265 | 0 | multimap<double, unsigned long>::reverse_iterator itr; |
266 | 0 | for(itr=SeekposMap.rbegin();itr!=SeekposMap.rend();++itr) |
267 | 0 | { |
268 | 0 | datastream.seekg(itr->second); |
269 | |
|
270 | 0 | if(pConv->IsOption("a", OBConversion::INOPTIONS)) |
271 | 0 | { |
272 | | //Adds Tanimoto coeff to title |
273 | | //First remove any previous value |
274 | 0 | pConv->RemoveOption("addtotitle", OBConversion::GENOPTIONS); |
275 | 0 | stringstream ss; |
276 | 0 | ss << " " << itr->first; |
277 | 0 | pConv->AddOption("addtotitle",OBConversion::GENOPTIONS, ss.str().c_str()); |
278 | |
|
279 | 0 | } |
280 | 0 | pConv->SetOneObjectOnly(); |
281 | 0 | if(itr != --SeekposMap.rend()) |
282 | 0 | pConv->SetMoreFilesToCome();//so that not seen as last on output |
283 | 0 | pConv->Convert(nullptr, nullptr); |
284 | 0 | } |
285 | 0 | } |
286 | | |
287 | 0 | else |
288 | 0 | { |
289 | | //Structure search |
290 | 0 | int MaxCandidates = 4000; |
291 | 0 | p = pConv->IsOption("l",OBConversion::INOPTIONS); |
292 | 0 | if(p && atoi(p)) |
293 | 0 | MaxCandidates = atoi(p); |
294 | |
|
295 | 0 | vector<unsigned long> SeekPositions; |
296 | |
|
297 | 0 | if(exactmatch) |
298 | 0 | { |
299 | | //Find mols where all fingerprint bits are the same as the target |
300 | 0 | fs.FindMatch(&patternMols[0], SeekPositions, MaxCandidates); |
301 | | // ensure that SMARTS filter in transform.cpp looks only for an exact match |
302 | | // by setting an option with the number of heavy atoms in the pattern mol included. |
303 | 0 | stringstream ss; |
304 | 0 | ss << patternMols[0].NumHvyAtoms(); |
305 | 0 | pConv->AddOption("exactmatch", OBConversion::GENOPTIONS, ss.str().c_str()); |
306 | 0 | } |
307 | | |
308 | 0 | else |
309 | 0 | { |
310 | | //Do a substructure search for each target |
311 | 0 | vector<OBMol>::iterator iter; |
312 | 0 | for(iter=patternMols.begin();iter!=patternMols.end();++iter) |
313 | 0 | fs.Find(&*iter, SeekPositions, MaxCandidates); |
314 | 0 | clog << SeekPositions.size() << " candidates from fingerprint search phase" << endl; |
315 | 0 | } |
316 | |
|
317 | 0 | vector<unsigned long>::iterator seekitr, |
318 | 0 | begin = SeekPositions.begin(), end = SeekPositions.end(); |
319 | |
|
320 | 0 | if(patternMols.size()>1)//only sort and eliminate duplicates if necessary |
321 | 0 | { |
322 | 0 | sort(begin, end); |
323 | 0 | end = unique(begin, end); //removed duplicates are after new end |
324 | 0 | } |
325 | | |
326 | | //Output the candidate molecules, filtering through s filter, unless it was not requested |
327 | 0 | if(pConv->IsOption("n", OBConversion::INOPTIONS) ) |
328 | 0 | pConv->RemoveOption("s",OBConversion::GENOPTIONS); |
329 | |
|
330 | 0 | pConv->SetLast(false); |
331 | 0 | for(seekitr=begin; seekitr!=end; ++seekitr) |
332 | 0 | { |
333 | 0 | datastream.seekg(*seekitr); |
334 | 0 | if(!pConv->GetInFormat()->ReadChemObject(pConv)) |
335 | 0 | return false; |
336 | 0 | pConv->SetFirstInput(false); //needed for OpSort |
337 | 0 | } |
338 | 0 | } |
339 | 0 | return false; //To finish |
340 | 0 | } |
341 | | |
342 | | ///////////////////////////////////////////////////// |
343 | | bool FastSearchFormat::WriteChemObject(OBConversion* pConv) |
344 | 5 | { |
345 | | //Prepares or updates an index file. Called for each molecule indexed |
346 | 5 | bool update = pConv->IsOption("u") != nullptr; |
347 | | |
348 | 5 | static ostream* pOs; |
349 | 5 | static bool NewOstreamUsed; |
350 | 5 | if (fsi == nullptr) |
351 | 5 | { |
352 | | // Warn that compressed files cannot be used. It's hard to seek |
353 | | // inside of a gzip file. |
354 | 5 | if(pConv->GetInGzipped()) |
355 | 0 | { |
356 | 0 | obErrorLog.ThrowError(__FUNCTION__, |
357 | 0 | "Fastindex search requires an uncompressed input file so it can quickly seek to a record.", |
358 | 0 | obWarning); |
359 | 0 | } |
360 | | |
361 | | //First pass sets up FastSearchIndexer object |
362 | 5 | pOs = pConv->GetOutStream();// with named index it is already open |
363 | 5 | NewOstreamUsed=false; |
364 | 5 | string mes("prepare an"); |
365 | 5 | if(update) |
366 | 0 | mes = "update the"; |
367 | 5 | clog << "This will " << mes << " index of " << pConv->GetInFilename() |
368 | 5 | << " and may take some time..." << flush; |
369 | | |
370 | 5 | if(!pConv->IsLastFile()) |
371 | 0 | { |
372 | 0 | obErrorLog.ThrowError(__FUNCTION__, |
373 | 0 | "There should not be multiple input files. A .fs file is an index of a single datafile.", |
374 | 0 | obError); |
375 | 0 | return false; |
376 | 0 | } |
377 | | |
378 | 5 | std::string auditMsg = "OpenBabel::Write fastsearch index "; |
379 | 5 | std::string description(Description()); |
380 | 5 | auditMsg += description.substr( 0, description.find('\n') ); |
381 | 5 | obErrorLog.ThrowError(__FUNCTION__,auditMsg,obAuditMsg); |
382 | | |
383 | 5 | FptIndex* pidx = nullptr; //used with update |
384 | | |
385 | | //if(pOs==&cout) did not work with GUI |
386 | 5 | if(!dynamic_cast<ofstream*>(pOs)) |
387 | 5 | { |
388 | | //No index filename specified |
389 | | //Derive index name from datafile name |
390 | 5 | string indexname=pConv->GetInFilename(); |
391 | 5 | string::size_type pos=indexname.find_last_of('.'); |
392 | 5 | if(pos!=string::npos) |
393 | 0 | indexname.erase(pos); |
394 | 5 | indexname += ".fs"; |
395 | | |
396 | 5 | bool idxok=true; |
397 | 5 | if(update) |
398 | 0 | { |
399 | 0 | LastSeekpos = 0; |
400 | | |
401 | | //Read in existing index |
402 | 0 | idxok=false; |
403 | 0 | ifstream ifs(indexname.c_str(),ifstream::binary); |
404 | 0 | if(ifs.good()) |
405 | 0 | { |
406 | 0 | pidx = new FptIndex; |
407 | 0 | idxok = pidx->Read(&ifs); |
408 | 0 | } |
409 | 0 | }//ifs closed here |
410 | | |
411 | 5 | pOs = new ofstream(indexname.c_str(),ofstream::binary); |
412 | | |
413 | 5 | if(!pOs->good() || !idxok) |
414 | 0 | { |
415 | 0 | stringstream errorMsg; |
416 | 0 | errorMsg << "Trouble opening or reading " << indexname << endl; |
417 | 0 | obErrorLog.ThrowError(__FUNCTION__, errorMsg.str(), obError); |
418 | 0 | static_cast<ofstream *>(pOs)->close(); // close the file before quitting |
419 | 0 | delete pOs; |
420 | 0 | delete pidx; // remove possible memory leak |
421 | 0 | return false; |
422 | 0 | } |
423 | 5 | NewOstreamUsed=true; |
424 | 5 | } |
425 | 0 | else // not cout |
426 | 0 | { |
427 | 0 | if(update) |
428 | 0 | { |
429 | 0 | obErrorLog.ThrowError(__FUNCTION__, |
430 | 0 | "Currently, updating is only done on index files that" |
431 | 0 | "have the same name as the datafile.\n" |
432 | 0 | "Do not specify an output file; use the form:\n" |
433 | 0 | " obabel datafile.xxx -ofs -xu", obError); |
434 | 0 | return false; |
435 | 0 | } |
436 | 0 | } |
437 | | |
438 | 5 | int nbits = 0; |
439 | 5 | const char* p = pConv->IsOption("N"); |
440 | 5 | if(p) |
441 | 0 | nbits = atoi(p); |
442 | | |
443 | 5 | string fpid; //fingerprint type |
444 | 5 | p=pConv->IsOption("f"); |
445 | 5 | if(p) |
446 | 0 | fpid=p; |
447 | | |
448 | | //Prepare name without path |
449 | 5 | string datafilename = pConv->GetInFilename(); |
450 | 5 | if(datafilename.empty()) |
451 | 5 | { |
452 | 5 | obErrorLog.ThrowError(__FUNCTION__, "No datafile!", obError); |
453 | 5 | delete pidx; |
454 | 5 | return false; |
455 | 5 | } |
456 | 0 | string::size_type pos = datafilename.find_last_of("/\\"); |
457 | 0 | if(pos!=string::npos) |
458 | 0 | datafilename=datafilename.substr(pos+1); |
459 | |
|
460 | 0 | nmols = pConv->NumInputObjects(); |
461 | 0 | if(nmols>0) |
462 | 0 | clog << "\nIt contains " << nmols << " molecules" << flush; |
463 | 0 | if(nmols>500000) |
464 | 0 | { |
465 | 0 | istream* is = pConv->GetInStream(); |
466 | 0 | streampos origpos = is->tellg(); |
467 | 0 | is->seekg(0,ios_base::end); |
468 | 0 | long long filesize = is->tellg(); |
469 | 0 | if(sizeof(void*) < 8 && filesize > 4294967295u) |
470 | 0 | { |
471 | 0 | obErrorLog.ThrowError(__FUNCTION__, "The datafile must not be larger than 4GB", obError); |
472 | 0 | return false; |
473 | 0 | } |
474 | 0 | is->seekg(origpos); |
475 | 0 | } |
476 | 0 | sw.Start(); |
477 | |
|
478 | 0 | if(update) |
479 | 0 | { |
480 | 0 | fsi = new FastSearchIndexer(pidx, pOs, nmols);//using existing index |
481 | | |
482 | | //Seek to position in datafile of last of old objects |
483 | 0 | LastSeekpos = *(pidx->seekdata.end()-1); |
484 | 0 | pConv->GetInStream()->seekg(LastSeekpos); |
485 | 0 | } |
486 | 0 | else |
487 | 0 | fsi = new FastSearchIndexer(datafilename, pOs, fpid, nbits, nmols); |
488 | |
|
489 | 0 | obErrorLog.StopLogging(); |
490 | 0 | } |
491 | | |
492 | | //All passes provide an object for indexing |
493 | 0 | OBBase* pOb = pConv->GetChemObject(); |
494 | 0 | OBMol* pmol = dynamic_cast<OBMol*> (pOb); |
495 | 0 | if(pmol) |
496 | 0 | pmol->ConvertDativeBonds();//use standard form for dative bonds |
497 | |
|
498 | 0 | streampos seekpos = pConv->GetInPos(); |
499 | 0 | if(!update || seekpos>LastSeekpos) |
500 | 0 | { |
501 | 0 | fsi->Add(pOb, seekpos ); |
502 | 0 | if(pConv->GetOutputIndex()==400 && nmols>1000) |
503 | 0 | { |
504 | 0 | clog << " Estimated completion time "; |
505 | 0 | double secs = sw.Elapsed() * nmols / 400; // |
506 | 0 | if(secs>150) |
507 | 0 | clog << secs/60 << " minutes" << endl; |
508 | 0 | else |
509 | 0 | clog << secs << " seconds" << endl; |
510 | 0 | } |
511 | 0 | } |
512 | 0 | else |
513 | | //Don't index old objects during update. Don't increment pConv->Index. |
514 | 0 | pConv->SetOutputIndex(pConv->GetOutputIndex()-1); |
515 | |
|
516 | 0 | if(pConv->IsLast()) |
517 | 0 | { |
518 | | //Last pass |
519 | 0 | delete fsi; //saves index file |
520 | 0 | if(NewOstreamUsed) |
521 | 0 | delete pOs; |
522 | | |
523 | | //return to starting conditions |
524 | 0 | fsi=nullptr; |
525 | |
|
526 | 0 | obErrorLog.StartLogging(); |
527 | |
|
528 | 0 | double secs = sw.Elapsed(); |
529 | 0 | if(secs>150) |
530 | 0 | clog << "\n It took " << secs/60 << " minutes" << endl; |
531 | 0 | else |
532 | 0 | clog << "\n It took " << secs << " seconds" << endl; |
533 | 0 | } |
534 | 0 | delete pOb; |
535 | 0 | return true; |
536 | 5 | } |
537 | | |
538 | | /////////////////////////////////////////////////////////////// |
539 | | bool FastSearchFormat::ObtainTarget(OBConversion* pConv, vector<OBMol>& patternMols, const string& indexname) |
540 | 0 | { |
541 | | //Obtains an OBMol from: |
542 | | // the filename in the -s option or |
543 | | // the SMARTS string in the -s option or |
544 | | // by converting the file in the -S or -aS options (deprecated). |
545 | | // If there is no -s -S or -aS option, information on the index file is displayed. |
546 | |
|
547 | 0 | OBMol patternMol; |
548 | 0 | patternMol.SetIsPatternStructure(); |
549 | |
|
550 | 0 | const char* p = pConv->IsOption("s",OBConversion::GENOPTIONS); |
551 | |
|
552 | 0 | bool OldSOption=false; |
553 | | //If no -s option, make OBMol from file in -S option or -aS option (both deprecated) |
554 | 0 | if(!p) |
555 | 0 | { |
556 | 0 | p = pConv->IsOption("S",OBConversion::GENOPTIONS); |
557 | 0 | if(!p) |
558 | 0 | p = pConv->IsOption("S",OBConversion::INOPTIONS);//for GUI mainly |
559 | 0 | OldSOption = true; |
560 | 0 | } |
561 | 0 | if(p) |
562 | 0 | { |
563 | 0 | vector<string> vec; |
564 | 0 | tokenize(vec, p); |
565 | |
|
566 | 0 | if(vec.size() == 0) |
567 | 0 | { |
568 | 0 | obErrorLog.ThrowError(__FUNCTION__, |
569 | 0 | "Missing argument for -s/-S", obError); |
570 | 0 | return false; |
571 | 0 | } |
572 | | |
573 | | //ignore leading ~ (not relevant to fastsearch) |
574 | 0 | if(vec[0][0]=='~') |
575 | 0 | vec[0].erase(0,1); |
576 | |
|
577 | 0 | if(vec.size()>1 && vec[1]=="exact") |
578 | 0 | pConv->AddOption("e", OBConversion::INOPTIONS); |
579 | |
|
580 | 0 | OBConversion patternConv; |
581 | 0 | OBFormat* pFormat; |
582 | | //Interpret as a filename if possible |
583 | 0 | string& txt =vec [0]; |
584 | 0 | if( txt.empty() || |
585 | 0 | txt.find('.')==string::npos || |
586 | 0 | !(pFormat = patternConv.FormatFromExt(txt.c_str())) || |
587 | 0 | !patternConv.SetInFormat(pFormat) || |
588 | 0 | !patternConv.ReadFile(&patternMol, txt) || |
589 | 0 | patternMol.NumAtoms()==0) |
590 | | //if false, have a valid patternMol from a file |
591 | 0 | { |
592 | | //is SMARTS/SMILES |
593 | | //Replace e.g. [#6] in SMARTS by C so that it can be converted as SMILES |
594 | | //for the fingerprint phase, but allow more generality in the SMARTS phase. |
595 | 0 | for(;;) |
596 | 0 | { |
597 | 0 | string::size_type pos1, pos2; |
598 | 0 | pos1 = txt.find("[#"); |
599 | 0 | if(pos1==string::npos) |
600 | 0 | break; |
601 | 0 | pos2 = txt.find(']'); |
602 | 0 | int atno; |
603 | 0 | if(pos2!=string::npos && (atno = atoi(txt.substr(pos1+2, pos2-pos1-2).c_str())) && atno>0) |
604 | 0 | txt.replace(pos1, pos2-pos1+1, OBElements::GetSymbol(atno)); |
605 | 0 | else |
606 | 0 | { |
607 | 0 | obErrorLog.ThrowError(__FUNCTION__,"Ill-formed [#n] atom in SMARTS", obError); |
608 | 0 | return false; |
609 | 0 | } |
610 | 0 | } |
611 | | |
612 | 0 | bool hasTildeBond; |
613 | 0 | if( (hasTildeBond = (txt.find('~')!=string::npos)) ) // extra parens to indicate truth value |
614 | 0 | { |
615 | | //Find ~ bonds and make versions of query molecule with a single and aromatic bonds |
616 | | //To avoid having to parse the SMILES here, replace ~ by $ (quadruple bond) |
617 | | //and then replace this in patternMol. Check first that there are no $ already |
618 | | //Sadly, isocynanides may have $ bonds. |
619 | 0 | if(txt.find('$')!=string::npos) |
620 | 0 | { |
621 | 0 | obErrorLog.ThrowError(__FUNCTION__, |
622 | 0 | "Cannot use ~ bonds in patterns with $ (quadruple) bonds.)", obError); |
623 | 0 | return false; |
624 | 0 | } |
625 | 0 | replace(txt.begin(),txt.end(), '~' , '$'); |
626 | 0 | } |
627 | | |
628 | | //read as standard SMILES |
629 | 0 | patternConv.SetInFormat("smi"); |
630 | 0 | if(!patternConv.ReadString(&patternMol, vec[0])) |
631 | 0 | { |
632 | 0 | obErrorLog.ThrowError(__FUNCTION__,"Cannot read the SMILES string",obError); |
633 | 0 | return false; |
634 | 0 | } |
635 | 0 | if(hasTildeBond) |
636 | 0 | { |
637 | 0 | AddPattern(patternMols, patternMol, 0); //recursively add all combinations of tilde bond values |
638 | 0 | return true; |
639 | 0 | } |
640 | 0 | } |
641 | 0 | else |
642 | 0 | { |
643 | | // target(s) are in a file |
644 | 0 | patternMols.push_back(patternMol); |
645 | 0 | while(patternConv.Read(&patternMol)) |
646 | 0 | patternMols.push_back(patternMol); |
647 | 0 | return true; |
648 | 0 | } |
649 | 0 | } |
650 | | |
651 | 0 | if(OldSOption) //only when using deprecated -S and -aS options |
652 | 0 | { |
653 | | //make -s option for later SMARTS test |
654 | 0 | OBConversion conv; |
655 | 0 | if(conv.SetOutFormat("smi")) |
656 | 0 | { |
657 | 0 | string optiontext = conv.WriteString(&patternMol, true); |
658 | 0 | pConv->AddOption("s", OBConversion::GENOPTIONS, optiontext.c_str()); |
659 | 0 | } |
660 | 0 | } |
661 | |
|
662 | 0 | if(!p) |
663 | 0 | { |
664 | | //neither -s or -S options provided. Output info rather than doing search |
665 | 0 | const FptIndexHeader& header = fs.GetIndexHeader(); |
666 | 0 | string id(header.fpid); |
667 | 0 | if(id.empty()) |
668 | 0 | id = "default"; |
669 | 0 | clog << indexname << " is an index of\n " << header.datafilename |
670 | 0 | << ".\n It contains " << header.nEntries |
671 | 0 | << " molecules. The fingerprint type is " << id << " with " |
672 | 0 | << OBFingerprint::Getbitsperint() * header.words << " bits.\n" |
673 | 0 | << "Typical usage for a substructure search:\n" |
674 | 0 | << "obabel indexfile.fs -osmi -sSMILES\n" |
675 | 0 | << "(-s option in GUI is 'Convert only if match SMARTS or mols in file')" << endl; |
676 | 0 | return false; |
677 | 0 | } |
678 | | |
679 | 0 | patternMols.push_back(patternMol); |
680 | 0 | return true; |
681 | 0 | } |
682 | | |
683 | | void FastSearchFormat::AddPattern(vector<OBMol>& patternMols, OBMol patternMol, int idx) |
684 | 0 | { |
685 | | //Recursive function to generate all combinations of aromatic/single bonds for each tilde bond |
686 | | //Copying an OBMol, which happens when adding it to a vector, kekulizes it, |
687 | | // changing aromatic (bo=5) bonds. So set order after adding. Should work here, |
688 | | // but is dangerous if the vector needs to be reallocated. |
689 | |
|
690 | 0 | if(idx>=patternMol.NumBonds()) |
691 | 0 | return; |
692 | 0 | if(patternMol.GetBond(idx)->GetBondOrder()==4) |
693 | 0 | { |
694 | 0 | patternMol.GetBond(idx)->SetBondOrder(1); |
695 | 0 | patternMols.push_back(patternMol); |
696 | 0 | AddPattern(patternMols, patternMol,idx+1); |
697 | |
|
698 | 0 | patternMols.push_back(patternMol); |
699 | 0 | patternMols.back().GetBond(idx)->SetBondOrder(5); |
700 | 0 | } |
701 | 0 | AddPattern(patternMols, patternMol,idx+1); |
702 | 0 | } |
703 | | |
/* Design note on accepting ~ bonds:
   Two pattern molecules need to be generated for each '~', one with a single
   bond ('-' or nothing) and one with an aromatic bond (':'), i.e. 2^n patterns
   for n '~' bonds. SMILES format will accept ':' but cannot provide an isolated
   aromatic bond, so the OBMol has to be edited instead: retain the atom indices
   of the bond's atoms and change the bond's order.
   ObtainTarget() returns a vector of OBMol and the Find() call in
   ReadChemObject() is done for each of them. All fs matches go into
   SeekPositions. At the end this is sorted and duplicates are removed with unique.
*/
713 | | |
714 | | }//Openbabel |
715 | | |
716 | | //! \file fastsearchformat.cpp |
717 | | //! \brief Preparation and searching of fingerprint-based index files |