/src/htslib/cram/open_trace_file.c
Line | Count | Source (jump to first uncovered line) |
1 | | /* |
2 | | Author: James Bonfield |
3 | | |
4 | | Copyright (c) 2000-2001 MEDICAL RESEARCH COUNCIL |
5 | | All rights reserved |
6 | | |
7 | | Redistribution and use in source and binary forms, with or without |
8 | | modification, are permitted provided that the following conditions are met: |
9 | | |
10 | | 1. Redistributions of source code must retain the above copyright notice, |
11 | | this list of conditions and the following disclaimer. |
12 | | |
13 | | 2. Redistributions in binary form must reproduce the above copyright notice, |
14 | | this list of conditions and the following disclaimer in the documentation |
15 | | and/or other materials provided with the distribution. |
16 | | |
17 | | 3. Neither the name of the MEDICAL RESEARCH COUNCIL, THE LABORATORY OF |
18 | | MOLECULAR BIOLOGY nor the names of its contributors may be used to endorse or |
19 | | promote products derived from this software without specific prior written |
20 | | permission. |
21 | | |
22 | | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND |
23 | | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED |
24 | | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE |
25 | | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR |
26 | | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES |
27 | | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; |
28 | | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON |
29 | | ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
30 | | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS |
31 | | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
32 | | */ |
33 | | |
34 | | /* |
35 | | Copyright (c) 2008, 2009, 2013, 2014-2015, 2018-2020 Genome Research Ltd. |
36 | | Author: James Bonfield <jkb@sanger.ac.uk> |
37 | | |
38 | | Redistribution and use in source and binary forms, with or without |
39 | | modification, are permitted provided that the following conditions are met: |
40 | | |
41 | | 1. Redistributions of source code must retain the above copyright notice, |
42 | | this list of conditions and the following disclaimer. |
43 | | |
44 | | 2. Redistributions in binary form must reproduce the above copyright notice, |
45 | | this list of conditions and the following disclaimer in the documentation |
46 | | and/or other materials provided with the distribution. |
47 | | |
48 | | 3. Neither the names Genome Research Ltd and Wellcome Trust Sanger |
49 | | Institute nor the names of its contributors may be used to endorse or promote |
50 | | products derived from this software without specific prior written permission. |
51 | | |
52 | | THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND |
53 | | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED |
54 | | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE |
55 | | DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE |
56 | | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL |
57 | | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR |
58 | | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER |
59 | | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, |
60 | | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
61 | | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
62 | | */ |
63 | | |
64 | | #define HTS_BUILDING_LIBRARY // Enables HTSLIB_EXPORT, see htslib/hts_defs.h |
65 | | #include <config.h> |
66 | | |
67 | | #include <stdlib.h> |
68 | | #include <stdio.h> |
69 | | #include <string.h> |
70 | | #include <unistd.h> |
71 | | #include <limits.h> |
72 | | #include <errno.h> |
73 | | #include <sys/types.h> |
74 | | #include <sys/stat.h> |
75 | | |
76 | | #include "os.h" |
77 | | #ifndef PATH_MAX |
78 | | # define PATH_MAX 1024 |
79 | | #endif |
80 | | |
81 | | #include "open_trace_file.h" |
82 | | #include "misc.h" |
83 | | #include "../htslib/hfile.h" |
84 | | #include "../htslib/hts_log.h" |
85 | | #include "../htslib/hts.h" |
86 | | |
/*
 * Returns whether the path refers to a regular file.
 *
 * fn may be NULL (for example when the caller's path construction failed);
 * this is reported as "not a file" without calling stat().
 *
 * Returns non-zero if stat() succeeds and the result is a regular file;
 *         0 otherwise, including on any stat() failure.
 */
static int is_file(const char *fn) {
    struct stat buf;

    if (!fn)
        return 0;

    if (stat(fn, &buf) != 0)
        return 0;

    return S_ISREG(buf.st_mode);
}
95 | | |
/*
 * Returns non-zero if the text at 's' starts with one of the URL style
 * prefixes whose embedded colon must not be treated as a path separator.
 */
static int url_prefix_at(const char *s) {
    static const char *const prefixes[] = {
        "http:",     "https:",     "ftp:",
        "|http:",    "|https:",    "|ftp:",
        "URL=http:", "URL=https:", "URL=ftp:",
    };
    size_t k;

    for (k = 0; k < sizeof(prefixes) / sizeof(prefixes[0]); k++) {
        if (strncmp(s, prefixes[k], strlen(prefixes[k])) == 0)
            return 1;
    }
    return 0;
}

/*
 * Tokenises the search path splitting on colons (unix) or semicolons
 * (windows).
 * We also explicitly add a "./" to the end of the search path
 *
 * A doubled colon "::" is an escape for a single literal colon.  When the
 * separator is a colon, elements starting with http:, https: or ftp:
 * (optionally preceded by "|" or "URL=") have their scheme colon and an
 * optional ":port" colon copied through rather than being split on.
 *
 * A NULL searchpath is treated as an empty string.
 *
 * Returns: A new search path with items separated by nul chars. Two nul
 *          chars in a row represent the end of the tokenised path.
 * Returns NULL for a failure.
 *
 * The returned data has been malloced. It is up to the caller to free this
 * memory.
 */
char *tokenise_search_path(const char *searchpath) {
    const char path_sep = HTS_PATH_SEPARATOR_CHAR;
    char *newsearch;
    size_t i, j, len;

    if (!searchpath)
        searchpath = "";

    /*
     * Each output byte written in the loop consumes at least one input
     * byte, so at most len bytes are produced there.  The tail adds at
     * most 5 more: nul, '.', '/', nul, nul.
     */
    len = strlen(searchpath);
    newsearch = malloc(len + 5);
    if (!newsearch)
        return NULL;

    for (i = 0, j = 0; i < len; i++) {
        /* "::" => ":". Used for escaping colons in http://foo */
        if (i + 1 < len && searchpath[i] == ':' && searchpath[i+1] == ':') {
            newsearch[j++] = ':';
            i++;
            continue;
        }

        /* Handle http:// and ftp:// too without :: */
        if (path_sep == ':' &&
            (i == 0 || searchpath[i-1] == ':') &&
            url_prefix_at(&searchpath[i])) {

            /* Copy the scheme, up to and including its colon */
            while (i < len) {
                char c = searchpath[i++];
                newsearch[j++] = c;
                if (c == ':')
                    break;
            }

            /* Tolerate the escaped form too, e.g. "http:://" */
            if (i < len && searchpath[i] == ':')
                i++;

            /* Up to two slashes introducing the host */
            if (i < len && searchpath[i] == '/')
                newsearch[j++] = searchpath[i++];
            if (i < len && searchpath[i] == '/')
                newsearch[j++] = searchpath[i++];

            /* Look for host:port; always takes at least one character */
            if (i < len) {
                do {
                    newsearch[j++] = searchpath[i++];
                } while (i < len &&
                         searchpath[i] != ':' && searchpath[i] != '/');
            }

            /* The ':' or '/' that ended the host name */
            if (i < len)
                newsearch[j++] = searchpath[i++];
            if (i < len && searchpath[i] == ':')
                i++;

            /*
             * The URL may have used up the whole input (e.g. "http://host").
             * Stop here rather than reading or copying the terminator.
             */
            if (i >= len)
                break;
        }

        if (searchpath[i] == path_sep) {
            /* Skip blank path components */
            if (j && newsearch[j-1] != 0)
                newsearch[j++] = 0;
        } else {
            newsearch[j++] = searchpath[i];
        }
    }

    if (j)
        newsearch[j++] = 0;
    newsearch[j++] = '.';
    newsearch[j++] = '/';
    newsearch[j++] = 0;
    newsearch[j++] = 0;

    return newsearch;
}
178 | | |
179 | | static char *expand_path(const char *file, char *dirname, int max_s_digits); |
180 | | |
181 | 1 | mFILE *find_file_url(const char *file, char *url) { |
182 | 1 | char *path = NULL, buf[8192]; |
183 | 1 | mFILE *mf = NULL; |
184 | 1 | ssize_t len; |
185 | 1 | hFILE *hf = NULL; |
186 | | |
187 | | /* Expand %s for the trace name. Only one digit is allowed between |
188 | | The % and s to avoid ambiguity with percent-encoded URLs */ |
189 | | |
190 | 1 | path = expand_path(file, url, 1); |
191 | 1 | if (!path) |
192 | 0 | return NULL; |
193 | | |
194 | 1 | if (!(hf = hopen(path, "r"))) { |
195 | 1 | if (errno != ENOENT) |
196 | 0 | hts_log_warning("Failed to open reference \"%s\": %s", path, strerror(errno)); |
197 | 1 | goto fail; |
198 | 1 | } |
199 | | |
200 | 0 | if (NULL == (mf = mfcreate(NULL, 0))) |
201 | 0 | goto fail; |
202 | 0 | while ((len = hread(hf, buf, sizeof(buf))) > 0) { |
203 | 0 | if (mfwrite(buf, len, 1, mf) <= 0) { |
204 | 0 | hclose_abruptly(hf); |
205 | 0 | goto fail; |
206 | 0 | } |
207 | 0 | } |
208 | 0 | if (hclose(hf) < 0 || len < 0) { |
209 | 0 | hts_log_warning("Failed to read reference \"%s\": %s", path, strerror(errno)); |
210 | 0 | goto fail; |
211 | 0 | } |
212 | | |
213 | 0 | free(path); |
214 | 0 | mrewind(mf); |
215 | 0 | return mf; |
216 | | |
217 | 1 | fail: |
218 | 1 | mfdestroy(mf); |
219 | 1 | free(path); |
220 | 1 | return NULL; |
221 | 0 | } |
222 | | |
/*
 * Takes a dirname possibly including % rules and appends the filename
 * to it.
 *
 * One trailing '/' on dirname is ignored.  If file starts with '/', or
 * dirname is "." (or "./"), the result is simply a copy of file.
 *
 * Otherwise each "%[0-9]*s" in dirname with at most max_s_digits digits is
 * replaced by text taken from the front of file: the given number of
 * characters, or everything that is left when the number is absent or
 * zero.  Text consumed by one rule is not available to later rules.  Any
 * other '%' sequence is copied literally, up to and including the
 * character following its digits.  Only plain decimal digits are accepted
 * between the '%' and the 's'.
 *
 * Whatever remains of file afterwards is appended, preceded by a '/'.
 * An empty dirname therefore yields "/file".
 *
 * Returns expanded pathname or NULL for malloc failure.
 * The caller is responsible for freeing the returned string.
 */
static char *expand_path(const char *file, char *dirname, int max_s_digits) {
    size_t len = strlen(dirname);
    size_t lenf = strlen(file);
    size_t file_left = lenf;
    const char *dir, *dir_end;
    char *path, *out;

    /*
     * Worst expansion is DIR/FILE: every dirname byte is copied at most
     * once, the % rules emit at most lenf bytes of file between them, and
     * a '/' plus the terminating nul may be added.
     */
    path = malloc(len+lenf+2);
    if (!path) {
        hts_log_error("Out of memory");
        return NULL;
    }

    if (len > 0 && dirname[len-1] == '/')
        len--;

    /* Special case for "./" or absolute filenames */
    if (*file == '/' || (len==1 && *dirname == '.')) {
        memcpy(path, file, lenf + 1);
        return path;
    }

    dir = dirname;
    dir_end = dirname + len;
    out = path;

    /* Handle %[0-9]*s expansions, if required */
    while (dir < dir_end) {
        const char *pct = memchr(dir, '%', (size_t)(dir_end - dir));
        const char *p;
        size_t width = 0, ndigits = 0, take;

        if (!pct)
            break;

        /* Parse the optional width; saturate rather than overflow */
        for (p = pct + 1; p < dir_end && *p >= '0' && *p <= '9'; p++) {
            size_t digit = (size_t)(*p - '0');
            if (width > ((size_t)-1 - digit) / 10)
                width = (size_t)-1;
            else
                width = width * 10 + digit;
            ndigits++;
        }

        if (p >= dir_end || *p != 's' ||
            max_s_digits < 0 || ndigits > (size_t)max_s_digits) {
            /*
             * Not one of our rules (e.g. a percent-encoded URL byte).
             * Copy it as-is, including the character after the digits
             * when there is one.
             */
            const char *stop = (p < dir_end) ? p + 1 : dir_end;
            memcpy(out, dir, (size_t)(stop - dir));
            out += stop - dir;
            dir = stop;
            continue;
        }

        /* Literal text preceding the rule */
        memcpy(out, dir, (size_t)(pct - dir));
        out += pct - dir;

        /* Substitute from the front of what is left of file */
        take = file_left;
        if (width != 0 && width < take)
            take = width;
        memcpy(out, file, take);
        out += take;
        file += take;
        file_left -= take;

        dir = p + 1;
    }

    /* Remainder of dirname, excluding any stripped trailing '/' */
    memcpy(out, dir, (size_t)(dir_end - dir));
    out += dir_end - dir;
    *out = 0;

    if (*file) {
        *out++ = '/';
        memcpy(out, file, file_left + 1);
    }

    //fprintf(stderr, "*PATH=\"%s\"\n", path);
    return path;
}
286 | | |
287 | | /* |
288 | | * Searches for file in the directory 'dirname'. If it finds it, it opens |
289 | | * it. This also searches for compressed versions of the file in dirname |
290 | | * too. |
291 | | * |
292 | | * Returns mFILE pointer if found |
293 | | * NULL if not |
294 | | */ |
295 | 1 | static mFILE *find_file_dir(const char *file, char *dirname) { |
296 | 1 | char *path; |
297 | 1 | mFILE *mf = NULL; |
298 | | |
299 | 1 | path = expand_path(file, dirname, INT_MAX); |
300 | 1 | if (!path) |
301 | 0 | return NULL; |
302 | | |
303 | 1 | if (is_file(path)) |
304 | 0 | mf = mfopen(path, "rbm"); |
305 | | |
306 | 1 | free(path); |
307 | 1 | return mf; |
308 | 1 | } |
309 | | |
310 | | /* |
311 | | * ------------------------------------------------------------------------ |
312 | | * Public functions below. |
313 | | */ |
314 | | |
315 | | /* |
316 | | * Opens a trace file named 'file'. This is initially looked for as a |
317 | | * pathname relative to a file named "relative_to". This may (for |
318 | | * example) be the name of an experiment file referencing the trace |
319 | | * file. In this case by passing relative_to as the experiment file |
320 | | * filename the trace file will be picked up in the same directory as |
321 | | * the experiment file. Relative_to may be supplied as NULL. |
322 | | * |
323 | | * 'file' is looked for at relative_to, then the current directory, and then |
324 | | * all of the locations listed in 'path' (which is a colon separated list). |
325 | | * If 'path' is NULL it uses the RAWDATA environment variable instead. |
326 | | * |
327 | | * Returns a mFILE pointer when found. |
328 | | * NULL otherwise. |
329 | | */ |
330 | 1 | mFILE *open_path_mfile(const char *file, char *path, char *relative_to) { |
331 | 1 | char *newsearch; |
332 | 1 | char *ele; |
333 | 1 | mFILE *fp; |
334 | | |
335 | | /* Use path first */ |
336 | 1 | if (!path) |
337 | 0 | path = getenv("RAWDATA"); |
338 | 1 | if (NULL == (newsearch = tokenise_search_path(path))) |
339 | 0 | return NULL; |
340 | | |
341 | | /* |
342 | | * Step through the search path testing out each component. |
343 | | * We now look through each path element treating some prefixes as |
344 | | * special, otherwise we treat the element as a directory. |
345 | | */ |
346 | 3 | for (ele = newsearch; *ele; ele += strlen(ele)+1) { |
347 | 2 | char *ele2; |
348 | | |
349 | | /* |
350 | | * '|' prefixing a path component indicates that we do not |
351 | | * wish to perform the compression extension searching in that |
352 | | * location. |
353 | | * |
354 | | * NB: this has been removed from the htslib implementation. |
355 | | */ |
356 | 2 | if (*ele == '|') { |
357 | 0 | ele2 = ele+1; |
358 | 2 | } else { |
359 | 2 | ele2 = ele; |
360 | 2 | } |
361 | | |
362 | 2 | if (0 == strncmp(ele2, "URL=", 4)) { |
363 | 0 | if ((fp = find_file_url(file, ele2+4))) { |
364 | 0 | free(newsearch); |
365 | 0 | return fp; |
366 | 0 | } |
367 | 2 | } else if (!strncmp(ele2, "http:", 5) || |
368 | 2 | !strncmp(ele2, "https:", 6) || |
369 | 2 | !strncmp(ele2, "ftp:", 4)) { |
370 | 1 | if ((fp = find_file_url(file, ele2))) { |
371 | 0 | free(newsearch); |
372 | 0 | return fp; |
373 | 0 | } |
374 | 1 | } else if ((fp = find_file_dir(file, ele2))) { |
375 | 0 | free(newsearch); |
376 | 0 | return fp; |
377 | 0 | } |
378 | 2 | } |
379 | | |
380 | 1 | free(newsearch); |
381 | | |
382 | | /* Look in the same location as the incoming 'relative_to' filename */ |
383 | 1 | if (relative_to) { |
384 | 0 | char *cp; |
385 | 0 | char relative_path[PATH_MAX+1]; |
386 | 0 | strcpy(relative_path, relative_to); |
387 | 0 | if ((cp = strrchr(relative_path, '/'))) |
388 | 0 | *cp = 0; |
389 | 0 | if ((fp = find_file_dir(file, relative_path))) |
390 | 0 | return fp; |
391 | 0 | } |
392 | | |
393 | 1 | return NULL; |
394 | 1 | } |
395 | | |
396 | | |
/*
 * As per open_path_mfile, but searching only for local filenames.
 * This is useful as we may avoid doing a full mfopen and loading
 * the entire file into memory.
 *
 * Elements of 'path' starting with "URL=", "http:", "https:" or "ftp:"
 * are skipped.  If 'path' is NULL the RAWDATA environment variable is
 * used instead.
 *
 * Returns the expanded pathname if found.  It has been allocated and
 *         must be freed by the caller.
 *         NULL if not found, or on memory allocation failure.
 */
char *find_path(const char *file, const char *path) {
    char *newsearch;
    char *ele;

    /* Use path first */
    if (!path)
        path = getenv("RAWDATA");
    if (NULL == (newsearch = tokenise_search_path(path)))
        return NULL;

    for (ele = newsearch; *ele; ele += strlen(ele)+1) {
        char *ele2 = (*ele == '|') ? ele+1 : ele;
        char *outpath;

        if (!strncmp(ele2, "URL=", 4) ||
            !strncmp(ele2, "http:", 5) ||
            !strncmp(ele2, "https:", 6) ||
            !strncmp(ele2, "ftp:", 4)) {
            continue;
        }

        outpath = expand_path(file, ele2, INT_MAX);
        if (!outpath)
            break; /* out of memory; no point trying further elements */

        if (is_file(outpath)) {
            free(newsearch);
            return outpath;
        }

        free(outpath);
    }

    free(newsearch);

    return NULL;
}