/src/htslib/cram/open_trace_file.c
Line | Count | Source (jump to first uncovered line) |
1 | | /* |
2 | | Author: James Bonfield |
3 | | |
4 | | Copyright (c) 2000-2001 MEDICAL RESEARCH COUNCIL |
5 | | All rights reserved |
6 | | |
7 | | Redistribution and use in source and binary forms, with or without |
8 | | modification, are permitted provided that the following conditions are met: |
9 | | |
10 | | 1. Redistributions of source code must retain the above copyright notice, |
11 | | this list of conditions and the following disclaimer. |
12 | | |
13 | | 2. Redistributions in binary form must reproduce the above copyright notice, |
14 | | this list of conditions and the following disclaimer in the documentation |
15 | | and/or other materials provided with the distribution. |
16 | | |
17 | | 3. Neither the name of the MEDICAL RESEARCH COUNCIL, THE LABORATORY OF |
18 | | MOLECULAR BIOLOGY nor the names of its contributors may be used to endorse or |
19 | | promote products derived from this software without specific prior written |
20 | | permission. |
21 | | |
22 | | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND |
23 | | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED |
24 | | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE |
25 | | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR |
26 | | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES |
27 | | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; |
28 | | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON |
29 | | ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
30 | | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS |
31 | | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
32 | | */ |
33 | | |
34 | | /* |
35 | | Copyright (c) 2008, 2009, 2013, 2014-2015, 2018-2020 Genome Research Ltd. |
36 | | Author: James Bonfield <jkb@sanger.ac.uk> |
37 | | |
38 | | Redistribution and use in source and binary forms, with or without |
39 | | modification, are permitted provided that the following conditions are met: |
40 | | |
41 | | 1. Redistributions of source code must retain the above copyright notice, |
42 | | this list of conditions and the following disclaimer. |
43 | | |
44 | | 2. Redistributions in binary form must reproduce the above copyright notice, |
45 | | this list of conditions and the following disclaimer in the documentation |
46 | | and/or other materials provided with the distribution. |
47 | | |
48 | | 3. Neither the names Genome Research Ltd and Wellcome Trust Sanger |
49 | | Institute nor the names of its contributors may be used to endorse or promote |
50 | | products derived from this software without specific prior written permission. |
51 | | |
52 | | THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND |
53 | | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED |
54 | | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE |
55 | | DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE |
56 | | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL |
57 | | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR |
58 | | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER |
59 | | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, |
60 | | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
61 | | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
62 | | */ |
63 | | |
64 | | #define HTS_BUILDING_LIBRARY // Enables HTSLIB_EXPORT, see htslib/hts_defs.h |
65 | | #include <config.h> |
66 | | |
67 | | #include <stdlib.h> |
68 | | #include <stdio.h> |
69 | | #include <string.h> |
70 | | #include <unistd.h> |
71 | | #include <limits.h> |
72 | | #include <errno.h> |
73 | | #include <sys/types.h> |
74 | | #include <sys/stat.h> |
75 | | |
76 | | #include "os.h" |
77 | | #ifndef PATH_MAX |
78 | | # define PATH_MAX 1024 |
79 | | #endif |
80 | | |
81 | | #include "open_trace_file.h" |
82 | | #include "misc.h" |
83 | | #include "../htslib/hfile.h" |
84 | | #include "../htslib/hts_log.h" |
85 | | #include "../htslib/hts.h" |
86 | | |
/*
 * Tests whether 'fn' names an existing regular file.
 *
 * Returns non-zero when stat() succeeds and reports a regular file;
 *         0 when stat() fails or the path is some other kind of object.
 */
static int is_file(char *fn) {
    struct stat st;

    return stat(fn, &st) == 0 ? S_ISREG(st.st_mode) : 0;
}
95 | | |
96 | | /* |
97 | | * Tokenises the search path splitting on colons (unix) or semicolons |
98 | | * (windows). |
99 | | * We also explicitly add a "./" to the end of the search path |
100 | | * |
101 | | * Returns: A new search path with items separated by nul chars. Two nul |
102 | | * chars in a row represent the end of the tokenised path. |
103 | | * Returns NULL for a failure. |
104 | | * |
105 | | * The returned data has been malloced. It is up to the caller to free this |
106 | | * memory. |
107 | | */ |
108 | 12 | char *tokenise_search_path(const char *searchpath) { |
109 | 12 | char *newsearch; |
110 | 12 | unsigned int i, j; |
111 | 12 | size_t len; |
112 | 12 | char path_sep = HTS_PATH_SEPARATOR_CHAR; |
113 | | |
114 | 12 | if (!searchpath) |
115 | 12 | searchpath=""; |
116 | | |
117 | 12 | newsearch = (char *)malloc((len = strlen(searchpath))+5); |
118 | 12 | if (!newsearch) |
119 | 0 | return NULL; |
120 | | |
121 | 12 | for (i = 0, j = 0; i < len; i++) { |
122 | | /* "::" => ":". Used for escaping colons in http://foo */ |
123 | 0 | if (i < len-1 && searchpath[i] == ':' && searchpath[i+1] == ':') { |
124 | 0 | newsearch[j++] = ':'; |
125 | 0 | i++; |
126 | 0 | continue; |
127 | 0 | } |
128 | | |
129 | | /* Handle http:// and ftp:// too without :: */ |
130 | 0 | if (path_sep == ':') { |
131 | 0 | if ((i == 0 || (i > 0 && searchpath[i-1] == ':')) && |
132 | 0 | (!strncmp(&searchpath[i], "http:", 5) || |
133 | 0 | !strncmp(&searchpath[i], "https:", 6) || |
134 | 0 | !strncmp(&searchpath[i], "ftp:", 4) || |
135 | 0 | !strncmp(&searchpath[i], "|http:", 6) || |
136 | 0 | !strncmp(&searchpath[i], "|https:", 7) || |
137 | 0 | !strncmp(&searchpath[i], "|ftp:", 5) || |
138 | 0 | !strncmp(&searchpath[i], "URL=http:", 9) || |
139 | 0 | !strncmp(&searchpath[i], "URL=https:",10)|| |
140 | 0 | !strncmp(&searchpath[i], "URL=ftp:", 8))) { |
141 | 0 | do { |
142 | 0 | newsearch[j++] = searchpath[i]; |
143 | 0 | } while (i<len && searchpath[i++] != ':'); |
144 | 0 | if (searchpath[i] == ':') |
145 | 0 | i++; |
146 | 0 | if (searchpath[i]=='/') |
147 | 0 | newsearch[j++] = searchpath[i++]; |
148 | 0 | if (searchpath[i]=='/') |
149 | 0 | newsearch[j++] = searchpath[i++]; |
150 | | // Look for host:port |
151 | 0 | do { |
152 | 0 | newsearch[j++] = searchpath[i++]; |
153 | 0 | } while (i<len && searchpath[i] != ':' && searchpath[i] != '/'); |
154 | 0 | newsearch[j++] = searchpath[i++]; |
155 | 0 | if (searchpath[i] == ':') |
156 | 0 | i++; |
157 | 0 | } |
158 | 0 | } |
159 | |
|
160 | 0 | if (searchpath[i] == path_sep) { |
161 | | /* Skip blank path components */ |
162 | 0 | if (j && newsearch[j-1] != 0) |
163 | 0 | newsearch[j++] = 0; |
164 | 0 | } else { |
165 | 0 | newsearch[j++] = searchpath[i]; |
166 | 0 | } |
167 | 0 | } |
168 | | |
169 | 12 | if (j) |
170 | 0 | newsearch[j++] = 0; |
171 | 12 | newsearch[j++] = '.'; |
172 | 12 | newsearch[j++] = '/'; |
173 | 12 | newsearch[j++] = 0; |
174 | 12 | newsearch[j++] = 0; |
175 | | |
176 | 12 | return newsearch; |
177 | 12 | } |
178 | | |
179 | | static char *expand_path(const char *file, const char *dirname, |
180 | | int max_s_digits); |
181 | | |
182 | 0 | mFILE *find_file_url(const char *file, char *url) { |
183 | 0 | char *path = NULL, buf[8192]; |
184 | 0 | mFILE *mf = NULL; |
185 | 0 | ssize_t len; |
186 | 0 | hFILE *hf = NULL; |
187 | | |
188 | | /* Expand %s for the trace name. Only one digit is allowed between |
189 | | The % and s to avoid ambiguity with percent-encoded URLs */ |
190 | |
|
191 | 0 | path = expand_path(file, url, 1); |
192 | 0 | if (!path) |
193 | 0 | return NULL; |
194 | | |
195 | 0 | if (!(hf = hopen(path, "r"))) { |
196 | 0 | if (errno != ENOENT) |
197 | 0 | hts_log_warning("Failed to open reference \"%s\": %s", path, strerror(errno)); |
198 | 0 | goto fail; |
199 | 0 | } |
200 | | |
201 | 0 | if (NULL == (mf = mfcreate(NULL, 0))) |
202 | 0 | goto fail; |
203 | 0 | while ((len = hread(hf, buf, sizeof(buf))) > 0) { |
204 | 0 | if (mfwrite(buf, len, 1, mf) <= 0) { |
205 | 0 | hclose_abruptly(hf); |
206 | 0 | goto fail; |
207 | 0 | } |
208 | 0 | } |
209 | 0 | if (hclose(hf) < 0 || len < 0) { |
210 | 0 | hts_log_warning("Failed to read reference \"%s\": %s", path, strerror(errno)); |
211 | 0 | goto fail; |
212 | 0 | } |
213 | | |
214 | 0 | free(path); |
215 | 0 | mrewind(mf); |
216 | 0 | return mf; |
217 | | |
218 | 0 | fail: |
219 | 0 | mfdestroy(mf); |
220 | 0 | free(path); |
221 | 0 | return NULL; |
222 | 0 | } |
223 | | |
/*
 * Builds a pathname from 'dirname' and 'file'.
 *
 * Each "%s" or "%<digits>s" in dirname (with at most max_s_digits
 * characters between the '%' and the 's') is replaced by the next
 * <digits> characters of file, or by all that is left of file when no
 * non-zero length is given.  Any other '%' sequence is copied unchanged.
 * Whatever remains of file afterwards is appended, preceded by a '/' if
 * the output so far is non-empty and does not already end in one.
 *
 * Trailing '/'s on dirname are ignored (a lone "/" is kept).  When file
 * starts with '/', or dirname is "." once trailing slashes are ignored,
 * the result is simply a copy of file.
 *
 * Returns the expanded pathname, allocated with malloc(), or NULL for
 * malloc failure.
 */
static char *expand_path(const char *file, const char *dirname,
                         int max_s_digits) {
    size_t dir_len = strlen(dirname);
    size_t file_len = strlen(file);
    const char *dir_end;
    const char *pct;
    char *result, *out;

    // Longest possible result is DIR + '/' + FILE + nul
    result = malloc(dir_len + file_len + 2);
    if (result == NULL) {
        hts_log_error("Out of memory");
        return NULL;
    }

    // Ignore trailing '/'s, unless the whole of dirname is "/"
    while (dir_len > 1 && dirname[dir_len - 1] == '/')
        dir_len--;
    dir_end = dirname + dir_len;

    /* "./" and absolute filenames need no expansion at all */
    if (*file == '/' || (dir_len == 1 && *dirname == '.')) {
        memcpy(result, file, file_len + 1);
        return result;
    }

    /* Work through the %[0-9]*s rules, one '%' at a time */
    out = result;
    for (pct = strchr(dirname, '%'); pct != NULL;
         pct = strchr(dirname, '%')) {
        char *num_end;
        size_t take;
        // Optional length between the '%' and the 's'
        long width = strtol(pct + 1, &num_end, 10);

        if (*num_end != 's' || width < 0
            || num_end - pct - 1 > max_s_digits) {
            // Not a rule.  Copy it through literally, clamping so that a
            // dirname ending in '%' or '%[0-9]*' is not overrun.
            const char *stop = num_end + 1;
            if (!(stop < dir_end))
                stop = dir_end;
            memcpy(out, dirname, stop - dirname);
            out += stop - dirname;
            dirname = stop;
            continue;
        }

        // Literal text preceding the '%'
        memcpy(out, dirname, pct - dirname);
        out += pct - dirname;

        // The piece of file selected by this rule
        take = file_len;
        if (width > 0 && (size_t) width < file_len)
            take = (size_t) width;
        memcpy(out, file, take);
        out      += take;
        file     += take;
        file_len -= take;

        // Carry on after the 's'
        dirname = num_end + 1;
    }

    // Whatever is left of dirname
    if (dirname < dir_end) {
        memcpy(out, dirname, dir_end - dirname);
        out += dir_end - dirname;
    }

    // Whatever is left of file, as a further path component
    if (*file) {
        if (out > result && out[-1] != '/')
            *out++ = '/';
        memcpy(out, file, file_len);
        out += file_len;
    }

    *out = '\0';
    return result;
}
305 | | |
306 | | /* |
307 | | * Searches for file in the directory 'dirname'. If it finds it, it opens |
308 | | * it. This also searches for compressed versions of the file in dirname |
309 | | * too. |
310 | | * |
311 | | * Returns mFILE pointer if found |
312 | | * NULL if not |
313 | | */ |
314 | 12 | static mFILE *find_file_dir(const char *file, char *dirname) { |
315 | 12 | char *path; |
316 | 12 | mFILE *mf = NULL; |
317 | | |
318 | 12 | path = expand_path(file, dirname, INT_MAX); |
319 | 12 | if (!path) |
320 | 0 | return NULL; |
321 | | |
322 | 12 | if (is_file(path)) |
323 | 0 | mf = mfopen(path, "rbm"); |
324 | | |
325 | 12 | free(path); |
326 | 12 | return mf; |
327 | 12 | } |
328 | | |
329 | | /* |
330 | | * ------------------------------------------------------------------------ |
331 | | * Public functions below. |
332 | | */ |
333 | | |
334 | | /* |
335 | | * Opens a trace file named 'file'. This is initially looked for as a |
336 | | * pathname relative to a file named "relative_to". This may (for |
337 | | * example) be the name of an experiment file referencing the trace |
338 | | * file. In this case by passing relative_to as the experiment file |
339 | | * filename the trace file will be picked up in the same directory as |
340 | | * the experiment file. Relative_to may be supplied as NULL. |
341 | | * |
342 | | * 'file' is looked for at relative_to, then the current directory, and then |
343 | | * all of the locations listed in 'path' (which is a colon separated list). |
344 | | * If 'path' is NULL it uses the RAWDATA environment variable instead. |
345 | | * |
346 | | * If non-NULL *local is filled out to 1 for a local file and 0 for a remote |
347 | | * URL. |
348 | | * |
349 | | * Returns a mFILE pointer when found. |
350 | | * NULL otherwise. |
351 | | */ |
352 | | mFILE *open_path_mfile(const char *file, char *path, char *relative_to, |
353 | 12 | int *local) { |
354 | 12 | char *newsearch; |
355 | 12 | char *ele; |
356 | 12 | mFILE *fp; |
357 | | |
358 | 12 | if (local) |
359 | 12 | *local = 1; |
360 | | |
361 | | /* Use path first */ |
362 | 12 | if (!path) |
363 | 12 | path = getenv("RAWDATA"); |
364 | 12 | if (NULL == (newsearch = tokenise_search_path(path))) |
365 | 0 | return NULL; |
366 | | |
367 | | /* |
368 | | * Step through the search path testing out each component. |
369 | | * We now look through each path element treating some prefixes as |
370 | | * special, otherwise we treat the element as a directory. |
371 | | */ |
372 | 24 | for (ele = newsearch; *ele; ele += strlen(ele)+1) { |
373 | 12 | char *ele2; |
374 | | |
375 | | /* |
376 | | * '|' prefixing a path component indicates that we do not |
377 | | * wish to perform the compression extension searching in that |
378 | | * location. |
379 | | * |
380 | | * NB: this has been removed from the htslib implementation. |
381 | | */ |
382 | 12 | if (*ele == '|') { |
383 | 0 | ele2 = ele+1; |
384 | 12 | } else { |
385 | 12 | ele2 = ele; |
386 | 12 | } |
387 | | |
388 | 12 | if (0 == strncmp(ele2, "URL=", 4)) { |
389 | 0 | if ((fp = find_file_url(file, ele2+4))) { |
390 | 0 | if (local) |
391 | 0 | *local = strncmp(ele2+4, "file:", 5) == 0 ? 1 : 0; |
392 | 0 | free(newsearch); |
393 | 0 | return fp; |
394 | 0 | } |
395 | 12 | } else if (hisremote(ele2)) { |
396 | 0 | if ((fp = find_file_url(file, ele2))) { |
397 | 0 | free(newsearch); |
398 | 0 | if (local) |
399 | 0 | *local = 0; |
400 | 0 | return fp; |
401 | 0 | } |
402 | 12 | } else if ((fp = find_file_dir(file, ele2))) { |
403 | 0 | free(newsearch); |
404 | 0 | return fp; |
405 | 0 | } |
406 | 12 | } |
407 | | |
408 | 12 | free(newsearch); |
409 | | |
410 | | /* Look in the same location as the incoming 'relative_to' filename */ |
411 | 12 | if (relative_to) { |
412 | 0 | char *cp; |
413 | 0 | char relative_path[PATH_MAX+1]; |
414 | 0 | strcpy(relative_path, relative_to); |
415 | 0 | if ((cp = strrchr(relative_path, '/'))) |
416 | 0 | *cp = 0; |
417 | 0 | if ((fp = find_file_dir(file, relative_path))) |
418 | 0 | return fp; |
419 | 0 | } |
420 | | |
421 | 12 | return NULL; |
422 | 12 | } |
423 | | |
424 | | |
/*
 * As per open_path_mfile, but searching only for local filenames and
 * without opening the file.  Elements of 'path' starting with "URL=",
 * "http:", "https:" or "ftp:" (after an optional leading '|') are skipped.
 * If 'path' is NULL the RAWDATA environment variable is used instead.
 *
 * Returns the expanded pathname if found.  This has been malloced; it is
 *         up to the caller to free it.
 *         NULL if not found, or on memory allocation failure.
 */
char *find_path(const char *file, const char *path) {
    char *newsearch;
    char *ele;
    char *outpath = NULL;

    /* Use path first */
    if (!path)
        path = getenv("RAWDATA");
    if (NULL == (newsearch = tokenise_search_path(path)))
        return NULL;

    for (ele = newsearch; *ele; ele += strlen(ele)+1) {
        char *ele2 = (*ele == '|') ? ele+1 : ele;

        /* Remote locations cannot yield a local pathname */
        if (!strncmp(ele2, "URL=", 4) ||
            !strncmp(ele2, "http:", 5) ||
            !strncmp(ele2, "https:", 6) ||
            !strncmp(ele2, "ftp:", 4)) {
            continue;
        }

        outpath = expand_path(file, ele2, INT_MAX);
        if (!outpath) {
            /* Allocation failed; never hand a NULL path to is_file() */
            free(newsearch);
            return NULL;
        }

        if (is_file(outpath)) {
            free(newsearch);
            return outpath;
        }

        free(outpath);
    }

    free(newsearch);

    return NULL;
}