Line | Count | Source (jump to first uncovered line) |
1 | | /* File retrieval. |
2 | | Copyright (C) 1996-2011, 2014-2015, 2018-2024 Free Software |
3 | | Foundation, Inc. |
4 | | |
5 | | This file is part of GNU Wget. |
6 | | |
7 | | GNU Wget is free software; you can redistribute it and/or modify |
8 | | it under the terms of the GNU General Public License as published by |
9 | | the Free Software Foundation; either version 3 of the License, or (at |
10 | | your option) any later version. |
11 | | |
12 | | GNU Wget is distributed in the hope that it will be useful, |
13 | | but WITHOUT ANY WARRANTY; without even the implied warranty of |
14 | | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
15 | | GNU General Public License for more details. |
16 | | |
17 | | You should have received a copy of the GNU General Public License |
18 | | along with Wget. If not, see <http://www.gnu.org/licenses/>. |
19 | | |
20 | | Additional permission under GNU GPL version 3 section 7 |
21 | | |
22 | | If you modify this program, or any covered work, by linking or |
23 | | combining it with the OpenSSL project's OpenSSL library (or a |
24 | | modified version of that library), containing parts covered by the |
25 | | terms of the OpenSSL or SSLeay licenses, the Free Software Foundation |
26 | | grants you additional permission to convey the resulting work. |
27 | | Corresponding Source for a non-source form of such a combination |
28 | | shall include the source code for the parts of OpenSSL used as well |
29 | | as that of the covered work. */ |
30 | | |
31 | | #include "wget.h" |
32 | | |
33 | | #include <stdio.h> |
34 | | #include <stdlib.h> |
35 | | #include <unistd.h> |
36 | | #include <errno.h> |
37 | | #include <string.h> |
38 | | #include <assert.h> |
39 | | #ifdef VMS |
40 | | # include <unixio.h> /* For delete(). */ |
41 | | #endif |
42 | | |
43 | | #ifdef HAVE_LIBZ |
44 | | # include <zlib.h> |
45 | | #endif |
46 | | |
47 | | #ifdef HAVE_LIBPROXY |
48 | | # include "proxy.h" |
49 | | #endif |
50 | | |
51 | | #include "exits.h" |
52 | | #include "utils.h" |
53 | | #include "retr.h" |
54 | | #include "progress.h" |
55 | | #include "url.h" |
56 | | #include "recur.h" |
57 | | #include "ftp.h" |
58 | | #include "http.h" |
59 | | #include "host.h" |
60 | | #include "connect.h" |
61 | | #include "convert.h" |
62 | | #include "ptimer.h" |
63 | | #include "html-url.h" |
64 | | #include "iri.h" |
65 | | #include "hsts.h" |
66 | | |
67 | | /* Total size of downloaded files. Used to enforce quota. */ |
68 | | wgint total_downloaded_bytes; |
69 | | |
70 | | /* Total download time in seconds. */ |
71 | | double total_download_time; |
72 | | |
73 | | /* If non-NULL, the stream to which output should be written. This |
74 | | stream is initialized when `-O' is used. */ |
75 | | FILE *output_stream; |
76 | | |
77 | | /* Whether output_document is a regular file we can manipulate, |
78 | | i.e. not `-' or a device file. */ |
79 | | bool output_stream_regular; |
80 | | |
81 | | static struct { |
82 | | wgint chunk_bytes; |
83 | | double chunk_start; |
84 | | double sleep_adjust; |
85 | | } limit_data; |
86 | | |
87 | | static void |
88 | | limit_bandwidth_reset (void) |
89 | 0 | { |
90 | 0 | xzero (limit_data); |
91 | 0 | } |
92 | | |
#ifdef HAVE_LIBZ
/* Allocation hook installed in z_stream.zalloc: hand zlib a block of
   ITEMS * SIZE bytes obtained from xcalloc.  OPAQUE is unused.  */
static voidpf
zalloc (voidpf opaque, unsigned int items, unsigned int size)
{
  voidpf block = (voidpf) xcalloc (items, size);

  (void) opaque;
  return block;
}

/* Deallocation hook installed in z_stream.zfree: release ADDRESS,
   which was handed out by zalloc.  OPAQUE is unused.  */
static void
zfree (voidpf opaque, voidpf address)
{
  (void) opaque;

  xfree (address);
}
#endif
108 | | |
109 | | /* Limit the bandwidth by pausing the download for an amount of time. |
110 | | BYTES is the number of bytes received from the network, and TIMER |
111 | | is the timer that started at the beginning of download. */ |
112 | | |
113 | | static void |
114 | | limit_bandwidth (wgint bytes, struct ptimer *timer) |
115 | 0 | { |
116 | 0 | double delta_t = ptimer_read (timer) - limit_data.chunk_start; |
117 | 0 | double expected; |
118 | |
|
119 | 0 | limit_data.chunk_bytes += bytes; |
120 | | |
121 | | /* Calculate the amount of time we expect downloading the chunk |
122 | | should take. If in reality it took less time, sleep to |
123 | | compensate for the difference. */ |
124 | 0 | expected = (double) limit_data.chunk_bytes / opt.limit_rate; |
125 | |
|
126 | 0 | if (expected > delta_t) |
127 | 0 | { |
128 | 0 | double slp = expected - delta_t + limit_data.sleep_adjust; |
129 | 0 | double t0, t1; |
130 | 0 | if (slp < 0.2) |
131 | 0 | { |
132 | 0 | DEBUGP (("deferring a %.2f ms sleep (%s/%.2f).\n", |
133 | 0 | slp * 1000, number_to_static_string (limit_data.chunk_bytes), |
134 | 0 | delta_t)); |
135 | 0 | return; |
136 | 0 | } |
137 | 0 | DEBUGP (("\nsleeping %.2f ms for %s bytes, adjust %.2f ms\n", |
138 | 0 | slp * 1000, number_to_static_string (limit_data.chunk_bytes), |
139 | 0 | limit_data.sleep_adjust)); |
140 | |
|
141 | 0 | t0 = ptimer_read (timer); |
142 | 0 | xsleep (slp); |
143 | 0 | t1 = ptimer_measure (timer); |
144 | | |
145 | | /* Due to scheduling, we probably slept slightly longer (or |
146 | | shorter) than desired. Calculate the difference between the |
147 | | desired and the actual sleep, and adjust the next sleep by |
148 | | that amount. */ |
149 | 0 | limit_data.sleep_adjust = slp - (t1 - t0); |
150 | | /* If sleep_adjust is very large, it's likely due to suspension |
151 | | and not clock inaccuracy. Don't enforce those. */ |
152 | 0 | if (limit_data.sleep_adjust > 0.5) |
153 | 0 | limit_data.sleep_adjust = 0.5; |
154 | 0 | else if (limit_data.sleep_adjust < -0.5) |
155 | 0 | limit_data.sleep_adjust = -0.5; |
156 | 0 | } |
157 | | |
158 | 0 | limit_data.chunk_bytes = 0; |
159 | 0 | limit_data.chunk_start = ptimer_read (timer); |
160 | 0 | } |
161 | | |
162 | | /* Write data in BUF to OUT. However, if *SKIP is non-zero, skip that |
163 | | amount of data and decrease SKIP. Increment *TOTAL by the amount |
164 | | of data written. If OUT2 is not NULL, also write BUF to OUT2. |
165 | | In case of error writing to OUT, -2 is returned. In case of error |
166 | | writing to OUT2, -3 is returned. Return 1 if the whole BUF was |
167 | | skipped. */ |
168 | | |
169 | | static int |
170 | | write_data (FILE *out, FILE *out2, const char *buf, int bufsize, |
171 | | wgint *skip, wgint *written) |
172 | 0 | { |
173 | 0 | if (out == NULL && out2 == NULL) |
174 | 0 | return 1; |
175 | | |
176 | 0 | if (skip) |
177 | 0 | { |
178 | 0 | if (*skip > bufsize) |
179 | 0 | { |
180 | 0 | *skip -= bufsize; |
181 | 0 | return 1; |
182 | 0 | } |
183 | 0 | if (*skip) |
184 | 0 | { |
185 | 0 | buf += *skip; |
186 | 0 | bufsize -= *skip; |
187 | 0 | *skip = 0; |
188 | 0 | if (bufsize == 0) |
189 | 0 | return 1; |
190 | 0 | } |
191 | 0 | } |
192 | | |
193 | 0 | if (out) |
194 | 0 | fwrite (buf, 1, bufsize, out); |
195 | 0 | if (out2) |
196 | 0 | fwrite (buf, 1, bufsize, out2); |
197 | |
|
198 | 0 | if (written) |
199 | 0 | *written += bufsize; |
200 | | |
201 | | /* Immediately flush the downloaded data. This should not hinder |
202 | | performance: fast downloads will arrive in large 16K chunks |
203 | | (which stdio would write out immediately anyway), and slow |
204 | | downloads wouldn't be limited by disk speed. */ |
205 | | |
206 | | /* 2005-04-20 SMS. |
207 | | Perhaps it shouldn't hinder performance, but it sure does, at least |
208 | | on VMS (more than 2X). Rather than speculate on what it should or |
209 | | shouldn't do, it might make more sense to test it. Even better, it |
210 | | might be nice to explain what possible benefit it could offer, as |
211 | | it appears to be a clear invitation to poor performance with no |
212 | | actual justification. (Also, why 16K? Anyone test other values?) |
213 | | */ |
214 | 0 | #ifndef __VMS |
215 | 0 | if (out) |
216 | 0 | fflush (out); |
217 | 0 | if (out2) |
218 | 0 | fflush (out2); |
219 | 0 | #endif /* ndef __VMS */ |
220 | |
|
221 | 0 | if (out && ferror (out)) |
222 | 0 | return -2; |
223 | 0 | else if (out2 && ferror (out2)) |
224 | 0 | return -3; |
225 | | |
226 | 0 | return 0; |
227 | 0 | } |
228 | | |
229 | | /* Read the contents of file descriptor FD until it the connection |
230 | | terminates or a read error occurs. The data is read in portions of |
231 | | up to 16K and written to OUT as it arrives. If opt.verbose is set, |
232 | | the progress is shown. |
233 | | |
234 | | TOREAD is the amount of data expected to arrive, normally only used |
235 | | by the progress gauge. |
236 | | |
237 | | STARTPOS is the position from which the download starts, used by |
238 | | the progress gauge. If QTYREAD is non-NULL, the value it points to |
239 | | is incremented by the amount of data read from the network. If |
240 | | QTYWRITTEN is non-NULL, the value it points to is incremented by |
241 | | the amount of data written to disk. The time it took to download |
242 | | the data is stored to ELAPSED. |
243 | | |
244 | | If OUT2 is non-NULL, the contents is also written to OUT2. |
245 | | OUT2 will get an exact copy of the response: if this is a chunked |
246 | | response, everything -- including the chunk headers -- is written |
247 | | to OUT2. (OUT will only get the unchunked response.) |
248 | | |
249 | | The function exits and returns the amount of data read. In case of |
250 | | error while reading data, -1 is returned. In case of error while |
251 | | writing data to OUT, -2 is returned. In case of error while writing |
252 | | data to OUT2, -3 is returned. */ |
253 | | |
254 | | int |
255 | | fd_read_body (const char *downloaded_filename, int fd, FILE *out, wgint toread, wgint startpos, |
256 | | |
257 | | wgint *qtyread, wgint *qtywritten, double *elapsed, int flags, |
258 | | FILE *out2) |
259 | 0 | { |
260 | 0 | int ret = 0; |
261 | 0 | int dlbufsize = MAX (BUFSIZ, 64 * 1024); |
262 | 0 | char *dlbuf = xmalloc (dlbufsize); |
263 | |
|
264 | 0 | struct ptimer *timer = NULL; |
265 | 0 | double last_successful_read_tm = 0; |
266 | | |
267 | | /* The progress gauge, set according to the user preferences. */ |
268 | 0 | void *progress = NULL; |
269 | | |
270 | | /* Non-zero if the progress gauge is interactive, i.e. if it can |
271 | | continually update the display. When true, smaller timeout |
272 | | values are used so that the gauge can update the display when |
273 | | data arrives slowly. */ |
274 | 0 | bool progress_interactive = false; |
275 | |
|
276 | 0 | bool exact = !!(flags & rb_read_exactly); |
277 | | |
278 | | /* Used only by HTTP/HTTPS chunked transfer encoding. */ |
279 | 0 | bool chunked = flags & rb_chunked_transfer_encoding; |
280 | 0 | wgint skip = 0; |
281 | | |
282 | | /* How much data we've read/written. */ |
283 | 0 | wgint sum_read = 0; |
284 | 0 | wgint sum_written = 0; |
285 | 0 | wgint remaining_chunk_size = 0; |
286 | |
|
287 | 0 | #ifdef HAVE_LIBZ |
288 | | /* try to minimize the number of calls to inflate() and write_data() per |
289 | | call to fd_read() */ |
290 | 0 | unsigned int gzbufsize = dlbufsize * 4; |
291 | 0 | char *gzbuf = NULL; |
292 | 0 | z_stream gzstream; |
293 | |
|
294 | 0 | if (flags & rb_compressed_gzip) |
295 | 0 | { |
296 | 0 | gzbuf = xmalloc (gzbufsize); |
297 | 0 | gzstream.zalloc = zalloc; |
298 | 0 | gzstream.zfree = zfree; |
299 | 0 | gzstream.opaque = Z_NULL; |
300 | 0 | gzstream.next_in = Z_NULL; |
301 | 0 | gzstream.avail_in = 0; |
302 | |
|
303 | 0 | #define GZIP_DETECT 32 /* gzip format detection */ |
304 | 0 | #define GZIP_WINDOW 15 /* logarithmic window size (default: 15) */ |
305 | 0 | ret = inflateInit2 (&gzstream, GZIP_DETECT | GZIP_WINDOW); |
306 | 0 | if (ret != Z_OK) |
307 | 0 | { |
308 | 0 | xfree (gzbuf); |
309 | 0 | errno = (ret == Z_MEM_ERROR) ? ENOMEM : EINVAL; |
310 | 0 | ret = -1; |
311 | 0 | goto out; |
312 | 0 | } |
313 | 0 | } |
314 | 0 | #endif |
315 | | |
316 | 0 | if (flags & rb_skip_startpos) |
317 | 0 | skip = startpos; |
318 | |
|
319 | 0 | if (opt.show_progress) |
320 | 0 | { |
321 | 0 | const char *filename_progress; |
322 | | /* If we're skipping STARTPOS bytes, pass 0 as the INITIAL |
323 | | argument to progress_create because the indicator doesn't |
324 | | (yet) know about "skipping" data. */ |
325 | 0 | wgint start = skip ? 0 : startpos; |
326 | 0 | if (opt.dir_prefix) |
327 | 0 | filename_progress = downloaded_filename + strlen (opt.dir_prefix) + 1; |
328 | 0 | else |
329 | 0 | filename_progress = downloaded_filename; |
330 | 0 | progress = progress_create (filename_progress, start, start + toread); |
331 | 0 | progress_interactive = progress_interactive_p (progress); |
332 | 0 | } |
333 | |
|
334 | 0 | if (opt.limit_rate) |
335 | 0 | limit_bandwidth_reset (); |
336 | | |
337 | | /* A timer is needed for tracking progress, for throttling, and for |
338 | | tracking elapsed time. If either of these are requested, start |
339 | | the timer. */ |
340 | 0 | if (progress || opt.limit_rate || elapsed) |
341 | 0 | { |
342 | 0 | timer = ptimer_new (); |
343 | 0 | last_successful_read_tm = 0; |
344 | 0 | } |
345 | | |
346 | | /* Use a smaller buffer for low requested bandwidths. For example, |
347 | | with --limit-rate=2k, it doesn't make sense to slurp in 16K of |
348 | | data and then sleep for 8s. With buffer size equal to the limit, |
349 | | we never have to sleep for more than one second. */ |
350 | 0 | if (opt.limit_rate && opt.limit_rate < dlbufsize) |
351 | 0 | dlbufsize = opt.limit_rate; |
352 | | |
353 | | /* Read from FD while there is data to read. Normally toread==0 |
354 | | means that it is unknown how much data is to arrive. However, if |
355 | | EXACT is set, then toread==0 means what it says: that no data |
356 | | should be read. */ |
357 | 0 | while (!exact || (sum_read < toread)) |
358 | 0 | { |
359 | 0 | int rdsize; |
360 | 0 | double tmout = opt.read_timeout; |
361 | |
|
362 | 0 | if (chunked) |
363 | 0 | { |
364 | 0 | if (remaining_chunk_size == 0) |
365 | 0 | { |
366 | 0 | char *line = fd_read_line (fd); |
367 | 0 | char *endl; |
368 | 0 | if (line == NULL) |
369 | 0 | { |
370 | 0 | ret = -1; |
371 | 0 | break; |
372 | 0 | } |
373 | 0 | else if (out2 != NULL) |
374 | 0 | fwrite (line, 1, strlen (line), out2); |
375 | | |
376 | 0 | remaining_chunk_size = strtol (line, &endl, 16); |
377 | 0 | xfree (line); |
378 | |
|
379 | 0 | if (remaining_chunk_size < 0) |
380 | 0 | { |
381 | 0 | ret = -1; |
382 | 0 | break; |
383 | 0 | } |
384 | | |
385 | 0 | if (remaining_chunk_size == 0) |
386 | 0 | { |
387 | 0 | ret = 0; |
388 | 0 | line = fd_read_line (fd); |
389 | 0 | if (line == NULL) |
390 | 0 | ret = -1; |
391 | 0 | else |
392 | 0 | { |
393 | 0 | if (out2 != NULL) |
394 | 0 | fwrite (line, 1, strlen (line), out2); |
395 | 0 | xfree (line); |
396 | 0 | } |
397 | 0 | break; |
398 | 0 | } |
399 | 0 | } |
400 | | |
401 | 0 | rdsize = MIN (remaining_chunk_size, dlbufsize); |
402 | 0 | } |
403 | 0 | else |
404 | 0 | rdsize = exact ? MIN (toread - sum_read, dlbufsize) : dlbufsize; |
405 | | |
406 | 0 | if (progress_interactive) |
407 | 0 | { |
408 | | /* For interactive progress gauges, always specify a ~1s |
409 | | timeout, so that the gauge can be updated regularly even |
410 | | when the data arrives very slowly or stalls. */ |
411 | 0 | tmout = 0.95; |
412 | | /* avoid wrong 'interactive timeout' */ |
413 | 0 | errno = 0; |
414 | 0 | if (opt.read_timeout) |
415 | 0 | { |
416 | 0 | double waittm; |
417 | 0 | waittm = ptimer_read (timer) - last_successful_read_tm; |
418 | 0 | if (waittm + tmout > opt.read_timeout) |
419 | 0 | { |
420 | | /* Don't let total idle time exceed read timeout. */ |
421 | 0 | tmout = opt.read_timeout - waittm; |
422 | | /* if 0 fd_read can be 'blocked read' */ |
423 | 0 | if (tmout <= 0) |
424 | 0 | { |
425 | | /* We've already exceeded the timeout. */ |
426 | 0 | ret = -1, errno = ETIMEDOUT; |
427 | 0 | break; |
428 | 0 | } |
429 | 0 | } |
430 | 0 | } |
431 | 0 | } |
432 | 0 | ret = fd_read (fd, dlbuf, rdsize, tmout); |
433 | |
|
434 | 0 | if (progress_interactive && ret < 0 && errno == ETIMEDOUT) |
435 | 0 | ret = 0; /* interactive timeout, handled above */ |
436 | 0 | else if (ret <= 0) |
437 | 0 | break; /* EOF or read error */ |
438 | | |
439 | 0 | if (progress || opt.limit_rate || elapsed) |
440 | 0 | { |
441 | 0 | ptimer_measure (timer); |
442 | 0 | if (ret > 0) |
443 | 0 | last_successful_read_tm = ptimer_read (timer); |
444 | 0 | } |
445 | |
|
446 | 0 | if (ret > 0) |
447 | 0 | { |
448 | 0 | int write_res; |
449 | |
|
450 | 0 | sum_read += ret; |
451 | |
|
452 | 0 | #ifdef HAVE_LIBZ |
453 | 0 | if (gzbuf) |
454 | 0 | { |
455 | 0 | int err; |
456 | 0 | int towrite; |
457 | | |
458 | | /* Write original data to WARC file */ |
459 | 0 | write_res = write_data (NULL, out2, dlbuf, ret, NULL, NULL); |
460 | 0 | if (write_res < 0) |
461 | 0 | { |
462 | 0 | ret = write_res; |
463 | 0 | goto out; |
464 | 0 | } |
465 | | |
466 | 0 | gzstream.avail_in = ret; |
467 | 0 | gzstream.next_in = (unsigned char *) dlbuf; |
468 | |
|
469 | 0 | do |
470 | 0 | { |
471 | 0 | gzstream.avail_out = gzbufsize; |
472 | 0 | gzstream.next_out = (unsigned char *) gzbuf; |
473 | |
|
474 | 0 | err = inflate (&gzstream, Z_NO_FLUSH); |
475 | |
|
476 | 0 | switch (err) |
477 | 0 | { |
478 | 0 | case Z_MEM_ERROR: |
479 | 0 | errno = ENOMEM; |
480 | 0 | ret = -1; |
481 | 0 | goto out; |
482 | 0 | case Z_NEED_DICT: |
483 | 0 | case Z_DATA_ERROR: |
484 | 0 | errno = EINVAL; |
485 | 0 | ret = -1; |
486 | 0 | goto out; |
487 | 0 | case Z_STREAM_END: |
488 | 0 | if (exact && sum_read != toread) |
489 | 0 | { |
490 | 0 | DEBUGP(("zlib stream ended unexpectedly after %"PRId64"/%"PRId64 |
491 | 0 | " bytes\n", sum_read, toread)); |
492 | 0 | } |
493 | 0 | } |
494 | | |
495 | 0 | towrite = gzbufsize - gzstream.avail_out; |
496 | 0 | write_res = write_data (out, NULL, gzbuf, towrite, &skip, |
497 | 0 | &sum_written); |
498 | 0 | if (write_res < 0) |
499 | 0 | { |
500 | 0 | ret = write_res; |
501 | 0 | goto out; |
502 | 0 | } |
503 | 0 | } |
504 | 0 | while (gzstream.avail_out == 0); |
505 | 0 | } |
506 | 0 | else |
507 | 0 | #endif |
508 | 0 | { |
509 | 0 | write_res = write_data (out, out2, dlbuf, ret, &skip, |
510 | 0 | &sum_written); |
511 | 0 | if (write_res < 0) |
512 | 0 | { |
513 | 0 | ret = write_res; |
514 | 0 | goto out; |
515 | 0 | } |
516 | 0 | } |
517 | | |
518 | 0 | if (chunked) |
519 | 0 | { |
520 | 0 | remaining_chunk_size -= ret; |
521 | 0 | if (remaining_chunk_size == 0) |
522 | 0 | { |
523 | 0 | char *line = fd_read_line (fd); |
524 | 0 | if (line == NULL) |
525 | 0 | { |
526 | 0 | ret = -1; |
527 | 0 | break; |
528 | 0 | } |
529 | 0 | else |
530 | 0 | { |
531 | 0 | if (out2 != NULL) |
532 | 0 | fwrite (line, 1, strlen (line), out2); |
533 | 0 | xfree (line); |
534 | 0 | } |
535 | 0 | } |
536 | 0 | } |
537 | 0 | } |
538 | | |
539 | 0 | if (opt.limit_rate) |
540 | 0 | limit_bandwidth (ret, timer); |
541 | |
|
542 | 0 | if (progress) |
543 | 0 | progress_update (progress, ret, ptimer_read (timer)); |
544 | | #ifdef WINDOWS |
545 | | if (toread > 0 && opt.show_progress) |
546 | | ws_percenttitle (100.0 * |
547 | | (startpos + sum_read) / (startpos + toread)); |
548 | | #endif |
549 | 0 | } |
550 | 0 | if (ret < -1) |
551 | 0 | ret = -1; |
552 | |
|
553 | 0 | out: |
554 | 0 | if (progress) |
555 | 0 | progress_finish (progress, ptimer_read (timer)); |
556 | |
|
557 | 0 | if (timer) |
558 | 0 | { |
559 | 0 | if (elapsed) |
560 | 0 | *elapsed = ptimer_read (timer); |
561 | 0 | ptimer_destroy (timer); |
562 | 0 | } |
563 | |
|
564 | 0 | #ifdef HAVE_LIBZ |
565 | 0 | if (gzbuf) |
566 | 0 | { |
567 | 0 | int err = inflateEnd (&gzstream); |
568 | 0 | if (ret >= 0) |
569 | 0 | { |
570 | | /* with compression enabled, ret must be 0 if successful */ |
571 | 0 | if (err == Z_OK) |
572 | 0 | ret = 0; |
573 | 0 | else |
574 | 0 | { |
575 | 0 | errno = EINVAL; |
576 | 0 | ret = -1; |
577 | 0 | } |
578 | 0 | } |
579 | 0 | xfree (gzbuf); |
580 | |
|
581 | 0 | if (gzstream.total_in != (uLong) sum_read) |
582 | 0 | { |
583 | 0 | DEBUGP(("zlib read size differs from raw read size (%lu/%"PRId64")\n", |
584 | 0 | gzstream.total_in, sum_read)); |
585 | 0 | } |
586 | 0 | } |
587 | 0 | #endif |
588 | |
|
589 | 0 | if (qtyread) |
590 | 0 | *qtyread += sum_read; |
591 | 0 | if (qtywritten) |
592 | 0 | *qtywritten += sum_written; |
593 | |
|
594 | 0 | xfree (dlbuf); |
595 | |
|
596 | 0 | return ret; |
597 | 0 | } |
598 | | |
599 | | /* Read a hunk of data from FD, up until a terminator. The hunk is |
600 | | limited by whatever the TERMINATOR callback chooses as its |
601 | | terminator. For example, if terminator stops at newline, the hunk |
602 | | will consist of a line of data; if terminator stops at two |
603 | | newlines, it can be used to read the head of an HTTP response. |
604 | | Upon determining the boundary, the function returns the data (up to |
605 | | the terminator) in malloc-allocated storage. |
606 | | |
607 | | In case of read error, NULL is returned. In case of EOF and no |
608 | | data read, NULL is returned and errno set to 0. In case of having |
609 | | read some data, but encountering EOF before seeing the terminator, |
610 | | the data that has been read is returned, but it will (obviously) |
611 | | not contain the terminator. |
612 | | |
613 | | The TERMINATOR function is called with three arguments: the |
614 | | beginning of the data read so far, the beginning of the current |
615 | | block of peeked-at data, and the length of the current block. |
616 | | Depending on its needs, the function is free to choose whether to |
617 | | analyze all data or just the newly arrived data. If TERMINATOR |
618 | | returns NULL, it means that the terminator has not been seen. |
619 | | Otherwise it should return a pointer to the character immediately |
620 | | following the terminator. |
621 | | |
622 | | The idea is to be able to read a line of input, or otherwise a hunk |
623 | | of text, such as the head of an HTTP request, without crossing the |
624 | | boundary, so that the next call to fd_read etc. reads the data |
625 | | after the hunk. To achieve that, this function does the following: |
626 | | |
627 | | 1. Peek at incoming data. |
628 | | |
629 | | 2. Determine whether the peeked data, along with the previously |
630 | | read data, includes the terminator. |
631 | | |
632 | | 2a. If yes, read the data until the end of the terminator, and |
633 | | exit. |
634 | | |
635 | | 2b. If no, read the peeked data and goto 1. |
636 | | |
637 | | The function is careful to assume as little as possible about the |
638 | | implementation of peeking. For example, every peek is followed by |
639 | | a read. If the read returns a different amount of data, the |
640 | | process is retried until all data arrives safely. |
641 | | |
642 | | SIZEHINT is the buffer size sufficient to hold all the data in the |
643 | | typical case (it is used as the initial buffer size). MAXSIZE is |
644 | | the maximum amount of memory this function is allowed to allocate, |
645 | | or 0 if no upper limit is to be enforced. |
646 | | |
647 | | This function should be used as a building block for other |
648 | | functions -- see fd_read_line as a simple example. */ |
649 | | |
650 | | char * |
651 | | fd_read_hunk (int fd, hunk_terminator_t terminator, long sizehint, long maxsize) |
652 | 1.05k | { |
653 | 1.05k | long bufsize = sizehint; |
654 | 1.05k | char *hunk = xmalloc (bufsize); |
655 | 1.05k | int tail = 0; /* tail position in HUNK */ |
656 | | |
657 | 1.05k | assert (!maxsize || maxsize >= bufsize); |
658 | | |
659 | 2.00k | while (1) |
660 | 2.00k | { |
661 | 2.00k | const char *end; |
662 | 2.00k | int pklen, rdlen, remain; |
663 | | |
664 | | /* First, peek at the available data. */ |
665 | | |
666 | 2.00k | pklen = fd_peek (fd, hunk + tail, bufsize - 1 - tail, -1); |
667 | 2.00k | if (pklen < 0) |
668 | 0 | { |
669 | 0 | xfree (hunk); |
670 | 0 | return NULL; |
671 | 0 | } |
672 | 2.00k | end = terminator (hunk, hunk + tail, pklen); |
673 | 2.00k | if (end) |
674 | 575 | { |
675 | | /* The data contains the terminator: we'll drain the data up |
676 | | to the end of the terminator. */ |
677 | 575 | remain = end - (hunk + tail); |
678 | 575 | assert (remain >= 0); |
679 | 575 | if (remain == 0) |
680 | 407 | { |
681 | | /* No more data needs to be read. */ |
682 | 407 | hunk[tail] = '\0'; |
683 | 407 | return hunk; |
684 | 407 | } |
685 | 168 | if (bufsize - 1 < tail + remain) |
686 | 0 | { |
687 | 0 | bufsize = tail + remain + 1; |
688 | 0 | hunk = xrealloc (hunk, bufsize); |
689 | 0 | } |
690 | 168 | } |
691 | 1.42k | else |
692 | | /* No terminator: simply read the data we know is (or should |
693 | | be) available. */ |
694 | 1.42k | remain = pklen; |
695 | | |
696 | | /* Now, read the data. Note that we make no assumptions about |
697 | | how much data we'll get. (Some TCP stacks are notorious for |
698 | | read returning less data than the previous MSG_PEEK.) */ |
699 | | |
700 | 1.59k | rdlen = fd_read (fd, hunk + tail, remain, 0); |
701 | 1.59k | if (rdlen < 0) |
702 | 0 | { |
703 | 0 | xfree (hunk); |
704 | 0 | return NULL; |
705 | 0 | } |
706 | 1.59k | tail += rdlen; |
707 | 1.59k | hunk[tail] = '\0'; |
708 | | |
709 | 1.59k | if (rdlen == 0) |
710 | 476 | { |
711 | 476 | if (tail == 0) |
712 | 312 | { |
713 | | /* EOF without anything having been read */ |
714 | 312 | xfree (hunk); |
715 | 312 | errno = 0; |
716 | 312 | return NULL; |
717 | 312 | } |
718 | 164 | else |
719 | | /* EOF seen: return the data we've read. */ |
720 | 164 | return hunk; |
721 | 476 | } |
722 | 1.11k | if (end && rdlen == remain) |
723 | | /* The terminator was seen and the remaining data drained -- |
724 | | we got what we came for. */ |
725 | 168 | return hunk; |
726 | | |
727 | | /* Keep looping until all the data arrives. */ |
728 | | |
729 | 949 | if (tail == bufsize - 1) |
730 | 95 | { |
731 | | /* Double the buffer size, but refuse to allocate more than |
732 | | MAXSIZE bytes. */ |
733 | 95 | if (maxsize && bufsize >= maxsize) |
734 | 0 | { |
735 | 0 | xfree (hunk); |
736 | 0 | errno = ENOMEM; |
737 | 0 | return NULL; |
738 | 0 | } |
739 | 95 | bufsize <<= 1; |
740 | 95 | if (maxsize && bufsize > maxsize) |
741 | 0 | bufsize = maxsize; |
742 | 95 | hunk = xrealloc (hunk, bufsize); |
743 | 95 | } |
744 | 949 | } |
745 | 1.05k | } |
746 | | |
/* Terminator callback for fd_read_hunk that stops at the first
   newline.  Only the PEEKLEN bytes at PEEKED are searched; START is
   not needed.  Returns a pointer just past the '\n' (so that the line
   includes it), or NULL if the peeked data holds no newline.  */
static const char *
line_terminator (const char *start, const char *peeked, int peeklen)
{
  const char *newline = memchr (peeked, '\n', peeklen);

  (void) start;
  return newline ? newline + 1 : NULL;
}
756 | | |
757 | | /* The maximum size of the single line we agree to accept. This is |
758 | | not meant to impose an arbitrary limit, but to protect the user |
759 | | from Wget slurping up available memory upon encountering malicious |
760 | | or buggy server output. Define it to 0 to remove the limit. */ |
761 | 0 | #define FD_READ_LINE_MAX 4096 |
762 | | |
763 | | /* Read one line from FD and return it. The line is allocated using |
764 | | malloc, but is never larger than FD_READ_LINE_MAX. |
765 | | |
766 | | If an error occurs, or if no data can be read, NULL is returned. |
767 | | In the former case errno indicates the error condition, and in the |
768 | | latter case, errno is NULL. */ |
769 | | |
770 | | char * |
771 | | fd_read_line (int fd) |
772 | 0 | { |
773 | 0 | return fd_read_hunk (fd, line_terminator, 128, FD_READ_LINE_MAX); |
774 | 0 | } |
775 | | |
776 | | /* Return a printed representation of the download rate, along with |
777 | | the units appropriate for the download speed. */ |
778 | | |
779 | | const char * |
780 | | retr_rate (wgint bytes, double secs) |
781 | 0 | { |
782 | 0 | static char res[20]; |
783 | 0 | static const char *rate_names[] = {"B/s", "KB/s", "MB/s", "GB/s", "TB/s" }; |
784 | 0 | static const char *rate_names_bits[] = {"b/s", "Kb/s", "Mb/s", "Gb/s", "Tb/s" }; |
785 | 0 | int units; |
786 | |
|
787 | 0 | double dlrate = calc_rate (bytes, secs, &units); |
788 | | /* Use more digits for smaller numbers (regardless of unit used), |
789 | | e.g. "1022", "247", "12.5", "2.38". */ |
790 | 0 | snprintf (res, sizeof(res), "%.*f %s", |
791 | 0 | dlrate >= 99.95 ? 0 : dlrate >= 9.995 ? 1 : 2, |
792 | 0 | dlrate, !opt.report_bps ? rate_names[units]: rate_names_bits[units]); |
793 | |
|
794 | 0 | return res; |
795 | 0 | } |
796 | | |
797 | | /* Calculate the download rate and trim it as appropriate for the |
798 | | speed. Appropriate means that if rate is greater than 1K/s, |
799 | | kilobytes are used, and if rate is greater than 1MB/s, megabytes |
800 | | are used. |
801 | | |
802 | | UNITS is zero for B/s, one for KB/s, two for MB/s, and three for |
803 | | GB/s. */ |
804 | | |
805 | | double |
806 | | calc_rate (wgint bytes, double secs, int *units) |
807 | 5.98k | { |
808 | 5.98k | double dlrate; |
809 | 5.98k | double bibyte; |
810 | | |
811 | 5.98k | if (!opt.report_bps) |
812 | 5.98k | bibyte = 1024.0; |
813 | 0 | else |
814 | 0 | bibyte = 1000.0; |
815 | | |
816 | 5.98k | if (secs == 0) |
817 | | /* If elapsed time is exactly zero, it means we're under the |
818 | | resolution of the timer. This can easily happen on systems |
819 | | that use time() for the timer. Since the interval lies between |
820 | | 0 and the timer's resolution, assume half the resolution. */ |
821 | 3.44k | secs = ptimer_resolution () / 2.0; |
822 | | |
823 | 5.98k | dlrate = secs ? convert_to_bits (bytes) / secs : 0; |
824 | 5.98k | if (dlrate < bibyte) |
825 | 4.11k | *units = 0; |
826 | 1.86k | else if (dlrate < (bibyte * bibyte)) |
827 | 333 | *units = 1, dlrate /= bibyte; |
828 | 1.53k | else if (dlrate < (bibyte * bibyte * bibyte)) |
829 | 510 | *units = 2, dlrate /= (bibyte * bibyte); |
830 | 1.02k | else if (dlrate < (bibyte * bibyte * bibyte * bibyte)) |
831 | 632 | *units = 3, dlrate /= (bibyte * bibyte * bibyte); |
832 | 392 | else { |
833 | 392 | *units = 4, dlrate /= (bibyte * bibyte * bibyte * bibyte); |
834 | 392 | if (dlrate > 99.99) |
835 | 307 | dlrate = 99.99; // upper limit 99.99TB/s |
836 | 392 | } |
837 | | |
838 | 5.98k | return dlrate; |
839 | 5.98k | } |
840 | | |
841 | | |
/* Stash opt.method, opt.body_data and opt.body_file in the enclosing
   function's saved_* locals and clear them in OPT.  The enclosing
   function must declare method_suspended, saved_body_data,
   saved_body_file_name and saved_method.  */
#define SUSPEND_METHOD do {                     \
    saved_body_data = opt.body_data;            \
    saved_body_file_name = opt.body_file;       \
    saved_method = opt.method;                  \
    opt.body_data = NULL;                       \
    opt.body_file = NULL;                       \
    opt.method = NULL;                          \
    method_suspended = true;                    \
  } while (0)

/* Undo SUSPEND_METHOD.  Does nothing unless a suspension is in
   effect.  */
#define RESTORE_METHOD do {                     \
    if (!method_suspended)                      \
      break;                                    \
    opt.body_data = saved_body_data;            \
    opt.body_file = saved_body_file_name;       \
    opt.method = saved_method;                  \
    method_suspended = false;                   \
  } while (0)
861 | | |
862 | | static char *getproxy (struct url *); |
863 | | |
864 | | /* Retrieve the given URL. Decides which loop to call -- HTTP, FTP, |
865 | | proxy, etc. */ |
866 | | |
867 | | /* #### This function should be rewritten so it doesn't return from |
868 | | multiple points. */ |
869 | | |
870 | | uerr_t |
871 | | retrieve_url (struct url * orig_parsed, const char *origurl, char **file, |
872 | | char **newloc, const char *refurl, int *dt, bool recursive, |
873 | | struct iri *iri, bool register_status) |
874 | 0 | { |
875 | 0 | uerr_t result; |
876 | 0 | char *url; |
877 | 0 | bool location_changed; |
878 | 0 | bool iri_fallbacked = 0; |
879 | 0 | int dummy; |
880 | 0 | char *mynewloc, *proxy; |
881 | 0 | struct url *u = orig_parsed, *proxy_url; |
882 | 0 | int up_error_code; /* url parse error code */ |
883 | 0 | char *local_file = NULL; |
884 | 0 | int redirection_count = 0; |
885 | |
|
886 | 0 | bool method_suspended = false; |
887 | 0 | char *saved_body_data = NULL; |
888 | 0 | char *saved_method = NULL; |
889 | 0 | char *saved_body_file_name = NULL; |
890 | | |
891 | | /* If dt is NULL, use local storage. */ |
892 | 0 | if (!dt) |
893 | 0 | { |
894 | 0 | dt = &dummy; |
895 | 0 | dummy = 0; |
896 | 0 | } |
897 | 0 | url = xstrdup (origurl); |
898 | 0 | if (newloc) |
899 | 0 | *newloc = NULL; |
900 | 0 | if (file) |
901 | 0 | *file = NULL; |
902 | |
|
903 | 0 | if (!refurl) |
904 | 0 | refurl = opt.referer; |
905 | |
|
906 | 0 | redirected: |
907 | | /* (also for IRI fallbacking) */ |
908 | |
|
909 | 0 | result = NOCONERROR; |
910 | 0 | mynewloc = NULL; |
911 | 0 | xfree(local_file); |
912 | 0 | proxy_url = NULL; |
913 | |
|
914 | 0 | proxy = getproxy (u); |
915 | 0 | if (proxy) |
916 | 0 | { |
917 | 0 | struct iri *pi = iri_new (); |
918 | 0 | set_uri_encoding (pi, opt.locale, true); |
919 | 0 | pi->utf8_encode = false; |
920 | | |
921 | | /* Parse the proxy URL. */ |
922 | 0 | proxy_url = url_parse (proxy, &up_error_code, pi, true); |
923 | 0 | if (!proxy_url) |
924 | 0 | { |
925 | 0 | logprintf (LOG_NOTQUIET, _("Error parsing proxy URL %s: %s.\n"), |
926 | 0 | proxy, url_error (up_error_code)); |
927 | 0 | xfree (url); |
928 | 0 | xfree (proxy); |
929 | 0 | iri_free (pi); |
930 | 0 | RESTORE_METHOD; |
931 | 0 | result = PROXERR; |
932 | 0 | if (orig_parsed != u) |
933 | 0 | url_free (u); |
934 | 0 | goto bail; |
935 | 0 | } |
936 | 0 | if (proxy_url->scheme != SCHEME_HTTP && proxy_url->scheme != u->scheme) |
937 | 0 | { |
938 | 0 | logprintf (LOG_NOTQUIET, _("Error in proxy URL %s: Must be HTTP.\n"), proxy); |
939 | 0 | url_free (proxy_url); |
940 | 0 | xfree (url); |
941 | 0 | xfree (proxy); |
942 | 0 | iri_free (pi); |
943 | 0 | RESTORE_METHOD; |
944 | 0 | result = PROXERR; |
945 | 0 | if (orig_parsed != u) |
946 | 0 | url_free (u); |
947 | 0 | goto bail; |
948 | 0 | } |
949 | 0 | iri_free(pi); |
950 | 0 | xfree (proxy); |
951 | 0 | } |
952 | | |
953 | 0 | if (u->scheme == SCHEME_HTTP |
954 | 0 | #ifdef HAVE_SSL |
955 | 0 | || u->scheme == SCHEME_HTTPS |
956 | 0 | #endif |
957 | 0 | || (proxy_url && proxy_url->scheme == SCHEME_HTTP)) |
958 | 0 | { |
959 | 0 | #ifdef HAVE_HSTS |
960 | 0 | #ifdef TESTING |
961 | | /* we don't link against main.o when we're testing */ |
962 | 0 | hsts_store_t hsts_store = NULL; |
963 | | #else |
964 | | extern hsts_store_t hsts_store; |
965 | | #endif |
966 | |
|
967 | 0 | if (opt.hsts && hsts_store) |
968 | 0 | { |
969 | 0 | if (hsts_match (hsts_store, u)) |
970 | 0 | logprintf (LOG_VERBOSE, "URL transformed to HTTPS due to an HSTS policy\n"); |
971 | 0 | } |
972 | 0 | #endif |
973 | 0 | result = http_loop (u, orig_parsed, &mynewloc, &local_file, refurl, dt, |
974 | 0 | proxy_url, iri); |
975 | 0 | } |
976 | 0 | else if (u->scheme == SCHEME_FTP |
977 | 0 | #ifdef HAVE_SSL |
978 | 0 | || u->scheme == SCHEME_FTPS |
979 | 0 | #endif |
980 | 0 | ) |
981 | 0 | { |
982 | | /* If this is a redirection, temporarily turn off opt.ftp_glob |
983 | | and opt.recursive, both being undesirable when following |
984 | | redirects. */ |
985 | 0 | bool oldrec = recursive, glob = opt.ftp_glob; |
986 | 0 | if (redirection_count) |
987 | 0 | oldrec = glob = false; |
988 | |
|
989 | 0 | result = ftp_loop (u, orig_parsed, &local_file, dt, proxy_url, |
990 | 0 | recursive, glob); |
991 | 0 | recursive = oldrec; |
992 | | |
993 | | /* There is a possibility of having HTTP being redirected to |
994 | | FTP. In these cases we must decide whether the text is HTML |
995 | | according to the suffix. The HTML suffixes are `.html', |
996 | | `.htm' and a few others, case-insensitive. */ |
997 | 0 | if (redirection_count && local_file && (u->scheme == SCHEME_FTP |
998 | 0 | #ifdef HAVE_SSL |
999 | 0 | || u->scheme == SCHEME_FTPS |
1000 | 0 | #endif |
1001 | 0 | )) |
1002 | 0 | { |
1003 | 0 | if (has_html_suffix_p (local_file)) |
1004 | 0 | *dt |= TEXTHTML; |
1005 | 0 | } |
1006 | 0 | } |
1007 | |
|
1008 | 0 | if (proxy_url) |
1009 | 0 | { |
1010 | 0 | url_free (proxy_url); |
1011 | 0 | proxy_url = NULL; |
1012 | 0 | } |
1013 | |
|
1014 | 0 | location_changed = (result == NEWLOCATION || result == NEWLOCATION_KEEP_POST); |
1015 | 0 | if (location_changed) |
1016 | 0 | { |
1017 | 0 | char *construced_newloc; |
1018 | 0 | struct url *newloc_parsed; |
1019 | |
|
1020 | 0 | assert (mynewloc != NULL); |
1021 | |
|
1022 | 0 | xfree (local_file); |
1023 | | |
1024 | | /* The HTTP specs only allow absolute URLs to appear in |
1025 | | redirects, but a ton of boneheaded webservers and CGIs out |
1026 | | there break the rules and use relative URLs, and popular |
1027 | | browsers are lenient about this, so wget should be too. */ |
1028 | 0 | construced_newloc = uri_merge (url, mynewloc ? mynewloc : ""); |
1029 | 0 | xfree (mynewloc); |
1030 | 0 | mynewloc = construced_newloc; |
1031 | |
|
1032 | 0 | #ifdef ENABLE_IRI |
1033 | | /* Reset UTF-8 encoding state, set the URI encoding and reset |
1034 | | the content encoding. */ |
1035 | 0 | iri->utf8_encode = opt.enable_iri; |
1036 | 0 | if (opt.encoding_remote) |
1037 | 0 | set_uri_encoding (iri, opt.encoding_remote, true); |
1038 | 0 | set_content_encoding (iri, NULL); |
1039 | 0 | xfree (iri->orig_url); |
1040 | 0 | #endif |
1041 | | |
1042 | | /* Now, see if this new location makes sense. */ |
1043 | 0 | newloc_parsed = url_parse (mynewloc, &up_error_code, iri, true); |
1044 | 0 | if (!newloc_parsed) |
1045 | 0 | { |
1046 | 0 | logprintf (LOG_NOTQUIET, "%s: %s.\n", escnonprint_uri (mynewloc), |
1047 | 0 | url_error (up_error_code)); |
1048 | 0 | if (orig_parsed != u) |
1049 | 0 | { |
1050 | 0 | url_free (u); |
1051 | 0 | } |
1052 | 0 | xfree (url); |
1053 | 0 | xfree (mynewloc); |
1054 | 0 | RESTORE_METHOD; |
1055 | 0 | goto bail; |
1056 | 0 | } |
1057 | | |
1058 | | /* Now mynewloc will become newloc_parsed->url, because if the |
1059 | | Location contained relative paths like .././something, we |
1060 | | don't want that propagating as url. */ |
1061 | 0 | xfree (mynewloc); |
1062 | 0 | mynewloc = xstrdup (newloc_parsed->url); |
1063 | | |
1064 | | /* Check for max. number of redirections. */ |
1065 | 0 | if (++redirection_count > opt.max_redirect) |
1066 | 0 | { |
1067 | 0 | logprintf (LOG_NOTQUIET, _("%d redirections exceeded.\n"), |
1068 | 0 | opt.max_redirect); |
1069 | 0 | url_free (newloc_parsed); |
1070 | 0 | if (orig_parsed != u) |
1071 | 0 | { |
1072 | 0 | url_free (u); |
1073 | 0 | } |
1074 | 0 | xfree (url); |
1075 | 0 | xfree (mynewloc); |
1076 | 0 | RESTORE_METHOD; |
1077 | 0 | result = WRONGCODE; |
1078 | 0 | goto bail; |
1079 | 0 | } |
1080 | | |
1081 | 0 | xfree (url); |
1082 | 0 | url = mynewloc; |
1083 | 0 | if (orig_parsed != u) |
1084 | 0 | { |
1085 | 0 | url_free (u); |
1086 | 0 | } |
1087 | 0 | u = newloc_parsed; |
1088 | | |
1089 | | /* If we're being redirected from POST, and we received a |
1090 | | redirect code different than 307, we don't want to POST |
1091 | | again. Many requests answer POST with a redirection to an |
1092 | | index page; that redirection is clearly a GET. We "suspend" |
1093 | | POST data for the duration of the redirections, and restore |
1094 | | it when we're done. |
1095 | | |
1096 | | RFC2616 HTTP/1.1 introduces code 307 Temporary Redirect |
1097 | | specifically to preserve the method of the request. |
1098 | | */ |
1099 | 0 | if (result != NEWLOCATION_KEEP_POST && !method_suspended) |
1100 | 0 | SUSPEND_METHOD; |
1101 | |
|
1102 | 0 | goto redirected; |
1103 | 0 | } |
1104 | 0 | else |
1105 | 0 | { |
1106 | 0 | xfree(mynewloc); |
1107 | 0 | } |
1108 | | |
1109 | | /* Try to not encode in UTF-8 if fetching failed */ |
1110 | 0 | if (!(*dt & RETROKF) && iri->utf8_encode) |
1111 | 0 | { |
1112 | 0 | iri->utf8_encode = false; |
1113 | 0 | if (orig_parsed != u) |
1114 | 0 | { |
1115 | 0 | url_free (u); |
1116 | 0 | } |
1117 | 0 | u = url_parse (origurl, NULL, iri, true); |
1118 | 0 | if (u) |
1119 | 0 | { |
1120 | 0 | if (strcmp(u->url, orig_parsed->url)) |
1121 | 0 | { |
1122 | 0 | DEBUGP (("[IRI fallbacking to non-utf8 for %s\n", quote (url))); |
1123 | 0 | xfree (url); |
1124 | 0 | url = xstrdup (u->url); |
1125 | 0 | iri_fallbacked = 1; |
1126 | 0 | goto redirected; |
1127 | 0 | } |
1128 | 0 | else |
1129 | 0 | DEBUGP (("[Needn't fallback to non-utf8 for %s\n", quote (url))); |
1130 | 0 | } |
1131 | 0 | else |
1132 | 0 | DEBUGP (("[Couldn't fallback to non-utf8 for %s\n", quote (url))); |
1133 | 0 | } |
1134 | | |
1135 | 0 | if (local_file && u && (*dt & RETROKF || opt.content_on_error)) |
1136 | 0 | { |
1137 | 0 | register_download (u->url, local_file); |
1138 | |
|
1139 | 0 | if (!opt.spider && redirection_count && 0 != strcmp (origurl, u->url)) |
1140 | 0 | register_redirection (origurl, u->url); |
1141 | |
|
1142 | 0 | if (*dt & TEXTHTML) |
1143 | 0 | register_html (local_file); |
1144 | |
|
1145 | 0 | if (*dt & TEXTCSS) |
1146 | 0 | register_css (local_file); |
1147 | 0 | } |
1148 | |
|
1149 | 0 | if (file) |
1150 | 0 | *file = local_file ? local_file : NULL; |
1151 | 0 | else |
1152 | 0 | xfree (local_file); |
1153 | |
|
1154 | 0 | if (orig_parsed != u) |
1155 | 0 | url_free (u); |
1156 | |
|
1157 | 0 | if (redirection_count || iri_fallbacked) |
1158 | 0 | { |
1159 | 0 | if (newloc) |
1160 | 0 | *newloc = url; |
1161 | 0 | else |
1162 | 0 | xfree (url); |
1163 | 0 | } |
1164 | 0 | else |
1165 | 0 | { |
1166 | 0 | if (newloc) |
1167 | 0 | *newloc = NULL; |
1168 | 0 | xfree (url); |
1169 | 0 | } |
1170 | |
|
1171 | 0 | RESTORE_METHOD; |
1172 | |
|
1173 | 0 | bail: |
1174 | 0 | if (register_status) |
1175 | 0 | inform_exit_status (result); |
1176 | |
|
1177 | 0 | return result; |
1178 | 0 | } |
1179 | | |
1180 | | static uerr_t retrieve_from_url_list(struct urlpos *url_list, int *count, struct iri *iri) |
1181 | 0 | { |
1182 | 0 | struct urlpos *cur_url; |
1183 | 0 | uerr_t status; |
1184 | |
|
1185 | 0 | for (cur_url = url_list; cur_url; cur_url = cur_url->next, ++*count) |
1186 | 0 | { |
1187 | 0 | char *filename = NULL, *new_file = NULL, *proxy; |
1188 | 0 | int dt = 0; |
1189 | 0 | struct iri *tmpiri; |
1190 | 0 | struct url *parsed_url; |
1191 | |
|
1192 | 0 | if (cur_url->ignore_when_downloading) |
1193 | 0 | continue; |
1194 | | |
1195 | 0 | if (opt.quota && total_downloaded_bytes > opt.quota) |
1196 | 0 | { |
1197 | 0 | status = QUOTEXC; |
1198 | 0 | break; |
1199 | 0 | } |
1200 | | |
1201 | 0 | tmpiri = iri_dup (iri); |
1202 | 0 | parsed_url = url_parse (cur_url->url->url, NULL, tmpiri, true); |
1203 | |
|
1204 | 0 | proxy = getproxy (cur_url->url); |
1205 | 0 | if ((opt.recursive || opt.page_requisites) |
1206 | 0 | && ((cur_url->url->scheme != SCHEME_FTP |
1207 | 0 | #ifdef HAVE_SSL |
1208 | 0 | && cur_url->url->scheme != SCHEME_FTPS |
1209 | 0 | #endif |
1210 | 0 | ) || proxy)) |
1211 | 0 | { |
1212 | 0 | int old_follow_ftp = opt.follow_ftp; |
1213 | | |
1214 | | /* Turn opt.follow_ftp on in case of recursive FTP retrieval */ |
1215 | 0 | if (cur_url->url->scheme == SCHEME_FTP |
1216 | 0 | #ifdef HAVE_SSL |
1217 | 0 | || cur_url->url->scheme == SCHEME_FTPS |
1218 | 0 | #endif |
1219 | 0 | ) |
1220 | 0 | opt.follow_ftp = 1; |
1221 | |
|
1222 | 0 | status = retrieve_tree (parsed_url ? parsed_url : cur_url->url, |
1223 | 0 | tmpiri); |
1224 | |
|
1225 | 0 | opt.follow_ftp = old_follow_ftp; |
1226 | 0 | } |
1227 | 0 | else |
1228 | 0 | status = retrieve_url (parsed_url ? parsed_url : cur_url->url, |
1229 | 0 | cur_url->url->url, &filename, |
1230 | 0 | &new_file, NULL, &dt, opt.recursive, tmpiri, |
1231 | 0 | true); |
1232 | 0 | xfree (proxy); |
1233 | |
|
1234 | 0 | if (parsed_url) |
1235 | 0 | url_free (parsed_url); |
1236 | |
|
1237 | 0 | if (filename && opt.delete_after && file_exists_p (filename, NULL)) |
1238 | 0 | { |
1239 | 0 | DEBUGP (("\ |
1240 | 0 | Removing file due to --delete-after in retrieve_from_file():\n")); |
1241 | 0 | logprintf (LOG_VERBOSE, _("Removing %s.\n"), filename); |
1242 | 0 | if (unlink (filename)) |
1243 | 0 | logprintf (LOG_NOTQUIET, "Failed to unlink %s: (%d) %s\n", filename, errno, strerror (errno)); |
1244 | 0 | dt &= ~RETROKF; |
1245 | 0 | } |
1246 | |
|
1247 | 0 | xfree (new_file); |
1248 | 0 | xfree (filename); |
1249 | 0 | iri_free (tmpiri); |
1250 | 0 | } |
1251 | 0 | return status; |
1252 | 0 | } |
1253 | | |
1254 | | /* Find the URLs in the file and call retrieve_url() for each of them. |
1255 | | If HTML is true, treat the file as HTML, and construct the URLs |
1256 | | accordingly. |
1257 | | |
1258 | | If opt.recursive is set, call retrieve_tree() for each file. */ |
1259 | | |
1260 | | uerr_t |
1261 | | retrieve_from_file (const char *file, bool html, int *count) |
1262 | 0 | { |
1263 | 0 | uerr_t status; |
1264 | 0 | struct urlpos *url_list, *cur_url; |
1265 | 0 | struct iri *iri = iri_new(); |
1266 | |
|
1267 | 0 | char *input_file, *url_file = NULL; |
1268 | 0 | const char *url = file; |
1269 | |
|
1270 | 0 | status = RETROK; /* Suppose everything is OK. */ |
1271 | 0 | *count = 0; /* Reset the URL count. */ |
1272 | | |
1273 | | /* sXXXav : Assume filename and links in the file are in the locale */ |
1274 | 0 | set_uri_encoding (iri, opt.locale, true); |
1275 | 0 | set_content_encoding (iri, opt.locale); |
1276 | |
|
1277 | 0 | if (url_valid_scheme (url)) |
1278 | 0 | { |
1279 | 0 | int dt,url_err; |
1280 | 0 | struct url *url_parsed = url_parse (url, &url_err, iri, true); |
1281 | 0 | if (!url_parsed) |
1282 | 0 | { |
1283 | 0 | logprintf (LOG_NOTQUIET, "%s: %s.\n", url, url_error (url_err)); |
1284 | 0 | iri_free (iri); |
1285 | 0 | return URLERROR; |
1286 | 0 | } |
1287 | | |
1288 | 0 | if (!opt.base_href) |
1289 | 0 | opt.base_href = xstrdup (url); |
1290 | |
|
1291 | 0 | status = retrieve_url (url_parsed, url, &url_file, NULL, NULL, &dt, |
1292 | 0 | false, iri, true); |
1293 | 0 | url_free (url_parsed); |
1294 | |
|
1295 | 0 | if (!url_file || (status != RETROK)) |
1296 | 0 | { |
1297 | 0 | iri_free (iri); |
1298 | 0 | return status; |
1299 | 0 | } |
1300 | | |
1301 | 0 | if (dt & TEXTHTML) |
1302 | 0 | html = true; |
1303 | |
|
1304 | 0 | #ifdef ENABLE_IRI |
1305 | | /* If we have a found a content encoding, use it. |
1306 | | * ( == is okay, because we're checking for identical object) */ |
1307 | 0 | if (iri->content_encoding != opt.locale) |
1308 | 0 | set_uri_encoding (iri, iri->content_encoding, false); |
1309 | 0 | #endif |
1310 | | |
1311 | | /* Reset UTF-8 encode status */ |
1312 | 0 | iri->utf8_encode = opt.enable_iri; |
1313 | 0 | xfree (iri->orig_url); |
1314 | |
|
1315 | 0 | input_file = url_file; |
1316 | 0 | } |
1317 | 0 | else |
1318 | 0 | input_file = (char *) file; |
1319 | | |
1320 | 0 | bool read_again = false; |
1321 | 0 | do { |
1322 | 0 | url_list = (html ? get_urls_html (input_file, NULL, NULL, iri) |
1323 | 0 | : get_urls_file (input_file, &read_again)); |
1324 | |
|
1325 | 0 | status = retrieve_from_url_list(url_list, count, iri); |
1326 | 0 | } while (read_again); |
1327 | |
|
1328 | 0 | xfree (url_file); |
1329 | | |
1330 | | /* Free the linked list of URL-s. */ |
1331 | 0 | free_urlpos (url_list); |
1332 | |
|
1333 | 0 | iri_free (iri); |
1334 | |
|
1335 | 0 | return status; |
1336 | 0 | } |
1337 | | |
1338 | | /* Print `giving up', or `retrying', depending on the impending |
1339 | | action. N1 and N2 are the attempt number and the attempt limit. */ |
1340 | | void |
1341 | | printwhat (int n1, int n2) |
1342 | 0 | { |
1343 | 0 | logputs (LOG_VERBOSE, (n1 == n2) ? _("Giving up.\n\n") : _("Retrying.\n\n")); |
1344 | 0 | } |
1345 | | |
1346 | | /* If opt.wait or opt.waitretry are specified, and if certain |
1347 | | conditions are met, sleep the appropriate number of seconds. See |
1348 | | the documentation of --wait and --waitretry for more information. |
1349 | | |
1350 | | COUNT is the count of current retrieval, beginning with 1. */ |
1351 | | |
1352 | | void |
1353 | | sleep_between_retrievals (int count) |
1354 | 0 | { |
1355 | 0 | static bool first_retrieval = true; |
1356 | |
|
1357 | 0 | if (first_retrieval) |
1358 | 0 | { |
1359 | | /* Don't sleep before the very first retrieval. */ |
1360 | 0 | first_retrieval = false; |
1361 | 0 | return; |
1362 | 0 | } |
1363 | | |
1364 | 0 | if (opt.waitretry && count > 1) |
1365 | 0 | { |
1366 | | /* If opt.waitretry is specified and this is a retry, wait for |
1367 | | COUNT-1 number of seconds, or for opt.waitretry seconds. */ |
1368 | 0 | if (count <= opt.waitretry) |
1369 | 0 | xsleep (count - 1); |
1370 | 0 | else |
1371 | 0 | xsleep (opt.waitretry); |
1372 | 0 | } |
1373 | 0 | else if (opt.wait) |
1374 | 0 | { |
1375 | 0 | if (!opt.random_wait || count > 1) |
1376 | | /* If random-wait is not specified, or if we are sleeping |
1377 | | between retries of the same download, sleep the fixed |
1378 | | interval. */ |
1379 | 0 | xsleep (opt.wait); |
1380 | 0 | else |
1381 | 0 | { |
1382 | | /* Sleep a random amount of time averaging in opt.wait |
1383 | | seconds. The sleeping amount ranges from 0.5*opt.wait to |
1384 | | 1.5*opt.wait. */ |
1385 | 0 | double waitsecs = (0.5 + random_float ()) * opt.wait; |
1386 | 0 | DEBUGP (("sleep_between_retrievals: avg=%f,sleep=%f\n", |
1387 | 0 | opt.wait, waitsecs)); |
1388 | 0 | xsleep (waitsecs); |
1389 | 0 | } |
1390 | 0 | } |
1391 | 0 | } |
1392 | | |
1393 | | /* Free the linked list of urlpos. */ |
1394 | | void |
1395 | | free_urlpos (struct urlpos *l) |
1396 | 10.9k | { |
1397 | 37.8k | while (l) |
1398 | 26.9k | { |
1399 | 26.9k | struct urlpos *next = l->next; |
1400 | 26.9k | if (l->url) |
1401 | 26.9k | url_free (l->url); |
1402 | 26.9k | xfree (l->local_name); |
1403 | 26.9k | xfree (l); |
1404 | 26.9k | l = next; |
1405 | 26.9k | } |
1406 | 10.9k | } |
1407 | | |
1408 | | /* Rotate FNAME opt.backups times */ |
1409 | | void |
1410 | | rotate_backups(const char *fname) |
1411 | 0 | { |
1412 | | #ifdef __VMS |
1413 | | # define SEP "_" |
1414 | | # define AVS ";*" /* All-version suffix. */ |
1415 | | # define AVSL (sizeof (AVS) - 1) |
1416 | | #else |
1417 | 0 | # define SEP "." |
1418 | 0 | # define AVSL 0 |
1419 | 0 | #endif |
1420 | 0 | #define FILE_BUF_SIZE 1024 |
1421 | | |
1422 | | /* avoid alloca() here */ |
1423 | 0 | char from[FILE_BUF_SIZE], to[FILE_BUF_SIZE]; |
1424 | 0 | struct stat sb; |
1425 | 0 | bool overflow; |
1426 | 0 | int i; |
1427 | |
|
1428 | 0 | if (stat (fname, &sb) == 0) |
1429 | 0 | if (S_ISREG (sb.st_mode) == 0) |
1430 | 0 | return; |
1431 | | |
1432 | 0 | for (i = opt.backups; i > 1; i--) |
1433 | 0 | { |
1434 | | #ifdef VMS |
1435 | | /* Delete (all versions of) any existing max-suffix file, to avoid |
1436 | | * creating multiple versions of it. (On VMS, rename() will |
1437 | | * create a new version of an existing destination file, not |
1438 | | * destroy/overwrite it.) |
1439 | | */ |
1440 | | if (i == opt.backups) |
1441 | | { |
1442 | | if (((unsigned) snprintf (to, sizeof (to), "%s%s%d%s", fname, SEP, i, AVS)) >= sizeof (to)) |
1443 | | logprintf (LOG_NOTQUIET, "Failed to delete %s: File name truncation\n", to); |
1444 | | else |
1445 | | delete (to); |
1446 | | } |
1447 | | #endif |
1448 | 0 | overflow = (unsigned) snprintf (to, FILE_BUF_SIZE, "%s%s%d", fname, SEP, i) >= FILE_BUF_SIZE; |
1449 | 0 | overflow |= (unsigned) snprintf (from, FILE_BUF_SIZE, "%s%s%d", fname, SEP, i - 1) >= FILE_BUF_SIZE; |
1450 | |
|
1451 | 0 | if (overflow) |
1452 | 0 | errno = ENAMETOOLONG; |
1453 | 0 | if (overflow || rename (from, to)) |
1454 | 0 | { |
1455 | | // The original file may not exist. In which case rename() will |
1456 | | // return ENOENT. This is not a real error. We could make this better |
1457 | | // by calling stat() first and making sure that the file exists. |
1458 | 0 | if (errno != ENOENT) |
1459 | 0 | logprintf (LOG_NOTQUIET, "Failed to rename %s to %s: (%d) %s\n", |
1460 | 0 | from, to, errno, strerror (errno)); |
1461 | 0 | } |
1462 | 0 | } |
1463 | |
|
1464 | 0 | overflow = (unsigned) snprintf (to, FILE_BUF_SIZE, "%s%s%d", fname, SEP, 1) >= FILE_BUF_SIZE; |
1465 | 0 | if (overflow) |
1466 | 0 | errno = ENAMETOOLONG; |
1467 | 0 | if (overflow || rename(fname, to)) |
1468 | 0 | { |
1469 | 0 | if (errno != ENOENT) |
1470 | 0 | logprintf (LOG_NOTQUIET, "Failed to rename %s to %s: (%d) %s\n", |
1471 | 0 | from, to, errno, strerror (errno)); |
1472 | 0 | } |
1473 | |
|
1474 | 0 | #undef FILE_BUF_SIZE |
1475 | 0 | } |
1476 | | |
1477 | | static bool no_proxy_match (const char *, const char **); |
1478 | | |
1479 | | /* Return the URL of the proxy appropriate for url U. */ |
1480 | | |
1481 | | static char * |
1482 | | getproxy (struct url *u) |
1483 | 0 | { |
1484 | 0 | char *proxy = NULL; |
1485 | 0 | char *rewritten_url; |
1486 | |
|
1487 | 0 | if (!opt.use_proxy) |
1488 | 0 | return NULL; |
1489 | 0 | if (no_proxy_match (u->host, (const char **)opt.no_proxy)) |
1490 | 0 | return NULL; |
1491 | | |
1492 | 0 | switch (u->scheme) |
1493 | 0 | { |
1494 | 0 | case SCHEME_HTTP: |
1495 | 0 | proxy = opt.http_proxy ? opt.http_proxy : getenv ("http_proxy"); |
1496 | 0 | break; |
1497 | 0 | #ifdef HAVE_SSL |
1498 | 0 | case SCHEME_HTTPS: |
1499 | 0 | proxy = opt.https_proxy ? opt.https_proxy : getenv ("https_proxy"); |
1500 | 0 | break; |
1501 | 0 | case SCHEME_FTPS: |
1502 | 0 | proxy = opt.ftp_proxy ? opt.ftp_proxy : getenv ("ftps_proxy"); |
1503 | 0 | break; |
1504 | 0 | #endif |
1505 | 0 | case SCHEME_FTP: |
1506 | 0 | proxy = opt.ftp_proxy ? opt.ftp_proxy : getenv ("ftp_proxy"); |
1507 | 0 | break; |
1508 | 0 | case SCHEME_INVALID: |
1509 | 0 | break; |
1510 | 0 | } |
1511 | 0 | if (!proxy || !*proxy) |
1512 | | #ifdef HAVE_LIBPROXY |
1513 | | { |
1514 | | pxProxyFactory *pf = px_proxy_factory_new (); |
1515 | | if (!pf) |
1516 | | { |
1517 | | DEBUGP (("Allocating memory for libproxy failed")); |
1518 | | return NULL; |
1519 | | } |
1520 | | |
1521 | | DEBUGP (("asking libproxy about url '%s'\n", u->url)); |
1522 | | char **proxies = px_proxy_factory_get_proxies (pf, u->url); |
1523 | | if (proxies) |
1524 | | { |
1525 | | if (proxies[0]) |
1526 | | { |
1527 | | DEBUGP (("libproxy suggest to use '%s'\n", proxies[0])); |
1528 | | if (strcmp (proxies[0], "direct://") != 0) |
1529 | | { |
1530 | | proxy = xstrdup (proxies[0]); |
1531 | | DEBUGP (("libproxy setting to use '%s'\n", proxy)); |
1532 | | } |
1533 | | } |
1534 | | |
1535 | | px_proxy_factory_free_proxies (proxies); |
1536 | | } |
1537 | | px_proxy_factory_free (pf); |
1538 | | |
1539 | | if (!proxy || !*proxy) |
1540 | | return NULL; |
1541 | | } |
1542 | | #else |
1543 | 0 | return NULL; |
1544 | 0 | #endif |
1545 | | |
1546 | | /* Handle shorthands. `rewritten_storage' is a kludge to allow |
1547 | | getproxy() to return static storage. */ |
1548 | 0 | rewritten_url = rewrite_shorthand_url (proxy); |
1549 | 0 | if (rewritten_url) |
1550 | 0 | return rewritten_url; |
1551 | | |
1552 | 0 | return strdup(proxy); |
1553 | 0 | } |
1554 | | |
/* Tell whether U would be downloaded through a proxy, i.e. whether
   getproxy() yields a proxy URL for it.  A NULL U never uses one.  */

bool
url_uses_proxy (struct url * u)
{
  char *proxy_for_u = u ? getproxy (u) : NULL;

  if (!proxy_for_u)
    return false;

  /* Only the existence of a proxy matters; discard the string.  */
  xfree (proxy_for_u);
  return true;
}
1570 | | |
/* Return true if HOST is matched by the NO_PROXY list according to
   sufmatch(); a NULL list matches nothing.  */
static bool
no_proxy_match (const char *host, const char **no_proxy)
{
  return no_proxy != NULL && sufmatch (no_proxy, host);
}
1580 | | |
1581 | | /* Set the file parameter to point to the local file string. */ |
1582 | | void |
1583 | | set_local_file (const char **file, const char *default_file) |
1584 | 0 | { |
1585 | 0 | if (opt.output_document) |
1586 | 0 | { |
1587 | 0 | if (output_stream_regular) |
1588 | 0 | *file = opt.output_document; |
1589 | 0 | } |
1590 | 0 | else |
1591 | 0 | *file = default_file; |
1592 | 0 | } |
1593 | | |
/* Return true for an input file's own URL, false otherwise.  True is
   returned at most once: only for the first call whose INPUT_FILE is
   non-NULL and has a URL scheme.  */
bool
input_file_url (const char *input_file)
{
  static bool first = true;

  if (!input_file || !url_has_scheme (input_file) || !first)
    return false;

  first = false;
  return true;
}
1610 | | |
1611 | | #ifdef TESTING |
1612 | | |
1613 | | #include <stdint.h> |
1614 | | #include "../tests/unit-tests.h" |
1615 | | |
1616 | | const char * |
1617 | | test_retr_rate(void) |
1618 | 0 | { |
1619 | 0 | static const struct test { |
1620 | 0 | wgint bytes; |
1621 | 0 | double secs; |
1622 | 0 | const char *expected; |
1623 | 0 | } tests[] = { |
1624 | 0 | { 0, 1, "0.00 B/s" }, |
1625 | 0 | { INT64_MAX, 1, "100 TB/s" }, |
1626 | 0 | }; |
1627 | |
|
1628 | 0 | for (struct test *t = tests; t < tests+countof(tests); t++) |
1629 | 0 | { |
1630 | 0 | const char *result = retr_rate (t->bytes, t->secs); |
1631 | |
|
1632 | 0 | if (strcmp(result,t->expected)) |
1633 | 0 | return aprintf("%s: Expected '%s', got '%s'", __func__, t->expected, result); |
1634 | 0 | } |
1635 | | |
1636 | 0 | return NULL; |
1637 | 0 | } |
1638 | | |
1639 | | #endif /* TESTING */ |