Coverage Report

Created: 2024-02-25 07:23

/src/wget/src/http.c
Line
Count
Source (jump to first uncovered line)
1
/* HTTP support.
2
   Copyright (C) 1996-2012, 2014-2015, 2018-2024 Free Software
3
   Foundation, Inc.
4
5
This file is part of GNU Wget.
6
7
GNU Wget is free software; you can redistribute it and/or modify
8
it under the terms of the GNU General Public License as published by
9
the Free Software Foundation; either version 3 of the License, or
10
 (at your option) any later version.
11
12
GNU Wget is distributed in the hope that it will be useful,
13
but WITHOUT ANY WARRANTY; without even the implied warranty of
14
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15
GNU General Public License for more details.
16
17
You should have received a copy of the GNU General Public License
18
along with Wget.  If not, see <http://www.gnu.org/licenses/>.
19
20
Additional permission under GNU GPL version 3 section 7
21
22
If you modify this program, or any covered work, by linking or
23
combining it with the OpenSSL project's OpenSSL library (or a
24
modified version of that library), containing parts covered by the
25
terms of the OpenSSL or SSLeay licenses, the Free Software Foundation
26
grants you additional permission to convey the resulting work.
27
Corresponding Source for a non-source form of such a combination
28
shall include the source code for the parts of OpenSSL used as well
29
as that of the covered work.  */
30
31
#include "wget.h"
32
33
#include <stdio.h>
34
#include <stdlib.h>
35
#include <string.h>
36
#include <unistd.h>
37
#include <assert.h>
38
#include <errno.h>
39
#include <time.h>
40
#include <locale.h>
41
#include <fcntl.h>
42
43
#include "hash.h"
44
#include "http.h"
45
#include "hsts.h"
46
#include "utils.h"
47
#include "url.h"
48
#include "host.h"
49
#include "retr.h"
50
#include "connect.h"
51
#include "netrc.h"
52
#ifdef HAVE_SSL
53
# include "ssl.h"
54
#endif
55
#ifdef ENABLE_NTLM
56
# include "http-ntlm.h"
57
#endif
58
#include "cookies.h"
59
#include "md5.h"
60
#include "convert.h"
61
#include "spider.h"
62
#include "warc.h"
63
#include "c-strcase.h"
64
#include "version.h"
65
#include "xstrndup.h"
66
#ifdef HAVE_METALINK
67
# include "metalink.h"
68
#endif
69
#ifdef ENABLE_XATTR
70
#include "xattr.h"
71
#endif
72
73
#ifdef TESTING
74
#include "../tests/unit-tests.h"
75
#endif
76
77
#ifdef __VMS
78
# include "vms.h"
79
#endif /* def __VMS */
80
81
82
/* Forward decls. */
83
struct http_stat;
84
static char *create_authorization_line (const char *, const char *,
85
                                        const char *, const char *,
86
                                        const char *, bool *, uerr_t *);
87
static char *basic_authentication_encode (const char *, const char *);
88
static bool known_authentication_scheme_p (const char *, const char *);
89
static void ensure_extension (struct http_stat *, const char *, int *);
90
static void load_cookies (void);
91
92
static bool cookies_loaded_p;
93
static struct cookie_jar *wget_cookie_jar;
94
95
0
/* Media-type strings, spelled out once so that comparisons against
   them all use the same text.  */
#define TEXTHTML_S   "text/html"
#define TEXTXHTML_S  "application/xhtml+xml"
#define TEXTCSS_S    "text/css"
98
99
/* HTTP status codes.  Most come from HTTP/1.0 (RFC 1945); the ones
   introduced later are marked.  Not every code is used by a predicate
   below; the rest are provided for reference.  */

/* Successful 2xx.  */
#define HTTP_STATUS_OK                    200
#define HTTP_STATUS_CREATED               201
#define HTTP_STATUS_ACCEPTED              202
#define HTTP_STATUS_NO_CONTENT            204
#define HTTP_STATUS_PARTIAL_CONTENTS      206

/* Redirection 3xx.  */
#define HTTP_STATUS_MULTIPLE_CHOICES      300
#define HTTP_STATUS_MOVED_PERMANENTLY     301
#define HTTP_STATUS_MOVED_TEMPORARILY     302
#define HTTP_STATUS_SEE_OTHER             303 /* from HTTP/1.1 */
#define HTTP_STATUS_NOT_MODIFIED          304
#define HTTP_STATUS_TEMPORARY_REDIRECT    307 /* from HTTP/1.1 */
#define HTTP_STATUS_PERMANENT_REDIRECT    308 /* from HTTP/1.1 */

/* Client error 4xx.  */
#define HTTP_STATUS_BAD_REQUEST           400
#define HTTP_STATUS_UNAUTHORIZED          401
#define HTTP_STATUS_FORBIDDEN             403
#define HTTP_STATUS_NOT_FOUND             404
#define HTTP_STATUS_RANGE_NOT_SATISFIABLE 416

/* Server errors 5xx.  */
#define HTTP_STATUS_INTERNAL              500
#define HTTP_STATUS_NOT_IMPLEMENTED       501
#define HTTP_STATUS_BAD_GATEWAY           502
#define HTTP_STATUS_UNAVAILABLE           503
#define HTTP_STATUS_GATEWAY_TIMEOUT       504

/* Status code classification.  Each predicate evaluates its argument
   more than once, so pass an expression without side effects.  */

/* True for any 1xx code.  */
#define H_10X(x)        (((x) >= 100) && ((x) < 200))

/* True for any 2xx code.  */
#define H_20X(x)        (((x) >= 200) && ((x) < 300))

/* True only for 206.  */
#define H_PARTIAL(x)    ((x) == HTTP_STATUS_PARTIAL_CONTENTS)

/* True for the redirect codes 301, 302, 303, 307 and 308.  Note that
   300 and 304 are deliberately not included.  */
#define H_REDIRECTED(x) ((x) == HTTP_STATUS_MOVED_PERMANENTLY          \
                         || (x) == HTTP_STATUS_MOVED_TEMPORARILY       \
                         || (x) == HTTP_STATUS_SEE_OTHER               \
                         || (x) == HTTP_STATUS_TEMPORARY_REDIRECT      \
                         || (x) == HTTP_STATUS_PERMANENT_REDIRECT)
139
140
/* Release policy of a request header: which of its two strings the
   request frees when the header is replaced, removed or destroyed.  */
enum rp {
  rel_none,                     /* free neither string */
  rel_name,                     /* free the name only */
  rel_value,                    /* free the value only */
  rel_both                      /* free both name and value */
};
143
144
struct request {
145
  const char *method;
146
  char *arg;
147
148
  struct request_header {
149
    char *name, *value;
150
    enum rp release_policy;
151
  } *headers;
152
  int hcount, hcapacity;
153
};
154
155
156
/* Create a new, empty request. Set the request's method and its
157
   arguments.  METHOD should be a literal string (or it should outlive
158
   the request) because it will not be freed.  ARG will be freed by
159
   request_free.  */
160
161
static struct request *
162
request_new (const char *method, char *arg)
163
0
{
164
0
  struct request *req = xnew0 (struct request);
165
0
  req->hcapacity = 8;
166
0
  req->headers = xnew_array (struct request_header, req->hcapacity);
167
0
  req->method = method;
168
0
  req->arg = arg;
169
0
  return req;
170
0
}
171
172
/* Return the method string passed with the last call to
173
   request_set_method.  */
174
175
static const char *
176
request_method (const struct request *req)
177
0
{
178
0
  return req->method;
179
0
}
180
181
/* Free one header according to the release policy specified with
182
   request_set_header.  */
183
184
static void
185
release_header (struct request_header *hdr)
186
0
{
187
0
  switch (hdr->release_policy)
188
0
    {
189
0
    case rel_none:
190
0
      break;
191
0
    case rel_name:
192
0
      xfree (hdr->name);
193
0
      break;
194
0
    case rel_value:
195
0
      xfree (hdr->value);
196
0
      break;
197
0
    case rel_both:
198
0
      xfree (hdr->name);
199
0
      xfree (hdr->value);
200
0
      break;
201
0
    }
202
0
}
203
204
/* Set the request named NAME to VALUE.  Specifically, this means that
205
   a "NAME: VALUE\r\n" header line will be used in the request.  If a
206
   header with the same name previously existed in the request, its
207
   value will be replaced by this one.  A NULL value means do nothing.
208
209
   RELEASE_POLICY determines whether NAME and VALUE should be released
210
   (freed) with request_free.  Allowed values are:
211
212
    - rel_none     - don't free NAME or VALUE
213
    - rel_name     - free NAME when done
214
    - rel_value    - free VALUE when done
215
    - rel_both     - free both NAME and VALUE when done
216
217
   Setting release policy is useful when arguments come from different
218
   sources.  For example:
219
220
     // Don't free literal strings!
221
     request_set_header (req, "Pragma", "no-cache", rel_none);
222
223
     // Don't free a global variable, we'll need it later.
224
     request_set_header (req, "Referer", opt.referer, rel_none);
225
226
     // Value freshly allocated, free it when done.
227
     request_set_header (req, "Range",
228
                         aprintf ("bytes=%s-", number_to_static_string (hs->restval)),
229
                         rel_value);
230
   */
231
232
static void
233
request_set_header (struct request *req, const char *name, const char *value,
234
                    enum rp release_policy)
235
0
{
236
0
  struct request_header *hdr;
237
0
  int i;
238
239
0
  if (!value)
240
0
    {
241
      /* A NULL value is a no-op; if freeing the name is requested,
242
         free it now to avoid leaks.  */
243
0
      if (release_policy == rel_name || release_policy == rel_both)
244
0
        xfree (name);
245
0
      return;
246
0
    }
247
248
0
  for (i = 0; i < req->hcount; i++)
249
0
    {
250
0
      hdr = &req->headers[i];
251
0
      if (0 == c_strcasecmp (name, hdr->name))
252
0
        {
253
          /* Replace existing header. */
254
0
          release_header (hdr);
255
0
          hdr->name = (void *)name;
256
0
          hdr->value = (void *)value;
257
0
          hdr->release_policy = release_policy;
258
0
          return;
259
0
        }
260
0
    }
261
262
  /* Install new header. */
263
264
0
  if (req->hcount >= req->hcapacity)
265
0
    {
266
0
      req->hcapacity <<= 1;
267
0
      req->headers = xrealloc (req->headers, req->hcapacity * sizeof (*hdr));
268
0
    }
269
0
  hdr = &req->headers[req->hcount++];
270
0
  hdr->name = (void *)name;
271
0
  hdr->value = (void *)value;
272
0
  hdr->release_policy = release_policy;
273
0
}
274
275
/* Like request_set_header, but sets the whole header line, as
276
   provided by the user using the `--header' option.  For example,
277
   request_set_user_header (req, "Foo: bar") works just like
278
   request_set_header (req, "Foo", "bar").  */
279
280
static void
281
request_set_user_header (struct request *req, const char *header)
282
0
{
283
0
  const char *name, *p;
284
285
0
  if (!(p = strchr (header, ':')))
286
0
    return;
287
288
0
  name = xstrndup(header, p - header);
289
290
0
  ++p;
291
0
  while (c_isspace (*p))
292
0
    ++p;
293
294
0
  request_set_header (req, name, p, rel_name);
295
0
}
296
297
/* Remove the header with specified name from REQ.  Returns true if
298
   the header was actually removed, false otherwise.  */
299
300
static bool
301
request_remove_header (struct request *req, const char *name)
302
0
{
303
0
  int i;
304
0
  for (i = 0; i < req->hcount; i++)
305
0
    {
306
0
      struct request_header *hdr = &req->headers[i];
307
0
      if (0 == c_strcasecmp (name, hdr->name))
308
0
        {
309
0
          release_header (hdr);
310
          /* Move the remaining headers by one. */
311
0
          if (i < req->hcount - 1)
312
0
            memmove (hdr, hdr + 1, (req->hcount - i - 1) * sizeof (*hdr));
313
0
          --req->hcount;
314
0
          return true;
315
0
        }
316
0
    }
317
0
  return false;
318
0
}
319
320
0
#define APPEND(p, str) do {                     \
321
0
  int A_len = strlen (str);                     \
322
0
  memcpy (p, str, A_len);                       \
323
0
  p += A_len;                                   \
324
0
} while (0)
325
326
/* Construct the request and write it to FD using fd_write.
327
   If warc_tmp is set to a file pointer, the request string will
328
   also be written to that file. */
329
330
static int
331
request_send (const struct request *req, int fd, FILE *warc_tmp)
332
0
{
333
0
  char *request_string, *p;
334
0
  int i, size, write_error;
335
336
  /* Count the request size. */
337
0
  size = 0;
338
339
  /* METHOD " " ARG " " "HTTP/1.0" "\r\n" */
340
0
  size += strlen (req->method) + 1 + strlen (req->arg) + 1 + 8 + 2;
341
342
0
  for (i = 0; i < req->hcount; i++)
343
0
    {
344
0
      struct request_header *hdr = &req->headers[i];
345
      /* NAME ": " VALUE "\r\n" */
346
0
      size += strlen (hdr->name) + 2 + strlen (hdr->value) + 2;
347
0
    }
348
349
  /* "\r\n\0" */
350
0
  size += 3;
351
352
0
  p = request_string = xmalloc (size);
353
354
  /* Generate the request. */
355
356
0
  APPEND (p, req->method); *p++ = ' ';
357
0
  APPEND (p, req->arg);    *p++ = ' ';
358
0
  memcpy (p, "HTTP/1.1\r\n", 10); p += 10;
359
360
0
  for (i = 0; i < req->hcount; i++)
361
0
    {
362
0
      struct request_header *hdr = &req->headers[i];
363
0
      APPEND (p, hdr->name);
364
0
      *p++ = ':', *p++ = ' ';
365
0
      APPEND (p, hdr->value);
366
0
      *p++ = '\r', *p++ = '\n';
367
0
    }
368
369
0
  *p++ = '\r', *p++ = '\n', *p++ = '\0';
370
0
  assert (p - request_string == size);
371
372
0
#undef APPEND
373
374
0
  DEBUGP (("\n---request begin---\n%s---request end---\n", request_string));
375
376
  /* Send the request to the server. */
377
378
0
  write_error = fd_write (fd, request_string, size - 1, -1);
379
0
  if (write_error < 0)
380
0
    logprintf (LOG_VERBOSE, _("Failed writing HTTP request: %s.\n"),
381
0
               fd_errstr (fd));
382
0
  else if (warc_tmp != NULL)
383
0
    {
384
      /* Write a copy of the data to the WARC record. */
385
0
      int warc_tmp_written = fwrite (request_string, 1, size - 1, warc_tmp);
386
0
      if (warc_tmp_written != size - 1)
387
0
        write_error = -2;
388
0
    }
389
0
  xfree (request_string);
390
0
  return write_error;
391
0
}
392
393
/* Release the resources used by REQ.
394
   It is safe to call it with a valid pointer to a NULL pointer.
395
   It is not safe to call it with an invalid or NULL pointer.  */
396
397
static void
398
request_free (struct request **req_ref)
399
0
{
400
0
  int i;
401
0
  struct request *req = *req_ref;
402
403
0
  if (!req)
404
0
    return;
405
406
0
  xfree (req->arg);
407
0
  for (i = 0; i < req->hcount; i++)
408
0
    release_header (&req->headers[i]);
409
0
  xfree (req->headers);
410
0
  xfree (req);
411
0
  *req_ref = NULL;
412
0
}
413
414
static struct hash_table *basic_authed_hosts;
415
416
/* Find out if this host has issued a Basic challenge yet; if so, give
417
 * it the username, password. A temporary measure until we can get
418
 * proper authentication in place. */
419
420
static bool
421
maybe_send_basic_creds (const char *hostname, const char *user,
422
                        const char *passwd, struct request *req)
423
0
{
424
0
  bool do_challenge = false;
425
426
0
  if (opt.auth_without_challenge)
427
0
    {
428
0
      DEBUGP (("Auth-without-challenge set, sending Basic credentials.\n"));
429
0
      do_challenge = true;
430
0
    }
431
0
  else if (basic_authed_hosts
432
0
      && hash_table_contains (basic_authed_hosts, hostname))
433
0
    {
434
0
      DEBUGP (("Found %s in basic_authed_hosts.\n", quote (hostname)));
435
0
      do_challenge = true;
436
0
    }
437
0
  else
438
0
    {
439
0
      DEBUGP (("Host %s has not issued a general basic challenge.\n",
440
0
              quote (hostname)));
441
0
    }
442
0
  if (do_challenge)
443
0
    {
444
0
      request_set_header (req, "Authorization",
445
0
                          basic_authentication_encode (user, passwd),
446
0
                          rel_value);
447
0
    }
448
0
  return do_challenge;
449
0
}
450
451
static void
452
register_basic_auth_host (const char *hostname)
453
0
{
454
0
  if (!basic_authed_hosts)
455
0
    {
456
0
      basic_authed_hosts = make_nocase_string_hash_table (1);
457
0
    }
458
0
  if (!hash_table_contains (basic_authed_hosts, hostname))
459
0
    {
460
0
      hash_table_put (basic_authed_hosts, xstrdup (hostname), NULL);
461
0
      DEBUGP (("Inserted %s into basic_authed_hosts\n", quote (hostname)));
462
0
    }
463
0
}
464
465
/* Send the contents of FILE_NAME to SOCK.  Make sure that exactly
466
   PROMISED_SIZE bytes are sent over the wire -- if the file is
467
   longer, read only that much; if the file is shorter, report an error.
468
   If warc_tmp is set to a file pointer, the post data will
469
   also be written to that file.  */
470
471
static int
472
body_file_send (int sock, const char *file_name, wgint promised_size, FILE *warc_tmp)
473
0
{
474
0
  static char chunk[8192];
475
0
  wgint written = 0;
476
0
  int write_error;
477
0
  FILE *fp;
478
479
0
  DEBUGP (("[writing BODY file %s ... ", file_name));
480
481
0
  fp = fopen (file_name, "rb");
482
0
  if (!fp)
483
0
    return -1;
484
0
  while (!feof (fp) && written < promised_size)
485
0
    {
486
0
      int towrite;
487
0
      int length = fread (chunk, 1, sizeof (chunk), fp);
488
0
      if (length == 0)
489
0
        break;
490
0
      towrite = MIN (promised_size - written, length);
491
0
      write_error = fd_write (sock, chunk, towrite, -1);
492
0
      if (write_error < 0)
493
0
        {
494
0
          fclose (fp);
495
0
          return -1;
496
0
        }
497
0
      if (warc_tmp != NULL)
498
0
        {
499
          /* Write a copy of the data to the WARC record. */
500
0
          int warc_tmp_written = fwrite (chunk, 1, towrite, warc_tmp);
501
0
          if (warc_tmp_written != towrite)
502
0
            {
503
0
              fclose (fp);
504
0
              return -2;
505
0
            }
506
0
        }
507
0
      written += towrite;
508
0
    }
509
0
  fclose (fp);
510
511
  /* If we've written less than was promised, report a (probably
512
     nonsensical) error rather than break the promise.  */
513
0
  if (written < promised_size)
514
0
    {
515
0
      errno = EINVAL;
516
0
      return -1;
517
0
    }
518
519
0
  assert (written == promised_size);
520
0
  DEBUGP (("done]\n"));
521
0
  return 0;
522
0
}
523
524
/* Determine whether [START, PEEKED + PEEKLEN) contains an empty line,
   i.e. the end of the response head.  If so, return the pointer to
   the position right after that line, otherwise return NULL.  This
   is used as the terminator callback of fd_read_hunk.  The data
   between START and PEEKED has been read and cannot be "unread"; the
   PEEKLEN bytes from PEEKED on have only been peeked.

   Special case: on the very first peek (START == PEEKED), if the
   data does not begin with "HTTP" (or with as much of it as has been
   peeked), START itself is returned so that nothing gets read and
   the caller can treat everything as an HTTP/0.9 body.  */

static const char *
response_head_terminator (const char *start, const char *peeked, int peeklen)
{
  const char *p, *end;

  /* If at first peek, verify whether the hunk starts with "HTTP".
     If not, this is a HTTP/0.9 response and we must bail out without
     reading anything.  */
  if (start == peeked
      && 0 != memcmp (start, "HTTP", peeklen < 4 ? peeklen : 4))
    return start;

  /* Look for "\n[\r]\n", and return the following position if found.
     Start two chars before the current to cover the possibility that
     part of the terminator (e.g. "\n\r") arrived in the previous
     batch.  */
  p = peeked - start < 2 ? start : peeked - 2;
  end = peeked + peeklen;

  /* Check for \n\r\n or \n\n wherever at least three bytes remain.
     The bound is expressed as a distance so that no pointer before
     START is ever formed when fewer than two bytes are available.  */
  for (; end - p > 2; p++)
    if (*p == '\n')
      {
        if (p[1] == '\r' && p[2] == '\n')
          return p + 3;
        else if (p[1] == '\n')
          return p + 2;
      }

  /* Exactly two bytes left: check for \n\n directly preceding END.
     The test is on the bytes actually available rather than on
     PEEKLEN, so a terminator whose first \n was read earlier and
     whose second \n is the only byte peeked now is recognized.  */
  if (end - p >= 2 && p[0] == '\n' && p[1] == '\n')
    return p + 2;

  return NULL;
}
563
564
/* The maximum size of a single HTTP response we care to read.  Rather
565
   than being a limit of the reader implementation, this limit
566
   prevents Wget from slurping all available memory upon encountering
567
   malicious or buggy server output, thus protecting the user.  Define
568
   it to 0 to remove the limit.  */
569
570
0
#define HTTP_RESPONSE_MAX_SIZE 65536
571
572
/* Read the HTTP request head from FD and return it.  The error
573
   conditions are the same as with fd_read_hunk.
574
575
   To support HTTP/0.9 responses, this function tries to make sure
576
   that the data begins with "HTTP".  If this is not the case, no data
577
   is read and an empty request is returned, so that the remaining
578
   data can be treated as body.  */
579
580
static char *
581
read_http_response_head (int fd)
582
0
{
583
0
  return fd_read_hunk (fd, response_head_terminator, 512,
584
0
                       HTTP_RESPONSE_MAX_SIZE);
585
0
}
586
587
/* A response head together with an index of where its lines start.  */
struct response {
  /* The raw text of the response head.  resp_free does not free
     it.  */
  const char *data;

  /* Pointers to the start of each header line within DATA, followed
     by a NULL entry.  Left NULL for a headerless (HTTP/0.9) response.
     Given this response:

       HTTP/1.0 200 Ok
       Description: some
        text
       Etag: x

     the pointers are laid out like this:

     "HTTP/1.0 200 Ok\r\nDescription: some\r\n text\r\nEtag: x\r\n\r\n"
     ^                   ^                             ^          ^
     headers[0]          headers[1]                    headers[2] headers[3]

     That is, headers[0] is the status line, and each later entry
     marks both the end of the previous line and the start of the next
     one.  A continuation line (" text" above) belongs to the header
     before it.  The last non-NULL entry points at the terminating
     empty line.  */
  const char **headers;
};
611
612
/* Create a new response object from the text of the HTTP response,
613
   available in HEAD.  That text is automatically split into
614
   constituent header lines for fast retrieval using
615
   resp_header_*.  */
616
617
static struct response *
618
resp_new (char *head)
619
0
{
620
0
  char *hdr;
621
0
  int count, size;
622
623
0
  struct response *resp = xnew0 (struct response);
624
0
  resp->data = head;
625
626
0
  if (*head == '\0')
627
0
    {
628
      /* Empty head means that we're dealing with a headerless
629
         (HTTP/0.9) response.  In that case, don't set HEADERS at
630
         all.  */
631
0
      return resp;
632
0
    }
633
634
  /* Split HEAD into header lines, so that resp_header_* functions
635
     don't need to do this over and over again.  */
636
637
0
  size = count = 0;
638
0
  hdr = head;
639
0
  while (1)
640
0
    {
641
0
      DO_REALLOC (resp->headers, size, count + 1, const char *);
642
0
      resp->headers[count++] = hdr;
643
644
      /* Break upon encountering an empty line. */
645
0
      if (!hdr[0] || (hdr[0] == '\r' && hdr[1] == '\n') || hdr[0] == '\n')
646
0
        break;
647
648
      /* Find the end of HDR, including continuations. */
649
0
      for (;;)
650
0
        {
651
0
          char *end = strchr (hdr, '\n');
652
653
0
          if (!end)
654
0
            {
655
0
              hdr += strlen (hdr);
656
0
              break;
657
0
            }
658
659
0
          hdr = end + 1;
660
661
0
          if (*hdr != ' ' && *hdr != '\t')
662
0
            break;
663
664
          // continuation, transform \r and \n into spaces
665
0
          *end = ' ';
666
0
          if (end > head && end[-1] == '\r')
667
0
            end[-1] = ' ';
668
0
        }
669
0
    }
670
0
  DO_REALLOC (resp->headers, size, count + 1, const char *);
671
0
  resp->headers[count] = NULL;
672
673
0
  return resp;
674
0
}
675
676
/* Locate the header named NAME in the request data, starting with
677
   position START.  This allows the code to loop through the request
678
   data, filtering for all requests of a given name.  Returns the
679
   found position, or -1 for failure.  The code that uses this
680
   function typically looks like this:
681
682
     for (pos = 0; (pos = resp_header_locate (...)) != -1; pos++)
683
       ... do something with header ...
684
685
   If you only care about one header, use resp_header_get instead of
686
   this function.  */
687
688
static int
689
resp_header_locate (const struct response *resp, const char *name, int start,
690
                    const char **begptr, const char **endptr)
691
0
{
692
0
  int i;
693
0
  const char **headers = resp->headers;
694
0
  int name_len;
695
696
0
  if (!headers || !headers[1])
697
0
    return -1;
698
699
0
  name_len = strlen (name);
700
0
  if (start > 0)
701
0
    i = start;
702
0
  else
703
0
    i = 1;
704
705
0
  for (; headers[i + 1]; i++)
706
0
    {
707
0
      const char *b = headers[i];
708
0
      const char *e = headers[i + 1];
709
0
      if (e - b > name_len
710
0
          && b[name_len] == ':'
711
0
          && 0 == c_strncasecmp (b, name, name_len))
712
0
        {
713
0
          b += name_len + 1;
714
0
          while (b < e && c_isspace (*b))
715
0
            ++b;
716
0
          while (b < e && c_isspace (e[-1]))
717
0
            --e;
718
0
          *begptr = b;
719
0
          *endptr = e;
720
0
          return i;
721
0
        }
722
0
    }
723
0
  return -1;
724
0
}
725
726
/* Look up the first header named NAME in RESP.  If it exists, set
   *BEGPTR and *ENDPTR to the start and end of its value and return
   true; otherwise return false.

   resp_header_copy and resp_header_strdup are built on top of this
   function.  */

static bool
resp_header_get (const struct response *resp, const char *name,
                 const char **begptr, const char **endptr)
{
  return resp_header_locate (resp, name, 0, begptr, endptr) != -1;
}
740
741
/* Copy the value of the header named NAME into BUF as a
   NUL-terminated string, truncating it to at most BUFSIZE - 1 bytes
   (BUFSIZE counts the terminating NUL).  Returns true if the header
   exists, false otherwise.  For values of unbounded length use
   resp_header_strdup instead.

   With a BUFSIZE of 0 nothing is written to BUF, but the return
   value still tells whether the header is present.  */

static bool
resp_header_copy (const struct response *resp, const char *name,
                  char *buf, int bufsize)
{
  const char *beg, *end;
  int ncopy;

  if (!resp_header_get (resp, name, &beg, &end))
    return false;

  if (bufsize == 0)
    return true;

  ncopy = MIN (end - beg, bufsize - 1);
  memcpy (buf, beg, ncopy);
  buf[ncopy] = '\0';
  return true;
}
764
765
/* Return a newly allocated copy of the value of the header named
   NAME in RESP, or NULL if RESP has no such header.  */

static char *
resp_header_strdup (const struct response *resp, const char *name)
{
  const char *beg, *end;

  return resp_header_get (resp, name, &beg, &end)
         ? strdupdelim (beg, end)
         : NULL;
}
776
777
/* Parse the HTTP status line, which is of format:
778
779
   HTTP-Version SP Status-Code SP Reason-Phrase
780
781
   The function returns the status-code, or -1 if the status line
782
   appears malformed.  The pointer to "reason-phrase" message is
783
   returned in *MESSAGE.  */
784
785
static int
786
resp_status (const struct response *resp, char **message)
787
0
{
788
0
  int status;
789
0
  const char *p, *end;
790
791
0
  if (!resp->headers)
792
0
    {
793
      /* For a HTTP/0.9 response, assume status 200. */
794
0
      if (message)
795
0
        *message = xstrdup (_("No headers, assuming HTTP/0.9"));
796
0
      return 200;
797
0
    }
798
799
0
  p = resp->headers[0];
800
0
  end = resp->headers[1];
801
802
0
  if (!end)
803
0
    return -1;
804
805
  /* "HTTP" */
806
0
  if (end - p < 4 || 0 != strncmp (p, "HTTP", 4))
807
0
    return -1;
808
0
  p += 4;
809
810
  /* Match the HTTP version.  This is optional because Gnutella
811
     servers have been reported to not specify HTTP version.  */
812
0
  if (p < end && *p == '/')
813
0
    {
814
0
      ++p;
815
0
      while (p < end && c_isdigit (*p))
816
0
        ++p;
817
0
      if (p < end && *p == '.')
818
0
        ++p;
819
0
      while (p < end && c_isdigit (*p))
820
0
        ++p;
821
0
    }
822
823
0
  while (p < end && c_isspace (*p))
824
0
    ++p;
825
0
  if (end - p < 3 || !c_isdigit (p[0]) || !c_isdigit (p[1]) || !c_isdigit (p[2]))
826
0
    return -1;
827
828
0
  status = 100 * (p[0] - '0') + 10 * (p[1] - '0') + (p[2] - '0');
829
0
  p += 3;
830
831
0
  if (message)
832
0
    {
833
0
      while (p < end && c_isspace (*p))
834
0
        ++p;
835
0
      while (p < end && c_isspace (end[-1]))
836
0
        --end;
837
0
      *message = strdupdelim (p, end);
838
0
    }
839
840
0
  return status;
841
0
}
842
843
/* Release the resources used by RESP.
844
   It is safe to call it with a valid pointer to a NULL pointer.
845
   It is not safe to call it with a invalid or NULL pointer.  */
846
847
static void
848
resp_free (struct response **resp_ref)
849
0
{
850
0
  struct response *resp = *resp_ref;
851
852
0
  if (!resp)
853
0
    return;
854
855
0
  xfree (resp->headers);
856
0
  xfree (resp);
857
858
0
  *resp_ref = NULL;
859
0
}
860
861
/* Print a single line of response, the characters [b, e).  We tried
862
   getting away with
863
      logprintf (LOG_VERBOSE, "%s%.*s\n", prefix, (int) (e - b), b);
864
   but that failed to escape the non-printable characters and, in fact,
865
   caused crashes in UTF-8 locales.  */
866
867
static void
868
print_response_line (const char *prefix, const char *b, const char *e)
869
0
{
870
0
  char buf[1024], *copy;
871
0
  size_t len = e - b;
872
873
0
  if (len < sizeof (buf))
874
0
    copy = buf;
875
0
  else
876
0
    copy = xmalloc(len + 1);
877
878
0
  memcpy(copy, b, len);
879
0
  copy[len] = 0;
880
881
0
  logprintf (LOG_ALWAYS, "%s%s\n", prefix,
882
0
             quotearg_style (escape_quoting_style, copy));
883
884
0
  if (copy != buf)
885
0
    xfree (copy);
886
0
}
887
888
/* Print the server response, line by line, omitting the trailing CRLF
889
   from individual header lines, and prefixed with PREFIX.  */
890
891
static void
892
print_server_response (const struct response *resp, const char *prefix)
893
0
{
894
0
  int i;
895
0
  if (!resp->headers)
896
0
    return;
897
0
  for (i = 0; resp->headers[i + 1]; i++)
898
0
    {
899
0
      const char *b = resp->headers[i];
900
0
      const char *e = resp->headers[i + 1];
901
      /* Skip CRLF */
902
0
      if (b < e && e[-1] == '\n')
903
0
        --e;
904
0
      if (b < e && e[-1] == '\r')
905
0
        --e;
906
0
      print_response_line (prefix, b, e);
907
0
    }
908
0
}
909
910
/* Parse the `Content-Range' header and extract the information it
911
   contains.  Returns true if successful, false otherwise.  */
912
static bool
913
parse_content_range (const char *hdr, wgint *first_byte_ptr,
914
                     wgint *last_byte_ptr, wgint *entity_length_ptr)
915
0
{
916
0
  wgint num;
917
918
  /* Ancient versions of Netscape proxy server, presumably predating
919
     rfc2068, sent out `Content-Range' without the "bytes"
920
     specifier.  */
921
0
  if (0 == strncasecmp (hdr, "bytes", 5))
922
0
    {
923
0
      hdr += 5;
924
      /* "JavaWebServer/1.1.1" sends "bytes: x-y/z", contrary to the
925
         HTTP spec. */
926
0
      if (*hdr == ':')
927
0
        ++hdr;
928
0
      while (c_isspace (*hdr))
929
0
        ++hdr;
930
0
      if (!*hdr)
931
0
        return false;
932
0
    }
933
0
  if (!c_isdigit (*hdr))
934
0
    return false;
935
0
  for (num = 0; c_isdigit (*hdr); hdr++)
936
0
    num = 10 * num + (*hdr - '0');
937
0
  if (*hdr != '-' || !c_isdigit (*(hdr + 1)))
938
0
    return false;
939
0
  *first_byte_ptr = num;
940
0
  ++hdr;
941
0
  for (num = 0; c_isdigit (*hdr); hdr++)
942
0
    num = 10 * num + (*hdr - '0');
943
0
  if (*hdr != '/')
944
0
    return false;
945
0
  *last_byte_ptr = num;
946
0
  if (!(c_isdigit (*(hdr + 1)) || *(hdr + 1) == '*'))
947
0
    return false;
948
0
  if (*last_byte_ptr < *first_byte_ptr)
949
0
    return false;
950
0
  ++hdr;
951
0
  if (*hdr == '*')
952
0
    num = -1;
953
0
  else
954
0
    for (num = 0; c_isdigit (*hdr); hdr++)
955
0
      num = 10 * num + (*hdr - '0');
956
0
  *entity_length_ptr = num;
957
0
  if ((*entity_length_ptr <= *last_byte_ptr) && *entity_length_ptr != -1)
958
0
    return false;
959
0
  return true;
960
0
}
961
962
/* Read the body of the request, but don't store it anywhere and don't
963
   display a progress gauge.  This is useful for reading the bodies of
964
   administrative responses to which we will soon issue another
965
   request.  The response is not useful to the user, but reading it
966
   allows us to continue using the same connection to the server.
967
968
   If reading fails, false is returned, true otherwise.  In debug
969
   mode, the body is displayed for debugging purposes.  */
970
971
static bool
972
skip_short_body (int fd, wgint contlen, bool chunked)
973
0
{
974
0
  enum {
975
0
    SKIP_SIZE = 512,                /* size of the download buffer */
976
0
    SKIP_THRESHOLD = 4096        /* the largest size we read */
977
0
  };
978
0
  wgint remaining_chunk_size = 0;
979
0
  char dlbuf[SKIP_SIZE + 1];
980
0
  dlbuf[SKIP_SIZE] = '\0';        /* so DEBUGP can safely print it */
981
982
  /* If the body is too large, it makes more sense to simply close the
983
     connection than to try to read the body.  */
984
0
  if (contlen > SKIP_THRESHOLD)
985
0
    return false;
986
987
0
  while (contlen > 0 || chunked)
988
0
    {
989
0
      int ret;
990
0
      if (chunked)
991
0
        {
992
0
          if (remaining_chunk_size == 0)
993
0
            {
994
0
              char *line = fd_read_line (fd);
995
0
              char *endl;
996
0
              if (line == NULL)
997
0
                break;
998
999
0
              remaining_chunk_size = strtol (line, &endl, 16);
1000
0
              xfree (line);
1001
1002
0
              if (remaining_chunk_size < 0)
1003
0
                return false;
1004
1005
0
              if (remaining_chunk_size == 0)
1006
0
                {
1007
0
                  line = fd_read_line (fd);
1008
0
                  xfree (line);
1009
0
                  break;
1010
0
                }
1011
0
            }
1012
1013
0
          contlen = MIN (remaining_chunk_size, SKIP_SIZE);
1014
0
        }
1015
1016
0
      DEBUGP (("Skipping %s bytes of body: [", number_to_static_string (contlen)));
1017
1018
0
      ret = fd_read (fd, dlbuf, MIN (contlen, SKIP_SIZE), -1);
1019
0
      if (ret <= 0)
1020
0
        {
1021
          /* Don't normally report the error since this is an
1022
             optimization that should be invisible to the user.  */
1023
0
          DEBUGP (("] aborting (%s).\n",
1024
0
                   ret < 0 ? fd_errstr (fd) : "EOF received"));
1025
0
          return false;
1026
0
        }
1027
0
      contlen -= ret;
1028
1029
0
      if (chunked)
1030
0
        {
1031
0
          remaining_chunk_size -= ret;
1032
0
          if (remaining_chunk_size == 0)
1033
0
            {
1034
0
              char *line = fd_read_line (fd);
1035
0
              if (line == NULL)
1036
0
                return false;
1037
0
              else
1038
0
                xfree (line);
1039
0
            }
1040
0
        }
1041
1042
      /* Safe even if %.*s bogusly expects terminating \0 because
1043
         we've zero-terminated dlbuf above.  */
1044
0
      DEBUGP (("%.*s", ret, dlbuf));
1045
0
    }
1046
1047
0
  DEBUGP (("] done.\n"));
1048
0
  return true;
1049
0
}
1050
1051
31.9k
/* Classification of a parameter name, as returned by
   modify_param_name and consumed by modify_param_value.  */
#define NOT_RFC2231        0    /* name contains no '*' */
#define RFC2231_NOENCODING 1    /* a single '*' that is not the last character */
#define RFC2231_ENCODING   2    /* a trailing '*', or more than one '*' */
1054
1055
/* extract_param extracts the parameter name into NAME.
1056
   However, if the parameter name is in RFC2231 format then
1057
   this function adjusts NAME by stripping of the trailing
1058
   characters that are not part of the name but are present to
1059
   indicate the presence of encoding information in the value
1060
   or a fragment of a long parameter value
1061
*/
1062
static int
1063
modify_param_name (param_token *name)
1064
18.7k
{
1065
18.7k
  const char *delim1 = memchr (name->b, '*', name->e - name->b);
1066
18.7k
  const char *delim2 = memrchr (name->b, '*', name->e - name->b);
1067
1068
18.7k
  int result;
1069
1070
18.7k
  if (delim1 == NULL)
1071
13.2k
    {
1072
13.2k
      result = NOT_RFC2231;
1073
13.2k
    }
1074
5.51k
  else if (delim1 == delim2)
1075
4.54k
    {
1076
4.54k
      if ((name->e - 1) == delim1)
1077
3.57k
        {
1078
3.57k
          result = RFC2231_ENCODING;
1079
3.57k
        }
1080
969
      else
1081
969
        {
1082
969
          result = RFC2231_NOENCODING;
1083
969
        }
1084
4.54k
      name->e = delim1;
1085
4.54k
    }
1086
969
  else
1087
969
    {
1088
969
      name->e = delim1;
1089
969
      result = RFC2231_ENCODING;
1090
969
    }
1091
18.7k
  return result;
1092
18.7k
}
1093
1094
/* extract_param extract the parameter value into VALUE.
1095
   Like modify_param_name this function modifies VALUE by
1096
   stripping off the encoding information from the actual value
1097
*/
1098
static void
1099
modify_param_value (param_token *value, int encoding_type )
1100
5.51k
{
1101
5.51k
  if (encoding_type == RFC2231_ENCODING)
1102
4.54k
    {
1103
4.54k
      const char *delim = memrchr (value->b, '\'', value->e - value->b);
1104
4.54k
      if (delim != NULL)
1105
969
        {
1106
969
          value->b = (delim+1);
1107
969
        }
1108
4.54k
    }
1109
5.51k
}
1110
1111
/* Extract a parameter from the string (typically an HTTP header) at
1112
   **SOURCE and advance SOURCE to the next parameter.  Return false
1113
   when there are no more parameters to extract.  The name of the
1114
   parameter is returned in NAME, and the value in VALUE.  If the
1115
   parameter has no value, the token's value is zeroed out.
1116
1117
   For example, if *SOURCE points to the string "attachment;
1118
   filename=\"foo bar\"", the first call to this function will return
1119
   the token named "attachment" and no value, and the second call will
1120
   return the token named "filename" and value "foo bar".  The third
1121
   call will return false, indicating no more valid tokens.
1122
1123
   is_url_encoded is an out parameter. If not NULL, a boolean value will be
1124
   stored into it, letting the caller know whether or not the extracted value is
1125
   URL-encoded. The caller can then decode it with url_unescape(), which however
1126
   performs decoding in-place. URL-encoding is used by RFC 2231 to support
1127
   non-US-ASCII characters in HTTP header values.  */
1128
1129
bool
1130
extract_param (const char **source, param_token *name, param_token *value,
1131
               char separator, bool *is_url_encoded)
1132
33.9k
{
1133
33.9k
  const char *p = *source;
1134
33.9k
  int param_type;
1135
33.9k
  if (is_url_encoded)
1136
0
    *is_url_encoded = false;   /* initializing the out parameter */
1137
1138
39.7k
  while (c_isspace (*p)) ++p;
1139
33.9k
  if (!*p)
1140
2.44k
    {
1141
2.44k
      *source = p;
1142
2.44k
      return false;             /* no error; nothing more to extract */
1143
2.44k
    }
1144
1145
  /* Extract name. */
1146
31.4k
  name->b = p;
1147
277k
  while (*p && !c_isspace (*p) && *p != '=' && *p != separator) ++p;
1148
31.4k
  name->e = p;
1149
31.4k
  if (name->b == name->e)
1150
84
    return false;               /* empty name: error */
1151
33.8k
  while (c_isspace (*p)) ++p;
1152
31.3k
  if (*p == separator || !*p)           /* no value */
1153
12.3k
    {
1154
12.3k
      xzero (*value);
1155
12.3k
      if (*p == separator) ++p;
1156
12.3k
      *source = p;
1157
12.3k
      return true;
1158
12.3k
    }
1159
18.9k
  if (*p != '=')
1160
186
    return false;               /* error */
1161
1162
  /* *p is '=', extract value */
1163
18.7k
  ++p;
1164
19.7k
  while (c_isspace (*p)) ++p;
1165
18.7k
  if (*p == '"')                /* quoted */
1166
1.65k
    {
1167
1.65k
      value->b = ++p;
1168
4.43k
      while (*p && *p != '"') ++p;
1169
1.65k
      if (!*p)
1170
66
        return false;
1171
1.59k
      value->e = p++;
1172
      /* Currently at closing quote; find the end of param. */
1173
2.56k
      while (c_isspace (*p)) ++p;
1174
2.63k
      while (*p && *p != separator) ++p;
1175
1.59k
      if (*p == separator)
1176
1.45k
        ++p;
1177
138
      else if (*p)
1178
        /* garbage after closed quote, e.g. foo="bar"baz */
1179
0
        return false;
1180
1.59k
    }
1181
17.1k
  else                          /* unquoted */
1182
17.1k
    {
1183
17.1k
      value->b = p;
1184
84.5k
      while (*p && *p != separator) ++p;
1185
17.1k
      value->e = p;
1186
19.8k
      while (value->e != value->b && c_isspace (value->e[-1]))
1187
2.72k
        --value->e;
1188
17.1k
      if (*p == separator) ++p;
1189
17.1k
    }
1190
18.7k
  *source = p;
1191
1192
18.7k
  param_type = modify_param_name (name);
1193
18.7k
  if (param_type != NOT_RFC2231)
1194
5.51k
    {
1195
5.51k
      if (param_type == RFC2231_ENCODING && is_url_encoded)
1196
0
        *is_url_encoded = true;
1197
5.51k
      modify_param_value (value, param_type);
1198
5.51k
    }
1199
18.7k
  return true;
1200
18.7k
}
1201
1202
#undef NOT_RFC2231
1203
#undef RFC2231_NOENCODING
1204
#undef RFC2231_ENCODING
1205
1206
/* Appends the string represented by VALUE to FILENAME */
1207
1208
static void
1209
append_value_to_filename (char **filename, param_token const * const value,
1210
                          bool is_url_encoded)
1211
0
{
1212
0
  int original_length = strlen (*filename);
1213
0
  int new_length = strlen (*filename) + (value->e - value->b);
1214
0
  *filename = xrealloc (*filename, new_length+1);
1215
0
  memcpy (*filename + original_length, value->b, (value->e - value->b));
1216
0
  (*filename)[new_length] = '\0';
1217
0
  if (is_url_encoded)
1218
0
    url_unescape (*filename + original_length);
1219
0
}
1220
1221
/* Parse the contents of the `Content-Disposition' header, extracting
1222
   the information useful to Wget.  Content-Disposition is a header
1223
   borrowed from MIME; when used in HTTP, it typically serves for
1224
   specifying the desired file name of the resource.  For example:
1225
1226
       Content-Disposition: attachment; filename="flora.jpg"
1227
1228
   Wget will skip the tokens it doesn't care about, such as
1229
   "attachment" in the previous example; it will also skip other
1230
   unrecognized params.  If the header is syntactically correct and
1231
   contains a file name, a copy of the file name is stored in
1232
   *filename and true is returned.  Otherwise, the function returns
1233
   false.
1234
1235
   The file name is stripped of directory components and must not be
1236
   empty.
1237
1238
   Historically, this function returned filename prefixed with opt.dir_prefix,
1239
   now that logic is handled by the caller, new code should pay attention,
1240
   changed by crq, Sep 2010.
1241
1242
*/
1243
static bool
1244
parse_content_disposition (const char *hdr, char **filename)
1245
0
{
1246
0
  param_token name, value;
1247
0
  bool is_url_encoded = false;
1248
1249
0
  char *encodedFilename = NULL;
1250
0
  char *unencodedFilename = NULL;
1251
0
  for ( ; extract_param (&hdr, &name, &value, ';', &is_url_encoded);
1252
0
        is_url_encoded = false)
1253
0
    {
1254
0
      int isFilename = BOUNDED_EQUAL_NO_CASE (name.b, name.e, "filename");
1255
0
      if ( isFilename && value.b != NULL)
1256
0
        {
1257
          /* Make the file name begin at the last slash or backslash. */
1258
0
          bool isEncodedFilename;
1259
0
          char **outFilename;
1260
0
          const char *last_slash = memrchr (value.b, '/', value.e - value.b);
1261
0
          const char *last_bs = memrchr (value.b, '\\', value.e - value.b);
1262
0
          if (last_slash && last_bs)
1263
0
            value.b = 1 + MAX (last_slash, last_bs);
1264
0
          else if (last_slash || last_bs)
1265
0
            value.b = 1 + (last_slash ? last_slash : last_bs);
1266
0
          if (value.b == value.e)
1267
0
            continue;
1268
1269
          /* Check if the name is "filename*" as specified in RFC 6266.
1270
           * Since "filename" could be broken up as "filename*N" (RFC 2231),
1271
           * a check is needed to make sure this is not the case */
1272
0
          isEncodedFilename = *name.e == '*' && !c_isdigit (*(name.e + 1));
1273
0
          outFilename = isEncodedFilename ? &encodedFilename
1274
0
            : &unencodedFilename;
1275
0
          if (*outFilename)
1276
0
            append_value_to_filename (outFilename, &value, is_url_encoded);
1277
0
          else
1278
0
            {
1279
0
              *outFilename = strdupdelim (value.b, value.e);
1280
0
              if (is_url_encoded)
1281
0
                url_unescape (*outFilename);
1282
0
            }
1283
0
        }
1284
0
    }
1285
0
  if (encodedFilename)
1286
0
    {
1287
0
      xfree (unencodedFilename);
1288
0
      *filename = encodedFilename;
1289
0
    }
1290
0
  else
1291
0
    {
1292
0
      xfree (encodedFilename);
1293
0
      *filename = unencodedFilename;
1294
0
    }
1295
0
  if (*filename)
1296
0
    return true;
1297
0
  else
1298
0
    return false;
1299
0
}
1300
1301
#ifdef HAVE_HSTS
/* Parse the value of a `Strict-Transport-Security' header.

   HEADER is the header value, or NULL.  Directive names are matched
   case-insensitively.  On success the value of the "max-age"
   directive is stored in *MAX_AGE and the presence of
   "includeSubDomains" in *INCLUDE_SUBDOMAINS (either pointer may be
   NULL, in which case that result is not stored), and true is
   returned.

   False is returned, and the out parameters are left untouched, when
   HEADER is NULL, when no "max-age" directive with a value is
   present, or when that value does not start with a number.  */
static bool
parse_strict_transport_security (const char *header, int64_t *max_age, bool *include_subdomains)
{
  param_token name, value;
  char *c_max_age = NULL;
  bool is = false; /* includeSubDomains */
  bool is_url_encoded = false;
  bool success = false;

  if (header)
    {
      long long parsed_age = 0;
      bool have_age = false;

      /* Process the STS header. Keys should be matched case-insensitively. */
      for (; extract_param (&header, &name, &value, ';', &is_url_encoded); is_url_encoded = false)
        {
          if (BOUNDED_EQUAL_NO_CASE (name.b, name.e, "max-age"))
            {
              xfree (c_max_age);
              /* extract_param zeroes VALUE for a directive without a
                 value; don't hand NULL pointers to strdupdelim.  */
              if (value.b != NULL)
                c_max_age = strdupdelim (value.b, value.e);
            }
          else if (BOUNDED_EQUAL_NO_CASE (name.b, name.e, "includeSubDomains"))
            is = true;
        }

      if (c_max_age)
        {
          /* If the string value goes out of a long long's bounds,
             strtoll() will return LLONG_MIN or LLONG_MAX.  In theory,
             the HSTS engine should be able to handle it.  A value in
             which no digits could be converted is rejected rather
             than being read as 0.  */
          char *end;
          parsed_age = strtoll (c_max_age, &end, 10);
          have_age = (end != c_max_age);
        }

      /* pass the parsed values over */
      if (have_age)
        {
          if (max_age)
            *max_age = (int64_t) parsed_age;
          if (include_subdomains)
            *include_subdomains = is;

          DEBUGP (("Parsed Strict-Transport-Security max-age = %s, includeSubDomains = %s\n",
                 c_max_age, (is ? "true" : "false")));

          success = true;
        }
      else
        {
          /* something weird happened */
          logprintf (LOG_VERBOSE, "Could not parse Strict-Transport-Security header\n");
          success = false;
        }

      xfree (c_max_age);
    }

  return success;
}
#endif
1354
1355
/* Persistent connections.  Only the most recently used connection is
   cached as persistent, and only if the HTTP server agrees to make it
   such.  The state of that connection lives in the two variables
   below.  Ideally, it should be possible to cache an arbitrary fixed
   number of these connections.  */

/* True while a persistent connection is registered in PCONN.  */
static bool pconn_active;

static struct {
  /* Socket of the persistent connection.  */
  int socket;

  /* Host name and port the persistent connection was registered
     for.  */
  char *host;
  int port;

  /* Whether an SSL handshake has occurred on this connection.  */
  bool ssl;

  /* Whether the connection itself was authorized.  This is only done
     by NTLM, which authorizes *connections* rather than individual
     requests.  (That practice is peculiar for HTTP, but it is a
     useful optimization.)  */
  bool authorized;

#ifdef ENABLE_NTLM
  /* NTLM state of the current connection.  */
  struct ntlmdata ntlm;
#endif
} pconn;
1386
1387
/* Mark the persistent connection as invalid and free the resources it
1388
   uses.  This is used by the CLOSE_* macros after they forcefully
1389
   close a registered persistent connection.  */
1390
1391
static void
1392
invalidate_persistent (void)
1393
0
{
1394
0
  DEBUGP (("Disabling further reuse of socket %d.\n", pconn.socket));
1395
0
  pconn_active = false;
1396
0
  fd_close (pconn.socket);
1397
0
  xfree (pconn.host);
1398
0
  xzero (pconn);
1399
0
}
1400
1401
/* Register FD, which should be a TCP/IP connection to HOST:PORT, as
1402
   persistent.  This will enable someone to use the same connection
1403
   later.  In the context of HTTP, this must be called only AFTER the
1404
   response has been received and the server has promised that the
1405
   connection will remain alive.
1406
1407
   If a previous connection was persistent, it is closed. */
1408
1409
static void
1410
register_persistent (const char *host, int port, int fd, bool ssl)
1411
0
{
1412
0
  if (pconn_active)
1413
0
    {
1414
0
      if (pconn.socket == fd)
1415
0
        {
1416
          /* The connection FD is already registered. */
1417
0
          return;
1418
0
        }
1419
0
      else
1420
0
        {
1421
          /* The old persistent connection is still active; close it
1422
             first.  This situation arises whenever a persistent
1423
             connection exists, but we then connect to a different
1424
             host, and try to register a persistent connection to that
1425
             one.  */
1426
0
          invalidate_persistent ();
1427
0
        }
1428
0
    }
1429
1430
0
  pconn_active = true;
1431
0
  pconn.socket = fd;
1432
0
  pconn.host = xstrdup (host);
1433
0
  pconn.port = port;
1434
0
  pconn.ssl = ssl;
1435
0
  pconn.authorized = false;
1436
1437
0
  DEBUGP (("Registered socket %d for persistent reuse.\n", fd));
1438
0
}
1439
1440
/* Return true if a persistent connection is available for connecting
1441
   to HOST:PORT.  */
1442
1443
static bool
1444
persistent_available_p (const char *host, int port, bool ssl,
1445
                        bool *host_lookup_failed)
1446
0
{
1447
  /* First, check whether a persistent connection is active at all.  */
1448
0
  if (!pconn_active)
1449
0
    return false;
1450
1451
  /* If we want SSL and the last connection wasn't or vice versa,
1452
     don't use it.  Checking for host and port is not enough because
1453
     HTTP and HTTPS can apparently coexist on the same port.  */
1454
0
  if (ssl != pconn.ssl)
1455
0
    return false;
1456
1457
  /* If we're not connecting to the same port, we're not interested. */
1458
0
  if (port != pconn.port)
1459
0
    return false;
1460
1461
  /* If the host is the same, we're in business.  If not, there is
1462
     still hope -- read below.  */
1463
0
  if (0 != strcasecmp (host, pconn.host))
1464
0
    {
1465
      /* Check if pconn.socket is talking to HOST under another name.
1466
         This happens often when both sites are virtual hosts
1467
         distinguished only by name and served by the same network
1468
         interface, and hence the same web server (possibly set up by
1469
         the ISP and serving many different web sites).  This
1470
         admittedly unconventional optimization does not contradict
1471
         HTTP and works well with popular server software.  */
1472
1473
0
      bool found;
1474
0
      ip_address ip;
1475
0
      struct address_list *al;
1476
1477
0
      if (ssl)
1478
        /* Don't try to talk to two different SSL sites over the same
1479
           secure connection!  (Besides, it's not clear that
1480
           name-based virtual hosting is even possible with SSL.)  */
1481
0
        return false;
1482
1483
      /* If pconn.socket's peer is one of the IP addresses HOST
1484
         resolves to, pconn.socket is for all intents and purposes
1485
         already talking to HOST.  */
1486
1487
0
      if (!socket_ip_address (pconn.socket, &ip, ENDPOINT_PEER))
1488
0
        {
1489
          /* Can't get the peer's address -- something must be very
1490
             wrong with the connection.  */
1491
0
          invalidate_persistent ();
1492
0
          return false;
1493
0
        }
1494
0
      al = lookup_host (host, 0);
1495
0
      if (!al)
1496
0
        {
1497
0
          *host_lookup_failed = true;
1498
0
          return false;
1499
0
        }
1500
1501
0
      found = address_list_contains (al, &ip);
1502
0
      address_list_release (al);
1503
1504
0
      if (!found)
1505
0
        return false;
1506
1507
      /* The persistent connection's peer address was found among the
1508
         addresses HOST resolved to; therefore, pconn.sock is in fact
1509
         already talking to HOST -- no need to reconnect.  */
1510
0
    }
1511
1512
  /* Finally, check whether the connection is still open.  This is
1513
     important because most servers implement liberal (short) timeout
1514
     on persistent connections.  Wget can of course always reconnect
1515
     if the connection doesn't work out, but it's nicer to know in
1516
     advance.  This test is a logical followup of the first test, but
1517
     is "expensive" and therefore placed at the end of the list.
1518
1519
     (Current implementation of test_socket_open has a nice side
1520
     effect that it treats sockets with pending data as "closed".
1521
     This is exactly what we want: if a broken server sends message
1522
     body in response to HEAD, or if it sends more than conent-length
1523
     data, we won't reuse the corrupted connection.)  */
1524
1525
0
  if (!test_socket_open (pconn.socket))
1526
0
    {
1527
      /* Oops, the socket is no longer open.  Now that we know that,
1528
         let's invalidate the persistent connection before returning
1529
         0.  */
1530
0
      invalidate_persistent ();
1531
0
      return false;
1532
0
    }
1533
1534
0
  return true;
1535
0
}
1536
1537
/* The idea behind these two CLOSE macros is to distinguish between
   two cases: one when the job we've been doing is finished, and we
   want to close the connection and leave, and two when something is
   seriously wrong and we're closing the connection as part of
   cleanup.

   In case of keep_alive, CLOSE_FINISH should leave the connection
   open, while CLOSE_INVALIDATE should still close it.

   Note that the semantics of the flag `keep_alive' is "this
   connection *will* be reused (the server has promised not to close
   the connection once we're done)", while the semantics of
   `pconn_active && (fd) == pconn.socket' is "we're *now* using an
   active, registered connection".

   Both macros assign -1 to FD after closing it, so FD must be an
   lvalue.  CLOSE_FINISH reads a variable named `keep_alive' from the
   scope it is expanded in.  */

/* Close FD unconditionally.  If FD is the registered persistent
   connection, the registration is dropped as well.  */
#define CLOSE_INVALIDATE(fd) do {               \
  if (pconn_active && (fd) == pconn.socket)     \
    invalidate_persistent ();                   \
  else                                          \
    fd_close (fd);                              \
  fd = -1;                                      \
} while (0)

/* Close FD, unless keep_alive says it is to be reused.  */
#define CLOSE_FINISH(fd) do {                   \
  if (!keep_alive)                              \
    CLOSE_INVALIDATE (fd);                      \
} while (0)
1570
1571
/* Content encodings known to the HTTP code.  */
typedef enum
{
  ENC_INVALID  = -1,            /* invalid encoding */
  ENC_NONE     = 0,             /* no special encoding */
  ENC_GZIP     = 1,             /* gzip compression */
  ENC_DEFLATE  = 2,             /* deflate compression */
  ENC_COMPRESS = 3,             /* compress compression */
  ENC_BROTLI   = 4              /* brotli compression */
} encoding_t;
1580
1581
struct http_stat
1582
{
1583
  wgint len;                    /* received length */
1584
  wgint contlen;                /* expected length */
1585
  wgint restval;                /* the restart value */
1586
  int res;                      /* the result of last read */
1587
  char *rderrmsg;               /* error message from read error */
1588
  char *newloc;                 /* new location (redirection) */
1589
  char *remote_time;            /* remote time-stamp string */
1590
  char *error;                  /* textual HTTP error */
1591
  int statcode;                 /* status code */
1592
  char *message;                /* status message */
1593
  wgint rd_size;                /* amount of data read from socket */
1594
  double dltime;                /* time it took to download the data */
1595
  const char *referer;          /* value of the referer header. */
1596
  char *local_file;             /* local file name. */
1597
  bool existence_checked;       /* true if we already checked for a file's
1598
                                   existence after having begun to download
1599
                                   (needed in gethttp for when connection is
1600
                                   interrupted/restarted. */
1601
  bool timestamp_checked;       /* true if pre-download time-stamping checks
1602
                                 * have already been performed */
1603
  char *orig_file_name;         /* name of file to compare for time-stamping
1604
                                 * (might be != local_file if -K is set) */
1605
  wgint orig_file_size;         /* size of file to compare for time-stamping */
1606
  time_t orig_file_tstamp;      /* time-stamp of file to compare for
1607
                                 * time-stamping */
1608
#ifdef HAVE_METALINK
1609
  metalink_t *metalink;
1610
#endif
1611
1612
  encoding_t local_encoding;    /* the encoding of the local file */
1613
  encoding_t remote_encoding;   /* the encoding of the remote file */
1614
1615
  bool temporary;               /* downloading a temporary file */
1616
};
1617
1618
static void
1619
free_hstat (struct http_stat *hs)
1620
0
{
1621
0
  xfree (hs->newloc);
1622
0
  xfree (hs->remote_time);
1623
0
  xfree (hs->error);
1624
0
  xfree (hs->rderrmsg);
1625
0
  xfree (hs->local_file);
1626
0
  xfree (hs->orig_file_name);
1627
0
  xfree (hs->message);
1628
#ifdef HAVE_METALINK
1629
  metalink_delete (hs->metalink);
1630
  hs->metalink = NULL;
1631
#endif
1632
0
}
1633
1634
static void
1635
get_file_flags (const char *filename, int *dt)
1636
0
{
1637
0
  logprintf (LOG_VERBOSE, _("\
1638
0
File %s already there; not retrieving.\n\n"), quote (filename));
1639
  /* If the file is there, we suppose it's retrieved OK.  */
1640
0
  *dt |= RETROKF;
1641
1642
  /* #### Bogusness alert.  */
1643
  /* If its suffix is "html" or "htm" or similar, assume text/html.  */
1644
0
  if (has_html_suffix_p (filename))
1645
0
    *dt |= TEXTHTML;
1646
0
}
1647
1648
/* Download the response body from the socket and writes it to
1649
   an output file.  The headers have already been read from the
1650
   socket.  If WARC is enabled, the response body will also be
1651
   written to a WARC response record.
1652
1653
   hs, contlen, contrange, chunked_transfer_encoding and url are
1654
   parameters from the gethttp method.  fp is a pointer to the
1655
   output file.
1656
1657
   url, warc_timestamp_str, warc_request_uuid, warc_ip, type
1658
   and statcode will be saved in the headers of the WARC record.
1659
   The head parameter contains the HTTP headers of the response.
1660
1661
   If fp is NULL and WARC is enabled, the response body will be
1662
   written only to the WARC file.  If WARC is disabled and fp
1663
   is a file pointer, the data will be written to the file.
1664
   If fp is a file pointer and WARC is enabled, the body will
1665
   be written to both destinations.
1666
1667
   Returns the error code.   */
1668
static int
1669
read_response_body (struct http_stat *hs, int sock, FILE *fp, wgint contlen,
1670
                    wgint contrange, bool chunked_transfer_encoding,
1671
                    char *url, char *warc_timestamp_str, char *warc_request_uuid,
1672
                    ip_address *warc_ip, char *type, int statcode, char *head)
1673
0
{
1674
0
  int warc_payload_offset = 0;
1675
0
  FILE *warc_tmp = NULL;
1676
0
  int warcerr = 0;
1677
0
  int flags = 0;
1678
1679
0
  if (opt.warc_filename != NULL)
1680
0
    {
1681
      /* Open a temporary file where we can write the response before we
1682
         add it to the WARC record.  */
1683
0
      warc_tmp = warc_tempfile ();
1684
0
      if (warc_tmp == NULL)
1685
0
        warcerr = WARC_TMP_FOPENERR;
1686
1687
0
      if (warcerr == 0)
1688
0
        {
1689
          /* We should keep the response headers for the WARC record.  */
1690
0
          int head_len = strlen (head);
1691
0
          int warc_tmp_written = fwrite (head, 1, head_len, warc_tmp);
1692
0
          if (warc_tmp_written != head_len)
1693
0
            warcerr = WARC_TMP_FWRITEERR;
1694
0
          warc_payload_offset = head_len;
1695
0
        }
1696
1697
0
      if (warcerr != 0)
1698
0
        {
1699
0
          if (warc_tmp != NULL)
1700
0
            fclose (warc_tmp);
1701
0
          return warcerr;
1702
0
        }
1703
0
    }
1704
1705
0
  if (fp != NULL)
1706
0
    {
1707
      /* This confuses the timestamping code that checks for file size.
1708
         #### The timestamping code should be smarter about file size.  */
1709
0
      if (opt.save_headers && hs->restval == 0)
1710
0
        fwrite (head, 1, strlen (head), fp);
1711
0
    }
1712
1713
  /* Read the response body.  */
1714
0
  if (contlen != -1)
1715
    /* If content-length is present, read that much; otherwise, read
1716
       until EOF.  The HTTP spec doesn't require the server to
1717
       actually close the connection when it's done sending data. */
1718
0
    flags |= rb_read_exactly;
1719
0
  if (fp != NULL && hs->restval > 0 && contrange == 0)
1720
    /* If the server ignored our range request, instruct fd_read_body
1721
       to skip the first RESTVAL bytes of body.  */
1722
0
    flags |= rb_skip_startpos;
1723
0
  if (chunked_transfer_encoding)
1724
0
    flags |= rb_chunked_transfer_encoding;
1725
1726
0
  if (hs->remote_encoding == ENC_GZIP)
1727
0
    flags |= rb_compressed_gzip;
1728
1729
0
  hs->len = hs->restval;
1730
0
  hs->rd_size = 0;
1731
  /* Download the response body and write it to fp.
1732
     If we are working on a WARC file, we simultaneously write the
1733
     response body to warc_tmp.  */
1734
0
  hs->res = fd_read_body (hs->local_file, sock, fp, contlen != -1 ? contlen : 0,
1735
0
                          hs->restval, &hs->rd_size, &hs->len, &hs->dltime,
1736
0
                          flags, warc_tmp);
1737
0
  if (hs->res >= 0)
1738
0
    {
1739
0
      if (warc_tmp != NULL)
1740
0
        {
1741
          /* Create a response record and write it to the WARC file.
1742
             Note: per the WARC standard, the request and response should share
1743
             the same date header.  We re-use the timestamp of the request.
1744
             The response record should also refer to the uuid of the request.  */
1745
0
          bool r = warc_write_response_record (url, warc_timestamp_str,
1746
0
                                               warc_request_uuid, warc_ip,
1747
0
                                               warc_tmp, warc_payload_offset,
1748
0
                                               type, statcode, hs->newloc);
1749
1750
          /* warc_write_response_record has closed warc_tmp. */
1751
1752
0
          if (! r)
1753
0
            return WARC_ERR;
1754
0
        }
1755
1756
0
      return RETRFINISHED;
1757
0
    }
1758
1759
0
  if (warc_tmp != NULL)
1760
0
    fclose (warc_tmp);
1761
1762
0
  if (hs->res == -2)
1763
0
    {
1764
      /* Error while writing to fd. */
1765
0
      return FWRITEERR;
1766
0
    }
1767
0
  else if (hs->res == -3)
1768
0
    {
1769
      /* Error while writing to warc_tmp. */
1770
0
      return WARC_TMP_FWRITEERR;
1771
0
    }
1772
0
  else
1773
0
    {
1774
      /* A read error! */
1775
0
      xfree (hs->rderrmsg);
1776
0
      hs->rderrmsg = xstrdup (fd_errstr (sock));
1777
0
      return RETRFINISHED;
1778
0
    }
1779
0
}
1780
1781
/* Evaluate to non-zero if LINE begins with STRING_CONSTANT (compared
   with c_strncasecmp) and the match is immediately followed by a
   whitespace character or by the terminating NUL.

   STRING_CONSTANT must be a string literal (or char array), because
   its length is taken with sizeof.  LINE is evaluated more than once,
   so it must be free of side effects; it is parenthesized so that an
   arbitrary pointer expression may be passed.  */
#define BEGINS_WITH(line, string_constant)                               \
  (!c_strncasecmp ((line), string_constant, sizeof (string_constant) - 1) \
   && (c_isspace ((line)[sizeof (string_constant) - 1])                  \
       || !(line)[sizeof (string_constant) - 1]))
1785
1786
0
/* Set the User-Agent header of REQ.

   - opt.useragent unset:      send "Wget/<version_string>".
   - opt.useragent non-empty:  send it verbatim.
   - opt.useragent empty:      send no User-Agent header at all.  */
#define SET_USER_AGENT(req) do {                                         \
  if (opt.useragent)                                                     \
    {                                                                    \
      if (*opt.useragent)                                                \
        request_set_header (req, "User-Agent", opt.useragent, rel_none); \
    }                                                                    \
  else                                                                   \
    request_set_header (req, "User-Agent",                               \
                        aprintf ("Wget/%s", version_string),             \
                        rel_value);                                      \
} while (0)
1795
1796
/*
1797
   Convert time_t to one of valid HTTP date formats
1798
   ie. rfc1123-date.
1799
1800
   HTTP-date    = rfc1123-date | rfc850-date | asctime-date
1801
   rfc1123-date = wkday "," SP date1 SP time SP "GMT"
1802
   rfc850-date  = weekday "," SP date2 SP time SP "GMT"
1803
   asctime-date = wkday SP date3 SP time SP 4DIGIT
1804
   date1        = 2DIGIT SP month SP 4DIGIT
1805
                  ; day month year (e.g., 02 Jun 1982)
1806
   date2        = 2DIGIT "-" month "-" 2DIGIT
1807
                  ; day-month-year (e.g., 02-Jun-82)
1808
   date3        = month SP ( 2DIGIT | ( SP 1DIGIT ))
1809
                  ; month day (e.g., Jun  2)
1810
   time         = 2DIGIT ":" 2DIGIT ":" 2DIGIT
1811
                  ; 00:00:00 - 23:59:59
1812
   wkday        = "Mon" | "Tue" | "Wed"
1813
                | "Thu" | "Fri" | "Sat" | "Sun"
1814
   weekday      = "Monday" | "Tuesday" | "Wednesday"
1815
                | "Thursday" | "Friday" | "Saturday" | "Sunday"
1816
   month        = "Jan" | "Feb" | "Mar" | "Apr"
1817
                | "May" | "Jun" | "Jul" | "Aug"
1818
                | "Sep" | "Oct" | "Nov" | "Dec"
1819
1820
   source: RFC2616  */
1821
static uerr_t
1822
time_to_rfc1123 (time_t time, char *buf, size_t bufsize)
1823
0
{
1824
0
  static const char *wkday[] = { "Sun", "Mon", "Tue", "Wed",
1825
0
                                 "Thu", "Fri", "Sat" };
1826
0
  static const char *month[] = { "Jan", "Feb", "Mar", "Apr",
1827
0
                                 "May", "Jun", "Jul", "Aug",
1828
0
                                 "Sep", "Oct", "Nov", "Dec" };
1829
1830
0
  struct tm *gtm = gmtime (&time);
1831
0
  if (!gtm)
1832
0
    {
1833
0
      logprintf (LOG_NOTQUIET,
1834
0
                 _("gmtime failed. This is probably a bug.\n"));
1835
0
      return TIMECONV_ERR;
1836
0
    }
1837
1838
  /* rfc1123 example: Thu, 01 Jan 1998 22:12:57 GMT  */
1839
0
  snprintf (buf, bufsize, "%s, %02d %s %04d %02d:%02d:%02d GMT",
1840
0
            wkday[gtm->tm_wday],
1841
0
            gtm->tm_mday, month[gtm->tm_mon],
1842
0
            gtm->tm_year + 1900, gtm->tm_hour,
1843
0
            gtm->tm_min, gtm->tm_sec);
1844
1845
0
  return RETROK;
1846
0
}
1847
1848
static struct request *
1849
initialize_request (const struct url *u, struct http_stat *hs, int *dt, struct url *proxy,
1850
                    bool inhibit_keep_alive, bool *basic_auth_finished,
1851
                    wgint *body_data_size, char **user, char **passwd, uerr_t *ret)
1852
0
{
1853
0
  bool head_only = !!(*dt & HEAD_ONLY);
1854
0
  struct request *req;
1855
1856
  /* Prepare the request to send. */
1857
0
  {
1858
0
    char *meth_arg;
1859
0
    const char *meth = "GET";
1860
0
    if (head_only)
1861
0
      meth = "HEAD";
1862
0
    else if (opt.method)
1863
0
      meth = opt.method;
1864
    /* Use the full path, i.e. one that includes the leading slash and
1865
       the query string.  E.g. if u->path is "foo/bar" and u->query is
1866
       "param=value", full_path will be "/foo/bar?param=value".  */
1867
0
    if (proxy
1868
0
#ifdef HAVE_SSL
1869
        /* When using SSL over proxy, CONNECT establishes a direct
1870
           connection to the HTTPS server.  Therefore use the same
1871
           argument as when talking to the server directly. */
1872
0
        && u->scheme != SCHEME_HTTPS
1873
0
#endif
1874
0
        )
1875
0
      meth_arg = xstrdup (u->url);
1876
0
    else
1877
0
      meth_arg = url_full_path (u);
1878
0
    req = request_new (meth, meth_arg);
1879
0
  }
1880
1881
  /* Generate the Host header, HOST:PORT.  Take into account that:
1882
1883
     - Broken server-side software often doesn't recognize the PORT
1884
       argument, so we must generate "Host: www.server.com" instead of
1885
       "Host: www.server.com:80" (and likewise for https port).
1886
1887
     - IPv6 addresses contain ":", so "Host: 3ffe:8100:200:2::2:1234"
1888
       becomes ambiguous and needs to be rewritten as "Host:
1889
       [3ffe:8100:200:2::2]:1234".  */
1890
0
  {
1891
    /* Formats arranged for hfmt[add_port][add_squares].  */
1892
0
    static const char *hfmt[][2] = {
1893
0
      { "%s", "[%s]" }, { "%s:%d", "[%s]:%d" }
1894
0
    };
1895
0
    int add_port = u->port != scheme_default_port (u->scheme);
1896
0
    int add_squares = strchr (u->host, ':') != NULL;
1897
0
    request_set_header (req, "Host",
1898
0
                        aprintf (hfmt[add_port][add_squares], u->host, u->port),
1899
0
                        rel_value);
1900
0
  }
1901
1902
0
  request_set_header (req, "Referer", hs->referer, rel_none);
1903
0
  if (*dt & SEND_NOCACHE)
1904
0
    {
1905
      /* Cache-Control MUST be obeyed by all HTTP/1.1 caching mechanisms...  */
1906
0
      request_set_header (req, "Cache-Control", "no-cache", rel_none);
1907
1908
      /* ... but some HTTP/1.0 caches doesn't implement Cache-Control.  */
1909
0
      request_set_header (req, "Pragma", "no-cache", rel_none);
1910
0
    }
1911
0
  if (*dt & IF_MODIFIED_SINCE)
1912
0
    {
1913
0
      char strtime[32];
1914
0
      uerr_t err = time_to_rfc1123 (hs->orig_file_tstamp, strtime, countof (strtime));
1915
1916
0
      if (err != RETROK)
1917
0
        {
1918
0
          logputs (LOG_VERBOSE, _("Cannot convert timestamp to http format. "
1919
0
                                  "Falling back to time 0 as last modification "
1920
0
                                  "time.\n"));
1921
0
          strcpy (strtime, "Thu, 01 Jan 1970 00:00:00 GMT");
1922
0
        }
1923
0
      request_set_header (req, "If-Modified-Since", xstrdup (strtime), rel_value);
1924
0
    }
1925
0
  if (hs->restval)
1926
0
    request_set_header (req, "Range",
1927
0
                        aprintf ("bytes=%s-",
1928
0
                                 number_to_static_string (hs->restval)),
1929
0
                        rel_value);
1930
0
  SET_USER_AGENT (req);
1931
0
  request_set_header (req, "Accept", "*/*", rel_none);
1932
0
#ifdef HAVE_LIBZ
1933
0
  if (opt.compression != compression_none)
1934
0
    request_set_header (req, "Accept-Encoding", "gzip", rel_none);
1935
0
  else
1936
0
#endif
1937
0
    request_set_header (req, "Accept-Encoding", "identity", rel_none);
1938
1939
  /* Find the username with priority */
1940
0
  if (u->user)
1941
0
    *user = u->user;
1942
0
  else if (opt.user && (opt.use_askpass || opt.ask_passwd))
1943
0
    *user = opt.user;
1944
0
  else if (opt.http_user)
1945
0
    *user = opt.http_user;
1946
0
  else if (opt.user)
1947
0
    *user = opt.user;
1948
0
  else
1949
0
    *user = NULL;
1950
1951
  /* Find the password with priority */
1952
0
  if (u->passwd)
1953
0
    *passwd = u->passwd;
1954
0
  else if (opt.passwd && (opt.use_askpass || opt.ask_passwd))
1955
0
    *passwd = opt.passwd;
1956
0
  else if (opt.http_passwd)
1957
0
    *passwd = opt.http_passwd;
1958
0
  else if (opt.passwd)
1959
0
    *passwd = opt.passwd;
1960
0
  else
1961
0
    *passwd = NULL;
1962
1963
  /* Check for ~/.netrc if none of the above match */
1964
0
  if (opt.netrc && (!*user || !*passwd))
1965
0
    search_netrc (u->host, (const char **) user, (const char **) passwd, 0, NULL);
1966
1967
  /* We only do "site-wide" authentication with "global" user/password
1968
   * values unless --auth-no-challenge has been requested; URL user/password
1969
   * info overrides. */
1970
0
  if (*user && *passwd && (!u->user || opt.auth_without_challenge))
1971
0
    {
1972
      /* If this is a host for which we've already received a Basic
1973
       * challenge, we'll go ahead and send Basic authentication creds. */
1974
0
      *basic_auth_finished = maybe_send_basic_creds (u->host, *user, *passwd, req);
1975
0
    }
1976
1977
0
  if (inhibit_keep_alive)
1978
0
    request_set_header (req, "Connection", "Close", rel_none);
1979
0
  else
1980
0
    {
1981
0
      request_set_header (req, "Connection", "Keep-Alive", rel_none);
1982
0
      if (proxy)
1983
0
        request_set_header (req, "Proxy-Connection", "Keep-Alive", rel_none);
1984
0
    }
1985
1986
0
  if (opt.method)
1987
0
    {
1988
1989
0
      if (opt.body_data || opt.body_file)
1990
0
        {
1991
0
          request_set_header (req, "Content-Type",
1992
0
                              "application/x-www-form-urlencoded", rel_none);
1993
1994
0
          if (opt.body_data)
1995
0
            *body_data_size = strlen (opt.body_data);
1996
0
          else
1997
0
            {
1998
0
              *body_data_size = file_size (opt.body_file);
1999
0
              if (*body_data_size == -1)
2000
0
                {
2001
0
                  logprintf (LOG_NOTQUIET, _("BODY data file %s missing: %s\n"),
2002
0
                             quote (opt.body_file), strerror (errno));
2003
0
                  request_free (&req);
2004
0
                  *ret = FILEBADFILE;
2005
0
                  return NULL;
2006
0
                }
2007
0
            }
2008
0
          request_set_header (req, "Content-Length",
2009
0
                              xstrdup (number_to_static_string (*body_data_size)),
2010
0
                              rel_value);
2011
0
        }
2012
0
      else if (c_strcasecmp (opt.method, "post") == 0
2013
0
               || c_strcasecmp (opt.method, "put") == 0
2014
0
               || c_strcasecmp (opt.method, "patch") == 0)
2015
0
        request_set_header (req, "Content-Length", "0", rel_none);
2016
0
    }
2017
0
  return req;
2018
0
}
2019
2020
static void
2021
initialize_proxy_configuration (const struct url *u, struct request *req,
2022
                                struct url *proxy, char **proxyauth)
2023
0
{
2024
0
  char *proxy_user, *proxy_passwd;
2025
  /* For normal username and password, URL components override
2026
     command-line/wgetrc parameters.  With proxy
2027
     authentication, it's the reverse, because proxy URLs are
2028
     normally the "permanent" ones, so command-line args
2029
     should take precedence.  */
2030
0
  if (opt.proxy_user && opt.proxy_passwd)
2031
0
    {
2032
0
      proxy_user = opt.proxy_user;
2033
0
      proxy_passwd = opt.proxy_passwd;
2034
0
    }
2035
0
  else
2036
0
    {
2037
0
      proxy_user = proxy->user;
2038
0
      proxy_passwd = proxy->passwd;
2039
0
    }
2040
  /* #### This does not appear right.  Can't the proxy request,
2041
     say, `Digest' authentication?  */
2042
0
  if (proxy_user && proxy_passwd)
2043
0
    *proxyauth = basic_authentication_encode (proxy_user, proxy_passwd);
2044
2045
  /* Proxy authorization over SSL is handled below. */
2046
0
#ifdef HAVE_SSL
2047
0
  if (u->scheme != SCHEME_HTTPS)
2048
0
#endif
2049
0
    request_set_header (req, "Proxy-Authorization", *proxyauth, rel_value);
2050
0
}
2051
2052
static uerr_t
2053
establish_connection (const struct url *u, const struct url **conn_ref,
2054
                      struct http_stat *hs, struct url *proxy,
2055
                      char **proxyauth,
2056
                      struct request **req_ref, bool *using_ssl,
2057
                      bool inhibit_keep_alive,
2058
                      int *sock_ref)
2059
0
{
2060
0
  bool host_lookup_failed = false;
2061
0
  int sock = *sock_ref;
2062
0
  struct request *req = *req_ref;
2063
0
  const struct url *conn = *conn_ref;
2064
0
  struct response *resp;
2065
0
  int write_error;
2066
0
  int statcode;
2067
2068
0
  if (! inhibit_keep_alive)
2069
0
    {
2070
      /* Look for a persistent connection to target host, unless a
2071
         proxy is used.  The exception is when SSL is in use, in which
2072
         case the proxy is nothing but a passthrough to the target
2073
         host, registered as a connection to the latter.  */
2074
0
      const struct url *relevant = conn;
2075
0
#ifdef HAVE_SSL
2076
0
      if (u->scheme == SCHEME_HTTPS)
2077
0
        relevant = u;
2078
0
#endif
2079
2080
0
      if (persistent_available_p (relevant->host, relevant->port,
2081
0
#ifdef HAVE_SSL
2082
0
                                  relevant->scheme == SCHEME_HTTPS,
2083
#else
2084
                                  0,
2085
#endif
2086
0
                                  &host_lookup_failed))
2087
0
        {
2088
0
          int family = socket_family (pconn.socket, ENDPOINT_PEER);
2089
0
          sock = pconn.socket;
2090
0
          *using_ssl = pconn.ssl;
2091
0
#if ENABLE_IPV6
2092
0
          if (family == AF_INET6)
2093
0
             logprintf (LOG_VERBOSE, _("Reusing existing connection to [%s]:%d.\n"),
2094
0
                        quotearg_style (escape_quoting_style, pconn.host),
2095
0
                         pconn.port);
2096
0
          else
2097
0
#endif
2098
0
             logprintf (LOG_VERBOSE, _("Reusing existing connection to %s:%d.\n"),
2099
0
                        quotearg_style (escape_quoting_style, pconn.host),
2100
0
                        pconn.port);
2101
0
          DEBUGP (("Reusing fd %d.\n", sock));
2102
0
          if (pconn.authorized)
2103
            /* If the connection is already authorized, the "Basic"
2104
               authorization added by code above is unnecessary and
2105
               only hurts us.  */
2106
0
            request_remove_header (req, "Authorization");
2107
0
        }
2108
0
      else if (host_lookup_failed)
2109
0
        {
2110
0
          logprintf(LOG_NOTQUIET,
2111
0
                    _("%s: unable to resolve host address %s\n"),
2112
0
                    exec_name, quote (relevant->host));
2113
0
          return HOSTERR;
2114
0
        }
2115
0
      else if (sock != -1)
2116
0
        {
2117
0
          sock = -1;
2118
0
        }
2119
0
    }
2120
2121
0
  if (sock < 0)
2122
0
    {
2123
0
      sock = connect_to_host (conn->host, conn->port);
2124
0
      if (sock == E_HOST)
2125
0
        return HOSTERR;
2126
0
      else if (sock < 0)
2127
0
        return (retryable_socket_connect_error (errno)
2128
0
                ? CONERROR : CONIMPOSSIBLE);
2129
2130
0
#ifdef HAVE_SSL
2131
0
      if (proxy && u->scheme == SCHEME_HTTPS)
2132
0
        {
2133
0
          char *head;
2134
0
          char *message;
2135
          /* When requesting SSL URLs through proxies, use the
2136
             CONNECT method to request passthrough.  */
2137
0
          struct request *connreq = request_new ("CONNECT",
2138
0
                              aprintf ("%s:%d", u->host, u->port));
2139
0
          SET_USER_AGENT (connreq);
2140
0
          if (proxyauth)
2141
0
            {
2142
0
              request_set_header (connreq, "Proxy-Authorization",
2143
0
                                  *proxyauth, rel_value);
2144
              /* Now that PROXYAUTH is part of the CONNECT request,
2145
                 zero it out so we don't send proxy authorization with
2146
                 the regular request below.  */
2147
0
              *proxyauth = NULL;
2148
0
            }
2149
0
          request_set_header (connreq, "Host",
2150
0
                              aprintf ("%s:%d", u->host, u->port),
2151
0
                              rel_value);
2152
2153
0
          write_error = request_send (connreq, sock, 0);
2154
0
          request_free (&connreq);
2155
0
          if (write_error < 0)
2156
0
            {
2157
0
              CLOSE_INVALIDATE (sock);
2158
0
              return WRITEFAILED;
2159
0
            }
2160
2161
0
          head = read_http_response_head (sock);
2162
0
          if (!head)
2163
0
            {
2164
0
              logprintf (LOG_VERBOSE, _("Failed reading proxy response: %s\n"),
2165
0
                         fd_errstr (sock));
2166
0
              CLOSE_INVALIDATE (sock);
2167
0
              return HERR;
2168
0
            }
2169
0
          message = NULL;
2170
0
          if (!*head)
2171
0
            {
2172
0
              xfree (head);
2173
0
              goto failed_tunnel;
2174
0
            }
2175
0
          DEBUGP (("proxy responded with: [%s]\n", head));
2176
2177
0
          resp = resp_new (head);
2178
0
          statcode = resp_status (resp, &message);
2179
0
          if (statcode < 0)
2180
0
            {
2181
0
              char *tms = datetime_str (time (NULL));
2182
0
              logprintf (LOG_VERBOSE, "%d\n", statcode);
2183
0
              logprintf (LOG_NOTQUIET, _("%s ERROR %d: %s.\n"), tms, statcode,
2184
0
                         quotearg_style (escape_quoting_style,
2185
0
                                         _("Malformed status line")));
2186
0
              xfree (head);
2187
0
              return HERR;
2188
0
            }
2189
0
          xfree (hs->message);
2190
0
          hs->message = xstrdup (message);
2191
0
          resp_free (&resp);
2192
0
          xfree (head);
2193
0
          if (statcode != 200)
2194
0
            {
2195
0
            failed_tunnel:
2196
0
              logprintf (LOG_NOTQUIET, _("Proxy tunneling failed: %s"),
2197
0
                         message ? quotearg_style (escape_quoting_style, message) : "?");
2198
0
              xfree (message);
2199
0
              return CONSSLERR;
2200
0
            }
2201
0
          xfree (message);
2202
2203
          /* SOCK is now *really* connected to u->host, so update CONN
2204
             to reflect this.  That way register_persistent will
2205
             register SOCK as being connected to u->host:u->port.  */
2206
0
          conn = u;
2207
0
        }
2208
2209
0
      if (conn->scheme == SCHEME_HTTPS)
2210
0
        {
2211
0
          if (!ssl_connect_wget (sock, u->host, NULL))
2212
0
            {
2213
0
              CLOSE_INVALIDATE (sock);
2214
0
              return CONSSLERR;
2215
0
            }
2216
0
          else if (!ssl_check_certificate (sock, u->host))
2217
0
            {
2218
0
              CLOSE_INVALIDATE (sock);
2219
0
              return VERIFCERTERR;
2220
0
            }
2221
0
          *using_ssl = true;
2222
0
        }
2223
0
#endif /* HAVE_SSL */
2224
0
    }
2225
0
  *conn_ref = conn;
2226
0
  *req_ref = req;
2227
0
  *sock_ref = sock;
2228
0
  return RETROK;
2229
0
}
2230
2231
static uerr_t
2232
set_file_timestamp (struct http_stat *hs)
2233
0
{
2234
0
  bool local_dot_orig_file_exists = false;
2235
0
  char *local_filename = NULL;
2236
0
  struct stat st;
2237
0
  char buf[1024];
2238
2239
0
  if (opt.backup_converted)
2240
    /* If -K is specified, we'll act on the assumption that it was specified
2241
        last time these files were downloaded as well, and instead of just
2242
        comparing local file X against server file X, we'll compare local
2243
        file X.orig (if extant, else X) against server file X.  If -K
2244
        _wasn't_ specified last time, or the server contains files called
2245
        *.orig, -N will be back to not operating correctly with -k. */
2246
0
    {
2247
0
      size_t filename_len = strlen (hs->local_file);
2248
0
      char *filename_plus_orig_suffix;
2249
2250
0
      if (filename_len + sizeof (ORIG_SFX) > sizeof (buf))
2251
0
        filename_plus_orig_suffix = xmalloc (filename_len + sizeof (ORIG_SFX));
2252
0
      else
2253
0
        filename_plus_orig_suffix = buf;
2254
2255
      /* Would a single s[n]printf() call be faster?  --dan
2256
2257
          Definitely not.  sprintf() is horribly slow.  It's a
2258
          different question whether the difference between the two
2259
          affects a program.  Usually I'd say "no", but at one
2260
          point I profiled Wget, and found that a measurable and
2261
          non-negligible amount of time was lost calling sprintf()
2262
          in url.c.  Replacing sprintf with inline calls to
2263
          strcpy() and number_to_string() made a difference.
2264
          --hniksic */
2265
0
      memcpy (filename_plus_orig_suffix, hs->local_file, filename_len);
2266
0
      memcpy (filename_plus_orig_suffix + filename_len,
2267
0
              ORIG_SFX, sizeof (ORIG_SFX));
2268
2269
      /* Try to stat() the .orig file. */
2270
0
      if (stat (filename_plus_orig_suffix, &st) == 0)
2271
0
        {
2272
0
          local_dot_orig_file_exists = true;
2273
0
          local_filename = filename_plus_orig_suffix;
2274
0
        }
2275
0
    }
2276
2277
0
  if (!local_dot_orig_file_exists)
2278
    /* Couldn't stat() <file>.orig, so try to stat() <file>. */
2279
0
    if (stat (hs->local_file, &st) == 0)
2280
0
      {
2281
0
        if (local_filename != buf)
2282
0
          xfree (local_filename);
2283
0
        local_filename = hs->local_file;
2284
0
      }
2285
2286
0
  if (local_filename != NULL)
2287
    /* There was a local file, so we'll check later to see if the version
2288
        the server has is the same version we already have, allowing us to
2289
        skip a download. */
2290
0
    {
2291
0
      if (local_filename == buf || local_filename == hs->local_file)
2292
0
        hs->orig_file_name = xstrdup (local_filename); // on stack or a copy, make a heap copy
2293
0
      else
2294
0
        hs->orig_file_name = local_filename; // was previously malloc'ed
2295
0
      hs->orig_file_size = st.st_size;
2296
0
      hs->orig_file_tstamp = st.st_mtime;
2297
#ifdef WINDOWS
2298
      /* Modification time granularity is 2 seconds for Windows, so
2299
          increase local time by 1 second for later comparison. */
2300
      ++hs->orig_file_tstamp;
2301
#endif
2302
0
      hs->timestamp_checked = true;
2303
0
    }
2304
2305
0
  return RETROK;
2306
0
}
2307
2308
static uerr_t
2309
check_file_output (const struct url *u, struct http_stat *hs,
2310
                   struct response *resp, char *hdrval, size_t hdrsize)
2311
0
{
2312
  /* Determine the local filename if needed. Notice that if -O is used
2313
   * hstat.local_file is set by http_loop to the argument of -O. */
2314
0
  if (!hs->local_file)
2315
0
    {
2316
0
      char *local_file = NULL;
2317
2318
      /* Honor Content-Disposition whether possible. */
2319
0
      if (!opt.content_disposition
2320
0
          || !resp_header_copy (resp, "Content-Disposition",
2321
0
                                hdrval, hdrsize)
2322
0
          || !parse_content_disposition (hdrval, &local_file))
2323
0
        {
2324
          /* The Content-Disposition header is missing or broken.
2325
           * Choose unique file name according to given URL. */
2326
0
          hs->local_file = url_file_name (u, NULL);
2327
0
        }
2328
0
      else
2329
0
        {
2330
0
          DEBUGP (("Parsed filename from Content-Disposition: %s\n",
2331
0
                  local_file));
2332
0
          hs->local_file = url_file_name (u, local_file);
2333
0
        }
2334
2335
0
      xfree (local_file);
2336
0
    }
2337
2338
0
  hs->temporary = opt.delete_after || opt.spider || !acceptable (hs->local_file);
2339
0
  if (hs->temporary)
2340
0
    {
2341
0
      char *tmp = aprintf ("%s.tmp", hs->local_file);
2342
0
      xfree (hs->local_file);
2343
0
      hs->local_file = tmp;
2344
0
    }
2345
2346
  /* TODO: perform this check only once. */
2347
0
  if (!hs->existence_checked && file_exists_p (hs->local_file, NULL))
2348
0
    {
2349
0
      if (opt.noclobber && !opt.output_document)
2350
0
        {
2351
          /* If opt.noclobber is turned on and file already exists, do not
2352
             retrieve the file. But if the output_document was given, then this
2353
             test was already done and the file didn't exist. Hence the !opt.output_document */
2354
0
          return RETRUNNEEDED;
2355
0
        }
2356
0
      else if (!ALLOW_CLOBBER)
2357
0
        {
2358
0
          char *unique = unique_name_passthrough (hs->local_file);
2359
0
          if (unique != hs->local_file)
2360
0
            xfree (hs->local_file);
2361
0
          hs->local_file = unique;
2362
0
        }
2363
0
    }
2364
0
  hs->existence_checked = true;
2365
2366
  /* Support timestamping */
2367
0
  if (opt.timestamping && !hs->timestamp_checked)
2368
0
    {
2369
0
      uerr_t timestamp_err = set_file_timestamp (hs);
2370
0
      if (timestamp_err != RETROK)
2371
0
        return timestamp_err;
2372
0
    }
2373
0
  return RETROK;
2374
0
}
2375
2376
static uerr_t
2377
check_auth (const struct url *u, char *user, char *passwd, struct response *resp,
2378
            struct request *req, bool *ntlm_seen_ref, bool *retry,
2379
            bool *basic_auth_finished_ref, bool *auth_finished_ref)
2380
0
{
2381
0
  uerr_t auth_err = RETROK;
2382
0
  bool basic_auth_finished = *basic_auth_finished_ref;
2383
0
  bool auth_finished = *auth_finished_ref;
2384
0
  bool ntlm_seen = *ntlm_seen_ref;
2385
0
  char buf[256], *tmp = NULL;
2386
2387
0
  *retry = false;
2388
2389
0
  if (!auth_finished && (user && passwd))
2390
0
    {
2391
      /* IIS sends multiple copies of WWW-Authenticate, one with
2392
         the value "negotiate", and other(s) with data.  Loop over
2393
         all the occurrences and pick the one we recognize.  */
2394
0
      int wapos;
2395
0
      const char *www_authenticate = NULL;
2396
0
      const char *wabeg, *waend;
2397
0
      const char *digest = NULL, *basic = NULL, *ntlm = NULL;
2398
2399
0
      for (wapos = 0; !ntlm
2400
0
             && (wapos = resp_header_locate (resp, "WWW-Authenticate", wapos,
2401
0
                                             &wabeg, &waend)) != -1;
2402
0
           ++wapos)
2403
0
        {
2404
0
          param_token name, value;
2405
0
          size_t len = waend - wabeg;
2406
2407
0
          if (tmp != buf)
2408
0
            xfree (tmp);
2409
2410
0
          if (len < sizeof (buf))
2411
0
            tmp = buf;
2412
0
          else
2413
0
            tmp = xmalloc (len + 1);
2414
2415
0
          memcpy (tmp, wabeg, len);
2416
0
          tmp[len] = 0;
2417
2418
0
          www_authenticate = tmp;
2419
2420
0
          for (;!ntlm;)
2421
0
            {
2422
              /* extract the auth-scheme */
2423
0
              while (c_isspace (*www_authenticate)) www_authenticate++;
2424
0
              name.e = name.b = www_authenticate;
2425
0
              while (*name.e && !c_isspace (*name.e)) name.e++;
2426
2427
0
              if (name.b == name.e)
2428
0
                break;
2429
2430
0
              DEBUGP (("Auth scheme found '%.*s'\n", (int) (name.e - name.b), name.b));
2431
2432
0
              if (known_authentication_scheme_p (name.b, name.e))
2433
0
                {
2434
0
                  if (BEGINS_WITH (name.b, "NTLM"))
2435
0
                    {
2436
0
                      ntlm = name.b;
2437
0
                      break; /* this is the most secure challenge, stop here */
2438
0
                    }
2439
0
                  else if (!digest && BEGINS_WITH (name.b, "Digest"))
2440
0
                    digest = name.b;
2441
0
                  else if (!basic && BEGINS_WITH (name.b, "Basic"))
2442
0
                    basic = name.b;
2443
0
                }
2444
2445
              /* now advance over the auth-params */
2446
0
              www_authenticate = name.e;
2447
0
              DEBUGP (("Auth param list '%s'\n", www_authenticate));
2448
0
              while (extract_param (&www_authenticate, &name, &value, ',', NULL) && name.b && value.b)
2449
0
                {
2450
0
                  DEBUGP (("Auth param %.*s=%.*s\n",
2451
0
                           (int) (name.e - name.b), name.b, (int) (value.e - value.b), value.b));
2452
0
                }
2453
0
            }
2454
0
        }
2455
2456
0
      if (!basic && !digest && !ntlm)
2457
0
        {
2458
          /* If the authentication header is missing or
2459
             unrecognized, there's no sense in retrying.  */
2460
0
          logputs (LOG_NOTQUIET, _("Unknown authentication scheme.\n"));
2461
0
        }
2462
0
      else if (!basic_auth_finished
2463
0
               || !basic)
2464
0
        {
2465
0
          char *pth = url_full_path (u);
2466
0
          const char *value;
2467
0
          uerr_t *auth_stat;
2468
0
          auth_stat = xmalloc (sizeof (uerr_t));
2469
0
          *auth_stat = RETROK;
2470
2471
0
          if (ntlm)
2472
0
            www_authenticate = ntlm;
2473
0
          else if (digest)
2474
0
            www_authenticate = digest;
2475
0
          else
2476
0
            www_authenticate = basic;
2477
2478
0
          logprintf (LOG_NOTQUIET, _("Authentication selected: %s\n"), www_authenticate);
2479
2480
0
          value =  create_authorization_line (www_authenticate,
2481
0
                                              user, passwd,
2482
0
                                              request_method (req),
2483
0
                                              pth,
2484
0
                                              &auth_finished,
2485
0
                                              auth_stat);
2486
2487
0
          auth_err = *auth_stat;
2488
0
          xfree (auth_stat);
2489
0
          xfree (pth);
2490
0
          if (auth_err == RETROK)
2491
0
            {
2492
0
              request_set_header (req, "Authorization", value, rel_value);
2493
2494
0
              if (BEGINS_WITH (www_authenticate, "NTLM"))
2495
0
                ntlm_seen = true;
2496
0
              else if (!u->user && BEGINS_WITH (www_authenticate, "Basic"))
2497
0
                {
2498
                  /* Need to register this host as using basic auth,
2499
                   * so we automatically send creds next time. */
2500
0
                  register_basic_auth_host (u->host);
2501
0
                }
2502
2503
0
              *retry = true;
2504
0
              goto cleanup;
2505
0
            }
2506
0
          else
2507
0
            {
2508
              /* Creating the Authorization header went wrong */
2509
0
              xfree (value);
2510
0
            }
2511
0
        }
2512
0
      else
2513
0
        {
2514
          /* We already did Basic auth, and it failed. Gotta
2515
           * give up. */
2516
0
        }
2517
0
    }
2518
2519
0
 cleanup:
2520
0
   if (tmp != buf)
2521
0
     xfree (tmp);
2522
0
  *ntlm_seen_ref = ntlm_seen;
2523
0
  *basic_auth_finished_ref = basic_auth_finished;
2524
0
  *auth_finished_ref = auth_finished;
2525
0
  return auth_err;
2526
0
}
2527
2528
static uerr_t
2529
open_output_stream (struct http_stat *hs, int count, FILE **fp)
2530
0
{
2531
/* 2005-06-17 SMS.
2532
   For VMS, define common fopen() optional arguments.
2533
*/
2534
#ifdef __VMS
2535
# define FOPEN_OPT_ARGS "fop=sqo", "acc", acc_cb, &open_id
2536
# define FOPEN_BIN_FLAG 3
2537
#else /* def __VMS */
2538
0
# define FOPEN_BIN_FLAG true
2539
0
#endif /* def __VMS [else] */
2540
2541
  /* Open the local file.  */
2542
0
  if (!output_stream)
2543
0
    {
2544
0
      mkalldirs (hs->local_file);
2545
0
      if (opt.backups)
2546
0
        rotate_backups (hs->local_file);
2547
0
      if (hs->restval)
2548
0
        {
2549
#ifdef __VMS
2550
          int open_id;
2551
2552
          open_id = 21;
2553
          *fp = fopen (hs->local_file, "ab", FOPEN_OPT_ARGS);
2554
#else /* def __VMS */
2555
0
          *fp = fopen (hs->local_file, "ab");
2556
0
#endif /* def __VMS [else] */
2557
0
        }
2558
0
      else if (ALLOW_CLOBBER || count > 0)
2559
0
        {
2560
0
          if (opt.unlink_requested && file_exists_p (hs->local_file, NULL))
2561
0
            {
2562
0
              if (unlink (hs->local_file) < 0)
2563
0
                {
2564
0
                  logprintf (LOG_NOTQUIET, "%s: %s\n", hs->local_file,
2565
0
                             strerror (errno));
2566
0
                  return UNLINKERR;
2567
0
                }
2568
0
            }
2569
2570
#ifdef __VMS
2571
          int open_id;
2572
2573
          open_id = 22;
2574
          *fp = fopen (hs->local_file, "wb", FOPEN_OPT_ARGS);
2575
#else /* def __VMS */
2576
0
          if (hs->temporary)
2577
0
            {
2578
0
              *fp = fdopen (open (hs->local_file, O_BINARY | O_CREAT | O_TRUNC | O_WRONLY, S_IRUSR | S_IWUSR), "wb");
2579
0
            }
2580
0
          else
2581
0
            {
2582
0
              *fp = fopen (hs->local_file, "wb");
2583
0
            }
2584
2585
0
#endif /* def __VMS [else] */
2586
0
        }
2587
0
      else
2588
0
        {
2589
0
          *fp = fopen_excl (hs->local_file, FOPEN_BIN_FLAG);
2590
0
          if (!*fp && errno == EEXIST)
2591
0
            {
2592
              /* We cannot just invent a new name and use it (which is
2593
                 what functions like unique_create typically do)
2594
                 because we told the user we'd use this name.
2595
                 Instead, return and retry the download.  */
2596
0
              logprintf (LOG_NOTQUIET,
2597
0
                         _("%s has sprung into existence.\n"),
2598
0
                         hs->local_file);
2599
0
              return FOPEN_EXCL_ERR;
2600
0
            }
2601
0
        }
2602
0
      if (!*fp)
2603
0
        {
2604
0
          logprintf (LOG_NOTQUIET, "%s: %s\n", hs->local_file, strerror (errno));
2605
0
          return FOPENERR;
2606
0
        }
2607
0
    }
2608
0
  else
2609
0
    *fp = output_stream;
2610
2611
  /* Print fetch message, if opt.verbose.  */
2612
0
  logprintf (LOG_VERBOSE, _("Saving to: %s\n"),
2613
0
             HYPHENP (hs->local_file) ? quote ("STDOUT") : quote (hs->local_file));
2614
2615
0
  return RETROK;
2616
0
}
2617
2618
/* Set proper type flags based on type string.  */
2619
static void
2620
set_content_type (int *dt, const char *type)
2621
0
{
2622
  /* If content-type is not given, assume text/html.  This is because
2623
     of the multitude of broken CGI's that "forget" to generate the
2624
     content-type.  */
2625
0
  if (!type ||
2626
0
      0 == c_strcasecmp (type, TEXTHTML_S) ||
2627
0
      0 == c_strcasecmp (type, TEXTXHTML_S))
2628
0
    *dt |= TEXTHTML;
2629
0
  else
2630
0
    *dt &= ~TEXTHTML;
2631
2632
0
  if (type &&
2633
0
      0 == c_strcasecmp (type, TEXTCSS_S))
2634
0
    *dt |= TEXTCSS;
2635
0
  else
2636
0
    *dt &= ~TEXTCSS;
2637
0
}
2638
2639
#ifdef HAVE_METALINK
/* Build a metalink_t describing a single file from the headers of the
   HTTP response RESP.

   The file name is taken from HS->local_file when set, otherwise it is
   derived from U.  Three kinds of headers are examined:
     - Content-Type: a value of "application/metalink4+xml" registers
       U->url itself as a metaurl;
     - Link: rel=duplicate entries become resources, rel=describedby
       entries become either the (single) PGP signature or a metaurl,
       depending on their type attribute;
     - Digest: every base64 hash is converted to hex and stored as a
       checksum.

   While a signature is being fetched the global OUTPUT_STREAM is
   temporarily pointed at a temporary file and opt.metalink_over_http
   is switched off; both are restored afterwards.

   Returns the newly allocated structure, or NULL when no resource or
   metaurl was found, or when resources were found without any digest.
   Two exit points: one for success and one for failure.  */
static metalink_t *
metalink_from_http (const struct response *resp, const struct http_stat *hs,
                    const struct url *u)
{
  metalink_t *metalink = NULL;
  metalink_file_t *mfile = xnew0 (metalink_file_t);
  const char *val_beg, *val_end;
  int res_count = 0, meta_count = 0, hash_count = 0, sig_count = 0, i;

  DEBUGP (("Checking for Metalink in HTTP response\n"));

  /* Initialize metalink file for our simple use case.  */
  if (hs->local_file)
    mfile->name = xstrdup (hs->local_file);
  else
    mfile->name = url_file_name (u, NULL);

  /* Begin with 1-element array (for 0-termination). */
  mfile->checksums = xnew0 (metalink_checksum_t *);
  mfile->resources = xnew0 (metalink_resource_t *);
  mfile->metaurls = xnew0 (metalink_metaurl_t *);

  /* Process the Content-Type header.  */
  if (resp_header_locate (resp, "Content-Type", 0, &val_beg, &val_end) != -1)
    {
      metalink_metaurl_t murl = {0};

      const char *type_beg, *type_end;
      char *typestr = NULL;
      char *namestr = NULL;
      size_t type_len;

      DEBUGP (("Processing Content-Type header...\n"));

      /* Find beginning of type.  */
      type_beg = val_beg;
      while (type_beg < val_end && c_isspace (*type_beg))
        type_beg++;

      /* Find end of type.  */
      type_end = type_beg + 1;
      while (type_end < val_end &&
             *type_end != ';' &&
             *type_end != ' ' &&
             *type_end != '\r' &&
             *type_end != '\n')
        type_end++;

      if (type_beg >= val_end || type_end > val_end)
        {
          DEBUGP (("Invalid Content-Type header. Ignoring.\n"));
          goto skip_content_type;
        }

      type_len = type_end - type_beg;
      typestr = xstrndup (type_beg, type_len);

      DEBUGP (("Content-Type: %s\n", typestr));

      if (strcmp (typestr, "application/metalink4+xml"))
        {
          xfree (typestr);
          goto skip_content_type;
        }

      /*
        Valid ranges for the "pri" attribute are from
        1 to 999999.  Mirror servers with a lower value of the "pri"
        attribute have a higher priority, while mirrors with an undefined
        "pri" attribute are considered to have a value of 999999, which is
        the lowest priority.

        rfc6249 section 3.1
      */
      murl.priority = DEFAULT_PRI;

      /* Ownership of the type string moves into the metaurl.  */
      murl.mediatype = typestr;
      typestr = NULL;

      if (opt.content_disposition
          && resp_header_locate (resp, "Content-Disposition", 0, &val_beg, &val_end) != -1)
        {
          find_key_value (val_beg, val_end, "filename", &namestr);
          murl.name = namestr;
          namestr = NULL;
        }

      murl.url = xstrdup (u->url);

      DEBUGP (("URL=%s\n", murl.url));
      DEBUGP (("MEDIATYPE=%s\n", murl.mediatype));
      DEBUGP (("NAME=%s\n", murl.name ? murl.name : ""));
      DEBUGP (("PRIORITY=%d\n", murl.priority));

      /* 1 slot from new resource, 1 slot for null-termination.  */
      mfile->metaurls = xrealloc (mfile->metaurls,
                                  sizeof (metalink_metaurl_t *) * (meta_count + 2));
      mfile->metaurls[meta_count] = xnew0 (metalink_metaurl_t);
      *mfile->metaurls[meta_count] = murl;
      meta_count++;
    }
skip_content_type:

  /* Find all Link headers.  */
  for (i = 0;
       (i = resp_header_locate (resp, "Link", i, &val_beg, &val_end)) != -1;
       i++)
    {
      char *rel = NULL, *reltype = NULL;
      char *urlstr = NULL;
      const char *url_beg, *url_end, *attrs_beg;
      size_t url_len;

      /* Sample Metalink Link headers:

           Link: <http://www2.example.com/dir1/dir2/dir3/dir4/dir5/example.ext>;
           rel=duplicate; pri=1; pref; geo=gb; depth=4

           Link: <http://example.com/example.ext.asc>; rel=describedby;
           type="application/pgp-signature"
       */

      /* Find beginning of URL.  */
      url_beg = val_beg;
      while (url_beg < val_end - 1 && c_isspace (*url_beg))
        url_beg++;

      /* Find end of URL.  */
      /* The convention here is that end ptr points to one element after
         end of string. In this case, it should be pointing to the '>', which
         is one element after end of actual URL. Therefore, it should never point
         to val_end, which is one element after entire header value string.  */
      url_end = url_beg + 1;
      while (url_end < val_end - 1 && *url_end != '>')
        url_end++;

      if (url_beg >= val_end || url_end >= val_end ||
          *url_beg != '<' || *url_end != '>')
        {
          DEBUGP (("This is not a valid Link header. Ignoring.\n"));
          continue;
        }

      /* Skip <.  */
      url_beg++;
      url_len = url_end - url_beg;

      /* URL found. Now handle the attributes.  */
      attrs_beg = url_end + 1;

      /* First we need to find out what type of link it is. Currently, we
         support rel=duplicate and rel=describedby.  */
      if (!find_key_value (attrs_beg, val_end, "rel", &rel))
        {
          DEBUGP (("No rel value in Link header, skipping.\n"));
          continue;
        }

      urlstr = xstrndup (url_beg, url_len);
      DEBUGP (("URL=%s\n", urlstr));
      DEBUGP (("rel=%s\n", rel));

      if (!strcmp (rel, "describedby"))
        find_key_value (attrs_beg, val_end, "type", &reltype);

      /* Handle signatures.
         Libmetalink only supports one signature per file. Therefore we stop
         as soon as we successfully get first supported signature.  */
      if (sig_count == 0 &&
          reltype && !strcmp (reltype, "application/pgp-signature"))
        {
          /* Download the signature to a temporary file.  */
          FILE *_output_stream = output_stream;
          bool _output_stream_regular = output_stream_regular;

          output_stream = tmpfile ();
          if (output_stream)
            {
              struct iri *iri = iri_new ();
              struct url *url;
              int url_err;

              set_uri_encoding (iri, opt.locale, true);
              url = url_parse (urlstr, &url_err, iri, false);

              if (!url)
                {
                  logprintf (LOG_NOTQUIET, _("When downloading signature:\n"
                                             "%s: %s.\n"), urlstr, url_error (url_err));
                  iri_free (iri);
                }
              else
                {
                  /* Avoid recursive Metalink from HTTP headers.  */
                  bool _metalink_http = opt.metalink_over_http;
                  uerr_t retr_err;

                  opt.metalink_over_http = false;
                  retr_err = retrieve_url (url, urlstr, NULL, NULL,
                                           NULL, NULL, false, iri, false);
                  opt.metalink_over_http = _metalink_http;

                  url_free (url);
                  iri_free (iri);

                  if (retr_err == RETROK)
                    {
                      /* Signature is in the temporary file. Read it into
                         metalink resource structure.  */
                      metalink_signature_t msig;
                      long sigpos = -1;

                      /* Determine the size of the downloaded data.  A
                         failing fseek/ftell must not be turned into a
                         bogus (huge) length.  */
                      if (fseek (output_stream, 0, SEEK_END) == 0)
                        sigpos = ftell (output_stream);

                      if (sigpos < 0
                          || fseek (output_stream, 0, SEEK_SET) != 0)
                        {
                          logputs (LOG_NOTQUIET,
                                   _("Unable to read signature content from "
                                     "temporary file. Skipping.\n"));
                        }
                      else
                        {
                          size_t siglen = (size_t) sigpos;

                          DEBUGP (("siglen=%lu\n", (unsigned long) siglen));

                          msig.signature = xmalloc (siglen + 1);
                          if (fread (msig.signature, siglen, 1, output_stream) != 1)
                            {
                              logputs (LOG_NOTQUIET,
                                       _("Unable to read signature content from "
                                         "temporary file. Skipping.\n"));
                              xfree (msig.signature);
                            }
                          else
                            {
                              msig.signature[siglen] = '\0'; /* Just in case.  */
                              msig.mediatype = xstrdup ("application/pgp-signature");

                              DEBUGP (("Signature (%s):\n%s\n",
                                       msig.mediatype, msig.signature));

                              mfile->signature = xnew (metalink_signature_t);
                              *mfile->signature = msig;

                              sig_count++;
                            }
                        }
                    }
                }
              fclose (output_stream);
            }
          else
            {
              logputs (LOG_NOTQUIET, _("Could not create temporary file. "
                                       "Skipping signature download.\n"));
            }
          output_stream_regular = _output_stream_regular;
          output_stream = _output_stream;
        } /* Iterate over signatures.  */

      /* Handle Metalink resources.  */
      else if (!strcmp (rel, "duplicate"))
        {
          metalink_resource_t mres = {0};
          char *pristr;

          /*
             Valid ranges for the "pri" attribute are from
             1 to 999999.  Mirror servers with a lower value of the "pri"
             attribute have a higher priority, while mirrors with an undefined
             "pri" attribute are considered to have a value of 999999, which is
             the lowest priority.

             rfc6249 section 3.1
           */
          mres.priority = DEFAULT_PRI;
          if (find_key_value (url_end, val_end, "pri", &pristr))
            {
              long pri;
              char *end_pristr;
              /* Do not care for errno since 0 is error in this case.  */
              pri = strtol (pristr, &end_pristr, 10);
              if (end_pristr != pristr + strlen (pristr) ||
                  !VALID_PRI_RANGE (pri))
                {
                  /* This is against the specification, so let's inform the user.  */
                  logprintf (LOG_NOTQUIET,
                             _("Invalid pri value. Assuming %d.\n"),
                             DEFAULT_PRI);
                }
              else
                mres.priority = pri;
              xfree (pristr);
            }

          switch (url_scheme (urlstr))
            {
            case SCHEME_HTTP:
              mres.type = xstrdup ("http");
              break;
#ifdef HAVE_SSL
            case SCHEME_HTTPS:
              mres.type = xstrdup ("https");
              break;
            case SCHEME_FTPS:
              mres.type = xstrdup ("ftps");
              break;
#endif
            case SCHEME_FTP:
              mres.type = xstrdup ("ftp");
              break;
            default:
              DEBUGP (("Unsupported url scheme in %s. Skipping resource.\n", urlstr));
            }

          if (mres.type)
            {
              DEBUGP (("TYPE=%s\n", mres.type));

              /* At this point we have validated the new resource.  */

              find_key_value (url_end, val_end, "geo", &mres.location);

              /* Ownership of the URL string moves into the resource.  */
              mres.url = urlstr;
              urlstr = NULL;

              mres.preference = 0;
              if (has_key (url_end, val_end, "pref"))
                {
                  DEBUGP (("This resource has preference\n"));
                  mres.preference = 1;
                }

              /* 1 slot from new resource, 1 slot for null-termination.  */
              mfile->resources = xrealloc (mfile->resources,
                                           sizeof (metalink_resource_t *) * (res_count + 2));
              mfile->resources[res_count] = xnew0 (metalink_resource_t);
              *mfile->resources[res_count] = mres;
              res_count++;
            }
        } /* Handle resource link (rel=duplicate).  */

      /* Handle Metalink/XML resources.  */
      else if (reltype && !strcmp (reltype, "application/metalink4+xml"))
        {
          metalink_metaurl_t murl = {0};
          char *pristr;

          /*
             Valid ranges for the "pri" attribute are from
             1 to 999999.  Mirror servers with a lower value of the "pri"
             attribute have a higher priority, while mirrors with an undefined
             "pri" attribute are considered to have a value of 999999, which is
             the lowest priority.

             rfc6249 section 3.1
           */
          murl.priority = DEFAULT_PRI;
          if (find_key_value (url_end, val_end, "pri", &pristr))
            {
              long pri;
              char *end_pristr;
              /* Do not care for errno since 0 is error in this case.  */
              pri = strtol (pristr, &end_pristr, 10);
              if (end_pristr != pristr + strlen (pristr) ||
                  !VALID_PRI_RANGE (pri))
                {
                  /* This is against the specification, so let's inform the user.  */
                  logprintf (LOG_NOTQUIET,
                             _("Invalid pri value. Assuming %d.\n"),
                             DEFAULT_PRI);
                }
              else
                murl.priority = pri;
              xfree (pristr);
            }

          murl.mediatype = xstrdup (reltype);

          DEBUGP (("MEDIATYPE=%s\n", murl.mediatype));

          /* At this point we have validated the new resource.  */

          find_key_value (url_end, val_end, "name", &murl.name);

          /* Ownership of the URL string moves into the metaurl.  */
          murl.url = urlstr;
          urlstr = NULL;

          /* 1 slot from new resource, 1 slot for null-termination.  */
          mfile->metaurls = xrealloc (mfile->metaurls,
                                       sizeof (metalink_metaurl_t *) * (meta_count + 2));
          mfile->metaurls[meta_count] = xnew0 (metalink_metaurl_t);
          *mfile->metaurls[meta_count] = murl;
          meta_count++;
        } /* Handle resource link (rel=describedby).  */
      else
        DEBUGP (("This link header was not used for Metalink\n"));

      xfree (urlstr);
      xfree (reltype);
      xfree (rel);
    } /* Iterate over link headers.  */

  /* Null-terminate resources array.  */
  mfile->resources[res_count] = 0;
  mfile->metaurls[meta_count] = 0;

  if (res_count == 0 && meta_count == 0)
    {
      DEBUGP (("No valid metalink references found.\n"));
      goto fail;
    }

  /* Find all Digest headers.  */
  for (i = 0;
       (i = resp_header_locate (resp, "Digest", i, &val_beg, &val_end)) != -1;
       i++)
    {
      const char *dig_pos;
      char *dig_type, *dig_hash;

      /* Each Digest header can include multiple hashes. Example:
           Digest: SHA=thvDyvhfIqlvFe+A9MYgxAfm1q5=,unixsum=30637
           Digest: md5=HUXZLQLMuI/KZ5KDcJPcOA==
       */
      for (dig_pos = val_beg;
           (dig_pos = find_key_values (dig_pos, val_end, &dig_type, &dig_hash));
           dig_pos++)
        {
          /* The hash here is assumed to be base64. We need the hash in hex.
             Therefore we convert: base64 -> binary -> hex.  */
          const size_t dig_hash_str_len = strlen (dig_hash);
          char bin_hash[256];
          ssize_t hash_bin_len;

          /* There is no hash with that size.  */
          if (dig_hash_str_len >= sizeof (bin_hash))
            {
              DEBUGP (("Hash too long, ignored.\n"));
              xfree (dig_type);
              xfree (dig_hash);
              continue;
            }

          hash_bin_len = wget_base64_decode (dig_hash, bin_hash, dig_hash_str_len * 3 / 4 + 1);

          /* Detect malformed base64 input.  */
          if (hash_bin_len < 0)
            {
              DEBUGP (("Malformed base64 input, ignored.\n"));
              xfree (dig_type);
              xfree (dig_hash);
              continue;
            }

          /* One slot for me, one for zero-termination.  */
          mfile->checksums =
                  xrealloc (mfile->checksums,
                            sizeof (metalink_checksum_t *) * (hash_count + 2));
          mfile->checksums[hash_count] = xnew (metalink_checksum_t);
          mfile->checksums[hash_count]->type = dig_type;

          mfile->checksums[hash_count]->hash = xmalloc ((size_t)hash_bin_len * 2 + 1);
          wg_hex_to_string (mfile->checksums[hash_count]->hash, bin_hash, (size_t)hash_bin_len);

          xfree (dig_hash);

          hash_count++;
        }
    }

  /* Zero-terminate checksums array.  */
  mfile->checksums[hash_count] = 0;

  /*
    If Instance Digests are not provided by the Metalink servers, the
    Link header fields pertaining to this specification MUST be ignored.

    rfc6249 section 6
   */
  if (res_count && hash_count == 0)
    {
      logputs (LOG_VERBOSE,
               _("Could not find acceptable digest for Metalink resources.\n"
                 "Ignoring them.\n"));
      goto fail;
    }

  /* Metalink data is OK. Now we just need to sort the resources based
     on their priorities, preference, and perhaps location.  */
  stable_sort (mfile->resources, res_count, sizeof (metalink_resource_t *), metalink_res_cmp);
  stable_sort (mfile->metaurls, meta_count, sizeof (metalink_metaurl_t *), metalink_meta_cmp);

  /* Restore sensible preference values (in case someone cares to look).  */
  for (i = 0; i < res_count; ++i)
    mfile->resources[i]->preference = 1000000 - mfile->resources[i]->priority;

  metalink = xnew0 (metalink_t);
  metalink->files = xmalloc (sizeof (metalink_file_t *) * 2);
  metalink->files[0] = mfile;
  metalink->files[1] = 0;
  metalink->origin = xstrdup (u->url);
  metalink->version = METALINK_VERSION_4;
  /* Leave other fields set to 0.  */

  return metalink;

fail:
  /* Free all allocated memory.  */
  if (metalink)
    metalink_delete (metalink);
  else
    metalink_file_delete (mfile);
  return NULL;
}
#endif /* HAVE_METALINK */
3151
3152
/* Retrieve a document through HTTP protocol.  It recognizes status
3153
   code, and correctly handles redirections.  It closes the network
3154
   socket.  If it receives an error from the functions below it, it
3155
   will print it if there is enough information to do so (almost
3156
   always), returning the error to the caller (i.e. http_loop).
3157
3158
   Various HTTP parameters are stored to hs.
3159
3160
   If PROXY is non-NULL, the connection will be made to the proxy
3161
   server, and u->url will be requested.  */
3162
static uerr_t
3163
gethttp (const struct url *u, struct url *original_url, struct http_stat *hs,
3164
         int *dt, struct url *proxy, struct iri *iri, int count)
3165
0
{
3166
0
  struct request *req = NULL;
3167
3168
0
  char *type = NULL;
3169
0
  char *user, *passwd;
3170
0
  char *proxyauth;
3171
0
  int statcode;
3172
0
  int write_error;
3173
0
  wgint contlen, contrange;
3174
0
  const struct url *conn;
3175
0
  FILE *fp;
3176
0
  int err;
3177
0
  uerr_t retval;
3178
0
#ifdef HAVE_HSTS
3179
0
#ifdef TESTING
3180
  /* we don't link against main.o when we're testing */
3181
0
  hsts_store_t hsts_store = NULL;
3182
#else
3183
  extern hsts_store_t hsts_store;
3184
#endif
3185
0
#endif
3186
3187
0
  int sock = -1;
3188
3189
  /* Set to 1 when the authorization has already been sent and should
3190
     not be tried again. */
3191
0
  bool auth_finished = false;
3192
3193
  /* Set to 1 when just globally-set Basic authorization has been sent;
3194
   * should prevent further Basic negotiations, but not other
3195
   * mechanisms. */
3196
0
  bool basic_auth_finished = false;
3197
3198
  /* Whether NTLM authentication is used for this request. */
3199
0
  bool ntlm_seen = false;
3200
3201
  /* Whether our connection to the remote host is through SSL.  */
3202
0
  bool using_ssl = false;
3203
3204
  /* Whether a HEAD request will be issued (as opposed to GET or
3205
     POST). */
3206
0
  bool head_only = !!(*dt & HEAD_ONLY);
3207
3208
  /* Whether conditional get request will be issued.  */
3209
0
  bool cond_get = !!(*dt & IF_MODIFIED_SINCE);
3210
3211
#ifdef HAVE_METALINK
3212
  /* Are we looking for metalink info in HTTP headers?  */
3213
  bool metalink = !!(*dt & METALINK_METADATA);
3214
#endif
3215
3216
0
  char *head = NULL;
3217
0
  struct response *resp = NULL;
3218
0
  char hdrval[512];
3219
0
  char *message = NULL;
3220
3221
  /* Declare WARC variables. */
3222
0
  bool warc_enabled = (opt.warc_filename != NULL);
3223
0
  FILE *warc_tmp = NULL;
3224
0
  char warc_timestamp_str [21];
3225
0
  char warc_request_uuid [48];
3226
0
  ip_address warc_ip_buf, *warc_ip = NULL;
3227
0
  off_t warc_payload_offset = -1;
3228
3229
  /* Whether this connection will be kept alive after the HTTP request
3230
     is done. */
3231
0
  bool keep_alive;
3232
3233
  /* Is the server using the chunked transfer encoding?  */
3234
0
  bool chunked_transfer_encoding = false;
3235
3236
  /* Whether keep-alive should be inhibited.  */
3237
0
  bool inhibit_keep_alive =
3238
0
    !opt.http_keep_alive || opt.ignore_length;
3239
3240
  /* Headers sent when using POST. */
3241
0
  wgint body_data_size = 0;
3242
3243
0
#ifdef HAVE_SSL
3244
0
  if (u->scheme == SCHEME_HTTPS)
3245
0
    {
3246
      /* Initialize the SSL context.  After this has once been done,
3247
         it becomes a no-op.  */
3248
0
      if (!ssl_init ())
3249
0
        {
3250
0
          scheme_disable (SCHEME_HTTPS);
3251
0
          logprintf (LOG_NOTQUIET,
3252
0
                     _("Disabling SSL due to encountered errors.\n"));
3253
0
          retval = SSLINITFAILED;
3254
0
          goto cleanup;
3255
0
        }
3256
0
    }
3257
0
#endif /* HAVE_SSL */
3258
3259
  /* Initialize certain elements of struct http_stat.
3260
   * Since this function is called in a loop, we have to xfree certain
3261
   * members. */
3262
0
  hs->len = 0;
3263
0
  hs->contlen = -1;
3264
0
  hs->res = -1;
3265
0
  xfree (hs->rderrmsg);
3266
0
  xfree (hs->newloc);
3267
0
  xfree (hs->remote_time);
3268
0
  xfree (hs->error);
3269
0
  xfree (hs->message);
3270
0
  hs->local_encoding = ENC_NONE;
3271
0
  hs->remote_encoding = ENC_NONE;
3272
3273
0
  conn = u;
3274
3275
0
  {
3276
0
    uerr_t ret;
3277
0
    req = initialize_request (u, hs, dt, proxy, inhibit_keep_alive,
3278
0
                              &basic_auth_finished, &body_data_size,
3279
0
                              &user, &passwd, &ret);
3280
0
    if (req == NULL)
3281
0
      {
3282
0
        retval = ret;
3283
0
        goto cleanup;
3284
0
      }
3285
0
  }
3286
0
 retry_with_auth:
3287
  /* We need to come back here when the initial attempt to retrieve
3288
     without authorization header fails.  (Expected to happen at least
3289
     for the Digest authorization scheme.)  */
3290
3291
0
  if (opt.cookies)
3292
0
    request_set_header (req, "Cookie",
3293
0
                        cookie_header (wget_cookie_jar,
3294
0
                                       u->host, u->port, u->path,
3295
0
#ifdef HAVE_SSL
3296
0
                                       u->scheme == SCHEME_HTTPS
3297
#else
3298
                                       0
3299
#endif
3300
0
                                       ),
3301
0
                        rel_value);
3302
3303
  /* Add the user headers. */
3304
0
  if (opt.user_headers)
3305
0
    {
3306
0
      int i;
3307
0
      for (i = 0; opt.user_headers[i]; i++)
3308
0
        request_set_user_header (req, opt.user_headers[i]);
3309
0
    }
3310
3311
0
  proxyauth = NULL;
3312
0
  if (proxy)
3313
0
    {
3314
0
      conn = proxy;
3315
0
      initialize_proxy_configuration (u, req, proxy, &proxyauth);
3316
0
    }
3317
0
  keep_alive = true;
3318
3319
  /* Establish the connection.  */
3320
0
  if (inhibit_keep_alive)
3321
0
    keep_alive = false;
3322
3323
0
  {
3324
0
    uerr_t conn_err = establish_connection (u, &conn, hs, proxy, &proxyauth, &req,
3325
0
                                            &using_ssl, inhibit_keep_alive, &sock);
3326
0
    if (conn_err != RETROK)
3327
0
      {
3328
0
        retval = conn_err;
3329
0
        goto cleanup;
3330
0
      }
3331
0
  }
3332
3333
  /* Open the temporary file where we will write the request. */
3334
0
  if (warc_enabled)
3335
0
    {
3336
0
      warc_tmp = warc_tempfile ();
3337
0
      if (warc_tmp == NULL)
3338
0
        {
3339
0
          CLOSE_INVALIDATE (sock);
3340
0
          retval = WARC_TMP_FOPENERR;
3341
0
          goto cleanup;
3342
0
        }
3343
3344
0
      if (! proxy)
3345
0
        {
3346
0
          warc_ip = &warc_ip_buf;
3347
0
          socket_ip_address (sock, warc_ip, ENDPOINT_PEER);
3348
0
        }
3349
0
    }
3350
3351
  /* Send the request to server.  */
3352
0
  write_error = request_send (req, sock, warc_tmp);
3353
3354
0
  if (write_error >= 0)
3355
0
    {
3356
0
      if (opt.body_data)
3357
0
        {
3358
0
          DEBUGP (("[BODY data: %s]\n", opt.body_data));
3359
0
          write_error = fd_write (sock, opt.body_data, body_data_size, -1);
3360
0
          if (write_error >= 0 && warc_tmp != NULL)
3361
0
            {
3362
0
              int warc_tmp_written;
3363
3364
              /* Remember end of headers / start of payload. */
3365
0
              warc_payload_offset = ftello (warc_tmp);
3366
3367
              /* Write a copy of the data to the WARC record. */
3368
0
              warc_tmp_written = fwrite (opt.body_data, 1, body_data_size, warc_tmp);
3369
0
              if (warc_tmp_written != body_data_size)
3370
0
                write_error = -2;
3371
0
            }
3372
0
         }
3373
0
      else if (opt.body_file && body_data_size != 0)
3374
0
        {
3375
0
          if (warc_tmp != NULL)
3376
            /* Remember end of headers / start of payload */
3377
0
            warc_payload_offset = ftello (warc_tmp);
3378
3379
0
          write_error = body_file_send (sock, opt.body_file, body_data_size, warc_tmp);
3380
0
        }
3381
0
    }
3382
3383
0
  if (write_error < 0)
3384
0
    {
3385
0
      CLOSE_INVALIDATE (sock);
3386
3387
0
      if (warc_tmp != NULL)
3388
0
        fclose (warc_tmp);
3389
3390
0
      if (write_error == -2)
3391
0
        retval = WARC_TMP_FWRITEERR;
3392
0
      else
3393
0
        retval = WRITEFAILED;
3394
0
      goto cleanup;
3395
0
    }
3396
0
  logprintf (LOG_VERBOSE, _("%s request sent, awaiting response... "),
3397
0
             proxy ? "Proxy" : "HTTP");
3398
0
  contlen = -1;
3399
0
  contrange = 0;
3400
0
  *dt &= ~RETROKF;
3401
3402
3403
0
  if (warc_enabled)
3404
0
    {
3405
0
      bool warc_result;
3406
3407
      /* Generate a timestamp and uuid for this request. */
3408
0
      warc_timestamp (warc_timestamp_str, sizeof (warc_timestamp_str));
3409
0
      warc_uuid_str (warc_request_uuid, sizeof (warc_request_uuid));
3410
3411
      /* Create a request record and store it in the WARC file. */
3412
0
      warc_result = warc_write_request_record (u->url, warc_timestamp_str,
3413
0
                                               warc_request_uuid, warc_ip,
3414
0
                                               warc_tmp, warc_payload_offset);
3415
0
      if (! warc_result)
3416
0
        {
3417
0
          CLOSE_INVALIDATE (sock);
3418
0
          retval = WARC_ERR;
3419
0
          goto cleanup;
3420
0
        }
3421
3422
      /* warc_write_request_record has also closed warc_tmp. */
3423
0
    }
3424
3425
  /* Repeat while we receive a 10x response code.  */
3426
0
  {
3427
0
    bool _repeat;
3428
3429
0
    do
3430
0
      {
3431
0
        head = read_http_response_head (sock);
3432
0
        if (!head)
3433
0
          {
3434
0
            if (errno == 0)
3435
0
              {
3436
0
                logputs (LOG_NOTQUIET, _("No data received.\n"));
3437
0
                CLOSE_INVALIDATE (sock);
3438
0
                retval = HEOF;
3439
0
              }
3440
0
            else
3441
0
              {
3442
0
                logprintf (LOG_NOTQUIET, _("Read error (%s) in headers.\n"),
3443
0
                           fd_errstr (sock));
3444
0
                CLOSE_INVALIDATE (sock);
3445
0
                retval = HERR;
3446
0
              }
3447
0
            goto cleanup;
3448
0
          }
3449
0
        DEBUGP (("\n---response begin---\n%s---response end---\n", head));
3450
3451
0
        resp = resp_new (head);
3452
3453
        /* Check for status line.  */
3454
0
        xfree (message);
3455
0
        statcode = resp_status (resp, &message);
3456
0
        if (statcode < 0)
3457
0
          {
3458
0
            char *tms = datetime_str (time (NULL));
3459
0
            logprintf (LOG_VERBOSE, "%d\n", statcode);
3460
0
            logprintf (LOG_NOTQUIET, _("%s ERROR %d: %s.\n"), tms, statcode,
3461
0
                       quotearg_style (escape_quoting_style,
3462
0
                                       _("Malformed status line")));
3463
0
            CLOSE_INVALIDATE (sock);
3464
0
            retval = HERR;
3465
0
            goto cleanup;
3466
0
          }
3467
3468
0
        if (H_10X (statcode))
3469
0
          {
3470
0
            xfree (head);
3471
0
            resp_free (&resp);
3472
0
            _repeat = true;
3473
0
            DEBUGP (("Ignoring response\n"));
3474
0
          }
3475
0
        else
3476
0
          {
3477
0
            _repeat = false;
3478
0
          }
3479
0
      }
3480
0
    while (_repeat);
3481
0
  }
3482
3483
0
  xfree (hs->message);
3484
0
  hs->message = xstrdup (message);
3485
0
  if (!opt.server_response)
3486
0
    logprintf (LOG_VERBOSE, "%2d %s\n", statcode,
3487
0
               message ? quotearg_style (escape_quoting_style, message) : "");
3488
0
  else
3489
0
    {
3490
0
      logprintf (LOG_VERBOSE, "\n");
3491
0
      print_server_response (resp, "  ");
3492
0
    }
3493
3494
0
  if (!opt.ignore_length
3495
0
      && resp_header_copy (resp, "Content-Length", hdrval, sizeof (hdrval)))
3496
0
    {
3497
0
      wgint parsed;
3498
0
      errno = 0;
3499
0
      parsed = str_to_wgint (hdrval, NULL, 10);
3500
0
      if (parsed == WGINT_MAX && errno == ERANGE)
3501
0
        {
3502
          /* Out of range.
3503
             #### If Content-Length is out of range, it most likely
3504
             means that the file is larger than 2G and that we're
3505
             compiled without LFS.  In that case we should probably
3506
             refuse to even attempt to download the file.  */
3507
0
          contlen = -1;
3508
0
        }
3509
0
      else if (parsed < 0)
3510
0
        {
3511
          /* Negative Content-Length; nonsensical, so we can't
3512
             assume any information about the content to receive. */
3513
0
          contlen = -1;
3514
0
        }
3515
0
      else
3516
0
        contlen = parsed;
3517
0
    }
3518
3519
  /* Check for keep-alive related responses. */
3520
0
  if (!inhibit_keep_alive)
3521
0
    {
3522
0
      if (resp_header_copy (resp, "Connection", hdrval, sizeof (hdrval)))
3523
0
        {
3524
0
          if (0 == c_strcasecmp (hdrval, "Close"))
3525
0
            keep_alive = false;
3526
0
        }
3527
0
    }
3528
3529
0
  chunked_transfer_encoding = false;
3530
0
  if (resp_header_copy (resp, "Transfer-Encoding", hdrval, sizeof (hdrval))
3531
0
      && 0 == c_strcasecmp (hdrval, "chunked"))
3532
0
    chunked_transfer_encoding = true;
3533
3534
  /* Handle (possibly multiple instances of) the Set-Cookie header. */
3535
0
  if (opt.cookies)
3536
0
    {
3537
0
      int scpos;
3538
0
      const char *scbeg, *scend;
3539
      /* The jar should have been created by now. */
3540
0
      assert (wget_cookie_jar != NULL);
3541
0
      for (scpos = 0;
3542
0
           (scpos = resp_header_locate (resp, "Set-Cookie", scpos,
3543
0
                                        &scbeg, &scend)) != -1;
3544
0
           ++scpos)
3545
0
        {
3546
0
          char buf[1024], *set_cookie;
3547
0
          size_t len = scend - scbeg;
3548
3549
0
          if (len < sizeof (buf))
3550
0
            set_cookie = buf;
3551
0
          else
3552
0
            set_cookie = xmalloc (len + 1);
3553
3554
0
          memcpy (set_cookie, scbeg, len);
3555
0
          set_cookie[len] = 0;
3556
3557
0
          cookie_handle_set_cookie (wget_cookie_jar, u->host, u->port,
3558
0
                                    u->path, set_cookie);
3559
3560
0
          if (set_cookie != buf)
3561
0
            xfree (set_cookie);
3562
0
        }
3563
0
    }
3564
3565
0
  if (keep_alive)
3566
    /* The server has promised that it will not close the connection
3567
       when we're done.  This means that we can register it.  */
3568
0
    register_persistent (conn->host, conn->port, sock, using_ssl);
3569
3570
#ifdef HAVE_METALINK
3571
  /* We need to check for the Metalink data in the very first response
3572
     we get from the server (before redirections, authorization, etc.).  */
3573
  if (metalink)
3574
    {
3575
      hs->metalink = metalink_from_http (resp, hs, u);
3576
      /* Bugfix: hs->local_file is NULL (opt.content_disposition).  */
3577
      if (!hs->local_file && hs->metalink && hs->metalink->origin)
3578
        hs->local_file = xstrdup (hs->metalink->origin);
3579
      xfree (hs->message);
3580
      retval = RETR_WITH_METALINK;
3581
      CLOSE_FINISH (sock);
3582
      goto cleanup;
3583
    }
3584
#endif
3585
3586
0
  if (statcode == HTTP_STATUS_UNAUTHORIZED)
3587
0
    {
3588
      /* Authorization is required.  */
3589
0
      uerr_t auth_err = RETROK;
3590
0
      bool retry;
3591
      /* Normally we are not interested in the response body.
3592
         But if we are writing a WARC file we are: we like to keep everything.  */
3593
0
      if (warc_enabled)
3594
0
        {
3595
0
          int _err;
3596
0
          type = resp_header_strdup (resp, "Content-Type");
3597
0
          _err = read_response_body (hs, sock, NULL, contlen, 0,
3598
0
                                    chunked_transfer_encoding,
3599
0
                                    u->url, warc_timestamp_str,
3600
0
                                    warc_request_uuid, warc_ip, type,
3601
0
                                    statcode, head);
3602
0
          xfree (type);
3603
3604
0
          if (_err != RETRFINISHED || hs->res < 0)
3605
0
            {
3606
0
              CLOSE_INVALIDATE (sock);
3607
0
              retval = _err;
3608
0
              goto cleanup;
3609
0
            }
3610
0
          else
3611
0
            CLOSE_FINISH (sock);
3612
0
        }
3613
0
      else
3614
0
        {
3615
          /* Since WARC is disabled, we are not interested in the response body.  */
3616
0
          if (keep_alive && !head_only
3617
0
              && skip_short_body (sock, contlen, chunked_transfer_encoding))
3618
0
            CLOSE_FINISH (sock);
3619
0
          else
3620
0
            CLOSE_INVALIDATE (sock);
3621
0
        }
3622
3623
0
      pconn.authorized = false;
3624
3625
0
      {
3626
0
        auth_err = check_auth (u, user, passwd, resp, req,
3627
0
                               &ntlm_seen, &retry,
3628
0
                               &basic_auth_finished,
3629
0
                               &auth_finished);
3630
0
        if (auth_err == RETROK && retry)
3631
0
          {
3632
0
            resp_free (&resp);
3633
0
            xfree (message);
3634
0
            xfree (head);
3635
0
            goto retry_with_auth;
3636
0
          }
3637
0
      }
3638
0
      if (auth_err == RETROK)
3639
0
        retval = AUTHFAILED;
3640
0
      else
3641
0
        retval = auth_err;
3642
0
      goto cleanup;
3643
0
    }
3644
0
  else /* statcode != HTTP_STATUS_UNAUTHORIZED */
3645
0
    {
3646
      /* Kludge: if NTLM is used, mark the TCP connection as authorized. */
3647
0
      if (ntlm_seen)
3648
0
        pconn.authorized = true;
3649
0
    }
3650
3651
0
  {
3652
0
    uerr_t ret = check_file_output (u, hs, resp, hdrval, sizeof hdrval);
3653
0
    if (ret != RETROK)
3654
0
      {
3655
0
        retval = ret;
3656
0
        goto cleanup;
3657
0
      }
3658
0
  }
3659
3660
0
  hs->statcode = statcode;
3661
0
  xfree (hs->error);
3662
0
  if (statcode == -1)
3663
0
    hs->error = xstrdup (_("Malformed status line"));
3664
0
  else if (!message || !*message)
3665
0
    hs->error = xstrdup (_("(no description)"));
3666
0
  else
3667
0
    hs->error = xstrdup (message);
3668
3669
0
#ifdef HAVE_HSTS
3670
0
  if (opt.hsts && hsts_store)
3671
0
    {
3672
0
      int64_t max_age;
3673
0
      const char *hsts_params = resp_header_strdup (resp, "Strict-Transport-Security");
3674
0
      bool include_subdomains;
3675
3676
0
      if (parse_strict_transport_security (hsts_params, &max_age, &include_subdomains))
3677
0
        {
3678
          /* process strict transport security */
3679
0
          if (hsts_store_entry (hsts_store, u->scheme, u->host, u->port, max_age, include_subdomains))
3680
0
            DEBUGP(("Added new HSTS host: %s:%" PRIu32 " (max-age: %" PRId64 ", includeSubdomains: %s)\n",
3681
0
                   u->host,
3682
0
                   (uint32_t) u->port,
3683
0
                   max_age,
3684
0
                   (include_subdomains ? "true" : "false")));
3685
0
          else
3686
0
            DEBUGP(("Updated HSTS host: %s:%" PRIu32 " (max-age: %" PRId64 ", includeSubdomains: %s)\n",
3687
0
                   u->host,
3688
0
                   (uint32_t) u->port,
3689
0
                   max_age,
3690
0
                   (include_subdomains ? "true" : "false")));
3691
0
        }
3692
0
      xfree (hsts_params);
3693
0
    }
3694
0
#endif
3695
3696
0
  type = resp_header_strdup (resp, "Content-Type");
3697
0
  if (type)
3698
0
    {
3699
0
      char *tmp = strchr (type, ';');
3700
0
      if (tmp)
3701
0
        {
3702
0
#ifdef ENABLE_IRI
3703
          /* sXXXav: only needed if IRI support is enabled */
3704
0
          char *tmp2 = tmp + 1;
3705
0
#endif
3706
3707
0
          while (tmp > type && c_isspace (tmp[-1]))
3708
0
            --tmp;
3709
0
          *tmp = '\0';
3710
3711
0
#ifdef ENABLE_IRI
3712
          /* Try to get remote encoding if needed */
3713
0
          if (opt.enable_iri && !opt.encoding_remote)
3714
0
            {
3715
0
              tmp = parse_charset (tmp2);
3716
0
              if (tmp)
3717
0
                set_content_encoding (iri, tmp);
3718
0
              xfree (tmp);
3719
0
            }
3720
0
#endif
3721
0
        }
3722
0
    }
3723
0
  xfree (hs->newloc);
3724
0
  hs->newloc = resp_header_strdup (resp, "Location");
3725
0
  xfree (hs->remote_time);
3726
0
  hs->remote_time = resp_header_strdup (resp, "Last-Modified");
3727
0
  if (!hs->remote_time) // now look for the Wayback Machine's timestamp
3728
0
    hs->remote_time = resp_header_strdup (resp, "X-Archive-Orig-last-modified");
3729
3730
0
  if (resp_header_copy (resp, "Content-Range", hdrval, sizeof (hdrval)))
3731
0
    {
3732
0
      wgint first_byte_pos, last_byte_pos, entity_length;
3733
0
      if (parse_content_range (hdrval, &first_byte_pos, &last_byte_pos,
3734
0
                               &entity_length))
3735
0
        {
3736
0
          contrange = first_byte_pos;
3737
0
          contlen = last_byte_pos - first_byte_pos + 1;
3738
0
        }
3739
0
    }
3740
3741
0
  if (resp_header_copy (resp, "Content-Encoding", hdrval, sizeof (hdrval)))
3742
0
    {
3743
0
      hs->local_encoding = ENC_INVALID;
3744
3745
0
      switch (hdrval[0])
3746
0
        {
3747
0
        case 'b': case 'B':
3748
0
          if (0 == c_strcasecmp(hdrval, "br"))
3749
0
            hs->local_encoding = ENC_BROTLI;
3750
0
          break;
3751
0
        case 'c': case 'C':
3752
0
          if (0 == c_strcasecmp(hdrval, "compress"))
3753
0
            hs->local_encoding = ENC_COMPRESS;
3754
0
          break;
3755
0
        case 'd': case 'D':
3756
0
          if (0 == c_strcasecmp(hdrval, "deflate"))
3757
0
            hs->local_encoding = ENC_DEFLATE;
3758
0
          break;
3759
0
        case 'g': case 'G':
3760
0
          if (0 == c_strcasecmp(hdrval, "gzip"))
3761
0
            hs->local_encoding = ENC_GZIP;
3762
0
          break;
3763
0
        case 'i': case 'I':
3764
0
          if (0 == c_strcasecmp(hdrval, "identity"))
3765
0
            hs->local_encoding = ENC_NONE;
3766
0
          break;
3767
0
        case 'x': case 'X':
3768
0
          if (0 == c_strcasecmp(hdrval, "x-compress"))
3769
0
            hs->local_encoding = ENC_COMPRESS;
3770
0
          else if (0 == c_strcasecmp(hdrval, "x-gzip"))
3771
0
            hs->local_encoding = ENC_GZIP;
3772
0
          break;
3773
0
        case '\0':
3774
0
          hs->local_encoding = ENC_NONE;
3775
0
        }
3776
3777
0
      if (hs->local_encoding == ENC_INVALID)
3778
0
        {
3779
0
          DEBUGP (("Unrecognized Content-Encoding: %s\n", hdrval));
3780
0
          hs->local_encoding = ENC_NONE;
3781
0
        }
3782
0
#ifdef HAVE_LIBZ
3783
0
      else if (hs->local_encoding == ENC_GZIP
3784
0
               && opt.compression != compression_none)
3785
0
        {
3786
0
          const char *p;
3787
3788
          /* Make sure the Content-Type is not gzip before decompressing */
3789
0
          if (type)
3790
0
            {
3791
0
              p = strchr (type, '/');
3792
0
              if (p == NULL)
3793
0
                {
3794
0
                  hs->remote_encoding = ENC_GZIP;
3795
0
                  hs->local_encoding = ENC_NONE;
3796
0
                }
3797
0
              else
3798
0
                {
3799
0
                  p++;
3800
0
                  if (c_tolower(p[0]) == 'x' && p[1] == '-')
3801
0
                    p += 2;
3802
0
                  if (0 != c_strcasecmp (p, "gzip"))
3803
0
                    {
3804
0
                      hs->remote_encoding = ENC_GZIP;
3805
0
                      hs->local_encoding = ENC_NONE;
3806
0
                    }
3807
0
                }
3808
0
            }
3809
0
          else
3810
0
            {
3811
0
               hs->remote_encoding = ENC_GZIP;
3812
0
               hs->local_encoding = ENC_NONE;
3813
0
            }
3814
3815
          /* don't uncompress if a file ends with '.gz' or '.tgz' */
3816
0
          if (hs->remote_encoding == ENC_GZIP
3817
0
              && (p = strrchr(u->file, '.'))
3818
0
              && (c_strcasecmp(p, ".gz") == 0 || c_strcasecmp(p, ".tgz") == 0))
3819
0
            {
3820
0
               DEBUGP (("Enabling broken server workaround. Will not decompress this GZip file.\n"));
3821
0
               hs->remote_encoding = ENC_NONE;
3822
0
            }
3823
0
        }
3824
0
#endif
3825
0
    }
3826
3827
  /* 20x responses are counted among successful by default.  */
3828
0
  if (H_20X (statcode))
3829
0
    *dt |= RETROKF;
3830
3831
0
  if (statcode == HTTP_STATUS_NO_CONTENT)
3832
0
    {
3833
      /* 204 response has no body (RFC 2616, 4.3) */
3834
3835
      /* In case the caller cares to look...  */
3836
0
      hs->len = 0;
3837
0
      hs->res = 0;
3838
0
      hs->restval = 0;
3839
3840
0
      CLOSE_FINISH (sock);
3841
3842
0
      retval = RETRFINISHED;
3843
0
      goto cleanup;
3844
0
    }
3845
3846
  /* Return if redirected.  */
3847
0
  if (H_REDIRECTED (statcode) || statcode == HTTP_STATUS_MULTIPLE_CHOICES)
3848
0
    {
3849
      /* RFC2068 says that in case of the 300 (multiple choices)
3850
         response, the server can output a preferred URL through
3851
         `Location' header; otherwise, the request should be treated
3852
         like GET.  So, if the location is set, it will be a
3853
         redirection; otherwise, just proceed normally.  */
3854
0
      if (statcode == HTTP_STATUS_MULTIPLE_CHOICES && !hs->newloc)
3855
0
        *dt |= RETROKF;
3856
0
      else
3857
0
        {
3858
0
          logprintf (LOG_VERBOSE,
3859
0
                     _("Location: %s%s\n"),
3860
0
                     hs->newloc ? escnonprint_uri (hs->newloc) : _("unspecified"),
3861
0
                     hs->newloc ? _(" [following]") : "");
3862
3863
          /* In case the caller cares to look...  */
3864
0
          hs->len = 0;
3865
0
          hs->res = 0;
3866
0
          hs->restval = 0;
3867
3868
          /* Normally we are not interested in the response body of a redirect.
3869
             But if we are writing a WARC file we are: we like to keep everything.  */
3870
0
          if (warc_enabled)
3871
0
            {
3872
0
              int _err = read_response_body (hs, sock, NULL, contlen, 0,
3873
0
                                            chunked_transfer_encoding,
3874
0
                                            u->url, warc_timestamp_str,
3875
0
                                            warc_request_uuid, warc_ip, type,
3876
0
                                            statcode, head);
3877
3878
0
              if (_err != RETRFINISHED || hs->res < 0)
3879
0
                {
3880
0
                  CLOSE_INVALIDATE (sock);
3881
0
                  retval = _err;
3882
0
                  goto cleanup;
3883
0
                }
3884
0
              else
3885
0
                CLOSE_FINISH (sock);
3886
0
            }
3887
0
          else
3888
0
            {
3889
              /* Since WARC is disabled, we are not interested in the response body.  */
3890
0
              if (keep_alive && !head_only
3891
0
                  && skip_short_body (sock, contlen, chunked_transfer_encoding))
3892
0
                CLOSE_FINISH (sock);
3893
0
              else
3894
0
                CLOSE_INVALIDATE (sock);
3895
0
            }
3896
3897
          /* From RFC2616: The status codes 303 and 307 have
3898
             been added for servers that wish to make unambiguously
3899
             clear which kind of reaction is expected of the client.
3900
3901
             A 307 should be redirected using the same method,
3902
             in other words, a POST should be preserved and not
3903
             converted to a GET in that case.
3904
3905
             With strict adherence to RFC2616, POST requests are not
3906
             converted to a GET request on 301 Permanent Redirect
3907
             or 302 Temporary Redirect.
3908
3909
             A switch may be provided later based on the HTTPbis draft
3910
             that allows clients to convert POST requests to GET
3911
             requests on 301 and 302 response codes. */
3912
0
          switch (statcode)
3913
0
            {
3914
0
            case HTTP_STATUS_TEMPORARY_REDIRECT:
3915
0
            case HTTP_STATUS_PERMANENT_REDIRECT:
3916
0
              retval = NEWLOCATION_KEEP_POST;
3917
0
              goto cleanup;
3918
0
            case HTTP_STATUS_MOVED_PERMANENTLY:
3919
0
              if (opt.method && c_strcasecmp (opt.method, "post") != 0)
3920
0
                {
3921
0
                  retval = NEWLOCATION_KEEP_POST;
3922
0
                  goto cleanup;
3923
0
                }
3924
0
              break;
3925
0
            case HTTP_STATUS_MOVED_TEMPORARILY:
3926
0
              if (opt.method && c_strcasecmp (opt.method, "post") != 0)
3927
0
                {
3928
0
                  retval = NEWLOCATION_KEEP_POST;
3929
0
                  goto cleanup;
3930
0
                }
3931
0
              break;
3932
0
            }
3933
0
          retval = NEWLOCATION;
3934
0
          goto cleanup;
3935
0
        }
3936
0
    }
3937
3938
0
  if (cond_get)
3939
0
    {
3940
0
      if (statcode == HTTP_STATUS_NOT_MODIFIED)
3941
0
        {
3942
0
          logprintf (LOG_VERBOSE,
3943
0
                     _ ("File %s not modified on server. Omitting download.\n\n"),
3944
0
                     quote (hs->local_file));
3945
0
          *dt |= RETROKF;
3946
0
          CLOSE_FINISH (sock);
3947
0
          retval = RETRUNNEEDED;
3948
0
          goto cleanup;
3949
0
        }
3950
0
    }
3951
3952
0
  set_content_type (dt, type);
3953
3954
0
  if (opt.adjust_extension)
3955
0
    {
3956
0
      const char *encoding_ext = NULL;
3957
0
      switch (hs->local_encoding)
3958
0
        {
3959
0
        case ENC_INVALID:
3960
0
        case ENC_NONE:
3961
0
          break;
3962
0
        case ENC_BROTLI:
3963
0
          encoding_ext = ".br";
3964
0
          break;
3965
0
        case ENC_COMPRESS:
3966
0
          encoding_ext = ".Z";
3967
0
          break;
3968
0
        case ENC_DEFLATE:
3969
0
          encoding_ext = ".zlib";
3970
0
          break;
3971
0
        case ENC_GZIP:
3972
0
          encoding_ext = ".gz";
3973
0
          break;
3974
0
        default:
3975
0
          DEBUGP (("No extension found for encoding %d\n",
3976
0
                   hs->local_encoding));
3977
0
      }
3978
0
      if (encoding_ext != NULL)
3979
0
        {
3980
0
          char *file_ext = strrchr (hs->local_file, '.');
3981
          /* strip Content-Encoding extension (it will be re-added later) */
3982
0
          if (file_ext != NULL && 0 == strcasecmp (file_ext, encoding_ext))
3983
0
            *file_ext = '\0';
3984
0
        }
3985
0
      if (*dt & TEXTHTML)
3986
        /* -E / --adjust-extension / adjust_extension = on was specified,
3987
           and this is a text/html file.  If some case-insensitive
3988
           variation on ".htm[l]" isn't already the file's suffix,
3989
           tack on ".html". */
3990
0
        {
3991
0
          ensure_extension (hs, ".html", dt);
3992
0
        }
3993
0
      else if (*dt & TEXTCSS)
3994
0
        {
3995
0
          ensure_extension (hs, ".css", dt);
3996
0
        }
3997
0
      if (encoding_ext != NULL)
3998
0
        {
3999
0
          ensure_extension (hs, encoding_ext, dt);
4000
0
        }
4001
0
    }
4002
4003
0
  if (cond_get)
4004
0
    {
4005
      /* Handle the case when server ignores If-Modified-Since header.  */
4006
0
      if (statcode == HTTP_STATUS_OK && hs->remote_time)
4007
0
        {
4008
0
          time_t tmr = http_atotm (hs->remote_time);
4009
4010
          /* Check if the local file is up-to-date based on Last-Modified header
4011
             and content length.  */
4012
0
          if (tmr != (time_t) - 1 && tmr <= hs->orig_file_tstamp
4013
0
              && (contlen == -1 || contlen == hs->orig_file_size))
4014
0
            {
4015
0
              logprintf (LOG_VERBOSE,
4016
0
                         _("Server ignored If-Modified-Since header for file %s.\n"
4017
0
                           "You might want to add --no-if-modified-since option."
4018
0
                           "\n\n"),
4019
0
                         quote (hs->local_file));
4020
0
              *dt |= RETROKF;
4021
0
              CLOSE_INVALIDATE (sock);
4022
0
              retval = RETRUNNEEDED;
4023
0
              goto cleanup;
4024
0
            }
4025
0
        }
4026
0
    }
4027
4028
0
  if (statcode == HTTP_STATUS_RANGE_NOT_SATISFIABLE
4029
0
      || (!opt.timestamping && hs->restval > 0 && statcode == HTTP_STATUS_OK
4030
0
          && contrange == 0 && contlen >= 0 && hs->restval >= contlen))
4031
0
    {
4032
      /* If `-c' is in use and the file has been fully downloaded (or
4033
         the remote file has shrunk), Wget effectively requests bytes
4034
         after the end of file and the server response with 416
4035
         (or 200 with a <= Content-Length.  */
4036
0
      logputs (LOG_VERBOSE, _("\
4037
0
\n    The file is already fully retrieved; nothing to do.\n\n"));
4038
      /* In case the caller inspects. */
4039
0
      hs->len = contlen;
4040
0
      hs->res = 0;
4041
      /* Mark as successfully retrieved. */
4042
0
      *dt |= RETROKF;
4043
4044
      /* Try to maintain the keep-alive connection. It is often cheaper to
4045
       * consume some bytes which have already been sent than to negotiate
4046
       * a new connection. However, if the body is too large, or we don't
4047
       * care about keep-alive, then simply terminate the connection */
4048
0
      if (keep_alive &&
4049
0
          skip_short_body (sock, contlen, chunked_transfer_encoding))
4050
0
        CLOSE_FINISH (sock);
4051
0
      else
4052
0
        CLOSE_INVALIDATE (sock);
4053
0
      retval = RETRUNNEEDED;
4054
0
      goto cleanup;
4055
0
    }
4056
0
  if ((contrange != 0 && contrange != hs->restval)
4057
0
      || (H_PARTIAL (statcode) && !contrange && hs->restval))
4058
0
    {
4059
      /* The Range request was somehow misunderstood by the server.
4060
         Bail out.  */
4061
0
      CLOSE_INVALIDATE (sock);
4062
0
      retval = RANGEERR;
4063
0
      goto cleanup;
4064
0
    }
4065
0
  if (contlen == -1)
4066
0
    hs->contlen = -1;
4067
  /* If the response is gzipped, the uncompressed size is unknown. */
4068
0
  else if (hs->remote_encoding == ENC_GZIP)
4069
0
    hs->contlen = -1;
4070
0
  else
4071
0
    hs->contlen = contlen + contrange;
4072
4073
0
  if (opt.verbose)
4074
0
    {
4075
0
      if (*dt & RETROKF)
4076
0
        {
4077
          /* No need to print this output if the body won't be
4078
             downloaded at all, or if the original server response is
4079
             printed.  */
4080
0
          logputs (LOG_VERBOSE, _("Length: "));
4081
0
          if (contlen != -1)
4082
0
            {
4083
0
              logputs (LOG_VERBOSE, number_to_static_string (contlen + contrange));
4084
0
              if (contlen + contrange >= 1024)
4085
0
                logprintf (LOG_VERBOSE, " (%s)",
4086
0
                           human_readable (contlen + contrange, 10, 1));
4087
0
              if (contrange)
4088
0
                {
4089
0
                  if (contlen >= 1024)
4090
0
                    logprintf (LOG_VERBOSE, _(", %s (%s) remaining"),
4091
0
                               number_to_static_string (contlen),
4092
0
                               human_readable (contlen, 10, 1));
4093
0
                  else
4094
0
                    logprintf (LOG_VERBOSE, _(", %s remaining"),
4095
0
                               number_to_static_string (contlen));
4096
0
                }
4097
0
            }
4098
0
          else
4099
0
            logputs (LOG_VERBOSE,
4100
0
                     opt.ignore_length ? _("ignored") : _("unspecified"));
4101
0
          if (type)
4102
0
            logprintf (LOG_VERBOSE, " [%s]\n", quotearg_style (escape_quoting_style, type));
4103
0
          else
4104
0
            logputs (LOG_VERBOSE, "\n");
4105
0
        }
4106
0
    }
4107
4108
  /* Return if we have no intention of further downloading.  */
4109
0
  if ((!(*dt & RETROKF) && !opt.content_on_error) || head_only || (opt.spider && !opt.recursive))
4110
0
    {
4111
      /* In case the caller cares to look...  */
4112
0
      hs->len = 0;
4113
0
      hs->res = 0;
4114
0
      hs->restval = 0;
4115
4116
      /* Normally we are not interested in the response body of a error responses.
4117
         But if we are writing a WARC file we are: we like to keep everything.  */
4118
0
      if (warc_enabled)
4119
0
        {
4120
0
          int _err = read_response_body (hs, sock, NULL, contlen, 0,
4121
0
                                        chunked_transfer_encoding,
4122
0
                                        u->url, warc_timestamp_str,
4123
0
                                        warc_request_uuid, warc_ip, type,
4124
0
                                        statcode, head);
4125
4126
0
          if (_err != RETRFINISHED || hs->res < 0)
4127
0
            {
4128
0
              CLOSE_INVALIDATE (sock);
4129
0
              retval = _err;
4130
0
              goto cleanup;
4131
0
            }
4132
4133
0
          CLOSE_FINISH (sock);
4134
0
        }
4135
0
      else
4136
0
        {
4137
          /* Since WARC is disabled, we are not interested in the response body.  */
4138
0
          if (head_only)
4139
            /* Pre-1.10 Wget used CLOSE_INVALIDATE here.  Now we trust the
4140
               servers not to send body in response to a HEAD request, and
4141
               those that do will likely be caught by test_socket_open.
4142
               If not, they can be worked around using
4143
               `--no-http-keep-alive'.  */
4144
0
            CLOSE_FINISH (sock);
4145
0
          else if (opt.spider && !opt.recursive)
4146
            /* we just want to see if the page exists - no downloading required */
4147
0
            CLOSE_INVALIDATE (sock);
4148
0
          else if (keep_alive
4149
0
                   && skip_short_body (sock, contlen, chunked_transfer_encoding))
4150
            /* Successfully skipped the body; also keep using the socket. */
4151
0
            CLOSE_FINISH (sock);
4152
0
          else
4153
0
            CLOSE_INVALIDATE (sock);
4154
0
        }
4155
4156
0
      if (statcode == HTTP_STATUS_GATEWAY_TIMEOUT)
4157
0
        retval = GATEWAYTIMEOUT;
4158
0
      else
4159
0
        retval = RETRFINISHED;
4160
4161
0
      goto cleanup;
4162
0
    }
4163
4164
0
  err = open_output_stream (hs, count, &fp);
4165
0
  if (err != RETROK)
4166
0
    {
4167
      /* Make sure that errno doesn't get clobbered.
4168
       * This is the case for OpenSSL's SSL_shutdown(). */
4169
0
      int tmp_errno = errno;
4170
0
      CLOSE_INVALIDATE (sock);
4171
0
      errno = tmp_errno;
4172
0
      retval = err;
4173
0
      goto cleanup;
4174
0
    }
4175
4176
0
#ifdef ENABLE_XATTR
4177
0
  if (opt.enable_xattr)
4178
0
    {
4179
0
      if (original_url != u)
4180
0
        set_file_metadata (u, original_url, fp);
4181
0
      else
4182
0
        set_file_metadata (u, NULL, fp);
4183
0
    }
4184
0
#endif
4185
4186
0
  err = read_response_body (hs, sock, fp, contlen, contrange,
4187
0
                            chunked_transfer_encoding,
4188
0
                            u->url, warc_timestamp_str,
4189
0
                            warc_request_uuid, warc_ip, type,
4190
0
                            statcode, head);
4191
4192
0
  if (hs->res >= 0)
4193
0
    CLOSE_FINISH (sock);
4194
0
  else
4195
0
    CLOSE_INVALIDATE (sock);
4196
4197
0
  if (!output_stream)
4198
0
    fclose (fp);
4199
4200
0
  retval = err;
4201
4202
0
  cleanup:
4203
0
  xfree (head);
4204
0
  xfree (type);
4205
0
  xfree (message);
4206
0
  resp_free (&resp);
4207
0
  request_free (&req);
4208
4209
0
  return retval;
4210
0
}
4211
4212
/* Check whether the supplied HTTP status code is among those
4213
   listed for the --retry-on-http-error option. */
4214
static bool
4215
check_retry_on_http_error (const int statcode)
4216
0
{
4217
0
  const char *tok = opt.retry_on_http_error;
4218
0
  while (tok && *tok)
4219
0
    {
4220
0
      if (atoi (tok) == statcode)
4221
0
        return true;
4222
0
      if ((tok = strchr (tok, ',')))
4223
0
        ++tok;
4224
0
    }
4225
0
  return false;
4226
0
}
4227
4228
/* The genuine HTTP loop!  This is the part where the retrieval is
4229
   retried, and retried, and retried, and...  */
4230
uerr_t
4231
http_loop (const struct url *u, struct url *original_url, char **newloc,
4232
           char **local_file, const char *referer, int *dt, struct url *proxy,
4233
           struct iri *iri)
4234
0
{
4235
0
  int count;
4236
0
  bool got_head = false;         /* used for time-stamping and filename detection */
4237
0
  bool time_came_from_head = false;
4238
0
  bool got_name = false;
4239
0
  char *tms;
4240
0
  const char *tmrate;
4241
0
  uerr_t err, ret = TRYLIMEXC;
4242
0
  time_t tmr = -1;               /* remote time-stamp */
4243
0
  struct http_stat hstat;        /* HTTP status */
4244
0
  struct stat st;
4245
0
  bool send_head_first = true;
4246
0
  bool force_full_retrieve = false;
4247
4248
4249
  /* If we are writing to a WARC file: always retrieve the whole file. */
4250
0
  if (opt.warc_filename != NULL)
4251
0
    force_full_retrieve = true;
4252
4253
4254
  /* Assert that no value for *LOCAL_FILE was passed. */
4255
0
  assert (local_file == NULL || *local_file == NULL);
4256
4257
  /* Set LOCAL_FILE parameter. */
4258
0
  if (local_file && opt.output_document)
4259
0
    *local_file = HYPHENP (opt.output_document) ? NULL : xstrdup (opt.output_document);
4260
4261
  /* Reset NEWLOC parameter. */
4262
0
  *newloc = NULL;
4263
4264
  /* This used to be done in main, but it's a better idea to do it
4265
     here so that we don't go through the hoops if we're just using
4266
     FTP or whatever. */
4267
0
  if (opt.cookies)
4268
0
    load_cookies ();
4269
4270
  /* Warn on (likely bogus) wildcard usage in HTTP. */
4271
0
  if (opt.ftp_glob && has_wildcards_p (u->path))
4272
0
    logputs (LOG_VERBOSE, _("Warning: wildcards not supported in HTTP.\n"));
4273
4274
  /* Setup hstat struct. */
4275
0
  xzero (hstat);
4276
0
  hstat.referer = referer;
4277
4278
0
  if (opt.output_document)
4279
0
    {
4280
0
      hstat.local_file = xstrdup (opt.output_document);
4281
0
      got_name = true;
4282
0
    }
4283
0
  else if (!opt.content_disposition)
4284
0
    {
4285
0
      hstat.local_file =
4286
0
        url_file_name (opt.trustservernames ? u : original_url, NULL);
4287
0
      got_name = true;
4288
0
    }
4289
4290
0
  if (got_name && file_exists_p (hstat.local_file, NULL) && opt.noclobber && !opt.output_document)
4291
0
    {
4292
      /* If opt.noclobber is turned on and file already exists, do not
4293
         retrieve the file. But if the output_document was given, then this
4294
         test was already done and the file didn't exist. Hence the !opt.output_document */
4295
0
      get_file_flags (hstat.local_file, dt);
4296
0
      ret = RETROK;
4297
0
      goto exit;
4298
0
    }
4299
4300
  /* Reset the counter. */
4301
0
  count = 0;
4302
4303
  /* Reset the document type. */
4304
0
  *dt = 0;
4305
4306
  /* Skip preliminary HEAD request if we're not in spider mode.  */
4307
0
  if (!opt.spider)
4308
0
    send_head_first = false;
4309
4310
  /* Send preliminary HEAD request if --content-disposition and -c are used
4311
     together.  */
4312
0
  if (opt.content_disposition && opt.always_rest)
4313
0
    send_head_first = true;
4314
4315
#ifdef HAVE_METALINK
4316
  if (opt.metalink_over_http)
4317
    {
4318
      *dt |= METALINK_METADATA;
4319
      send_head_first = true;
4320
    }
4321
#endif
4322
4323
0
  if (opt.timestamping)
4324
0
    {
4325
      /* Use conditional get request if requested
4326
       * and if timestamp is known at this moment.  */
4327
0
      if (opt.if_modified_since && !send_head_first && got_name && file_exists_p (hstat.local_file, NULL))
4328
0
        {
4329
0
          *dt |= IF_MODIFIED_SINCE;
4330
0
          {
4331
0
            uerr_t timestamp_err = set_file_timestamp (&hstat);
4332
0
            if (timestamp_err != RETROK)
4333
0
              return timestamp_err;
4334
0
          }
4335
0
        }
4336
        /* Send preliminary HEAD request if -N is given and we have existing
4337
         * destination file or content disposition is enabled.  */
4338
0
      else if (opt.content_disposition || file_exists_p (hstat.local_file, NULL))
4339
0
        send_head_first = true;
4340
0
    }
4341
4342
  /* THE loop */
4343
0
  do
4344
0
    {
4345
      /* Increment the pass counter.  */
4346
0
      ++count;
4347
0
      sleep_between_retrievals (count);
4348
4349
      /* Get the current time string.  */
4350
0
      tms = datetime_str (time (NULL));
4351
4352
0
      if (opt.spider && !got_head)
4353
0
        logprintf (LOG_VERBOSE,
4354
0
        _("Spider mode enabled. Check if remote file exists.\n"));
4355
4356
      /* Print fetch message, if opt.verbose.  */
4357
0
      if (opt.verbose)
4358
0
        {
4359
0
          char *hurl = url_string (u, URL_AUTH_HIDE_PASSWD);
4360
4361
0
          if (count > 1)
4362
0
            {
4363
0
              char tmp[256];
4364
0
              sprintf (tmp, _("(try:%2d)"), count);
4365
0
              logprintf (LOG_NOTQUIET, "--%s--  %s  %s\n",
4366
0
                         tms, tmp, hurl);
4367
0
            }
4368
0
          else
4369
0
            {
4370
0
              logprintf (LOG_NOTQUIET, "--%s--  %s\n",
4371
0
                         tms, hurl);
4372
0
            }
4373
4374
#ifdef WINDOWS
4375
          ws_changetitle (hurl);
4376
#endif
4377
0
          xfree (hurl);
4378
0
        }
4379
4380
      /* Default document type is empty.  However, if spider mode is
4381
         on or time-stamping is employed, HEAD_ONLY commands is
4382
         encoded within *dt.  */
4383
0
      if (send_head_first && !got_head)
4384
0
        *dt |= HEAD_ONLY;
4385
0
      else
4386
0
        *dt &= ~HEAD_ONLY;
4387
4388
      /* Decide whether or not to restart.  */
4389
0
      if (force_full_retrieve)
4390
0
        hstat.restval = hstat.len;
4391
0
      else if (opt.start_pos >= 0)
4392
0
        hstat.restval = opt.start_pos;
4393
0
      else if (opt.always_rest
4394
0
          && got_name
4395
0
          && stat (hstat.local_file, &st) == 0
4396
0
          && S_ISREG (st.st_mode))
4397
        /* When -c is used, continue from on-disk size.  (Can't use
4398
           hstat.len even if count>1 because we don't want a failed
4399
           first attempt to clobber existing data.)  */
4400
0
        hstat.restval = st.st_size;
4401
0
      else if (count > 1)
4402
0
        {
4403
          /* otherwise, continue where the previous try left off */
4404
0
          if (hstat.len < hstat.restval)
4405
0
            hstat.restval -= hstat.len;
4406
0
          else
4407
0
            hstat.restval = hstat.len;
4408
0
        }
4409
0
      else
4410
0
        hstat.restval = 0;
4411
4412
      /* Decide whether to send the no-cache directive.  We send it in
4413
         two cases:
4414
           a) we're using a proxy, and we're past our first retrieval.
4415
              Some proxies are notorious for caching incomplete data, so
4416
              we require a fresh get.
4417
           b) caching is explicitly inhibited. */
4418
0
      if ((proxy && count > 1)        /* a */
4419
0
          || !opt.allow_cache)        /* b */
4420
0
        *dt |= SEND_NOCACHE;
4421
0
      else
4422
0
        *dt &= ~SEND_NOCACHE;
4423
4424
      /* Try fetching the document, or at least its head.  */
4425
0
      err = gethttp (u, original_url, &hstat, dt, proxy, iri, count);
4426
4427
      /* Time?  */
4428
0
      tms = datetime_str (time (NULL));
4429
4430
      /* Get the new location (with or without the redirection).  */
4431
0
      if (hstat.newloc)
4432
0
        *newloc = xstrdup (hstat.newloc);
4433
4434
0
      switch (err)
4435
0
        {
4436
0
        case HERR: case HEOF: case CONSOCKERR:
4437
0
        case CONERROR: case READERR: case WRITEFAILED:
4438
0
        case RANGEERR: case FOPEN_EXCL_ERR: case GATEWAYTIMEOUT:
4439
          /* Non-fatal errors continue executing the loop, which will
4440
             bring them to "while" statement at the end, to judge
4441
             whether the number of tries was exceeded.  */
4442
0
          printwhat (count, opt.ntry);
4443
0
          continue;
4444
0
        case FWRITEERR: case FOPENERR:
4445
          /* Another fatal error.  */
4446
0
          logputs (LOG_VERBOSE, "\n");
4447
0
          logprintf (LOG_NOTQUIET, _("Cannot write to %s (%s).\n"),
4448
0
                     quote (hstat.local_file), strerror (errno));
4449
0
          ret = err;
4450
0
          goto exit;
4451
0
        case HOSTERR:
4452
          /* Fatal unless option set otherwise. */
4453
0
          if ( opt.retry_on_host_error )
4454
0
            {
4455
0
              printwhat (count, opt.ntry);
4456
0
              continue;
4457
0
            }
4458
0
          ret = err;
4459
0
          goto exit;
4460
0
        case CONIMPOSSIBLE: case PROXERR: case SSLINITFAILED:
4461
0
        case CONTNOTSUPPORTED: case VERIFCERTERR: case FILEBADFILE:
4462
0
        case UNKNOWNATTR:
4463
          /* Fatal errors just return from the function.  */
4464
0
          ret = err;
4465
0
          goto exit;
4466
0
        case ATTRMISSING:
4467
          /* A missing attribute in a Header is a fatal Protocol error. */
4468
0
          logputs (LOG_VERBOSE, "\n");
4469
0
          logprintf (LOG_NOTQUIET, _("Required attribute missing from Header received.\n"));
4470
0
          ret = err;
4471
0
          goto exit;
4472
0
        case AUTHFAILED:
4473
0
          logputs (LOG_VERBOSE, "\n");
4474
0
          logprintf (LOG_NOTQUIET, _("Username/Password Authentication Failed.\n"));
4475
0
          ret = err;
4476
0
          goto exit;
4477
0
        case WARC_ERR:
4478
          /* A fatal WARC error. */
4479
0
          logputs (LOG_VERBOSE, "\n");
4480
0
          logprintf (LOG_NOTQUIET, _("Cannot write to WARC file.\n"));
4481
0
          ret = err;
4482
0
          goto exit;
4483
0
        case WARC_TMP_FOPENERR: case WARC_TMP_FWRITEERR:
4484
          /* A fatal WARC error. */
4485
0
          logputs (LOG_VERBOSE, "\n");
4486
0
          logprintf (LOG_NOTQUIET, _("Cannot write to temporary WARC file.\n"));
4487
0
          ret = err;
4488
0
          goto exit;
4489
0
        case CONSSLERR:
4490
          /* Another fatal error.  */
4491
0
          logprintf (LOG_NOTQUIET, _("Unable to establish SSL connection.\n"));
4492
0
          ret = err;
4493
0
          goto exit;
4494
0
        case UNLINKERR:
4495
          /* Another fatal error.  */
4496
0
          logputs (LOG_VERBOSE, "\n");
4497
0
          logprintf (LOG_NOTQUIET, _("Cannot unlink %s (%s).\n"),
4498
0
                     quote (hstat.local_file), strerror (errno));
4499
0
          ret = err;
4500
0
          goto exit;
4501
0
        case NEWLOCATION:
4502
0
        case NEWLOCATION_KEEP_POST:
4503
          /* Return the new location to the caller.  */
4504
0
          if (!*newloc)
4505
0
            {
4506
0
              logprintf (LOG_NOTQUIET,
4507
0
                         _("ERROR: Redirection (%d) without location.\n"),
4508
0
                         hstat.statcode);
4509
0
              ret = WRONGCODE;
4510
0
            }
4511
0
          else
4512
0
            {
4513
0
              ret = err;
4514
0
            }
4515
0
          goto exit;
4516
0
        case RETRUNNEEDED:
4517
          /* The file was already fully retrieved. */
4518
0
          ret = RETROK;
4519
0
          goto exit;
4520
0
        case RETRFINISHED:
4521
          /* Deal with you later.  */
4522
0
          break;
4523
#ifdef HAVE_METALINK
4524
        case RETR_WITH_METALINK:
4525
          {
4526
            if (hstat.metalink == NULL)
4527
              {
4528
                logputs (LOG_NOTQUIET,
4529
                         _("Could not find Metalink data in HTTP response. "
4530
                           "Downloading file using HTTP GET.\n"));
4531
                *dt &= ~METALINK_METADATA;
4532
                *dt &= ~HEAD_ONLY;
4533
                got_head = true;
4534
                continue;
4535
              }
4536
4537
            logputs (LOG_VERBOSE,
4538
                     _("Metalink headers found. "
4539
                       "Switching to Metalink mode.\n"));
4540
4541
            ret = retrieve_from_metalink (hstat.metalink);
4542
            goto exit;
4543
          }
4544
          break;
4545
#endif
4546
0
        default:
4547
          /* All possibilities should have been exhausted.  */
4548
0
          abort ();
4549
0
        }
4550
4551
0
      if (!(*dt & RETROKF))
4552
0
        {
4553
0
          char *hurl = NULL;
4554
0
          if (!opt.verbose)
4555
0
            {
4556
              /* #### Ugly ugly ugly! */
4557
0
              hurl = url_string (u, URL_AUTH_HIDE_PASSWD);
4558
0
              logprintf (LOG_NONVERBOSE, "%s:\n", hurl);
4559
0
            }
4560
4561
          /* Fall back to GET if HEAD fails with a 500 or 501 error code. */
4562
0
          if (*dt & HEAD_ONLY
4563
0
              && (hstat.statcode == 500 || hstat.statcode == 501))
4564
0
            {
4565
0
              got_head = true;
4566
0
              xfree (hurl);
4567
0
              continue;
4568
0
            }
4569
          /* Maybe we should always keep track of broken links, not just in
4570
           * spider mode.
4571
           * Don't log error if it was UTF-8 encoded because we will try
4572
           * once unencoded. */
4573
0
          else if (opt.spider && !iri->utf8_encode)
4574
0
            {
4575
              /* #### Again: ugly ugly ugly! */
4576
0
              if (!hurl)
4577
0
                hurl = url_string (u, URL_AUTH_HIDE_PASSWD);
4578
0
              nonexisting_url (hurl);
4579
0
              logprintf (LOG_NOTQUIET, _("\
4580
0
Remote file does not exist -- broken link!!!\n"));
4581
0
            }
4582
0
          else if (check_retry_on_http_error (hstat.statcode))
4583
0
            {
4584
0
              printwhat (count, opt.ntry);
4585
0
              xfree (hurl);
4586
0
              continue;
4587
0
            }
4588
0
          else
4589
0
            {
4590
0
              logprintf (LOG_NOTQUIET, _("%s ERROR %d: %s.\n"),
4591
0
                         tms, hstat.statcode,
4592
0
                         quotearg_style (escape_quoting_style, hstat.error));
4593
0
            }
4594
0
          logputs (LOG_VERBOSE, "\n");
4595
0
          ret = WRONGCODE;
4596
0
          xfree (hurl);
4597
0
          goto exit;
4598
0
        }
4599
4600
      /* Did we get the time-stamp? */
4601
0
      if (!got_head || (opt.spider && !opt.recursive))
4602
0
        {
4603
0
          got_head = true;    /* no more time-stamping */
4604
4605
0
          if (opt.timestamping && !hstat.remote_time)
4606
0
            {
4607
0
              logputs (LOG_NOTQUIET, _("\
4608
0
Last-modified header missing -- time-stamps turned off.\n"));
4609
0
            }
4610
0
          else if (hstat.remote_time)
4611
0
            {
4612
              /* Convert the date-string into struct tm.  */
4613
0
              tmr = http_atotm (hstat.remote_time);
4614
0
              if (tmr == (time_t) (-1))
4615
0
                logputs (LOG_VERBOSE, _("\
4616
0
Last-modified header invalid -- time-stamp ignored.\n"));
4617
0
              if (*dt & HEAD_ONLY)
4618
0
                time_came_from_head = true;
4619
0
            }
4620
4621
0
          if (send_head_first)
4622
0
            {
4623
              /* The time-stamping section.  */
4624
0
              if (opt.timestamping)
4625
0
                {
4626
0
                  if (hstat.orig_file_name) /* Perform the following
4627
                                               checks only if the file
4628
                                               we're supposed to
4629
                                               download already exists.  */
4630
0
                    {
4631
0
                      if (hstat.remote_time &&
4632
0
                          tmr != (time_t) (-1))
4633
0
                        {
4634
                          /* Now time-stamping can be used validly.
4635
                             Time-stamping means that if the sizes of
4636
                             the local and remote file match, and local
4637
                             file is newer than the remote file, it will
4638
                             not be retrieved.  Otherwise, the normal
4639
                             download procedure is resumed.  */
4640
0
                          if (hstat.orig_file_tstamp >= tmr)
4641
0
                            {
4642
0
                              if (hstat.contlen == -1
4643
0
                                  || hstat.orig_file_size == hstat.contlen)
4644
0
                                {
4645
0
                                  logprintf (LOG_VERBOSE, _("\
4646
0
Server file no newer than local file %s -- not retrieving.\n\n"),
4647
0
                                             quote (hstat.orig_file_name));
4648
0
                                  ret = RETROK;
4649
0
                                  goto exit;
4650
0
                                }
4651
0
                              else
4652
0
                                {
4653
0
                                  logprintf (LOG_VERBOSE, _("\
4654
0
The sizes do not match (local %s) -- retrieving.\n"),
4655
0
                                             number_to_static_string (hstat.orig_file_size));
4656
0
                                }
4657
0
                            }
4658
0
                          else
4659
0
                            {
4660
0
                              force_full_retrieve = true;
4661
0
                              logputs (LOG_VERBOSE,
4662
0
                                       _("Remote file is newer, retrieving.\n"));
4663
0
                            }
4664
4665
0
                          logputs (LOG_VERBOSE, "\n");
4666
0
                        }
4667
0
                    }
4668
4669
                  /* free_hstat (&hstat); */
4670
0
                  hstat.timestamp_checked = true;
4671
0
                }
4672
4673
0
              if (opt.spider)
4674
0
                {
4675
0
                  bool finished = true;
4676
0
                  if (opt.recursive)
4677
0
                    {
4678
0
                      if ((*dt & TEXTHTML) || (*dt & TEXTCSS))
4679
0
                        {
4680
0
                          logputs (LOG_VERBOSE, _("\
4681
0
Remote file exists and could contain links to other resources -- retrieving.\n\n"));
4682
0
                          finished = false;
4683
0
                        }
4684
0
                      else
4685
0
                        {
4686
0
                          logprintf (LOG_VERBOSE, _("\
4687
0
Remote file exists but does not contain any link -- not retrieving.\n\n"));
4688
0
                          ret = RETROK; /* RETRUNNEEDED is not for caller. */
4689
0
                        }
4690
0
                    }
4691
0
                  else
4692
0
                    {
4693
0
                      if ((*dt & TEXTHTML) || (*dt & TEXTCSS))
4694
0
                        {
4695
0
                          logprintf (LOG_VERBOSE, _("\
4696
0
Remote file exists and could contain further links,\n\
4697
0
but recursion is disabled -- not retrieving.\n\n"));
4698
0
                        }
4699
0
                      else
4700
0
                        {
4701
0
                          logprintf (LOG_VERBOSE, _("\
4702
0
Remote file exists.\n\n"));
4703
0
                        }
4704
0
                      ret = RETROK; /* RETRUNNEEDED is not for caller. */
4705
0
                    }
4706
4707
0
                  if (finished)
4708
0
                    {
4709
0
                      logprintf (LOG_NONVERBOSE,
4710
0
                                 _("%s URL: %s %2d %s\n"),
4711
0
                                 tms, u->url, hstat.statcode,
4712
0
                                 hstat.message ? quotearg_style (escape_quoting_style, hstat.message) : "");
4713
0
                      goto exit;
4714
0
                    }
4715
0
                }
4716
4717
0
              got_name = true;
4718
0
              *dt &= ~HEAD_ONLY;
4719
0
              count = 0;          /* the retrieve count for HEAD is reset */
4720
0
              continue;
4721
0
            } /* send_head_first */
4722
0
        } /* !got_head */
4723
4724
0
      if (opt.useservertimestamps
4725
0
          && (tmr != (time_t) (-1))
4726
0
          && ((hstat.len == hstat.contlen) ||
4727
0
              ((hstat.res == 0) && (hstat.contlen == -1))))
4728
0
        {
4729
0
          const char *fl = NULL;
4730
0
          set_local_file (&fl, hstat.local_file);
4731
0
          if (fl)
4732
0
            {
4733
0
              time_t newtmr = -1;
4734
              /* Reparse time header, in case it's changed. */
4735
0
              if (time_came_from_head
4736
0
                  && hstat.remote_time && hstat.remote_time[0])
4737
0
                {
4738
0
                  newtmr = http_atotm (hstat.remote_time);
4739
0
                  if (newtmr != (time_t)-1)
4740
0
                    tmr = newtmr;
4741
0
                }
4742
0
              touch (fl, tmr);
4743
0
            }
4744
0
        }
4745
      /* End of time-stamping section. */
4746
4747
0
      tmrate = retr_rate (hstat.rd_size, hstat.dltime);
4748
0
      total_download_time += hstat.dltime;
4749
4750
0
      if (hstat.len == hstat.contlen)
4751
0
        {