# SPDX-FileCopyrightText: 2015 Eric Larson
#
# SPDX-License-Identifier: Apache-2.0

"""
The httplib2 algorithms ported for use with requests.
"""
import logging
import re
import calendar
import time
from email.utils import parsedate_tz

from pip._vendor.requests.structures import CaseInsensitiveDict

from .cache import DictCache, SeparateBodyBaseCache
from .serialize import Serializer


logger = logging.getLogger(__name__)

URI = re.compile(r"^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?")

PERMANENT_REDIRECT_STATUSES = (301, 308)


def parse_uri(uri):
    """Parses a URI using the regex given in Appendix B of RFC 3986.

        (scheme, authority, path, query, fragment) = parse_uri(uri)
    """
    groups = URI.match(uri).groups()
    return (groups[1], groups[3], groups[4], groups[6], groups[8])
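
# Worked example (illustrative): parse_uri splits a URL into the five
# RFC 3986 components, leaving any missing part as None:
#
#     >>> parse_uri("http://example.com/path?q=1#frag")
#     ('http', 'example.com', '/path', 'q=1', 'frag')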


class CacheController(object):
    """An interface to see if a request should be cached or not."""

    def __init__(
        self, cache=None, cache_etags=True, serializer=None, status_codes=None
    ):
        self.cache = DictCache() if cache is None else cache
        self.cache_etags = cache_etags
        self.serializer = serializer or Serializer()
        self.cacheable_status_codes = status_codes or (200, 203, 300, 301, 308)

    @classmethod
    def _urlnorm(cls, uri):
        """Normalize the URL to create a safe key for the cache"""
        (scheme, authority, path, query, fragment) = parse_uri(uri)
        if not scheme or not authority:
            raise Exception("Only absolute URIs are allowed. uri = %s" % uri)

        scheme = scheme.lower()
        authority = authority.lower()

        if not path:
            path = "/"

        # Could do syntax-based normalization of the URI before
        # computing the digest. See Section 6.2.2 of Std 66.
        request_uri = query and "?".join([path, query]) or path
        defrag_uri = scheme + "://" + authority + request_uri

        return defrag_uri

    @classmethod
    def cache_url(cls, uri):
        return cls._urlnorm(uri)
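
    # Worked example (illustrative): the scheme and authority are lowercased,
    # the path keeps its case, and the fragment is dropped:
    #
    #     >>> CacheController.cache_url("HTTP://Example.COM/Path?a=1#frag")
    #     'http://example.com/Path?a=1'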

    def parse_cache_control(self, headers):
        known_directives = {
            # https://tools.ietf.org/html/rfc7234#section-5.2
            "max-age": (int, True),
            "max-stale": (int, False),
            "min-fresh": (int, True),
            "no-cache": (None, False),
            "no-store": (None, False),
            "no-transform": (None, False),
            "only-if-cached": (None, False),
            "must-revalidate": (None, False),
            "public": (None, False),
            "private": (None, False),
            "proxy-revalidate": (None, False),
            "s-maxage": (int, True),
        }

        cc_headers = headers.get("cache-control", headers.get("Cache-Control", ""))

        retval = {}

        for cc_directive in cc_headers.split(","):
            if not cc_directive.strip():
                continue

            parts = cc_directive.split("=", 1)
            directive = parts[0].strip()

            try:
                typ, required = known_directives[directive]
            except KeyError:
                logger.debug("Ignoring unknown cache-control directive: %s", directive)
                continue

            if not typ or not required:
                retval[directive] = None
            if typ:
                try:
                    retval[directive] = typ(parts[1].strip())
                except IndexError:
                    if required:
                        logger.debug(
                            "Missing value for cache-control directive: %s",
                            directive,
                        )
                except ValueError:
                    logger.debug(
                        "Invalid value for cache-control directive %s, must be %s",
                        directive,
                        typ.__name__,
                    )

        return retval
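
    # Worked example (illustrative): directives become dict entries, and
    # valueless directives map to None:
    #
    #     >>> CacheController().parse_cache_control(
    #     ...     {"Cache-Control": "max-age=600, no-store"})
    #     {'max-age': 600, 'no-store': None}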

    def cached_request(self, request):
        """
        Return a cached response if it exists in the cache, otherwise
        return False.
        """
        cache_url = self.cache_url(request.url)
        logger.debug('Looking up "%s" in the cache', cache_url)
        cc = self.parse_cache_control(request.headers)

        # Bail out if the request insists on fresh data
        if "no-cache" in cc:
            logger.debug('Request header has "no-cache", cache bypassed')
            return False

        if "max-age" in cc and cc["max-age"] == 0:
            logger.debug('Request header has "max-age" as 0, cache bypassed')
            return False

        # The request allows serving from the cache; see if we have something
        cache_data = self.cache.get(cache_url)
        if cache_data is None:
            logger.debug("No cache entry available")
            return False

        if isinstance(self.cache, SeparateBodyBaseCache):
            body_file = self.cache.get_body(cache_url)
        else:
            body_file = None

        # Check whether it can be deserialized
        resp = self.serializer.loads(request, cache_data, body_file)
        if not resp:
            logger.warning("Cache entry deserialization failed, entry ignored")
            return False

        # If we have a cached permanent redirect, return it immediately. We
        # don't need to test our response for other headers b/c it is
        # intrinsically "cacheable" as it is Permanent.
        #
        # See:
        #   https://tools.ietf.org/html/rfc7231#section-6.4.2
        #
        # Client can try to refresh the value by repeating the request
        # with cache busting headers as usual (ie no-cache).
        if int(resp.status) in PERMANENT_REDIRECT_STATUSES:
            msg = (
                "Returning cached permanent redirect response "
                "(ignoring date and etag information)"
            )
            logger.debug(msg)
            return resp

        headers = CaseInsensitiveDict(resp.headers)
        if not headers or "date" not in headers:
            if "etag" not in headers:
                # Without date or etag, the cached response can never be used
                # and should be deleted.
                logger.debug("Purging cached response: no date or etag")
                self.cache.delete(cache_url)
            logger.debug("Ignoring cached response: no date")
            return False

        now = time.time()
        date = calendar.timegm(parsedate_tz(headers["date"]))
        current_age = max(0, now - date)
        logger.debug("Current age based on date: %i", current_age)

        # TODO: There is an assumption that the result will be a
        # urllib3 response object. This may not be best since we
        # could probably avoid instantiating or constructing the
        # response until we know we need it.
        resp_cc = self.parse_cache_control(headers)

        # determine freshness
        freshness_lifetime = 0

        # Check the max-age pragma in the cache control header
        if "max-age" in resp_cc:
            freshness_lifetime = resp_cc["max-age"]
            logger.debug("Freshness lifetime from max-age: %i", freshness_lifetime)

        # If there isn't a max-age, check for an expires header
        elif "expires" in headers:
            expires = parsedate_tz(headers["expires"])
            if expires is not None:
                expire_time = calendar.timegm(expires) - date
                freshness_lifetime = max(0, expire_time)
                logger.debug("Freshness lifetime from expires: %i", freshness_lifetime)

        # Determine if we are setting freshness limit in the
        # request. Note, this overrides what was in the response.
        if "max-age" in cc:
            freshness_lifetime = cc["max-age"]
            logger.debug(
                "Freshness lifetime from request max-age: %i", freshness_lifetime
            )

        if "min-fresh" in cc:
            min_fresh = cc["min-fresh"]
            # adjust our current age by our min fresh
            current_age += min_fresh
            logger.debug("Adjusted current age from min-fresh: %i", current_age)
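
        # Worked example (illustrative): with a Date header 120 seconds in
        # the past and max-age=600, current_age is 120 and freshness_lifetime
        # is 600, so the check below returns the cached copy; a request
        # min-fresh of 500 would raise the effective age to 620 and miss.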

        # Return entry if it is fresh enough
        if freshness_lifetime > current_age:
            logger.debug('The response is "fresh", returning cached response')
            logger.debug("%i > %i", freshness_lifetime, current_age)
            return resp

        # We're not fresh. If we don't have an etag, clear it out.
        if "etag" not in headers:
            logger.debug('The cached response is "stale" with no etag, purging')
            self.cache.delete(cache_url)

        # No usable cached response; the caller must make a real request
        return False

    def conditional_headers(self, request):
        cache_url = self.cache_url(request.url)
        resp = self.serializer.loads(request, self.cache.get(cache_url))
        new_headers = {}

        if resp:
            headers = CaseInsensitiveDict(resp.headers)

            if "etag" in headers:
                new_headers["If-None-Match"] = headers["ETag"]

            if "last-modified" in headers:
                new_headers["If-Modified-Since"] = headers["Last-Modified"]

        return new_headers
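
    # Worked example (illustrative): if the cached response carried
    # 'ETag: "abc123"', the revalidation request gains the header
    # {"If-None-Match": '"abc123"'}.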

    def _cache_set(self, cache_url, request, response, body=None, expires_time=None):
        """
        Store the data in the cache.
        """
        if isinstance(self.cache, SeparateBodyBaseCache):
            # We pass in the body separately; just put a placeholder empty
            # string in the metadata.
            self.cache.set(
                cache_url,
                self.serializer.dumps(request, response, b""),
                expires=expires_time,
            )
            self.cache.set_body(cache_url, body)
        else:
            self.cache.set(
                cache_url,
                self.serializer.dumps(request, response, body),
                expires=expires_time,
            )
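
    # Design note: with a SeparateBodyBaseCache, metadata and body are stored
    # under the same key via set()/set_body(), so the body need not be
    # serialized into the metadata blob.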

    def cache_response(self, request, response, body=None, status_codes=None):
        """
        Algorithm for caching requests.

        This assumes a requests Response object.
        """
        # From httplib2: Don't cache 206's since we aren't going to
        # handle byte range requests
        cacheable_status_codes = status_codes or self.cacheable_status_codes
        if response.status not in cacheable_status_codes:
            logger.debug(
                "Status code %s not in %s", response.status, cacheable_status_codes
            )
            return

        response_headers = CaseInsensitiveDict(response.headers)

        if "date" in response_headers:
            date = calendar.timegm(parsedate_tz(response_headers["date"]))
        else:
            date = 0

        # If we've been given a body, the response has a Content-Length, and
        # that Content-Length is valid, then we can check whether the body
        # we've been given matches the expected size; if it doesn't, skip
        # caching it.
        if (
            body is not None
            and "content-length" in response_headers
            and response_headers["content-length"].isdigit()
            and int(response_headers["content-length"]) != len(body)
        ):
            return
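
        # Worked example (illustrative): a response declaring
        # "Content-Length: 100" whose body is actually 42 bytes (for
        # instance, a truncated download) is skipped rather than cached.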

311 

312 cc_req = self.parse_cache_control(request.headers) 

313 cc = self.parse_cache_control(response_headers) 

314 

315 cache_url = self.cache_url(request.url) 

316 logger.debug('Updating cache with response from "%s"', cache_url) 

317 

318 # Delete it from the cache if we happen to have it stored there 

319 no_store = False 

320 if "no-store" in cc: 

321 no_store = True 

322 logger.debug('Response header has "no-store"') 

323 if "no-store" in cc_req: 

324 no_store = True 

325 logger.debug('Request header has "no-store"') 

326 if no_store and self.cache.get(cache_url): 

327 logger.debug('Purging existing cache entry to honor "no-store"') 

328 self.cache.delete(cache_url) 

329 if no_store: 

330 return 

331 

332 # https://tools.ietf.org/html/rfc7234#section-4.1: 

333 # A Vary header field-value of "*" always fails to match. 

334 # Storing such a response leads to a deserialization warning 

335 # during cache lookup and is not allowed to ever be served, 

336 # so storing it can be avoided. 

337 if "*" in response_headers.get("vary", ""): 

338 logger.debug('Response header has "Vary: *"') 

339 return 

        # If we've been given an etag, then keep the response
        if self.cache_etags and "etag" in response_headers:
            expires_time = 0
            if response_headers.get("expires"):
                expires = parsedate_tz(response_headers["expires"])
                if expires is not None:
                    expires_time = calendar.timegm(expires) - date

            # Keep revalidatable entries around for at least 14 days
            expires_time = max(expires_time, 14 * 86400)

            logger.debug("etag object cached for {0} seconds".format(expires_time))
            logger.debug("Caching due to etag")
            self._cache_set(cache_url, request, response, body, expires_time)

        # Add any permanent redirects to the cache. We do this before looking
        # at the Date headers.
        elif int(response.status) in PERMANENT_REDIRECT_STATUSES:
            logger.debug("Caching permanent redirect")
            self._cache_set(cache_url, request, response, b"")

        # Add to the cache if the response headers demand it. If there
        # is no date header then we can't do anything about expiring
        # the cache.
        elif "date" in response_headers:
            date = calendar.timegm(parsedate_tz(response_headers["date"]))
            # cache when there is a max-age > 0
            if "max-age" in cc and cc["max-age"] > 0:
                logger.debug("Caching b/c date exists and max-age > 0")
                expires_time = cc["max-age"]
                self._cache_set(
                    cache_url,
                    request,
                    response,
                    body,
                    expires_time,
                )

            # If the response can expire, it means we should cache it
            # in the meantime.
            elif "expires" in response_headers:
                if response_headers["expires"]:
                    expires = parsedate_tz(response_headers["expires"])
                    if expires is not None:
                        expires_time = calendar.timegm(expires) - date
                    else:
                        expires_time = None

                    logger.debug(
                        "Caching b/c of expires header. expires in {0} seconds".format(
                            expires_time
                        )
                    )
                    self._cache_set(
                        cache_url,
                        request,
                        response,
                        body,
                        expires_time,
                    )

    def update_cached_response(self, request, response):
        """On a 304 we will get a new set of headers that we want to
        update our cached value with, assuming we have one.

        This should only ever be called when we've sent an ETag and
        gotten a 304 as the response.
        """
        cache_url = self.cache_url(request.url)

        cached_response = self.serializer.loads(request, self.cache.get(cache_url))

        if not cached_response:
            # we didn't have a cached response
            return response

        # Let's update our headers with the headers from the new response:
        # http://tools.ietf.org/html/draft-ietf-httpbis-p4-conditional-26#section-4.1
        #
        # The server isn't supposed to send headers that would make
        # the cached body invalid. But... just in case, we'll be sure
        # to strip out the ones we know might be problematic due to
        # typical assumptions.
        excluded_headers = ["content-length"]

        cached_response.headers.update(
            dict(
                (k, v)
                for k, v in response.headers.items()
                if k.lower() not in excluded_headers
            )
        )

        # we want a 200 b/c we have content via the cache
        cached_response.status = 200

        # update our cache
        self._cache_set(cache_url, request, cached_response)

        return cached_response
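

# Typical flow (a minimal sketch; in cachecontrol this controller is normally
# driven by CacheControlAdapter rather than called directly, and `send` below
# stands in for a hypothetical transport call):
#
#     controller = CacheController()
#     cached = controller.cached_request(request)
#     if cached:
#         ...                                # serve straight from the cache
#     else:
#         request.headers.update(controller.conditional_headers(request))
#         response = send(request)
#         if response.status == 304:
#             response = controller.update_cached_response(request, response)
#         else:
#             controller.cache_response(request, response, body)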