Source code for gws.tools.net

import cgi
import hashlib
import os
import pickle
import re
import time
import urllib.parse

import requests
import requests.structures

import gws

# https://urllib3.readthedocs.org/en/latest/security.html#using-your-system-s-root-certificates
CA_CERTS_PATH = '/etc/ssl/certs/ca-certificates.crt'


class Error(gws.Error):
    pass


class HTTPError(Error):
    pass


class Timeout(Error):
    pass


_parse_url_keys = (
    'dir', 'ext', 'filename', 'fnbody', 'fragment',
    'hostname', 'netloc', 'params', 'password', 'path',
    'port', 'qs', 'query', 'scheme', 'username',
)


def quote(s, safe='/'):
    return urllib.parse.quote(s, safe)


def unquote(s):
    return urllib.parse.unquote(s)


def is_abs_url(url):
    return re.match(r'^([a-z]+:|)//', url)


def parse_url(url):
    p = {k: '' for k in _parse_url_keys}

    # NB force an absolute url
    if not is_abs_url(url):
        url = '//' + url

    res = urllib.parse.urlsplit(url)

    for k in _parse_url_keys:
        p[k] = getattr(res, k, '') or ''

    if p['path']:
        p['dir'], p['filename'] = os.path.split(p['path'])
        if p['filename'].startswith('.'):
            p['fnbody'], p['ext'] = p['filename'], ''
        else:
            p['fnbody'], _, p['ext'] = p['filename'].partition('.')

    if p['query']:
        p['qs'] = urllib.parse.parse_qs(p['query'])
        r = {k: v[0] for k, v in p['qs'].items()}
    else:
        r = {}

    p['params'] = requests.structures.CaseInsensitiveDict(r)

    if p['username']:
        p['username'] = unquote(p['username'])
        p['password'] = unquote(p.get('password', ''))

    return p

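
# Usage sketch (illustrative, not part of the original module) showing the
# shape of the dict parse_url returns; the URL below is made up:
#
#     >>> p = parse_url('https://user:secret@example.org:8080/data/map.json?layer=1#top')
#     >>> p['scheme'], p['hostname'], p['port']
#     ('https', 'example.org', 8080)
#     >>> p['dir'], p['filename'], p['fnbody'], p['ext']
#     ('/data', 'map.json', 'map', 'json')
#     >>> p['params']['LAYER']  # 'params' is a case-insensitive dict
#     '1'
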
def make_url(p):
    s = ''

    if p.get('scheme'):
        s += p['scheme']
        s += '://'
    else:
        s += '//'

    if p.get('username'):
        s += quote(p.get('username'))
        s += ':'
        s += quote(p.get('password', ''))
        s += '@'

    s += p['hostname']

    if p.get('port'):
        s += ':'
        s += str(p['port'])

    if p.get('path'):
        s += '/'
        s += p['path'].lstrip('/')

    if p.get('params'):
        s += '?'
        s += gws.as_query_string(dict(p['params']))

    if p.get('fragment'):
        s += '#'
        s += p['fragment'].lstrip('#')

    return s

def add_params(url, params):
    p = parse_url(url)
    p['params'].update(params)
    return make_url(p)

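
# Usage sketch for add_params/make_url (illustrative; the exact ordering and
# encoding of the query string depend on gws.as_query_string):
#
#     >>> add_params('http://example.org/wms?SERVICE=WMS', {'REQUEST': 'GetCapabilities'})
#     'http://example.org/wms?SERVICE=WMS&REQUEST=GetCapabilities'
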
# @TODO locking for caches
class Response:
    def __init__(self, resp: requests.Response):
        self.status_code = resp.status_code
        self.content = resp.content
        self.content_type, self.content_type_encoding = self._parse_content_type(resp.headers)
        self._text = None

    @property
    def text(self):
        if self._text is None:
            self._text = self._get_text()
        return self._text

    def _get_text(self):
        if self.content_type_encoding:
            try:
                return str(self.content, encoding=self.content_type_encoding, errors='strict')
            except UnicodeDecodeError:
                pass

        # some servers send utf8 content without a charset header, in which case
        # requests assumes ISO-8859-1
        # (see http://docs.python-requests.org/en/master/user/advanced/#encodings)
        #
        # 'apparent_encoding' is not always reliable
        #
        # therefore, when there's no header, we try utf8 first and then ISO-8859-1

        try:
            return str(self.content, encoding='utf8', errors='strict')
        except UnicodeDecodeError:
            pass

        try:
            return str(self.content, encoding='ISO-8859-1', errors='strict')
        except UnicodeDecodeError:
            pass

        # both failed, decode as utf8 with replacement
        gws.log.warn('decode failed')
        return str(self.content, encoding='utf8', errors='replace')

    def _parse_content_type(self, headers):
        # copied from requests.utils.get_encoding_from_headers, but with no ISO-8859-1 default

        content_type = headers.get('content-type')
        if not content_type:
            # https://www.w3.org/Protocols/rfc2616/rfc2616-sec7.html#sec7.2.1
            return 'application/octet-stream', None

        ctype, params = cgi.parse_header(content_type)
        if 'charset' not in params:
            return ctype, None

        enc = params['charset'].strip("'\"")

        # make sure this is a valid python encoding
        try:
            str(b'.', encoding=enc, errors='strict')
        except LookupError:
            gws.log.warn(f'invalid content-type encoding {enc!r}')
            return ctype, None

        return ctype, enc

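
# Decoding fallback in _get_text(), sketched with made-up byte strings:
#
#     b'\xc3\xbc'  # valid utf8 -> the first fallback decodes it as 'ü'
#     b'\xfc'      # invalid utf8 -> falls through to ISO-8859-1 -> 'ü'
#
# NB strict ISO-8859-1 decoding accepts any byte string, so the final
# utf8-with-replace branch is effectively a safety net that should never run.
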
class FailedResponse:
    def __init__(self, err):
        self.status_code = 500
        self.content = repr(err).encode('utf8')
        self.content_type = 'text/plain'
        self.content_type_encoding = 'utf8'
        self.text = repr(err)

def http_request(url, **kwargs) -> Response:
    if 'params' in kwargs:
        url = add_params(url, kwargs.pop('params'))

    cache_path = None
    max_age = kwargs.pop('max_age', 0)

    gws.log.debug(f'REQUEST_BEGIN: url={url!r} max_age={max_age}')

    if max_age:
        cache_path = _cache_path(url)
        ag = _file_age(cache_path)
        if ag < max_age:
            gws.log.debug(f'REQUEST_CACHED: path={cache_path!r} age={ag}')
            return _read_cache(cache_path)
        gws.log.debug(f'REQUEST_NOT_CACHED: path={cache_path!r} age={ag} max_age={max_age}')

    kwargs = dict(kwargs or {})
    kwargs['stream'] = False

    method = kwargs.pop('method', 'GET').upper()

    if url.startswith('https') and 'verify' not in kwargs:
        kwargs['verify'] = CA_CERTS_PATH

    timeout = kwargs.pop('timeout', (60, 120))  # (connect, read)
    if isinstance(timeout, (int, float)):
        timeout = int(timeout), int(timeout)
    kwargs['timeout'] = timeout

    lax = kwargs.pop('lax', False)

    ts = time.time()
    err = None
    resp = None

    try:
        resp = requests.request(method, url, **kwargs)
    except requests.Timeout as e:
        gws.log.debug(f'REQUEST_FAILED: timeout url={url!r}')
        if cache_path:
            err = e
        else:
            raise Timeout() from e
    except requests.RequestException as e:
        gws.log.debug(f'REQUEST_FAILED: generic url={url!r}')
        if cache_path:
            err = e
        else:
            raise HTTPError(500, str(e)) from e

    # NB requests.Response is falsy for 4xx/5xx statuses, so test against None
    if resp is not None and not lax:
        try:
            resp.raise_for_status()
        except requests.RequestException as e:
            gws.log.debug(f'REQUEST_FAILED: http url={url!r}')
            raise HTTPError(resp.status_code, resp.text) from e

    ts = time.time() - ts

    if resp is not None and not err:
        gws.log.debug(f'REQUEST_DONE: code={resp.status_code} len={len(resp.content)} time={ts:.3f}')
        r = Response(resp)
    else:
        gws.log.debug(f'REQUEST_DONE: resp=FAILED time={ts:.3f}')
        r = FailedResponse(err)

    if cache_path:
        _store_cache(r, cache_path)

    return r

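
# Usage sketch (assumes a reachable server; 'max_age' enables the pickle-based
# file cache, 'lax' suppresses HTTPError on 4xx/5xx statuses):
#
#     r = http_request('https://example.org/service', params={'f': 'json'}, max_age=3600)
#     if r.status_code == 200:
#         print(r.content_type, r.text)
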
def _cache_path(url):
    return gws.NET_CACHE_DIR + '/' + _cache_key(url)


def _cache_key(url):
    m = re.search(r'^(https?://)(.+?)(\?.+)?$', url)
    if not m:
        return _hash(url)
    # NB group(3) is None when there's no query string
    return gws.as_uid(m.group(2)) + '_' + _hash(m.group(3) or '')


def _hash(s):
    return hashlib.md5(gws.as_bytes(s)).hexdigest()


def _file_age(path):
    try:
        st = os.stat(path)
    except OSError:
        return 1e20
    return int(time.time() - st.st_mtime)


def _store_cache(resp, path):
    with open(path, 'wb') as fp:
        pickle.dump(resp, fp)


def _read_cache(path):
    with open(path, 'rb') as fp:
        return pickle.load(fp)
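

# Cache keys combine a readable uid of the host+path with an md5 of the query
# string (illustrative; the exact uid form depends on gws.as_uid):
#
#     _cache_key('http://example.org/wms?SERVICE=WMS')
#     # -> gws.as_uid('example.org/wms') + '_' + md5 of '?SERVICE=WMS'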