#!/usr/bin/env python3
# -*- encoding: utf-8 -*-
"""Minimal HTTP fetch helpers with a module-wide cookie store.

Cookies are kept in the module-level ``cookie`` dict (name -> value, both
``str``).  Every request made through :func:`getUrlBytes` sends the stored
cookies and records the ones the server sets in its response.
"""

import time
import urllib.request
import urllib.error

cookie = dict()

# Browser-like User-Agent sent with every request.
_USER_AGENT = (
    'Mozilla/5.0 (X11; Linux x86_64) '
    'AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu '
    'Chromium/63.0.3239.84 Chrome/63.0.3239.84 '
    'Safari/537.36'
)
# Seconds passed to urlopen() as the timeout.
_REQUEST_TIMEOUT = 15
# Seconds to wait before retrying when the server gives no usable hint.
_DEFAULT_RETRY_DELAY = 5


def delCookie(cookiekey):
    """Remove the cookie named ``cookiekey`` from the store.

    Raises:
        KeyError: if no cookie with that name is stored.
    """
    cookiekey = str(cookiekey)
    del cookie[cookiekey]


def setCookie(cookiekey, cookieval):
    """Store a cookie, or drop it when the value is empty.

    Both arguments are converted with ``str()``.  An empty name is ignored.
    An empty value removes the cookie; removing a cookie that was never
    stored is a no-op.
    """
    cookieval = str(cookieval)
    cookiekey = str(cookiekey)
    if not cookiekey:
        return
    if not cookieval:
        # Servers clear cookies by sending an empty value.  Do not re-insert
        # the empty string, and do not fail when the cookie was never stored.
        cookie.pop(cookiekey, None)
        return
    cookie[cookiekey] = cookieval


def getCookies():
    """Return a shallow copy of the cookie store."""
    return dict(cookie.items())


def patchCookies(newCookies):
    """Apply every ``name -> value`` pair of ``newCookies`` via setCookie."""
    for nk, nv in newCookies.items():
        setCookie(nk, nv)


def cleanCookies():
    """Replace the cookie store with a fresh, empty dict."""
    global cookie
    cookie = dict()


def setCookies(newCookies):
    """Discard all stored cookies, then store those in ``newCookies``."""
    cleanCookies()
    patchCookies(newCookies)


def _build_request(url):
    """Create the Request for ``url`` with User-Agent and stored cookies."""
    request = urllib.request.Request(url)
    request.add_header('User-Agent', _USER_AGENT)
    if cookie:
        request.add_header(
            "Cookie", '; '.join('='.join(pair) for pair in cookie.items()))
    return request


def _store_response_cookies(headers):
    """Record the ``name=value`` part of every Set-Cookie header."""
    if headers is None:
        return
    for raw in headers.get_all('Set-Cookie') or []:
        name, sep, rest = raw.partition('=')
        if not sep:
            # No '=' at all: there is no name/value pair to record.
            continue
        # Attributes after the first ';' (Path, Expires, ...) are ignored.
        setCookie(name.strip(), rest.split(';', 1)[0].strip())


def _retry_delay(headers):
    """Return the seconds to wait, from Retry-After when it is an integer."""
    value = headers.get('Retry-After') if headers is not None else None
    try:
        delay = int(value)
    except (TypeError, ValueError):
        # Header missing, or not a plain number of seconds.
        return _DEFAULT_RETRY_DELAY
    return max(delay, 1)


def _wait_too_many_requests(url, delay):
    """Announce a 429 response for ``url`` and sleep ``delay`` seconds."""
    print('[URL] Got 429 (Too Many Requests): sleeping for %d seconds' % delay)
    print(' @ %s' % url)
    time.sleep(delay)


def getUrlBytes(url):
    """Fetch ``url`` and return the response body as ``bytes``.

    Stored cookies are sent with the request and cookies set by the response
    are recorded.  HTTP 429 and 503 are retried (without limit) after a
    pause; for 429 the pause follows an integer ``Retry-After`` header when
    present.

    Returns:
        The body for a 200 response.  ``None`` for HTTP 400/404, for any
        other non-200 status that urlopen() does not raise for, and for
        URLErrors whose reason starts with ``[Errno -2]``, ``[Errno 110]``
        or ``[Errno 111]`` or equals ``timed out``.

    Raises:
        urllib.error.HTTPError: for any other HTTP error status.
        urllib.error.URLError: for any other connection failure.
    """
    # Retry by looping rather than recursing, so a long run of 429/503
    # answers cannot exhaust the interpreter's recursion limit.
    while True:
        request = _build_request(url)
        try:
            response = urllib.request.urlopen(
                request, timeout=_REQUEST_TIMEOUT)
        except urllib.error.HTTPError as e:
            if e.code == 429:
                _wait_too_many_requests(url, _retry_delay(e.headers))
                continue
            if e.code == 503:
                print('[URL] Got 503 (Service Temporarily Unavailable): retrying after 5 seconds')
                print(' @ %s' % url)
                time.sleep(5)
                continue
            if e.code in (400, 404):
                return None
            raise
        except urllib.error.URLError as e:
            reason = str(e.reason)
            if reason.startswith('[Errno -2]'):
                return None
            if reason.startswith('[Errno -3]'):
                # Only a hint for the user; the error is still re-raised.
                print('Check your internet connection. It seems gone.')
            if reason.startswith('[Errno 110]') or reason == 'timed out':
                print('Connection request has timed out - assuming "Not Found"')
                return None
            if reason.startswith('[Errno 111]'):
                print('Connection refused - assuming "Not Found"')
                return None
            raise

        # The with-block closes the response on every exit path.
        with response:
            rcode = response.getcode()
            headers = response.info()
            _store_response_cookies(headers)
            if rcode != 429:
                return response.read() if rcode == 200 else None
            delay = _retry_delay(headers)
        # Sleep only after the response has been closed.
        _wait_too_many_requests(url, delay)


def getUrl(url):
    """Fetch ``url`` and return its body decoded as UTF-8.

    Returns ``None`` whenever getUrlBytes() returns ``None``.
    """
    data = getUrlBytes(url)
    if data is None:
        return None
    return data.decode('utf-8')