116 lines
3.5 KiB
Python
116 lines
3.5 KiB
Python
#!/usr/bin/env python3
|
|
# -*- encoding: utf-8 -*-
|
|
|
|
import time
|
|
import urllib.request
|
|
import urllib.error
|
|
|
|
# Module-wide cookie jar: maps cookie name (str) to cookie value (str).
cookie = {}
|
|
|
|
def delCookie(cookiekey):
    """Remove the cookie named ``cookiekey`` from the module cookie jar.

    The key is converted to ``str`` before the lookup.

    Raises:
        KeyError: if no cookie is stored under that name.
    """
    # pop() without a default raises KeyError for a missing key, like ``del``.
    cookie.pop(str(cookiekey))
|
|
|
|
def setCookie(cookiekey, cookieval):
    """Store, replace or remove one cookie in the module cookie jar.

    Both arguments are converted to ``str`` before use.

    * An empty key is ignored.
    * An empty value removes the cookie; removing a cookie that is not
      stored is a no-op.
    * Any other value is stored under the key, replacing a previous value.
    """
    cookiekey = str(cookiekey)
    cookieval = str(cookieval)
    if not cookiekey:
        return
    if not cookieval:
        # Remove and stop here: falling through would re-store the empty
        # value, and a strict delete would raise KeyError for unknown names.
        cookie.pop(cookiekey, None)
        return
    cookie[cookiekey] = cookieval
|
|
|
|
def getCookies():
    """Return a new dict holding a shallow copy of the module cookie jar.

    Changing the returned dict does not change the stored cookies.
    """
    return {name: value for name, value in cookie.items()}
|
|
|
|
def patchCookies(newCookies):
    """Apply every ``name -> value`` pair of ``newCookies`` to the cookie jar.

    Each pair is passed to ``setCookie``; cookies that ``newCookies`` does
    not mention are left as they are.
    """
    for entry in newCookies.items():
        setCookie(*entry)
|
|
|
|
def cleanCookies():
    """Forget all stored cookies.

    The module-level ``cookie`` name is rebound to a new empty dict; the
    previous dict object itself is not modified.
    """
    global cookie
    cookie = {}
|
|
|
|
def setCookies(newCookies):
    """Replace the whole cookie jar with the pairs found in ``newCookies``.

    The jar is emptied first, then each ``name -> value`` pair is stored
    through ``setCookie``.
    """
    cleanCookies()
    for name, value in newCookies.items():
        setCookie(name, value)
|
|
|
|
def _retryDelay(headers, default=5):
    """Return how many seconds to wait before retrying a throttled request.

    The integer value of the ``Retry-After`` header is used when it is
    present and parseable, otherwise ``default``. The result is never
    smaller than 1.
    """
    delay = default
    if headers is not None:
        try:
            delay = int(headers.get('Retry-After'))
        except (TypeError, ValueError):
            # Header absent, or not a plain number of seconds.
            pass
    return max(delay, 1)


def _storeResponseCookies(headers):
    """Pass every ``name=value`` pair of the ``Set-Cookie`` headers to setCookie."""
    for line in headers.get_all('Set-Cookie') or ():
        name, separator, rest = line.partition('=')
        if not separator:
            # No '=' at all: there is no name/value pair to store.
            continue
        # Only the leading pair is kept; attributes after ';' are ignored.
        setCookie(name.strip(), rest.split(';', 1)[0].strip())


def getUrlBytes(url):
    """Fetch ``url`` and return the response body as ``bytes``.

    The request carries a fixed browser User-Agent and, when the module
    cookie jar is not empty, a ``Cookie`` header built from it. Cookies
    announced by the response through ``Set-Cookie`` are stored in the jar.

    HTTP 429 and 503 answers are retried after a pause, without any limit on
    the number of attempts. For 429 the pause follows the ``Retry-After``
    header when it holds a number of seconds (minimum 1), else 5 seconds;
    for 503 it is always 5 seconds.

    Returns:
        The body bytes for a 200 response. ``None`` for HTTP 400/404, for
        any other non-429 status that did not raise, and for the connection
        failures treated as "not found" below.

    Raises:
        urllib.error.HTTPError: for any other HTTP error status.
        urllib.error.URLError: for any other connection failure.
    """
    userAgent = (
        'Mozilla/5.0 (X11; Linux x86_64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu '
        'Chromium/63.0.3239.84 Chrome/63.0.3239.84 '
        'Safari/537.36'
    )
    # Retry with a loop instead of recursion so that a long throttling period
    # cannot exhaust the interpreter's recursion limit.
    while True:
        # Rebuilt on every attempt so the current cookie jar is always sent.
        request = urllib.request.Request(url)
        request.add_header('User-Agent', userAgent)
        if cookie:
            request.add_header("Cookie", '; '.join('='.join(pair) for pair in cookie.items()))
        try:
            response = urllib.request.urlopen(request, timeout=15)
        except urllib.error.HTTPError as e:
            if e.code == 429:
                # The default opener reports 429 as an exception, so the
                # Retry-After header has to be read from the error object.
                delay = _retryDelay(e.headers)
                print('[URL] Got 429 (Too Many Requests): sleeping for %d seconds' % delay)
                print(' @ %s' % url)
                time.sleep(delay)
                continue
            if e.code == 503:
                print('[URL] Got 503 (Service Temporarily Unavailable): retrying after 5 seconds')
                print(' @ %s' % url)
                time.sleep(5)
                continue
            if e.code in (400, 404):
                return None
            raise
        except urllib.error.URLError as e:
            reason = str(e.reason)
            # NOTE(review): Errno -2 / -3 look like name-resolution failures
            # as reported on Linux - confirm on other platforms.
            if reason.startswith('[Errno -2]'):
                return None
            if reason.startswith('[Errno -3]'):
                # Only reported; the error is still raised further down.
                print('Check your internet connection. It seems gone.')
            if reason.startswith('[Errno 110]') or reason == 'timed out':
                print('Connection request has timed out - assuming "Not Found"')
                return None
            if reason.startswith('[Errno 111]'):
                print('Connection refused - assuming "Not Found"')
                return None
            raise
        try:
            rcode = response.getcode()
            headers = response.info()
            _storeResponseCookies(headers)
            if rcode != 429:
                return response.read() if rcode == 200 else None
            # Reached only when the installed opener hands back a 429
            # response instead of raising HTTPError.
            delay = _retryDelay(headers)
        finally:
            # Close on every path, including retries and failed reads.
            response.close()
        print('[URL] Got 429 (Too Many Requests): sleeping for %d seconds' % delay)
        print(' @ %s' % url)
        time.sleep(delay)
|
|
|
|
def getUrl(url):
    """Fetch ``url`` with ``getUrlBytes`` and return the body decoded as UTF-8.

    Returns:
        The decoded text, or ``None`` when ``getUrlBytes`` returned ``None``
        (for example for a 404 answer or a timed-out connection).

    Raises:
        UnicodeDecodeError: if the body is not valid UTF-8.
    """
    payload = getUrlBytes(url)
    if payload is None:
        # Propagate "no content" instead of failing on None.decode().
        return None
    return payload.decode('utf-8')
|