#!/usr/bin/env python3
# -*- encoding: utf-8 -*-

import platform
import re
import subprocess
import time
import urllib.error
import urllib.parse
import urllib.request
from typing import Dict

# Matches the first "major.minor" run in a version string,
# e.g. "Mozilla Firefox 84.0.2" -> "84.0".
RGX_MAJOR_MINOR_VER = re.compile(r'(\d+\.\d+)')

# Module-level cookie jar shared by every request made through this module.
cookie: Dict[str, str] = dict()


def getFirefoxUserAgent():
    """Build a User-Agent string mimicking the locally installed Firefox."""
    r = subprocess.run(['firefox', '--version'],
                       stdout=subprocess.PIPE, text=True, check=True)
    ver = RGX_MAJOR_MINOR_VER.findall(r.stdout.strip())[0]
    os_string = ''
    if platform.system() == 'Linux':
        os_string = 'X11; Linux x86_64; '
    elif platform.system() == 'Darwin':
        os_string = 'Macintosh; Intel Mac OS X 11.0; '
    elif platform.system() == 'Windows':
        os_string = 'Windows NT 10.0; Win64; x64; '
    return f'Mozilla/5.0 ({os_string}rv:{ver}) Gecko/20100101 Firefox/{ver}'
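
# Example result (illustrative; the version comes from the local Firefox
# install, so "84.0" below is an assumption, not a guaranteed value):
#   'Mozilla/5.0 (X11; Linux x86_64; rv:84.0) Gecko/20100101 Firefox/84.0'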


def delCookie(cookiekey):
    """Remove a cookie from the jar if it is present."""
    cookiekey = str(cookiekey)
    if cookiekey in cookie:
        del cookie[cookiekey]


def setCookie(cookiekey, cookieval):
    """Store a cookie; an empty value deletes the key instead."""
    cookieval = str(cookieval)
    cookiekey = str(cookiekey)
    if not cookiekey:
        return
    if not cookieval:
        delCookie(cookiekey)
        return  # without this, the key would be re-added with an empty value
    cookie[cookiekey] = cookieval


def getCookies():
    """Return a shallow copy of the cookie jar."""
    return dict(cookie.items())


def patchCookies(newCookies):
    """Merge the given cookies into the jar without clearing it first."""
    for nk, nv in newCookies.items():
        setCookie(nk, nv)


def cleanCookies():
    """Empty the cookie jar."""
    global cookie
    cookie = dict()


def setCookies(newCookies):
    """Replace the whole cookie jar with the given cookies."""
    cleanCookies()
    patchCookies(newCookies)
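
# Minimal usage sketch of the cookie helpers above (values are illustrative):
#
#   setCookies({'session': 'abc123'})  # replace the whole jar
#   patchCookies({'theme': 'dark'})    # merge without clearing
#   setCookie('theme', '')             # empty value deletes the key
#   getCookies()                       # -> {'session': 'abc123'}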


def getUrlBytes(url, giveUpOn403=False):
    """Fetch a URL and return the response body as bytes.

    Returns None for responses treated as "Not Found" (400, 404, 410, 500,
    and optionally 403); retries automatically on 429 and 503.
    """
    global cookie
    request = urllib.request.Request(url.replace(' ', '%20'))
    try:
        url.encode('ascii')
    except UnicodeEncodeError:
        # Non-ASCII URL: percent-encode it while keeping its structure.
        request = urllib.request.Request(urllib.parse.quote(url, safe='/%?#:'))
    request.add_header('User-Agent', getFirefoxUserAgent())
    if cookie:
        request.add_header('Cookie', '; '.join(map('='.join, cookie.items())))
    response = None
    try:
        response = urllib.request.urlopen(request, timeout=30)
    except urllib.error.HTTPError as e:
        if e.code == 429:
            print('[URL] Got 429 (Too Many Requests): sleeping for 5 seconds')
            print(' @ %s' % url)
            time.sleep(5)
            return getUrlBytes(url, giveUpOn403)
        if e.code == 503:
            print('[URL] Got 503 (Service Temporarily Unavailable): retrying after 5 seconds')
            print(' @ %s' % url)
            time.sleep(5)
            return getUrlBytes(url, giveUpOn403)
        if e.code == 403 and giveUpOn403:
            print('[URL] Got 403 (Forbidden): assuming "Not Found"')
            print(' @ %s' % url)
            return None
        elif e.code == 500:
            print('[URL] Got 500 (Server Error): assuming "Not Found"')
            return None
        elif e.code == 410:
            print('[URL] Got 410 (Gone): assuming "Not Found"')
            return None
        elif e.code == 404:
            return None
        elif e.code == 400:
            return None
        raise
    except urllib.error.URLError as e:
        reason = str(e.reason)
        if reason.startswith('EOF occurred in violation of protocol ('):
            print('Server doesn\'t know how to use HTTP properly - assuming "Not Found"')
            return None
        if reason.startswith('[SSL: CERTIFICATE'):
            print('Their SSL certificate is screwed up - assuming "Not Found"')
            return None
        if reason.startswith('[Errno -5]'):
            print('Their DNS server is screwed up - assuming "Not Found"')
            return None
        if reason.startswith('[Errno -2]'):
            return None
        if reason.startswith('[Errno -3]'):
            # Temporary name-resolution failure: warn, then fall through
            # to the final raise below.
            print('Check your internet connection. It seems gone.')
        if reason.startswith('[Errno 110]') or reason == 'timed out':
            print('Connection request has timed out - assuming "Not Found"')
            return None
        if reason.startswith('[Errno 111]'):
            print('Connection refused - assuming "Not Found"')
            return None
        raise
    rcode = response.getcode()
    # Collect response headers as lowercased-name -> list of values.
    headers = dict()
    for hk, hv in response.info().items():
        headers.setdefault(hk.lower(), list()).append(hv.strip())
    # Fold any received cookies back into the module-level jar.
    for cke in headers.get('set-cookie', []):
        if '=' not in cke:
            continue  # malformed Set-Cookie header; ignore it
        ckek, _, ckev = cke.partition('=')
        setCookie(ckek.strip(), ckev.split(';', 1)[0].strip())
    if rcode == 429:
        # Honor Retry-After when present; default to 5 seconds.
        tosleep = 5
        try:
            tosleep = int(headers['retry-after'][0])
        except (KeyError, ValueError):
            pass
        if tosleep < 1:
            tosleep = 1
        print('[URL] Got 429 (Too Many Requests): sleeping for %d seconds' % tosleep)
        print(' @ %s' % url)
        time.sleep(tosleep)
        return getUrlBytes(url, giveUpOn403)
    data = None
    if rcode == 200:
        data = response.read()
    response.close()
    return data
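
# Usage sketch (illustrative): fetch raw bytes, treating 403 as "Not Found"
# for hosts that reject anonymous requests:
#
#   raw = getUrlBytes('https://example.com/feed.xml', giveUpOn403=True)
#   if raw is not None:
#       handle(raw)  # `handle` is a hypothetical downstream consumer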


def getUrl(url):
    """Fetch a URL and decode the body as UTF-8; None when not found."""
    data = getUrlBytes(url)
    if data is None:
        return None  # guard: calling .decode() on None would raise AttributeError
    return data.decode('utf-8')
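

if __name__ == '__main__':
    # Minimal smoke-test sketch; the URL is an illustrative example.
    body = getUrl('https://example.com/')
    print(body[:200] if body else 'Not found')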