# reddit-image-wall-getter/reddit_imgs/system/simpleDownloader.py
#!/usr/bin/env python3
# -*- encoding: utf-8 -*-
import functools
import platform
import re
import subprocess
import time
import urllib.error
import urllib.parse
import urllib.request
from typing import Dict
RGX_MAJOR_MINOR_VER = re.compile(r'(\d+\.\d+)')
cookie: Dict[str, str] = dict()
def getFirefoxUserAgent():
r = subprocess.run(['firefox', '--version'], stdout=subprocess.PIPE, text=True, check=True)
ver = RGX_MAJOR_MINOR_VER.findall(r.stdout.strip())[0]
os_string = ''
if platform.system() == 'Linux':
os_string = 'X11; Linux x86_64; '
elif platform.system() == 'Darwin':
os_string = 'Macintosh; Intel Mac OS X 11.0; '
elif platform.system() == 'Windows':
os_string = 'Windows NT 10.0; Win64; x64; '
return f'Mozilla/5.0 ({os_string}rv:{ver}) Gecko/20100101 Firefox/{ver}'
def delCookie(cookiekey):
cookiekey = str(cookiekey)
2018-03-27 01:57:33 +00:00
if cookiekey in cookie:
del cookie[cookiekey]
def setCookie(cookiekey, cookieval):
cookieval = str(cookieval)
cookiekey = str(cookiekey)
2020-11-06 00:08:05 +00:00
if not cookiekey:
return
if not cookieval:
delCookie(cookiekey)
2017-12-29 22:54:22 +00:00
cookie[cookiekey] = cookieval
def getCookies():
return dict(cookie.items())
def patchCookies(newCookies):
for nk, nv in newCookies.items():
2020-11-06 00:08:05 +00:00
setCookie(nk, nv)
def cleanCookies():
global cookie
cookie = dict()
def setCookies(newCookies):
cleanCookies()
patchCookies(newCookies)
def getUrlBytes(url, giveUpOn403=False):
2017-12-29 22:54:22 +00:00
global cookie
2020-05-13 21:07:05 +00:00
request = urllib.request.Request(url.replace(' ', '%20'))
2018-01-12 03:41:18 +00:00
try:
url.encode('ascii')
except:
2020-11-06 00:08:05 +00:00
request = urllib.request.Request(urllib.parse.quote(url, safe='/%?#:'))
2021-01-23 06:15:31 +00:00
request.add_header('User-Agent', getFirefoxUserAgent())
2017-12-29 22:54:22 +00:00
if len(cookie):
2021-01-23 06:15:31 +00:00
request.add_header("Cookie", '; '.join(map('='.join, cookie.items())))
2017-12-29 22:54:22 +00:00
response = None
try:
2018-01-07 03:57:39 +00:00
response = urllib.request.urlopen(request, timeout=30)
2017-12-29 22:54:22 +00:00
except urllib.error.HTTPError as e:
if e.code == 429:
print('[URL] Got 429 (Too Many Requests): sleeping for 5 seconds')
2020-11-06 00:08:05 +00:00
print(' @ %s' % url)
2017-12-29 22:54:22 +00:00
time.sleep(5)
return getUrlBytes(url)
if e.code == 503:
print('[URL] Got 503 (Service Temporarily Unavailable): retrying after 5 seconds')
2020-11-06 00:08:05 +00:00
print(' @ %s' % url)
2017-12-29 22:54:22 +00:00
time.sleep(5)
return getUrlBytes(url)
2018-01-07 03:57:39 +00:00
if e.code == 403 and giveUpOn403:
print('[URL] Got 403 (Forbidden): assuming "Not Found"')
2020-11-06 00:08:05 +00:00
print(' @ %s' % url)
2018-01-07 03:57:39 +00:00
return None
elif e.code == 500:
print('[URL] Got 500 (Server Error): assuming "Not Found"')
return None
2020-05-13 21:07:05 +00:00
elif e.code == 410:
print('[URL] Got 410 (Gone): assuming "Not Found"')
return None
2017-12-29 22:54:22 +00:00
elif e.code == 404:
return None
elif e.code == 400:
return None
raise e
except urllib.error.URLError as e:
2018-01-07 03:57:39 +00:00
if str(e.reason).startswith('EOF occurred in violation of protocol ('):
print('Server doesn\'t know how to use HTTP properly - assuming "Not Found"')
return None
if str(e.reason).startswith('[SSL: CERTIFICATE'):
print('Their SSL certificate is screwed up - assuming "Not Found"')
return None
if str(e.reason).startswith('[Errno -5]'):
print('Their DNS server is screwed up - assuming "Not Found"')
return None
2017-12-29 22:54:22 +00:00
if str(e.reason).startswith('[Errno -2]'):
return None
if str(e.reason).startswith('[Errno -3]'):
print('Check your internet connection. It seems gone.')
2020-11-06 00:08:05 +00:00
if str(e.reason).startswith('[Errno 110]') or str(e.reason) == 'timed out':
2017-12-29 22:54:22 +00:00
print('Connection request has timed out - assuming "Not Found"')
return None
2020-11-06 00:08:05 +00:00
if str(e.reason).startswith('[Errno 111]') or str(e.reason) == 'timed out':
2017-12-29 22:54:22 +00:00
print('Connection refused - assuming "Not Found"')
return None
raise e
rcode = response.getcode()
rinfo = response.info()
headers = dict()
2020-11-06 00:08:05 +00:00
headers_l = list(map(lambda a: list(map(str.strip, a.split(':', 1))), str(rinfo).strip().splitlines()))
2017-12-29 22:54:22 +00:00
for header in headers_l:
k = header[0].lower()
v = header[1]
if k not in headers:
2020-11-06 00:08:05 +00:00
headers[k] = list()
2017-12-29 22:54:22 +00:00
headers[k].append(v)
del k
del v
del header
del headers_l
if 'set-cookie' in headers:
for cke in headers['set-cookie']:
2020-11-06 00:08:05 +00:00
ckek = cke.split('=', 1)[0].strip()
ckev = cke.split('=', 1)[1].split(';', 1)[0].strip()
setCookie(ckek, ckev)
2017-12-29 22:54:22 +00:00
del ckek
del ckev
del cke
if rcode == 429:
tosleep = 5
2020-11-06 00:08:05 +00:00
try:
tosleep = int(headers['retry-after'][0])
except:
pass
if tosleep < 1:
tosleep = 1
print('[URL] Got 429 (Too Many Requests): sleeping for %d seconds' % tosleep)
print(' @ %s' % url)
2017-12-29 22:54:22 +00:00
time.sleep(tosleep)
return getUrlBytes(url)
data = None
if rcode == 200:
data = response.read()
response.close()
return data
def getUrl(url):
return getUrlBytes(url).decode('utf-8')