@@ -0,0 +1,3 @@
**/*.pyc
**/__pycache__
**/__pycache__/**
@@ -0,0 +1,3 @@
**
!About.md
!.gitignore
@@ -0,0 +1 @@
This folder contains all images, grouped by discussion.
@@ -0,0 +1,3 @@
**
!About.md
!.gitignore
@@ -0,0 +1 @@
This folder holds the downloaded images, grouped by their Reddit discussion id. It is not meant to be user-friendly.
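As a rough sketch of the resulting layout (the discussion id below is illustrative; `meta.json` and the numbered files come from the downloader):

```
i/
└── t3_abc123/        <- one folder per Reddit discussion ("data-fullname")
    ├── meta.json     <- source link and extension of each downloaded file
    └── 0000.jpg      <- files numbered in download order
```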
@@ -0,0 +1,3 @@
**
!About.md
!.gitignore
@@ -0,0 +1,3 @@
This folder contains one folder per subreddit you want downloaded, named after the subreddit.
For example, to download "/r/photoshopbattles/", create a folder named "photoshopbattles" here.
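As a rough sketch (the subreddit name is just the example above; `subreddit.json` is written automatically by the "Get link list" step of the runner):

```
r/
└── photoshopbattles/
    └── subreddit.json
```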
@@ -0,0 +1,87 @@
#!/usr/bin/env python3
# -*- encoding: utf-8 -*-
from .system import downloader as downloaderModule
import json
import os
downloaderGetter = downloaderModule.getDownloader
wdir = os.path.abspath('.')
isImageDirectLink = lambda s: s.endswith('.jpg') or s.endswith('.png') or s.endswith('.gif') or s.endswith('.webp')
def main():
    # Gather every link collected by the sync step, across all monitored subreddits.
    links = list()
    subreddits = sorted(filter(lambda sr: os.path.isdir(os.path.join(wdir, 'r', sr)), os.listdir(os.path.join(wdir, 'r'))))
    for subreddit in subreddits:
        srf = os.path.abspath(os.path.join(wdir, 'r', subreddit, 'subreddit.json'))
        links2 = list()
        try:
            with open(srf) as f:
                links2 = json.loads(f.read())['links']
        except Exception:
            pass
        links += links2
        del links2
        del srf
        del subreddit
    del subreddits
    links.sort(key=lambda link: link['timestamp'])
    # Group links by source: direct image links together, everything else by domain.
    medias = {'direct_link': list()}
    for link in links:
        if isImageDirectLink(link['link']):
            medias['direct_link'].append(link)
            continue
        if link['domain'] not in medias:
            medias[link['domain']] = list()
        medias[link['domain']].append(link)
        del link
    del links
    # Drop domains that have no downloader module, keeping a tally for the report below.
    priorities = list()
    for source, links in sorted(medias.items()):
        downloaderClass = downloaderGetter(source)
        if downloaderClass is None:
            print('No downloader for: {0:<35} | {1:>5} links dropped'.format(source, len(links)))
            priorities.append((len(links), source))
            del medias[source]
            continue
    top_priorities = sorted(priorities, reverse=True)[:10]
    prioremain = sum(map(lambda a: a[0], sorted(priorities, reverse=True)[10:]))
    priolen = len(priorities)
    del priorities
    for source, links in sorted(medias.items()):
        print('Changing downloader for next %d links on %s' % (len(links), source))
        #if source!='imgur.com': continue
        downloaderClass = downloaderGetter(source)
        downloader = downloaderClass()
        for seq, link in enumerate(links):
            print('Downloading link #%05d of %05d: %s << %s' % (seq+1, len(links), link['link'], link['datakey']))
            if not downloader.recognizes(link['link']):
                continue
            target = os.path.join(wdir, 'i', link['datakey'])
            if not os.path.exists(target):
                downloaded = downloader.download(link['link'])
                if downloaded is not None:  # a downloader may return None on failure
                    downloaded.into(target)
    # Report the most requested domains that still lack a downloader module.
    print()
    print('='*47)
    print('| {0:^43} |'.format('Missing downloaders'))
    print('='*47)
    print('| {0:^30} | {1:^10} |'.format('Domain', 'Hits'))
    print('-'*47)
    for priority in top_priorities:
        print('| {0:^30} | {1:^10} |'.format(*list(reversed(priority))))
        del priority
    del top_priorities
    print('|'+'.'*32+'|'+'.'*12+'|')
    print('| {0:^30} | {1:^10} |'.format('...and %d more domains' % max(priolen - 10, 0), prioremain))
    del priolen
    print('='*47)
    print()
if __name__ == '__main__':
    main()
@@ -0,0 +1,97 @@
#!/usr/bin/env python3
# -*- encoding: utf-8 -*-
import os
import re
import json
import shutil
import datetime
def readAllFile(s):
    with open(s) as f:
        return f.read()
def slugify(dat):
    return re.sub(r'[^\w\s\.\-\(\)\[\]]', '-', dat)
wdir = os.path.abspath('.')
def main():
    idir = os.path.join(wdir, 'i')
    ddir = os.path.join(wdir, 'd')
    subreddits = sorted(filter(lambda sr: os.path.isdir(os.path.join(wdir, 'r', sr)), os.listdir(os.path.join(wdir, 'r'))))
    # Build (subreddit, links, destination folder) triples from each subreddit.json.
    subreddits = list(zip(
        subreddits,
        map(
            lambda a: a['links'],
            map(
                json.loads,
                map(
                    readAllFile,
                    map(
                        lambda sr: os.path.join(wdir, 'r', sr, 'subreddit.json'),
                        subreddits
                    )))),
        map(
            lambda sr: os.path.join(wdir, 'd', sr),
            subreddits
        )
    ))
    copyfiles = list()
    print('\r'+' '*79+'\r'+'Calculating changes...', end='')
    for subreddit, links, target in subreddits:
        sdir = os.path.join(ddir, subreddit)
        for link in links:
            imgd = os.path.join(idir, link['datakey'])
            meta = os.path.join(imgd, 'meta.json')
            if os.path.exists(meta):
                files = json.loads(readAllFile(meta))
                for seq, file in enumerate(files):
                    imgfrom = os.path.join(imgd, file['dname'])
                    # Compose a descriptive name: subreddit, date, nsfw flag, flair, sharer, title, datakey, sequence.
                    imgfn = ''
                    imgfn += subreddit
                    imgfn += '__'
                    imgfn += datetime.datetime.fromtimestamp(int(link['timestamp'])).isoformat().replace('T', '_').replace(':', '-')
                    imgfn += '_'
                    imgfn += 'nsfw' if link['nsfw'] else 'safe'
                    imgfn += '___'
                    imgfn += '-' if link['flair'] is None else slugify(link['flair'])
                    imgfn += '___'
                    imgfn += '-' if link['sharer'] is None else slugify(link['sharer'])
                    imgfn += '___'
                    imgfn += slugify(link['title'][:50])
                    imgfn += '___'
                    imgfn += slugify(link['datakey'])
                    imgfn += '___'
                    imgfn += str('%04d' % seq)
                    imgfn += '.'+file['ext']
                    imgto = os.path.join(sdir, imgfn)
                    copyfiles.append((imgfrom, imgto))
        del links
    lcf = len(copyfiles)
    for (cnt, (src, dst)) in enumerate(copyfiles):
        if os.path.exists(dst): continue
        container = os.path.dirname(os.path.abspath(dst))
        if not os.path.exists(container):
            os.makedirs(container)
        print('\r'+' '*79+'\r'+'%03d%% copied: %05d of %05d' % ((((cnt+1)/lcf)*100)//1, cnt+1, lcf), end='')
        try:
            shutil.copyfile(src, dst)
        except KeyboardInterrupt as e:
            # Do not leave a half-copied file behind if the user aborts.
            print()
            print('\r'+' '*79+'\r'+'Deleting interrupted file...', end='')
            os.remove(dst)
            print('\r'+' '*79+'\r'+'Aborted safely', end='')
            print()
            raise e
    print('\r'+' '*79+'\r'+'Done.', end='')
    print()
if __name__ == '__main__':
    main()
@@ -0,0 +1,104 @@
#!/usr/bin/env python3
# -*- encoding: utf-8 -*-
import reddit_imgs.sync
import reddit_imgs.fetch
import reddit_imgs.reorganize
import os
import shutil
wdir = os.path.abspath('.')
def ensureFolderAvailability():
    # Create the working folders used by the other steps, if they do not exist yet.
    if not os.path.exists(os.path.join(wdir, 'a')):
        os.makedirs(os.path.join(wdir, 'a'))
    if not os.path.exists(os.path.join(wdir, 'd')):
        os.makedirs(os.path.join(wdir, 'd'))
    if not os.path.exists(os.path.join(wdir, 'i')):
        os.makedirs(os.path.join(wdir, 'i'))
    if not os.path.exists(os.path.join(wdir, 'r')):
        os.makedirs(os.path.join(wdir, 'r'))
def managesubreddits():
    i = ''
    while i != '0':
        print('\n'*100)
        print('----------------------------------------------')
        print('              Subreddit Manager               ')
        print('----------------------------------------------')
        print('1) List monitored subreddits')
        print('2) Add monitored subreddit')
        print('3) Remove monitored subreddit')
        print()
        print('0) Back')
        print('----------------------------------------------')
        print()
        print('Enter your choice:')
        i = input()
        i = i.strip()
        print()
        print()
        subreddits_dir = os.path.join(wdir, 'r')
        subreddits_isfolder = lambda sr: os.path.isdir(os.path.join(subreddits_dir, sr))
        subreddits = sorted(filter(subreddits_isfolder, os.listdir(subreddits_dir)))
        if i == '1' or i == '3':
            print('Subreddits monitored:')
            for sr in subreddits:
                print('/r/%s' % sr)
            print()
            if i == '1':
                print('Press enter to continue')
                input()
            if i == '3':
                print('Enter the subreddit you want to get rid of:')
                rem = input('/r/')
                try:
                    shutil.rmtree(os.path.join(subreddits_dir, rem))
                except Exception:
                    pass
                print()
                print('Done.')
                print('Press enter to continue')
                input()
        elif i == '2':
            print('Enter the subreddit you want to add:')
            add = input('/r/')
            try:
                os.makedirs(os.path.join(subreddits_dir, add))
            except Exception:
                pass
            print()
            print('Done.')
            print('Press enter to continue')
            input()
def mainmenu():
    i = ''
    while i != '0':
        print('\n'*100)
        print('----------------------------------------------')
        print('           Reddit Image Downloader            ')
        print('----------------------------------------------')
        print('1) Manage subreddits')
        print('2) Get link list to be downloaded from reddit')
        print('3) Download grabbed links')
        print('4) Group and give nice names to downloaded data')
        print()
        print('0) Quit')
        print('----------------------------------------------')
        print()
        print('Enter your choice:')
        i = input()
        i = i.strip()
        if i == '1':
            managesubreddits()
        elif i == '2':
            reddit_imgs.sync.main()
        elif i == '3':
            reddit_imgs.fetch.main()
        elif i == '4':
            reddit_imgs.reorganize.main()
def main():
    ensureFolderAvailability()
    mainmenu()
if __name__ == '__main__':
    main()
@@ -0,0 +1,47 @@
#!/usr/bin/env python3
# -*- encoding: utf-8 -*-
import os
from bs4 import BeautifulSoup as _BS
from .system import simpleDownloader
from .system.subredditTools import getEmptySubredditData, getSubredditPageInfo
import json
def BeautifulSoup(data): return _BS(data, 'html5lib')
simpleDownloader.setCookies({'over18': 1})  # so NSFW subreddits can be listed
wdir = os.path.abspath('.')
def main():
    subreddits = sorted(filter(lambda sr: os.path.isdir(os.path.join(wdir, 'r', sr)), os.listdir(os.path.join(wdir, 'r'))))
    for subreddit in subreddits:
        srp = os.path.abspath(os.path.join(wdir, 'r', subreddit))
        #if subreddit!='yiff': continue
        nextpage = 'https://www.reddit.com/r/'+subreddit+'/new/?count=0'
        srdt = getEmptySubredditData(subreddit)
        try:
            with open(os.path.join(srp, 'subreddit.json')) as f:
                srdt = json.loads(f.read())
        except Exception:
            pass
        #srdt = getEmptySubredditData(subreddit)
        pageno = 0
        while nextpage:
            pageno += 1
            print(('/r/{0:<20} loading page #%05d' % pageno).format(subreddit))
            print(' >> %s' % nextpage)
            redditBytes = simpleDownloader.getUrlBytes(nextpage)
            if redditBytes is None:
                # The listing page could not be fetched; stop paging this subreddit.
                break
            bs = BeautifulSoup(redditBytes)
            first, last, nextpage, links = getSubredditPageInfo(bs)
            if srdt['date_last'] <= last:
                # Reached the posts that were already collected on a previous run.
                nextpage = None
            srdt['date_first'] = max(first, srdt['date_first'])
            srdt['date_last'] = min(last, srdt['date_last'])
            for link in links[::-1]:
                if link not in srdt['links']:
                    srdt['links'].append(link)
        with open(os.path.join(srp, 'subreddit.json'), 'w') as f:
            f.write(json.dumps(srdt, sort_keys=True, indent=2))
if __name__ == '__main__':
    main()
@@ -0,0 +1,21 @@
#!/usr/bin/env python3
# -*- encoding: utf-8 -*-
import importlib
import os
# Discover every downloader module inside the "modules" folder and index it by name.
modules_map = dict()
moduleNames = os.listdir(os.path.join(os.path.dirname(os.path.abspath(__file__)), 'modules'))
moduleNames = list(map(lambda a: a[:-3], filter(lambda a: a.endswith('.py'), moduleNames)))
for moduleName in moduleNames:
    modules_map[moduleName] = importlib.import_module('.modules.'+moduleName, package=__package__)
def getDownloader(domain):
    # Return the downloader class of the first module that claims the given domain, if any.
    for module in modules_map.values():
        try:
            if module.works_on(domain):
                return module.get_class()
        except Exception:
            pass
    return None
@@ -0,0 +1,48 @@
#!/usr/bin/env python3
# -*- encoding: utf-8 -*-
import filetype
import shutil
import json
import os
class DownloadedData(object):
    """In-memory collection of downloaded files plus the metadata needed to save them."""
    def __init__(self):
        self.control = list()
        self.fb = dict()
    def put(self, link, downloaded, ext=None):
        # Guess the extension from the link, falling back to content sniffing.
        if ext is None:
            try:
                ext = link.rsplit('/', 1)[-1].rsplit('.', 1)[-1]
                if ext not in ['jpg', 'png', 'gif', 'webp']:
                    raise Exception
            except Exception:
                ext = filetype.guess_extension(downloaded)
                if ext is None:
                    ext = 'unk'
        fnm = '%04d.%s' % (len(self.control), ext)
        self.control.append({
            'dname': fnm,
            'link': link,
            'ext': ext,
        })
        self.fb[fnm] = downloaded
    def into(self, directory):
        # Write meta.json and every stored file into the target directory.
        directory = os.path.abspath(directory)
        if not os.path.exists(directory):
            os.makedirs(directory)
        try:
            with open(os.path.join(directory, 'meta.json'), 'w') as f:
                f.write(json.dumps(self.control, sort_keys=True, indent=2))
            for fnm, dtb in self.fb.items():
                with open(os.path.join(directory, fnm), 'wb') as f:
                    f.write(dtb)
        except KeyboardInterrupt as e:
            # Remove the partially written directory so the next run retries it.
            shutil.rmtree(directory)
            raise e
    def merge(self, other):
        for oitem in other.control:
            self.put(oitem['link'], other.fb[oitem['dname']], oitem['ext'])
    def bulk_merge(self, others):
        for other in others:
            self.merge(other)
@@ -0,0 +1,34 @@
#!/usr/bin/env python3
# -*- encoding: utf-8 -*-
from ..downloadedData import DownloadedData
from ... import simpleDownloader
def works_on(domain):
    return domain == 'direct_link'
class DirectLink(object):
    def recognizes(self, link):
        if (
            link.startswith('http://u18chan.com/')
            or
            link.startswith('https://u18chan.com/')
            or
            link.startswith('http://dl.dropboxusercontent.com')
            or
            link.startswith('https://dl.dropboxusercontent.com')
        ):
            return False
        return True
    def download(self, link):
        dd = DownloadedData()
        simpleDownloader.cleanCookies()
        bts = simpleDownloader.getUrlBytes(link)
        simpleDownloader.cleanCookies()
        if bts is not None:
            dd.put(link, bts)
        return dd
def get_class():
    return DirectLink
@@ -0,0 +1,88 @@
#!/usr/bin/env python3
# -*- encoding: utf-8 -*-
import re
import json
import filetype
from ..downloadedData import DownloadedData
from ... import simpleDownloader
def works_on(domain):
    return domain in ['i.imgur.com', 'imgur.com', 'm.imgur.com', 'www.imgur.com']
class ImgurCom(object):
    def recognizes(self, link):
        return True
    def download(self, link):
        dd = DownloadedData()
        simpleDownloader.cleanCookies()
        bts = b''
        if '/a/' not in link and '.gifv' not in link and '.webm' not in link:
            bts = simpleDownloader.getUrlBytes(link)
        if bts is not None:
            ext = filetype.guess_extension(bts)
            if ext is None:
                # Not a single image: resolve the album / gallery page instead.
                if '.gifv' in link or '.webm' in link:
                    bts = None
                print(' '*50, end='')
                print('\r', end='')
                print(' `--> It wasn\'t a single image...', end='')
                print('\r', end='')
                match = re.match(
                    r"(https?)://(www\.)?(i\.|m\.|www\.)?imgur\.com/(?:(a|gallery|r)/)?(\w*)/?(\w*)(#[0-9]+)?(\.\w*)?",
                    link
                )
                tp = match.group(4)
                ky = None
                if tp != 'r':
                    ky = match.group(5)
                else:
                    ky = match.group(6)
                    if not ky:
                        ky = match.group(5)
                link2 = 'https://imgur.com/a/'+str(ky)+'/all'
                if tp is None or tp == '' or tp == 'r':
                    link2 = link2.replace('/a/', '/')[:-4]
                print(' '*50, end='')
                print('\r', end='')
                if link2.endswith('/all') or bts is None:
                    print(' `--> Fetching album image list...', end='')
                    bts = simpleDownloader.getUrlBytes(link2)
                else:
                    print(' `--> Album image list already fetched...', end='')
                print('\r', end='')
                if bts is None:
                    print(' '*50, end='')
                    print('\r', end='')
                    print(' `--> Gallery not found')
                    return DownloadedData()
                # The gallery metadata is embedded in the page inside a "widgetFactory.mergeConfig" script.
                html = bts.decode('utf-8')
                albnfo = json.loads(list(filter(lambda f: f.startswith('image'), map(str.strip, list(filter(lambda f: f.startswith("('gallery', {"), html.split('widgetFactory.mergeConfig')))[0].strip().splitlines())))[0][6:-1].strip()[1:].strip())
                imgs = [albnfo]
                if 'album_images' in albnfo:
                    imgs = albnfo['album_images']['images']
                for seq, img in enumerate(imgs):
                    print(' '*50, end='')
                    print('\r', end='')
                    print(' `--> Album image #%03d of %03d' % (seq+1, len(imgs)), end='')
                    print('\r', end='')
                    if img['ext'] == '.gifv':
                        img['ext'] = '.mp4'
                    durl = 'http://i.imgur.com/'+img['hash']+img['ext']
                    imb = simpleDownloader.getUrlBytes(durl)
                    if imb is None:
                        print()
                        print('Album part failed')
                        print()
                        simpleDownloader.cleanCookies()
                        return None
                    dd.put(durl, imb, img['ext'][1:])
                    print('\r', end='')
            else:
                dd.put(link, bts, ext)
        simpleDownloader.cleanCookies()
        return dd
def get_class():
    return ImgurCom
@@ -0,0 +1,5 @@
#!/usr/bin/env python3
# -*- encoding: utf-8 -*-
minint = -2**31
maxint = 2**31 - 1
@@ -0,0 +1,7 @@
#!/usr/bin/env python3
# -*- encoding: utf-8 -*-
class objectify(object):
    @property
    def json(self): return self.__dict__
    def __init__(self, data): self.__dict__ = data
@@ -0,0 +1,115 @@
#!/usr/bin/env python3
# -*- encoding: utf-8 -*-
import time
import urllib.request
import urllib.error
cookie = dict()
def delCookie(cookiekey):
    cookiekey = str(cookiekey)
    cookie.pop(cookiekey, None)
def setCookie(cookiekey, cookieval):
    cookieval = str(cookieval)
    cookiekey = str(cookiekey)
    if not cookiekey:
        return
    if not cookieval:
        # An empty value means the cookie should be dropped, not stored.
        delCookie(cookiekey)
        return
    cookie[cookiekey] = cookieval
def getCookies():
    return dict(cookie.items())
def patchCookies(newCookies):
    for nk, nv in newCookies.items():
        setCookie(nk, nv)
def cleanCookies():
    global cookie
    cookie = dict()
def setCookies(newCookies):
    cleanCookies()
    patchCookies(newCookies)
def getUrlBytes(url):
    global cookie
    request = urllib.request.Request(url)
    request.add_header('User-Agent', 'Mozilla/5.0 (X11; Linux x86_64) '+
                       'AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu '+
                       'Chromium/63.0.3239.84 Chrome/63.0.3239.84 '+
                       'Safari/537.36'
                       )
    if len(cookie):
        request.add_header("Cookie", '; '.join(map(lambda a: '='.join(a), cookie.items())))
    response = None
    try:
        response = urllib.request.urlopen(request, timeout=15)
    except urllib.error.HTTPError as e:
        if e.code == 429:
            print('[URL] Got 429 (Too Many Requests): sleeping for 5 seconds')
            print(' @ %s' % url)
            time.sleep(5)
            return getUrlBytes(url)
        if e.code == 503:
            print('[URL] Got 503 (Service Temporarily Unavailable): retrying after 5 seconds')
            print(' @ %s' % url)
            time.sleep(5)
            return getUrlBytes(url)
        elif e.code == 404:
            return None
        elif e.code == 400:
            return None
        raise e
    except urllib.error.URLError as e:
        if str(e.reason).startswith('[Errno -2]'):
            return None
        if str(e.reason).startswith('[Errno -3]'):
            print('Check your internet connection. It seems gone.')
        if str(e.reason).startswith('[Errno 110]') or str(e.reason) == 'timed out':
            print('Connection request has timed out - assuming "Not Found"')
            return None
        if str(e.reason).startswith('[Errno 111]') or str(e.reason) == 'timed out':
            print('Connection refused - assuming "Not Found"')
            return None
        raise e
    rcode = response.getcode()
    rinfo = response.info()
    # Parse the response headers into a dict of lowercased name -> list of values.
    headers = dict()
    headers_l = list(map(lambda a: list(map(str.strip, a.split(':', 1))), str(rinfo).strip().splitlines()))
    for header in headers_l:
        k = header[0].lower()
        v = header[1]
        if k not in headers:
            headers[k] = list()
        headers[k].append(v)
        del k
        del v
        del header
    del headers_l
    # Remember any cookies the server sets, for the subsequent requests.
    if 'set-cookie' in headers:
        for cke in headers['set-cookie']:
            ckek = cke.split('=', 1)[0].strip()
            ckev = cke.split('=', 1)[1].split(';', 1)[0].strip()
            setCookie(ckek, ckev)
            del ckek
            del ckev
            del cke
    if rcode == 429:
        tosleep = 5
        try:
            tosleep = int(headers['retry-after'][0])
        except Exception:
            pass
        if tosleep < 1:
            tosleep = 1
        print('[URL] Got 429 (Too Many Requests): sleeping for %d seconds' % tosleep)
        print(' @ %s' % url)
        time.sleep(tosleep)
        return getUrlBytes(url)
    data = None
    if rcode == 200:
        data = response.read()
    response.close()
    return data
def getUrl(url):
    return getUrlBytes(url).decode('utf-8')
@@ -0,0 +1,54 @@
#!/usr/bin/env python3
# -*- encoding: utf-8 -*-
import dateutil.parser
from .limits import minint, maxint
def getInfoFromRedditItem(bs):
    # Extract the relevant fields from one "thing" entry of a subreddit listing page.
    nsfw = 'over18' in bs['class']
    sharer = bs.find(class_='author').text.strip()
    title = bs.find('a', class_='title').text.strip()
    link = str(bs.find('a', class_='title')['href'])
    domain = bs.find('span', class_='domain').find('a').text.strip()
    datakey = bs['data-fullname']
    timestamp = int(dateutil.parser.parse(bs.find('time')['datetime']).timestamp())
    flair = None
    try:
        flair = bs.find('span', class_='linkflairlabel').text.strip()
    except Exception:
        pass
    return {
        'nsfw': nsfw,
        'link': link,
        'title': title,
        'flair': flair,
        'sharer': sharer,
        'domain': domain,
        'datakey': datakey,
        'timestamp': timestamp,
    }
def getEmptySubredditData(srname):
    return {
        'subreddit': srname,
        'date_first': minint,
        'date_last': maxint,
        'links': list()
    }
def getSubredditPageInfo(bs):
    # Return (first timestamp, last timestamp, next page url, structured links) for one listing page.
    pagetable = bs.find(id='siteTable')
    discussions = pagetable.find_all(
        lambda a: a.has_attr('class') and
        'thing' in a['class']
    )
    links = list(filter(lambda a: 'self' not in a['class'], discussions))
    first = minint
    last = maxint
    try:
        first = int(dateutil.parser.parse(discussions[0].find('time')['datetime']).timestamp())
    except Exception:
        pass
    try:
        last = int(dateutil.parser.parse(discussions[-1].find('time')['datetime']).timestamp())
    except Exception:
        pass
    nextpage = None
    try:
        nextpage = bs.find('div', class_='nav-buttons').find(class_='nextprev').find(class_='next-button').find('a')['href']
    except Exception:
        pass
    structured_links = list(map(getInfoFromRedditItem, links))
    return first, last, nextpage, structured_links
@@ -0,0 +1,7 @@
#!/usr/bin/env python3
# -*- encoding: utf-8 -*-
import reddit_imgs.runner
if __name__ == '__main__':
    reddit_imgs.runner.main()
@@ -0,0 +1,4 @@
filetype==1.0.0
beautifulsoup4
html5lib
python-dateutil