commit b8629c749021202ade6e7b3d21d22ca8b699a7ab
Author: Ádler Neves
Date:   Fri Dec 29 20:54:22 2017 -0200

    initial commit

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..1615e04
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,3 @@
+**/*.pyc
+**/__pycache__
+**/__pycache__/**
diff --git a/d/.gitignore b/d/.gitignore
new file mode 100644
index 0000000..b66ba8a
--- /dev/null
+++ b/d/.gitignore
@@ -0,0 +1,3 @@
+**
+!About.md
+!.gitignore
diff --git a/d/About.md b/d/About.md
new file mode 100644
index 0000000..05844bd
--- /dev/null
+++ b/d/About.md
@@ -0,0 +1 @@
+This folder contains all images, grouped by discussion.
diff --git a/i/.gitignore b/i/.gitignore
new file mode 100644
index 0000000..b66ba8a
--- /dev/null
+++ b/i/.gitignore
@@ -0,0 +1,3 @@
+**
+!About.md
+!.gitignore
diff --git a/i/About.md b/i/About.md
new file mode 100644
index 0000000..b8f68f2
--- /dev/null
+++ b/i/About.md
@@ -0,0 +1 @@
+This folder holds the downloaded images, grouped by their reddit discussion id. Not user friendly.
diff --git a/r/.gitignore b/r/.gitignore
new file mode 100644
index 0000000..b66ba8a
--- /dev/null
+++ b/r/.gitignore
@@ -0,0 +1,3 @@
+**
+!About.md
+!.gitignore
diff --git a/r/About.md b/r/About.md
new file mode 100644
index 0000000..6d9fdab
--- /dev/null
+++ b/r/About.md
@@ -0,0 +1,3 @@
+This folder contains one folder per subreddit you want downloaded, named after the subreddit.
+
+As an example, to download "/r/photoshopbattles/" you create a folder named "photoshopbattles".
diff --git a/reddit_imgs/fetch.py b/reddit_imgs/fetch.py
new file mode 100755
index 0000000..1fa795c
--- /dev/null
+++ b/reddit_imgs/fetch.py
@@ -0,0 +1,87 @@
+#!/usr/bin/env python3
+# -*- encoding: utf-8 -*-
+
+from .system import downloader as downloaderModule
+import json
+import os
+
+downloaderGetter = downloaderModule.getDownloader
+
+wdir = os.path.abspath('.')
+
+isImageDirectLink = lambda s: s.endswith('.jpg') or s.endswith('.png') or s.endswith('.gif') or s.endswith('.webp')
+
+def main():
+    links = list()
+    subreddits = sorted(filter(lambda sr: os.path.isdir(os.path.join(wdir,'r',sr)), os.listdir(os.path.join(wdir,'r'))))
+    for subreddit in subreddits:
+        srf = os.path.abspath(os.path.join(wdir,'r',subreddit,'subreddit.json'))
+        links2 = list()
+        try:
+            with open(srf) as f:
+                links2 = json.loads(f.read())['links']
+        except: pass
+        links+=links2
+        del links2
+        del srf
+        del subreddit
+    del subreddits
+
+    links.sort(key=lambda link: link['timestamp'])
+
+    medias = dict((('direct_link',list()),))
+    for link in links:
+        if isImageDirectLink(link['link']):
+            medias['direct_link'].append(link)
+            continue
+        if link['domain'] not in medias:
+            medias[link['domain']] = list()
+        medias[link['domain']].append(link)
+        del link
+    del links
+
+    priorities = list()
+    for source, links in sorted(medias.items()):
+        downloaderClass = downloaderGetter(source)
+        if downloaderClass is None:
+            print('No downloader for: {0:<35} | {1:>5} links dropped'.format(source,len(links)))
+            priorities.append((len(links),source))
+            del medias[source]
+            continue
+
+    top_priorities = list(reversed(sorted(priorities)))[:10]
+    prioremain = sum(map(lambda a: a[0], list(reversed(sorted(priorities)))[10:]))
+    priolen = len(priorities)
+    del priorities
+
+    for source, links in sorted(medias.items()):
+        print('Changing downloader for next %d links on %s'%(len(links),source))
+        #if source!='imgur.com': continue
+        downloaderClass = downloaderGetter(source)
+        downloader = downloaderClass()
+        for seq, link in enumerate(links):
+            print('Downloading link #%05d of %05d: %s << %s'%(seq+1, len(links), link['link'], link['datakey']))
+            if not downloader.recognizes(link['link']):
+                continue
+            target = os.path.join(wdir,'i',link['datakey'])
+            if not os.path.exists(target):
+                downloader.download(link['link']).into(target)
+
+    print()
+    print('='*47)
+    print('| {0:^43} |'.format('Missing downloaders'))
+    print('='*47)
+    print('| {0:^30} | {1:^10} |'.format('Domain','Hits'))
+    print('-'*47)
+    for priority in top_priorities:
+        print('| {0:^30} | {1:^10} |'.format(*list(reversed(priority))))
+        del priority
+    del top_priorities
+    print('|'+'.'*32+'|'+'.'*12+'|')
+    print('| {0:^30} | {1:^10} |'.format('...and more %d domains'%(priolen-10), prioremain))
+    del priolen
+    print('='*47)
+    print()
+
+if __name__ == '__main__':
+    main()
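fetch.py resolves one downloader per link domain and drives it through a small implicit interface: works_on() and get_class() at module level, then recognizes() and download() on the instance, whose result is persisted with .into(target). The following minimal module is a sketch of that contract; the example.com domain is hypothetical, and the body simply mirrors the direct_link module added later in this commit.

# reddit_imgs/system/downloader/modules/example_com.py (hypothetical)
from ..downloadedData import DownloadedData
from ... import simpleDownloader

def works_on(domain):
    # fetch.py passes either a hostname or the pseudo-domain 'direct_link'
    return domain == 'example.com'

class ExampleCom(object):
    def recognizes(self, link):
        return True

    def download(self, link):
        dd = DownloadedData()
        bts = simpleDownloader.getUrlBytes(link)
        if bts is not None:
            dd.put(link, bts)
        return dd  # fetch.py then calls .into() with the target folder under i/

def get_class():
    return ExampleCom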
diff --git a/reddit_imgs/reorganize.py b/reddit_imgs/reorganize.py
new file mode 100755
index 0000000..2deb27d
--- /dev/null
+++ b/reddit_imgs/reorganize.py
@@ -0,0 +1,97 @@
+#!/usr/bin/env python3
+# -*- encoding: utf-8 -*-
+
+import os
+import re
+import json
+import shutil
+import datetime
+
+def readAllFile(s):
+    with open(s) as f:
+        return f.read()
+def slugify(dat):
+    return re.sub(r'[^\w\s\.\-\(\)\[\]]', '-', dat)
+
+wdir = os.path.abspath('.')
+
+def main():
+    idir = os.path.join(wdir, 'i')
+    ddir = os.path.join(wdir, 'd')
+
+    subreddits = sorted(filter(lambda sr: os.path.isdir(os.path.join(wdir,'r',sr)), os.listdir(os.path.join(wdir,'r'))))
+
+    subreddits = list(zip(
+        subreddits,
+        map(
+            lambda a: a['links'],
+            map(
+                json.loads,
+                map(
+                    readAllFile,
+                    map(
+                        lambda sr: os.path.join(wdir,'r',sr,'subreddit.json'),
+                        subreddits
+                    )))),
+        map(
+            lambda sr: os.path.join(wdir,'d',sr),
+            subreddits
+        )
+    ))
+
+    copyfiles = list()
+
+    print('\r'+' '*79+'\r'+'Calculating changes...',end='')
+
+    for subreddit, links, target in subreddits:
+        sdir = os.path.join(ddir,subreddit)
+        for link in links:
+            imgd = os.path.join(idir, link['datakey'])
+            meta = os.path.join(imgd, 'meta.json')
+            if os.path.exists(meta):
+                files = json.loads(readAllFile(meta))
+                for seq, file in enumerate(files):
+                    imgfrom = os.path.join(imgd, file['dname'])
+                    imgfn = ''
+                    imgfn+= subreddit
+                    imgfn+= '__'
+                    imgfn+= datetime.datetime.fromtimestamp(int(link['timestamp'])).isoformat().replace('T','_').replace(':','-')
+                    imgfn+= '_'
+                    imgfn+= 'nsfw' if link['nsfw'] else 'safe'
+                    imgfn+= '___'
+                    imgfn+= '-' if link['flair'] is None else slugify(link['flair'])
+                    imgfn+= '___'
+                    imgfn+= '-' if link['sharer'] is None else slugify(link['sharer'])
+                    imgfn+= '___'
+                    imgfn+= slugify(link['title'][:50])
+                    imgfn+= '___'
+                    imgfn+= slugify(link['datakey'])
+                    imgfn+= '___'
+                    imgfn+= str('%04d'%seq)
+                    imgfn+= '.'+file['ext']
+                    imgto = os.path.join(sdir,imgfn)
+                    copyfiles.append((imgfrom,imgto))
+        del links
+
+    lcf = len(copyfiles)
+    for (cnt, (src, dst)) in enumerate(copyfiles):
+        if os.path.exists(dst): continue
+        container = os.path.dirname(os.path.abspath(dst))
+        if not os.path.exists(container):
+            os.makedirs(container)
+        print('\r'+' '*79+'\r'+'%03d%% copied: %05d of %05d'%((((cnt+1)/lcf)*100)//1, cnt+1, lcf),end='')
+        try:
+            shutil.copyfile(src, dst)
+        except KeyboardInterrupt as e:
+            print()
+            print('\r'+' '*79+'\r'+'Deleting interrupted file...',end='')
+            os.remove(dst)
+            print('\r'+' '*79+'\r'+'Aborted safely',end='')
+            print()
+            raise e
+
+    print('\r'+' '*79+'\r'+'Done.',end='')
+    print()
+
+if __name__ == '__main__':
+    main()
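The filename that reorganize.py assembles packs the subreddit, timestamp, NSFW flag, flair, sharer, title, datakey and sequence number into one name. Below is a standalone sketch of that scheme; the link record fields match what sync.py stores in subreddit.json, but the sample values are made up.

import re
import datetime

def slugify(dat):
    return re.sub(r'[^\w\s\.\-\(\)\[\]]', '-', dat)

def build_name(subreddit, link, seq, ext):
    stamp = datetime.datetime.fromtimestamp(int(link['timestamp'])).isoformat().replace('T', '_').replace(':', '-')
    return (subreddit + '__' + stamp + '_'
            + ('nsfw' if link['nsfw'] else 'safe') + '___'
            + ('-' if link['flair'] is None else slugify(link['flair'])) + '___'
            + ('-' if link['sharer'] is None else slugify(link['sharer'])) + '___'
            + slugify(link['title'][:50]) + '___'
            + slugify(link['datakey']) + '___'
            + ('%04d' % seq) + '.' + ext)

# Illustrative values only.
print(build_name('photoshopbattles',
                 {'timestamp': 1514582062, 'nsfw': False, 'flair': None,
                  'sharer': 'some_user', 'title': 'PsBattle: a cat', 'datakey': 't3_abcdef'},
                 0, 'jpg'))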
diff --git a/reddit_imgs/runner.py b/reddit_imgs/runner.py
new file mode 100755
index 0000000..ad52f3c
--- /dev/null
+++ b/reddit_imgs/runner.py
@@ -0,0 +1,104 @@
+#!/usr/bin/env python3
+# -*- encoding: utf-8 -*-
+
+import reddit_imgs.sync
+import reddit_imgs.fetch
+import reddit_imgs.reorganize
+
+import os
+import shutil
+wdir = os.path.abspath('.')
+
+def ensureFolderAvailability():
+    if not os.path.exists(os.path.join(wdir,'a')):
+        os.makedirs(os.path.join(wdir,'a'))
+    if not os.path.exists(os.path.join(wdir,'d')):
+        os.makedirs(os.path.join(wdir,'d'))
+    if not os.path.exists(os.path.join(wdir,'i')):
+        os.makedirs(os.path.join(wdir,'i'))
+    if not os.path.exists(os.path.join(wdir,'r')):
+        os.makedirs(os.path.join(wdir,'r'))
+
+def managesubreddits():
+    i = ''
+    while i!='0':
+        print('\n'*100)
+        print('----------------------------------------------')
+        print(' Subreddit Manager ')
+        print('----------------------------------------------')
+        print('1) List monitored subreddits')
+        print('2) Add monitored subreddit')
+        print('3) Remove monitored subreddit')
+        print()
+        print('0) Back')
+        print('----------------------------------------------')
+        print()
+        print('Enter your choice:')
+        i = input()
+        i = i.strip()
+        print()
+        print()
+        subreddits_dir = os.path.join(wdir,'r')
+        subreddits_isfolder = lambda sr: os.path.isdir(os.path.join(subreddits_dir,sr))
+        subreddits = sorted(filter(subreddits_isfolder, os.listdir(subreddits_dir)))
+        if i=='1' or i=='3':
+            print('Subreddits monitored:')
+            for sr in subreddits:
+                print('/r/%s'%sr)
+            print()
+            if i=='1':
+                print('Press enter to continue')
+                input()
+            if i=='3':
+                print('Enter the subreddit you want to get rid of:')
+                rem = input('/r/')
+                try: shutil.rmtree(os.path.join(subreddits_dir,rem))
+                except: pass
+                print()
+                print('Done.')
+                print('Press enter to continue')
+                input()
+        elif i=='2':
+            print('Enter the subreddit you want to add:')
+            add = input('/r/')
+            try: os.makedirs(os.path.join(subreddits_dir,add))
+            except: pass
+            print()
+            print('Done.')
+            print('Press enter to continue')
+            input()
+
+def mainmenu():
+    i = ''
+    while i!='0':
+        print('\n'*100)
+        print('----------------------------------------------')
+        print(' Reddit Image Downloader ')
+        print('----------------------------------------------')
+        print('1) Manage subreddits')
+        print('2) Get link list to be downloaded from reddit')
+        print('3) Download grabbed links')
+        print('4) Group and put nice names on downloaded data')
+        print()
+        print('0) Quit')
+        print('----------------------------------------------')
+        print()
+        print('Enter your choice:')
+        i = input()
+        i = i.strip()
+        if i=='1':
+            managesubreddits()
+        elif i=='2':
+            reddit_imgs.sync.main()
+        elif i=='3':
+            reddit_imgs.fetch.main()
+        elif i=='4':
+            reddit_imgs.reorganize.main()
+
+def main():
+    ensureFolderAvailability()
+    mainmenu()
+
+if __name__ == '__main__':
+    main()
+
diff --git a/reddit_imgs/sync.py b/reddit_imgs/sync.py
new file mode 100755
index 0000000..e66d065
--- /dev/null
+++ b/reddit_imgs/sync.py
@@ -0,0 +1,47 @@
+#!/usr/bin/env python3
+# -*- encoding: utf-8 -*-
+
+import os
+from bs4 import BeautifulSoup as _BS
+from .system import simpleDownloader
+from .system.subredditTools import getEmptySubredditData, getSubredditPageInfo
+import json
+
+def BeautifulSoup(data): return _BS(data, 'html5lib')
+
+simpleDownloader.setCookies({'over18':1})
+
+wdir = os.path.abspath('.')
+
+def main():
+    subreddits = sorted(filter(lambda sr: os.path.isdir(os.path.join(wdir,'r',sr)), os.listdir(os.path.join(wdir,'r'))))
+    for subreddit in subreddits:
+        srp = os.path.abspath(os.path.join(wdir,'r',subreddit))
+        #if subreddit!='yiff': continue
+        nextpage = 'https://www.reddit.com/r/'+subreddit+'/new/?count=0'
+        srdt = getEmptySubredditData(subreddit)
+        try:
+            with open(os.path.join(srp,'subreddit.json')) as f:
+                srdt = json.loads(f.read())
+        except: pass
+        #srdt = getEmptySubredditData(subreddit)
+        pageno = 0
+        while nextpage:
+            pageno+=1
+            print(('/r/{0:<20} loading page #%05d'%pageno).format(subreddit))
+            print(' >> %s'%nextpage)
+            redditBytes = simpleDownloader.getUrlBytes(nextpage)
+            bs = BeautifulSoup(redditBytes)
+            first, last, nextpage, links = getSubredditPageInfo(bs)
+            if srdt['date_last'] <= last:
+                nextpage = None
+            srdt['date_first'] = max(first, srdt['date_first'])
+            srdt['date_last'] = min(last, srdt['date_last'])
+            for link in links[::-1]:
+                if link not in srdt['links']:
+                    srdt['links'].append(link)
+        with open(os.path.join(srp,'subreddit.json'),'w') as f:
+            f.write(json.dumps(srdt ,sort_keys=True, indent=2))
+
+if __name__ == '__main__':
+    main()
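runner.py only wraps the three pipeline steps in a menu (option 2 runs sync, 3 runs fetch, 4 runs reorganize). A non-interactive sketch of the same flow, assuming it is executed from the working directory that holds the r/, i/ and d/ folders; the subreddit name is only an example:

import os

import reddit_imgs.runner
import reddit_imgs.sync
import reddit_imgs.fetch
import reddit_imgs.reorganize

reddit_imgs.runner.ensureFolderAvailability()      # creates a/, d/, i/, r/ if missing
os.makedirs(os.path.join('r', 'photoshopbattles'), exist_ok=True)

reddit_imgs.sync.main()        # menu option 2: grab the link lists from reddit
reddit_imgs.fetch.main()       # menu option 3: download the grabbed links into i/
reddit_imgs.reorganize.main()  # menu option 4: copy them into d/ under readable names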
diff --git a/reddit_imgs/system/downloader/__init__.py b/reddit_imgs/system/downloader/__init__.py
new file mode 100644
index 0000000..7e34022
--- /dev/null
+++ b/reddit_imgs/system/downloader/__init__.py
@@ -0,0 +1,21 @@
+#!/usr/bin/env python3
+# -*- encoding: utf-8 -*-
+
+import os
+
+modules_map = dict()
+
+moduleNames = os.listdir(os.path.join(os.path.dirname(os.path.abspath(__file__)),'modules'))
+moduleNames = list(map(lambda a: a[:-3], filter(lambda a: a.endswith('.py'), moduleNames)))
+
+for moduleName in moduleNames:
+    exec('from .modules import {0} as {0}; modules_map["{0}"] = {0}'.format(moduleName))
+
+def getDownloader(domain):
+    for module in modules_map.values():
+        try:
+            if module.works_on(domain):
+                return module.get_class()
+        except: pass
+    return None
+
diff --git a/reddit_imgs/system/downloader/downloadedData.py b/reddit_imgs/system/downloader/downloadedData.py
new file mode 100644
index 0000000..5dc687c
--- /dev/null
+++ b/reddit_imgs/system/downloader/downloadedData.py
@@ -0,0 +1,48 @@
+#!/usr/bin/env python3
+# -*- encoding: utf-8 -*-
+
+import filetype
+import shutil
+import json
+import os
+
+class DownloadedData(object):
+    def __init__(self):
+        self.control = list()
+        self.fb = dict()
+    def put(self, link, downloaded, ext=None):
+        if ext is None:
+            try:
+                ext = link.rsplit('/',1)[-1].rsplit('.',1)[-1]
+                if ext not in ['jpg','png','gif','webp']:
+                    raise Exception
+            except:
+                ext = filetype.guess_extension(downloaded)
+        if ext is None:
+            ext = 'unk'
+        fnm = '%04d.%s'%(len(self.control),ext)
+        self.control.append({
+            'dname': fnm,
+            'link':link,
+            'ext':ext,
+        })
+        self.fb[fnm] = downloaded
+    def into(self, directory):
+        directory = os.path.abspath(directory)
+        if not os.path.exists(directory):
+            os.makedirs(directory)
+        try:
+            with open(os.path.join(directory,'meta.json'),'w') as f:
+                f.write(json.dumps(self.control, sort_keys=True, indent=2))
+            for fnm, dtb in self.fb.items():
+                with open(os.path.join(directory,fnm),'wb') as f:
+                    f.write(dtb)
+        except KeyboardInterrupt as e:
+            shutil.rmtree(directory)
+            raise e
+    def merge(self, other):
+        for oitem in other.control:
+            self.put(oitem['link'], other.fb[oitem['dname']], oitem['ext'])
+    def bulk_merge(self, others):
+        for other in others:
+            self.merge(other)
diff --git a/reddit_imgs/system/downloader/modules/direct_link.py b/reddit_imgs/system/downloader/modules/direct_link.py
new file mode 100644
index 0000000..dc62d5f
--- /dev/null
+++ b/reddit_imgs/system/downloader/modules/direct_link.py
@@ -0,0 +1,34 @@
+#!/usr/bin/env python3
+# -*- encoding: utf-8 -*-
+
+from ..downloadedData import DownloadedData
+from ... import simpleDownloader
+
+def works_on(domain):
+    return domain=='direct_link'
+
+class DirectLink(object):
+    def recognizes(self, link):
+        if (
+                link.startswith('http://u18chan.com/')
+                or
+                link.startswith('https://u18chan.com/')
+                or
+                link.startswith('http://dl.dropboxusercontent.com')
+                or
+                link.startswith('https://dl.dropboxusercontent.com')
+        ):
+            return False
+        return True
+
+    def download(self, link):
+        dd = DownloadedData()
+        simpleDownloader.cleanCookies()
+        bts = simpleDownloader.getUrlBytes(link)
+        simpleDownloader.cleanCookies()
+        if bts is not None:
+            dd.put(link,bts)
+        return dd
+
+def get_class():
+    return DirectLink
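DownloadedData is the hand-off object between a downloader module and fetch.py: put() buffers one file in memory and into() flushes everything to the discussion folder together with a meta.json index. A small usage sketch; the links, bytes and target datakey are placeholders:

from reddit_imgs.system.downloader.downloadedData import DownloadedData

dd = DownloadedData()
dd.put('https://example.com/a.jpg', b'<jpeg bytes>')   # extension taken from the URL suffix
dd.put('https://example.com/page', b'<other bytes>')   # falls back to filetype, then to 'unk'
dd.into('i/t3_abcdef')  # writes 0000.jpg, 0001.unk and a meta.json describing both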
diff --git a/reddit_imgs/system/downloader/modules/imgur_com.py b/reddit_imgs/system/downloader/modules/imgur_com.py
new file mode 100644
index 0000000..3a299ab
--- /dev/null
+++ b/reddit_imgs/system/downloader/modules/imgur_com.py
@@ -0,0 +1,88 @@
+#!/usr/bin/env python3
+# -*- encoding: utf-8 -*-
+
+import re
+import json
+import filetype
+from ..downloadedData import DownloadedData
+from ... import simpleDownloader
+
+def works_on(domain):
+    return domain in ['i.imgur.com', 'imgur.com', 'm.imgur.com', 'www.imgur.com']
+
+class ImgurCom(object):
+    def recognizes(self, link):
+        return True
+
+    def download(self, link):
+        dd = DownloadedData()
+        simpleDownloader.cleanCookies()
+        bts = b''
+        if '/a/' not in link and '.gifv' not in link and '.webm' not in link:
+            bts = simpleDownloader.getUrlBytes(link)
+        if bts is not None:
+            ext = filetype.guess_extension(bts)
+            if ext is None:
+                if '.gifv' in link or '.webm' in link:
+                    bts=None
+                print(' '*50,end='')
+                print('\r',end='')
+                print(' `--> It wasn\'t a single image...',end='')
+                print('\r',end='')
+                match = re.match(
+                    "(https?)://(www\.)?(i\.|m\.|www\.)?imgur\.com/(?:(a|gallery|r)/)?(\w*)/?(\w*)(#[0-9]+)?(\.\w*)?",
+                    link
+                )
+                tp = match.group(4)
+                ky = None
+                if tp != 'r':
+                    ky = match.group(5)
+                else:
+                    ky = match.group(6)
+                    if not ky:
+                        ky = match.group(5)
+                link2 = 'https://imgur.com/a/'+str(ky)+'/all'
+                if tp is None or tp=='' or tp=='r':
+                    link2=link2.replace('/a/','/')[:-4]
+                print(' '*50,end='')
+                print('\r',end='')
+                if link2.endswith('/all') or bts is None:
+                    print(' `--> Fetching album image list...',end='')
+                    bts = simpleDownloader.getUrlBytes(link2)
+                else:
+                    print(' `--> Album image list already fetched...',end='')
+                print('\r',end='')
+                if bts is None:
+                    print(' '*50,end='')
+                    print('\r',end='')
+                    print(' `--> Gallery not found')
+                    return DownloadedData()
+                html = bts.decode('utf-8')
+                albnfo = json.loads(list(filter(lambda f: f.startswith('image'), map(str.strip, list(filter(lambda f: f.startswith("('gallery', {"), html.split('widgetFactory.mergeConfig')))[0].strip().splitlines())))[0][6:-1].strip()[1:].strip())
+                imgs = [albnfo]
+                if 'album_images' in albnfo:
+                    imgs = albnfo['album_images']['images']
+                for seq, img in enumerate(imgs):
+                    print(' '*50,end='')
+                    print('\r',end='')
+                    print(' `--> Album image #%03d of %03d'%(seq+1,len(imgs)),end='')
+                    print('\r',end='')
+                    if img['ext'] == '.gifv':
+                        img['ext'] = '.mp4'
+                    durl = 'http://i.imgur.com/'+img['hash']+img['ext']
+                    imb = simpleDownloader.getUrlBytes(durl)
+                    if imb is None:
+                        print()
+                        print('Album part failed')
+                        print()
+                        simpleDownloader.cleanCookies()
+                        return None
+                    dd.put(durl, imb, img['ext'][1:])
+                    print('\r',end='')
+            else:
+                dd.put(link, bts, ext)
+        simpleDownloader.cleanCookies()
+        return dd
+
+def get_class():
+    return ImgurCom
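With the two modules above in place, the registry in downloader/__init__.py resolves handlers by domain; anything without a matching works_on() comes back as None and is reported by fetch.py as a missing downloader. A quick sketch, assuming it is run from the repository root so the package imports resolve:

from reddit_imgs.system import downloader as downloaderModule

print(downloaderModule.getDownloader('i.imgur.com'))   # the ImgurCom class
print(downloaderModule.getDownloader('direct_link'))   # the DirectLink class
print(downloaderModule.getDownloader('gfycat.com'))    # None in this commit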
diff --git a/reddit_imgs/system/limits.py b/reddit_imgs/system/limits.py
new file mode 100644
index 0000000..b49207e
--- /dev/null
+++ b/reddit_imgs/system/limits.py
@@ -0,0 +1,5 @@
+#!/usr/bin/env python3
+# -*- encoding: utf-8 -*-
+
+minint = -2**31
+maxint = 2**31 - 1
\ No newline at end of file
diff --git a/reddit_imgs/system/objectify.py b/reddit_imgs/system/objectify.py
new file mode 100644
index 0000000..c607df7
--- /dev/null
+++ b/reddit_imgs/system/objectify.py
@@ -0,0 +1,7 @@
+#!/usr/bin/env python3
+# -*- encoding: utf-8 -*-
+
+class objectify(object):
+    @property
+    def json(self): return self.__dict__
+    def __init__(self, data): self.__dict__ = data
\ No newline at end of file
diff --git a/reddit_imgs/system/simpleDownloader.py b/reddit_imgs/system/simpleDownloader.py
new file mode 100644
index 0000000..bca81b2
--- /dev/null
+++ b/reddit_imgs/system/simpleDownloader.py
@@ -0,0 +1,115 @@
+#!/usr/bin/env python3
+# -*- encoding: utf-8 -*-
+
+import time
+import urllib.request
+import urllib.error
+
+cookie = dict()
+
+def delCookie(cookiekey):
+    cookiekey = str(cookiekey)
+    del cookie[cookiekey]
+
+def setCookie(cookiekey, cookieval):
+    cookieval = str(cookieval)
+    cookiekey = str(cookiekey)
+    if not cookiekey: return
+    if not cookieval: delCookie(cookiekey)
+    cookie[cookiekey] = cookieval
+
+def getCookies():
+    return dict(cookie.items())
+
+def patchCookies(newCookies):
+    for nk, nv in newCookies.items():
+        setCookie(nk,nv)
+
+def cleanCookies():
+    global cookie
+    cookie = dict()
+
+def setCookies(newCookies):
+    cleanCookies()
+    patchCookies(newCookies)
+
+def getUrlBytes(url):
+    global cookie
+    request = urllib.request.Request(url)
+    request.add_header('User-Agent', 'Mozilla/5.0 (X11; Linux x86_64) '+
+        'AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu '+
+        'Chromium/63.0.3239.84 Chrome/63.0.3239.84 '+
+        'Safari/537.36'
+    )
+    if len(cookie):
+        request.add_header("Cookie", '; '.join(map(lambda a: '='.join(a), cookie.items())))
+    response = None
+    try:
+        response = urllib.request.urlopen(request, timeout=15)
+    except urllib.error.HTTPError as e:
+        if e.code == 429:
+            print('[URL] Got 429 (Too Many Requests): sleeping for 5 seconds')
+            print(' @ %s'%url)
+            time.sleep(5)
+            return getUrlBytes(url)
+        if e.code == 503:
+            print('[URL] Got 503 (Service Temporarily Unavailable): retrying after 5 seconds')
+            print(' @ %s'%url)
+            time.sleep(5)
+            return getUrlBytes(url)
+        elif e.code == 404:
+            return None
+        elif e.code == 400:
+            return None
+        raise e
+    except urllib.error.URLError as e:
+        if str(e.reason).startswith('[Errno -2]'):
+            return None
+        if str(e.reason).startswith('[Errno -3]'):
+            print('Check your internet connection. It seems gone.')
+        if str(e.reason).startswith('[Errno 110]') or str(e.reason)=='timed out':
+            print('Connection request has timed out - assuming "Not Found"')
+            return None
+        if str(e.reason).startswith('[Errno 111]') or str(e.reason)=='timed out':
+            print('Connection refused - assuming "Not Found"')
+            return None
+        raise e
+    rcode = response.getcode()
+    rinfo = response.info()
+    headers = dict()
+    headers_l = list(map(lambda a: list(map(str.strip, a.split(':',1))), str(rinfo).strip().splitlines()))
+    for header in headers_l:
+        k = header[0].lower()
+        v = header[1]
+        if k not in headers:
+            headers[k]=list()
+        headers[k].append(v)
+        del k
+        del v
+        del header
+    del headers_l
+    if 'set-cookie' in headers:
+        for cke in headers['set-cookie']:
+            ckek = cke.split('=',1)[0].strip()
+            ckev = cke.split('=',1)[1].split(';',1)[0].strip()
+            setCookie(ckek,ckev)
+            del ckek
+            del ckev
+            del cke
+    if rcode == 429:
+        tosleep = 5
+        try: tosleep = int(headers['retry-after'][0])
+        except: pass
+        if tosleep < 1: tosleep = 1
+        print('[URL] Got 429 (Too Many Requests): sleeping for %d seconds'%tosleep)
+        print(' @ %s'%url)
+        time.sleep(tosleep)
+        return getUrlBytes(url)
+    data = None
+    if rcode == 200:
+        data = response.read()
+    response.close()
+    return data
+
+def getUrl(url):
+    return getUrlBytes(url).decode('utf-8')
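simpleDownloader is a small urllib wrapper with a module-level cookie jar, automatic retries on 429/503, and None for the errors it treats as "not found". Minimal use, mirroring what sync.py and the downloader modules do; the image URL is a placeholder:

from reddit_imgs.system import simpleDownloader

simpleDownloader.setCookies({'over18': 1})   # same cookie sync.py sets before crawling
listing = simpleDownloader.getUrl('https://www.reddit.com/r/pics/new/?count=0')
raw = simpleDownloader.getUrlBytes('https://i.example.com/some-image.jpg')
if raw is None:
    print('404/400, DNS failure or timeout: treated as not found')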
diff --git a/reddit_imgs/system/subredditTools.py b/reddit_imgs/system/subredditTools.py
new file mode 100644
index 0000000..96de9b2
--- /dev/null
+++ b/reddit_imgs/system/subredditTools.py
@@ -0,0 +1,54 @@
+#!/usr/bin/env python3
+# -*- encoding: utf-8 -*-
+
+import dateutil.parser
+from .limits import minint, maxint
+
+def getInfoFromRedditItem(bs):
+    nsfw = 'over18' in bs['class']
+    sharer = bs.find(class_='author').text.strip()
+    title = bs.find('a',class_='title').text.strip()
+    link = str(bs.find('a',class_='title')['href'])
+    domain = bs.find('span',class_='domain').find('a').text.strip()
+    datakey = bs['data-fullname']
+    timestamp = int(dateutil.parser.parse(bs.find('time')['datetime']).strftime('%s'))
+    flair = None
+    try: flair = bs.find('span',class_='linkflairlabel').text.strip()
+    except: pass
+    return {
+        'nsfw': nsfw,
+        'link': link,
+        'title': title,
+        'flair': flair,
+        'sharer': sharer,
+        'domain': domain,
+        'datakey': datakey,
+        'timestamp': timestamp,
+    }
+
+def getEmptySubredditData(srname):
+    return {
+        'subreddit': srname,
+        'date_first': minint,
+        'date_last': maxint,
+        'links': list()
+    }
+
+def getSubredditPageInfo(bs):
+    pagetable = bs.find(id='siteTable')
+    discussions = pagetable.find_all(
+        lambda a: a.has_attr('class') and
+        'thing' in a['class']
+    )
+    links = list(filter(lambda a: 'self' not in a['class'],discussions))
+    first = minint
+    last = maxint
+    try: first = int(dateutil.parser.parse(discussions[0].find('time')['datetime']).strftime('%s'))
+    except: pass
+    try: last = int(dateutil.parser.parse(discussions[-1].find('time')['datetime']).strftime('%s'))
+    except: pass
+    nextpage = None
+    try: nextpage = bs.find('div', class_='nav-buttons').find(class_='nextprev').find(class_='next-button').find('a')['href']
+    except: pass
+    structured_links = list(map(getInfoFromRedditItem, links))
+    return first, last, nextpage, structured_links
diff --git a/redditgetter.py b/redditgetter.py
new file mode 100755
index 0000000..c7c4914
--- /dev/null
+++ b/redditgetter.py
@@ -0,0 +1,7 @@
+#!/usr/bin/env python3
+# -*- encoding: utf-8 -*-
+
+import reddit_imgs.runner
+
+if __name__ == '__main__':
+    reddit_imgs.runner.main()
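subredditTools turns one old-reddit listing page into structured link records, and sync.py folds those into subreddit.json. A sketch of that single-page step in isolation, using the same URL pattern sync.py builds (the subreddit name is just a sample):

from bs4 import BeautifulSoup
from reddit_imgs.system import simpleDownloader
from reddit_imgs.system.subredditTools import getEmptySubredditData, getSubredditPageInfo

simpleDownloader.setCookies({'over18': 1})
page = simpleDownloader.getUrlBytes('https://www.reddit.com/r/photoshopbattles/new/?count=0')
bs = BeautifulSoup(page, 'html5lib')

srdt = getEmptySubredditData('photoshopbattles')
first, last, nextpage, links = getSubredditPageInfo(bs)
srdt['date_first'] = max(first, srdt['date_first'])
srdt['date_last'] = min(last, srdt['date_last'])
srdt['links'].extend(links)
print('%d links, next page: %s' % (len(links), nextpage))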
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..8ae6ccb
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1 @@
+filetype==1.0.0