#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import json
import os
from concurrent.futures import ProcessPoolExecutor as PoolExecutor
from pathlib import Path
from urllib.error import ContentTooShortError, HTTPError, URLError

from bs4 import BeautifulSoup as _BS

from .system import simpleDownloader
from .system.subredditTools import (GATEWAY_LINK_ARGS, build_gateway_link,
                                    getEmptySubredditData,
                                    getSubredditPageJsonInfo)


def BeautifulSoup(data):
    return _BS(data, 'html5lib')


simpleDownloader.setCookies({'over18': 1})

wdir = os.path.abspath('.')


def process_subreddit(subreddit):
    """Walk a subreddit's gateway listing pages and cache its link data."""
    simpleDownloader.setCookies({'over18': 1})
    srp = os.path.abspath(os.path.join(wdir, 'r', subreddit))
    nextpage = build_gateway_link(subreddit)
    srdt = getEmptySubredditData(subreddit)
    try:
        # Resume from cached data if it exists and is readable.
        with open(os.path.join(srp, 'subreddit.json')) as f:
            srdt = json.load(f)
    except (FileNotFoundError, json.JSONDecodeError):
        pass
    pageno = 0
    ygst = srdt['date_first']  # timestamp of the newest post already stored
    jsonPageSr = None
    while nextpage:
        pageno += 1
        print('/r/{0:<20} loading page #{1:05d}'.format(subreddit, pageno))
        print(' >> %s' % nextpage.replace(GATEWAY_LINK_ARGS, '[...]'))
        redditBytes = None
        try:
            redditBytes = simpleDownloader.getUrlBytes(nextpage)
        except (HTTPError, URLError, ContentTooShortError) as e:
            print(" >> HTTP error (%s): skipping..." % e)
            break
        if redditBytes is None:
            print(" >> Empty response: skipping...")
            break
        # bs = BeautifulSoup(redditBytes)
        jsonPage = json.loads(redditBytes)
        first, last, nextpage, links = getSubredditPageJsonInfo(jsonPage, subreddit, pageno)
        if ygst >= first:
            # The newest stored post is at least as recent as the newest
            # downloaded one, so everything from here on is already cached.
            nextpage = None
        srdt['date_first'] = max(first, srdt['date_first'])
        srdt['date_last'] = min(last, srdt['date_last'])
        # Append newly seen links oldest-first, skipping duplicates.
        for link in links[::-1]:
            if link not in srdt['links']:
                srdt['links'].append(link)
        # The subreddit id is the key shared by the three metadata sections.
        srid = next(iter(set.intersection(
            set(jsonPage['subreddits'].keys()),
            set(jsonPage['postFlair'].keys()),
            set(jsonPage['subredditAboutInfo'].keys())
        )))
        jsonPageSr = dict(
            id=srid,
            name=subreddit,
            definition=jsonPage['subreddits'][srid],
            about=jsonPage['subredditAboutInfo'][srid],
            flair=jsonPage['postFlair'][srid],
        )
    with open(os.path.join(srp, 'subreddit.json'), 'w') as f:
        f.write(json.dumps(srdt, sort_keys=True, indent=2))
    if jsonPageSr is not None:
        with open(os.path.join(srp, 'meta.json'), 'w') as f:
            f.write(json.dumps(jsonPageSr, sort_keys=True, indent=2))


def main():
    build_summary()
    subreddits = sorted(filter(
        lambda sr: os.path.isdir(os.path.join(wdir, 'r', sr)),
        os.listdir(os.path.join(wdir, 'r'))
    ))
    with PoolExecutor(max_workers=16) as pe:
        q = list()
        for subreddit in subreddits:
            job = pe.submit(process_subreddit, subreddit)
            q.append(job)
        for job in q:
            job.result()
    build_summary()


def build_summary():
    """Merge every r/*/subreddit.json into a single r.json summary."""
    rjpath = Path(wdir, 'r.json')
    oldsrs = dict()
    if rjpath.exists():
        oldsrs = json.loads(rjpath.read_text())
    srs = dict()
    for srp in Path(wdir, 'r').glob('*/subreddit.json'):
        sr = srp.parent.name.lower()
        try:
            srs[sr] = json.loads(srp.read_text())
        except json.decoder.JSONDecodeError:
            if sr not in oldsrs:
                raise
            print('Restoring old data for corrupted subreddit %r' % sr)
            srs[sr] = oldsrs[sr]
            srp.write_text(json.dumps(oldsrs[sr], indent=1))
    rjpath.write_text(json.dumps(srs, indent=1))


if __name__ == '__main__':
    main()