#!/usr/bin/env python3
# -*- encoding: utf-8 -*-
"""Crawl subreddit listing pages (via the gateway links built by
build_gateway_link) and cache the results under ./r/<subreddit>/
(subreddit.json, meta.json), plus the summary files r.json, ri.json
and rf.json in the working directory."""

import json
import os
from concurrent.futures import ProcessPoolExecutor as PoolExecutor
from pathlib import Path
from urllib.error import ContentTooShortError, HTTPError, URLError

import colored as clrlib

from .system import simpleDownloader
from .system.cmdline_parser import parse_cmdline
from .system.subredditTools import (GATEWAY_LINK_ARGS, build_gateway_link,
                                    getEmptySubredditData,
                                    getSubredditPageJsonInfo)

MAX_WORKERS = 16


def cmdline(encoded_args: str = None):
    if encoded_args is None:
        return run_with_config()
    else:
        return parse_cmdline(run_with_config, encoded_args)


def run_with_config(max_workers: int = None):
    global MAX_WORKERS
    if max_workers is not None:
        MAX_WORKERS = max_workers
    return main()


simpleDownloader.setCookies({'over18': 1})

wdir = os.path.abspath('.')


def process_subreddit(subreddit, srdt, jsonPageSr):
    """Download listing pages for one subreddit until the already-stored data is reached."""
    simpleDownloader.setCookies({'over18': 1})
    srp = os.path.abspath(os.path.join(wdir, 'r', subreddit))
    nextpage = build_gateway_link(subreddit)
    pageno = 0
    ygst = srdt['date_first']  # timestamp of the newest post already stored
    while nextpage:
        pageno += 1
        print(clrlib.stylize(('/r/{0:<20} loading page #%05d' % pageno).format(subreddit), [
            clrlib.fg('light_yellow'),
        ]))
        print(clrlib.stylize(' >> %s' % (nextpage.replace(GATEWAY_LINK_ARGS, '[...]'),), [
            clrlib.fg('light_yellow'),
            clrlib.attr('dim'),
        ]))
        redditBytes = None
        try:
            redditBytes = simpleDownloader.getUrlBytes(nextpage)
        except (HTTPError, URLError, ContentTooShortError) as error:
            print(clrlib.stylize(('/r/{0:<20} loading page #%05d' % pageno).format(subreddit), [
                clrlib.fg('light_red'),
                clrlib.attr('bold'),
            ]))
            print(clrlib.stylize(" >> HTTP error (%s): Skipping..." % error, [
                clrlib.fg('light_red'),
                clrlib.attr('bold'),
                clrlib.attr('dim'),
            ]))
            break
        if redditBytes is None:
            print(clrlib.stylize(('/r/{0:<20} loading page #%05d' % pageno).format(subreddit), [
                clrlib.fg('light_red'),
                clrlib.attr('bold'),
            ]))
            print(clrlib.stylize(" >> HTTP Error: Skipping...", [
                clrlib.fg('light_red'),
                clrlib.attr('bold'),
                clrlib.attr('dim'),
            ]))
            break
        jsonPage = json.loads(redditBytes)
        getSubredditPageJsonInfoResult = None
        try:
            getSubredditPageJsonInfoResult = (
                getSubredditPageJsonInfo(jsonPage, subreddit, pageno))
        except IndexError:
            print(clrlib.stylize(('/r/{0:<20} loading page #%05d' % pageno).format(subreddit), [
                clrlib.fg('light_gray'),
                clrlib.attr('dim'),
            ]))
            print(clrlib.stylize(" >> Empty subreddit: Skipping...", [
                clrlib.fg('light_gray'),
                clrlib.attr('dim'),
            ]))
            break
        first, last, nextpage, links = getSubredditPageJsonInfoResult
        if ygst >= first:  # once the newest post on this page is no newer than the newest stored post, we are up-to-date
            nextpage = None
        srdt['date_first'] = max(first, srdt['date_first'])
        srdt['date_last'] = min(last, srdt['date_last'])
        for link in links[::-1]:
            if link not in srdt['links']:
                srdt['links'].append(link)
        srid = next(iter(set.intersection(
            set(jsonPage['subreddits'].keys()),
            set(jsonPage['postFlair'].keys()),
            set(jsonPage['subredditAboutInfo'].keys())
        )))
        jsonPageSr = dict(
            id=srid,
            name=subreddit,
            definition=jsonPage['subreddits'][srid],
            about=jsonPage['subredditAboutInfo'][srid],
            flair=jsonPage['postFlair'][srid],
        )
    # Discard entries with overlong datakeys and keep the list sorted newest-first.
    srdt['links'] = list(filter(lambda a: len(a['datakey']) < 20, srdt['links']))
    srdt['links'] = sorted(srdt['links'], key=lambda a: -a['timestamp'])
    return (subreddit, srp, srdt, jsonPageSr)


def main():
    print('Building summary...')
    srs, srsi, srf = build_summary()
    print('Download...')
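    # Every directory under ./r/ is treated as a subreddit to crawl.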
    subreddits = sorted(filter(
        lambda sr: os.path.isdir(os.path.join(wdir, 'r', sr)),
        os.listdir(os.path.join(wdir, 'r'))))
    print('Opening process pool...')
    with PoolExecutor(MAX_WORKERS) as pe2:
        def process_subreddit_done_callback_inner(job):
            (subreddit, srp, srdt, jsonPageSr) = job.result()
            del job
            process_subreddit_done_callback(subreddit, srp, srdt, jsonPageSr,
                                            pe2, srs, srsi)
            return
        with PoolExecutor(MAX_WORKERS) as pe:
            print('Opened process pool')
            for subreddit in subreddits:
                if subreddit not in srs:
                    srs[subreddit] = getEmptySubredditData(subreddit)
                if subreddit not in srsi:
                    srsi[subreddit] = None
                job = pe.submit(
                    process_subreddit,
                    subreddit,
                    srs[subreddit],
                    srsi[subreddit],
                )
                job.add_done_callback(process_subreddit_done_callback_inner)
            print('Closing process pool...')
        print('Closed process pool')
    print('Writing summary...')
    write_summary(srs, srsi, srf)
    print('Done')


def process_subreddit_done_callback(subreddit, srp, srdt, jsonPageSr, pe,
                                    srs, srsi):
    # Runs in the parent process: merge the worker's result into the summary
    # dicts and hand the disk write over to the second pool.
    srs[subreddit] = srdt
    srsi[subreddit] = jsonPageSr
    print(clrlib.stylize(f' @> Writing /r/{subreddit}', [
        clrlib.fg('light_cyan'),
    ]))
    pe.submit(
        post_processing_saver,
        subreddit, srp, srdt, jsonPageSr
    )


def post_processing_saver(subreddit, srp, srdt, jsonPageSr):
    write_json(Path(os.path.join(srp, 'subreddit.json')), srdt, sort_keys=True)
    if jsonPageSr is not None:
        write_json(Path(os.path.join(srp, 'meta.json')), jsonPageSr, sort_keys=True)
    print(clrlib.stylize(f' @> Written /r/{subreddit}', [
        clrlib.fg('light_green'),
    ]))


def build_summary():
    rjpath = Path(wdir, 'r.json')
    rijpath = Path(wdir, 'ri.json')
    rfpath = Path(wdir, 'rf.json')
    oldsrs = dict()
    oldsrsi = dict()
    if rjpath.exists():
        oldsrs = json.loads(rjpath.read_bytes())
    if rijpath.exists():
        oldsrsi = json.loads(rijpath.read_bytes())
    srs = dict()
    srsi = dict()
    nodownloadfilter = dict()
    nosfwfilter = dict()
    nonsfwfilter = dict()
    wallpaperfilter = dict()
    with PoolExecutor(MAX_WORKERS) as pe:
        def on_data_read(job):
            (sr, srp, srip, srd, srid, sripe) = job.result()
            if srd is not None:
                srs[sr] = srd
            else:
                # Corrupted subreddit.json: restore it from the previous
                # summary if possible, otherwise drop the file.
                if sr not in oldsrs:
                    srp.unlink()
                else:
                    print('Restoring old data for corrupted subreddit %r' % sr)
                    srs[sr] = oldsrs[sr]
                    srp.write_text(json.dumps(oldsrs[sr], indent=1))
            if sripe:
                if srid is not None:
                    srsi[sr] = srid
                else:
                    if sr not in oldsrsi:
                        srip.unlink()
                    else:
                        print('Restoring old data for corrupted subreddit %r' % sr)
                        srsi[sr] = oldsrsi[sr]
                        srip.write_text(json.dumps(oldsrsi[sr], indent=1))
        for srp in Path(wdir, 'r').glob('*/subreddit.json'):
            sr = srp.parent.name.lower()
            nodownloadfilter[sr] = srp.parent.joinpath('nodownload.flag').exists()
            nosfwfilter[sr] = srp.parent.joinpath('nosfw.flag').exists()
            nonsfwfilter[sr] = srp.parent.joinpath('nonsfw.flag').exists()
            wallpaperfilter[sr] = srp.parent.joinpath('wallpaper.flag').exists()
            srip = srp.parent.joinpath('meta.json')
            job = pe.submit(read_disk_summary, sr, srp, srip)
            job.add_done_callback(on_data_read)
    srf = dict(
        no_download=nodownloadfilter,
        no_sfw=nosfwfilter,
        no_nsfw=nonsfwfilter,
        wallpaper=wallpaperfilter,
    )
    return srs, srsi, srf


def read_disk_summary(sr, srp, srip):
    srd = None
    srid = None
    sripe = srip.exists()
    try:
        srd = json.loads(srp.read_bytes())
    except json.decoder.JSONDecodeError:
        pass
    if sripe:
        try:
            srid = json.loads(srip.read_bytes())
        except json.decoder.JSONDecodeError:
            pass
    return (sr, srp, srip, srd, srid, sripe)


def write_summary(srs, srsi, srf):
    rjpath = Path(wdir, 'r.json')
    rijpath = Path(wdir, 'ri.json')
    rfpath = Path(wdir, 'rf.json')
    with PoolExecutor(MAX_WORKERS) as pe:
        pe.submit(write_json, rjpath, srs)
        pe.submit(write_json, rijpath, srsi)
        pe.submit(write_json, rfpath, srf)
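

# write_json is submitted to ProcessPoolExecutor workers above, so it must stay
# a module-level (picklable) function rather than a lambda or nested closure.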
def write_json(path, data, **kwargs):
    path.write_text(json.dumps(data, indent=1, **kwargs))


if __name__ == '__main__':
    main()
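
# Usage note (the package name is not shown here, so the command below is an
# assumption): because of the relative imports at the top, this file must be
# run with its package context, e.g. `python -m <package>.<this_module>` from
# the directory that contains the r/ tree, or via cmdline()/run_with_config()
# from the package's own entry point.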