reddit-image-wall-getter/reddit_imgs/sync.py

#!/usr/bin/env python3
# -*- encoding: utf-8 -*-
import json
import os
from concurrent.futures import ProcessPoolExecutor as PoolExecutor
from pathlib import Path
from typing import Optional
from urllib.error import ContentTooShortError, HTTPError, URLError

import colored as clrlib

from .system import simpleDownloader
from .system.cmdline_parser import parse_cmdline
from .system.subredditTools import (GATEWAY_LINK_ARGS, build_gateway_link,
                                    getEmptySubredditData,
                                    getSubredditPageJsonInfo)

MAX_WORKERS = 16


def cmdline(encoded_args: Optional[str] = None):
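    """Command-line hook: run with defaults when no encoded arguments are
    given, otherwise decode them with parse_cmdline into run_with_config."""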
if encoded_args is None:
return run_with_config()
else:
return parse_cmdline(run_with_config, encoded_args)


def run_with_config(max_workers: Optional[int] = None):
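    """Apply an optional max_workers override to the module-wide pool size,
    then run the synchronisation."""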
global MAX_WORKERS
if max_workers is not None:
MAX_WORKERS = max_workers
return main()
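

# Module-level setup: send the over18 cookie so NSFW subreddits are not
# hidden behind Reddit's interstitial, and anchor all paths at the current
# working directory.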
simpleDownloader.setCookies({'over18': 1})
wdir = os.path.abspath('.')


def process_subreddit(subreddit, srdt, jsonPageSr):
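    """Page through a subreddit via Reddit's gateway listing, collecting links
    not yet stored in srdt and stopping once a page no newer than the stored
    data is reached; the subreddit metadata carried in each response is
    captured as jsonPageSr."""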
simpleDownloader.setCookies({'over18': 1})
srp = os.path.abspath(os.path.join(wdir, 'r', subreddit))
nextpage = build_gateway_link(subreddit)
pageno = 0
    ygst = srdt['date_first']  # newest timestamp already stored for this subreddit
while nextpage:
pageno += 1
print(clrlib.stylize(('/r/{0:<20} loading page #%05d' % pageno).format(subreddit), [
clrlib.fg('light_yellow'),
]))
print(clrlib.stylize(' >> %s' % (nextpage.replace(GATEWAY_LINK_ARGS, '[...]'),), [
clrlib.fg('light_yellow'), clrlib.attr('dim'),
]))
        redditBytes = None
        try:
            redditBytes = simpleDownloader.getUrlBytes(nextpage)
        except (HTTPError, URLError, ContentTooShortError) as error:
            print(clrlib.stylize(('/r/{0:<20} loading page #%05d' % pageno).format(subreddit), [
                clrlib.fg('light_red'), clrlib.attr('bold'),
            ]))
            print(clrlib.stylize(" >> %s: Skipping..." % error, [
                clrlib.fg('light_red'), clrlib.attr('bold'), clrlib.attr('dim'),
            ]))
            break
if redditBytes is None:
print(clrlib.stylize(('/r/{0:<20} loading page #%05d' % pageno).format(subreddit), [
clrlib.fg('light_red'), clrlib.attr('bold'),
]))
print(clrlib.stylize(" >> HTTP Error: Skipping...", [
clrlib.fg('light_red'), clrlib.attr('bold'), clrlib.attr('dim'),
]))
break
jsonPage = json.loads(redditBytes)
getSubredditPageJsonInfoResult = None
try:
getSubredditPageJsonInfoResult = (
getSubredditPageJsonInfo(jsonPage, subreddit, pageno))
except IndexError:
print(clrlib.stylize(('/r/{0:<20} loading page #%05d' % pageno).format(subreddit), [
clrlib.fg('light_gray'), clrlib.attr('dim'),
]))
print(clrlib.stylize(" >> Empty subreddit: Skipping...", [
clrlib.fg('light_gray'), clrlib.attr('dim'),
]))
break
first, last, nextpage, links = getSubredditPageJsonInfoResult
        if ygst >= first:
            # The newest post on this page is no newer than the newest post
            # already stored, so everything from here on has been seen before.
            nextpage = None
srdt['date_first'] = max(first, srdt['date_first'])
srdt['date_last'] = min(last, srdt['date_last'])
for link in links[::-1]:
if link not in srdt['links']:
srdt['links'].append(link)
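        # The gateway response keys subreddit metadata by an internal id;
        # recover it as the id common to the three metadata maps.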
srid = next(iter(set.intersection(
set(jsonPage['subreddits'].keys()),
set(jsonPage['postFlair'].keys()),
set(jsonPage['subredditAboutInfo'].keys())
)))
jsonPageSr = dict(
id=srid,
name=subreddit,
definition=jsonPage['subreddits'][srid],
about=jsonPage['subredditAboutInfo'][srid],
flair=jsonPage['postFlair'][srid],
)
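    # Drop links with malformed (over-long) datakeys and keep the list sorted
    # newest-first.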
srdt['links'] = list(filter(lambda a: len(a['datakey']) < 20, srdt['links']))
srdt['links'] = sorted(srdt['links'], key=lambda a: -a['timestamp'])
return (subreddit, srp, srdt, jsonPageSr)


def main():
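    """Rebuild the on-disk summary, refresh every subreddit under ./r in a
    process pool, write per-subreddit files through a second pool, and save
    the merged summary."""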
print('Building summary...')
srs, srsi, srf = build_summary()
print('Download...')
subreddits = sorted(filter(lambda sr: os.path.isdir(
os.path.join(wdir, 'r', sr)), os.listdir(os.path.join(wdir, 'r'))))
print('Opening process pool...')
with PoolExecutor(MAX_WORKERS) as pe2:
def process_subreddit_done_callback_inner(job):
(subreddit, srp, srdt, jsonPageSr) = job.result()
del job
process_subreddit_done_callback(subreddit, srp, srdt, jsonPageSr, pe2, srs, srsi)
return
with PoolExecutor(MAX_WORKERS) as pe:
print('Opened process pool')
for subreddit in subreddits:
if subreddit not in srs:
srs[subreddit] = getEmptySubredditData(subreddit)
if subreddit not in srsi:
srsi[subreddit] = None
job = pe.submit(
process_subreddit,
subreddit,
srs[subreddit],
srsi[subreddit],
)
job.add_done_callback(process_subreddit_done_callback_inner)
print('Closing process pool...')
print('Closed process pool')
print('Writing summary...')
write_summary(srs, srsi, srf)
print('Done')


def process_subreddit_done_callback(subreddit, srp, srdt, jsonPageSr, pe, srs, srsi):
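    """Merge a refreshed subreddit into the in-memory summaries and queue its
    per-subreddit files for writing on the saver pool."""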
srs[subreddit] = srdt
srsi[subreddit] = jsonPageSr
print(clrlib.stylize(f' @> Writing /r/{subreddit}', [
clrlib.fg('light_cyan'),
]))
    pe.submit(
        post_processing_saver,
        subreddit, srp, srdt, jsonPageSr,
    )


def post_processing_saver(subreddit, srp, srdt, jsonPageSr):
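    """Write subreddit.json (and meta.json when metadata is available) for a
    single subreddit."""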
write_json(Path(os.path.join(srp, 'subreddit.json')), srdt, sort_keys=True)
if jsonPageSr is not None:
write_json(Path(os.path.join(srp, 'meta.json')), jsonPageSr, sort_keys=True)
print(clrlib.stylize(f' @> Written /r/{subreddit}', [
clrlib.fg('light_green'),
]))


def build_summary():
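    """Load every cached subreddit from disk in parallel, restoring entries
    from the previous r.json / ri.json when a per-subreddit file is corrupted,
    and collect the per-subreddit flag files into filter maps."""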
rjpath = Path(wdir, 'r.json')
rijpath = Path(wdir, 'ri.json')
rfpath = Path(wdir, 'rf.json')
oldsrs = dict()
oldsrsi = dict()
if rjpath.exists():
oldsrs = json.loads(rjpath.read_bytes())
if rijpath.exists():
oldsrsi = json.loads(rijpath.read_bytes())
srs = dict()
srsi = dict()
nodownloadfilter = dict()
nosfwfilter = dict()
nonsfwfilter = dict()
wallpaperfilter = dict()
with PoolExecutor(MAX_WORKERS) as pe:
def on_data_read(job):
(sr, srp, srip, srd, srid, sripe) = job.result()
if srd is not None:
srs[sr] = srd
else:
if sr not in oldsrs:
srp.unlink()
else:
                    print('Restoring old data for corrupted subreddit %r' % sr)
srs[sr] = oldsrs[sr]
srp.write_text(json.dumps(oldsrs[sr], indent=1))
if sripe:
if srid is not None:
srsi[sr] = srid
else:
if sr not in oldsrsi:
srip.unlink()
else:
                        print('Restoring old data for corrupted subreddit %r' % sr)
srsi[sr] = oldsrsi[sr]
srip.write_text(json.dumps(oldsrsi[sr], indent=1))
for srp in Path(wdir, 'r').glob('*/subreddit.json'):
sr = srp.parent.name.lower()
nodownloadfilter[sr] = srp.parent.joinpath('nodownload.flag').exists()
nosfwfilter[sr] = srp.parent.joinpath('nosfw.flag').exists()
nonsfwfilter[sr] = srp.parent.joinpath('nonsfw.flag').exists()
wallpaperfilter[sr] = srp.parent.joinpath('wallpaper.flag').exists()
srip = srp.parent.joinpath('meta.json')
job = pe.submit(read_disk_summary, sr, srp, srip)
job.add_done_callback(on_data_read)
srf = dict(
no_download=nodownloadfilter,
no_sfw=nosfwfilter,
no_nsfw=nonsfwfilter,
wallpaper=wallpaperfilter,
)
return srs, srsi, srf


def read_disk_summary(sr, srp, srip):
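    """Parse a subreddit's subreddit.json and meta.json, returning None for
    any file that fails to decode."""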
srd = None
srid = None
sripe = srip.exists()
try:
srd = json.loads(srp.read_bytes())
except json.decoder.JSONDecodeError:
pass
if sripe:
try:
srid = json.loads(srip.read_bytes())
except json.decoder.JSONDecodeError:
pass
return (sr, srp, srip, srd, srid, sripe)


def write_summary(srs, srsi, srf):
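    """Write the aggregated r.json, ri.json and rf.json files in parallel."""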
rjpath = Path(wdir, 'r.json')
rijpath = Path(wdir, 'ri.json')
rfpath = Path(wdir, 'rf.json')
with PoolExecutor(MAX_WORKERS) as pe:
pe.submit(write_json, rjpath, srs)
pe.submit(write_json, rijpath, srsi)
pe.submit(write_json, rfpath, srf)


def write_json(path, data, **kwargs):
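    """Serialize data to path as indented JSON."""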
path.write_text(json.dumps(data, indent=1, **kwargs))
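

# Note: the relative imports above mean this file is meant to be run as a
# module from the package root (for example `python -m reddit_imgs.sync`,
# assuming the package keeps its directory name), not as a standalone script.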
if __name__ == '__main__':
main()