reddit-image-wall-getter/reddit_imgs/sync.py

#!/usr/bin/env python3
# -*- encoding: utf-8 -*-

import json
import os
from concurrent.futures import ProcessPoolExecutor as PoolExecutor
from pathlib import Path
from urllib.error import ContentTooShortError, HTTPError, URLError

from bs4 import BeautifulSoup as _BS

from .system import simpleDownloader
from .system.cmdline_parser import parse_cmdline
from .system.subredditTools import (GATEWAY_LINK_ARGS, build_gateway_link,
                                    getEmptySubredditData,
                                    getSubredditPageJsonInfo)

MAX_WORKERS = 16
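

# Pin bs4 to the lenient html5lib parser. The gateway responses handled in
# this module are plain JSON, so the helper is currently unused here.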
def BeautifulSoup(data): return _BS(data, 'html5lib')


def cmdline(encoded_args: str = None):
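    """Command-line entry point.

    Without arguments this runs a sync with the current defaults;
    otherwise parse_cmdline decodes the encoded argument string and
    forwards the resulting options to run_with_config.
    """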
    if encoded_args is None:
        return run_with_config()
    else:
        return parse_cmdline(run_with_config, encoded_args)


def run_with_config(max_workers: int = None,
                    ):
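    """Apply configuration overrides (currently just max_workers), then sync."""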
    global MAX_WORKERS
    if max_workers is not None:
        MAX_WORKERS = max_workers
    return main()
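

# Reddit hides adult-rated content unless the over18 cookie is set.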
simpleDownloader.setCookies({'over18': 1})

wdir = os.path.abspath('.')


def process_subreddit(subreddit):
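    """Download a subreddit's gateway listing page by page.

    New posts are merged into r/<subreddit>/subreddit.json; the subreddit
    metadata from the last successfully parsed page goes to meta.json.
    """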
    simpleDownloader.setCookies({'over18': 1})
    srp = os.path.abspath(os.path.join(wdir, 'r', subreddit))
    nextpage = build_gateway_link(subreddit)
    srdt = getEmptySubredditData(subreddit)
    try:
        with open(os.path.join(srp, 'subreddit.json')) as f:
            srdt = json.loads(f.read())
    except (OSError, json.JSONDecodeError):
        # Missing or corrupted cache: start from the empty dataset above.
        pass
    pageno = 0
    ygst = srdt['date_first']
    jsonPageSr = None
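    # srdt['date_first'] is the newest stored post's timestamp; once a page's
    # newest post is no newer than it, everything further back is already
    # known and pagination can stop.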
    while nextpage:
        pageno += 1
        print(('/r/{0:<20} loading page #%05d' % pageno).format(subreddit))
        print(' >> %s' % (nextpage.replace(GATEWAY_LINK_ARGS, '[...]'),))
        redditBytes = None
        try:
            redditBytes = simpleDownloader.getUrlBytes(nextpage)
        except (HTTPError, URLError, ContentTooShortError):
            print(('/r/{0:<20} loading page #%05d' % pageno).format(subreddit))
            print(" >> HTTP error: Skipping...")
            break
        if redditBytes is None:
            print(('/r/{0:<20} loading page #%05d' % pageno).format(subreddit))
            print(" >> Empty response: Skipping...")
            break
        jsonPage = json.loads(redditBytes)
        getSubredditPageJsonInfoResult = None
        try:
            getSubredditPageJsonInfoResult = (
                getSubredditPageJsonInfo(jsonPage, subreddit, pageno))
        except IndexError:
            print(('/r/{0:<20} loading page #%05d' % pageno).format(subreddit))
            print(" >> Empty subreddit: Skipping...")
            break
        first, last, nextpage, links = getSubredditPageJsonInfoResult
        if ygst >= first:
            # The latest stored post is at least as new as the latest
            # downloaded post, so we are up-to-date: stop paginating.
            nextpage = None
        srdt['date_first'] = max(first, srdt['date_first'])
        srdt['date_last'] = min(last, srdt['date_last'])
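        # Pages list posts newest-first; reverse each page before appending
        # so posts within a page are stored oldest-first.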
        for link in links[::-1]:
            if link not in srdt['links']:
                srdt['links'].append(link)
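        # The gateway keys its metadata maps by subreddit id rather than
        # name; the id is whichever key the three maps share.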
        srid = next(iter(set.intersection(
            set(jsonPage['subreddits'].keys()),
            set(jsonPage['postFlair'].keys()),
            set(jsonPage['subredditAboutInfo'].keys())
        )))
        jsonPageSr = dict(
            id=srid,
            name=subreddit,
            definition=jsonPage['subreddits'][srid],
            about=jsonPage['subredditAboutInfo'][srid],
            flair=jsonPage['postFlair'][srid],
        )
    with open(os.path.join(srp, 'subreddit.json'), 'w') as f:
        f.write(json.dumps(srdt, sort_keys=True, indent=2))
    if jsonPageSr is not None:
        with open(os.path.join(srp, 'meta.json'), 'w') as f:
            f.write(json.dumps(jsonPageSr, sort_keys=True, indent=2))


def main():
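    """Sync every subreddit directory under ./r across a process pool.

    build_summary runs before and after the sync so the aggregates also
    pick up repairs from any previously interrupted run.
    """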
    build_summary()
    subreddits = sorted(filter(lambda sr: os.path.isdir(
        os.path.join(wdir, 'r', sr)), os.listdir(os.path.join(wdir, 'r'))))
    with PoolExecutor(MAX_WORKERS) as pe:
        q = list()
        for subreddit in subreddits:
            job = pe.submit(process_subreddit, subreddit)
            q.append(job)
        for job in q:
            job.result()
    build_summary()


def build_summary():
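    """Aggregate all r/*/subreddit.json and meta.json files into r.json
    and ri.json at the workspace root, restoring corrupted per-subreddit
    files from the previous aggregates when possible.
    """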
    rjpath = Path(wdir, 'r.json')
    rijpath = Path(wdir, 'ri.json')
    oldsrs = dict()
    oldsrsi = dict()
    if rjpath.exists():
        oldsrs = json.loads(rjpath.read_text())
    if rijpath.exists():
        oldsrsi = json.loads(rijpath.read_text())
    srs = dict()
    srsi = dict()
    for srp in Path(wdir, 'r').glob('*/subreddit.json'):
        sr = srp.parent.name.lower()
        srip = srp.parent.joinpath('meta.json')
        try:
            srs[sr] = json.loads(srp.read_text())
        except json.decoder.JSONDecodeError:
            if sr not in oldsrs:
                raise
            else:
                print('Restoring old data for corrupted subreddit %r' % sr)
                srs[sr] = oldsrs[sr]
                srp.write_text(json.dumps(oldsrs[sr], indent=1))
        if srip.exists():
            try:
                srsi[sr] = json.loads(srip.read_text())
            except json.decoder.JSONDecodeError:
                if sr not in oldsrsi:
                    raise
                else:
                    print('Restoring old data for corrupted subreddit %r' % sr)
                    srsi[sr] = oldsrsi[sr]
                    srip.write_text(json.dumps(oldsrsi[sr], indent=1))
    rjpath.write_text(json.dumps(srs, indent=1))
    rijpath.write_text(json.dumps(srsi, indent=1))


if __name__ == '__main__':
    main()