reddit-image-wall-getter/reddit_imgs/sync.py

#!/usr/bin/env python3
# -*- encoding: utf-8 -*-

import json
import os
from concurrent.futures import ProcessPoolExecutor as PoolExecutor
from pathlib import Path
from urllib.error import ContentTooShortError, HTTPError, URLError

from bs4 import BeautifulSoup as _BS

from .system import simpleDownloader
from .system.subredditTools import (GATEWAY_LINK_ARGS, build_gateway_link,
                                    getEmptySubredditData,
                                    getSubredditPageJsonInfo)


def BeautifulSoup(data): return _BS(data, 'html5lib')


simpleDownloader.setCookies({'over18': 1})

wdir = os.path.abspath('.')


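# Crawl a single subreddit's gateway listing and merge newly found links
# into its on-disk r/<subreddit>/subreddit.json.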
def process_subreddit(subreddit):
    simpleDownloader.setCookies({'over18': 1})
    srp = os.path.abspath(os.path.join(wdir, 'r', subreddit))
    #if subreddit!='yiff': continue
    nextpage = build_gateway_link(subreddit)
    srdt = getEmptySubredditData(subreddit)
    try:
        # Resume from a previous crawl if subreddit.json already exists.
        with open(os.path.join(srp, 'subreddit.json')) as f:
            srdt = json.loads(f.read())
    except (OSError, json.JSONDecodeError):
        pass
    #srdt = getEmptySubredditData(subreddit)
    pageno = 0
    ygst = srdt['date_first']
    jsonPageSr = None
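    # Page through the gateway listing until there is no next page or we
    # reach posts that are already stored locally.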
    while nextpage:
        pageno += 1
        print(('/r/{0:<20} loading page #%05d' % pageno).format(subreddit))
        print(' >> %s' % (nextpage.replace(GATEWAY_LINK_ARGS, '[...]'),))
        redditBytes = None
        try:
            redditBytes = simpleDownloader.getUrlBytes(nextpage)
        except (HTTPError, URLError, ContentTooShortError) as e:
            print(" >> HTTP error (%s): skipping..." % e)
            break
        if redditBytes is None:
            print(" >> HTTP error: skipping...")
            break
        # bs = BeautifulSoup(redditBytes)
        jsonPage = json.loads(redditBytes)
        first, last, nextpage, links = getSubredditPageJsonInfo(jsonPage, subreddit, pageno)
        if ygst >= first:
            # The newest stored post is at least as recent as the newest
            # downloaded post, so the local copy is already up to date.
            nextpage = None
        srdt['date_first'] = max(first, srdt['date_first'])
        srdt['date_last'] = min(last, srdt['date_last'])
        for link in links[::-1]:
            if link not in srdt['links']:
                srdt['links'].append(link)
        srid = next(iter(set.intersection(
            set(jsonPage['subreddits'].keys()),
            set(jsonPage['postFlair'].keys()),
            set(jsonPage['subredditAboutInfo'].keys())
        )))
        jsonPageSr = dict(
            id=srid,
            name=subreddit,
            definition=jsonPage['subreddits'][srid],
            about=jsonPage['subredditAboutInfo'][srid],
            flair=jsonPage['postFlair'][srid],
        )
    with open(os.path.join(srp, 'subreddit.json'), 'w') as f:
        f.write(json.dumps(srdt, sort_keys=True, indent=2))
    if jsonPageSr is not None:
        with open(os.path.join(srp, 'meta.json'), 'w') as f:
            f.write(json.dumps(jsonPageSr, sort_keys=True, indent=2))


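# Run every subreddit crawl in a pool of worker processes, rebuilding the
# aggregated r.json summary before and after the crawl.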
def main():
    build_summary()
    subreddits = sorted(filter(
        lambda sr: os.path.isdir(os.path.join(wdir, 'r', sr)),
        os.listdir(os.path.join(wdir, 'r'))))
    with PoolExecutor(16) as pe:
        q = list()
        for subreddit in subreddits:
            job = pe.submit(process_subreddit, subreddit)
            q.append(job)
        for job in q:
            job.result()
    build_summary()


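# Merge every r/*/subreddit.json into a single r.json summary, restoring a
# subreddit's entry from the previous summary when its file is corrupt.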
def build_summary():
    rjpath = Path(wdir, 'r.json')
    oldsrs = dict()
    if rjpath.exists():
        oldsrs = json.loads(rjpath.read_text())
    srs = dict()
    for srp in Path(wdir, 'r').glob('*/subreddit.json'):
        sr = srp.parent.name.lower()
        try:
            srs[sr] = json.loads(srp.read_text())
        except json.decoder.JSONDecodeError:
            if sr not in oldsrs:
                raise
            else:
                print('Restoring old data for corrupted subreddit %r' % sr)
                srs[sr] = oldsrs[sr]
                srp.write_text(json.dumps(oldsrs[sr], indent=1))
    rjpath.write_text(json.dumps(srs, indent=1))


if __name__ == '__main__':
    main()