reddit-image-wall-getter/reddit_imgs/sync.py

#!/usr/bin/env python3
# -*- encoding: utf-8 -*-
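"""Synchronize subreddit listings fetched from Reddit's JSON gateway.

Each subreddit gets an r/<name>/subreddit.json (plus meta.json), and
build_summary() merges them all into a single r.json file.
"""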

import json
import os
from concurrent.futures import ProcessPoolExecutor as PoolExecutor
from pathlib import Path
from urllib.error import ContentTooShortError, HTTPError, URLError

from bs4 import BeautifulSoup as _BS

from .system import simpleDownloader
from .system.subredditTools import (GATEWAY_LINK_ARGS, build_gateway_link,
                                    getEmptySubredditData,
                                    getSubredditPageJsonInfo)


def BeautifulSoup(data):
    return _BS(data, 'html5lib')


simpleDownloader.setCookies({'over18': 1})
wdir = os.path.abspath('.')


def process_subreddit(subreddit):
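    """Fetch listing pages for one subreddit and merge them into its JSON files.

    New links are appended to r/<subreddit>/subreddit.json; the subreddit's
    gateway metadata (definition, about, flair) is written to meta.json.
    """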
    simpleDownloader.setCookies({'over18': 1})
    srp = os.path.abspath(os.path.join(wdir, 'r', subreddit))
    nextpage = build_gateway_link(subreddit)
    srdt = getEmptySubredditData(subreddit)
    try:
        # Resume from the previously synced state when it exists.
        with open(os.path.join(srp, 'subreddit.json')) as f:
            srdt = json.loads(f.read())
    except (OSError, ValueError):
        pass
    pageno = 0
    ygst = srdt['date_first']
    jsonPageSr = None
    while nextpage:
        pageno += 1
        print(('/r/{0:<20} loading page #%05d' % pageno).format(subreddit))
        print(' >> %s' % (nextpage.replace(GATEWAY_LINK_ARGS, '[...]'),))
        redditBytes = None
        try:
            redditBytes = simpleDownloader.getUrlBytes(nextpage)
        except (HTTPError, URLError, ContentTooShortError) as e:
            print(" >> HTTP error (%s): skipping..." % e)
            break
        if redditBytes is None:
            print(" >> HTTP error: skipping...")
            break
        jsonPage = json.loads(redditBytes)
        first, last, nextpage, links = getSubredditPageJsonInfo(jsonPage, subreddit, pageno)
        if ygst >= first:
            # The newest stored post is at least as recent as the newest
            # downloaded one, so the local copy is already up to date.
            nextpage = None
        srdt['date_first'] = max(first, srdt['date_first'])
        srdt['date_last'] = min(last, srdt['date_last'])
        for link in links[::-1]:
            if link not in srdt['links']:
                srdt['links'].append(link)
        srid = next(iter(set.intersection(
            set(jsonPage['subreddits'].keys()),
            set(jsonPage['postFlair'].keys()),
            set(jsonPage['subredditAboutInfo'].keys())
        )))
        jsonPageSr = dict(
            id=srid,
            name=subreddit,
            definition=jsonPage['subreddits'][srid],
            about=jsonPage['subredditAboutInfo'][srid],
            flair=jsonPage['postFlair'][srid],
        )
    with open(os.path.join(srp, 'subreddit.json'), 'w') as f:
        f.write(json.dumps(srdt, sort_keys=True, indent=2))
    if jsonPageSr is not None:
        with open(os.path.join(srp, 'meta.json'), 'w') as f:
            f.write(json.dumps(jsonPageSr, sort_keys=True, indent=2))


def main():
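    """Sync every subreddit directory under r/ using a pool of worker processes."""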
    build_summary()
    subreddits = sorted(filter(
        lambda sr: os.path.isdir(os.path.join(wdir, 'r', sr)),
        os.listdir(os.path.join(wdir, 'r'))))
    with PoolExecutor(16) as pe:
        q = list()
        for subreddit in subreddits:
            job = pe.submit(process_subreddit, subreddit)
            q.append(job)
        for job in q:
            job.result()
    build_summary()


def build_summary():
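    """Merge all r/*/subreddit.json files into r.json, restoring corrupted ones from the previous summary."""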
    rjpath = Path(wdir, 'r.json')
    oldsrs = dict()
    if rjpath.exists():
        oldsrs = json.loads(rjpath.read_text())
    srs = dict()
    for srp in Path(wdir, 'r').glob('*/subreddit.json'):
        sr = srp.parent.name.lower()
        try:
            srs[sr] = json.loads(srp.read_text())
        except json.decoder.JSONDecodeError:
            if sr not in oldsrs:
                raise
            print('Restoring old data for corrupted subreddit %r' % sr)
            srs[sr] = oldsrs[sr]
            srp.write_text(json.dumps(oldsrs[sr], indent=1))
    rjpath.write_text(json.dumps(srs, indent=1))


if __name__ == '__main__':
    main()