reddit-image-wall-getter/reddit_imgs/sync.py

#!/usr/bin/env python3
# -*- encoding: utf-8 -*-
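"""Crawl the listing pages of every tracked subreddit into JSON files.

For each directory under r/, process_subreddit() pages through the gateway
listing URLs built by build_gateway_link(), accumulating post links into
r/<subreddit>/subreddit.json and subreddit metadata into r/<subreddit>/meta.json;
build_summary() then aggregates those files into r.json and ri.json.
cmdline() is the entry point used by the project's command-line wrapper.
"""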

import json
import os
from concurrent.futures import ProcessPoolExecutor as PoolExecutor
from pathlib import Path
from urllib.error import ContentTooShortError, HTTPError, URLError

from bs4 import BeautifulSoup as _BS

from .system import simpleDownloader
from .system.cmdline_parser import parse_cmdline
from .system.subredditTools import (GATEWAY_LINK_ARGS, build_gateway_link,
                                    getEmptySubredditData,
                                    getSubredditPageJsonInfo)

MAX_WORKERS = 16


def BeautifulSoup(data): return _BS(data, 'html5lib')


def cmdline(encoded_args: str = None):
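    """Command-line entry point.

    With no encoded_args the sync runs with default settings; otherwise the
    encoded argument string is handed to parse_cmdline along with
    run_with_config.
    """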
    if encoded_args is None:
        return run_with_config()
    else:
        return parse_cmdline(run_with_config, encoded_args)


def run_with_config(max_workers: int = None,
                    ):
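    """Apply the max_workers override (if given) to MAX_WORKERS and run the sync."""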
    global MAX_WORKERS
    if max_workers is not None:
        MAX_WORKERS = max_workers
    return main()


simpleDownloader.setCookies({'over18': 1})

wdir = os.path.abspath('.')


def process_subreddit(subreddit):
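    """Download the listing pages of a single subreddit and update its files on disk.

    Loads any existing r/<subreddit>/subreddit.json, walks the paginated
    gateway listing until it reaches posts that are already stored (or the
    listing ends), then rewrites subreddit.json and, when metadata was seen,
    meta.json.
    """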
    # make sure the over-18 cookie is also set inside this worker process
    simpleDownloader.setCookies({'over18': 1})
    srp = os.path.abspath(os.path.join(wdir, 'r', subreddit))
    # if subreddit!='yiff': continue
    nextpage = build_gateway_link(subreddit)
    srdt = getEmptySubredditData(subreddit)
    try:
        with open(os.path.join(srp, 'subreddit.json')) as f:
            srdt = json.loads(f.read())
    except BaseException:
        pass
    #srdt = getEmptySubredditData(subreddit)
    pageno = 0
    ygst = srdt['date_first']
    jsonPageSr = None
    while nextpage:
        pageno += 1
        print(('/r/{0:<20} loading page #%05d' % pageno).format(subreddit))
        print(' >> %s' % (nextpage.replace(GATEWAY_LINK_ARGS, '[...]'),))
        redditBytes = None
        try:
            redditBytes = simpleDownloader.getUrlBytes(nextpage)
        except (HTTPError, URLError, ContentTooShortError):
            print(('/r/{0:<20} loading page #%05d' % pageno).format(subreddit))
            print(" >> HTTP Error with code: Skipping...")
            break
        if redditBytes is None:
            print(('/r/{0:<20} loading page #%05d' % pageno).format(subreddit))
            print(" >> HTTP Error: Skipping...")
            break
        # bs = BeautifulSoup(redditBytes)
        jsonPage = json.loads(redditBytes)
        getSubredditPageJsonInfoResult = None
        try:
            getSubredditPageJsonInfoResult = (
                getSubredditPageJsonInfo(jsonPage, subreddit, pageno))
        except IndexError:
            print(('/r/{0:<20} loading page #%05d' % pageno).format(subreddit))
            print(" >> Empty subreddit: Skipping...")
            break
        first, last, nextpage, links = getSubredditPageJsonInfoResult
        if ygst >= first:  # if the newest stored post is at least as recent as the newest downloaded post, we are up-to-date
            nextpage = None
        srdt['date_first'] = max(first, srdt['date_first'])
        srdt['date_last'] = min(last, srdt['date_last'])
        for link in links[::-1]:
            if link not in srdt['links']:
                srdt['links'].append(link)
        srid = next(iter(set.intersection(
            set(jsonPage['subreddits'].keys()),
            set(jsonPage['postFlair'].keys()),
            set(jsonPage['subredditAboutInfo'].keys())
        )))
        jsonPageSr = dict(
            id=srid,
            name=subreddit,
            definition=jsonPage['subreddits'][srid],
            about=jsonPage['subredditAboutInfo'][srid],
            flair=jsonPage['postFlair'][srid],
        )
    with open(os.path.join(srp, 'subreddit.json'), 'w') as f:
        f.write(json.dumps(srdt, sort_keys=True, indent=2))
    if jsonPageSr is not None:
        with open(os.path.join(srp, 'meta.json'), 'w') as f:
            f.write(json.dumps(jsonPageSr, sort_keys=True, indent=2))


def main():
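    """Process every subreddit directory under r/ with a pool of worker processes.

    build_summary() runs both before and after the crawl so that r.json and
    ri.json are rebuilt from the per-subreddit files on disk.
    """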
    build_summary()
    subreddits = sorted(filter(lambda sr: os.path.isdir(
        os.path.join(wdir, 'r', sr)), os.listdir(os.path.join(wdir, 'r'))))
    with PoolExecutor(MAX_WORKERS) as pe:
        q = list()
        for subreddit in subreddits:
            job = pe.submit(process_subreddit, subreddit)
            q.append(job)
        for job in q:
            job.result()
    build_summary()


def build_summary():
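    """Aggregate the per-subreddit subreddit.json / meta.json files into r.json / ri.json.

    If an individual file contains invalid JSON, its entry from the previous
    aggregate (if any) is restored and written back to disk.
    """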
    rjpath = Path(wdir, 'r.json')
    rijpath = Path(wdir, 'ri.json')
    oldsrs = dict()
    oldsrsi = dict()
    if rjpath.exists():
        oldsrs = json.loads(rjpath.read_text())
    if rijpath.exists():
        oldsrsi = json.loads(rijpath.read_text())
    srs = dict()
    srsi = dict()
    for srp in Path(wdir, 'r').glob('*/subreddit.json'):
        sr = srp.parent.name.lower()
        srip = srp.parent.joinpath('meta.json')
        try:
            srs[sr] = json.loads(srp.read_text())
        except json.decoder.JSONDecodeError:
            if sr not in oldsrs:
                raise
            else:
                print('Restoring old data for corrupted subreddit %r' % sr)
                srs[sr] = oldsrs[sr]
                srp.write_text(json.dumps(oldsrs[sr], indent=1))
        if srip.exists():
            try:
                srsi[sr] = json.loads(srip.read_text())
            except json.decoder.JSONDecodeError:
                if sr not in oldsrsi:
                    raise
                else:
                    print('Restoring old data for corrupted subreddit %r' % sr)
                    srsi[sr] = oldsrsi[sr]
                    srip.write_text(json.dumps(oldsrsi[sr], indent=1))
    rjpath.write_text(json.dumps(srs, indent=1))
    rijpath.write_text(json.dumps(srsi, indent=1))


if __name__ == '__main__':
    main()