#!/usr/bin/env python3
# -*- encoding: utf-8 -*-
import os
from bs4 import BeautifulSoup as _BS
from .system import simpleDownloader
from .system.subredditTools import getEmptySubredditData, getSubredditPageJsonInfo, build_gateway_link, GATEWAY_LINK_ARGS
import json
from pathlib import Path
def BeautifulSoup(data):
    """Parse *data* with BeautifulSoup, always using the html5lib backend."""
    return _BS(data, 'html5lib')
# Send the over-18 cookie so age-gated subreddits are served without the interstitial.
simpleDownloader.setCookies({'over18':1})
# Working directory root; subreddit data lives under <wdir>/r/<name>/.
wdir = os.path.abspath('.')
def main():
    """Synchronise the local per-subreddit link archives with reddit.

    For every directory under ``r/``, walk the subreddit's gateway listing
    page by page, merging newly seen links into ``r/<name>/subreddit.json``.
    Paging stops as soon as a page whose newest post is not newer than the
    newest already-stored post is reached (the archive is then up to date).
    ``build_summary`` runs before and after so ``r.json`` stays fresh.
    """
    build_summary()
    subreddits = sorted(filter(
        lambda sr: os.path.isdir(os.path.join(wdir, 'r', sr)),
        os.listdir(os.path.join(wdir, 'r'))))
    for subreddit in subreddits:
        srp = os.path.abspath(os.path.join(wdir, 'r', subreddit))
        nextpage = build_gateway_link(subreddit)
        srdt = getEmptySubredditData(subreddit)
        try:
            with open(os.path.join(srp, 'subreddit.json')) as f:
                srdt = json.loads(f.read())
        except (OSError, ValueError):
            # First sync for this subreddit, or a corrupt cache file: keep
            # the empty skeleton built above.  Narrowed from a bare `except`
            # so Ctrl+C is no longer swallowed here; json.JSONDecodeError
            # is a ValueError subclass.
            pass
        pageno = 0
        # Timestamp of the newest post already stored; pages at or before
        # this point contain nothing new.
        ygst = srdt['date_first']
        while nextpage:
            pageno += 1
            print(('/r/{0:<20} loading page #%05d' % pageno).format(subreddit))
            print(' >> %s' % (nextpage.replace(GATEWAY_LINK_ARGS, '[...]'),))
            redditBytes = None
            try:
                redditBytes = simpleDownloader.getUrlBytes(nextpage)
            except Exception as e:
                # Exception (not BaseException) so KeyboardInterrupt and
                # SystemExit propagate; the message now actually includes
                # the error it promises.
                print(" >> HTTP Error (%s): Skipping..." % (e,))
                break
            if redditBytes is None:
                print(" >> HTTP Error: Skipping...")
                break
            jsonPage = json.loads(redditBytes)
            first, last, nextpage, links = getSubredditPageJsonInfo(
                jsonPage, subreddit, pageno)
            if ygst >= first:
                # Latest stored post is as new as the newest downloaded one:
                # everything beyond this page is already archived.
                nextpage = None
            srdt['date_first'] = max(first, srdt['date_first'])
            srdt['date_last'] = min(last, srdt['date_last'])
            # Append oldest-first, skipping links already present.
            for link in links[::-1]:
                if link not in srdt['links']:
                    srdt['links'].append(link)
        with open(os.path.join(srp, 'subreddit.json'), 'w') as f:
            f.write(json.dumps(srdt, sort_keys=True, indent=2))
    build_summary()
def build_summary():
    """Aggregate every r/<name>/subreddit.json into a single r.json index."""
    summary = {
        sr_file.parent.name.lower(): json.loads(sr_file.read_text())
        for sr_file in Path(wdir, 'r').glob('*/subreddit.json')
    }
    Path(wdir, 'r.json').write_text(json.dumps(summary, indent=1))
# Allow running directly as a script (also importable as reddit_imgs.sync).
if __name__ == '__main__':
    main()