#!/usr/bin/env python3
# -*- encoding: utf-8 -*-
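"""Incremental subreddit listing crawler.

For every subreddit folder under ./r/, pages through Reddit's gateway
listing until it reaches posts that are already stored, then updates the
per-subreddit `subreddit.json` (link list plus date range) and `meta.json`
(subreddit metadata). `build_summary` merges all `subreddit.json` files
into a single ./r.json.
"""
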
import json
import os
from concurrent.futures import ProcessPoolExecutor as PoolExecutor
from pathlib import Path
from urllib.error import ContentTooShortError, HTTPError, URLError

from bs4 import BeautifulSoup as _BS

from .system import simpleDownloader
from .system.subredditTools import (GATEWAY_LINK_ARGS, build_gateway_link,
                                    getEmptySubredditData,
                                    getSubredditPageJsonInfo)


def BeautifulSoup(data): return _BS(data, 'html5lib')


# NSFW listings are gated behind Reddit's over-18 cookie.
simpleDownloader.setCookies({'over18': 1})

# Archive root: expects one folder per subreddit under ./r/.
wdir = os.path.abspath('.')


def process_subreddit(subreddit):
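    """Crawl one subreddit's gateway listing and update its JSON files on disk.

    Resumes from r/<subreddit>/subreddit.json when it exists and stops paging
    as soon as the newest stored post is at least as recent as the newest
    post returned by the gateway.
    """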
    # Each ProcessPoolExecutor worker is a fresh process, so the cookie has to
    # be set again here.
    simpleDownloader.setCookies({'over18': 1})
    srp = os.path.abspath(os.path.join(wdir, 'r', subreddit))
    #if subreddit!='yiff': continue
    nextpage = build_gateway_link(subreddit)
    srdt = getEmptySubredditData(subreddit)
    try:
        # Resume from the previously stored data if it exists and is readable.
        with open(os.path.join(srp, 'subreddit.json')) as f:
            srdt = json.loads(f.read())
    except (OSError, json.JSONDecodeError):
        pass
    #srdt = getEmptySubredditData(subreddit)
    pageno = 0
    ygst = srdt['date_first']
    jsonPageSr = None
    while nextpage:
        pageno += 1
        print('/r/{0:<20} loading page #{1:05d}'.format(subreddit, pageno))
        print(' >> %s' % (nextpage.replace(GATEWAY_LINK_ARGS, '[...]'),))
        redditBytes = None
        try:
            redditBytes = simpleDownloader.getUrlBytes(nextpage)
        except (HTTPError, URLError, ContentTooShortError) as e:
            print(" >> HTTP error (%s): skipping..." % e)
            break
        if redditBytes is None:
            print(" >> HTTP error: skipping...")
            break
        # bs = BeautifulSoup(redditBytes)
        jsonPage = json.loads(redditBytes)
        first, last, nextpage, links = getSubredditPageJsonInfo(jsonPage, subreddit, pageno)
        if ygst >= first:  # the newest stored post is at least as recent as the newest downloaded one, so we are up-to-date
            nextpage = None
        srdt['date_first'] = max(first, srdt['date_first'])
        srdt['date_last'] = min(last, srdt['date_last'])
        for link in links[::-1]:
            if link not in srdt['links']:
                srdt['links'].append(link)
        # The gateway response keys its metadata maps by subreddit id; pick the
        # id that appears in all three maps.
        srid = next(iter(set.intersection(
            set(jsonPage['subreddits'].keys()),
            set(jsonPage['postFlair'].keys()),
            set(jsonPage['subredditAboutInfo'].keys())
        )))
        jsonPageSr = dict(
            id=srid,
            name=subreddit,
            definition=jsonPage['subreddits'][srid],
            about=jsonPage['subredditAboutInfo'][srid],
            flair=jsonPage['postFlair'][srid],
        )
    with open(os.path.join(srp, 'subreddit.json'), 'w') as f:
        f.write(json.dumps(srdt, sort_keys=True, indent=2))
    if jsonPageSr is not None:
        with open(os.path.join(srp, 'meta.json'), 'w') as f:
            f.write(json.dumps(jsonPageSr, sort_keys=True, indent=2))


def main():
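    """Rebuild the summary, crawl all tracked subreddits in parallel, then rebuild the summary again."""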
    build_summary()
    subreddits = sorted(filter(lambda sr: os.path.isdir(os.path.join(wdir, 'r', sr)),
                               os.listdir(os.path.join(wdir, 'r'))))
    with PoolExecutor(16) as pe:
        q = list()
        for subreddit in subreddits:
            job = pe.submit(process_subreddit, subreddit)
            q.append(job)
        for job in q:
            # Propagate any exception raised inside a worker process.
            job.result()
    build_summary()


def build_summary():
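    """Merge every r/*/subreddit.json into ./r.json, restoring corrupted files from the previous summary when possible."""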
    rjpath = Path(wdir, 'r.json')
    oldsrs = dict()
    if rjpath.exists():
        oldsrs = json.loads(rjpath.read_text())
    srs = dict()
    for srp in Path(wdir, 'r').glob('*/subreddit.json'):
        sr = srp.parent.name.lower()
        try:
            srs[sr] = json.loads(srp.read_text())
        except json.decoder.JSONDecodeError:
            if sr not in oldsrs:
                raise
            else:
                print('Restoring old data for corrupted subreddit %r' % sr)
                srs[sr] = oldsrs[sr]
                srp.write_text(json.dumps(oldsrs[sr], indent=1))
    rjpath.write_text(json.dumps(srs, indent=1))


if __name__ == '__main__':
    main()