#!/usr/bin/env python3
# -*- encoding: utf-8 -*-

import json
import os

from bs4 import BeautifulSoup as _BS

from .system import simpleDownloader
from .system.subredditTools import getEmptySubredditData, getSubredditPageInfo


def BeautifulSoup(data):
    return _BS(data, 'html5lib')


# Send the over-18 cookie so NSFW subreddits don't return the age interstitial.
simpleDownloader.setCookies({'over18': 1})

wdir = os.path.abspath('.')


def main():
    # Every directory under ./r/ is treated as a subreddit to update.
    subreddits = sorted(
        sr for sr in os.listdir(os.path.join(wdir, 'r'))
        if os.path.isdir(os.path.join(wdir, 'r', sr))
    )
    for subreddit in subreddits:
        srp = os.path.abspath(os.path.join(wdir, 'r', subreddit))
        nextpage = 'https://www.reddit.com/r/' + subreddit + '/new/?count=0'
        srdt = getEmptySubredditData(subreddit)
        try:
            # Resume from previously saved state, if any.
            with open(os.path.join(srp, 'subreddit.json')) as f:
                srdt = json.load(f)
        except (FileNotFoundError, json.JSONDecodeError):
            pass
        pageno = 0
        ygst = srdt['date_first']  # timestamp of the newest post already stored
        while nextpage:
            pageno += 1
            print('/r/{0:<20} loading page #{1:05d}'.format(subreddit, pageno))
            print(' >> %s' % nextpage)
            redditBytes = simpleDownloader.getUrlBytes(nextpage)
            bs = BeautifulSoup(redditBytes)
            first, last, nextpage, links = getSubredditPageInfo(bs)
            if ygst >= first:
                # The newest stored post is as new as (or newer than) the newest
                # post on this page, so we are already up to date: stop paging.
                nextpage = None
            srdt['date_first'] = max(first, srdt['date_first'])
            srdt['date_last'] = min(last, srdt['date_last'])
            # Append new links oldest-first so the stored list stays chronological.
            for link in links[::-1]:
                if link not in srdt['links']:
                    srdt['links'].append(link)
            # Persist progress after every page so an interrupted run can resume.
            with open(os.path.join(srp, 'subreddit.json'), 'w') as f:
                json.dump(srdt, f, sort_keys=True, indent=2)


if __name__ == '__main__':
    main()