#!/usr/bin/env python3
# -*- encoding: utf-8 -*-
"""Parsers for reddit subreddit listing pages (old-reddit HTML and gateway JSON)."""

import datetime

import dateutil.parser

from .limits import minint, maxint
from .textTools import slugify

# Query-string arguments appended to every gateway API request.
GATEWAY_LINK_ARGS = '&'.join([
    "redditWebClient=web2x",
    "app=web2x-client-production",
    "allow_over18=1",
    "layout=card",
    "include=identity",
    "sort=new",
])


def build_gateway_link(sr, after=None, dist=0):
    """Build a gateway.reddit.com listing URL for subreddit *sr*.

    :param sr: subreddit name (without the ``r/`` prefix).
    :param after: pagination token; omitted from the URL when ``None``.
    :param dist: item-count hint; omitted from the URL when ``<= 0``.
    :return: fully-qualified URL string.
    """
    dist_part = [] if dist <= 0 else [f'dist={dist}']
    after_part = [] if after is None else [f'after={after}']
    return '&'.join([
        f"https://gateway.reddit.com/desktopapi/v1/subreddits/{sr}?" + GATEWAY_LINK_ARGS,
        *dist_part,
        *after_part,
    ])


def getInfoFromRedditItem(bs):
    """Extract a link-post record from one old-reddit HTML ``thing`` element.

    :param bs: BeautifulSoup tag for a single listing entry.
    :return: dict with nsfw/link/title/flair/sharer/domain/datakey/timestamp.
    """
    nsfw = 'over18' in bs['class']
    sharer = bs.find(class_='author').text.strip()
    title_anchor = bs.find('a', class_='title')
    title = title_anchor.text.strip()
    link = str(title_anchor['href'])
    domain = 'reddit.com'
    domain_anchor = bs.find('span', class_='domain').find('a')
    if domain_anchor is not None:
        domain = domain_anchor.text.strip()
    datakey = bs['data-fullname']
    # BUGFIX: strftime('%s') is a non-portable glibc extension and ignores
    # tzinfo, so tz-aware datetimes parsed from reddit's ISO attribute got
    # the wrong epoch on non-UTC hosts. timestamp() handles tz correctly.
    timestamp = int(dateutil.parser.parse(bs.find('time')['datetime']).timestamp())
    flair = None
    try:
        flair = bs.find('span', class_='linkflairlabel').text.strip()
    except AttributeError:  # find() returned None: post has no flair element
        pass
    return {
        'nsfw': nsfw,
        'link': link,
        'title': title,
        'flair': flair,
        'sharer': sharer,
        'domain': domain,
        'datakey': datakey,
        'timestamp': timestamp,
    }


def getInfoFromRedditJsonItem(jo):
    """Normalize one gateway-API post object into the scraper's record format.

    :param jo: decoded JSON dict for one post (as found in ``jo['posts']``).
    :return: dict with nsfw/link/title/flair/sharer/domain/datakey/timestamp.
    """
    if 'source' in jo and jo['source'] is not None:
        link = jo['source']['url']
    else:
        link = jo['media']['content']
    flair = next((f['text'] for f in jo['flair'] if f['type'] == 'text'), None)
    return {
        'nsfw': jo['isNSFW'],
        'link': link,
        'title': jo['title'],
        'flair': flair,
        'sharer': jo['author'],
        'domain': jo['domain'],
        'datakey': jo['id'],
        # gateway 'created' is in milliseconds; records store whole seconds
        'timestamp': jo['created'] // 1000,
    }


def getEmptySubredditData(srname):
    """Return a fresh per-subreddit accumulator with no links collected yet."""
    return {
        'subreddit': srname,
        'date_first': minint,
        'date_last': maxint,
        'links': [],
    }


def _jsonPostIsUsable(post):
    """True when a gateway post dict has every field getInfoFromRedditJsonItem needs."""
    has_url = (
        ('source' in post and post['source'] is not None and 'url' in post['source'])
        or ('media' in post and post['media'] is not None and 'content' in post['media'])
    )
    has_domain = 'domain' in post and post['domain'] is not None
    has_id = 'id' in post and isinstance(post['id'], str) and len(post['id']) < 20
    return has_url and has_domain and has_id


def getSubredditPageJsonInfo(jo, subreddit, pageno):
    """Parse one gateway-API listing page.

    :param jo: decoded JSON object with 'postIds', 'posts', 'token', 'dist'.
    :param subreddit: subreddit name, used to build the next-page link.
    :param pageno: current page number, used for the next page's ``dist``.
    :return: (first_timestamp, last_timestamp, next_page_url_or_None, links).
    """
    if len(jo['postIds']) <= 0:
        return maxint, minint, None, []
    structured_links = [
        getInfoFromRedditJsonItem(jo['posts'][post_id])
        for post_id in jo['postIds']
        if _jsonPostIsUsable(jo['posts'][post_id])
    ]
    nextlink = (
        None if jo['token'] is None
        else build_gateway_link(subreddit, jo['token'], jo['dist'] * pageno + 1)
    )
    # BUGFIX: every post may be filtered out even when postIds was non-empty;
    # indexing structured_links[0] previously raised IndexError here.
    if not structured_links:
        return maxint, minint, nextlink, []
    return (
        structured_links[0]['timestamp'],
        structured_links[-1]['timestamp'],
        nextlink,
        structured_links,
    )


def getSubredditPageInfo(bs):
    """Parse one old-reddit HTML listing page.

    :param bs: BeautifulSoup document for the whole page.
    :return: (first_timestamp, last_timestamp, next_page_url_or_None, links).
    """
    pagetable = bs.find(id='siteTable')
    discussions = pagetable.find_all(
        lambda a: a.has_attr('class') and 'thing' in a['class']
    )
    # Self-posts carry no external link; keep only link posts.
    links = [d for d in discussions if 'self' not in d['class']]
    first = minint
    last = maxint
    # Missing/unparseable <time> elements leave the minint/maxint defaults.
    # (timestamp() replaces non-portable, tz-ignoring strftime('%s').)
    try:
        first = int(dateutil.parser.parse(discussions[0].find('time')['datetime']).timestamp())
    except (IndexError, AttributeError, KeyError, TypeError, ValueError, OverflowError):
        pass
    try:
        last = int(dateutil.parser.parse(discussions[-1].find('time')['datetime']).timestamp())
    except (IndexError, AttributeError, KeyError, TypeError, ValueError, OverflowError):
        pass
    nextpage = None
    # Any link in the chain may be absent on the last page; find() then
    # returns None and the chained access raises AttributeError.
    try:
        nextpage = bs.find('div', class_='nav-buttons').find(class_='nextprev').find(class_='next-button').find('a')['href']
    except (AttributeError, KeyError, TypeError):
        pass
    structured_links = list(map(getInfoFromRedditItem, links))
    return first, last, nextpage, structured_links


def assembleFileName(subreddit, link, seq, ext):
    """Build a filesystem-safe file name for a downloaded link.

    :param subreddit: subreddit the link came from.
    :param link: record dict as produced by getInfoFromRedditItem/JsonItem.
    :param seq: sequence number within the post (zero-padded to 4 digits).
    :param ext: file extension, without the leading dot.
    :return: the assembled file name string.
    """
    when = (
        datetime.datetime.fromtimestamp(int(link['timestamp']))
        .isoformat()
        .replace('T', '_')
        .replace(':', '-')
    )
    # ''.join over a parts list replaces the original quadratic += chain.
    parts = [
        subreddit,
        '__',
        when,
        '_',
        'nsfw' if link['nsfw'] else 'safe',
        '___',
        '-' if link['flair'] is None else slugify(link['flair']),
        '___',
        '-' if link['sharer'] is None else slugify(link['sharer']),
        '___',
        slugify(link['title'][:50]),
        '___',
        slugify(link['datakey']),
        '___',
        '%04d' % seq,
        '.' + ext,
    ]
    return ''.join(parts)