#!/usr/bin/env python3
|
|
# -*- encoding: utf-8 -*-
|
|
|
|
import datetime
|
|
import dateutil.parser
|
|
from .limits import minint, maxint
|
|
from .textTools import slugify
|
|
|
|
# Fixed query-string parameters appended to every desktop-API gateway URL.
# Joined once at import time into a plain '&'-separated string.
GATEWAY_LINK_ARGS = '&'.join((
    "redditWebClient=web2x",
    "app=web2x-client-production",
    "allow_over18=1",
    "layout=card",
    "include=identity",
    "sort=new",
))
|
|
|
|
|
|
def build_gateway_link(sr, after=None, dist=0):
    """Assemble a gateway.reddit.com desktop-API listing URL for subreddit *sr*.

    The optional pagination parameters are appended only when meaningful:
    ``dist`` when it is positive, ``after`` when it is not None.
    """
    base = f"https://gateway.reddit.com/desktopapi/v1/subreddits/{sr}?" + GATEWAY_LINK_ARGS
    parts = [base]
    if dist > 0:
        parts.append(f'dist={dist}')
    if after is not None:
        parts.append(f'after={after}')
    return '&'.join(parts)
|
|
|
|
|
|
def getInfoFromRedditItem(bs):
    """Extract post metadata from one old-reddit HTML listing item.

    :param bs: BeautifulSoup tag for a single ``.thing`` listing entry.
    :returns: dict with keys nsfw/link/title/flair/sharer/domain/datakey/timestamp.
    """
    # Look the title anchor up once instead of once per field.
    title_anchor = bs.find('a', class_='title')

    # Self posts keep the default domain; external links carry their own.
    domain = 'reddit.com'
    domain_anchor = bs.find('span', class_='domain').find('a')
    if domain_anchor is not None:
        domain = domain_anchor.text.strip()

    # Portable epoch conversion: strftime('%s') is a glibc-only extension and
    # ignores the tz offset dateutil parsed; datetime.timestamp() honours it.
    timestamp = int(dateutil.parser.parse(bs.find('time')['datetime']).timestamp())

    # Not every post has a flair label; only swallow the missing-element case
    # (bs.find(...) returning None) instead of a bare except hiding real bugs.
    try:
        flair = bs.find('span', class_='linkflairlabel').text.strip()
    except AttributeError:
        flair = None

    return {
        'nsfw': 'over18' in bs['class'],
        'link': str(title_anchor['href']),
        'title': title_anchor.text.strip(),
        'flair': flair,
        'sharer': bs.find(class_='author').text.strip(),
        'domain': domain,
        'datakey': bs['data-fullname'],
        'timestamp': timestamp,
    }
|
|
|
|
|
|
def getInfoFromRedditJsonItem(jo):
    """Normalize one post object from the gateway JSON API into a link dict."""
    # Prefer the original source URL; fall back to the hosted media content.
    if ('source' in jo) and (jo['source'] is not None):
        link = jo['source']['url']
    else:
        link = jo['media']['content']

    # First text-type flair entry, or None when there is none.
    flair = None
    for entry in jo['flair']:
        if entry['type'] == 'text':
            flair = entry['text']
            break

    return {
        'nsfw': jo['isNSFW'],
        'link': link,
        'title': jo['title'],
        'flair': flair,
        'sharer': jo['author'],
        'domain': jo['domain'],
        'datakey': jo['id'],
        'timestamp': jo['created'] // 1000,
    }
|
|
|
|
|
|
def getEmptySubredditData(srname):
    """Return a fresh persistence record for a subreddit with no links yet."""
    # date_first/date_last start at the extreme sentinels so the first real
    # page of results narrows them.
    return dict(
        subreddit=srname,
        date_first=minint,
        date_last=maxint,
        links=[],
    )
|
|
|
|
|
|
def getSubredditPageJsonInfo(jo, subreddit, pageno):
    """Digest one gateway JSON page into ``(first_ts, last_ts, next_link, links)``.

    :param jo: decoded gateway response (keys: postIds, posts, token, dist).
    :param subreddit: subreddit name, used to build the next-page link.
    :param pageno: page number, used for the ``dist`` pagination offset.
    :returns: tuple of (timestamp of first kept link, timestamp of last kept
              link, next-page URL or None, list of structured link dicts).
    """
    structured_links = []
    for postId in jo['postIds']:
        post = jo['posts'][postId]
        # A post is usable when it has a downloadable URL, a domain, and a
        # sane-looking id (promoted/garbage entries carry oversized ids).
        has_source_url = ('source' in post and post['source'] is not None
                          and 'url' in post['source'])
        has_media = ('media' in post and post['media'] is not None
                     and 'content' in post['media'])
        has_domain = ('domain' in post) and (post['domain'] is not None)
        sane_id = ('id' in post and isinstance(post['id'], str)
                   and len(post['id']) < 20)
        if (has_source_url or has_media) and has_domain and sane_id:
            structured_links.append(getInfoFromRedditJsonItem(post))

    # No usable posts: either postIds was empty or the filter rejected every
    # entry. The original only guarded the first case and raised IndexError
    # on structured_links[0] in the second; return the sentinels for both.
    if not structured_links:
        return maxint, minint, None, []

    next_link = None
    if jo['token'] is not None:
        next_link = build_gateway_link(subreddit, jo['token'], jo['dist'] * pageno + 1)

    return (
        structured_links[0]['timestamp'],
        structured_links[-1]['timestamp'],
        next_link,
        structured_links,
    )
|
|
|
|
|
|
def getSubredditPageInfo(bs):
    """Digest one old-reddit HTML listing page into ``(first, last, next, links)``.

    :param bs: BeautifulSoup document for the listing page.
    :returns: tuple of (timestamp of first item, timestamp of last item,
              next-page URL or None, structured dicts for non-self posts).
    """
    pagetable = bs.find(id='siteTable')
    discussions = pagetable.find_all(
        lambda tag: tag.has_attr('class') and 'thing' in tag['class']
    )
    # Self posts have no external link to download; keep only link posts.
    links = [d for d in discussions if 'self' not in d['class']]

    first = minint
    last = maxint
    # Pages can be empty or miss <time> elements; fall back to the sentinels.
    # The narrowed exception tuples replace bare excepts that hid real bugs;
    # timestamp() replaces the non-portable glibc-only strftime('%s').
    try:
        first = int(dateutil.parser.parse(discussions[0].find('time')['datetime']).timestamp())
    except (IndexError, AttributeError, TypeError, KeyError, ValueError):
        pass
    try:
        last = int(dateutil.parser.parse(discussions[-1].find('time')['datetime']).timestamp())
    except (IndexError, AttributeError, TypeError, KeyError, ValueError):
        pass

    # Any missing element along the next-button chain means this is the last
    # page; leave nextpage as None in that case.
    nextpage = None
    try:
        nextpage = bs.find('div', class_='nav-buttons').find(class_='nextprev').find(class_='next-button').find('a')['href']
    except (AttributeError, TypeError, KeyError):
        pass

    structured_links = list(map(getInfoFromRedditItem, links))
    return first, last, nextpage, structured_links
|
|
|
|
|
|
def assembleFileName(subreddit, link, seq, ext):
    """Build the deterministic on-disk file name for a downloaded link.

    Layout:
    ``subreddit__date_nsfwflag___flair___sharer___title___datakey___seq.ext``
    where flair/sharer fall back to '-' when absent.
    """
    # Filesystem-safe local datetime: 'T' -> '_' and ':' -> '-'.
    when = datetime.datetime.fromtimestamp(int(link['timestamp']))
    stamp = when.isoformat().replace('T', '_').replace(':', '-')
    parts = (
        subreddit,
        '__',
        stamp,
        '_',
        'nsfw' if link['nsfw'] else 'safe',
        '___',
        '-' if link['flair'] is None else slugify(link['flair']),
        '___',
        '-' if link['sharer'] is None else slugify(link['sharer']),
        '___',
        slugify(link['title'][:50]),
        '___',
        slugify(link['datakey']),
        '___',
        '%04d' % seq,
        '.' + ext,
    )
    return ''.join(parts)
|