# reddit-image-wall-getter/reddit_imgs/search_for_subreddits.py
# (scrape metadata residue: 204 lines, 6.6 KiB, Python)
#!/usr/bin/env python3
# -*- encoding: utf-8 -*-
import sys
import urllib.parse
from pathlib import Path
from typing import AnyStr, Callable, Dict, List, Optional, Tuple
import colored as clrlib
import html2text as html2textlib
from bs4 import BeautifulSoup
from .system import simpleDownloader
def html2text(html, withMd=True, limit=65535):
    """Convert an HTML fragment to text via html2text.

    With *withMd* left True the output keeps Markdown markup; when it is
    False, emphasis, images, links and tables are all suppressed.
    *limit* is passed through as the body wrap width.
    """
    converter = html2textlib.HTML2Text(baseurl="", bodywidth=limit)
    if not withMd:
        # Strip every kind of markup for a plain-text rendering.
        for flag in ('ignore_emphasis', 'ignore_images',
                     'ignore_links', 'ignore_tables'):
            setattr(converter, flag, True)
    return converter.handle(html)
def extract_subreddits_from_page(pagebs: BeautifulSoup) -> Tuple[AnyStr, List[Dict[str, str]]]:
    """Scrape one old-reddit search-results page.

    Returns ``(next_url, entries)``: *next_url* is the href of the
    pagination "next" button (falsy when there is no further page) and
    *entries* is a list of dicts with keys ``isNsfw``, ``link``,
    ``subreddit``, ``title`` and ``description``.
    """
    # Drill down nav-buttons -> next-button -> <a> -> href; any missing
    # step leaves a falsy value, which ends pagination at the caller.
    next_url = pagebs.find(class_='nav-buttons')
    if next_url:
        next_url = next_url.find(class_='next-button')
    if next_url:
        next_url = next_url.find('a')
    if next_url:
        next_url = next_url['href']
    entries = []
    site_table = pagebs.find(id='siteTable')
    for row in site_table.find_all(class_='subreddit'):
        nsfw = row.find('span', alt='NSFW') is not None
        title_tag = row.find('a', class_='title')
        description_tag = row.find(class_='description')
        href = title_tag['href']
        if '/r/' not in href:
            # Not a real subreddit row (e.g. promoted content) — skip.
            continue
        # Title text looks like "r/<name>: <title>".
        full_name = title_tag.text
        halves = full_name.split(':', 1)
        sub_name = halves[0].split('/', 1)[-1].lower()
        sub_title = halves[1][1:]
        sub_description = html2text(str(description_tag), False, 60).strip()
        entries.append({
            'isNsfw': nsfw,
            'link': href,
            'subreddit': sub_name,
            'title': sub_title,
            'description': sub_description,
        })
    return (next_url, entries)
def pad_text_block(s: str, wth: str) -> str:
    """Prefix every line of *s* with *wth*, re-joining with newlines."""
    padded_lines = [wth + line for line in s.splitlines()]
    return '\n'.join(padded_lines)
def findmany(text: str, terms: List[str]) -> Tuple[int, str]:
    """Find the earliest occurrence in *text* of any string in *terms*.

    Returns ``(position, term)`` for the match closest to the start of
    *text*, or ``(-1, None)`` when nothing matches.  When several terms
    first occur at the same position, the one listed last in *terms*
    wins (later entries overwrite earlier ones).
    """
    hits = {}
    for needle in terms:
        at = text.find(needle)
        if at >= 0:
            hits[at] = needle
    if not hits:
        return -1, None
    earliest = min(hits)
    return earliest, hits[earliest]
def highlight_search_term(terms: List[str], text: str, styler: Callable[[str], str], case_insensitive: bool = True) -> str:
    """Wrap each occurrence of any term in *text* with *styler*.

    Matching is case-insensitive by default; the styled fragment always
    keeps its original casing from *text*.
    """
    remaining = text
    haystack = text.lower() if case_insensitive else text
    needles = [t.lower() for t in terms] if case_insensitive else terms
    pieces = []
    while True:
        pos, term = findmany(haystack, needles)
        if pos < 0:
            # No further matches: keep the tail verbatim.
            pieces.append(remaining)
            break
        end = pos + len(term)
        pieces.append(remaining[:pos])
        pieces.append(styler(remaining[pos:end]))
        # Advance both the display text and its lowered shadow in step.
        remaining = remaining[end:]
        haystack = haystack[end:]
    return ''.join(pieces)
def do_search(term: str, include_nsfw: bool = True, colored: Optional[bool] = True) -> List[Dict[str, str]]:
    """Search old.reddit.com for subreddits matching *term*.

    Paginates through every results page, printing each newly seen
    subreddit as it goes, and returns the accumulated list of entry
    dicts from extract_subreddits_from_page.

    colored: True -> ANSI-colored output; False -> plain output;
    None -> no per-subreddit printing at all.
    """
    simpleDownloader.cleanCookies()
    # over18 cookie lets old.reddit serve NSFW results without the interstitial.
    simpleDownloader.setCookies({'over18': 1})
    next_page_url = (
        'https://old.reddit.com/subreddits/search?' +
        ('include_over_18=on&' if include_nsfw else '') +
        'q=' + urllib.parse.quote_plus(term)
    )
    srs = list()    # all scraped entries, across pages
    srlst = list()  # subreddit names already printed (dedupe)
    nothing_new = True  # stays True only while every subreddit already has an r/<name> dir
    while next_page_url:
        pagebts = simpleDownloader.getUrlBytes(next_page_url)
        pagebs = BeautifulSoup(pagebts, 'html5lib')
        next_page_url, nsrs = extract_subreddits_from_page(pagebs)
        srs += nsrs
        # Iterates the full accumulated list each page; srlst skips
        # entries that were already handled on earlier iterations.
        for sr in srs:
            if (nm := sr['subreddit']) in srlst:
                continue
            else:
                srlst.append(nm)
            # iw: already downloaded locally (an r/<subreddit> path exists)
            iw = Path('r', sr['subreddit']).exists()
            nothing_new = nothing_new and iw
            if colored is not None:
                ds = '@' if iw else '#'  # marker: '@' = already have it, '#' = new
                srn = sr['subreddit']
                isw = sr['isNsfw']
                sfw = 'nsfw' if isw else 'sfw'
                sfw = f'[{sfw}]'
                srt = sr['title']
                srd = pad_text_block(sr['description'], ' '*8)
                srl = sr['link'].replace('//old.', '//www.')
                if colored:
                    ds = clrlib.stylize(
                        ds,
                        [clrlib.fg('light_green' if iw else 'light_red')]
                    )
                    srn = clrlib.stylize(
                        srn,
                        [clrlib.fg('light_cyan')]
                    )
                    sfw = clrlib.stylize(
                        sfw,
                        [clrlib.fg('light_green' if not isw else 'light_red')]
                    )
                    srl = clrlib.stylize(
                        srl,
                        [clrlib.fg('light_blue')]
                    )
                    srt = clrlib.stylize(
                        srt,
                        [clrlib.fg('cyan')]
                    )
                    # Dim descriptions of subreddits we already have.
                    srd = '\n'.join(list(map(
                        lambda srdl: clrlib.stylize(
                            srdl,
                            [clrlib.fg('dark_gray' if iw else 'light_gray')]
                        ),
                        srd.splitlines()
                    )))
                    termssplit = term.split()
                    def highligher(t):
                        # Red background around the matched fragment;
                        # resets only the background so fg styling survives.
                        clrlibobj = clrlib.colored('')
                        bgreset = clrlibobj.ESC+'49'+clrlibobj.END
                        return clrlib.bg('red') + t + bgreset
                    srn = highlight_search_term(termssplit, srn, highligher)
                    srt = highlight_search_term(termssplit, srt, highligher)
                    srd = highlight_search_term(termssplit, srd, highligher)
                print(f"{ds} {srn} {sfw} {srl}")
                print(f" {srt}")
                print(srd)
                print()
    if nothing_new:
        if colored is not None:
            msg = "> Nothing new... move on!"
            if colored:
                msg = clrlib.stylize(msg, [clrlib.fg('yellow')])
            print(msg)
    simpleDownloader.cleanCookies()
    return srs
def main():
    """CLI entry point: join argv into one search term and run the search."""
    search_term = ' '.join(str(arg).strip() for arg in sys.argv[1:]).strip()
    if not search_term:
        print(f'Usage:\n {sys.argv[0]} <search_term>')
    else:
        do_search(search_term)


if __name__ == '__main__':
    main()