# reddit-image-wall-getter/reddit_imgs/search_for_subreddits.py
# (scrape metadata residue: 204 lines, 6.6 KiB, Python)
#!/usr/bin/env python3
# -*- encoding: utf-8 -*-
import sys
import urllib.parse
from pathlib import Path
from typing import AnyStr, Callable, Dict, List, Optional, Tuple
import colored as clrlib
import html2text as html2textlib
from bs4 import BeautifulSoup
from .system import simpleDownloader
def html2text(html, withMd=True, limit=65535):
    """Convert an HTML fragment to text via html2text.

    With *withMd* left True the output keeps Markdown markup; when it is
    False, emphasis, images, links and tables are all suppressed.
    *limit* is passed through as the body wrap width.
    """
    converter = html2textlib.HTML2Text(baseurl="", bodywidth=limit)
    if not withMd:
        # Strip every kind of markup for a plain-text rendering.
        for flag in ('ignore_emphasis', 'ignore_images',
                     'ignore_links', 'ignore_tables'):
            setattr(converter, flag, True)
    return converter.handle(html)
def extract_subreddits_from_page(pagebs: BeautifulSoup) -> Tuple[AnyStr, List[Dict[str, str]]]:
    """Scrape one old-reddit search-results page.

    Returns ``(next_url, entries)``: *next_url* is the href of the
    pagination "next" button (falsy when there is no further page) and
    *entries* is a list of dicts with keys ``isNsfw``, ``link``,
    ``subreddit``, ``title`` and ``description``.
    """
    # Drill down nav-buttons -> next-button -> <a> -> href; any missing
    # step leaves a falsy value, which ends pagination at the caller.
    next_url = pagebs.find(class_='nav-buttons')
    if next_url:
        next_url = next_url.find(class_='next-button')
    if next_url:
        next_url = next_url.find('a')
    if next_url:
        next_url = next_url['href']
    entries = []
    site_table = pagebs.find(id='siteTable')
    for row in site_table.find_all(class_='subreddit'):
        nsfw = row.find('span', alt='NSFW') is not None
        title_tag = row.find('a', class_='title')
        description_tag = row.find(class_='description')
        href = title_tag['href']
        if '/r/' not in href:
            # Not a real subreddit row (e.g. promoted content) — skip.
            continue
        # Title text looks like "r/<name>: <title>".
        full_name = title_tag.text
        halves = full_name.split(':', 1)
        sub_name = halves[0].split('/', 1)[-1].lower()
        sub_title = halves[1][1:]
        sub_description = html2text(str(description_tag), False, 60).strip()
        entries.append({
            'isNsfw': nsfw,
            'link': href,
            'subreddit': sub_name,
            'title': sub_title,
            'description': sub_description,
        })
    return (next_url, entries)
def pad_text_block(s: str, wth: str) -> str:
    """Prefix every line of *s* with *wth*, re-joining with newlines."""
    padded_lines = [wth + line for line in s.splitlines()]
    return '\n'.join(padded_lines)
def findmany(text: str, terms: List[str]) -> Tuple[int, str]:
    """Find the earliest occurrence in *text* of any string in *terms*.

    Returns ``(position, term)`` for the match closest to the start of
    *text*, or ``(-1, None)`` when nothing matches.  When several terms
    first occur at the same position, the one listed last in *terms*
    wins (later entries overwrite earlier ones).
    """
    hits = {}
    for needle in terms:
        at = text.find(needle)
        if at >= 0:
            hits[at] = needle
    if not hits:
        return -1, None
    earliest = min(hits)
    return earliest, hits[earliest]
def highlight_search_term(terms: List[str], text: str, styler: Callable[[str], str], case_insensitive: bool = True) -> str:
    """Wrap each occurrence of any term in *text* with *styler*.

    Matching is case-insensitive by default; the styled fragment always
    keeps its original casing from *text*.
    """
    remaining = text
    haystack = text.lower() if case_insensitive else text
    needles = [t.lower() for t in terms] if case_insensitive else terms
    pieces = []
    while True:
        pos, term = findmany(haystack, needles)
        if pos < 0:
            # No further matches: keep the tail verbatim.
            pieces.append(remaining)
            break
        end = pos + len(term)
        pieces.append(remaining[:pos])
        pieces.append(styler(remaining[pos:end]))
        # Advance both the display text and its lowered shadow in step.
        remaining = remaining[end:]
        haystack = haystack[end:]
    return ''.join(pieces)
def do_search(term: str, include_nsfw: bool = True, colored: Optional[bool] = True) -> List[Dict[str, str]]:
    """Search old.reddit.com for subreddits matching *term*.

    Paginates through every results page, printing each newly seen
    subreddit as it goes, and returns the accumulated list of entry
    dicts from extract_subreddits_from_page.

    colored: True -> ANSI-colored output; False -> plain output;
    None -> no per-subreddit printing at all.
    """
    simpleDownloader.cleanCookies()
    # over18 cookie lets old.reddit serve NSFW results without the interstitial.
    simpleDownloader.setCookies({'over18': 1})
    next_page_url = (
        'https://old.reddit.com/subreddits/search?' +
        ('include_over_18=on&' if include_nsfw else '') +
        'q=' + urllib.parse.quote_plus(term)
    )
    srs = list()    # all scraped entries, across pages
    srlst = list()  # subreddit names already printed (dedupe)
    nothing_new = True  # stays True only while every subreddit already has an r/<name> dir
    while next_page_url:
        pagebts = simpleDownloader.getUrlBytes(next_page_url)
        pagebs = BeautifulSoup(pagebts, 'html5lib')
        next_page_url, nsrs = extract_subreddits_from_page(pagebs)
        srs += nsrs
        # Iterates the full accumulated list each page; srlst skips
        # entries that were already handled on earlier iterations.
        for sr in srs:
            if (nm := sr['subreddit']) in srlst:
                continue
            else:
                srlst.append(nm)
            # iw: already downloaded locally (an r/<subreddit> path exists)
            iw = Path('r', sr['subreddit']).exists()
            nothing_new = nothing_new and iw
            if colored is not None:
                ds = '@' if iw else '#'  # marker: '@' = already have it, '#' = new
                srn = sr['subreddit']
                isw = sr['isNsfw']
                sfw = 'nsfw' if isw else 'sfw'
                sfw = f'[{sfw}]'
                srt = sr['title']
                srd = pad_text_block(sr['description'], ' '*8)
                srl = sr['link'].replace('//old.', '//www.')
                if colored:
                    ds = clrlib.stylize(
                        ds,
                        [clrlib.fg('light_green' if iw else 'light_red')]
                    )
                    srn = clrlib.stylize(
                        srn,
                        [clrlib.fg('light_cyan')]
                    )
                    sfw = clrlib.stylize(
                        sfw,
                        [clrlib.fg('light_green' if not isw else 'light_red')]
                    )
                    srl = clrlib.stylize(
                        srl,
                        [clrlib.fg('light_blue')]
                    )
                    srt = clrlib.stylize(
                        srt,
                        [clrlib.fg('cyan')]
                    )
                    # Dim descriptions of subreddits we already have.
                    srd = '\n'.join(list(map(
                        lambda srdl: clrlib.stylize(
                            srdl,
                            [clrlib.fg('dark_gray' if iw else 'light_gray')]
                        ),
                        srd.splitlines()
                    )))
                    termssplit = term.split()
                    def highligher(t):
                        # Red background around the matched fragment;
                        # resets only the background so fg styling survives.
                        clrlibobj = clrlib.colored('')
                        bgreset = clrlibobj.ESC+'49'+clrlibobj.END
                        return clrlib.bg('red') + t + bgreset
                    srn = highlight_search_term(termssplit, srn, highligher)
                    srt = highlight_search_term(termssplit, srt, highligher)
                    srd = highlight_search_term(termssplit, srd, highligher)
                print(f"{ds} {srn} {sfw} {srl}")
                print(f" {srt}")
                print(srd)
                print()
    if nothing_new:
        if colored is not None:
            msg = "> Nothing new... move on!"
            if colored:
                msg = clrlib.stylize(msg, [clrlib.fg('yellow')])
            print(msg)
    simpleDownloader.cleanCookies()
    return srs
def main():
    """CLI entry point: join argv into one search term and run the search."""
    search_term = ' '.join(str(arg).strip() for arg in sys.argv[1:]).strip()
    if not search_term:
        print(f'Usage:\n {sys.argv[0]} <search_term>')
    else:
        do_search(search_term)


if __name__ == '__main__':
    main()