204 lines
6.6 KiB
Python
204 lines
6.6 KiB
Python
|
#!/usr/bin/env python3
|
||
|
# -*- encoding: utf-8 -*-
|
||
|
|
||
|
import sys
|
||
|
import urllib.parse
|
||
|
from pathlib import Path
|
||
|
from typing import AnyStr, Callable, Dict, List, Optional, Tuple
|
||
|
|
||
|
import colored as clrlib
|
||
|
import html2text as html2textlib
|
||
|
from bs4 import BeautifulSoup
|
||
|
|
||
|
from .system import simpleDownloader
|
||
|
|
||
|
|
||
|
def html2text(html, withMd=True, limit=65535):
    """Render an HTML fragment as text via the html2text library.

    :param html: HTML source string to convert.
    :param withMd: when False, strip Markdown artifacts (emphasis,
        images, links, tables) so the output is plain text.
    :param limit: body width passed to HTML2Text (wrap column).
    :return: the converted text.
    """
    converter = html2textlib.HTML2Text(baseurl="", bodywidth=limit)
    if not withMd:
        for flag in ('ignore_emphasis', 'ignore_images',
                     'ignore_links', 'ignore_tables'):
            setattr(converter, flag, True)
    return converter.handle(html)
|
||
|
|
||
|
|
||
|
def extract_subreddits_from_page(pagebs: BeautifulSoup) -> Tuple[Optional[str], List[Dict[str, str]]]:
    """Scrape one page of reddit subreddit search results.

    :param pagebs: parsed search-results page.
    :return: ``(next_page_url, subreddits)`` where ``next_page_url`` is the
        href of the "next" pagination button or None when there is no
        further page, and ``subreddits`` is a list of dicts with keys
        ``isNsfw``, ``link``, ``subreddit``, ``title``, ``description``.
    """
    # Resolve the pagination link step by step; if any step is missing we
    # fall through with next_url = None.  (Previously a partially resolved
    # chain could return a truthy bs4 Tag, which the caller would then try
    # to fetch as a URL.)
    next_url = None
    nav = pagebs.find(class_='nav-buttons')
    if nav:
        nxt = nav.find(class_='next-button')
        if nxt:
            anchor = nxt.find('a')
            if anchor:
                next_url = anchor['href']
    srs = []
    srtbs = pagebs.find(id='siteTable')
    for srbs in srtbs.find_all(class_='subreddit'):
        isNsfw = srbs.find('span', alt='NSFW') is not None
        titlebs = srbs.find('a', class_='title')
        descriptionbs = srbs.find(class_='description')
        link = titlebs['href']
        if '/r/' not in link:
            continue
        name = titlebs.text
        # Entries look like "r/<subreddit>: <title>".
        head, sep, tail = name.partition(':')
        subreddit = head.split('/', 1)[-1].lower()
        # Guard against a missing ':' (indexing split()[1] would raise).
        title = tail[1:] if sep else ''
        description = html2text(str(descriptionbs), False, 60).strip()
        srs.append(dict(
            isNsfw=isNsfw,
            link=link,
            subreddit=subreddit,
            title=title,
            description=description,
        ))
    return (next_url, srs)
|
||
|
|
||
|
|
||
|
def pad_text_block(s: str, wth: str) -> str:
    """Prefix every line of *s* with the string *wth*.

    Lines are split with ``str.splitlines`` and rejoined with ``'\\n'``,
    so any trailing newline in *s* is not preserved.
    """
    return '\n'.join(f'{wth}{line}' for line in s.splitlines())
|
||
|
|
||
|
|
||
|
def findmany(text: str, terms: List[str]) -> Tuple[int, str]:
    """Find the earliest occurrence in *text* of any string in *terms*.

    :return: ``(position, term)`` of the leftmost match, or ``(-1, None)``
        when *terms* is empty or none of them occur.  If two terms match
        at the same position, the one listed later in *terms* wins.
    """
    if not terms:
        return -1, None
    hits = {}
    for needle in terms:
        idx = text.find(needle)
        if idx >= 0:
            hits[idx] = needle
    if not hits:
        return -1, None
    first = min(hits)
    return first, hits[first]
|
||
|
|
||
|
|
||
|
def highlight_search_term(terms: List[str], text: str, styler: Callable[[str], str], case_insensitive: bool = True) -> str:
    """Wrap every occurrence of any term in *text* with *styler*.

    :param terms: search words to highlight; matching is left-to-right,
        earliest occurrence first (via findmany).
    :param text: the text to scan; the original casing is preserved in
        the output even when matching case-insensitively.
    :param styler: callable applied to each matched span (e.g. adds ANSI
        color codes around it).
    :param case_insensitive: when True, match on lowercased copies.
    :return: *text* with every match replaced by ``styler(match)``.
    """
    # An empty term matches at position 0 with zero length and would
    # never advance the scan — drop empties to avoid an infinite loop.
    terms = [t for t in terms if t]
    remaining = text
    haystack = remaining.lower() if case_insensitive else remaining
    needles = [t.lower() for t in terms] if case_insensitive else terms
    pieces = []
    while True:
        matchpos, matchtrm = findmany(haystack, needles)
        if matchpos < 0:
            pieces.append(remaining)
            break
        end = matchpos + len(matchtrm)
        pieces.append(remaining[:matchpos])
        # Style the span from the ORIGINAL text, not the lowered copy.
        pieces.append(styler(remaining[matchpos:end]))
        remaining = remaining[end:]
        haystack = haystack[end:]
    return ''.join(pieces)
|
||
|
|
||
|
|
||
|
def do_search(term: str, include_nsfw: bool = True, colored: Optional[bool] = True) -> List[Dict[str, str]]:
    """Search old.reddit.com for subreddits matching *term*.

    Follows pagination until exhausted, printing each not-yet-seen result
    as it goes, and returns the raw list of subreddit dicts accumulated
    from every page.

    :param term: free-text query; its whitespace-separated words are
        highlighted in the colored output.
    :param include_nsfw: when True, ask reddit to include over-18 results.
    :param colored: True -> ANSI-styled output, False -> plain output,
        None -> print nothing at all.
    """
    simpleDownloader.cleanCookies()
    # The over18 cookie is required for reddit to serve NSFW listings.
    simpleDownloader.setCookies({'over18': 1})
    next_page_url = (
        'https://old.reddit.com/subreddits/search?' +
        ('include_over_18=on&' if include_nsfw else '') +
        'q=' + urllib.parse.quote_plus(term)
    )
    srs = list()    # all scraped subreddit dicts (the return value)
    srlst = list()  # subreddit names already handled, for de-duplication
    nothing_new = True  # flips False once any result lacks a local r/<name> dir
    while next_page_url:
        pagebts = simpleDownloader.getUrlBytes(next_page_url)
        pagebs = BeautifulSoup(pagebts, 'html5lib')
        next_page_url, nsrs = extract_subreddits_from_page(pagebs)
        srs += nsrs
        # NOTE(review): re-iterates the WHOLE accumulated list on every
        # page; the srlst check keeps output correct, but this rescans
        # earlier pages' entries each time.
        for sr in srs:
            if (nm := sr['subreddit']) in srlst:
                continue
            else:
                srlst.append(nm)
            # iw: a local directory r/<name> already exists — presumably
            # from a previous download run; TODO confirm against the
            # downloader that populates r/.
            iw = Path('r', sr['subreddit']).exists()
            nothing_new = nothing_new and iw
            if colored is not None:
                ds = '@' if iw else '#'  # marker: '@' already local, '#' new
                srn = sr['subreddit']
                isw = sr['isNsfw']
                sfw = 'nsfw' if isw else 'sfw'
                sfw = f'[{sfw}]'
                srt = sr['title']
                srd = pad_text_block(sr['description'], ' '*8)
                # Show the canonical www link instead of old.reddit.com.
                srl = sr['link'].replace('//old.', '//www.')
                if colored:
                    ds = clrlib.stylize(
                        ds,
                        [clrlib.fg('light_green' if iw else 'light_red')]
                    )
                    srn = clrlib.stylize(
                        srn,
                        [clrlib.fg('light_cyan')]
                    )
                    sfw = clrlib.stylize(
                        sfw,
                        [clrlib.fg('light_green' if not isw else 'light_red')]
                    )
                    srl = clrlib.stylize(
                        srl,
                        [clrlib.fg('light_blue')]
                    )
                    srt = clrlib.stylize(
                        srt,
                        [clrlib.fg('cyan')]
                    )
                    # Dim the description when the subreddit is already local.
                    srd = '\n'.join(list(map(
                        lambda srdl: clrlib.stylize(
                            srdl,
                            [clrlib.fg('dark_gray' if iw else 'light_gray')]
                        ),
                        srd.splitlines()
                    )))
                    termssplit = term.split()

                    def highligher(t):
                        # Red background on each matched search word; reset
                        # only the background (ESC 49) so the foreground
                        # styling applied above survives the highlight.
                        clrlibobj = clrlib.colored('')
                        bgreset = clrlibobj.ESC+'49'+clrlibobj.END
                        return clrlib.bg('red') + t + bgreset
                    srn = highlight_search_term(termssplit, srn, highligher)
                    srt = highlight_search_term(termssplit, srt, highligher)
                    srd = highlight_search_term(termssplit, srd, highligher)
                print(f"{ds} {srn} {sfw} {srl}")
                print(f" {srt}")
                print(srd)
                print()
    if nothing_new:
        if colored is not None:
            msg = "> Nothing new... move on!"
            if colored:
                msg = clrlib.stylize(msg, [clrlib.fg('yellow')])
            print(msg)
    simpleDownloader.cleanCookies()
    return srs
|
||
|
|
||
|
|
||
|
def main():
    """CLI entry point: join argv into one search term and run the search."""
    query = ' '.join(str(arg).strip() for arg in sys.argv[1:]).strip()
    if query:
        do_search(query)
    else:
        print(f'Usage:\n {sys.argv[0]} <search_term>')
|
||
|
|
||
|
|
||
|
# Allow running this module directly as a script.
if __name__ == '__main__':
    main()
|