reddit-image-wall-getter/reddit_imgs/fetchpreprocess.py


#!/usr/bin/env python3
# -*- encoding: utf-8 -*-
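# Module overview (a summary inferred from the code in this file; the
# gallery-dl naming follows the libgallerydl wrapper imported below):
"""Preprocessing stage between reddit_imgs.sync and the gallery-dl fetch step.

Reads r.json / rf.json, flattens subreddit posts into unique datakeys,
normalizes and deduplicates their links, resolves a gallery-dl extractor
category for each link, and splits the downloadable links into named worker
jobs. The resulting plan is written to r_fetch_preprocessed.json, alongside
several intermediate r_gdl_*.json files and i_undownloadable.json.
"""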
import json
import math
from collections import OrderedDict
from pathlib import Path
from typing import Dict, List, Optional, Set, Tuple

import reddit_imgs.sync

from .system import libgallerydl
from .system.cmdline_parser import parse_cmdline
from .system.downloader.cache import get_normalized_link, get_path_for_caching
from .system.flattener import flatten_generator
from .system.urlmatcher import search_urls
FORBIDDEN_WORKER_SPLITS = {'deviantart'}
SKIP_INDEXED_FILES = True
RETRY_ERROR_MASK = 0
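# Bit layout of RETRY_ERROR_MASK, mirroring the retry_* flags accepted by
# run_with_config() further down:
#   bit 0: generic errors        bit 1: unknown errors
#   bit 2: network errors        bit 3: not-found errors
#   bit 4: auth errors           bit 5: format errors
#   bit 6: extractor errors      bit 7: OS errors
#   bit 8: not-in-disk errors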
CUSTOM_WORKER_SPLITS: Dict[str, int] = {}
SPLIT_WORKER_AFTER_N_LINKS = 10000
IGNORE_WORKERS: Set[str] = set()
REDOWNLOAD_EMPTIES = False
REDOWNLOAD = False
STOP_JOBS_FLAG_PATH = Path('stop_jobs.flag')
HTML_SPECIAL_CHARS_REPLACE: List[Tuple[str, str]] = [
    ('&lt;', '<'),
    ('&gt;', '>'),
    ('&quot;', '"'),
    ('&#039;', '\''),
    ('&amp;', '&'),
]
HTML_SPECIAL_CHARS: List[str] = list(
    map(lambda a: a[0], HTML_SPECIAL_CHARS_REPLACE))


def contains_any(s: str, l: List[str]) -> bool:
    """Return True if any of the substrings in ``l`` occurs in ``s``."""
    for i in l:
        if i in s:
            return True
    return False


def replace_many(s: str, l: List[Tuple[str, str]]) -> str:
    """Apply each ``(old, new)`` replacement in ``l`` to ``s``, in order."""
    for f, t in l:
        s = s.replace(f, t)
    return s
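

# Example of the helpers above (illustrative, not part of the pipeline):
# because ('&amp;', '&') is the last pair in HTML_SPECIAL_CHARS_REPLACE,
# a double-encoded entity is unescaped by only one level per call:
#
#     replace_many('&amp;lt;b&amp;gt;', HTML_SPECIAL_CHARS_REPLACE)
#     # -> '&lt;b&gt;'
#     replace_many('&lt;b&gt;', HTML_SPECIAL_CHARS_REPLACE)
#     # -> '<b>'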


def cmdline(encoded_args: Optional[str] = None):
    if encoded_args is None:
        return run_with_config()
    else:
        return parse_cmdline(run_with_config, encoded_args)


def run_with_config(redownload_empties: bool = False,
                    redownload_all: bool = False,
                    retry_generic_errors: bool = False,
                    retry_unknown_errors: bool = False,
                    retry_network_errors: bool = False,
                    retry_not_found_errors: bool = False,
                    retry_auth_errors: bool = False,
                    retry_format_errors: bool = False,
                    retry_extractor_errors: bool = False,
                    retry_os_errors: bool = False,
                    retry_not_in_disk_errors: bool = False,
                    retry_gdl_mask: int = 0,
                    split_workers: Optional[int] = None,
                    custom_worker_splits: Optional[dict] = None,
                    skip_indexed_files: bool = True,
                    ignore_workers: Set[str] = set(),
                    ):
    """Translate keyword flags into the module-level settings, then run main()."""
    global SPLIT_WORKER_AFTER_N_LINKS
    global CUSTOM_WORKER_SPLITS
    global SKIP_INDEXED_FILES
    global REDOWNLOAD_EMPTIES
    global RETRY_ERROR_MASK
    global REDOWNLOAD
    global IGNORE_WORKERS
    IGNORE_WORKERS = ignore_workers
    REDOWNLOAD = redownload_all
    SKIP_INDEXED_FILES = skip_indexed_files
    REDOWNLOAD_EMPTIES = redownload_empties
    RETRY_ERROR_MASK |= retry_gdl_mask
    if retry_generic_errors:
        RETRY_ERROR_MASK |= 1 << 0
    if retry_unknown_errors:
        RETRY_ERROR_MASK |= 1 << 1
    if retry_network_errors:
        RETRY_ERROR_MASK |= 1 << 2
    if retry_not_found_errors:
        RETRY_ERROR_MASK |= 1 << 3
    if retry_auth_errors:
        RETRY_ERROR_MASK |= 1 << 4
    if retry_format_errors:
        RETRY_ERROR_MASK |= 1 << 5
    if retry_extractor_errors:
        RETRY_ERROR_MASK |= 1 << 6
    if retry_os_errors:
        RETRY_ERROR_MASK |= 1 << 7
    if retry_not_in_disk_errors:
        RETRY_ERROR_MASK |= 1 << 8
    if split_workers is not None:
        SPLIT_WORKER_AFTER_N_LINKS = split_workers
    if custom_worker_splits is not None:
        CUSTOM_WORKER_SPLITS = custom_worker_splits
    return main()
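

# Illustrative call of the entry point above (hypothetical values, not
# project defaults): retry only network and not-found errors, and split the
# hypothetical 'imgur' extractor's queue every 5000 links:
#
#     run_with_config(retry_network_errors=True,
#                     retry_not_found_errors=True,
#                     custom_worker_splits={'imgur': 5000})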


def main():
    """Build the per-worker download plan and write r_fetch_preprocessed.json."""
    subreddit_data_path = Path('r.json')
    if not subreddit_data_path.exists():
        print("Executing prerequisite...")
        reddit_imgs.sync.main()
    subreddit_filters_path = Path('rf.json')
    print('Loading posts from disk...')
    Path('i_gdl').mkdir(exist_ok=True, parents=True)
    workers_state_path = Path('i_gdl_w')
    workers_state_path.mkdir(exist_ok=True, parents=True)
    for wsp in workers_state_path.iterdir():
        wsp.unlink()
    if STOP_JOBS_FLAG_PATH.exists():
        STOP_JOBS_FLAG_PATH.unlink()
    subreddit_data = json.loads(subreddit_data_path.read_text())
    subreddit_filters = json.loads(subreddit_filters_path.read_bytes())
    print('Loading posts...')
    posts = prerrun_flatten_subreddits_into_posts(
        subreddit_data, subreddit_filters)
    print(f'{len(posts)} posts identified.')
    print('Identifying alternative trivial links...')
    prerrun_posts_re_sort(posts)
    Path('r_gdl_p.json').write_text(
        json.dumps(posts, indent=1, sort_keys=True))
    print('Grouping links with the posts they show up in...')
    links = OrderedDict()
    for dk, post in posts.items():
        for link in post['links']:
            if link not in links:
                links[link] = list()
            links[link].append(dk)
    Path('r_gdl_lp.json').write_text(
        json.dumps(links, indent=1, sort_keys=True))
    known_link_set = set(links.keys())
    print(f'{len(links)} links found')
    print('Checking if there is an extractor for each link...')
    r_gdl_lk_path = Path('r_gdl_lk.json')
    r_gdl_le_path = Path('r_gdl_le.json')
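    # r_gdl_lk.json and r_gdl_le.json act as caches between runs: entries are
    # loaded from disk when present and only missing/empty links are looked up
    # again through libgallerydl.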
    link_keys = dict()
    if r_gdl_lk_path.exists():
        link_keys = json.loads(r_gdl_lk_path.read_text())
    for link in links.keys():
        if link not in link_keys or link_keys[link] == '':
            key = libgallerydl.find_extractor_archival_key(link)
            if key is None:
                key = ''
            link_keys[link] = key
            del key
    link_extractors = dict()
    if r_gdl_le_path.exists():
        link_extractors = json.loads(r_gdl_le_path.read_text())
    for link in links.keys():
        if link not in link_extractors or link_extractors[link] == '':
            category_subcategory = libgallerydl.find_extractor_category_subcategory(
                link)
            if category_subcategory is None:
                link_extractors[link] = ''
            else:
                category, subcategory = category_subcategory
                if category == 'reddit' and subcategory in ('subreddit', 'user'):
                    link_extractors[link] = ''
                else:
                    link_extractors[link] = category
    for discarded_link in set(link_extractors.keys()).difference(known_link_set):
        del link_extractors[discarded_link]
    r_gdl_le_path.write_text(json.dumps(
        link_extractors, indent=1, sort_keys=True))
    r_gdl_lk_path.write_text(json.dumps(
        link_keys, indent=1, sort_keys=True))
    links_by_extractor = {
        extractor: list()
        for extractor in list(set(link_extractors.values()))
    }
    for link, extractor in link_extractors.items():
        links_by_extractor[extractor].append(link)
    not_downloadable_links = dict()
    not_downloadable_links[''] = links_by_extractor.get('', [])
    not_downloadable_links['reddit_user'] = links_by_extractor.get(
        'reddit_user', [])
    not_downloadable_links['reddit_subreddit'] = links_by_extractor.get(
        'reddit_subreddit', [])
    Path('i_undownloadable.json').write_text(
        json.dumps(not_downloadable_links, indent=1))
    if '' in links_by_extractor:
        del links_by_extractor['']
    if 'reddit_user' in links_by_extractor:
        del links_by_extractor['reddit_user']
    if 'reddit_subreddit' in links_by_extractor:
        del links_by_extractor['reddit_subreddit']
    not_downloadable_link_set = frozenset(
        flatten_generator(not_downloadable_links.values()))
    print(f'{len(links)-len(not_downloadable_link_set)} downloadable links found')
    print(f'{len(not_downloadable_link_set)} undownloadable links found')
    print(f'{len(links_by_extractor)} extractors found')
    Path('r_gdl_lbe.json').write_text(json.dumps(
        links_by_extractor, indent=1, sort_keys=True))
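    # r_gdl_lbe.json maps each gallery-dl extractor category to its links,
    # e.g. (illustrative values): {"imgur": ["https://imgur.com/..."], ...}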
    files_from_links: Dict[str, List[str]] = dict()
    links_no_files: List[str] = list()
    files_sizes: Dict[str, int] = dict()
    link_statuses: Dict[str, int] = dict()
    ignored_links: Set[str] = set()
    if (pth := Path('i_gdl_ffl.json')).exists():
        try:
            files_from_links = json.loads(pth.read_text())
        except Exception:
            pass
    if (pth := Path('i_gdl_lnf.json')).exists():
        try:
            links_no_files = json.loads(pth.read_text())
        except Exception:
            pass
    if (pth := Path('i_gdl_fsz.json')).exists():
        try:
            files_sizes = json.loads(pth.read_text())
        except Exception:
            pass
    if (pth := Path('i_gdl_spl.json')).exists():
        try:
            link_statuses = json.loads(pth.read_text())
        except Exception:
            pass
    for discarded_link in set(links_no_files).difference(known_link_set):
        links_no_files.remove(discarded_link)
    discarded_files = set()
    for discarded_link in set(files_from_links.keys()).difference(known_link_set):
        if discarded_link in files_from_links:
            files_in_link = files_from_links[discarded_link]
            for file_in_link in files_in_link:
                discarded_files.add(file_in_link)
        if discarded_link in link_statuses:
            del link_statuses[discarded_link]
        del files_from_links[discarded_link]
    files_to_keep = set()
    for files_from_link in files_from_links.values():
        for file_from_link in files_from_link:
            if file_from_link not in files_to_keep:
                files_to_keep.add(file_from_link)
    for discarded_file in discarded_files.difference(files_to_keep):
        if discarded_file in files_sizes:
            del files_sizes[discarded_file]
    for missing_file_size in files_to_keep.difference(set(files_sizes.keys())):
        p = Path(missing_file_size)
        if not p.exists():
            raise FileNotFoundError(missing_file_size)
        else:
            files_sizes[missing_file_size] = p.stat().st_size
            print('Re-filled files_sizes for %r' % p)
    if (p := Path('i_gdl_ignored.txt')).exists():
        ignored_links = set(list(filter(len, p.read_text().splitlines())))
    links_no_files = list(filter(lambda a: a not in ignored_links,
                                 links_no_files))
    link_statuses = dict(filter(lambda a: a[0] not in ignored_links,
                                link_statuses.items()))
    files_from_links = dict(filter(lambda a: a[0] not in ignored_links,
                                   files_from_links.items()))
    checked_links = list(files_from_links.keys()) + links_no_files
    checked_links = frozenset(checked_links)
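    # A link is (re)queued below when it was never checked, when indexed links
    # are not being skipped, or when its recorded status matches a bit in
    # RETRY_ERROR_MASK; ignored links are always dropped.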
    max_expected_jobs_for_extractor = 0
    for extractor, links in links_by_extractor.items():
        links = [
            link
            for link in links
            if link not in ignored_links
            and (
                link not in checked_links
                or not SKIP_INDEXED_FILES
                or (link_statuses.get(link, 0xFF) & RETRY_ERROR_MASK) != 0
            )
        ]
        if len(links) <= 0:
            continue
        this_worker_split_after_n_links = CUSTOM_WORKER_SPLITS.get(
            extractor, SPLIT_WORKER_AFTER_N_LINKS)
        workers = math.ceil(len(links)/this_worker_split_after_n_links)
        if workers <= 1 or extractor in FORBIDDEN_WORKER_SPLITS:
            workers = 1
        max_expected_jobs_for_extractor = max(
            max_expected_jobs_for_extractor,
            workers
        )
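    # The loop above only measures how many jobs the busiest extractor needs,
    # so worker_by_seq can be pre-sized; the loop below does the actual
    # assignment of links to named workers.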
    worker_by_seq = [list() for _ in range(max_expected_jobs_for_extractor)]
    links_to_worker = dict()
    for extractor, links in links_by_extractor.items():
        links = [
            link
            for link in links
            if link not in ignored_links
            and (
                link not in checked_links
                or not SKIP_INDEXED_FILES
                or (link_statuses.get(link, 0xFF) & RETRY_ERROR_MASK) != 0
            )
        ]
        if len(links) <= 0:
            continue
        this_worker_split_after_n_links = CUSTOM_WORKER_SPLITS.get(
            extractor, SPLIT_WORKER_AFTER_N_LINKS)
        workers = math.ceil(len(links)/this_worker_split_after_n_links)
        if workers <= 1 or extractor in FORBIDDEN_WORKER_SPLITS:
            if extractor in IGNORE_WORKERS:
                continue
            links_to_worker[extractor] = links
            worker_by_seq[0].append(extractor)
        else:
            digits = math.ceil(math.log10(max(1, workers+1)))
            fmt = "%%0%dd" % digits
            for worker_no in range(workers):
                lowerlimit = (worker_no+0)*this_worker_split_after_n_links
                upperlimit = (worker_no+1)*this_worker_split_after_n_links
                thisrange = links[lowerlimit:upperlimit]
                worker_nm = extractor + ':' + (fmt % (worker_no))
                if worker_nm in IGNORE_WORKERS:
                    continue
                links_to_worker[worker_nm] = thisrange
                worker_by_seq[worker_no].append(worker_nm)
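    # Split workers are named '<extractor>:<NN>': a hypothetical 'imgur'
    # extractor with 25000 links and the default split of 10000 yields
    # 'imgur:0', 'imgur:1' and 'imgur:2'; unsplit extractors keep their
    # bare name.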
    for w in worker_by_seq:
        w.sort()
    workers_nicely_grouped = [
        worker
        for workergroup in worker_by_seq
        for worker in workergroup
        if worker != ''
    ]
    print(f'{len(links_to_worker)} workers to be spawned')
    response_dict = dict(
        files_from_links=files_from_links,
        links_no_files=links_no_files,
        files_sizes=files_sizes,
        link_statuses=link_statuses,
        workers_nicely_grouped=workers_nicely_grouped,
        workers_state_path=str(workers_state_path),
        links_to_worker=links_to_worker,
        link_keys=link_keys,
        SKIP_INDEXED_FILES=SKIP_INDEXED_FILES,
        RETRY_ERROR_MASK=RETRY_ERROR_MASK,
        CUSTOM_WORKER_SPLITS=CUSTOM_WORKER_SPLITS,
        SPLIT_WORKER_AFTER_N_LINKS=SPLIT_WORKER_AFTER_N_LINKS,
        REDOWNLOAD_EMPTIES=REDOWNLOAD_EMPTIES,
        REDOWNLOAD=REDOWNLOAD,
    )
    Path('r_fetch_preprocessed.json').write_text(
        json.dumps(response_dict, indent=1))
    return response_dict


def prerrun_flatten_subreddits_into_posts(subreddit_data, subreddit_filters):
    """Merge per-subreddit post lists into one dict keyed by datakey."""
    postsl = [
        {'subreddit': subreddit, **post}
        for subreddit, srdt in subreddit_data.items()
        for post in srdt['links']
    ]
    postsl.sort(key=lambda a: (-a['timestamp'], a['datakey']))
    postsd = dict()
    for post in postsl:
        dk = post['datakey']
        sr = post['subreddit']
        if subreddit_filters['no_download'][sr]:
            continue
        if subreddit_filters['no_sfw'][sr] and not post['nsfw']:
            continue
        if subreddit_filters['no_nsfw'][sr] and post['nsfw']:
            continue
        if dk not in postsd:
            postsd[dk] = post.copy()
            postsd[dk]['subreddits'] = list()
            postsd[dk]['links'] = list()
            del postsd[dk]['subreddit']
            del postsd[dk]['link']
            del postsd[dk]['domain']
        if (sr := post['subreddit']) not in (srs := postsd[dk]['subreddits']):
            srs.append(sr)
        if (lnk := get_normalized_link(post['link'])) not in (lnks := postsd[dk]['links']):
            lnks.append(lnk)
    return postsd


def prerrun_posts_re_sort(posts):
    """Normalize, expand and prune each post's link list in place."""
    for post in sorted(posts.values(), key=lambda a: (-a['timestamp'], a['datakey'])):
        post['subreddits'].sort()
        dk = post['datakey']
        post_links = post['links']
        has_changed_any_link = True
        while has_changed_any_link:
            has_changed_any_link = False
            for link in post_links:
                if '<!--' in link or '-->' in link:
                    for linkcopy in search_urls(link):
                        linkcopy = get_normalized_link(linkcopy)
                        linkcopy = replace_many(
                            linkcopy, HTML_SPECIAL_CHARS_REPLACE)
                        if linkcopy not in post_links:
                            post_links.append(linkcopy)
                            has_changed_any_link = True
                    while link in post_links:
                        post_links.remove(link)
                        has_changed_any_link = True
                    break
                else:
                    linkcopy = link
                    linkcopy = get_normalized_link(linkcopy)
                    if linkcopy not in post_links:
                        post_links.append(linkcopy)
                        has_changed_any_link = True
                        break
                    if '?' in link:
                        linkcopy = link.split('?', 1)[0]
                        linkcopy = get_normalized_link(linkcopy)
                        if linkcopy not in post_links:
                            post_links.append(linkcopy)
                            has_changed_any_link = True
                            break
                    if '#' in link:
                        linkcopy = link.split('#', 1)[0]
                        linkcopy = get_normalized_link(linkcopy)
                        if linkcopy not in post_links:
                            post_links.append(linkcopy)
                            has_changed_any_link = True
                            break
                    if contains_any(linkcopy, HTML_SPECIAL_CHARS):
                        linkcopy = replace_many(
                            linkcopy, HTML_SPECIAL_CHARS_REPLACE)
                        if linkcopy not in post_links:
                            post_links.append(linkcopy)
                            has_changed_any_link = True
                            break
                    if linkcopy[-1:] in ('/', '#', '?'):
                        while linkcopy[-1:] in ('/', '#', '?'):
                            linkcopy = linkcopy[:-1]
                        linkcopy = get_normalized_link(linkcopy)
                        if linkcopy not in post_links:
                            post_links.append(linkcopy)
                            has_changed_any_link = True
                    if link.strip() == '':
                        while link in post_links:
                            post_links.remove(link)
                            has_changed_any_link = True
                        break
                    if link.startswith('/'):
                        while link in post_links:
                            post_links.remove(link)
                            has_changed_any_link = True
                        break
                    if link.startswith('#'):
                        while link in post_links:
                            post_links.remove(link)
                            has_changed_any_link = True
                        break
                    if link.startswith('mailto'):
                        while link in post_links:
                            post_links.remove(link)
                            has_changed_any_link = True
                        break
                    if (proto := (tpl := link.split(':', 1))[0]).lower() in ('http', 'https') and proto not in ('http', 'https'):
                        lst = list(tpl)
                        lst[0] = lst[0].lower()
                        linkcopy = ':'.join(lst)
                        post_links.remove(link)
                        if linkcopy not in post_links:
                            post_links.append(linkcopy)
                        has_changed_any_link = True
                        break
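        # The filter below drops preview.redd.it links unless they still carry
        # the full query string (width/format/auto/s); the assumption is that
        # such preview URLs are only usable with all of their original
        # parameters, while links to any other host are kept unchanged.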
        post['links'] = list(filter(lambda link: (
            not link.startswith('https://preview.redd.it/')
            or (
                (('?width=' in link) or ('&width=' in link))
                and (('?format=' in link) or ('&format=' in link))
                and (('?auto=' in link) or ('&auto=' in link))
                and (('?s=' in link) or ('&s=' in link))
            )
        ), post['links']))
        post['links'].sort()