reddit-image-wall-getter/reddit_imgs/fetch2.py

#!/usr/bin/env python3
# -*- encoding: utf-8 -*-
import json
import math
import shutil
import sys
import traceback
from concurrent.futures import ProcessPoolExecutor as PoolExecutor
from pathlib import Path
from typing import List, Optional, Tuple
import colored as clrlib
import gallery_dl
import gallery_dl.config
import gallery_dl.exception
import gallery_dl.extractor
import gallery_dl.job
import gallery_dl.option
import gallery_dl.output
import gallery_dl.postprocessor.common
import reddit_imgs.sync
from .system.downloader.cache import get_normalized_link, get_path_for_caching
from .system.urlmatcher import search_urls

MAX_WORKERS = 12
SPLIT_WORKER_AFTER_N_LINKS = 1000
FORBIDDEN_WORKER_SPLITS = {
    'deviantart',
}
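

# Worker state files (written under i_gdl_w/) appear to follow the layout
# "status:total_links:remaining_links:downloaded_bytes:downloaded_files";
# this is inferred from the write_text() calls in main() and
# download_link_list() below.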
def main():
    subreddit_data_path = Path('r.json')
    if not subreddit_data_path.exists():
        print("Executing prerequisite...")
        reddit_imgs.sync.main()
    print('Loading posts...')
    workers_state_path = Path('i_gdl_w')
    workers_state_path.mkdir(exist_ok=True, parents=True)
    for wsp in workers_state_path.iterdir():
        wsp.unlink()
    subreddit_data = json.loads(subreddit_data_path.read_text())
    links = dict()
    postsl = [
        {**post, 'subreddit': subreddit}
        for subreddit, srdt in subreddit_data.items()
        for post in srdt['links']
    ]
    postsd = dict()
    # Merge posts that share the same datakey across subreddits.
    for post in postsl:
        dk = post['datakey']
        if dk not in postsd:
            postsd[dk] = post.copy()
            postsd[dk]['subreddits'] = list()
            postsd[dk]['links'] = list()
            del postsd[dk]['subreddit']
            del postsd[dk]['link']
            del postsd[dk]['domain']
        if (sr := post['subreddit']) not in (srs := postsd[dk]['subreddits']):
            srs.append(sr)
        if (lnk := get_normalized_link(post['link'])) not in (lnks := postsd[dk]['links']):
            lnks.append(lnk)
    posts = postsd
    del postsl
    del postsd
    print(f'{len(posts)} posts identified.')
    print('Identifying alternative trivial links...')
    for post in posts.values():
        dk = post['datakey']
        post_links = post['links']
        has_added_any_link = True
        while has_added_any_link:
            has_added_any_link = False
            for link in post_links:
                linkcopy = link
                while linkcopy.endswith('/') or linkcopy.endswith('#') or linkcopy.endswith('?'):
                    linkcopy = linkcopy[:-1]
                if linkcopy not in post_links:
                    post_links.append(linkcopy)
                    has_added_any_link = True
                if '<!--' in link:
                    for linkcopy in search_urls(link):
                        if linkcopy not in post_links:
                            post_links.append(linkcopy)
                            has_added_any_link = True
    Path('r_gdl_p.json').write_text(json.dumps(posts, indent=1))
    print('Grouping links with the posts they show up in...')
    for dk, post in posts.items():
        for link in post['links']:
            if link not in links:
                links[link] = list()
            links[link].append(dk)
    Path('r_gdl_lp.json').write_text(json.dumps(links, indent=1))
    print(f'{len(links)} links found')
    print('Checking if there is an extractor for each link...')
    r_gdl_le_path = Path('r_gdl_le.json')
    link_extractors = dict()
    if r_gdl_le_path.exists():
        link_extractors = json.loads(r_gdl_le_path.read_text())
    for link in links.keys():
        if link not in link_extractors or link_extractors[link] == '':
            ext = None
            try:
                ext = gallery_dl.extractor.find(link)
            except gallery_dl.exception.NotFoundError:
                pass
            link_extractors[link] = type(
                ext).category if ext is not None else ''
    r_gdl_le_path.write_text(json.dumps(link_extractors, indent=1))
    links_by_extractor = {
        extractor: list()
        for extractor in list(set(link_extractors.values()))
    }
    for link, extractor in link_extractors.items():
        links_by_extractor[extractor].append(link)
    undownloadable_posts = links_by_extractor.get('', [])
    Path('i_undownloadable.json').write_text(
        json.dumps(undownloadable_posts, indent=1))
    if '' in links_by_extractor:
        del links_by_extractor['']
    print(f'{len(links)-len(undownloadable_posts)} downloadable links found')
    print(f'{len(undownloadable_posts)} undownloadable links found')
    print(f'{len(links_by_extractor)} extractors found')
    Path('r_gdl_lbe.json').write_text(json.dumps(links_by_extractor, indent=1))
    ignored_links = set()
    if (p := Path('i_gdl_ignored.txt')).exists():
        ignored_links = set(list(filter(len, p.read_text().splitlines())))
    max_expected_jobs_for_extractor = 0
    for extractor, links in links_by_extractor.items():
        links = [link for link in links if link not in ignored_links]
        workers = math.ceil(len(links)/SPLIT_WORKER_AFTER_N_LINKS)
        if workers <= 1 or extractor in FORBIDDEN_WORKER_SPLITS:
            workers = 1
        max_expected_jobs_for_extractor = max(
            max_expected_jobs_for_extractor,
            workers
        )
    worker_by_seq = [list() for _ in range(max_expected_jobs_for_extractor)]
    links_to_worker = dict()
    # Split each extractor's links into batches of up to
    # SPLIT_WORKER_AFTER_N_LINKS links, except for extractors that
    # must not be split across workers.
    for extractor, links in links_by_extractor.items():
        links = [link for link in links if link not in ignored_links]
        workers = math.ceil(len(links)/SPLIT_WORKER_AFTER_N_LINKS)
        if workers <= 1 or extractor in FORBIDDEN_WORKER_SPLITS:
            links_to_worker[extractor] = links
            worker_by_seq[0].append(extractor)
        else:
            digits = math.ceil(math.log10(max(1, workers+1)))
            fmt = "%%0%dd" % digits
            for worker_no in range(workers):
                lowerlimit = (worker_no+0)*SPLIT_WORKER_AFTER_N_LINKS
                upperlimit = (worker_no+1)*SPLIT_WORKER_AFTER_N_LINKS
                thisrange = links[lowerlimit:upperlimit]
                worker_nm = extractor + ':' + (fmt % (worker_no))
                links_to_worker[worker_nm] = thisrange
                worker_by_seq[worker_no].append(worker_nm)
    for w in worker_by_seq:
        w.sort()
    workers_nicely_grouped = [
        worker
        for workergroup in worker_by_seq
        for worker in workergroup
        if worker != ''
    ]
    print(f'{len(links_to_worker)} workers to be spawned')
    configure_gdl()
    gallery_dl.output.select = lambda: ColoredLineOutput(False)
    files_from_links = dict()
    totalfiles = 0
    thread_ids = workers_nicely_grouped.copy()
    for line, thread_id in enumerate(thread_ids):
        workers_state_path.joinpath(thread_id+'=line').write_text(str(line))
        linkcount = len(links_to_worker[thread_id])
        workers_state_path.joinpath(thread_id).write_text(
            f'waiting:{linkcount}:{linkcount}:0:0')
    do_fancy_multithreading_panel = False
    thread_id_count = len(thread_ids)
    with PoolExecutor(min(MAX_WORKERS, thread_id_count)) as pe:
        if do_fancy_multithreading_panel:
            print('\033[2J', end='', flush=True)
            print('\033[0;0H', end='', flush=True)
        print('Downloading...', flush=True)
        if do_fancy_multithreading_panel:
            print('\033[0;0H', end='', flush=True)
        largest_tid_size = max(map(len, thread_ids))
        line2tid = dict()

        def done_callback_generator(line):
            nonlocal totalfiles

            def done_callback(job):
                # Runs in the parent process once this worker's future completes.
                nonlocal totalfiles
                thread_id = line2tid[line]
                links_list = links_to_worker[thread_id]
                workers_state_path.joinpath(thread_id).write_text(
                    f'finished:{len(links_list)}:0:0:0')
                print(clrlib.stylize(
                    f'Received job #{line}: {thread_id}', [
                        clrlib.fg('white'),
                        clrlib.bg('green'),
                        clrlib.attr('bold'),
                    ]
                ))
                totalbytes = 0
                thisfiles = 0
                generator = list()
                try:
                    generator = job.result()
                except:
                    with workers_state_path.joinpath(thread_id+'=exc').open('wt') as f:
                        traceback.print_exc(file=f)
                    traceback.print_exc()
                    sys.exit(255)
                for link, files in generator:
                    files_from_links[link] = files
                    lenfiles = len(files)
                    totalfiles += lenfiles
                    for file in files:
                        st = Path(file).stat()
                        totalbytes += st.st_size
                    thisfiles += lenfiles
                    workers_state_path.joinpath(thread_id).write_text(
                        f'finished:{len(links_list)}:0:{totalbytes}:{thisfiles}')
            return done_callback
        for line, thread_id in enumerate(thread_ids):
            line2tid[line] = thread_id
            links_list = links_to_worker[thread_id]
            workers_state_path.joinpath(thread_id).write_text(
                f'enqueued:{len(links_list)}:{len(links_list)}:0:0')
            print(clrlib.stylize(f'Starting job #{line}: {thread_id}', [
                clrlib.fg('white'),
                clrlib.bg('light_red'),
                clrlib.attr('bold'),
            ]))
            jobstartedmsg = clrlib.stylize(f'Starting job #{line}: {thread_id}', [
                clrlib.fg('black'),
                clrlib.bg('light_yellow'),
                clrlib.attr('bold'),
            ])
            thread_id_nmsz = len(thread_id)
            thread_id_display = thread_id + ' ' * \
                (largest_tid_size - thread_id_nmsz)
            job = pe.submit(
                download_link_list,
                links_list,
                thread_id_display,
                line+3 if do_fancy_multithreading_panel else None,
                jobstartedmsg,
                workers_state_path.joinpath(thread_id),
            )
            job.add_done_callback(done_callback_generator(line))
    Path('i_gdl_ffl.json').write_text(json.dumps(files_from_links, indent=1))
    if (p := Path('latest_image_download.txt')).exists():
        p.unlink()
    if workers_state_path.exists():
        for p in workers_state_path.glob('*'):
            p.unlink()
        shutil.rmtree(workers_state_path)
    print(f'Downloaded {totalfiles} files')


def download_link_list(links: List[str],
                       thread_id: str,
                       line: Optional[int] = None,
                       job_started_msg: Optional[str] = None,
                       thread_state_path: Optional[Path] = None,
                       ) -> List[Tuple[str, List[str]]]:
    '''Downloads a link list inside a ProcessPoolExecutor'''
    if job_started_msg is not None:
        print(job_started_msg)
    has_its_own_line = line is not None
    link_count = len(links)
    remaining_links = link_count
    configure_gdl()
    if thread_state_path is not None:
        thread_state_path.write_text(
            f'running:{link_count}:{remaining_links}:0:0')

    def get_printer():
        return ColoredLineOutput(
            has_its_own_line,
            prefix=(f'\033[{line};0H' if has_its_own_line else '') +
            clrlib.stylize('% 9d' % remaining_links, [clrlib.fg('light_cyan')]) +
            clrlib.stylize('@', [clrlib.fg('light_red')]) +
            clrlib.stylize(thread_id, [clrlib.fg('yellow')]) +
            clrlib.stylize('=', [clrlib.fg('dark_gray')]),
            suffix=('\033[K\033[0;0H' if has_its_own_line else ''),
            prefixsz=len(('% 9d' % 0)+' '+thread_id),
            suffixsz=0,
            write_successes_to=Path('latest_image_download.txt'),
        )
    gallery_dl.output.select = get_printer
    result = list()
    totalbytes = 0
    totalfiles = 0
    try:
        for link in links:
            scrubbing = True
            cachedir = get_path_for_caching(link, Path('i_gdl_c'))
            cachedir.mkdir(parents=True, exist_ok=True)
            metafile = cachedir.joinpath('_gdl_meta.json')
            meta = dict()
            # Reuse the cached result when every file recorded for this link
            # still exists on disk.
            link_already_downloaded = False
            if metafile.exists():
                meta = json.loads(metafile.read_text())
                if link in meta:
                    link_already_downloaded = True
                    for fl in meta[link]:
                        pth = Path(fl)
                        if not pth.exists():
                            link_already_downloaded = False
                            break
            if not link_already_downloaded:
                scrubbing = False
                if thread_state_path is not None:
                    thread_state_path.write_text(
                        f'running:{link_count}:{remaining_links}:{totalbytes}:{totalfiles}')
                job = DownloadJobWithCallSaverPostProcessor(link)
                job.out = get_printer()
                job.out.message(link, clrlib.fg('light_magenta'))
                job.run()
                files = list(map(lambda a: a[0], job.cspp.calls['run_final']))
                files = list(filter(lambda a: Path(a).exists(), files))
                meta[link] = files
                metafile.write_text(json.dumps(meta, indent=1))
            for fl in meta[link]:
                pth = Path(fl)
                if not pth.exists():
                    raise FileNotFoundError(
                        (link, link_already_downloaded, meta[link]))
                st = pth.stat()
                totalbytes += st.st_size
                totalfiles += 1
            result.append((link, meta[link]))
            remaining_links -= 1
            if thread_state_path is not None:
                scrubbing_running = 'scrubbing' if scrubbing else 'running'
                thread_state_path.write_text(
                    f'{scrubbing_running}:{link_count}:{remaining_links}:{totalbytes}:{totalfiles}')
    finally:
        print((f'\033[{line};0H' if has_its_own_line else '') +
              clrlib.stylize(thread_id.strip(), [clrlib.fg('yellow'), clrlib.attr('bold')]) +
              clrlib.stylize('#', [clrlib.fg('light_red')]) +
              clrlib.stylize('Done', [clrlib.fg('light_green')]) +
              ('\033[K' if has_its_own_line else '')
              )
    return result


def configure_gdl():
    '''Configures Gallery-DL for usage.'''
    parser = gallery_dl.option.build_parser()
    args = parser.parse_args([
        '--download-archive=i_gdl/archive.db',
        '--dest=i_gdl',
        '--write-metadata',
        '--write-tags',
        # '--write-log=i_gdl_log.txt',
        # '--write-unsupported=i_gdl_unsupported.txt',
        '--quiet',
        '--retries=15',
        # '--limit-rate=1500k',
    ])
    gallery_dl.output.initialize_logging(args.loglevel)
    # configuration
    if args.load_config:
        gallery_dl.config.load()
    if args.cfgfiles:
        gallery_dl.config.load(args.cfgfiles, strict=True)
    if args.yamlfiles:
        gallery_dl.config.load(args.yamlfiles, strict=True, fmt="yaml")
    if args.postprocessors:
        gallery_dl.config.set((), "postprocessors", args.postprocessors)
    if args.abort:
        gallery_dl.config.set((), "skip", "abort:" + str(args.abort))
    for opts in args.options:
        gallery_dl.config.set(*opts)
    # loglevels
    gallery_dl.output.configure_logging(args.loglevel)
    gallery_dl.output.select = ColoredLineOutput


class DownloadJobWithCallSaverPostProcessor(gallery_dl.job.DownloadJob):
    '''DownloadJob that records post-processor calls so the final file paths
    can be recovered after the job has run.'''

    def __init__(self, url, parent=None):
        super().__init__(url, parent)
        self.cspp = CallSaverPostProcessor(self)

    def initialize(self, kwdict=None):
        super().initialize(kwdict)
        self.postprocessors.append(self.cspp)


class ColoredLineOutput(gallery_dl.output.TerminalOutput):
    def __init__(self, sameline=False, prefix="", suffix="",
                 prefixsz=0, suffixsz=0, write_successes_to=None):
        super().__init__()
        self.sameline = sameline
        self.eol = '\r' if sameline else '\n'
        self.prefix = prefix
        self.suffix = suffix
        self.prefixsz = prefixsz
        self.suffixsz = suffixsz
        self.write_successes_to = write_successes_to
        self._termsize_update()

    def start(self, path):
        self.message(path,
                     clrlib.fg("light_yellow"),
                     )

    def skip(self, path):
        self.message(path,
                     clrlib.attr('dim'),
                     )

    def success(self, path, tries):
        self.message(path,
                     clrlib.attr('bold'),
                     clrlib.fg('light_green'),
                     )
        if self.write_successes_to is not None:
            self.write_successes_to.write_text(path)

    def message(self, txt: str, *attrs: List[str], do_print: bool = True) -> str:
        """Prints a message with given formatters"""
        clrtxt = clrlib.stylize(self.shorten(txt), attrs)
        fmtd = f"{self.prefix}{clrtxt}{self.suffix}"
        if do_print:
            print(fmtd, flush=True, end=self.eol)
        return fmtd

    def shorten(self, txt):
        self._termsize_update()
        self.width = self.termsize - self.prefixsz - self.suffixsz - 1
        return super().shorten(txt)

    def _termsize_update(self):
        self.termsize = shutil.get_terminal_size().columns


class CallSaverPostProcessor(gallery_dl.postprocessor.common.PostProcessor):
    def __init__(self, job):
        super().__init__(job)
        self.calls = dict(
            prepare=list(),
            run=list(),
            run_metadata=list(),
            run_after=list(),
            run_final=list(),
        )

    def prepare(self, pathfmt):
        """Update file paths, etc."""
        self.calls['prepare'].append((pathfmt.path,))

    def run(self, pathfmt):
        """Execute the postprocessor for a file"""
        self.calls['run'].append((pathfmt.path,))

    def run_metadata(self, pathfmt):
        """Execute the postprocessor for a file"""
        self.calls['run_metadata'].append((pathfmt.path,))

    def run_after(self, pathfmt):
        """Execute postprocessor after moving a file to its target location"""
        self.calls['run_after'].append((pathfmt.path,))

    def run_final(self, pathfmt, status):
        """Postprocessor finalization after all files have been downloaded"""
        self.calls['run_final'].append((pathfmt.path, status))
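

# Note: this module relies on relative imports, so the entry point below only
# works when it is executed as a module (e.g. "python -m reddit_imgs.fetch2"),
# not as a standalone script.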
if __name__ == "__main__":
    main()