#!/usr/bin/env python3
# -*- encoding: utf-8 -*-

import json
import math
import os
import pickle
import shutil
import subprocess
import sys
import traceback
from collections import OrderedDict
from concurrent.futures import ProcessPoolExecutor as PoolExecutor
from io import StringIO
from pathlib import Path
from typing import Dict, List, Optional, Set, Tuple, Type

import colored as clrlib

import gallery_dl
import gallery_dl.config
import gallery_dl.exception
import gallery_dl.extractor
import gallery_dl.job
import gallery_dl.option
import gallery_dl.output
import gallery_dl.postprocessor.common
import gallery_dl.util

import reddit_imgs.sync

from .system.cmdline_parser import parse_cmdline
from .system.downloader.cache import get_normalized_link, get_path_for_caching
from .system.flattener import flatten_generator
from .system.urlmatcher import search_urls

gdl_pf: Type[gallery_dl.util.PathFormat] = (
    gallery_dl.util.PathFormat
    if not hasattr(gallery_dl.util, 'PathFormatOriginal') else
    gallery_dl.util.PathFormatOriginal)

STOP_JOBS_FLAG_PATH = Path('stop_jobs.flag')

FORBIDDEN_WORKER_SPLITS = {
    'deviantart',
}

MAX_WORKERS = 12
SPLIT_WORKER_AFTER_N_LINKS = 10000
USE_FIREFOX_COOKIES = True
DEBUG_WORKER = None
IGNORE_WORKERS = set()
REDOWNLOAD = False
REDOWNLOAD_EMPTIES = False
CUSTOM_WORKER_SPLITS = dict()
SKIP_INDEXED_FILES = True
RETRY_ERROR_MASK = 0

GDL_ERRORS = [
    'GENERIC_ERR',          # 1
    'UNKNOWN_ERR',          # 2
    'HTTP_ERR',             # 4
    '404_ERR',              # 8
    'AUTH_ERR',             # 16
    'FORMAT_ERR',           # 32
    'LACKS_EXTRACTOR_ERR',  # 64
    'OS_ERR',               # 128
    'NOT_IN_DISK_ERR',      # 256
]
GDL_ERRORS_DICT = {(1 << k): v for k, v in enumerate(GDL_ERRORS)}

# HTML entities that may still be embedded in links scraped from reddit markup,
# mapped to the characters they stand for.
HTML_SPECIAL_CHARS_REPLACE: List[Tuple[str, str]] = [
    ('&lt;', '<'),
    ('&gt;', '>'),
    ('&quot;', '"'),
    ('&#39;', '\''),
    ('&amp;', '&'),
]
HTML_SPECIAL_CHARS: List[str] = list(
    map(lambda a: a[0], HTML_SPECIAL_CHARS_REPLACE))
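

# Illustrative helper, added for clarity; nothing else in this module calls it.
# It decodes a gallery-dl status bitmask into the error names listed above,
# e.g. describe_gdl_status(0b0101) -> ['GENERIC_ERR', 'HTTP_ERR'].
def describe_gdl_status(status: int) -> List[str]:
    return [name for bit, name in GDL_ERRORS_DICT.items() if status & bit]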


def contains_any(s: str, l: List[str]) -> bool:
    for i in l:
        if i in s:
            return True
    return False


def replace_many(s: str, l: List[Tuple[str, str]]) -> str:
    for f, t in l:
        s = s.replace(f, t)
    return s


def cmdline(encoded_args: str = None):
    if encoded_args is None:
        return run_with_config()
    else:
        return parse_cmdline(run_with_config, encoded_args)


def run_with_config(redownload_empties: bool = False,
                    redownload_all: bool = False,
                    use_firefox_cookies: bool = True,
                    retry_generic_errors: bool = False,
                    retry_unknown_errors: bool = False,
                    retry_network_errors: bool = False,
                    retry_not_found_errors: bool = False,
                    retry_auth_errors: bool = False,
                    retry_format_errors: bool = False,
                    retry_extractor_errors: bool = False,
                    retry_os_errors: bool = False,
                    retry_not_in_disk_errors: bool = False,
                    retry_gdl_mask: int = 0,
                    max_workers: int = None,
                    split_workers: int = None,
                    debug_worker: str = None,
                    ignore_workers: Set[str] = set(),
                    custom_worker_splits: dict = None,
                    skip_indexed_files: bool = True,
                    ):
    global SPLIT_WORKER_AFTER_N_LINKS
    global CUSTOM_WORKER_SPLITS
    global USE_FIREFOX_COOKIES
    global SKIP_INDEXED_FILES
    global REDOWNLOAD_EMPTIES
    global RETRY_ERROR_MASK
    global IGNORE_WORKERS
    global DEBUG_WORKER
    global MAX_WORKERS
    global REDOWNLOAD
    REDOWNLOAD = redownload_all
    DEBUG_WORKER = debug_worker
    IGNORE_WORKERS = ignore_workers
    SKIP_INDEXED_FILES = skip_indexed_files
    REDOWNLOAD_EMPTIES = redownload_empties
    USE_FIREFOX_COOKIES = use_firefox_cookies
    RETRY_ERROR_MASK |= retry_gdl_mask
    if retry_generic_errors:
        RETRY_ERROR_MASK |= 1 << 0
    if retry_unknown_errors:
        RETRY_ERROR_MASK |= 1 << 1
    if retry_network_errors:
        RETRY_ERROR_MASK |= 1 << 2
    if retry_not_found_errors:
        RETRY_ERROR_MASK |= 1 << 3
    if retry_auth_errors:
        RETRY_ERROR_MASK |= 1 << 4
    if retry_format_errors:
        RETRY_ERROR_MASK |= 1 << 5
    if retry_extractor_errors:
        RETRY_ERROR_MASK |= 1 << 6
    if retry_os_errors:
        RETRY_ERROR_MASK |= 1 << 7
    if retry_not_in_disk_errors:
        RETRY_ERROR_MASK |= 1 << 8
    if max_workers is not None:
        MAX_WORKERS = max_workers
    if split_workers is not None:
        SPLIT_WORKER_AFTER_N_LINKS = split_workers
    if debug_worker is not None:
        DEBUG_WORKER = debug_worker
    if custom_worker_splits is not None:
        CUSTOM_WORKER_SPLITS = custom_worker_splits
    return main()


def prerrun():
    subreddit_data_path = Path('r.json')
    if not subreddit_data_path.exists():
        print("Executing prerequisite...")
        reddit_imgs.sync.main()
    if USE_FIREFOX_COOKIES:
        print('Getting cookies from Firefox...')
        subprocess.run([
            'reddit_imgs/get_firefox_cookies.sh',
            'i_gdl/.cookies'],
        ).check_returncode()
    subreddit_filters_path = Path('rf.json')
    print('Loading posts from disk...')
    Path('i_gdl').mkdir(exist_ok=True, parents=True)
    workers_state_path = Path('i_gdl_w')
    workers_state_path.mkdir(exist_ok=True, parents=True)
    for wsp in workers_state_path.iterdir():
        wsp.unlink()
    if STOP_JOBS_FLAG_PATH.exists():
        STOP_JOBS_FLAG_PATH.unlink()
    subreddit_data = json.loads(subreddit_data_path.read_text())
    subreddit_filters = json.loads(subreddit_filters_path.read_bytes())
    print('Loading posts...')
    posts = prerrun_flatten_subreddits_into_posts(
        subreddit_data, subreddit_filters)
    print(f'{len(posts)} posts identified.')
    print(f'Identifying alternative trivial links...')
    prerrun_posts_re_sort(posts)
    Path('r_gdl_p.json').write_text(
        json.dumps(posts, indent=1, sort_keys=True))
    print(f'Grouping links with the posts they show up in...')
    links = OrderedDict()
    for dk, post in posts.items():
        for link in post['links']:
            if link not in links:
                links[link] = list()
            links[link].append(dk)
    Path('r_gdl_lp.json').write_text(
        json.dumps(links, indent=1, sort_keys=True))
    known_link_set = set(links.keys())
    print(f'{len(links)} links found')
    print(f'Checking if there is an extractor for each link...')
    r_gdl_le_path = Path('r_gdl_le.json')
    link_extractors = dict()
    if r_gdl_le_path.exists():
        link_extractors = json.loads(r_gdl_le_path.read_text())
    for link in links.keys():
        if link not in link_extractors or link_extractors[link] == '':
            ext = None
            try:
                ext = gallery_dl.extractor.find(link)
            except gallery_dl.exception.NotFoundError:
                pass
            if (ext is not None
                    and type(ext).category == 'reddit'
                    and type(ext).subcategory in ('subreddit', 'user')):
                link_extractors[link] = (
                    f'{type(ext).category}_{type(ext).subcategory}')
            else:
                link_extractors[link] = (
                    type(ext).category if ext is not None else '')
    for discarded_link in set(link_extractors.keys()).difference(known_link_set):
        del link_extractors[discarded_link]
    r_gdl_le_path.write_text(json.dumps(
        link_extractors, indent=1, sort_keys=True))
    links_by_extractor = {
        extractor: list()
        for extractor in list(set(link_extractors.values()))
    }
    for link, extractor in link_extractors.items():
        links_by_extractor[extractor].append(link)
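    # Links with no matching extractor, as well as links that gallery-dl would
    # treat as whole reddit user or subreddit listings, are set aside as
    # undownloadable and kept out of the per-extractor work queues built below.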
    not_downloadable_links = dict()
    not_downloadable_links[''] = links_by_extractor.get('', [])
    not_downloadable_links['reddit_user'] = links_by_extractor.get('reddit_user', [])
    not_downloadable_links['reddit_subreddit'] = links_by_extractor.get('reddit_subreddit', [])
    Path('i_undownloadable.json').write_text(
        json.dumps(not_downloadable_links, indent=1))
    if '' in links_by_extractor:
        del links_by_extractor['']
    if 'reddit_user' in links_by_extractor:
        del links_by_extractor['reddit_user']
    if 'reddit_subreddit' in links_by_extractor:
        del links_by_extractor['reddit_subreddit']
    not_downloadable_link_set = frozenset(
        flatten_generator(not_downloadable_links.values()))
    print(f'{len(links)-len(not_downloadable_link_set)} downloadable links found')
    print(f'{len(not_downloadable_link_set)} undownloadable links found')
    print(f'{len(links_by_extractor)} extractors found')
    Path('r_gdl_lbe.json').write_text(
        json.dumps(links_by_extractor, indent=1, sort_keys=True))
    files_from_links: Dict[str, List[str]] = dict()
    links_no_files: List[str] = list()
    files_sizes: Dict[str, int] = dict()
    link_statuses: Dict[str, int] = dict()
    ignored_links: Set[str] = set()
    if (pth := Path('i_gdl_ffl.json')).exists():
        try:
            files_from_links = json.loads(pth.read_text())
        except json.JSONDecodeError:
            pass
    if (pth := Path('i_gdl_lnf.json')).exists():
        try:
            links_no_files = json.loads(pth.read_text())
        except json.JSONDecodeError:
            pass
    if (pth := Path('i_gdl_fsz.json')).exists():
        try:
            files_sizes = json.loads(pth.read_text())
        except json.JSONDecodeError:
            pass
    if (pth := Path('i_gdl_spl.json')).exists():
        try:
            link_statuses = json.loads(pth.read_text())
        except json.JSONDecodeError:
            pass
    for discarded_link in set(links_no_files).difference(known_link_set):
        links_no_files.remove(discarded_link)
    discarded_files = set()
    for discarded_link in set(files_from_links.keys()).difference(known_link_set):
        if discarded_link in files_from_links:
            files_in_link = files_from_links[discarded_link]
            for file_in_link in files_in_link:
                discarded_files.add(file_in_link)
        if discarded_link in link_statuses:
            del link_statuses[discarded_link]
        del files_from_links[discarded_link]
    files_to_keep = set()
    for files_from_link in files_from_links.values():
        for file_from_link in files_from_link:
            if file_from_link not in files_to_keep:
                files_to_keep.add(file_from_link)
    for discarded_file in discarded_files.difference(files_to_keep):
        if discarded_file in files_sizes:
            del files_sizes[discarded_file]
    for missing_file_size in files_to_keep.difference(set(files_sizes.keys())):
        p = Path(missing_file_size)
        if not p.exists():
            raise FileNotFoundError(missing_file_size)
        else:
            files_sizes[missing_file_size] = p.stat().st_size
            print('Re-filled files_sizes for %r' % p)
    if (p := Path('i_gdl_ignored.txt')).exists():
        ignored_links = set(list(filter(len, p.read_text().splitlines())))
    links_no_files = list(filter(lambda a: a not in ignored_links,
                                 links_no_files))
    link_statuses = dict(filter(lambda a: a[0] not in ignored_links,
                                link_statuses.items()))
    files_from_links = dict(filter(lambda a: a[0] not in ignored_links,
                                   files_from_links.items()))
    checked_links = list(files_from_links.keys()) + links_no_files
    checked_links = frozenset(checked_links)
    # First pass: only measure how many workers each extractor would need, so
    # that worker_by_seq can be sized before any links are assigned.
    max_expected_jobs_for_extractor = 0
    for extractor, links in links_by_extractor.items():
        links = [link
                 for link in links
                 if link not in ignored_links
                 and (
                     link not in checked_links
                     or not SKIP_INDEXED_FILES
                     or (link_statuses.get(link, 0xFF) & RETRY_ERROR_MASK) != 0
                 )]
        if len(links) <= 0:
            continue
        this_worker_split_after_n_links = CUSTOM_WORKER_SPLITS.get(
            extractor, SPLIT_WORKER_AFTER_N_LINKS)
        workers = math.ceil(len(links) / this_worker_split_after_n_links)
        if workers <= 1 or extractor in FORBIDDEN_WORKER_SPLITS:
            workers = 1
        max_expected_jobs_for_extractor = max(
            max_expected_jobs_for_extractor,
            workers
        )
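    # Second pass: apply the same filtering again, but this time actually
    # assign the links to named workers ('extractor' or 'extractor:NN'), each
    # holding at most this_worker_split_after_n_links links.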
    worker_by_seq = [list() for _ in range(max_expected_jobs_for_extractor)]
    links_to_worker = dict()
    for extractor, links in links_by_extractor.items():
        links = [link
                 for link in links
                 if link not in ignored_links
                 and (
                     link not in checked_links
                     or not SKIP_INDEXED_FILES
                     or (link_statuses.get(link, 0xFF) & RETRY_ERROR_MASK) != 0
                 )]
        if len(links) <= 0:
            continue
        this_worker_split_after_n_links = CUSTOM_WORKER_SPLITS.get(
            extractor, SPLIT_WORKER_AFTER_N_LINKS)
        workers = math.ceil(len(links) / this_worker_split_after_n_links)
        if workers <= 1 or extractor in FORBIDDEN_WORKER_SPLITS:
            if extractor in IGNORE_WORKERS:
                continue
            links_to_worker[extractor] = links
            worker_by_seq[0].append(extractor)
        else:
            digits = math.ceil(math.log10(max(1, workers + 1)))
            fmt = "%%0%dd" % digits
            for worker_no in range(workers):
                lowerlimit = (worker_no + 0) * this_worker_split_after_n_links
                upperlimit = (worker_no + 1) * this_worker_split_after_n_links
                thisrange = links[lowerlimit:upperlimit]
                worker_nm = extractor + ':' + (fmt % (worker_no))
                if worker_nm in IGNORE_WORKERS:
                    continue
                links_to_worker[worker_nm] = thisrange
                worker_by_seq[worker_no].append(worker_nm)
    for w in worker_by_seq:
        w.sort()
    workers_nicely_grouped = [
        worker
        for workergroup in worker_by_seq
        for worker in workergroup
        if worker != ''
    ]
    print(f'{len(links_to_worker)} workers to be spawned')
    return (files_from_links,
            links_no_files,
            files_sizes,
            link_statuses,
            workers_nicely_grouped,
            workers_state_path,
            links_to_worker,
            )


def prerrun_flatten_subreddits_into_posts(subreddit_data, subreddit_filters):
    postsl = [
        {'subreddit': subreddit, **post}
        for subreddit, srdt in subreddit_data.items()
        for post in srdt['links']
    ]
    postsl.sort(key=lambda a: (-a['timestamp'], a['datakey']))
    postsd = dict()
    for post in postsl:
        dk = post['datakey']
        sr = post['subreddit']
        if subreddit_filters['no_download'][sr]:
            continue
        if subreddit_filters['no_sfw'][sr] and not post['nsfw']:
            continue
        if subreddit_filters['no_nsfw'][sr] and post['nsfw']:
            continue
        if dk not in postsd:
            postsd[dk] = post.copy()
            postsd[dk]['subreddits'] = list()
            postsd[dk]['links'] = list()
            del postsd[dk]['subreddit']
            del postsd[dk]['link']
            del postsd[dk]['domain']
        if (sr := post['subreddit']) not in (srs := postsd[dk]['subreddits']):
            srs.append(sr)
        if (lnk := get_normalized_link(post['link'])) not in (lnks := postsd[dk]['links']):
            lnks.append(lnk)
    return postsd
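

# The link lists attached to each post are noisy: they may embed HTML markup,
# HTML-escaped entities, tracking query strings, fragments and upper-cased
# protocols. prerrun_posts_re_sort() keeps rewriting each post's link list
# until no rule changes anything anymore (a fixpoint), so every trivially
# equivalent form of a link ends up listed explicitly.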
def prerrun_posts_re_sort(posts):
    for post in sorted(posts.values(), key=lambda a: (-a['timestamp'], a['datakey'])):
        post['subreddits'].sort()
        dk = post['datakey']
        post_links = post['links']
        has_changed_any_link = True
        while has_changed_any_link:
            has_changed_any_link = False
            for link in post_links:
                if '<' in link:  # the "link" is actually an HTML fragment
                    for linkcopy in search_urls(link):
                        linkcopy = get_normalized_link(linkcopy)
                        linkcopy = replace_many(linkcopy, HTML_SPECIAL_CHARS_REPLACE)
                        if linkcopy not in post_links:
                            post_links.append(linkcopy)
                            has_changed_any_link = True
                    while link in post_links:
                        post_links.remove(link)
                        has_changed_any_link = True
                    break
                else:
                    linkcopy = link
                    linkcopy = get_normalized_link(linkcopy)
                    if linkcopy not in post_links:
                        post_links.append(linkcopy)
                        has_changed_any_link = True
                        break
                    if '?' in link:
                        linkcopy = link.split('?', 1)[0]
                        linkcopy = get_normalized_link(linkcopy)
                        if linkcopy not in post_links:
                            post_links.append(linkcopy)
                            has_changed_any_link = True
                            break
                    if '#' in link:
                        linkcopy = link.split('#', 1)[0]
                        linkcopy = get_normalized_link(linkcopy)
                        if linkcopy not in post_links:
                            post_links.append(linkcopy)
                            has_changed_any_link = True
                            break
                    if contains_any(linkcopy, HTML_SPECIAL_CHARS):
                        linkcopy = replace_many(linkcopy, HTML_SPECIAL_CHARS_REPLACE)
                        if linkcopy not in post_links:
                            post_links.append(linkcopy)
                            has_changed_any_link = True
                            break
                    if linkcopy[-1:] in ('/', '#', '?'):
                        while linkcopy[-1:] in ('/', '#', '?'):
                            linkcopy = linkcopy[:-1]
                        linkcopy = get_normalized_link(linkcopy)
                        if linkcopy not in post_links:
                            post_links.append(linkcopy)
                            has_changed_any_link = True
                    if link.strip() == '':
                        while link in post_links:
                            post_links.remove(link)
                            has_changed_any_link = True
                        break
                    if link.startswith('/'):
                        while link in post_links:
                            post_links.remove(link)
                            has_changed_any_link = True
                        break
                    if link.startswith('#'):
                        while link in post_links:
                            post_links.remove(link)
                            has_changed_any_link = True
                        break
                    if link.startswith('mailto'):
                        while link in post_links:
                            post_links.remove(link)
                            has_changed_any_link = True
                        break
                    if (proto := (tpl := link.split(':', 1))[0]).lower() in ('http', 'https') and proto not in ('http', 'https'):
                        lst = list(tpl)
                        lst[0] = lst[0].lower()
                        linkcopy = ':'.join(lst)
                        post_links.remove(link)
                        if linkcopy not in post_links:
                            post_links.append(linkcopy)
                        has_changed_any_link = True
                        break
        post['links'] = list(filter(lambda link: (
            not link.startswith('https://preview.redd.it/')
            or (
                (('?width=' in link) or ('&width=' in link))
                and (('?format=' in link) or ('&format=' in link))
                and (('?auto=' in link) or ('&auto=' in link))
                and (('?s=' in link) or ('&s=' in link))
            )
        ), post['links']))
        post['links'].sort()


def main():
    (files_from_links,
     links_no_files,
     files_sizes,
     link_statuses,
     workers_nicely_grouped,
     workers_state_path,
     links_to_worker,
     ) = prerrun()
    configure_gdl()

    def save_ending_files():
        nonlocal links_no_files
        links_no_files2 = list(map(
            lambda a: a[0],
            filter(lambda a: len(a[1]) <= 0 and a[0] not in links_no_files,
                   files_from_links.items()))) + links_no_files
        files_from_links2 = dict(
            filter(lambda a: len(a[1]) > 0, files_from_links.items()))
        links_no_files2_sorted = sorted(links_no_files2)
        links_for_files = dict()
        for link, files in files_from_links2.items():
            for file in files:
                if file not in links_for_files:
                    links_for_files[file] = list()
                links_for_files[file].append(link)
        del file
        del link
        del files
        os.sync()
        Path('i_gdl_lnf.json').write_text(
            json.dumps(links_no_files2_sorted, indent=1))
        Path('i_gdl_ffl.json').write_text(json.dumps(
            files_from_links2, indent=1, sort_keys=True))
        Path('i_gdl_lff.json').write_text(json.dumps(
            links_for_files, indent=1, sort_keys=True))
        Path('i_gdl_fsz.json').write_text(
            json.dumps(files_sizes, indent=1, sort_keys=True))
        Path('i_gdl_spl.json').write_text(json.dumps(
            link_statuses, indent=1, sort_keys=True))
        os.sync()

    save_ending_files()
    gallery_dl.output.select = lambda: ColoredLineOutput(False)
    totalfiles = 0
    thread_ids = workers_nicely_grouped.copy()
    for line, thread_id in enumerate(thread_ids):
        workers_state_path.joinpath(thread_id + '=line').write_text(str(line))
        linkcount = len(links_to_worker[thread_id])
        workers_state_path.joinpath(thread_id).write_text(
            f'waiting:{linkcount}:{linkcount}:0:0')
    do_fancy_multithreading_panel = False
    thread_id_count = len(thread_ids)
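    # Each worker has two companion files under i_gdl_w/: '<worker>' holds a
    # 'status:total:remaining:bytes:files[:link]' line that is rewritten as the
    # job progresses, and '<worker>=line' holds the job's display line number.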
    if DEBUG_WORKER is not None:
        print(f'Will debug {repr(DEBUG_WORKER)}.')
        thread_id = DEBUG_WORKER
        links_list = links_to_worker[DEBUG_WORKER]
        download_link_list(
            links_list,
            thread_id,
            None,
            f'Debugging {repr(DEBUG_WORKER)}...',
            workers_state_path.joinpath(thread_id),
        )
        return
    if links_to_worker:
        with PoolExecutor(min(MAX_WORKERS, thread_id_count)) as pe:
            if do_fancy_multithreading_panel:
                print(f'\033[2J', end='', flush=True)
                print(f'\033[0;0H', end='', flush=True)
            print('Downloading...', flush=True)
            if do_fancy_multithreading_panel:
                print(f'\033[0;0H', end='', flush=True)
            largest_tid_size = max(map(len, thread_ids))
            line2tid = dict()

            def done_callback_generator(line):
                nonlocal totalfiles

                def terminate_process_pool():
                    os.system('sync')
                    os.system("bash -c \"ps -aux | grep './redditgetter.py' | grep -v grep | sed -e 's/  */ /g' | cut -d' ' -f2 | xargs -r -- kill -15\"")
                    sys.exit(0xFF)

                def done_callback(job):
                    nonlocal totalfiles
                    thread_id = line2tid[line]
                    links_list = links_to_worker[thread_id]
                    try:
                        workers_state_path.joinpath(thread_id).write_text(
                            f'finished:{len(links_list)}:0:0:0')
                        print(clrlib.stylize(
                            f'Received job #{line}: {thread_id}', [
                                clrlib.fg('white'),
                                clrlib.bg('green'),
                                clrlib.attr('bold'),
                            ]
                        ))
                        downloaded_links = list()
                        totalbytes = 0
                        thisfiles = 0
                        true = True
                        downloaded_links = job.result()
                        for link, files in downloaded_links:
                            if true:
                                statusdir = get_path_for_caching(
                                    link, Path('i_gdl_s'))
                                statusdir.mkdir(parents=True, exist_ok=True)
                                statusfile = statusdir.joinpath(
                                    '_gdl_status.json')
                                statuses = dict()
                                if statusfile.exists():
                                    statuses = json.loads(
                                        statusfile.read_text())
                                link_statuses[link] = statuses.get(link, 0xFF)
                            if link not in files_from_links:
                                files_from_links[link] = list()
                            lenfiles = len(files)
                            totalfiles += lenfiles
                            for file in files:
                                filepath = Path(file)
                                thisfiles += 1
                                if filepath.exists():
                                    files_from_links[link].append(file)
                                    st_size = filepath.stat().st_size
                                    files_sizes[file] = st_size
                                    totalbytes += st_size
                        workers_state_path.joinpath(thread_id).write_text(
                            f'finished:{len(links_list)}:0:{totalbytes}:{thisfiles}')
                        save_ending_files()
                    except Exception:
                        sio = StringIO()
                        traceback.print_exc(file=sio)
                        excTxt = sio.getvalue()
                        try:
                            workers_state_path.joinpath(thread_id).write_text(
                                f'failed:{len(links_list)}:0:0:0')
                        except Exception:
                            pass
                        try:
                            workers_state_path.joinpath(
                                thread_id + '=exc').write_text(excTxt)
                        except Exception:
                            pass
                        try:
                            pe.shutdown(wait=False)
                        except Exception:
                            pass
                        print(excTxt)
                        terminate_process_pool()
                        return
                return done_callback

            for line, thread_id in enumerate(thread_ids):
                line2tid[line] = thread_id
                links_list = links_to_worker[thread_id]
                workers_state_path.joinpath(thread_id).write_text(
                    f'enqueued:{len(links_list)}:{len(links_list)}:0:0')
                print(clrlib.stylize(f'Starting job #{line}: {thread_id}', [
                    clrlib.fg('white'),
                    clrlib.bg('light_red'),
                    clrlib.attr('bold'),
                ]))
                jobstardedmsg = clrlib.stylize(f'Starting job #{line}: {thread_id}', [
                    clrlib.fg('black'),
                    clrlib.bg('light_yellow'),
                    clrlib.attr('bold'),
                ])
                thread_id_nmsz = len(thread_id)
                thread_id_display = thread_id + ' ' * (largest_tid_size - thread_id_nmsz)
                job = pe.submit(
                    download_link_list,
                    links_list,
                    thread_id_display,
                    line + 3 if do_fancy_multithreading_panel else None,
                    jobstardedmsg,
                    workers_state_path.joinpath(thread_id),
                )
                job.add_done_callback(done_callback_generator(line))
    save_ending_files()
    if (p := Path('latest_image_download.txt')).exists():
        p.unlink()
    if workers_state_path.exists():
        for p in workers_state_path.glob('*'):
            p.unlink()
        shutil.rmtree(workers_state_path)
    print(f'Downloaded {totalfiles} files')
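

# Illustrative sketch, added for clarity; nothing else in this module calls it.
# It shows one way to read back the worker state files written above, assuming
# the 'status:total:remaining:bytes:files[:link]' format used by main() and
# download_link_list().
def summarize_worker_states(workers_state_path: Path = Path('i_gdl_w')
                            ) -> Dict[str, Tuple[str, int, int]]:
    summary: Dict[str, Tuple[str, int, int]] = dict()
    if not workers_state_path.exists():
        return summary
    for state_file in workers_state_path.iterdir():
        if '=' in state_file.name:  # skip the '=line' and '=exc' companions
            continue
        parts = state_file.read_text().split(':')
        if len(parts) >= 3:
            summary[state_file.name] = (parts[0], int(parts[1]), int(parts[2]))
    return summary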


def download_link_list(links: List[str],
                       thread_id: str,
                       line: Optional[int] = None,
                       job_started_msg: Optional[str] = None,
                       thread_state_path: Optional[Path] = None,
                       ) -> List[Tuple[str, List[str]]]:
    '''Downloads a link list inside a ProcessPoolExecutor.'''
    if STOP_JOBS_FLAG_PATH.exists():
        raise InterruptedError(STOP_JOBS_FLAG_PATH)
    if job_started_msg is not None:
        print(job_started_msg)
    has_its_own_line = line is not None
    link_count = len(links)
    remaining_links = link_count
    configure_gdl()
    if thread_state_path is not None:
        thread_state_path.write_text(
            f'running:{link_count}:{remaining_links}:0:0')

    def get_printer():
        return ColoredLineOutput(
            has_its_own_line,
            prefix=((f'\033[{line};0H' if has_its_own_line else '')
                    + clrlib.stylize('%9d' % remaining_links,
                                     [clrlib.fg('light_cyan')])
                    + clrlib.stylize('@', [clrlib.fg('light_red')])
                    + clrlib.stylize(thread_id, [clrlib.fg('yellow')])
                    + clrlib.stylize('= ', [clrlib.fg('dark_gray')])),
            suffix=('\033[K\033[0;0H' if has_its_own_line else ''),
            prefixsz=len(('%9d' % 0) + ' ' + thread_id),
            suffixsz=0,
            write_successes_to=Path('latest_image_download.txt'),
        )

    gallery_dl.output.select = get_printer
    result = list()
    totalbytes = 0
    totalfiles = 0
    try:
        for link in links:
            scrubbing = True
            cachedir = get_path_for_caching(link, Path('i_gdl_c'))
            statusdir = get_path_for_caching(link, Path('i_gdl_s'))
            cachedir.mkdir(parents=True, exist_ok=True)
            statusdir.mkdir(parents=True, exist_ok=True)
            metafile = cachedir.joinpath('_gdl_meta.json')
            statusfile = statusdir.joinpath('_gdl_status.json')
            meta = dict()
            statuses = dict()
            link_already_downloaded = False
            if metafile.exists():
                try:
                    meta = json.loads(metafile.read_text())
                except json.JSONDecodeError:
                    pass
            if statusfile.exists():
                try:
                    statuses = json.loads(statusfile.read_text())
                except json.JSONDecodeError:
                    pass
            if link in meta and link in statuses:
                link_already_downloaded = True
                rc = statuses.get(link, 0xFF)
                if rc == 0:
                    for fl in meta[link]:
                        pth = Path(fl)
                        try:
                            if not pth.exists():
                                link_already_downloaded = False
                                break
                        except OSError:
                            link_already_downloaded = False
                            break
                if len(meta[link]) == 0 and REDOWNLOAD_EMPTIES:
                    link_already_downloaded = False
                if (rc & RETRY_ERROR_MASK) != 0:
                    link_already_downloaded = False
            if not link_already_downloaded or REDOWNLOAD:
                scrubbing = False
                if thread_state_path is not None:
                    thread_state_path.write_text(
                        f'running:{link_count}:{remaining_links}:{totalbytes}:{totalfiles}:{link}')
                job = DownloadJobWithCallSaverPostProcessor(link)
                job.out = get_printer()
                job.out.message(link, clrlib.fg('light_magenta'))
                rc = job.run()
                os.sync()
                # print('FINAL', job.cspp.calls)
                # raise Exception(job.cspp.calls)
                # files = job.cspp.calls['run_final'].copy()  # Only brings the last element
                files = job.cspp.calls['prepare'].copy()
                files = list(filter(len, files))
                has_changed = True
                while has_changed:
                    has_changed = False
                    for seq, fl in enumerate(files):
                        if not (pth := Path(fl)).exists():
                            candidates = sorted(
                                list(filter(
                                    lambda p: (p.name.startswith(pth.name)
                                               and p.suffix != '.part'
                                               and p.suffix != '.json'),
                                    pth.parent.iterdir())),
                                key=lambda p: len(p.name))
                            if len(candidates) > 0:
                                files[seq] = str(candidates[0])
                                has_changed = True
                                break
                            else:
                                rc |= 256
                                # raise Exception(pth.name, candidates, files)
                del has_changed
                meta[link] = files
                statuses[link] = rc
                metafile.write_text(json.dumps(meta, indent=1))
                statusfile.write_text(json.dumps(statuses, indent=1))
                os.sync()
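            # Verify what ended up on disk: a file listed in the metadata but
            # missing from disk is tolerated when the link's status code is
            # nonzero (the failure is already recorded); with a status of zero
            # a missing file means the bookkeeping is wrong, so fail loudly.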
            for fl in meta[link]:
                code = statuses[link]
                pth = Path(fl)
                if not pth.exists():
                    if code != 0:
                        continue
                    else:
                        raise FileNotFoundError(
                            (link, link_already_downloaded, meta[link]))
                else:
                    totalfiles += 1
                    totalbytes += pth.stat().st_size
            result.append((link, meta[link]))
            remaining_links -= 1
            if thread_state_path is not None:
                scrubbing_running = 'scrubbing' if scrubbing else 'running'
                thread_state_path.write_text(
                    f'{scrubbing_running}:{link_count}:{remaining_links}:{totalbytes}:{totalfiles}:{link}')
            if STOP_JOBS_FLAG_PATH.exists():
                raise InterruptedError(STOP_JOBS_FLAG_PATH)
    finally:
        print((f'\033[{line};0H' if has_its_own_line else '')
              + clrlib.stylize(thread_id.strip(), [clrlib.fg('yellow'),
                                                   clrlib.attr('bold')])
              + clrlib.stylize('#', [clrlib.fg('light_red')])
              + clrlib.stylize('Done', [clrlib.fg('light_green')])
              + ('\033[K' if has_its_own_line else ''))
    return result


def configure_gdl():
    '''Configures Gallery-DL for usage.'''
    parser = gallery_dl.option.build_parser()
    args = parser.parse_args([
        *([] if USE_FIREFOX_COOKIES else ['--cookies=i_gdl/.cookies']),
        '--dest=i_gdl',
        '--write-metadata',
        # '--write-tags',
        # '--write-log=i_gdl_log.txt',
        '--write-unsupported=i_gdl_unsupported.txt',
        # '--quiet',
        *(['--verbose'] if DEBUG_WORKER else []),
        '--retries=1',
        # '--retries=7',
        # '--limit-rate=1500k',
    ])
    gallery_dl.output.initialize_logging(args.loglevel)

    # configuration
    if args.load_config:
        gallery_dl.config.load()
    if args.cfgfiles:
        gallery_dl.config.load(args.cfgfiles, strict=True)
    if args.yamlfiles:
        gallery_dl.config.load(args.yamlfiles, strict=True, fmt="yaml")
    if args.postprocessors:
        gallery_dl.config.set((), "postprocessors", args.postprocessors)
    if args.abort:
        gallery_dl.config.set((), "skip", "abort:" + str(args.abort))
    for opts in args.options:
        gallery_dl.config.set(*opts)

    # loglevels
    gallery_dl.output.configure_logging(args.loglevel)

    gallery_dl.output.select = ColoredLineOutput

    gallery_dl.util.PathFormatOriginal = gdl_pf
    gallery_dl.util.PathFormat = OverriddenPathFormat


class DownloadJobWithCallSaverPostProcessor(gallery_dl.job.DownloadJob):
    def __init__(self, url, parent=None):
        super().__init__(url, parent)
        self.cspp = CallSaverPostProcessor(
            self) if parent is None else parent.cspp

    def initialize(self, kwdict=None):
        super().initialize(kwdict)
        if not isinstance(self.hooks, tuple):
            print('ADDED!!')
            self.hooks['prepare'].append(self.cspp.prepare)


class ColoredLineOutput(gallery_dl.output.TerminalOutput):
    def __init__(self, sameline=False, prefix="", suffix="",
                 prefixsz=0, suffixsz=0, write_successes_to=None):
        super().__init__()
        self.sameline = sameline
        self.eol = '\r' if sameline else '\n'
        self.prefix = prefix
        self.suffix = suffix
        self.prefixsz = prefixsz
        self.suffixsz = suffixsz
        self.write_successes_to = write_successes_to
        self._termsize_update()

    def start(self, path):
        self.message(path,
                     clrlib.fg("light_yellow"),
                     )

    def skip(self, path):
        self.message(path,
                     clrlib.attr('dim'),
                     )

    def success(self, path, tries):
        self.message(path,
                     clrlib.attr('bold'),
                     clrlib.fg('light_green'),
                     )
        if self.write_successes_to is not None:
            self.write_successes_to.write_text(path)

    def message(self, txt: str, *attrs: List[str], do_print: bool = True) -> str:
        """Prints a message with the given formatters."""
        clrtxt = clrlib.stylize(self.shorten(txt), attrs)
        fmtd = f"{self.prefix}{clrtxt}{self.suffix}"
        if do_print:
            print(fmtd, flush=True, end=self.eol)
        return fmtd

    def shorten(self, txt):
        self._termsize_update()
        self.width = self.termsize - self.prefixsz - self.suffixsz - 1
        return super().shorten(txt)

    def _termsize_update(self):
        self.termsize = shutil.get_terminal_size().columns


class OverriddenPathFormat(gdl_pf):
    def __init__(self, extractor):
        super().__init__(extractor)
        self.clean_path = FixFileNameFormatterWrapper(self.clean_path)
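

# CallSaverPostProcessor records the target paths gallery-dl computes for each
# file, so the caller can map a link to the files it produced. prepare() makes
# a pickled copy of the PathFormat with its (unpicklable) formatter callables
# temporarily detached, restores them on both objects, and stores the clone's
# final path.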
class CallSaverPostProcessor(gallery_dl.postprocessor.common.PostProcessor):
    def __init__(self, job):
        super().__init__(job)
        self.calls = dict(
            prepare=list(),
            run=list(),
            run_metadata=list(),
            run_after=list(),
            run_final=list(),
        )

    def prepare(self, pathfmt: gallery_dl.util.PathFormat):
        """Update file paths, etc."""
        directory_formatters = pathfmt.directory_formatters
        filename_formatter = pathfmt.filename_formatter
        clean_segment = pathfmt.clean_segment
        clean_path = pathfmt.clean_path
        pathfmt.directory_formatters = None
        pathfmt.filename_formatter = None
        pathfmt.clean_segment = None
        pathfmt.clean_path = None
        cloned_pathfmt: gallery_dl.util.PathFormat = pickle.loads(
            pickle.dumps(pathfmt))
        pathfmt.directory_formatters = directory_formatters
        pathfmt.filename_formatter = filename_formatter
        pathfmt.clean_segment = clean_segment
        pathfmt.clean_path = clean_path
        cloned_pathfmt.directory_formatters = directory_formatters
        cloned_pathfmt.filename_formatter = filename_formatter
        cloned_pathfmt.clean_segment = clean_segment
        cloned_pathfmt.clean_path = clean_path
        cloned_pathfmt.build_path()
        # print(cloned_pathfmt.path)
        # print(cloned_pathfmt.filename)
        # print(cloned_pathfmt.kwdict)
        # print(cloned_pathfmt)
        self.calls['prepare'].append(cloned_pathfmt.path)
        return pathfmt

    def run(self, pathfmt: gallery_dl.util.PathFormat):
        """Execute the postprocessor for a file"""
        self.calls['run'].append(pathfmt.path)

    def run_metadata(self, pathfmt: gallery_dl.util.PathFormat):
        """Execute the postprocessor for a file"""
        self.calls['run_metadata'].append(pathfmt.path)

    def run_after(self, pathfmt: gallery_dl.util.PathFormat):
        """Execute postprocessor after moving a file to its target location"""
        self.calls['run_after'].append(pathfmt.path)

    def run_final(self, pathfmt: gallery_dl.util.PathFormat, status: int):
        """Postprocessor finalization after all files have been downloaded"""
        self.calls['run_final'].append((pathfmt.path, status))


class FixFileNameFormatterWrapper:
    """Wraps a file name formatter, ensuring a valid file name length."""

    def __init__(self, formatter: gallery_dl.util.Formatter):
        self.formatter = formatter

    def __call__(self, *args, **kwargs) -> str:
        path = self.formatter(*args, **kwargs)
        parts = list(map(fix_filename_ending_extension,
                         map(fix_filename_length,
                             map(fix_filename_ending_extension,
                                 Path(path).parts))))
        return str(Path(*parts))


def fix_filename_length(filename: str) -> str:
    """Ensures a path segment has a valid file name length."""
    if len(filename.encode()) > 240:
        extension = Path(filename).suffix
        extension_bytes_length = len(extension.encode())
        stem_bytes = Path(filename).stem.encode()
        fixed_stem_bytes = stem_bytes[:240 - extension_bytes_length]
        fixed_stem = fixed_stem_bytes.decode(errors="ignore")
        return fixed_stem + extension
    return filename


def fix_filename_ending_extension(filename: str) -> str:
    """Strips trailing dots and spaces from a path segment's stem."""
    if (fp := Path(filename)).stem[-1:] in ('.', ' '):
        return str(fp.parent.joinpath(f"{fp.stem.rstrip('. ')}{fp.suffix}"))
    return filename


if __name__ == "__main__":
    main()