#!/usr/bin/env python3 # -*- encoding: utf-8 -*- import json import math import shutil import sys import traceback from concurrent.futures import ProcessPoolExecutor as PoolExecutor from pathlib import Path from typing import List, Optional, Tuple import colored as clrlib import gallery_dl import gallery_dl.config import gallery_dl.extractor import gallery_dl.job import gallery_dl.postprocessor.common import reddit_imgs.sync from .system.downloader.cache import get_normalized_link, get_path_for_caching from .system.urlmatcher import search_urls MAX_WORKERS = 12 SPLIT_WORKER_AFTER_N_LINKS = 1000 FORBIDDEN_WORKER_SPLITS = { 'deviantart', } def main(): subreddit_data_path = Path('r.json') if not subreddit_data_path.exists(): print("Executing prerrequisite...") reddit_imgs.sync.main() print('Loading posts...') workers_state_path = Path('i_gdl_w') workers_state_path.mkdir(exist_ok=True, parents=True) for wsp in workers_state_path.iterdir(): wsp.unlink() subreddit_data = json.loads(subreddit_data_path.read_text()) links = dict() postsl = [ {**post, 'subreddit': subreddit} for subreddit, srdt in subreddit_data.items() for post in srdt['links'] ] postsd = dict() for post in postsl: dk = post['datakey'] if dk not in postsd: postsd[dk] = post.copy() postsd[dk]['subreddits'] = list() postsd[dk]['links'] = list() del postsd[dk]['subreddit'] del postsd[dk]['link'] del postsd[dk]['domain'] if (sr := post['subreddit']) not in (srs := postsd[dk]['subreddits']): srs.append(sr) if (lnk := get_normalized_link(post['link'])) not in (lnks := postsd[dk]['links']): lnks.append(lnk) posts = postsd del postsl del postsd print(f'{len(posts)} posts identified.') print(f'Identifying alternative trivial links...') for post in posts.values(): dk = post['datakey'] post_links = post['links'] has_added_any_link = True while has_added_any_link: has_added_any_link = False for link in post_links: linkcopy = link while linkcopy.endswith('/') or linkcopy.endswith('#') or linkcopy.endswith('?'): linkcopy = linkcopy[:-1] if linkcopy not in post_links: post_links.append(linkcopy) has_added_any_link = True if '