#!/usr/bin/env python3
# -*- encoding: utf-8 -*-
'''Exports a filtered list of hashed files as "hash|path" lines, optionally
restricted by file extension, NSFW-ness and subreddit.'''

import datetime
import json
import multiprocessing
from pathlib import Path
from typing import Any, Collection, Dict, FrozenSet, List, Optional, Tuple

import colored as clrlib

from .system.cmdline_parser import parse_cmdline
from .system.flattener import flatten_generator
from .system.format_file_size import format_power10

HISTORICAL_EXPORT = False
EXTENSION_FILTER = None
NSFW_NESS_FILTER = None
SUBREDDIT_FILTER = None


def cmdline(encoded_args: Optional[str] = None):
    if encoded_args is None:
        return run_with_config()
    else:
        return parse_cmdline(run_with_config, encoded_args)


def run_with_config(historical: bool = False,
                    nsfw_ness_filter: Optional[bool] = None,
                    extension_filter: Optional[list] = None,
                    subreddit_filter: Optional[frozenset] = None):
    global HISTORICAL_EXPORT
    global EXTENSION_FILTER
    global NSFW_NESS_FILTER
    global SUBREDDIT_FILTER
    EXTENSION_FILTER = extension_filter
    HISTORICAL_EXPORT = historical
    NSFW_NESS_FILTER = nsfw_ness_filter
    # Normalize subreddit names to lowercase so the filter is case-insensitive.
    SUBREDDIT_FILTER = (None
                        if subreddit_filter is None else
                        frozenset(sr.lower() for sr in subreddit_filter))
    return main()


class ExtensionFilter(multiprocessing.Process):
    '''Forwards only files whose suffix is in the allowed set, or everything
    when no set is given. A None item is the end-of-stream sentinel.'''

    def __init__(self,
                 allowed_extensions: Optional[Collection[str]],
                 input_queue: multiprocessing.Queue,
                 output_queue: multiprocessing.Queue):
        multiprocessing.Process.__init__(self)
        self.allowed_extensions = allowed_extensions
        self.input_queue = input_queue
        self.output_queue = output_queue

    def run(self):
        while True:
            next_item: str = self.input_queue.get()
            if next_item is None:
                # Propagate the sentinel downstream and stop.
                self.output_queue.put(None)
                break
            if self.allowed_extensions is None:
                self.output_queue.put(next_item)
            elif Path(next_item).suffix in self.allowed_extensions:
                self.output_queue.put(next_item)


class NsfwNessFilter(multiprocessing.Process):
    '''Forwards only files whose originating posts all match the requested
    NSFW flag, or everything when no flag is given.'''

    def __init__(self,
                 nsfw_ness: Optional[bool],
                 file2link: Dict[str, List[str]],
                 link2post: Dict[str, List[str]],
                 post_info: Dict[str, Dict[str, Any]],
                 input_queue: multiprocessing.Queue,
                 output_queue: multiprocessing.Queue):
        multiprocessing.Process.__init__(self)
        self.nsfw_ness = nsfw_ness
        self.file2link = file2link
        self.link2post = link2post
        self.post_info = post_info
        self.input_queue = input_queue
        self.output_queue = output_queue

    def run(self):
        while True:
            next_item: str = self.input_queue.get()
            if next_item is None:
                self.output_queue.put(None)
                break
            # Map file -> links -> posts and require every post to match.
            if (
                (self.nsfw_ness is None)
                or all(map(
                    lambda post: (
                        self.post_info
                        .get(post, {})
                        .get('nsfw', None) == self.nsfw_ness
                    ),
                    flatten_generator(map(
                        lambda link: self.link2post.get(link, []),
                        self.file2link.get(next_item, [])
                    ))
                ))
            ):
                self.output_queue.put(next_item)


class SubredditFilter(multiprocessing.Process):
    '''Forwards only files whose originating posts all belong to the given
    subreddits, or everything when no subreddit set is given.'''

    def __init__(self,
                 subreddits: Optional[FrozenSet[str]],
                 file2link: Dict[str, List[str]],
                 link2post: Dict[str, List[str]],
                 post_info: Dict[str, Dict[str, Any]],
                 input_queue: multiprocessing.Queue,
                 output_queue: multiprocessing.Queue):
        multiprocessing.Process.__init__(self)
        self.subreddits = subreddits
        self.file2link = file2link
        self.link2post = link2post
        self.post_info = post_info
        self.input_queue = input_queue
        self.output_queue = output_queue

    def run(self):
        while True:
            next_item: str = self.input_queue.get()
            if next_item is None:
                self.output_queue.put(None)
                break
            # Map file -> links -> posts -> subreddits and require every
            # subreddit to be in the allowed set.
            if (
                (self.subreddits is None)
                or all(map(
                    lambda subreddit: subreddit in self.subreddits,
                    flatten_generator(map(
                        lambda post: (
                            self.post_info
                            .get(post, {})
                            .get('subreddits', [])
                        ),
                        flatten_generator(map(
                            lambda link: self.link2post.get(link, []),
                            self.file2link.get(next_item, [])
                        ))
                    ))
                ))
            ):
                self.output_queue.put(next_item)
class FileExistsFilter(multiprocessing.Process):
    '''Forwards only files that still exist on disk. Currently unused in the
    pipeline (see the commented-out lines in main()).'''

    def __init__(self,
                 input_queue: multiprocessing.Queue,
                 output_queue: multiprocessing.Queue):
        multiprocessing.Process.__init__(self)
        self.input_queue = input_queue
        self.output_queue = output_queue

    def run(self):
        while True:
            next_item: str = self.input_queue.get()
            if next_item is None:
                self.output_queue.put(None)
                break
            if Path(next_item).exists():
                self.output_queue.put(next_item)


def main():
    exported_name = ('h_gdl.txt'
                     if not HISTORICAL_EXPORT else
                     f'h_gdl_{datetime.datetime.now()}.txt')
    exported_path = Path(exported_name)
    exported_path.write_text('')
    # i_gdl_hashes.txt holds "hash|file" lines; build (file, hash) pairs.
    hashes_list: List[Tuple[str, str]] = list(map(
        lambda a: (a[1], a[0]),
        map(
            lambda a: a.split('|', 1),
            Path('i_gdl_hashes.txt').read_text().splitlines())))
    hashes_dict: Dict[str, str] = dict(hashes_list)
    r_gdl_p = json.loads(Path('r_gdl_p.json').read_text())      # post info
    r_gdl_lp = json.loads(Path('r_gdl_lp.json').read_text())    # link -> posts
    i_gdl_lff = json.loads(Path('i_gdl_lff.json').read_text())  # file -> links
    file_sizes = json.loads(Path('i_gdl_fsz.json').read_text())
    general_size = 0
    general_count = 0
    # Pipeline: extension filter -> NSFW filter -> subreddit filter.
    existing_files_queue = multiprocessing.Queue()
    intermediary_queue = multiprocessing.Queue()
    intermediary_queue2 = multiprocessing.Queue()
    remaining_queue = multiprocessing.Queue()
    ffp = ExtensionFilter(EXTENSION_FILTER,
                          existing_files_queue,
                          intermediary_queue)
    nfp = NsfwNessFilter(NSFW_NESS_FILTER,
                         i_gdl_lff,
                         r_gdl_lp,
                         r_gdl_p,
                         intermediary_queue,
                         intermediary_queue2)
    srp = SubredditFilter(SUBREDDIT_FILTER,
                          i_gdl_lff,
                          r_gdl_lp,
                          r_gdl_p,
                          intermediary_queue2,
                          remaining_queue)
    # fep = FileExistsFilter(intermediary_queue, remaining_queue)
    ffp.start()
    nfp.start()
    srp.start()
    # fep.start()
    for file, _ in hashes_list:
        existing_files_queue.put(file)
    existing_files_queue.put(None)  # end-of-stream sentinel
    known_hashes = set()
    with exported_path.open('at') as fd:
        while True:
            next_item: str = remaining_queue.get()
            if next_item is None:
                break
            else:
                hsh = hashes_dict[next_item]
                known_hashes.add(hsh)
                fd.write(f'{hsh}|{next_item}\n')
                general_size += file_sizes.get(next_item, 0)
                general_count += 1
    ffp.join()
    nfp.join()
    srp.join()
    # fep.join()
    existing_files_queue.close()
    existing_files_queue.join_thread()
    intermediary_queue.close()
    intermediary_queue.join_thread()
    intermediary_queue2.close()
    intermediary_queue2.join_thread()
    remaining_queue.close()
    remaining_queue.join_thread()
    print(f'Found {general_count} files')
    print(f'Found {len(known_hashes)} unique hashes')
    print(f'Size: {general_size} bytes ({format_power10(general_size)})')


if __name__ == "__main__":
    HISTORICAL_EXPORT = True
    main()