#!/usr/bin/env python3
# -*- encoding: utf-8 -*-
"""Compute per-subreddit disk usage from the downloader's JSON indexes
and print it as a formatted table, sorted by total size."""

import json
from pathlib import Path
from typing import Dict, Optional, Set

from .system.cmdline_parser import parse_cmdline
from .system.format_file_size import format_power10
from .system.table_fmt import table_fmt


def cmdline(encoded_args: Optional[str] = None):
    if encoded_args is None:
        return run_with_config()
    else:
        return parse_cmdline(run_with_config, encoded_args)


def run_with_config():
    main()


def main():
    print('Loading files...')
    # Indexes produced by earlier pipeline stages:
    #   files_for_link: link URL  -> list of downloaded file paths
    #   posts_for_link: link URL  -> list of post ids
    #   file_sizes:     file path -> size in bytes
    #   posts_dict:     post id   -> post metadata (incl. 'subreddits')
    files_for_link = json.loads(Path('i_gdl_ffl.json').read_bytes())
    posts_for_link = json.loads(Path('r_gdl_lp.json').read_bytes())
    file_sizes = json.loads(Path('i_gdl_fsz.json').read_bytes())
    posts_dict = json.loads(Path('r_gdl_p.json').read_bytes())
    subreddit_files: Dict[str, Set[str]] = {}
    subreddit_size: Dict[str, int] = {}
    ffl_sz = len(files_for_link)
    print('Processing data...')
    for idx, (link, files) in enumerate(files_for_link.items()):
        if idx % 50000 == 0:
            print(f'{idx + 1} of {ffl_sz}')
        post_ids = posts_for_link[link]
        posts = [posts_dict[post_id] for post_id in post_ids]
        subreddits = [subreddit
                      for post in posts
                      for subreddit in post['subreddits']]
        for subreddit in subreddits:
            if subreddit not in subreddit_files:
                subreddit_files[subreddit] = set()
            if subreddit not in subreddit_size:
                subreddit_size[subreddit] = 0
            for file in files:
                # Count each file once per subreddit, even when several
                # links or posts point at the same file.
                if file not in subreddit_files[subreddit]:
                    subreddit_files[subreddit].add(file)
                    if file in file_sizes:
                        subreddit_size[subreddit] += file_sizes[file]
                    else:
                        print(f'{file!r} does not have a size')
    print('Printing...')
    # Sort by total size (ascending), breaking ties alphabetically.
    srst = sorted(subreddit_size.items(), key=lambda a: (a[1], a[0]))
    print(table_fmt(
        ['subreddit', 'disk usage'],
        [(subreddit, format_power10(size)) for subreddit, size in srst],
        alignment='^>',
    ))