# reddit-image-wall-getter/reddit_imgs/sizebysubreddit.py
# (70 lines, 2.3 KiB, Python)
#!/usr/bin/env python3
# -*- encoding: utf-8 -*-
import hashlib
import json
import multiprocessing
import traceback
from pathlib import Path
from typing import Any, Dict, FrozenSet, Generator, List, Optional, Set, TypeVar

import colored

from .system.cmdline_parser import parse_cmdline
from .system.flattener import flatten_generator
from .system.format_file_size import format_power10
from .system.hexhashof import hexhashof
from .system.table_fmt import table_fmt
WORKERS = 4
T = TypeVar('T')
def cmdline(encoded_args: Optional[str] = None):
    """Command-line entry point.

    :param encoded_args: serialized argument string to be decoded by
        ``parse_cmdline``; when ``None``, run directly with defaults.
        (Annotation fixed: a ``None`` default requires ``Optional[str]``,
        not ``str``.)
    :returns: whatever ``run_with_config`` returns (currently ``None``).
    """
    if encoded_args is None:
        return run_with_config()
    # Let the shared parser decode the args and invoke run_with_config.
    return parse_cmdline(run_with_config, encoded_args)
def run_with_config():
    """Run the size-by-subreddit report; no configuration options yet."""
    main()
def main():
    """Aggregate downloaded-image disk usage per subreddit and print a table.

    Reads four JSON index files produced by earlier pipeline stages
    (relative to the working directory):

    - ``i_gdl_ffl.json``: link -> list of downloaded file paths
    - ``r_gdl_lp.json``:  link -> list of post ids
    - ``i_gdl_fsz.json``: file path -> size in bytes
    - ``r_gdl_p.json``:   post id -> post record with a ``'subreddits'`` list

    Each distinct file is counted at most once per subreddit (a file reached
    through several posts of the same subreddit is not double-counted).
    Files missing from the size index are reported on stdout and skipped.
    """
    print('Loading files...')
    files_for_link = json.loads(Path('i_gdl_ffl.json').read_bytes())
    posts_for_link = json.loads(Path('r_gdl_lp.json').read_bytes())
    file_sizes = json.loads(Path('i_gdl_fsz.json').read_bytes())
    posts_dict = json.loads(Path('r_gdl_p.json').read_bytes())
    # Per-subreddit dedup set and running byte total.
    subreddit_files: Dict[str, Set[str]] = dict()
    subreddit_size: Dict[str, int] = dict()
    ffl_sz = len(files_for_link)
    print('Processing data...')
    for idx, (link, files) in enumerate(files_for_link.items()):
        if idx % 50000 == 0:
            # Coarse progress indicator for large indexes.
            print(f'{idx+1} of {ffl_sz}')
        # Every subreddit that any post for this link was submitted to.
        subreddits = [
            subreddit
            for post_id in posts_for_link[link]
            for subreddit in posts_dict[post_id]['subreddits']
        ]
        for subreddit in subreddits:
            # setdefault replaces the membership-test-then-insert dance
            # (one lookup instead of two per key).
            known_files = subreddit_files.setdefault(subreddit, set())
            subreddit_size.setdefault(subreddit, 0)
            for file in files:
                if file not in known_files:
                    known_files.add(file)
                    if file in file_sizes:
                        subreddit_size[subreddit] += file_sizes[file]
                    else:
                        print('%r does not have a size' % file)
    print('Printing...')
    # Sort ascending by size, tie-breaking on subreddit name.
    srst = sorted(subreddit_size.items(), key=lambda a: (a[1], a[0]))
    print(table_fmt(
        'subreddit,disk usage'.split(','),
        list(map(lambda a: (a[0], format_power10(a[1])), srst)),
        alignment='^>'
    ))