#!/usr/bin/env python3
# -*- encoding: utf-8 -*-
"""Build per-subreddit lists of annotated text from two JSON dump files."""

import gc
import hashlib
import json
import multiprocessing
import traceback
from pathlib import Path
from typing import (Any, Dict, FrozenSet, Generator, List, Optional, Sequence,
                    Set, Tuple, TypeVar)

import colored

from .system.cmdline_parser import parse_cmdline
from .system.flattener import flatten_generator
from .system.format_file_size import format_power10
from .system.hexhashof import hexhashof

WORKERS = 4

T = TypeVar('T')

# (key inside the subreddit's 'definition' mapping, flag set for that text)
_SUBREDDIT_DEFINITION_FIELDS: Tuple[Tuple[str, str], ...] = (
    ('displayText', 'subredditDisplayText'),
    ('name', 'subredditName'),
    ('url', 'subredditUrl'),
    ('title', 'subredditTitle'),
)

# (key inside a post mapping, flag set for that text)
# NOTE: the post 'link' field is deliberately absent here; the call that
# annotated it was disabled in the original code.
_POST_FIELDS: Tuple[Tuple[str, str], ...] = (
    ('flair', 'postFlair'),
    ('title', 'postTitle'),
    ('sharer', 'postSharer'),
)

# Fixed columns that follow the per-subreddit columns in every feature
# vector. 'postLink' stays in the layout even though no text is tagged with
# it, so the vector length and column order are unchanged.
_FIELD_KEYS: Tuple[str, ...] = (
    'subredditDisplayText',
    'subredditName',
    'subredditUrl',
    'subredditTitle',
    'subredditPublicDescription',
    'postFlair',
    'postTitle',
    'postSharer',
    'postLink',
    'nsfw',
)


def cmdline(encoded_args: Optional[str] = None):
    """Entry point: run directly, or let ``parse_cmdline`` decode arguments.

    :param encoded_args: when ``None``, ``run_with_config`` is called with
        its defaults; otherwise the value is handed to ``parse_cmdline``
        together with ``run_with_config``.
    :return: whatever the selected call returns.
    """
    if encoded_args is None:
        return run_with_config()
    return parse_cmdline(run_with_config, encoded_args)


def run_with_config(max_workers: int = 4):
    """Store the worker count in the module-level ``WORKERS`` and run ``main``.

    :param max_workers: value assigned to ``WORKERS``.
    """
    # Without the ``global`` declaration the assignment would only create a
    # function-local name and the module constant would never change.
    global WORKERS
    WORKERS = max_workers
    main()


class AnnotatedText:
    """A piece of text paired with the attributes that describe it."""

    def __init__(self, text: str, attributes: Dict[str, float]):
        """Store ``text`` and ``attributes`` unchanged.

        ``main`` passes a tuple of 0/1 flags as ``attributes`` rather than a
        mapping; the value is stored as given either way.
        """
        self._text = text
        self._attributes = attributes

    @property
    def as_dict(self) -> Dict[str, Any]:
        """Return ``{'text': ..., 'attributes': ...}`` for this instance."""
        return dict(text=self._text, attributes=self._attributes)

    @classmethod
    def build(cls,
              text: Optional[str],
              attributes: Dict[str, float]) -> Optional['AnnotatedText']:
        """Return an instance, or ``None`` when ``text`` is ``None``."""
        if text is None:
            return None
        return cls(text, attributes)

    @classmethod
    def build_dict(cls,
                   text: Optional[str],
                   attributes: Dict[str, float]) -> Optional[Dict[str, Any]]:
        """Return the ``as_dict`` form, or ``None`` when ``text`` is ``None``."""
        if text is None:
            return None
        return cls(text, attributes).as_dict


def _feature_vector(keys: Sequence[str], nsfw: int, *active_keys: str) -> tuple:
    """Return one value per entry of ``keys``: 0 everywhere except the flags.

    The 'nsfw' column receives ``nsfw`` and each of ``active_keys`` is set to
    1. The zero defaults are laid down first so they cannot overwrite the
    computed 'nsfw' value.
    """
    flags = dict.fromkeys(keys, 0)
    flags['nsfw'] = nsfw
    for key in active_keys:
        flags[key] = 1
    return dict_to_value_list(flags, keys)


def _append_annotated(target: List[Any],
                      text: Optional[str],
                      keys: Sequence[str],
                      nsfw: int,
                      subreddit: str,
                      field: str) -> None:
    """Append the annotated-text dict for ``text`` to ``target``.

    Nothing is appended (and no feature vector is built) when ``text`` is
    ``None``.
    """
    if text is None:
        return
    attributes = _feature_vector(keys, nsfw, subreddit, field)
    add_to_lists([target], AnnotatedText.build_dict(text, attributes))


def main(reddit_posts_path: Path = Path('r.json'),
         reddit_meta_path: Path = Path('ri.json'),
         ) -> Dict[str, List[Dict[str, Any]]]:
    """Load the two JSON files and collect annotated texts per subreddit.

    For every key found in either file, the subreddit's display text, name,
    url, title and public description are collected, followed by the flair,
    title and sharer of each of its posts. Missing (``None``) texts are
    skipped. Each text carries a tuple of flags ordered as: one column per
    subreddit (sorted), then the fixed columns of ``_FIELD_KEYS``.

    :param reddit_posts_path: JSON file mapping subreddit -> mapping with a
        'links' list of post mappings.
    :param reddit_meta_path: JSON file mapping subreddit -> mapping with
        optional 'definition' and 'about' mappings.
    :return: mapping of subreddit -> list of ``{'text', 'attributes'}`` dicts.
    """
    print('Loading posts...')
    reddit_posts: dict = json.loads(Path(reddit_posts_path).read_bytes())
    reddit_meta: dict = json.loads(Path(reddit_meta_path).read_bytes())
    print('Building initial dictionary...')
    dct_subreddit_level: Dict[str, List[Dict[str, Any]]] = dict()
    subreddits: List[str] = sorted(set(reddit_posts) | set(reddit_meta))
    current_data_keys: Tuple[str, ...] = (*subreddits, *_FIELD_KEYS)
    for subreddit in subreddits:
        dct_this_subreddit = dct_subreddit_level.setdefault(subreddit, list())
        # ``or`` also covers entries that are present but null in the JSON.
        subreddit_meta: dict = reddit_meta.get(subreddit) or dict()
        definition: dict = subreddit_meta.get('definition') or dict()
        about: dict = subreddit_meta.get('about') or dict()
        subreddit_posts: List[Dict[str, str]] = (
            (reddit_posts.get(subreddit) or dict()).get('links') or list())
        subreddit_nsfw = 1 if definition.get('isNSFW', False) else 0
        for source_key, field in _SUBREDDIT_DEFINITION_FIELDS:
            _append_annotated(dct_this_subreddit, definition.get(source_key),
                              current_data_keys, subreddit_nsfw,
                              subreddit, field)
        _append_annotated(dct_this_subreddit, about.get('publicDescription'),
                          current_data_keys, subreddit_nsfw,
                          subreddit, 'subredditPublicDescription')
        gc.collect()
        print(subreddit, len(subreddit_posts))
        for subreddit_post in subreddit_posts:
            post_nsfw = 1 if subreddit_post.get('nsfw', False) else 0
            for source_key, field in _POST_FIELDS:
                _append_annotated(dct_this_subreddit,
                                  subreddit_post.get(source_key),
                                  current_data_keys, post_nsfw,
                                  subreddit, field)
    return dct_subreddit_level


def dict_to_value_list(d: Dict[str, T], l: Sequence[str]) -> Tuple[T, ...]:
    """Return the values of ``d`` for the keys in ``l``, in that order.

    :raises KeyError: if a key from ``l`` is missing in ``d``.
    """
    return tuple(d[key] for key in l)


def add_to_dicts(dicts: List[Dict[str, List[Any]]],
                 key: str,
                 item: Any,
                 insert_none: bool = False,
                 ):
    """Append ``item`` to the list stored under ``key`` in every dict.

    The list is created when ``key`` is absent. A ``None`` item is skipped
    unless ``insert_none`` is true.
    """
    if insert_none or item is not None:
        for dct in dicts:
            dct.setdefault(key, list()).append(item)


def add_to_lists(lists: List[List[Any]],
                 item: Any,
                 insert_none: bool = False,
                 ):
    """Append ``item`` to every list in ``lists``.

    A ``None`` item is skipped unless ``insert_none`` is true.
    """
    if insert_none or item is not None:
        for lst in lists:
            lst.append(item)