161 lines
7.2 KiB
Python
161 lines
7.2 KiB
Python
#!/usr/bin/env python3
|
|
# -*- encoding: utf-8 -*-
|
|
|
|
import gc
|
|
import hashlib
|
|
import json
|
|
import multiprocessing
|
|
import traceback
|
|
from pathlib import Path
|
|
from typing import Any, Dict, FrozenSet, Generator, List, Set, Tuple, TypeVar
|
|
|
|
import colored
|
|
|
|
from .system.cmdline_parser import parse_cmdline
|
|
from .system.flattener import flatten_generator
|
|
from .system.format_file_size import format_power10
|
|
from .system.hexhashof import hexhashof
|
|
|
|
# Module-level worker count, initially 4.
WORKERS = 4
|
|
|
|
# Type variable for the value type in dict_to_value_list's annotations.
T = TypeVar('T')
|
|
|
|
|
|
def cmdline(encoded_args: str = None):
    """Command-line entry point.

    Without ``encoded_args`` this calls ``run_with_config`` with its default
    arguments. Otherwise ``run_with_config`` and the encoded arguments are
    handed to ``parse_cmdline``. In both cases the callee's result is returned.
    """
    if encoded_args is not None:
        return parse_cmdline(run_with_config, encoded_args)
    return run_with_config()
|
|
|
|
|
|
def run_with_config(max_workers: int = 4):
    """Store the requested worker count in ``WORKERS`` and run ``main``.

    :param max_workers: value assigned to the module-level ``WORKERS``.
    """
    # Without this declaration the assignment below would only bind a
    # function-local name and the module-level WORKERS would stay unchanged.
    global WORKERS
    WORKERS = max_workers
    main()
|
|
|
|
|
|
class AnnotatedText:
    """A text paired with the attribute values that describe it."""

    def __init__(self, text: str, attributes: Dict[str, float]):
        self._text, self._attributes = text, attributes

    @property
    def as_dict(self):
        """Plain ``dict`` with the keys ``'text'`` and ``'attributes'``."""
        return {'text': self._text, 'attributes': self._attributes}

    @classmethod
    def build(cls, text: str, attributes: Dict[str, float]) -> 'AnnotatedText':
        """Return a new instance, or ``None`` when ``text`` is ``None``."""
        return None if text is None else cls(text, attributes)

    @classmethod
    def build_dict(cls, text: str, attributes: Dict[str, float]) -> Dict[str, Any]:
        """Return the ``as_dict`` form directly, or ``None`` when ``text`` is ``None``."""
        if text is not None:
            return cls(text, attributes).as_dict
        return None
|
|
|
|
|
|
def _attribute_vector(base: Dict[str, int],
                      keys: List[str],
                      nsfw: int,
                      subreddit: str,
                      kind: str,
                      ) -> Tuple[int, ...]:
    """Return the 0/1 attribute values for one text, ordered like ``keys``."""
    # ``base`` holds a 0 for every key, 'nsfw' included, so it must be unpacked
    # first; an 'nsfw' entry listed before it would be overwritten by that 0.
    return dict_to_value_list({**base, 'nsfw': nsfw, subreddit: 1, kind: 1}, keys)


def main():
    """Load the Reddit dumps and collect annotated texts per subreddit.

    Reads ``r.json`` (posts) and ``ri.json`` (subreddit metadata) relative to
    the current working directory. For every subreddit named in either file,
    in sorted order, the subreddit's display text, name, URL, title and public
    description are appended to that subreddit's list, followed by the flair,
    title and sharer of each of its posts. Entries are the dicts produced by
    ``AnnotatedText.build_dict``; texts that are ``None`` are skipped.

    Each attribute tuple has one 0/1 value per subreddit name, then one per
    text kind, then the NSFW flag, in the order of ``current_data_keys``.

    NOTE(review): the collected mapping is neither returned nor written out by
    the code visible here.
    """
    print('Loading posts...')
    reddit_posts: dict = json.loads(Path('r.json').read_bytes())
    reddit_meta: dict = json.loads(Path('ri.json').read_bytes())
    print('Building initial dictionary...')
    dct_subreddit_level: Dict[str, List[Dict[str, Any]]] = dict()
    subreddits: List[str] = sorted(set(reddit_posts).union(reddit_meta))
    current_data_keys = [
        *subreddits,
        'subredditDisplayText',
        'subredditName',
        'subredditUrl',
        'subredditTitle',
        'subredditPublicDescription',
        'postFlair',
        'postTitle',
        'postSharer',
        'postLink',
        'nsfw',
    ]
    # All-zero template; each text switches on its own subreddit, kind and flag.
    current_data = {dt: 0 for dt in current_data_keys}
    for subreddit in subreddits:
        dct_this_subreddit = dct_subreddit_level.setdefault(subreddit, list())
        # ``or dict()`` also covers entries that are present but stored as null.
        subreddit_meta: dict = reddit_meta.get(subreddit) or dict()
        definition: dict = subreddit_meta.get('definition') or dict()
        about: dict = subreddit_meta.get('about') or dict()
        subreddit_posts: List[Dict[str, str]] = reddit_posts.get(subreddit, dict(links=list()))['links']
        subreddit_nsfw = 1 if definition.get('isNSFW', False) else 0
        subreddit_texts = (
            (definition.get('displayText'), 'subredditDisplayText'),
            (definition.get('name'), 'subredditName'),
            (definition.get('url'), 'subredditUrl'),
            (definition.get('title'), 'subredditTitle'),
            (about.get('publicDescription'), 'subredditPublicDescription'),
        )
        for text, kind in subreddit_texts:
            add_to_lists([dct_this_subreddit],
                         AnnotatedText.build_dict(
                             text,
                             _attribute_vector(current_data, current_data_keys,
                                               subreddit_nsfw, subreddit, kind)))
        gc.collect()
        print(subreddit, len(subreddit_posts))
        for subreddit_post in subreddit_posts:
            post_nsfw = 1 if subreddit_post.get('nsfw', False) else 0
            # NOTE: the post's 'link' field is deliberately not emitted (it was
            # already disabled), so the 'postLink' column always stays 0.
            post_texts = (
                (subreddit_post.get('flair'), 'postFlair'),
                (subreddit_post.get('title'), 'postTitle'),
                (subreddit_post.get('sharer'), 'postSharer'),
            )
            for text, kind in post_texts:
                add_to_lists([dct_this_subreddit],
                             AnnotatedText.build_dict(
                                 text,
                                 _attribute_vector(current_data, current_data_keys,
                                                   post_nsfw, subreddit, kind)))
|
|
|
|
|
|
def dict_to_value_list(d: Dict[str, T], l: Tuple[str, ...]) -> Tuple[T, ...]:
    """Look up every key of ``l`` in ``d`` and return the values as a tuple.

    The result follows the order of ``l``; a key missing from ``d`` raises
    ``KeyError``.
    """
    return tuple(d[key] for key in l)
|
|
|
|
|
|
def add_to_dicts(dicts: List[Dict[str, List[AnnotatedText]]],
                 key: str,
                 item: AnnotatedText,
                 insert_none: bool = False,
                 ):
    """Append ``item`` to the list stored under ``key`` in each dict of ``dicts``.

    A dict that lacks ``key`` gets a fresh list first. A ``None`` item is
    ignored unless ``insert_none`` is true.
    """
    if item is None and not insert_none:
        return
    for target in dicts:
        target.setdefault(key, list()).append(item)
|
|
|
|
|
|
def add_to_lists(lists: List[List[AnnotatedText]],
                 item: AnnotatedText,
                 insert_none: bool = False,
                 ):
    """Append ``item`` to every list in ``lists``.

    A ``None`` item is ignored unless ``insert_none`` is true.
    """
    if item is None and not insert_none:
        return
    for target in lists:
        target.append(item)
|