reddit-image-wall-getter/reddit_imgs/linguisticdictanal.py

161 lines
7.2 KiB
Python

#!/usr/bin/env python3
# -*- encoding: utf-8 -*-
import gc
import hashlib
import json
import multiprocessing
import traceback
from pathlib import Path
from typing import Any, Dict, FrozenSet, Generator, List, Set, Tuple, TypeVar
import colored
from .system.cmdline_parser import parse_cmdline
from .system.flattener import flatten_generator
from .system.format_file_size import format_power10
from .system.hexhashof import hexhashof
# Module-level worker-count setting; defaults to 4.
WORKERS = 4
# Generic type variable used in the signature of dict_to_value_list.
T = TypeVar('T')
def cmdline(encoded_args: str = None):
    """Command-line entry point.

    When ``encoded_args`` is given, it is handed to ``parse_cmdline`` together
    with ``run_with_config``; otherwise ``run_with_config`` is called directly
    with its default arguments. The result of whichever call is made is
    returned.
    """
    if encoded_args is not None:
        return parse_cmdline(run_with_config, encoded_args)
    return run_with_config()
def run_with_config(max_workers: int = 4):
    """Store the requested worker count in the module setting and run ``main``.

    Args:
        max_workers: Value to assign to the module-level ``WORKERS`` setting.
    """
    # Without the ``global`` declaration the assignment below would only bind
    # a function-local name and the module-level WORKERS would stay unchanged.
    global WORKERS
    WORKERS = max_workers
    main()
class AnnotatedText:
    """A text value paired with the attributes object that annotates it."""

    def __init__(self, text: str, attributes: Dict[str, float]):
        # Both values are stored as given; nothing is copied or validated.
        self._attributes = attributes
        self._text = text

    @property
    def as_dict(self):
        """Plain-dict view with the keys ``text`` and ``attributes``."""
        return {'text': self._text, 'attributes': self._attributes}

    @classmethod
    def build(cls, text: str, attributes: Dict[str, float]) -> 'AnnotatedText':
        """Return a new instance, or ``None`` when ``text`` is ``None``."""
        return None if text is None else cls(text, attributes)

    @classmethod
    def build_dict(cls, text: str, attributes: Dict[str, float]) -> Dict[str, Any]:
        """Return the ``as_dict`` form of a new instance, or ``None`` when ``text`` is ``None``."""
        if text is None:
            return None
        annotated = cls(text, attributes)
        return annotated.as_dict
def _add_annotated_text(target, text, field, subreddit, nsfw, defaults, keys):
    """Append one annotated-text dict to ``target`` unless ``text`` is None.

    Args:
        target: List that receives the ``AnnotatedText.build_dict`` result.
        text: The text to annotate; ``None`` means "nothing to add".
        field: Attribute key naming the kind of text (e.g. ``'postTitle'``).
        subreddit: Attribute key of the subreddit the text belongs to.
        nsfw: 1 or 0, the NSFW flag to record for this text.
        defaults: Mapping with every attribute key set to 0.
        keys: Attribute keys in the order the value tuple is laid out.
    """
    if text is None:
        # build_dict() would return None and add_to_lists() would drop it, so
        # skip early instead of building a vector that is thrown away.
        return
    attributes = {
        **defaults,
        # Must be applied after the zeroed defaults: they contain an
        # 'nsfw': 0 entry that would otherwise overwrite the real flag.
        'nsfw': nsfw,
        subreddit: 1,
        field: 1,
    }
    add_to_lists(
        [target],
        AnnotatedText.build_dict(text, dict_to_value_list(attributes, keys)),
    )


def main():
    """Load the Reddit dumps and build per-subreddit lists of annotated texts.

    Reads ``r.json`` (posts) and ``ri.json`` (subreddit metadata) relative to
    the current working directory. For every subreddit found in either file,
    the subreddit's descriptive texts and each post's flair, title and sharer
    are collected together with a tuple of attribute values (one column per
    subreddit, one per text kind, plus ``nsfw``).

    The collected data lives in a local dict; this function neither returns
    nor persists it.
    """
    reddit_posts_path = Path('r.json')
    reddit_meta_path = Path('ri.json')
    print('Loading posts...')
    reddit_posts: dict = json.loads(reddit_posts_path.read_bytes())
    reddit_meta: dict = json.loads(reddit_meta_path.read_bytes())
    print('Building initial dictionary...')
    # The lists hold the plain dicts produced by AnnotatedText.build_dict.
    dct_subreddit_level: Dict[str, List[Dict[str, Any]]] = dict()
    subreddits: List[str] = sorted(set(reddit_posts.keys()).union(reddit_meta.keys()))
    # Column order of every attribute tuple: subreddits first, then text kinds.
    # 'postLink' keeps its column although link texts are currently not
    # collected (see the end of the post loop).
    # NOTE(review): a subreddit whose name equals one of the literal keys
    # below (e.g. 'nsfw') would share that key's value — confirm whether
    # such names can occur in the input.
    current_data_keys = [
        *subreddits,
        'subredditDisplayText',
        'subredditName',
        'subredditUrl',
        'subredditTitle',
        'subredditPublicDescription',
        'postFlair',
        'postTitle',
        'postSharer',
        'postLink',
        'nsfw',
    ]
    current_data = dict.fromkeys(current_data_keys, 0)
    for subreddit in subreddits:
        dct_this_subreddit = dct_subreddit_level.setdefault(subreddit, list())
        # A missing or null entry (at any of these levels) is treated as empty.
        subreddit_meta: dict = reddit_meta.get(subreddit) or dict()
        definition: dict = subreddit_meta.get('definition') or dict()
        about: dict = subreddit_meta.get('about') or dict()
        subreddit_posts: List[Dict[str, str]] = reddit_posts.get(
            subreddit, dict(links=list()))['links']
        subreddit_nsfw = 1 if definition.get('isNSFW', False) else 0
        subreddit_texts = (
            ('subredditDisplayText', definition.get('displayText')),
            ('subredditName', definition.get('name')),
            ('subredditUrl', definition.get('url')),
            ('subredditTitle', definition.get('title')),
            ('subredditPublicDescription', about.get('publicDescription')),
        )
        for field, text in subreddit_texts:
            _add_annotated_text(dct_this_subreddit, text, field, subreddit,
                                subreddit_nsfw, current_data, current_data_keys)
        gc.collect()
        print(subreddit, len(subreddit_posts))
        for subreddit_post in subreddit_posts:
            post_nsfw = 1 if subreddit_post.get('nsfw', False) else 0
            post_texts = (
                ('postFlair', subreddit_post.get('flair')),
                ('postTitle', subreddit_post.get('title')),
                ('postSharer', subreddit_post.get('sharer')),
                # Post links are intentionally not collected at the moment:
                # ('postLink', subreddit_post.get('link')),
            )
            for field, text in post_texts:
                _add_annotated_text(dct_this_subreddit, text, field, subreddit,
                                    post_nsfw, current_data, current_data_keys)
def dict_to_value_list(d: Dict[str, T], l: Tuple[str, ...]) -> Tuple[T, ...]:
    """Return the values of ``d`` for the keys in ``l``, in that order, as a tuple.

    Every key in ``l`` is looked up with ``d[key]``, so a key missing from
    ``d`` raises ``KeyError``.
    """
    return tuple(map(d.__getitem__, l))
def add_to_dicts(dicts: List[Dict[str, List[AnnotatedText]]],
                 key: str,
                 item: AnnotatedText,
                 insert_none: bool = False,
                 ):
    """Append ``item`` to the list stored under ``key`` in every dict of ``dicts``.

    A dict that does not yet have ``key`` gets a new empty list first. A
    ``None`` item is ignored unless ``insert_none`` is true.
    """
    if item is None and not insert_none:
        return
    for mapping in dicts:
        mapping.setdefault(key, list()).append(item)
def add_to_lists(lists: List[List[AnnotatedText]],
                 item: AnnotatedText,
                 insert_none: bool = False,
                 ):
    """Append ``item`` to every list in ``lists``.

    A ``None`` item is ignored unless ``insert_none`` is true.
    """
    if item is None and not insert_none:
        return
    for target in lists:
        target.append(item)