From 5d13706d5444b71069cd58b3d4290718d7e4d4b8 Mon Sep 17 00:00:00 2001 From: Adler Neves Date: Thu, 5 Nov 2020 21:08:05 -0300 Subject: [PATCH] refactoring --- .gitignore | 11 +- cron_prerun_kill.sh | 32 ++ getlinkof.py | 23 ++ .../.gitignore | 0 .../Makefile | 0 .../manage.py | 0 .../webproj/__init__.py | 0 .../webproj/adminModelRegister.py | 0 .../webproj/asgi.py | 0 .../webproj/settings.py | 0 .../webproj/stackOverflowSnippets.py | 0 .../webproj/thumbnailer/__init__.py | 0 .../webproj/thumbnailer/admin.py | 0 .../webproj/thumbnailer/apps.py | 0 .../management/commands/dumpresults.py | 0 .../management/commands/loadhashes.py | 0 .../thumbnailer/migrations/0001_initial.py | 0 .../thumbnailer/migrations/__init__.py | 0 .../webproj/thumbnailer/models.py | 0 .../webproj/thumbnailer/tests.py | 0 .../webproj/thumbnailer/urls.py | 0 .../webproj/thumbnailer/views.py | 0 .../webproj/urls.py | 0 .../webproj/wsgi.py | 0 .../worker.py | 0 .../worker_thumbnailer.py | 0 reddit_imgs/{cachedhash.py => _cachedhash.py} | 0 reddit_imgs/{fetch.py => _fetch.py} | 0 reddit_imgs/{hashit.py => _hashit.py} | 0 .../{normalizetobmp.py => _normalizetobmp.py} | 0 ...oimagehash.py => _normalizetoimagehash.py} | 0 .../{thumbnailize.py => _thumbnailize.py} | 0 reddit_imgs/{wallpapers.py => _wallpapers.py} | 0 reddit_imgs/display_fetch_futures.py | 9 +- reddit_imgs/fetch2.py | 313 ++++++++++++------ reddit_imgs/hashit2.py | 22 +- reddit_imgs/linguisticdictanal.py | 160 +++++++++ reddit_imgs/runner.py | 183 ++-------- reddit_imgs/sizebysubreddit.py | 69 ++++ reddit_imgs/sync.py | 212 ++++++++---- reddit_imgs/system/cmdline_parser.py | 65 +++- reddit_imgs/system/simpleDownloader.py | 57 ++-- reddit_imgs/system/subredditTools.py | 117 ++++--- reddit_imgs/wallpapers2.py | 129 ++++++++ redditgetterunshared.py | 7 + 45 files changed, 994 insertions(+), 415 deletions(-) create mode 100755 cron_prerun_kill.sh create mode 100755 getlinkof.py rename {hash_thumbnailer_distributed => hash_compressor_distributed}/.gitignore (100%) rename {hash_thumbnailer_distributed => hash_compressor_distributed}/Makefile (100%) rename {hash_thumbnailer_distributed => hash_compressor_distributed}/manage.py (100%) rename {hash_thumbnailer_distributed => hash_compressor_distributed}/webproj/__init__.py (100%) rename {hash_thumbnailer_distributed => hash_compressor_distributed}/webproj/adminModelRegister.py (100%) rename {hash_thumbnailer_distributed => hash_compressor_distributed}/webproj/asgi.py (100%) rename {hash_thumbnailer_distributed => hash_compressor_distributed}/webproj/settings.py (100%) rename {hash_thumbnailer_distributed => hash_compressor_distributed}/webproj/stackOverflowSnippets.py (100%) rename {hash_thumbnailer_distributed => hash_compressor_distributed}/webproj/thumbnailer/__init__.py (100%) rename {hash_thumbnailer_distributed => hash_compressor_distributed}/webproj/thumbnailer/admin.py (100%) rename {hash_thumbnailer_distributed => hash_compressor_distributed}/webproj/thumbnailer/apps.py (100%) rename {hash_thumbnailer_distributed => hash_compressor_distributed}/webproj/thumbnailer/management/commands/dumpresults.py (100%) rename {hash_thumbnailer_distributed => hash_compressor_distributed}/webproj/thumbnailer/management/commands/loadhashes.py (100%) rename {hash_thumbnailer_distributed => hash_compressor_distributed}/webproj/thumbnailer/migrations/0001_initial.py (100%) rename {hash_thumbnailer_distributed => hash_compressor_distributed}/webproj/thumbnailer/migrations/__init__.py (100%) rename {hash_thumbnailer_distributed => 
hash_compressor_distributed}/webproj/thumbnailer/models.py (100%) rename {hash_thumbnailer_distributed => hash_compressor_distributed}/webproj/thumbnailer/tests.py (100%) rename {hash_thumbnailer_distributed => hash_compressor_distributed}/webproj/thumbnailer/urls.py (100%) rename {hash_thumbnailer_distributed => hash_compressor_distributed}/webproj/thumbnailer/views.py (100%) rename {hash_thumbnailer_distributed => hash_compressor_distributed}/webproj/urls.py (100%) rename {hash_thumbnailer_distributed => hash_compressor_distributed}/webproj/wsgi.py (100%) rename {hash_thumbnailer_distributed => hash_compressor_distributed}/worker.py (100%) rename {hash_thumbnailer_distributed => hash_compressor_distributed}/worker_thumbnailer.py (100%) rename reddit_imgs/{cachedhash.py => _cachedhash.py} (100%) rename reddit_imgs/{fetch.py => _fetch.py} (100%) rename reddit_imgs/{hashit.py => _hashit.py} (100%) rename reddit_imgs/{normalizetobmp.py => _normalizetobmp.py} (100%) rename reddit_imgs/{normalizetoimagehash.py => _normalizetoimagehash.py} (100%) rename reddit_imgs/{thumbnailize.py => _thumbnailize.py} (100%) rename reddit_imgs/{wallpapers.py => _wallpapers.py} (100%) create mode 100644 reddit_imgs/linguisticdictanal.py create mode 100644 reddit_imgs/sizebysubreddit.py create mode 100644 reddit_imgs/wallpapers2.py create mode 100755 redditgetterunshared.py diff --git a/.gitignore b/.gitignore index ae5bf1f..65278e5 100644 --- a/.gitignore +++ b/.gitignore @@ -15,21 +15,14 @@ r_gdl* r_gdl*/** i_c i_c/** -i_c.json -i_cde.json +i_*.json fetch_missing.json -i_he.json -i_c_h.json most_repeated_hashes.json display_fetch_futures.trace i_h i_h/** -i_h.json -i_hc.json -i_hs.json i_h_n i_h_n/** -i_h_n.json i_t i_t/** **/*.pyc @@ -45,3 +38,5 @@ ignored.txt .vscode/** .mypy_cache .mypy_cache/** +del/** +del diff --git a/cron_prerun_kill.sh b/cron_prerun_kill.sh new file mode 100755 index 0000000..1c2c635 --- /dev/null +++ b/cron_prerun_kill.sh @@ -0,0 +1,32 @@ +#!/bin/bash +attempt_interrupt() { + export TEST="$(ps -aux | grep './redditgetter.py' | grep -v grep | grep -v bash | sed -e 's/ */ /g' | cut -d' ' -f2)"; + if [ -n "$TEST" ] ; then + echo "Killing..."; + echo "$TEST" | xargs -rl1 -- kill -2; + sleep 1; + fi +} +force_interrupt() { + export TEST="$(ps -aux | grep './redditgetter.py' | grep -v grep | grep -v bash | sed -e 's/ */ /g' | cut -d' ' -f2)"; + if [ -n "$TEST" ] ; then + echo "Force-Killing..."; + echo "$TEST" | xargs -rl1 -- kill -15; + sleep 1; + fi +} +attempt_interrupt +attempt_interrupt +attempt_interrupt + +attempt_interrupt +attempt_interrupt +attempt_interrupt + +attempt_interrupt +attempt_interrupt +attempt_interrupt + +attempt_interrupt + +force_interrupt diff --git a/getlinkof.py b/getlinkof.py new file mode 100755 index 0000000..912b1fc --- /dev/null +++ b/getlinkof.py @@ -0,0 +1,23 @@ +#!/usr/bin/env python3 +# -*- encoding: utf-8 -*- + +import json +from pathlib import Path +import sys + + +def main(): + if not Path('r_gdl_lbe.json').exists(): + print('"r_gdl_lbe.json" does not exist') + elif len(sys.argv) != 3: + print('Usage:') + print(f' {sys.argv[0]} ') + else: + dldr = sys.argv[1] + rlnk = int(sys.argv[2]) + data = json.loads(Path('r_gdl_lbe.json').read_bytes()) + print(data[dldr][-rlnk]) + + +if __name__ == '__main__': + main() diff --git a/hash_thumbnailer_distributed/.gitignore b/hash_compressor_distributed/.gitignore similarity index 100% rename from hash_thumbnailer_distributed/.gitignore rename to hash_compressor_distributed/.gitignore diff --git 
a/hash_thumbnailer_distributed/Makefile b/hash_compressor_distributed/Makefile similarity index 100% rename from hash_thumbnailer_distributed/Makefile rename to hash_compressor_distributed/Makefile diff --git a/hash_thumbnailer_distributed/manage.py b/hash_compressor_distributed/manage.py similarity index 100% rename from hash_thumbnailer_distributed/manage.py rename to hash_compressor_distributed/manage.py diff --git a/hash_thumbnailer_distributed/webproj/__init__.py b/hash_compressor_distributed/webproj/__init__.py similarity index 100% rename from hash_thumbnailer_distributed/webproj/__init__.py rename to hash_compressor_distributed/webproj/__init__.py diff --git a/hash_thumbnailer_distributed/webproj/adminModelRegister.py b/hash_compressor_distributed/webproj/adminModelRegister.py similarity index 100% rename from hash_thumbnailer_distributed/webproj/adminModelRegister.py rename to hash_compressor_distributed/webproj/adminModelRegister.py diff --git a/hash_thumbnailer_distributed/webproj/asgi.py b/hash_compressor_distributed/webproj/asgi.py similarity index 100% rename from hash_thumbnailer_distributed/webproj/asgi.py rename to hash_compressor_distributed/webproj/asgi.py diff --git a/hash_thumbnailer_distributed/webproj/settings.py b/hash_compressor_distributed/webproj/settings.py similarity index 100% rename from hash_thumbnailer_distributed/webproj/settings.py rename to hash_compressor_distributed/webproj/settings.py diff --git a/hash_thumbnailer_distributed/webproj/stackOverflowSnippets.py b/hash_compressor_distributed/webproj/stackOverflowSnippets.py similarity index 100% rename from hash_thumbnailer_distributed/webproj/stackOverflowSnippets.py rename to hash_compressor_distributed/webproj/stackOverflowSnippets.py diff --git a/hash_thumbnailer_distributed/webproj/thumbnailer/__init__.py b/hash_compressor_distributed/webproj/thumbnailer/__init__.py similarity index 100% rename from hash_thumbnailer_distributed/webproj/thumbnailer/__init__.py rename to hash_compressor_distributed/webproj/thumbnailer/__init__.py diff --git a/hash_thumbnailer_distributed/webproj/thumbnailer/admin.py b/hash_compressor_distributed/webproj/thumbnailer/admin.py similarity index 100% rename from hash_thumbnailer_distributed/webproj/thumbnailer/admin.py rename to hash_compressor_distributed/webproj/thumbnailer/admin.py diff --git a/hash_thumbnailer_distributed/webproj/thumbnailer/apps.py b/hash_compressor_distributed/webproj/thumbnailer/apps.py similarity index 100% rename from hash_thumbnailer_distributed/webproj/thumbnailer/apps.py rename to hash_compressor_distributed/webproj/thumbnailer/apps.py diff --git a/hash_thumbnailer_distributed/webproj/thumbnailer/management/commands/dumpresults.py b/hash_compressor_distributed/webproj/thumbnailer/management/commands/dumpresults.py similarity index 100% rename from hash_thumbnailer_distributed/webproj/thumbnailer/management/commands/dumpresults.py rename to hash_compressor_distributed/webproj/thumbnailer/management/commands/dumpresults.py diff --git a/hash_thumbnailer_distributed/webproj/thumbnailer/management/commands/loadhashes.py b/hash_compressor_distributed/webproj/thumbnailer/management/commands/loadhashes.py similarity index 100% rename from hash_thumbnailer_distributed/webproj/thumbnailer/management/commands/loadhashes.py rename to hash_compressor_distributed/webproj/thumbnailer/management/commands/loadhashes.py diff --git a/hash_thumbnailer_distributed/webproj/thumbnailer/migrations/0001_initial.py 
b/hash_compressor_distributed/webproj/thumbnailer/migrations/0001_initial.py similarity index 100% rename from hash_thumbnailer_distributed/webproj/thumbnailer/migrations/0001_initial.py rename to hash_compressor_distributed/webproj/thumbnailer/migrations/0001_initial.py diff --git a/hash_thumbnailer_distributed/webproj/thumbnailer/migrations/__init__.py b/hash_compressor_distributed/webproj/thumbnailer/migrations/__init__.py similarity index 100% rename from hash_thumbnailer_distributed/webproj/thumbnailer/migrations/__init__.py rename to hash_compressor_distributed/webproj/thumbnailer/migrations/__init__.py diff --git a/hash_thumbnailer_distributed/webproj/thumbnailer/models.py b/hash_compressor_distributed/webproj/thumbnailer/models.py similarity index 100% rename from hash_thumbnailer_distributed/webproj/thumbnailer/models.py rename to hash_compressor_distributed/webproj/thumbnailer/models.py diff --git a/hash_thumbnailer_distributed/webproj/thumbnailer/tests.py b/hash_compressor_distributed/webproj/thumbnailer/tests.py similarity index 100% rename from hash_thumbnailer_distributed/webproj/thumbnailer/tests.py rename to hash_compressor_distributed/webproj/thumbnailer/tests.py diff --git a/hash_thumbnailer_distributed/webproj/thumbnailer/urls.py b/hash_compressor_distributed/webproj/thumbnailer/urls.py similarity index 100% rename from hash_thumbnailer_distributed/webproj/thumbnailer/urls.py rename to hash_compressor_distributed/webproj/thumbnailer/urls.py diff --git a/hash_thumbnailer_distributed/webproj/thumbnailer/views.py b/hash_compressor_distributed/webproj/thumbnailer/views.py similarity index 100% rename from hash_thumbnailer_distributed/webproj/thumbnailer/views.py rename to hash_compressor_distributed/webproj/thumbnailer/views.py diff --git a/hash_thumbnailer_distributed/webproj/urls.py b/hash_compressor_distributed/webproj/urls.py similarity index 100% rename from hash_thumbnailer_distributed/webproj/urls.py rename to hash_compressor_distributed/webproj/urls.py diff --git a/hash_thumbnailer_distributed/webproj/wsgi.py b/hash_compressor_distributed/webproj/wsgi.py similarity index 100% rename from hash_thumbnailer_distributed/webproj/wsgi.py rename to hash_compressor_distributed/webproj/wsgi.py diff --git a/hash_thumbnailer_distributed/worker.py b/hash_compressor_distributed/worker.py similarity index 100% rename from hash_thumbnailer_distributed/worker.py rename to hash_compressor_distributed/worker.py diff --git a/hash_thumbnailer_distributed/worker_thumbnailer.py b/hash_compressor_distributed/worker_thumbnailer.py similarity index 100% rename from hash_thumbnailer_distributed/worker_thumbnailer.py rename to hash_compressor_distributed/worker_thumbnailer.py diff --git a/reddit_imgs/cachedhash.py b/reddit_imgs/_cachedhash.py similarity index 100% rename from reddit_imgs/cachedhash.py rename to reddit_imgs/_cachedhash.py diff --git a/reddit_imgs/fetch.py b/reddit_imgs/_fetch.py similarity index 100% rename from reddit_imgs/fetch.py rename to reddit_imgs/_fetch.py diff --git a/reddit_imgs/hashit.py b/reddit_imgs/_hashit.py similarity index 100% rename from reddit_imgs/hashit.py rename to reddit_imgs/_hashit.py diff --git a/reddit_imgs/normalizetobmp.py b/reddit_imgs/_normalizetobmp.py similarity index 100% rename from reddit_imgs/normalizetobmp.py rename to reddit_imgs/_normalizetobmp.py diff --git a/reddit_imgs/normalizetoimagehash.py b/reddit_imgs/_normalizetoimagehash.py similarity index 100% rename from reddit_imgs/normalizetoimagehash.py rename to 
reddit_imgs/_normalizetoimagehash.py diff --git a/reddit_imgs/thumbnailize.py b/reddit_imgs/_thumbnailize.py similarity index 100% rename from reddit_imgs/thumbnailize.py rename to reddit_imgs/_thumbnailize.py diff --git a/reddit_imgs/wallpapers.py b/reddit_imgs/_wallpapers.py similarity index 100% rename from reddit_imgs/wallpapers.py rename to reddit_imgs/_wallpapers.py diff --git a/reddit_imgs/display_fetch_futures.py b/reddit_imgs/display_fetch_futures.py index 2ccaecb..15a0950 100644 --- a/reddit_imgs/display_fetch_futures.py +++ b/reddit_imgs/display_fetch_futures.py @@ -217,12 +217,15 @@ def print_terminal(workers_state_path: Path, keep_to_next_cycle=None): colored.bg(clr) for clr in bg_rank_color_names ] - bg_rank = bg_rank[-(max( + + bg_rank_size = max( 1, state_stats.get('running', 0) + state_stats.get('scrubbing', 0) - )):] + ) + bg_rank = bg_rank[-bg_rank_size:] bg_rang_programmed_len = len(bg_rank) - bg_rank += ['']*(len(jobs_dates)-len(bg_rank)) + bg_rang_programmed_len = bg_rank_size + bg_rank += [colored.bg('black')] * (len(jobs_dates) - len(bg_rank)) # jobs_timestamps = keep_to_next_cycle.get( # 'jobs_timestamps', dict()) diff --git a/reddit_imgs/fetch2.py b/reddit_imgs/fetch2.py index b58750e..4fde6c7 100644 --- a/reddit_imgs/fetch2.py +++ b/reddit_imgs/fetch2.py @@ -9,10 +9,11 @@ import shutil import subprocess import sys import traceback +from collections import OrderedDict from concurrent.futures import ProcessPoolExecutor as PoolExecutor from io import StringIO from pathlib import Path -from typing import List, Optional, Set, Tuple, Type +from typing import Dict, List, Optional, Set, Tuple, Type import colored as clrlib import gallery_dl @@ -40,7 +41,7 @@ FORBIDDEN_WORKER_SPLITS = { } MAX_WORKERS = 12 -SPLIT_WORKER_AFTER_N_LINKS = 1000 +SPLIT_WORKER_AFTER_N_LINKS = 10000 USE_FIREFOX_COOKIES = True DEBUG_WORKER = None IGNORE_WORKERS = set() @@ -63,6 +64,28 @@ GDL_ERRORS = [ ] GDL_ERRORS_DICT = {(1 << k): v for k, v in enumerate(GDL_ERRORS)} +HTML_SPECIAL_CHARS_REPLACE: List[Tuple[str, str]] = [ + ('&', '&'), + ('<', '<'), + ('>', '>'), + ('"', '"'), + (''', '\''), +] +HTML_SPECIAL_CHARS: List[str] = list(map(lambda a: a[0], HTML_SPECIAL_CHARS_REPLACE)) + + +def contains_any(s: str, l: List[str]) -> bool: + for i in l: + if i in s: + return True + return False + + +def replace_many(s: str, l: List[Tuple[str, str]]) -> str: + for f, t in l: + s = s.replace(f, t) + return s + def cmdline(encoded_args: str = None): if encoded_args is None: @@ -137,7 +160,7 @@ def run_with_config(redownload_empties: bool = False, return main() -def main(): +def prerrun(): subreddit_data_path = Path('r.json') if not subreddit_data_path.exists(): print("Executing prerrequisite...") @@ -148,6 +171,7 @@ def main(): 'reddit_imgs/get_firefox_cookies.sh', 'i_gdl/.cookies'], ).check_returncode() + subreddit_filters_path = Path('rf.json') print('Loading posts from disk...') Path('i_gdl').mkdir(exist_ok=True, parents=True) workers_state_path = Path('i_gdl_w') @@ -157,99 +181,16 @@ def main(): if STOP_JOBS_FLAG_PATH.exists(): STOP_JOBS_FLAG_PATH.unlink() subreddit_data = json.loads(subreddit_data_path.read_text()) - links = dict() + subreddit_filters = json.loads(subreddit_filters_path.read_bytes()) print('Loading posts...') - postsl = [ - {'subreddit': subreddit, **post} - for subreddit, srdt in subreddit_data.items() - for post in srdt['links'] - ] - postsd = dict() - for post in postsl: - dk = post['datakey'] - if dk not in postsd: - postsd[dk] = post.copy() - postsd[dk]['subreddits'] = list() - 
postsd[dk]['links'] = list() - del postsd[dk]['subreddit'] - del postsd[dk]['link'] - del postsd[dk]['domain'] - if (sr := post['subreddit']) not in (srs := postsd[dk]['subreddits']): - srs.append(sr) - if (lnk := get_normalized_link(post['link'])) not in (lnks := postsd[dk]['links']): - lnks.append(lnk) - posts = postsd - del postsl - del postsd + posts = prerrun_flatten_subreddits_into_posts(subreddit_data, subreddit_filters) print(f'{len(posts)} posts identified.') print(f'Identifying alternative trivial links...') - for post in posts.values(): - dk = post['datakey'] - post_links = post['links'] - has_changed_any_link = True - while has_changed_any_link: - has_changed_any_link = False - for link in post_links: - if '' in link: - for linkcopy in search_urls(link): - linkcopy = get_normalized_link(linkcopy) - if linkcopy not in post_links: - post_links.append(linkcopy) - has_changed_any_link = True - while link in post_links: - post_links.remove(link) - has_changed_any_link = True - else: - linkcopy = link - linkcopy = get_normalized_link(linkcopy) - if linkcopy not in post_links: - post_links.append(linkcopy) - has_changed_any_link = True - while linkcopy[-1:] in ('/', '#', '?'): - linkcopy = linkcopy[:-1] - linkcopy = get_normalized_link(linkcopy) - if linkcopy not in post_links: - post_links.append(linkcopy) - has_changed_any_link = True - if '?' in link: - linkcopy = link.split('?', 1)[0] - linkcopy = get_normalized_link(linkcopy) - if linkcopy not in post_links: - post_links.append(linkcopy) - has_changed_any_link = True - if '#' in link: - linkcopy = link.split('#', 1)[0] - linkcopy = get_normalized_link(linkcopy) - if linkcopy not in post_links: - post_links.append(linkcopy) - has_changed_any_link = True - if link == '': - while link in post_links: - post_links.remove(link) - has_changed_any_link = True - if link.startswith('/'): - while link in post_links: - post_links.remove(link) - has_changed_any_link = True - if link.startswith('#'): - while link in post_links: - post_links.remove(link) - has_changed_any_link = True - if link.startswith('mailto'): - while link in post_links: - post_links.remove(link) - has_changed_any_link = True - if (proto := (tpl := link.split(':', 1))[0]).lower() in ('http', 'https') and proto not in ('http', 'https'): - lst = list(tpl) - lst[0] = lst[0].lower() - linkcopy = ':'.join(lst) - post_links.remove(link) - if linkcopy not in post_links: - post_links.append(linkcopy) - has_changed_any_link = True + prerrun_posts_re_sort(posts) Path('r_gdl_p.json').write_text( json.dumps(posts, indent=1, sort_keys=True)) print(f'Grouping links with the posts they show up in...') + links = OrderedDict() for dk, post in posts.items(): for link in post['links']: if link not in links: @@ -257,6 +198,7 @@ def main(): links[link].append(dk) Path('r_gdl_lp.json').write_text( json.dumps(links, indent=1, sort_keys=True)) + known_link_set = set(links.keys()) print(f'{len(links)} links found') print(f'Checking if there is an extractor for each link...') r_gdl_le_path = Path('r_gdl_le.json') @@ -276,6 +218,8 @@ def main(): link_extractors[link] = (type(ext).category if ext is not None else '') + for discarded_link in set(link_extractors.keys()).difference(known_link_set): + del link_extractors[discarded_link] r_gdl_le_path.write_text(json.dumps( link_extractors, indent=1, sort_keys=True)) links_by_extractor = { @@ -301,13 +245,13 @@ def main(): print(f'{len(links)-len(not_downloadable_link_set)} downloadable links found') print(f'{len(not_downloadable_link_set)} undownloadable 
links found') print(f'{len(links_by_extractor)} extractors found') - Path('r_gdl_lbe.json').write_text(json.dumps(links_by_extractor, indent=1)) + Path('r_gdl_lbe.json').write_text(json.dumps(links_by_extractor, indent=1, sort_keys=True)) - files_from_links = dict() - links_no_files = list() - files_sizes = dict() - link_statuses = dict() - ignored_links = set() + files_from_links: Dict[str, List[str]] = dict() + links_no_files: List[str] = list() + files_sizes: Dict[str, int] = dict() + link_statuses: Dict[str, int] = dict() + ignored_links: Set[str] = set() if (pth := Path('i_gdl_ffl.json')).exists(): try: @@ -333,6 +277,32 @@ def main(): except: pass + for discarded_link in set(links_no_files).difference(known_link_set): + links_no_files.remove(discarded_link) + discarded_files = set() + for discarded_link in set(files_from_links.keys()).difference(known_link_set): + if discarded_link in files_from_links: + files_in_link = files_from_links[discarded_link] + for file_in_link in files_in_link: + discarded_files.add(file_in_link) + if discarded_link in link_statuses: + del link_statuses[discarded_link] + del files_from_links[discarded_link] + files_to_keep = set() + for files_from_link in files_from_links.values(): + for file_from_link in files_from_link: + if file_from_link not in files_to_keep: + files_to_keep.add(file_from_link) + for discarded_file in discarded_files.difference(files_to_keep): + if discarded_file in files_sizes: + del files_sizes[discarded_file] + for missing_file_size in files_to_keep.difference(set(files_sizes.keys())): + p = Path(missing_file_size) + if not p.exists(): + raise FileNotFoundError(missing_file_size) + else: + files_sizes[missing_file_size] = p.stat().st_size + print('Re-filled files_sizes for %r' % p) if (p := Path('i_gdl_ignored.txt')).exists(): ignored_links = set(list(filter(len, p.read_text().splitlines()))) @@ -421,6 +391,155 @@ def main(): if worker != '' ] print(f'{len(links_to_worker)} workers to be spawned') + return (files_from_links, + links_no_files, + files_sizes, + link_statuses, + workers_nicely_grouped, + workers_state_path, + links_to_worker, + ) + + +def prerrun_flatten_subreddits_into_posts(subreddit_data, subreddit_filters): + postsl = [ + {'subreddit': subreddit, **post} + for subreddit, srdt in subreddit_data.items() + for post in srdt['links'] + ] + postsl.sort(key=lambda a: (-a['timestamp'], a['datakey'])) + postsd = dict() + for post in postsl: + dk = post['datakey'] + sr = post['subreddit'] + if subreddit_filters['no_download'][sr]: + continue + if subreddit_filters['no_sfw'][sr] and not post['nsfw']: + continue + if subreddit_filters['no_nsfw'][sr] and post['nsfw']: + continue + if dk not in postsd: + postsd[dk] = post.copy() + postsd[dk]['subreddits'] = list() + postsd[dk]['links'] = list() + del postsd[dk]['subreddit'] + del postsd[dk]['link'] + del postsd[dk]['domain'] + if (sr := post['subreddit']) not in (srs := postsd[dk]['subreddits']): + srs.append(sr) + if (lnk := get_normalized_link(post['link'])) not in (lnks := postsd[dk]['links']): + lnks.append(lnk) + return postsd + + +def prerrun_posts_re_sort(posts): + for post in sorted(posts.values(), key=lambda a: (-a['timestamp'], a['datakey'])): + post['subreddits'].sort() + dk = post['datakey'] + post_links = post['links'] + has_changed_any_link = True + while has_changed_any_link: + has_changed_any_link = False + for link in post_links: + if '' in link: + for linkcopy in search_urls(link): + linkcopy = get_normalized_link(linkcopy) + linkcopy = replace_many(linkcopy, 
HTML_SPECIAL_CHARS_REPLACE) + if linkcopy not in post_links: + post_links.append(linkcopy) + has_changed_any_link = True + while link in post_links: + post_links.remove(link) + has_changed_any_link = True + break + else: + linkcopy = link + linkcopy = get_normalized_link(linkcopy) + if linkcopy not in post_links: + post_links.append(linkcopy) + has_changed_any_link = True + break + if '?' in link: + linkcopy = link.split('?', 1)[0] + linkcopy = get_normalized_link(linkcopy) + if linkcopy not in post_links: + post_links.append(linkcopy) + has_changed_any_link = True + break + if '#' in link: + linkcopy = link.split('#', 1)[0] + linkcopy = get_normalized_link(linkcopy) + if linkcopy not in post_links: + post_links.append(linkcopy) + has_changed_any_link = True + break + if contains_any(linkcopy, HTML_SPECIAL_CHARS): + linkcopy = replace_many(linkcopy, HTML_SPECIAL_CHARS_REPLACE) + if linkcopy not in post_links: + post_links.append(linkcopy) + has_changed_any_link = True + break + if linkcopy[-1:] in ('/', '#', '?'): + while linkcopy[-1:] in ('/', '#', '?'): + linkcopy = linkcopy[:-1] + linkcopy = get_normalized_link(linkcopy) + if linkcopy not in post_links: + post_links.append(linkcopy) + has_changed_any_link = True + if link.strip() == '': + while link in post_links: + post_links.remove(link) + has_changed_any_link = True + break + if link.startswith('/'): + while link in post_links: + post_links.remove(link) + has_changed_any_link = True + break + if link.startswith('#'): + while link in post_links: + post_links.remove(link) + has_changed_any_link = True + break + if link.startswith('mailto'): + while link in post_links: + post_links.remove(link) + has_changed_any_link = True + break + if (proto := (tpl := link.split(':', 1))[0]).lower() in ('http', 'https') and proto not in ('http', 'https'): + lst = list(tpl) + lst[0] = lst[0].lower() + linkcopy = ':'.join(lst) + post_links.remove(link) + if linkcopy not in post_links: + post_links.append(linkcopy) + has_changed_any_link = True + break + post['links'] = list(filter(lambda link: ( + not link.startswith('https://preview.redd.it/') + or + ( + (('?width=' in link) or ('&width=' in link)) + and + (('?format=' in link) or ('&format=' in link)) + and + (('?auto=' in link) or ('&auto=' in link)) + and + (('?s=' in link) or ('&s=' in link)) + ) + ), post['links'])) + post['links'].sort() + + +def main(): + (files_from_links, + links_no_files, + files_sizes, + link_statuses, + workers_nicely_grouped, + workers_state_path, + links_to_worker, + ) = prerrun() configure_gdl() @@ -762,7 +881,7 @@ def configure_gdl(): '--write-unsupported=i_gdl_unsupported.txt', # '--quiet', *(['--verbose'] if DEBUG_WORKER else []), - '--retries=2', + '--retries=1', # '--retries=7', # '--limit-rate=1500k', ]) diff --git a/reddit_imgs/hashit2.py b/reddit_imgs/hashit2.py index 828d5ae..9ca3f41 100644 --- a/reddit_imgs/hashit2.py +++ b/reddit_imgs/hashit2.py @@ -1,9 +1,11 @@ #!/usr/bin/env python3 # -*- encoding: utf-8 -*- +import datetime import hashlib import json import multiprocessing +import time import traceback from pathlib import Path from typing import Any, Dict, FrozenSet, Generator, List, Set, TypeVar @@ -119,17 +121,31 @@ def main(): # raise Exception('-'*50) print(colored.stylize('Listening queues...', [ colored.fg('light_cyan'), colored.attr('bold')])) + timings: List[Tuple[float, float]] = list() with hashes_path.open('at') as hashes_handler: active_workers = WORKERS while active_workers > 0: file_hash: FileHash = hashes_queue.get() if file_hash is not 
None: hashed_size_bytes += files_size[file_hash.file] + progress_pct = hashed_size_bytes / max(1, total_file_size_bytes) + timings.append((progress_pct, time.time())) + while len(timings) > 128: + del timings[0] + end_prediction = '' + if len(timings) > 1: + dp = timings[0][0] - timings[-1][0] + dt = timings[0][1] - timings[-1][1] + secs_pred = (1 - progress_pct) * (dt / dp) + td = datetime.timedelta(seconds=secs_pred) + end_prediction = (f' - {td}' + + f' - {datetime.datetime.now() + td}') print(colored.stylize( - '%11.6f%% - %s of %s' % ( - 100*hashed_size_bytes / max(1, total_file_size_bytes), + '%11.6f%% - %s of %s%s' % ( + 100*progress_pct, format_power10(hashed_size_bytes), - total_file_size), + total_file_size, + end_prediction), [colored.fg('light_green'), colored.attr('bold')])) hashes_handler.write(f'{file_hash.hash}|{file_hash.file}\n') else: diff --git a/reddit_imgs/linguisticdictanal.py b/reddit_imgs/linguisticdictanal.py new file mode 100644 index 0000000..5fdf035 --- /dev/null +++ b/reddit_imgs/linguisticdictanal.py @@ -0,0 +1,160 @@ +#!/usr/bin/env python3 +# -*- encoding: utf-8 -*- + +import gc +import hashlib +import json +import multiprocessing +import traceback +from pathlib import Path +from typing import Any, Dict, FrozenSet, Generator, List, Set, Tuple, TypeVar + +import colored + +from .system.cmdline_parser import parse_cmdline +from .system.flattener import flatten_generator +from .system.format_file_size import format_power10 +from .system.hexhashof import hexhashof + +WORKERS = 4 + +T = TypeVar('T') + + +def cmdline(encoded_args: str = None): + if encoded_args is None: + return run_with_config() + else: + return parse_cmdline(run_with_config, encoded_args) + + +def run_with_config(max_workers: int = 4): + WORKERS = max_workers + main() + + +class AnnotatedText: + def __init__(self, text: str, attributes: Dict[str, float]): + self._text = text + self._attributes = attributes + + @property + def as_dict(self): + return dict(text=self._text, attributes=self._attributes) + + @classmethod + def build(cls, text: str, attributes: Dict[str, float]) -> 'AnnotatedText': + if text is None: + return None + else: + return cls(text, attributes) + + @classmethod + def build_dict(cls, text: str, attributes: Dict[str, float]) -> Dict[str, Any]: + if text is None: + return None + else: + return cls(text, attributes).as_dict + + +def main(): + reddit_posts_path = Path('r.json') + reddit_meta_path = Path('ri.json') + print('Loading posts...') + reddit_posts: dict = json.loads(reddit_posts_path.read_bytes()) + reddit_meta: dict = json.loads(reddit_meta_path.read_bytes()) + print('Building initial dictionary...') + dct_subreddit_level: Dict[str, List[AnnotatedText]] = dict() + subreddits: List[str] = sorted(list(set(reddit_posts.keys()).union(reddit_meta.keys()))) + current_data_keys = [*[subreddit for subreddit in subreddits], + 'subredditDisplayText', + 'subredditName', + 'subredditUrl', + 'subredditTitle', + 'subredditPublicDescription', + 'postFlair', + 'postTitle', + 'postSharer', + 'postLink', + 'nsfw', + ] + current_data = {dt: 0 for dt in current_data_keys} + for subreddit in subreddits: + if subreddit not in dct_subreddit_level: + dct_subreddit_level[subreddit] = list() + dct_this_subreddit: List[AnnotatedText] = dct_subreddit_level[subreddit] + subreddit_meta: dict = reddit_meta.get(subreddit, dict()) + subreddit_meta = subreddit_meta if subreddit_meta is not None else dict() + subreddit_posts: List[Dict[str, str]] = reddit_posts.get(subreddit, dict(links=list()))['links'] 
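# Illustrative sketch (not part of this patch): the add_to_lists(...) calls below build
# one feature row per text field. Assuming the AnnotatedText and dict_to_value_list
# helpers defined in this module, and a toy key order standing in for
# current_data_keys (one key per subreddit plus the field-indicator and nsfw keys),
# a single row would come out as:
#
#     keys = ('pics', 'funny', 'postTitle', 'nsfw')
#     base = {k: 0 for k in keys}
#     row = AnnotatedText.build_dict(
#         'Example post title',
#         dict_to_value_list({**base, 'pics': 1, 'postTitle': 1}, keys),
#     )
#     # row == {'text': 'Example post title', 'attributes': (1, 0, 1, 0)}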
+ subreddit_nsfw = ( + 1 + if subreddit_meta.get('definition', dict()).get('isNSFW', False) else + 0) + add_to_lists([dct_this_subreddit], + AnnotatedText.build_dict(subreddit_meta.get('definition', dict()).get('displayText'), + dict_to_value_list({'nsfw': subreddit_nsfw, **current_data, subreddit: 1, 'subredditDisplayText': 1, + }, current_data_keys))) + add_to_lists([dct_this_subreddit], + AnnotatedText.build_dict(subreddit_meta.get('definition', dict()).get('name'), + dict_to_value_list({'nsfw': subreddit_nsfw, **current_data, subreddit: 1, 'subredditName': 1, + }, current_data_keys))) + add_to_lists([dct_this_subreddit], + AnnotatedText.build_dict(subreddit_meta.get('definition', dict()).get('url'), + dict_to_value_list({'nsfw': subreddit_nsfw, **current_data, subreddit: 1, 'subredditUrl': 1, + }, current_data_keys))) + add_to_lists([dct_this_subreddit], + AnnotatedText.build_dict(subreddit_meta.get('definition', dict()).get('title'), + dict_to_value_list({'nsfw': subreddit_nsfw, **current_data, subreddit: 1, 'subredditTitle': 1, + }, current_data_keys))) + add_to_lists([dct_this_subreddit], + AnnotatedText.build_dict(subreddit_meta.get('about', dict()).get('publicDescription'), + dict_to_value_list({'nsfw': subreddit_nsfw, **current_data, subreddit: 1, 'subredditPublicDescription': 1, + }, current_data_keys))) + gc.collect() + print(subreddit, len(subreddit_posts)) + for seq, subreddit_post in enumerate(subreddit_posts): + post_nsfw = ( + 1 + if subreddit_post.get('nsfw', False) else + 0) + add_to_lists([dct_this_subreddit], + AnnotatedText.build_dict(subreddit_post.get('flair'), + dict_to_value_list({'nsfw': post_nsfw, **current_data, subreddit: 1, 'postFlair': 1, + }, current_data_keys))) + add_to_lists([dct_this_subreddit], + AnnotatedText.build_dict(subreddit_post.get('title'), + dict_to_value_list({'nsfw': post_nsfw, **current_data, subreddit: 1, 'postTitle': 1, + }, current_data_keys))) + add_to_lists([dct_this_subreddit], + AnnotatedText.build_dict(subreddit_post.get('sharer'), + dict_to_value_list({'nsfw': post_nsfw, **current_data, subreddit: 1, 'postSharer': 1, + }, current_data_keys))) + # add_to_lists([dct_this_subreddit], + # AnnotatedText.build_dict(subreddit_post.get('link'), + # dict_to_value_list({'nsfw': post_nsfw, **current_data, subreddit: 1, 'postLink': 1, + # }, current_data_keys))) + + +def dict_to_value_list(d: Dict[str, T], l: Tuple[str, ...]) -> Tuple[T, ...]: + return tuple([d[i] for i in l]) + + +def add_to_dicts(dicts: List[Dict[str, List[AnnotatedText]]], + key: str, + item: AnnotatedText, + insert_none: bool = False, + ): + if insert_none or item is not None: + for dct in dicts: + if key not in dct: + dct[key] = list() + dct[key].append(item) + + +def add_to_lists(lists: List[List[AnnotatedText]], + item: AnnotatedText, + insert_none: bool = False, + ): + if insert_none or item is not None: + for lst in lists: + lst.append(item) diff --git a/reddit_imgs/runner.py b/reddit_imgs/runner.py index 7342e74..2c32208 100755 --- a/reddit_imgs/runner.py +++ b/reddit_imgs/runner.py @@ -1,23 +1,22 @@ #!/usr/bin/env python3 # -*- encoding: utf-8 -*- -import reddit_imgs.sync -import reddit_imgs.fetch -import reddit_imgs.fetch2 -import reddit_imgs.reorganize -import reddit_imgs.wallpapers -import reddit_imgs.thumbnailize -import reddit_imgs.hashit -import reddit_imgs.hashit2 -import reddit_imgs.normalizetobmp -import reddit_imgs.cachedhash -import reddit_imgs.download_pruner -import reddit_imgs.suggest_subreddits_from_links -import reddit_imgs.condensate_hashes - 
import os -import sys import shutil +import subprocess +import sys +from pathlib import Path + +import reddit_imgs.condensate_hashes +import reddit_imgs.download_pruner +import reddit_imgs.fetch2 +import reddit_imgs.hashit2 +import reddit_imgs.linguisticdictanal +import reddit_imgs.sizebysubreddit +import reddit_imgs.suggest_subreddits_from_links +import reddit_imgs.sync +import reddit_imgs.wallpapers2 + wdir = os.path.abspath('.') @@ -32,138 +31,6 @@ def ensureFolderAvailability(): os.makedirs(os.path.join(wdir, 'r')) -def managesubreddits(): - i = '' - while i != '0': - print('\n'*100) - print('----------------------------------------------') - print(' Subreddit Manager ') - print('----------------------------------------------') - print('1) List monitored subreddits') - print('2) Add monitored subreddit') - print('3) Remove monitored subreddit') - print('4) Set as wallpaper source') - print('5) Unset as wallpaper source') - print() - print('0) Back') - print('----------------------------------------------') - print() - print('Enter your choice:') - i = input() - i = i.strip() - print() - print() - subreddits_dir = os.path.join(wdir, 'r') - def subreddits_isfolder(sr): return os.path.isdir( - os.path.join(subreddits_dir, sr)) - subreddits = sorted( - filter(subreddits_isfolder, os.listdir(subreddits_dir))) - if i in ['1', '3', '4', '5']: - print('Subreddits monitored:') - for sr in subreddits: - print('/r/{0}'.format(sr), end='') - if os.path.isfile(os.path.join(subreddits_dir, sr, 'wallpaper.flag')): - print('\t\t(wallpaper)') - else: - print() - print() - if i == '1': - print('Press enter to continue') - input() - if i == '3': - print('Enter the subreddit you want to get rid of:') - rem = input('/r/') - try: - shutil.rmtree(os.path.join(subreddits_dir, rem)) - except: - pass - print() - print('Done.') - print('Press enter to continue') - input() - elif i == '2': - print('Enter the subreddit you want to add:') - add = input('/r/') - try: - os.makedirs(os.path.join(subreddits_dir, add)) - except: - pass - print() - print('Done.') - print('Press enter to continue') - input() - elif i == '4': - print('Enter the subreddit you want to set as wallpaper source:') - add = input('/r/') - try: - dd = os.path.join(subreddits_dir, add) - if not os.path.exists(dd): - os.makedirs(dd) - f = open(os.path.join(dd, 'wallpaper.flag'), 'w') - f.write('') - f.close() - except: - pass - print() - print('Done.') - print('Press enter to continue') - input() - elif i == '5': - print('Enter the subreddit you want to unset as wallpaper source:') - add = input('/r/') - try: - dd = os.path.join(subreddits_dir, add) - if not os.path.exists(dd): - os.makedirs(dd) - f = open(os.path.join(dd, 'wallpaper.flag'), 'w') - f.write('') - f.close() - os.remove(os.path.join(dd, 'wallpaper.flag')) - except: - pass - print() - print('Done.') - print('Press enter to continue') - input() - - -def mainmenu(): - i = '' - while i != '0': - print('\n'*100) - print('----------------------------------------------') - print(' Reddit Image Downloader ') - print('----------------------------------------------') - print('1) Manage subreddits') - print('2) Get link list to be downloaded from reddit') - print('3) Download grabbed links') - print('4) Organize by hashes') - print('5) Generate thumbnails') - print('6) Group and put nice names on downloaded data') - print('7) Sepparate wallpapers') - print() - print('0) Quit') - print('----------------------------------------------') - print() - print('Enter your choice:') - i = input() - i = 
i.strip() - if i == '1': - managesubreddits() - elif i == '2': - reddit_imgs.sync.main() - elif i == '3': - reddit_imgs.fetch.main() - elif i == '4': - reddit_imgs.hashit.main() - elif i == '5': - reddit_imgs.thumbnailize.main() - elif i == '6': - reddit_imgs.reorganize.main() - elif i == '7': - reddit_imgs.wallpapers.main() - - def main(): # ensureFolderAvailability() if len(sys.argv) > 1: @@ -172,7 +39,11 @@ def main(): mainmenu() -def cmdline(): +def main_unshared(): + cmdline(True) + + +def cmdline(run_each_on_subprocess=False): cmds = sys.argv[1:] available_commands = (( ('sync', reddit_imgs.sync.cmdline), @@ -181,12 +52,14 @@ def cmdline(): ('prune_downloads', reddit_imgs.download_pruner.cmdline), ('hashit', reddit_imgs.hashit2.cmdline), ('condensate_hashes', reddit_imgs.condensate_hashes.cmdline), + ('size_by_subreddit', reddit_imgs.sizebysubreddit.cmdline), + ('wallpapers', reddit_imgs.wallpapers2.cmdline), + ('linguistic_dictionary_analysis', reddit_imgs.linguisticdictanal.cmdline), # ('cachedhash', reddit_imgs.cachedhash.main), # ('hashit', reddit_imgs.hashit.main), # ('normalizetobmp', reddit_imgs.normalizetobmp.main), # ('thumbnailize', reddit_imgs.thumbnailize.main), # ('reorganize', reddit_imgs.reorganize.main), - # ('wallpapers', reddit_imgs.wallpapers.main), )) available_commands_names = tuple( list(map(lambda a: a[0], available_commands))) @@ -199,13 +72,15 @@ def cmdline(): command_ran = False for acmd in available_commands: if cmd.split(':', 1)[0] == acmd[0]: - x = cmd.split(':', 1) command_ran = True - fcmd = acmd[1] - if len(x) == 1: - fcmd() + if run_each_on_subprocess: + the_other_callable = Path(__file__).parent.parent.joinpath('redditgetter.py').absolute() + handler = subprocess.run([str(the_other_callable), cmd]) + handler.check_returncode() else: - fcmd(encoded_args=x[1]) + cmd_callable = acmd[1] + cmd_name, fcmd, *_ = *cmd.split(':', 1), '' + cmd_callable(encoded_args=fcmd) if not command_ran: print('Usage {0} [{1}]'.format(sys.argv[0], '/'.join(available_commands_names))) diff --git a/reddit_imgs/sizebysubreddit.py b/reddit_imgs/sizebysubreddit.py new file mode 100644 index 0000000..b7c9591 --- /dev/null +++ b/reddit_imgs/sizebysubreddit.py @@ -0,0 +1,69 @@ +#!/usr/bin/env python3 +# -*- encoding: utf-8 -*- + +import hashlib +import json +import multiprocessing +import traceback +from pathlib import Path +from typing import Any, Dict, FrozenSet, Generator, List, Set, TypeVar + +import colored + +from .system.cmdline_parser import parse_cmdline +from .system.flattener import flatten_generator +from .system.format_file_size import format_power10 +from .system.hexhashof import hexhashof +from .system.table_fmt import table_fmt + +WORKERS = 4 + +T = TypeVar('T') + + +def cmdline(encoded_args: str = None): + if encoded_args is None: + return run_with_config() + else: + return parse_cmdline(run_with_config, encoded_args) + + +def run_with_config(): + main() + + +def main(): + print('Loading files...') + files_for_link = json.loads(Path('i_gdl_ffl.json').read_bytes()) + posts_for_link = json.loads(Path('r_gdl_lp.json').read_bytes()) + file_sizes = json.loads(Path('i_gdl_fsz.json').read_bytes()) + posts_dict = json.loads(Path('r_gdl_p.json').read_bytes()) + subreddit_files: Dict[str, Set[str]] = dict() + subreddit_size: Dict[str, int] = dict() + ffl_sz = len(files_for_link) + print('Processing data...') + for idx, (link, files) in enumerate(files_for_link.items()): + if idx % 50000 == 0: + print(f'{idx+1} of {ffl_sz}') + post_ids = posts_for_link[link] + posts = 
[posts_dict[post_id] for post_id in post_ids] + subreddits = [subreddit for post in posts for subreddit in post['subreddits']] + for subreddit in subreddits: + if subreddit not in subreddit_files: + subreddit_files[subreddit] = set() + if subreddit not in subreddit_size: + subreddit_size[subreddit] = 0 + for file in files: + if file not in subreddit_files[subreddit]: + subreddit_files[subreddit].add(file) + if file in file_sizes: + subreddit_size[subreddit] += file_sizes[file] + else: + print('%r does not have a size' % file) + print('Printing...') + srst = sorted(subreddit_size.items(), key=lambda a: (a[1], a[0])) + print(table_fmt( + 'subreddit,disk usage'.split(','), + list(map(lambda a: (a[0], format_power10(a[1])), srst)), + alignment='^>' + )) diff --git a/reddit_imgs/sync.py b/reddit_imgs/sync.py index 6c739ea..4e73c44 100755 --- a/reddit_imgs/sync.py +++ b/reddit_imgs/sync.py @@ -7,7 +7,7 @@ from concurrent.futures import ProcessPoolExecutor as PoolExecutor from pathlib import Path from urllib.error import ContentTooShortError, HTTPError, URLError -from bs4 import BeautifulSoup as _BS +import colored as clrlib from .system import simpleDownloader from .system.cmdline_parser import parse_cmdline @@ -18,9 +18,6 @@ from .system.subredditTools import (GATEWAY_LINK_ARGS, build_gateway_link, MAX_WORKERS = 16 -def BeautifulSoup(data): return _BS(data, 'html5lib') - - def cmdline(encoded_args: str = None): if encoded_args is None: return run_with_config() @@ -41,45 +38,51 @@ simpleDownloader.setCookies({'over18': 1}) wdir = os.path.abspath('.') -def process_subreddit(subreddit): +def process_subreddit(subreddit, srdt, jsonPageSr): simpleDownloader.setCookies({'over18': 1}) srp = os.path.abspath(os.path.join(wdir, 'r', subreddit)) - # if subreddit!='yiff': continue nextpage = build_gateway_link(subreddit) - srdt = getEmptySubredditData(subreddit) - try: - with open(os.path.join(srp, 'subreddit.json')) as f: - srdt = json.loads(f.read()) - except BaseException: - pass - #srdt = getEmptySubredditData(subreddit) pageno = 0 ygst = srdt['date_first'] - jsonPageSr = None while nextpage: pageno += 1 - print(('/r/{0:<20} loading page #%05d' % pageno).format(subreddit)) - print(' >> %s' % (nextpage.replace(GATEWAY_LINK_ARGS, '[...]'),)) + print(clrlib.stylize(('/r/{0:<20} loading page #%05d' % pageno).format(subreddit), [ + clrlib.fg('light_yellow'), + ])) + print(clrlib.stylize(' >> %s' % (nextpage.replace(GATEWAY_LINK_ARGS, '[...]'),), [ + clrlib.fg('light_yellow'), clrlib.attr('dim'), + ])) redditBytes = None try: redditBytes = simpleDownloader.getUrlBytes(nextpage) except (HTTPError, URLError, ContentTooShortError): - print(('/r/{0:<20} loading page #%05d' % pageno).format(subreddit)) - print(" >> HTTP Error with code: Skipping...") + print(clrlib.stylize(('/r/{0:<20} loading page #%05d' % pageno).format(subreddit), [ + clrlib.fg('light_red'), clrlib.attr('bold'), + ])) + print(clrlib.stylize(" >> HTTP Error with code: Skipping...", [ + clrlib.fg('light_red'), clrlib.attr('bold'), clrlib.attr('dim'), + ])) break if redditBytes is None: - print(('/r/{0:<20} loading page #%05d' % pageno).format(subreddit)) - print(" >> HTTP Error: Skipping...") + print(clrlib.stylize(('/r/{0:<20} loading page #%05d' % pageno).format(subreddit), [ + clrlib.fg('light_red'), clrlib.attr('bold'), + ])) + print(clrlib.stylize(" >> HTTP Error: Skipping...", [ + clrlib.fg('light_red'), clrlib.attr('bold'), clrlib.attr('dim'), + ])) break - # bs = BeautifulSoup(redditBytes) jsonPage = json.loads(redditBytes) 
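# Illustrative sketch (not part of this patch), assuming the helpers already used in
# this module (build_gateway_link, getSubredditPageJsonInfo, simpleDownloader): the
# pagination flow that the surrounding process_subreddit loop implements, with the
# retry, colouring and metadata handling stripped out. fetch_new_posts is a
# hypothetical name.
def fetch_new_posts(subreddit, srdt):
    nextpage = build_gateway_link(subreddit)
    pageno = 0
    while nextpage:
        pageno += 1
        # gateway JSON page for the subreddit listing, newest posts first
        page = json.loads(simpleDownloader.getUrlBytes(nextpage))
        first, last, nextpage, links = getSubredditPageJsonInfo(page, subreddit, pageno)
        srdt['links'] += links
        if srdt['date_first'] >= first:
            # the newest stored post already covers this page: up to date, stop paging
            break
    return srdt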
getSubredditPageJsonInfoResult = None try: getSubredditPageJsonInfoResult = ( getSubredditPageJsonInfo(jsonPage, subreddit, pageno)) except IndexError: - print(('/r/{0:<20} loading page #%05d' % pageno).format(subreddit)) - print(" >> Empty subreddit: Skipping...") + print(clrlib.stylize(('/r/{0:<20} loading page #%05d' % pageno).format(subreddit), [ + clrlib.fg('light_gray'), clrlib.attr('dim'), + ])) + print(clrlib.stylize(" >> Empty subreddit: Skipping...", [ + clrlib.fg('light_gray'), clrlib.attr('dim'), + ])) break first, last, nextpage, links = getSubredditPageJsonInfoResult if ygst >= first: # if latest stored post is at same age or older than the latest downloaded post, then we are up-to-date @@ -101,62 +104,151 @@ def process_subreddit(subreddit): about=jsonPage['subredditAboutInfo'][srid], flair=jsonPage['postFlair'][srid], ) - with open(os.path.join(srp, 'subreddit.json'), 'w') as f: - f.write(json.dumps(srdt, sort_keys=True, indent=2)) - if jsonPageSr is not None: - with open(os.path.join(srp, 'meta.json'), 'w') as f: - f.write(json.dumps(jsonPageSr, sort_keys=True, indent=2)) + srdt['links'] = list(filter(lambda a: len(a['datakey']) < 20, srdt['links'])) + srdt['links'] = sorted(srdt['links'], key=lambda a: -a['timestamp']) + return (subreddit, srp, srdt, jsonPageSr) def main(): - build_summary() + print('Building summary...') + srs, srsi, srf = build_summary() + print('Download...') subreddits = sorted(filter(lambda sr: os.path.isdir( os.path.join(wdir, 'r', sr)), os.listdir(os.path.join(wdir, 'r')))) - with PoolExecutor(MAX_WORKERS) as pe: - q = list() - for subreddit in subreddits: - job = pe.submit(process_subreddit, subreddit) - q.append(job) - for job in q: - job.result() - build_summary() + print('Opening process pool...') + with PoolExecutor(MAX_WORKERS) as pe2: + def process_subreddit_done_callback_inner(job): + (subreddit, srp, srdt, jsonPageSr) = job.result() + del job + process_subreddit_done_callback(subreddit, srp, srdt, jsonPageSr, pe2, srs, srsi) + return + with PoolExecutor(MAX_WORKERS) as pe: + print('Opened process pool') + for subreddit in subreddits: + if subreddit not in srs: + srs[subreddit] = getEmptySubredditData(subreddit) + if subreddit not in srsi: + srsi[subreddit] = None + job = pe.submit( + process_subreddit, + subreddit, + srs[subreddit], + srsi[subreddit], + ) + job.add_done_callback(process_subreddit_done_callback_inner) + print('Closing process pool...') + print('Closed process pool') + print('Writing summary...') + write_summary(srs, srsi, srf) + print('Done') + + +def process_subreddit_done_callback(subreddit, srp, srdt, jsonPageSr, pe, srs, srsi): + srs[subreddit] = srdt + srsi[subreddit] = jsonPageSr + print(clrlib.stylize(f' @> Writing /r/{subreddit}', [ + clrlib.fg('light_cyan'), + ])) + job = pe.submit( + post_processing_saver, + subreddit, srp, srdt, jsonPageSr + ) + + +def post_processing_saver(subreddit, srp, srdt, jsonPageSr): + write_json(Path(os.path.join(srp, 'subreddit.json')), srdt, sort_keys=True) + if jsonPageSr is not None: + write_json(Path(os.path.join(srp, 'meta.json')), jsonPageSr, sort_keys=True) + print(clrlib.stylize(f' @> Written /r/{subreddit}', [ + clrlib.fg('light_green'), + ])) def build_summary(): rjpath = Path(wdir, 'r.json') rijpath = Path(wdir, 'ri.json') + rfpath = Path(wdir, 'rf.json') oldsrs = dict() oldsrsi = dict() if rjpath.exists(): - oldsrs = json.loads(rjpath.read_text()) + oldsrs = json.loads(rjpath.read_bytes()) if rijpath.exists(): - oldsrsi = json.loads(rijpath.read_text()) + oldsrsi = 
json.loads(rijpath.read_bytes()) srs = dict() srsi = dict() - for srp in Path(wdir, 'r').glob('*/subreddit.json'): - sr = srp.parent.name.lower() - srip = srp.parent.joinpath('meta.json') - try: - srs[sr] = json.loads(srp.read_text()) - except json.decoder.JSONDecodeError: - if sr not in oldsrs: - raise + nodownloadfilter = dict() + nosfwfilter = dict() + nonsfwfilter = dict() + wallpaperfilter = dict() + with PoolExecutor(MAX_WORKERS) as pe: + def on_data_read(job): + (sr, srp, srip, srd, srid, sripe) = job.result() + if srd is not None: + srs[sr] = srd else: - print('Restoring old data for corrupted subrredit %r' % sr) - srs[sr] = oldsrs[sr] - srp.write_text(json.dumps(oldsrs[sr], indent=1)) - if srip.exists(): - try: - srsi[sr] = json.loads(srip.read_text()) - except json.decoder.JSONDecodeError: - if sr not in oldsrsi: - raise + if sr not in oldsrs: + srp.unlink() else: print('Restoring old data for corrupted subrredit %r' % sr) - srsi[sr] = oldsrsi[sr] - srip.write_text(json.dumps(oldsrsi[sr], indent=1)) - rjpath.write_text(json.dumps(srs, indent=1)) - rijpath.write_text(json.dumps(srsi, indent=1)) + srs[sr] = oldsrs[sr] + srp.write_text(json.dumps(oldsrs[sr], indent=1)) + if sripe: + if srid is not None: + srsi[sr] = srid + else: + if sr not in oldsrsi: + srip.unlink() + else: + print('Restoring old data for corrupted subrredit %r' % sr) + srsi[sr] = oldsrsi[sr] + srip.write_text(json.dumps(oldsrsi[sr], indent=1)) + + for srp in Path(wdir, 'r').glob('*/subreddit.json'): + sr = srp.parent.name.lower() + nodownloadfilter[sr] = srp.parent.joinpath('nodownload.flag').exists() + nosfwfilter[sr] = srp.parent.joinpath('nosfw.flag').exists() + nonsfwfilter[sr] = srp.parent.joinpath('nonsfw.flag').exists() + wallpaperfilter[sr] = srp.parent.joinpath('wallpaper.flag').exists() + srip = srp.parent.joinpath('meta.json') + job = pe.submit(read_disk_summary, sr, srp, srip) + job.add_done_callback(on_data_read) + srf = dict( + no_download=nodownloadfilter, + no_sfw=nosfwfilter, + no_nsfw=nonsfwfilter, + wallpaper=wallpaperfilter, + ) + return srs, srsi, srf + + +def read_disk_summary(sr, srp, srip): + srd = None + srid = None + sripe = srip.exists() + try: + srd = json.loads(srp.read_bytes()) + except json.decoder.JSONDecodeError: + pass + if sripe: + try: + srid = json.loads(srip.read_bytes()) + except json.decoder.JSONDecodeError: + pass + return (sr, srp, srip, srd, srid, sripe) + + +def write_summary(srs, srsi, srf): + rjpath = Path(wdir, 'r.json') + rijpath = Path(wdir, 'ri.json') + rfpath = Path(wdir, 'rf.json') + with PoolExecutor(MAX_WORKERS) as pe: + pe.submit(write_json, rjpath, srs) + pe.submit(write_json, rijpath, srsi) + pe.submit(write_json, rfpath, srf) + + +def write_json(path, data, **kwargs): + path.write_text(json.dumps(data, indent=1, **kwargs)) if __name__ == '__main__': diff --git a/reddit_imgs/system/cmdline_parser.py b/reddit_imgs/system/cmdline_parser.py index f453d81..38d1bc1 100644 --- a/reddit_imgs/system/cmdline_parser.py +++ b/reddit_imgs/system/cmdline_parser.py @@ -4,6 +4,7 @@ import re from inspect import _empty, getfullargspec, signature from typing import Callable, Dict, List, Optional, Set, Type, TypeVar +import colored as clrlib from .table_fmt import table_fmt @@ -40,24 +41,39 @@ def parse_cmdline(func: Callable[..., T], encoded_args: str) -> Optional[T]: if k not in func_args} if encoded_args == 'help' or len(unknown_args) > 0: if len(unknown_args) > 0: - print('Unknown arguments found:') + print(clrlib.stylize('Unknown arguments found:', [ + 
            clrlib.fg('light_red'),
+            clrlib.attr('bold'),
+        ]))
         for k, v in unknown_args.items():
-            print(f' {k}: {repr(v)}')
+            print(clrlib.stylize(f' {k}: {repr(v)}', [
+                clrlib.fg('light_red'),
+            ]))
         print()
-        print(f'Usage help for: {func.__module__}.{func.__name__}')
+        print(clrlib.stylize(f'Usage help for: {func.__module__}.{func.__name__}', [
+            clrlib.fg('light_cyan'),
+            clrlib.attr('bold'),
+        ]))
         tbl = list()
-        for name, parameter in sig.parameters.items():
-            annotation = parameter.annotation if parameter.annotation != _empty else str
-            tbl.append((
-                str(name),
-                repr(annotation),
-                repr(parameter.default) if parameter.default != _empty else '-unset-',
-            ))
-        print(table_fmt(
-            'name,type,default'.split(','),
-            tbl,
-            alignment='^'*3,
-        ))
+        if len(sig.parameters) <= 0:
+            print(clrlib.stylize(' ' * 4 + 'No arguments accepted', [
+                clrlib.fg('light_cyan'),
+            ]))
+        else:
+            for name, parameter in sig.parameters.items():
+                annotation = parameter.annotation if parameter.annotation != _empty else str
+                tbl.append((
+                    str(name),
+                    repr(annotation),
+                    repr(parameter.default) if parameter.default != _empty else '-unset-',
+                ))
+            print(clrlib.stylize(space_left_pad_text(4, table_fmt(
+                'name,type,default'.split(','),
+                tbl,
+                alignment='^'*3,
+            )), [
+                clrlib.fg('light_cyan'),
+            ]))
         return None
     kwargs = dict()
     for key in str_args:
@@ -65,12 +81,21 @@ def parse_cmdline(func: Callable[..., T], encoded_args: str) -> Optional[T]:
             func_annotations.get(key, str),
             str_args[key]
         )
-    print(f'Calling {func.__module__}.{func.__name__} with arguments:')
+    print(clrlib.stylize(f'Calling {func.__module__}.{func.__name__} with arguments:', [
+        clrlib.fg('light_gray'),
+        clrlib.attr('dim'),
+    ]))
     if len(kwargs) <= 0:
-        print(' --- no arguments given ---')
+        print(clrlib.stylize(' --- no arguments given ---', [
+            clrlib.fg('light_gray'),
+            clrlib.attr('dim'),
+        ]))
     else:
         for k, v in kwargs.items():
-            print(f' {k}: {repr(v)}')
+            print(clrlib.stylize(f' {k}: {repr(v)}', [
+                clrlib.fg('light_gray'),
+                clrlib.attr('dim'),
+            ]))
     return func(**kwargs)
 
 
@@ -81,3 +106,7 @@ def convert_type(cls: Type[K], data: str) -> K:
     if cls not in (str, int, float):
         cls = eval
     return cls(data)
+
+
+def space_left_pad_text(qty: int, multiline_text: str) -> str:
+    return '\n'.join([' ' * qty + line for line in multiline_text.splitlines()])
diff --git a/reddit_imgs/system/simpleDownloader.py b/reddit_imgs/system/simpleDownloader.py
index 3632335..4fa3196 100644
--- a/reddit_imgs/system/simpleDownloader.py
+++ b/reddit_imgs/system/simpleDownloader.py
@@ -8,43 +8,52 @@ import urllib.error
 
 cookie = dict()
 
+
 def delCookie(cookiekey):
     cookiekey = str(cookiekey)
     if cookiekey in cookie:
         del cookie[cookiekey]
 
+
 def setCookie(cookiekey, cookieval):
     cookieval = str(cookieval)
     cookiekey = str(cookiekey)
-    if not cookiekey: return
-    if not cookieval: delCookie(cookiekey)
+    if not cookiekey:
+        return
+    if not cookieval:
+        delCookie(cookiekey)
     cookie[cookiekey] = cookieval
 
+
 def getCookies():
     return dict(cookie.items())
 
+
 def patchCookies(newCookies):
     for nk, nv in newCookies.items():
-        setCookie(nk,nv)
+        setCookie(nk, nv)
 
+
 def cleanCookies():
     global cookie
     cookie = dict()
 
+
 def setCookies(newCookies):
     cleanCookies()
     patchCookies(newCookies)
 
+
 def getUrlBytes(url, giveUpOn403=False):
     global cookie
     request = urllib.request.Request(url.replace(' ', '%20'))
     try:
         url.encode('ascii')
     except:
-        request = urllib.request.Request(urllib.parse.quote(url,safe='/%?#:'))
-    request.add_header('User-Agent', 'Mozilla/5.0 (X11; Linux x86_64; rv:76.0) '+
-            'Gecko/20100101 Firefox/76.0'
-    )
+        request = urllib.request.Request(urllib.parse.quote(url, safe='/%?#:'))
+    request.add_header('User-Agent', 'Mozilla/5.0 (X11; Linux x86_64; rv:82.0) ' +
+                       'Gecko/20100101 Firefox/82.0'
+                       )
     if len(cookie):
         request.add_header("Cookie", '; '.join(map(lambda a: '='.join(a), cookie.items())))
     response = None
@@ -53,17 +62,17 @@ def getUrlBytes(url, giveUpOn403=False):
     except urllib.error.HTTPError as e:
         if e.code == 429:
             print('[URL] Got 429 (Too Many Requests): sleeping for 5 seconds')
-            print(' @ %s'%url)
+            print(' @ %s' % url)
             time.sleep(5)
             return getUrlBytes(url)
         if e.code == 503:
             print('[URL] Got 503 (Service Temporarily Unavailable): retrying after 5 seconds')
-            print(' @ %s'%url)
+            print(' @ %s' % url)
             time.sleep(5)
             return getUrlBytes(url)
         if e.code == 403 and giveUpOn403:
             print('[URL] Got 403 (Forbidden): assuming "Not Found"')
-            print(' @ %s'%url)
+            print(' @ %s' % url)
             return None
         elif e.code == 500:
             print('[URL] Got 500 (Server Error): assuming "Not Found"')
@@ -90,22 +99,22 @@ def getUrlBytes(url, giveUpOn403=False):
             return None
         if str(e.reason).startswith('[Errno -3]'):
             print('Check your internet connection. It seems gone.')
-        if str(e.reason).startswith('[Errno 110]') or str(e.reason)=='timed out':
+        if str(e.reason).startswith('[Errno 110]') or str(e.reason) == 'timed out':
             print('Connection request has timed out - assuming "Not Found"')
             return None
-        if str(e.reason).startswith('[Errno 111]') or str(e.reason)=='timed out':
+        if str(e.reason).startswith('[Errno 111]') or str(e.reason) == 'timed out':
             print('Connection refused - assuming "Not Found"')
             return None
         raise e
     rcode = response.getcode()
     rinfo = response.info()
     headers = dict()
-    headers_l = list(map(lambda a: list(map(str.strip, a.split(':',1))), str(rinfo).strip().splitlines()))
+    headers_l = list(map(lambda a: list(map(str.strip, a.split(':', 1))), str(rinfo).strip().splitlines()))
     for header in headers_l:
         k = header[0].lower()
         v = header[1]
         if k not in headers:
-            headers[k]=list()
+            headers[k] = list()
         headers[k].append(v)
         del k
         del v
@@ -113,19 +122,22 @@ def getUrlBytes(url, giveUpOn403=False):
     del headers_l
     if 'set-cookie' in headers:
         for cke in headers['set-cookie']:
-            ckek = cke.split('=',1)[0].strip()
-            ckev = cke.split('=',1)[1].split(';',1)[0].strip()
-            setCookie(ckek,ckev)
+            ckek = cke.split('=', 1)[0].strip()
+            ckev = cke.split('=', 1)[1].split(';', 1)[0].strip()
+            setCookie(ckek, ckev)
             del ckek
             del ckev
             del cke
     if rcode == 429:
         tosleep = 5
-        try: tosleep = int(headers['retry-after'][0])
-        except: pass
-        if tosleep < 1: tosleep = 1
-        print('[URL] Got 429 (Too Many Requests): sleeping for %d seconds'%tosleep)
-        print(' @ %s'%url)
+        try:
+            tosleep = int(headers['retry-after'][0])
+        except:
+            pass
+        if tosleep < 1:
+            tosleep = 1
+        print('[URL] Got 429 (Too Many Requests): sleeping for %d seconds' % tosleep)
+        print(' @ %s' % url)
         time.sleep(tosleep)
         return getUrlBytes(url)
     data = None
@@ -134,5 +146,6 @@ def getUrlBytes(url, giveUpOn403=False):
     response.close()
     return data
 
+
 def getUrl(url):
     return getUrlBytes(url).decode('utf-8')
diff --git a/reddit_imgs/system/subredditTools.py b/reddit_imgs/system/subredditTools.py
index 4da8ca6..17d1470 100644
--- a/reddit_imgs/system/subredditTools.py
+++ b/reddit_imgs/system/subredditTools.py
@@ -15,6 +15,7 @@ GATEWAY_LINK_ARGS = '&'.join([
     "sort=new"
 ])
 
+
 def build_gateway_link(sr, after=None, dist=0):
     d = [] if dist <= 0 else [f'dist={dist}']
     a = [] if after is None else [f'after={after}']
@@ -23,50 +24,56 @@ def build_gateway_link(sr, after=None, dist=0):
         *d, *a
     ])
 
+
 def getInfoFromRedditItem(bs):
     nsfw = 'over18' in bs['class']
     sharer = bs.find(class_='author').text.strip()
-    title = bs.find('a',class_='title').text.strip()
-    link = str(bs.find('a',class_='title')['href'])
+    title = bs.find('a', class_='title').text.strip()
+    link = str(bs.find('a', class_='title')['href'])
     domain = 'reddit.com'
-    if bs.find('span',class_='domain').find('a') is not None:
-        domain = bs.find('span',class_='domain').find('a').text.strip()
+    if bs.find('span', class_='domain').find('a') is not None:
+        domain = bs.find('span', class_='domain').find('a').text.strip()
     datakey = bs['data-fullname']
     timestamp = int(dateutil.parser.parse(bs.find('time')['datetime']).strftime('%s'))
     flair = None
-    try: flair = bs.find('span',class_='linkflairlabel').text.strip()
-    except: pass
+    try:
+        flair = bs.find('span', class_='linkflairlabel').text.strip()
+    except:
+        pass
     return {
-            'nsfw': nsfw,
-            'link': link,
-            'title': title,
-            'flair': flair,
-            'sharer': sharer,
-            'domain': domain,
-            'datakey': datakey,
-            'timestamp': timestamp,
+        'nsfw': nsfw,
+        'link': link,
+        'title': title,
+        'flair': flair,
+        'sharer': sharer,
+        'domain': domain,
+        'datakey': datakey,
+        'timestamp': timestamp,
     }
 
+
 def getInfoFromRedditJsonItem(jo):
     return {
-            'nsfw': jo['isNSFW'],
-            'link': jo['source']['url'] if ('source' in jo and jo['source'] is not None) else jo['media']['content'],
-            'title': jo['title'],
-            'flair': next(iter([f['text'] for f in jo['flair'] if f['type']=='text']), None),
-            'sharer': jo['author'],
-            'domain': jo['domain'],
-            'datakey': jo['id'],
-            'timestamp': jo['created']//1000,
+        'nsfw': jo['isNSFW'],
+        'link': jo['source']['url'] if ('source' in jo and jo['source'] is not None) else jo['media']['content'],
+        'title': jo['title'],
+        'flair': next(iter([f['text'] for f in jo['flair'] if f['type'] == 'text']), None),
+        'sharer': jo['author'],
+        'domain': jo['domain'],
+        'datakey': jo['id'],
+        'timestamp': jo['created']//1000,
     }
 
+
 def getEmptySubredditData(srname):
     return {
-            'subreddit': srname,
-            'date_first': minint,
-            'date_last': maxint,
-            'links': list()
+        'subreddit': srname,
+        'date_first': minint,
+        'date_last': maxint,
+        'links': list()
     }
 
+
 def getSubredditPageJsonInfo(jo, subreddit, pageno):
     structured_links = list()
     if len(jo['postIds']) <= 0:
@@ -79,6 +86,8 @@ def getSubredditPageJsonInfo(jo, subreddit, pageno):
             (('media' in post) and (post['media'] is not None) and ('content' in post['media']))
         ) and (
             ('domain' in post) and (post['domain'] is not None)
+        ) and (
+            ('id' in post) and (isinstance(post['id'], str)) and (len(post['id']) < 20)
         )):
             structured_links.append(getInfoFromRedditJsonItem(post))
     # tss = [sl['timestamp'] for sl in structured_links]
@@ -89,41 +98,49 @@ def getSubredditPageJsonInfo(jo, subreddit, pageno):
         structured_links
     )
 
+
 def getSubredditPageInfo(bs):
     pagetable = bs.find(id='siteTable')
     discussions = pagetable.find_all(
        lambda a: a.has_attr('class') and 'thing' in a['class']
     )
-    links = list(filter(lambda a: 'self' not in a['class'],discussions))
+    links = list(filter(lambda a: 'self' not in a['class'], discussions))
     first = minint
     last = maxint
-    try: first = int(dateutil.parser.parse(discussions[0].find('time')['datetime']).strftime('%s'))
-    except: pass
-    try: last = int(dateutil.parser.parse(discussions[-1].find('time')['datetime']).strftime('%s'))
-    except: pass
+    try:
+        first = int(dateutil.parser.parse(discussions[0].find('time')['datetime']).strftime('%s'))
+    except:
+        pass
+    try:
+        last = int(dateutil.parser.parse(discussions[-1].find('time')['datetime']).strftime('%s'))
+    except:
+        pass
     nextpage = None
-    try: nextpage = bs.find('div', class_='nav-buttons').find(class_='nextprev').find(class_='next-button').find('a')['href']
-    except: pass
+    try:
+        nextpage = bs.find('div', class_='nav-buttons').find(class_='nextprev').find(class_='next-button').find('a')['href']
+    except:
+        pass
     structured_links = list(map(getInfoFromRedditItem, links))
     return first, last, nextpage, structured_links
 
-def assembleFileName(subreddit,link,seq,ext):
+
+def assembleFileName(subreddit, link, seq, ext):
     imgfn = ''
-    imgfn+= subreddit
-    imgfn+= '__'
-    imgfn+= datetime.datetime.fromtimestamp(int(link['timestamp'])).isoformat().replace('T','_').replace(':','-')
-    imgfn+= '_'
-    imgfn+= 'nsfw' if link['nsfw'] else 'safe'
-    imgfn+= '___'
-    imgfn+= '-' if link['flair'] is None else slugify(link['flair'])
-    imgfn+= '___'
-    imgfn+= '-' if link['sharer'] is None else slugify(link['sharer'])
-    imgfn+= '___'
-    imgfn+= slugify(link['title'][:50])
-    imgfn+= '___'
-    imgfn+= slugify(link['datakey'])
-    imgfn+= '___'
-    imgfn+= str('%04d'%seq)
-    imgfn+= '.'+ext
+    imgfn += subreddit
+    imgfn += '__'
+    imgfn += datetime.datetime.fromtimestamp(int(link['timestamp'])).isoformat().replace('T', '_').replace(':', '-')
+    imgfn += '_'
+    imgfn += 'nsfw' if link['nsfw'] else 'safe'
+    imgfn += '___'
+    imgfn += '-' if link['flair'] is None else slugify(link['flair'])
+    imgfn += '___'
+    imgfn += '-' if link['sharer'] is None else slugify(link['sharer'])
+    imgfn += '___'
+    imgfn += slugify(link['title'][:50])
+    imgfn += '___'
+    imgfn += slugify(link['datakey'])
+    imgfn += '___'
+    imgfn += str('%04d' % seq)
+    imgfn += '.'+ext
     return imgfn
diff --git a/reddit_imgs/wallpapers2.py b/reddit_imgs/wallpapers2.py
new file mode 100644
index 0000000..4760964
--- /dev/null
+++ b/reddit_imgs/wallpapers2.py
@@ -0,0 +1,129 @@
+#!/usr/bin/env python3
+# -*- encoding: utf-8 -*-
+
+import json
+import os
+import shutil
+from pathlib import Path
+
+import filetype
+import PIL.Image
+
+from .system import subredditTools
+from .system.cmdline_parser import parse_cmdline
+
+
+def readAllFile(d):
+    with open(d) as f:
+        return f.read()
+
+
+proportion = (
+    5/4,
+    21/9
+)
+minPixels = 1000**2
+
+
+def cmdline(encoded_args: str = None):
+    if encoded_args is None:
+        return run_with_config()
+    else:
+        return parse_cmdline(run_with_config, encoded_args)
+
+
+def run_with_config():
+    return main()
+
+
+def main():
+    subreddits_name = set(map(lambda a: a[0],
+                              filter(lambda a: a[1],
+                                     json.loads(Path('rf.json').read_text())['wallpaper'].items())))
+    reddit_posts = json.loads(Path('r_gdl_p.json').read_text())
+    link_to_files = json.loads(Path('i_gdl_ffl.json').read_text())
+    copyfiles = list()
+
+    linksDown = 0
+    linksNotDown = 0
+    linksErr = 0
+
+    print('Listing files...')
+
+    for reddit_post in reddit_posts.values():
+        if len(subreddits_name.intersection((posted_on_subreddits := reddit_post['subreddits']))) > 0:
+            album_files = list()
+            for link in reddit_post['links']:
+                for file in link_to_files.get(link, []):
+                    if file not in album_files:
+                        album_files.append(file)
+            for index, file in enumerate(album_files):
+                img_fn = subredditTools.assembleFileName(
+                    next(iter(posted_on_subreddits)),
+                    reddit_post,
+                    index,
+                    Path(file).suffix.replace('.', '')
+                )
+                nsfwsafe = 'nsfw' if reddit_post['nsfw'] else 'safe'
+                file_to = Path('w').joinpath(nsfwsafe).joinpath(img_fn)
+                copyfiles.append((file, file_to))
+    print('Creating folders...')
+
+    lcf = len(copyfiles)
+    for (cnt, (src, dst)) in enumerate(copyfiles):
+        container = os.path.dirname(os.path.abspath(dst))
+        if not os.path.exists(container):
+            os.makedirs(container)
+
+    print('Ensuring minimum resolution and proportion...')
+    ignored = 0
+    kept = 0
+
+    lcf = len(copyfiles)
+    print('\r'+' '*79+'\r'+'%03d%% processed: %05d of %05d' % (0, 0, lcf), end='')
+    for (cnt, (src, dst)) in reversed(list(enumerate(copyfiles))):
+        if os.path.exists(dst):
+            continue
+        print('\r'+' '*79+'\r'+'%03d%% processed: %05d of %05d' % ((((lcf-cnt)/lcf)*100)//1, lcf-cnt, lcf), end='')
+        img = None
+        try:
+            img = PIL.Image.open(src)
+        except:
+            ignored += 1
+            continue
+        width, height = img.size
+        prop = width/height
+        pxls = width*height
+        if not (pxls >= minPixels and prop >= proportion[0] and prop <= proportion[1]):
+            ignored += 1
+            del copyfiles[cnt]
+        else:
+            kept += 1
+        img.close()
+    print()
+
+    print('Copying files...')
+
+    lcf = len(copyfiles)
+    print('\r'+' '*79+'\r'+'%03d%% copied: %05d of %05d' % (0, 0, lcf), end='')
+    for (cnt, (src, dst)) in enumerate(copyfiles):
+        if os.path.exists(dst):
+            continue
+        print('\r'+' '*79+'\r'+'%03d%% copied: %05d of %05d' % ((((cnt+1)/lcf)*100)//1, cnt+1, lcf), end='')
+        try:
+            shutil.copyfile(src, dst)
+        except KeyboardInterrupt as e:
+            print()
+            print('\r'+' '*79+'\r'+'Deleting interrupted file...', end='')
+            os.remove(dst)
+            print('\r'+' '*79+'\r'+'Aborted safely', end='')
+            print()
+            raise e
+    print()
+    print()
+    print('{0:>5} files were kept'.format(kept))
+    print('{0:>5} files were ignored'.format(ignored))
+
+
+if __name__ == '__main__':
+    main()
diff --git a/redditgetterunshared.py b/redditgetterunshared.py
new file mode 100755
index 0000000..921023d
--- /dev/null
+++ b/redditgetterunshared.py
@@ -0,0 +1,7 @@
+#!/usr/bin/env python3
+# -*- encoding: utf-8 -*-
+
+import reddit_imgs.runner
+
+if __name__ == '__main__':
+    reddit_imgs.runner.main_unshared()