
refactoring

master · Adler Neves committed 5 months ago · commit 5d13706d54
45 changed files with 992 additions and 413 deletions
  1. +3 -8     .gitignore
  2. +32 -0    cron_prerun_kill.sh
  3. +23 -0    getlinkof.py
  4. +0 -0     hash_compressor_distributed/.gitignore
  5. +0 -0     hash_compressor_distributed/Makefile
  6. +0 -0     hash_compressor_distributed/manage.py
  7. +0 -0     hash_compressor_distributed/webproj/__init__.py
  8. +0 -0     hash_compressor_distributed/webproj/adminModelRegister.py
  9. +0 -0     hash_compressor_distributed/webproj/asgi.py
  10. +0 -0    hash_compressor_distributed/webproj/settings.py
  11. +0 -0    hash_compressor_distributed/webproj/stackOverflowSnippets.py
  12. +0 -0    hash_compressor_distributed/webproj/thumbnailer/__init__.py
  13. +0 -0    hash_compressor_distributed/webproj/thumbnailer/admin.py
  14. +0 -0    hash_compressor_distributed/webproj/thumbnailer/apps.py
  15. +0 -0    hash_compressor_distributed/webproj/thumbnailer/management/commands/dumpresults.py
  16. +0 -0    hash_compressor_distributed/webproj/thumbnailer/management/commands/loadhashes.py
  17. +0 -0    hash_compressor_distributed/webproj/thumbnailer/migrations/0001_initial.py
  18. +0 -0    hash_compressor_distributed/webproj/thumbnailer/migrations/__init__.py
  19. +0 -0    hash_compressor_distributed/webproj/thumbnailer/models.py
  20. +0 -0    hash_compressor_distributed/webproj/thumbnailer/tests.py
  21. +0 -0    hash_compressor_distributed/webproj/thumbnailer/urls.py
  22. +0 -0    hash_compressor_distributed/webproj/thumbnailer/views.py
  23. +0 -0    hash_compressor_distributed/webproj/urls.py
  24. +0 -0    hash_compressor_distributed/webproj/wsgi.py
  25. +0 -0    hash_compressor_distributed/worker.py
  26. +0 -0    hash_compressor_distributed/worker_thumbnailer.py
  27. +0 -0    reddit_imgs/_cachedhash.py
  28. +0 -0    reddit_imgs/_fetch.py
  29. +0 -0    reddit_imgs/_hashit.py
  30. +0 -0    reddit_imgs/_normalizetobmp.py
  31. +0 -0    reddit_imgs/_normalizetoimagehash.py
  32. +0 -0    reddit_imgs/_thumbnailize.py
  33. +0 -0    reddit_imgs/_wallpapers.py
  34. +6 -3    reddit_imgs/display_fetch_futures.py
  35. +216 -97 reddit_imgs/fetch2.py
  36. +19 -3   reddit_imgs/hashit2.py
  37. +160 -0  reddit_imgs/linguisticdictanal.py
  38. +27 -152 reddit_imgs/runner.py
  39. +69 -0   reddit_imgs/sizebysubreddit.py
  40. +152 -60 reddit_imgs/sync.py
  41. +47 -18  reddit_imgs/system/cmdline_parser.py
  42. +35 -22  reddit_imgs/system/simpleDownloader.py
  43. +67 -50  reddit_imgs/system/subredditTools.py
  44. +129 -0  reddit_imgs/wallpapers2.py
  45. +7 -0    redditgetterunshared.py

.gitignore (+3 -8)

@@ -15,21 +15,14 @@ r_gdl*
r_gdl*/**
i_c
i_c/**
i_c.json
i_cde.json
i_*.json
fetch_missing.json
i_he.json
i_c_h.json
most_repeated_hashes.json
display_fetch_futures.trace
i_h
i_h/**
i_h.json
i_hc.json
i_hs.json
i_h_n
i_h_n/**
i_h_n.json
i_t
i_t/**
**/*.pyc
@@ -45,3 +38,5 @@ ignored.txt
.vscode/**
.mypy_cache
.mypy_cache/**
del/**
del

cron_prerun_kill.sh (+32 -0)

@@ -0,0 +1,32 @@
#!/bin/bash

attempt_interrupt() {
    export TEST="$(ps -aux | grep './redditgetter.py' | grep -v grep | grep -v bash | sed -e 's/ */ /g' | cut -d' ' -f2)";
    if [ -n "$TEST" ] ; then
        echo "Killing...";
        echo "$TEST" | xargs -rl1 -- kill -2;
        sleep 1;
    fi
}

force_interrupt() {
    export TEST="$(ps -aux | grep './redditgetter.py' | grep -v grep | grep -v bash | sed -e 's/ */ /g' | cut -d' ' -f2)";
    if [ -n "$TEST" ] ; then
        echo "Force-Killing...";
        echo "$TEST" | xargs -rl1 -- kill -15;
        sleep 1;
    fi
}

attempt_interrupt
attempt_interrupt
attempt_interrupt
attempt_interrupt
attempt_interrupt
attempt_interrupt
attempt_interrupt
attempt_interrupt
attempt_interrupt
attempt_interrupt

force_interrupt

getlinkof.py (+23 -0)

@@ -0,0 +1,23 @@
#!/usr/bin/env python3
# -*- encoding: utf-8 -*-

import json
from pathlib import Path
import sys


def main():
    if not Path('r_gdl_lbe.json').exists():
        print('"r_gdl_lbe.json" does not exist')
    elif len(sys.argv) != 3:
        print('Usage:')
        print(f' {sys.argv[0]} <downloader> <remaining_links>')
    else:
        dldr = sys.argv[1]
        rlnk = int(sys.argv[2])
        data = json.loads(Path('r_gdl_lbe.json').read_bytes())
        print(data[dldr][-rlnk])


if __name__ == '__main__':
    main()

hash_thumbnailer_distributed/.gitignore → hash_compressor_distributed/.gitignore
hash_thumbnailer_distributed/Makefile → hash_compressor_distributed/Makefile
hash_thumbnailer_distributed/manage.py → hash_compressor_distributed/manage.py
hash_thumbnailer_distributed/webproj/__init__.py → hash_compressor_distributed/webproj/__init__.py
hash_thumbnailer_distributed/webproj/adminModelRegister.py → hash_compressor_distributed/webproj/adminModelRegister.py
hash_thumbnailer_distributed/webproj/asgi.py → hash_compressor_distributed/webproj/asgi.py
hash_thumbnailer_distributed/webproj/settings.py → hash_compressor_distributed/webproj/settings.py
hash_thumbnailer_distributed/webproj/stackOverflowSnippets.py → hash_compressor_distributed/webproj/stackOverflowSnippets.py
hash_thumbnailer_distributed/webproj/thumbnailer/__init__.py → hash_compressor_distributed/webproj/thumbnailer/__init__.py
hash_thumbnailer_distributed/webproj/thumbnailer/admin.py → hash_compressor_distributed/webproj/thumbnailer/admin.py
hash_thumbnailer_distributed/webproj/thumbnailer/apps.py → hash_compressor_distributed/webproj/thumbnailer/apps.py
hash_thumbnailer_distributed/webproj/thumbnailer/management/commands/dumpresults.py → hash_compressor_distributed/webproj/thumbnailer/management/commands/dumpresults.py
hash_thumbnailer_distributed/webproj/thumbnailer/management/commands/loadhashes.py → hash_compressor_distributed/webproj/thumbnailer/management/commands/loadhashes.py
hash_thumbnailer_distributed/webproj/thumbnailer/migrations/0001_initial.py → hash_compressor_distributed/webproj/thumbnailer/migrations/0001_initial.py
hash_thumbnailer_distributed/webproj/thumbnailer/migrations/__init__.py → hash_compressor_distributed/webproj/thumbnailer/migrations/__init__.py
hash_thumbnailer_distributed/webproj/thumbnailer/models.py → hash_compressor_distributed/webproj/thumbnailer/models.py
hash_thumbnailer_distributed/webproj/thumbnailer/tests.py → hash_compressor_distributed/webproj/thumbnailer/tests.py
hash_thumbnailer_distributed/webproj/thumbnailer/urls.py → hash_compressor_distributed/webproj/thumbnailer/urls.py
hash_thumbnailer_distributed/webproj/thumbnailer/views.py → hash_compressor_distributed/webproj/thumbnailer/views.py
hash_thumbnailer_distributed/webproj/urls.py → hash_compressor_distributed/webproj/urls.py
hash_thumbnailer_distributed/webproj/wsgi.py → hash_compressor_distributed/webproj/wsgi.py
hash_thumbnailer_distributed/worker.py → hash_compressor_distributed/worker.py
hash_thumbnailer_distributed/worker_thumbnailer.py → hash_compressor_distributed/worker_thumbnailer.py
reddit_imgs/cachedhash.py → reddit_imgs/_cachedhash.py
reddit_imgs/fetch.py → reddit_imgs/_fetch.py
reddit_imgs/hashit.py → reddit_imgs/_hashit.py
reddit_imgs/normalizetobmp.py → reddit_imgs/_normalizetobmp.py
reddit_imgs/normalizetoimagehash.py → reddit_imgs/_normalizetoimagehash.py
reddit_imgs/thumbnailize.py → reddit_imgs/_thumbnailize.py
reddit_imgs/wallpapers.py → reddit_imgs/_wallpapers.py

reddit_imgs/display_fetch_futures.py (+6 -3)

@@ -217,12 +217,15 @@ def print_terminal(workers_state_path: Path, keep_to_next_cycle=None):
colored.bg(clr)
for clr in bg_rank_color_names
]
bg_rank = bg_rank[-(max(
bg_rank_size = max(
1,
state_stats.get('running', 0) + state_stats.get('scrubbing', 0)
)):]
)
bg_rank = bg_rank[-bg_rank_size:]
bg_rang_programmed_len = len(bg_rank)
bg_rank += ['']*(len(jobs_dates)-len(bg_rank))
bg_rang_programmed_len = bg_rank_size
bg_rank += [colored.bg('black')] * (len(jobs_dates) - len(bg_rank))
# jobs_timestamps = keep_to_next_cycle.get(
# 'jobs_timestamps', dict())


reddit_imgs/fetch2.py (+216 -97)

@@ -9,10 +9,11 @@ import shutil
import subprocess
import sys
import traceback
from collections import OrderedDict
from concurrent.futures import ProcessPoolExecutor as PoolExecutor
from io import StringIO
from pathlib import Path
from typing import List, Optional, Set, Tuple, Type
from typing import Dict, List, Optional, Set, Tuple, Type
import colored as clrlib
import gallery_dl
@@ -40,7 +41,7 @@ FORBIDDEN_WORKER_SPLITS = {
}
MAX_WORKERS = 12
SPLIT_WORKER_AFTER_N_LINKS = 1000
SPLIT_WORKER_AFTER_N_LINKS = 10000
USE_FIREFOX_COOKIES = True
DEBUG_WORKER = None
IGNORE_WORKERS = set()
@@ -63,6 +64,28 @@ GDL_ERRORS = [
]
GDL_ERRORS_DICT = {(1 << k): v for k, v in enumerate(GDL_ERRORS)}
HTML_SPECIAL_CHARS_REPLACE: List[Tuple[str, str]] = [
('&amp;', '&'),
('&lt;', '<'),
('&gt;', '>'),
('&quot;', '"'),
('&#039;', '\''),
]
HTML_SPECIAL_CHARS: List[str] = list(map(lambda a: a[0], HTML_SPECIAL_CHARS_REPLACE))
def contains_any(s: str, l: List[str]) -> bool:
for i in l:
if i in s:
return True
return False
def replace_many(s: str, l: List[Tuple[str, str]]) -> str:
for f, t in l:
s = s.replace(f, t)
return s
def cmdline(encoded_args: str = None):
if encoded_args is None:
@@ -137,7 +160,7 @@ def run_with_config(redownload_empties: bool = False,
return main()
def main():
def prerrun():
subreddit_data_path = Path('r.json')
if not subreddit_data_path.exists():
print("Executing prerrequisite...")
@@ -148,6 +171,7 @@ def main():
'reddit_imgs/get_firefox_cookies.sh',
'i_gdl/.cookies'],
).check_returncode()
subreddit_filters_path = Path('rf.json')
print('Loading posts from disk...')
Path('i_gdl').mkdir(exist_ok=True, parents=True)
workers_state_path = Path('i_gdl_w')
@@ -157,99 +181,16 @@ def main():
if STOP_JOBS_FLAG_PATH.exists():
STOP_JOBS_FLAG_PATH.unlink()
subreddit_data = json.loads(subreddit_data_path.read_text())
links = dict()
subreddit_filters = json.loads(subreddit_filters_path.read_bytes())
print('Loading posts...')
postsl = [
{'subreddit': subreddit, **post}
for subreddit, srdt in subreddit_data.items()
for post in srdt['links']
]
postsd = dict()
for post in postsl:
dk = post['datakey']
if dk not in postsd:
postsd[dk] = post.copy()
postsd[dk]['subreddits'] = list()
postsd[dk]['links'] = list()
del postsd[dk]['subreddit']
del postsd[dk]['link']
del postsd[dk]['domain']
if (sr := post['subreddit']) not in (srs := postsd[dk]['subreddits']):
srs.append(sr)
if (lnk := get_normalized_link(post['link'])) not in (lnks := postsd[dk]['links']):
lnks.append(lnk)
posts = postsd
del postsl
del postsd
posts = prerrun_flatten_subreddits_into_posts(subreddit_data, subreddit_filters)
print(f'{len(posts)} posts identified.')
print(f'Identifying alternative trivial links...')
for post in posts.values():
dk = post['datakey']
post_links = post['links']
has_changed_any_link = True
while has_changed_any_link:
has_changed_any_link = False
for link in post_links:
if '<!--' in link or '-->' in link:
for linkcopy in search_urls(link):
linkcopy = get_normalized_link(linkcopy)
if linkcopy not in post_links:
post_links.append(linkcopy)
has_changed_any_link = True
while link in post_links:
post_links.remove(link)
has_changed_any_link = True
else:
linkcopy = link
linkcopy = get_normalized_link(linkcopy)
if linkcopy not in post_links:
post_links.append(linkcopy)
has_changed_any_link = True
while linkcopy[-1:] in ('/', '#', '?'):
linkcopy = linkcopy[:-1]
linkcopy = get_normalized_link(linkcopy)
if linkcopy not in post_links:
post_links.append(linkcopy)
has_changed_any_link = True
if '?' in link:
linkcopy = link.split('?', 1)[0]
linkcopy = get_normalized_link(linkcopy)
if linkcopy not in post_links:
post_links.append(linkcopy)
has_changed_any_link = True
if '#' in link:
linkcopy = link.split('#', 1)[0]
linkcopy = get_normalized_link(linkcopy)
if linkcopy not in post_links:
post_links.append(linkcopy)
has_changed_any_link = True
if link == '':
while link in post_links:
post_links.remove(link)
has_changed_any_link = True
if link.startswith('/'):
while link in post_links:
post_links.remove(link)
has_changed_any_link = True
if link.startswith('#'):
while link in post_links:
post_links.remove(link)
has_changed_any_link = True
if link.startswith('mailto'):
while link in post_links:
post_links.remove(link)
has_changed_any_link = True
if (proto := (tpl := link.split(':', 1))[0]).lower() in ('http', 'https') and proto not in ('http', 'https'):
lst = list(tpl)
lst[0] = lst[0].lower()
linkcopy = ':'.join(lst)
post_links.remove(link)
if linkcopy not in post_links:
post_links.append(linkcopy)
has_changed_any_link = True
prerrun_posts_re_sort(posts)
Path('r_gdl_p.json').write_text(
json.dumps(posts, indent=1, sort_keys=True))
print(f'Grouping links with the posts they show up in...')
links = OrderedDict()
for dk, post in posts.items():
for link in post['links']:
if link not in links:
@@ -257,6 +198,7 @@ def main():
links[link].append(dk)
Path('r_gdl_lp.json').write_text(
json.dumps(links, indent=1, sort_keys=True))
known_link_set = set(links.keys())
print(f'{len(links)} links found')
print(f'Checking if there is an extractor for each link...')
r_gdl_le_path = Path('r_gdl_le.json')
@@ -276,6 +218,8 @@ def main():
link_extractors[link] = (type(ext).category
if ext is not None else
'')
for discarded_link in set(link_extractors.keys()).difference(known_link_set):
del link_extractors[discarded_link]
r_gdl_le_path.write_text(json.dumps(
link_extractors, indent=1, sort_keys=True))
links_by_extractor = {
@@ -301,13 +245,13 @@ def main():
print(f'{len(links)-len(not_downloadable_link_set)} downloadable links found')
print(f'{len(not_downloadable_link_set)} undownloadable links found')
print(f'{len(links_by_extractor)} extractors found')
Path('r_gdl_lbe.json').write_text(json.dumps(links_by_extractor, indent=1))
Path('r_gdl_lbe.json').write_text(json.dumps(links_by_extractor, indent=1, sort_keys=True))
files_from_links = dict()
links_no_files = list()
files_sizes = dict()
link_statuses = dict()
ignored_links = set()
files_from_links: Dict[str, List[str]] = dict()
links_no_files: List[str] = list()
files_sizes: Dict[str, int] = dict()
link_statuses: Dict[str, int] = dict()
ignored_links: Set[str] = set()
if (pth := Path('i_gdl_ffl.json')).exists():
try:
@@ -333,6 +277,32 @@ def main():
except:
pass
for discarded_link in set(links_no_files).difference(known_link_set):
links_no_files.remove(discarded_link)
discarded_files = set()
for discarded_link in set(files_from_links.keys()).difference(known_link_set):
if discarded_link in files_from_links:
files_in_link = files_from_links[discarded_link]
for file_in_link in files_in_link:
discarded_files.add(file_in_link)
if discarded_link in link_statuses:
del link_statuses[discarded_link]
del files_from_links[discarded_link]
files_to_keep = set()
for files_from_link in files_from_links.values():
for file_from_link in files_from_link:
if file_from_link not in files_to_keep:
files_to_keep.add(file_from_link)
for discarded_file in discarded_files.difference(files_to_keep):
if discarded_file in files_sizes:
del files_sizes[discarded_file]
for missing_file_size in files_to_keep.difference(set(files_sizes.keys())):
p = Path(missing_file_size)
if not p.exists():
raise FileNotFoundError(missing_file_size)
else:
files_sizes[missing_file_size] = p.stat().st_size
print('Re-filled files_sizes for %r' % p)
if (p := Path('i_gdl_ignored.txt')).exists():
ignored_links = set(list(filter(len, p.read_text().splitlines())))
@@ -421,6 +391,155 @@ def main():
if worker != ''
]
print(f'{len(links_to_worker)} workers to be spawned')
return (files_from_links,
links_no_files,
files_sizes,
link_statuses,
workers_nicely_grouped,
workers_state_path,
links_to_worker,
)
def prerrun_flatten_subreddits_into_posts(subreddit_data, subreddit_filters):
postsl = [
{'subreddit': subreddit, **post}
for subreddit, srdt in subreddit_data.items()
for post in srdt['links']
]
postsl.sort(key=lambda a: (-a['timestamp'], a['datakey']))
postsd = dict()
for post in postsl:
dk = post['datakey']
sr = post['subreddit']
if subreddit_filters['no_download'][sr]:
continue
if subreddit_filters['no_sfw'][sr] and not post['nsfw']:
continue
if subreddit_filters['no_nsfw'][sr] and post['nsfw']:
continue
if dk not in postsd:
postsd[dk] = post.copy()
postsd[dk]['subreddits'] = list()
postsd[dk]['links'] = list()
del postsd[dk]['subreddit']
del postsd[dk]['link']
del postsd[dk]['domain']
if (sr := post['subreddit']) not in (srs := postsd[dk]['subreddits']):
srs.append(sr)
if (lnk := get_normalized_link(post['link'])) not in (lnks := postsd[dk]['links']):
lnks.append(lnk)
return postsd
def prerrun_posts_re_sort(posts):
for post in sorted(posts.values(), key=lambda a: (-a['timestamp'], a['datakey'])):
post['subreddits'].sort()
dk = post['datakey']
post_links = post['links']
has_changed_any_link = True
while has_changed_any_link:
has_changed_any_link = False
for link in post_links:
if '<!--' in link or '-->' in link:
for linkcopy in search_urls(link):
linkcopy = get_normalized_link(linkcopy)
linkcopy = replace_many(linkcopy, HTML_SPECIAL_CHARS_REPLACE)
if linkcopy not in post_links:
post_links.append(linkcopy)
has_changed_any_link = True
while link in post_links:
post_links.remove(link)
has_changed_any_link = True
break
else:
linkcopy = link
linkcopy = get_normalized_link(linkcopy)
if linkcopy not in post_links:
post_links.append(linkcopy)
has_changed_any_link = True
break
if '?' in link:
linkcopy = link.split('?', 1)[0]
linkcopy = get_normalized_link(linkcopy)
if linkcopy not in post_links:
post_links.append(linkcopy)
has_changed_any_link = True
break
if '#' in link:
linkcopy = link.split('#', 1)[0]
linkcopy = get_normalized_link(linkcopy)
if linkcopy not in post_links:
post_links.append(linkcopy)
has_changed_any_link = True
break
if contains_any(linkcopy, HTML_SPECIAL_CHARS):
linkcopy = replace_many(linkcopy, HTML_SPECIAL_CHARS_REPLACE)
if linkcopy not in post_links:
post_links.append(linkcopy)
has_changed_any_link = True
break
if linkcopy[-1:] in ('/', '#', '?'):
while linkcopy[-1:] in ('/', '#', '?'):
linkcopy = linkcopy[:-1]
linkcopy = get_normalized_link(linkcopy)
if linkcopy not in post_links:
post_links.append(linkcopy)
has_changed_any_link = True
if link.strip() == '':
while link in post_links:
post_links.remove(link)
has_changed_any_link = True
break
if link.startswith('/'):
while link in post_links:
post_links.remove(link)
has_changed_any_link = True
break
if link.startswith('#'):
while link in post_links:
post_links.remove(link)
has_changed_any_link = True
break
if link.startswith('mailto'):
while link in post_links:
post_links.remove(link)
has_changed_any_link = True
break
if (proto := (tpl := link.split(':', 1))[0]).lower() in ('http', 'https') and proto not in ('http', 'https'):
lst = list(tpl)
lst[0] = lst[0].lower()
linkcopy = ':'.join(lst)
post_links.remove(link)
if linkcopy not in post_links:
post_links.append(linkcopy)
has_changed_any_link = True
break
post['links'] = list(filter(lambda link: (
not link.startswith('https://preview.redd.it/')
or
(
(('?width=' in link) or ('&width=' in link))
and
(('?format=' in link) or ('&format=' in link))
and
(('?auto=' in link) or ('&auto=' in link))
and
(('?s=' in link) or ('&s=' in link))
)
), post['links']))
post['links'].sort()
def main():
(files_from_links,
links_no_files,
files_sizes,
link_statuses,
workers_nicely_grouped,
workers_state_path,
links_to_worker,
) = prerrun()
configure_gdl()
@@ -762,7 +881,7 @@ def configure_gdl():
'--write-unsupported=i_gdl_unsupported.txt',
# '--quiet',
*(['--verbose'] if DEBUG_WORKER else []),
'--retries=2',
'--retries=1',
# '--retries=7',
# '--limit-rate=1500k',
])


reddit_imgs/hashit2.py (+19 -3)

@@ -1,9 +1,11 @@
#!/usr/bin/env python3
# -*- encoding: utf-8 -*-
import datetime
import hashlib
import json
import multiprocessing
import time
import traceback
from pathlib import Path
from typing import Any, Dict, FrozenSet, Generator, List, Set, TypeVar
@@ -119,17 +121,31 @@ def main():
# raise Exception('-'*50)
print(colored.stylize('Listening queues...', [
colored.fg('light_cyan'), colored.attr('bold')]))
timings: List[Tuple[float, float]] = list()
with hashes_path.open('at') as hashes_handler:
active_workers = WORKERS
while active_workers > 0:
file_hash: FileHash = hashes_queue.get()
if file_hash is not None:
hashed_size_bytes += files_size[file_hash.file]
progress_pct = hashed_size_bytes / max(1, total_file_size_bytes)
timings.append((progress_pct, time.time()))
while len(timings) > 128:
del timings[0]
end_prediction = ''
if len(timings) > 1:
dp = timings[0][0] - timings[-1][0]
dt = timings[0][1] - timings[-1][1]
secs_pred = (1 - progress_pct) * (dt / dp)
td = datetime.timedelta(seconds=secs_pred)
end_prediction = (f' - {td}' +
f' - {datetime.datetime.now() + td}')
print(colored.stylize(
'%11.6f%% - %s of %s' % (
100*hashed_size_bytes / max(1, total_file_size_bytes),
'%11.6f%% - %s of %s%s' % (
100*progress_pct,
format_power10(hashed_size_bytes),
total_file_size),
total_file_size,
end_prediction),
[colored.fg('light_green'), colored.attr('bold')]))
hashes_handler.write(f'{file_hash.hash}|{file_hash.file}\n')
else:


reddit_imgs/linguisticdictanal.py (+160 -0)

@@ -0,0 +1,160 @@
#!/usr/bin/env python3
# -*- encoding: utf-8 -*-

import gc
import hashlib
import json
import multiprocessing
import traceback
from pathlib import Path
from typing import Any, Dict, FrozenSet, Generator, List, Set, Tuple, TypeVar

import colored

from .system.cmdline_parser import parse_cmdline
from .system.flattener import flatten_generator
from .system.format_file_size import format_power10
from .system.hexhashof import hexhashof

WORKERS = 4

T = TypeVar('T')


def cmdline(encoded_args: str = None):
    if encoded_args is None:
        return run_with_config()
    else:
        return parse_cmdline(run_with_config, encoded_args)


def run_with_config(max_workers: int = 4):
    WORKERS = max_workers
    main()


class AnnotatedText:
    def __init__(self, text: str, attributes: Dict[str, float]):
        self._text = text
        self._attributes = attributes

    @property
    def as_dict(self):
        return dict(text=self._text, attributes=self._attributes)

    @classmethod
    def build(cls, text: str, attributes: Dict[str, float]) -> 'AnnotatedText':
        if text is None:
            return None
        else:
            return cls(text, attributes)

    @classmethod
    def build_dict(cls, text: str, attributes: Dict[str, float]) -> Dict[str, Any]:
        if text is None:
            return None
        else:
            return cls(text, attributes).as_dict


def main():
    reddit_posts_path = Path('r.json')
    reddit_meta_path = Path('ri.json')
    print('Loading posts...')
    reddit_posts: dict = json.loads(reddit_posts_path.read_bytes())
    reddit_meta: dict = json.loads(reddit_meta_path.read_bytes())
    print('Building initial dictionary...')
    dct_subreddit_level: Dict[str, List[AnnotatedText]] = dict()
    subreddits: List[str] = sorted(list(set(reddit_posts.keys()).union(reddit_meta.keys())))
    current_data_keys = [*[subreddit for subreddit in subreddits],
                         'subredditDisplayText',
                         'subredditName',
                         'subredditUrl',
                         'subredditTitle',
                         'subredditPublicDescription',
                         'postFlair',
                         'postTitle',
                         'postSharer',
                         'postLink',
                         'nsfw',
                         ]
    current_data = {dt: 0 for dt in current_data_keys}
    for subreddit in subreddits:
        if subreddit not in dct_subreddit_level:
            dct_subreddit_level[subreddit] = list()
        dct_this_subreddit: List[AnnotatedText] = dct_subreddit_level[subreddit]
        subreddit_meta: dict = reddit_meta.get(subreddit, dict())
        subreddit_meta = subreddit_meta if subreddit_meta is not None else dict()
        subreddit_posts: List[Dict[str, str]] = reddit_posts.get(subreddit, dict(links=list()))['links']
        subreddit_nsfw = (
            1
            if subreddit_meta.get('definition', dict()).get('isNSFW', False) else
            0)
        add_to_lists([dct_this_subreddit],
                     AnnotatedText.build_dict(subreddit_meta.get('definition', dict()).get('displayText'),
                                              dict_to_value_list({'nsfw': subreddit_nsfw, **current_data, subreddit: 1, 'subredditDisplayText': 1,
                                                                  }, current_data_keys)))
        add_to_lists([dct_this_subreddit],
                     AnnotatedText.build_dict(subreddit_meta.get('definition', dict()).get('name'),
                                              dict_to_value_list({'nsfw': subreddit_nsfw, **current_data, subreddit: 1, 'subredditName': 1,
                                                                  }, current_data_keys)))
        add_to_lists([dct_this_subreddit],
                     AnnotatedText.build_dict(subreddit_meta.get('definition', dict()).get('url'),
                                              dict_to_value_list({'nsfw': subreddit_nsfw, **current_data, subreddit: 1, 'subredditUrl': 1,
                                                                  }, current_data_keys)))
        add_to_lists([dct_this_subreddit],
                     AnnotatedText.build_dict(subreddit_meta.get('definition', dict()).get('title'),
                                              dict_to_value_list({'nsfw': subreddit_nsfw, **current_data, subreddit: 1, 'subredditTitle': 1,
                                                                  }, current_data_keys)))
        add_to_lists([dct_this_subreddit],
                     AnnotatedText.build_dict(subreddit_meta.get('about', dict()).get('publicDescription'),
                                              dict_to_value_list({'nsfw': subreddit_nsfw, **current_data, subreddit: 1, 'subredditPublicDescription': 1,
                                                                  }, current_data_keys)))
        gc.collect()
        print(subreddit, len(subreddit_posts))
        for seq, subreddit_post in enumerate(subreddit_posts):
            post_nsfw = (
                1
                if subreddit_post.get('nsfw', False) else
                0)
            add_to_lists([dct_this_subreddit],
                         AnnotatedText.build_dict(subreddit_post.get('flair'),
                                                  dict_to_value_list({'nsfw': post_nsfw, **current_data, subreddit: 1, 'postFlair': 1,
                                                                      }, current_data_keys)))
            add_to_lists([dct_this_subreddit],
                         AnnotatedText.build_dict(subreddit_post.get('title'),
                                                  dict_to_value_list({'nsfw': post_nsfw, **current_data, subreddit: 1, 'postTitle': 1,
                                                                      }, current_data_keys)))
            add_to_lists([dct_this_subreddit],
                         AnnotatedText.build_dict(subreddit_post.get('sharer'),
                                                  dict_to_value_list({'nsfw': post_nsfw, **current_data, subreddit: 1, 'postSharer': 1,
                                                                      }, current_data_keys)))
            # add_to_lists([dct_this_subreddit],
            #              AnnotatedText.build_dict(subreddit_post.get('link'),
            #                                       dict_to_value_list({'nsfw': post_nsfw, **current_data, subreddit: 1, 'postLink': 1,
            #                                                           }, current_data_keys)))


def dict_to_value_list(d: Dict[str, T], l: Tuple[str, ...]) -> Tuple[T, ...]:
    return tuple([d[i] for i in l])


def add_to_dicts(dicts: List[Dict[str, List[AnnotatedText]]],
                 key: str,
                 item: AnnotatedText,
                 insert_none: bool = False,
                 ):
    if insert_none or item is not None:
        for dct in dicts:
            if key not in dct:
                dct[key] = list()
            dct[key].append(item)


def add_to_lists(lists: List[List[AnnotatedText]],
                 item: AnnotatedText,
                 insert_none: bool = False,
                 ):
    if insert_none or item is not None:
        for lst in lists:
            lst.append(item)

reddit_imgs/runner.py (+27 -152)

@@ -1,23 +1,22 @@
#!/usr/bin/env python3
# -*- encoding: utf-8 -*-
import reddit_imgs.sync
import reddit_imgs.fetch
import os
import shutil
import subprocess
import sys
from pathlib import Path
import reddit_imgs.condensate_hashes
import reddit_imgs.download_pruner
import reddit_imgs.fetch2
import reddit_imgs.reorganize
import reddit_imgs.wallpapers
import reddit_imgs.thumbnailize
import reddit_imgs.hashit
import reddit_imgs.hashit2
import reddit_imgs.normalizetobmp
import reddit_imgs.cachedhash
import reddit_imgs.download_pruner
import reddit_imgs.linguisticdictanal
import reddit_imgs.sizebysubreddit
import reddit_imgs.suggest_subreddits_from_links
import reddit_imgs.condensate_hashes
import reddit_imgs.sync
import reddit_imgs.wallpapers2
import os
import sys
import shutil
wdir = os.path.abspath('.')
@@ -32,138 +31,6 @@ def ensureFolderAvailability():
os.makedirs(os.path.join(wdir, 'r'))
def managesubreddits():
i = ''
while i != '0':
print('\n'*100)
print('----------------------------------------------')
print(' Subreddit Manager ')
print('----------------------------------------------')
print('1) List monitored subreddits')
print('2) Add monitored subreddit')
print('3) Remove monitored subreddit')
print('4) Set as wallpaper source')
print('5) Unset as wallpaper source')
print()
print('0) Back')
print('----------------------------------------------')
print()
print('Enter your choice:')
i = input()
i = i.strip()
print()
print()
subreddits_dir = os.path.join(wdir, 'r')
def subreddits_isfolder(sr): return os.path.isdir(
os.path.join(subreddits_dir, sr))
subreddits = sorted(
filter(subreddits_isfolder, os.listdir(subreddits_dir)))
if i in ['1', '3', '4', '5']:
print('Subreddits monitored:')
for sr in subreddits:
print('/r/{0}'.format(sr), end='')
if os.path.isfile(os.path.join(subreddits_dir, sr, 'wallpaper.flag')):
print('\t\t(wallpaper)')
else:
print()
print()
if i == '1':
print('Press enter to continue')
input()
if i == '3':
print('Enter the subreddit you want to get rid of:')
rem = input('/r/')
try:
shutil.rmtree(os.path.join(subreddits_dir, rem))
except:
pass
print()
print('Done.')
print('Press enter to continue')
input()
elif i == '2':
print('Enter the subreddit you want to add:')
add = input('/r/')
try:
os.makedirs(os.path.join(subreddits_dir, add))
except:
pass
print()
print('Done.')
print('Press enter to continue')
input()
elif i == '4':
print('Enter the subreddit you want to set as wallpaper source:')
add = input('/r/')
try:
dd = os.path.join(subreddits_dir, add)
if not os.path.exists(dd):
os.makedirs(dd)
f = open(os.path.join(dd, 'wallpaper.flag'), 'w')
f.write('')
f.close()
except:
pass
print()
print('Done.')
print('Press enter to continue')
input()
elif i == '5':
print('Enter the subreddit you want to unset as wallpaper source:')
add = input('/r/')
try:
dd = os.path.join(subreddits_dir, add)
if not os.path.exists(dd):
os.makedirs(dd)
f = open(os.path.join(dd, 'wallpaper.flag'), 'w')
f.write('')
f.close()
os.remove(os.path.join(dd, 'wallpaper.flag'))
except:
pass
print()
print('Done.')
print('Press enter to continue')
input()
def mainmenu():
i = ''
while i != '0':
print('\n'*100)
print('----------------------------------------------')
print(' Reddit Image Downloader ')
print('----------------------------------------------')
print('1) Manage subreddits')
print('2) Get link list to be downloaded from reddit')
print('3) Download grabbed links')
print('4) Organize by hashes')
print('5) Generate thumbnails')
print('6) Group and put nice names on downloaded data')
print('7) Sepparate wallpapers')
print()
print('0) Quit')
print('----------------------------------------------')
print()
print('Enter your choice:')
i = input()
i = i.strip()
if i == '1':
managesubreddits()
elif i == '2':
reddit_imgs.sync.main()
elif i == '3':
reddit_imgs.fetch.main()
elif i == '4':
reddit_imgs.hashit.main()
elif i == '5':
reddit_imgs.thumbnailize.main()
elif i == '6':
reddit_imgs.reorganize.main()
elif i == '7':
reddit_imgs.wallpapers.main()
def main():
# ensureFolderAvailability()
if len(sys.argv) > 1:
@@ -172,7 +39,11 @@ def main():
mainmenu()
def cmdline():
def main_unshared():
cmdline(True)
def cmdline(run_each_on_subprocess=False):
cmds = sys.argv[1:]
available_commands = ((
('sync', reddit_imgs.sync.cmdline),
@@ -181,12 +52,14 @@ def cmdline():
('prune_downloads', reddit_imgs.download_pruner.cmdline),
('hashit', reddit_imgs.hashit2.cmdline),
('condensate_hashes', reddit_imgs.condensate_hashes.cmdline),
('size_by_subreddit', reddit_imgs.sizebysubreddit.cmdline),
('wallpapers', reddit_imgs.wallpapers2.cmdline),
('linguistic_dictionary_analysis', reddit_imgs.linguisticdictanal.cmdline),
# ('cachedhash', reddit_imgs.cachedhash.main),
# ('hashit', reddit_imgs.hashit.main),
# ('normalizetobmp', reddit_imgs.normalizetobmp.main),
# ('thumbnailize', reddit_imgs.thumbnailize.main),
# ('reorganize', reddit_imgs.reorganize.main),
# ('wallpapers', reddit_imgs.wallpapers.main),
))
available_commands_names = tuple(
list(map(lambda a: a[0], available_commands)))
@@ -199,13 +72,15 @@ def cmdline():
command_ran = False
for acmd in available_commands:
if cmd.split(':', 1)[0] == acmd[0]:
x = cmd.split(':', 1)
command_ran = True
fcmd = acmd[1]
if len(x) == 1:
fcmd()
if run_each_on_subprocess:
the_other_callable = Path(__file__).parent.parent.joinpath('redditgetter.py').absolute()
handler = subprocess.run([str(the_other_callable), cmd])
handler.check_returncode()
else:
fcmd(encoded_args=x[1])
cmd_callable = acmd[1]
cmd_name, fcmd, *_ = *cmd.split(':', 1), ''
cmd_callable(encoded_args=fcmd)
if not command_ran:
print('Usage {0} [{1}]'.format(sys.argv[0],
'/'.join(available_commands_names)))


reddit_imgs/sizebysubreddit.py (+69 -0)

@@ -0,0 +1,69 @@
#!/usr/bin/env python3
# -*- encoding: utf-8 -*-

import hashlib
import json
import multiprocessing
import traceback
from pathlib import Path
from typing import Any, Dict, FrozenSet, Generator, List, Set, TypeVar

import colored

from .system.cmdline_parser import parse_cmdline
from .system.flattener import flatten_generator
from .system.format_file_size import format_power10
from .system.hexhashof import hexhashof
from .system.table_fmt import table_fmt

WORKERS = 4

T = TypeVar('T')


def cmdline(encoded_args: str = None):
    if encoded_args is None:
        return run_with_config()
    else:
        return parse_cmdline(run_with_config, encoded_args)


def run_with_config():
    main()


def main():
    print('Loading files...')
    files_for_link = json.loads(Path('i_gdl_ffl.json').read_bytes())
    posts_for_link = json.loads(Path('r_gdl_lp.json').read_bytes())
    file_sizes = json.loads(Path('i_gdl_fsz.json').read_bytes())
    posts_dict = json.loads(Path('r_gdl_p.json').read_bytes())
    subreddit_files: Dict[str, Set[str]] = dict()
    subreddit_size: Dict[str, int] = dict()
    ffl_sz = len(files_for_link)
    print('Processing data...')
    for idx, (link, files) in enumerate(files_for_link.items()):
        if idx % 50000 == 0:
            print(f'{idx+1} of {ffl_sz}')
        post_ids = posts_for_link[link]
        posts = [posts_dict[post_id] for post_id in post_ids]
        subreddits = [subreddit for post in posts for subreddit in post['subreddits']]
        for subreddit in subreddits:
            if subreddit not in subreddit_files:
                subreddit_files[subreddit] = set()
            if subreddit not in subreddit_size:
                subreddit_size[subreddit] = 0
            for file in files:
                if file not in subreddit_files[subreddit]:
                    subreddit_files[subreddit].add(file)
                if file in file_sizes:
                    subreddit_size[subreddit] += file_sizes[file]
                else:
                    print('%r does not have a size' % file)
    print('Printing...')
    srst = sorted(subreddit_size.items(), key=lambda a: (a[1], a[0]))
    print(table_fmt(
        'subreddit,disk usage'.split(','),
        list(map(lambda a: (a[0], format_power10(a[1])), srst)),
        alignment='^>'
    ))

reddit_imgs/sync.py (+152 -60)

@@ -7,7 +7,7 @@ from concurrent.futures import ProcessPoolExecutor as PoolExecutor
from pathlib import Path
from urllib.error import ContentTooShortError, HTTPError, URLError
from bs4 import BeautifulSoup as _BS
import colored as clrlib
from .system import simpleDownloader
from .system.cmdline_parser import parse_cmdline
@@ -18,9 +18,6 @@ from .system.subredditTools import (GATEWAY_LINK_ARGS, build_gateway_link,
MAX_WORKERS = 16
def BeautifulSoup(data): return _BS(data, 'html5lib')
def cmdline(encoded_args: str = None):
if encoded_args is None:
return run_with_config()
@@ -41,45 +38,51 @@ simpleDownloader.setCookies({'over18': 1})
wdir = os.path.abspath('.')
def process_subreddit(subreddit):
def process_subreddit(subreddit, srdt, jsonPageSr):
simpleDownloader.setCookies({'over18': 1})
srp = os.path.abspath(os.path.join(wdir, 'r', subreddit))
# if subreddit!='yiff': continue
nextpage = build_gateway_link(subreddit)
srdt = getEmptySubredditData(subreddit)
try:
with open(os.path.join(srp, 'subreddit.json')) as f:
srdt = json.loads(f.read())
except BaseException:
pass
#srdt = getEmptySubredditData(subreddit)
pageno = 0
ygst = srdt['date_first']
jsonPageSr = None
while nextpage:
pageno += 1
print(('/r/{0:<20} loading page #%05d' % pageno).format(subreddit))
print(' >> %s' % (nextpage.replace(GATEWAY_LINK_ARGS, '[...]'),))
print(clrlib.stylize(('/r/{0:<20} loading page #%05d' % pageno).format(subreddit), [
clrlib.fg('light_yellow'),
]))
print(clrlib.stylize(' >> %s' % (nextpage.replace(GATEWAY_LINK_ARGS, '[...]'),), [
clrlib.fg('light_yellow'), clrlib.attr('dim'),
]))
redditBytes = None
try:
redditBytes = simpleDownloader.getUrlBytes(nextpage)
except (HTTPError, URLError, ContentTooShortError):
print(('/r/{0:<20} loading page #%05d' % pageno).format(subreddit))
print(" >> HTTP Error with code: Skipping...")
print(clrlib.stylize(('/r/{0:<20} loading page #%05d' % pageno).format(subreddit), [
clrlib.fg('light_red'), clrlib.attr('bold'),
]))
print(clrlib.stylize(" >> HTTP Error with code: Skipping...", [
clrlib.fg('light_red'), clrlib.attr('bold'), clrlib.attr('dim'),
]))
break
if redditBytes is None:
print(('/r/{0:<20} loading page #%05d' % pageno).format(subreddit))
print(" >> HTTP Error: Skipping...")
print(clrlib.stylize(('/r/{0:<20} loading page #%05d' % pageno).format(subreddit), [
clrlib.fg('light_red'), clrlib.attr('bold'),
]))
print(clrlib.stylize(" >> HTTP Error: Skipping...", [
clrlib.fg('light_red'), clrlib.attr('bold'), clrlib.attr('dim'),
]))
break
# bs = BeautifulSoup(redditBytes)
jsonPage = json.loads(redditBytes)
getSubredditPageJsonInfoResult = None
try:
getSubredditPageJsonInfoResult = (
getSubredditPageJsonInfo(jsonPage, subreddit, pageno))
except IndexError:
print(('/r/{0:<20} loading page #%05d' % pageno).format(subreddit))
print(" >> Empty subreddit: Skipping...")
print(clrlib.stylize(('/r/{0:<20} loading page #%05d' % pageno).format(subreddit), [
clrlib.fg('light_gray'), clrlib.attr('dim'),
]))
print(clrlib.stylize(" >> Empty subreddit: Skipping...", [
clrlib.fg('light_gray'), clrlib.attr('dim'),
]))
break
first, last, nextpage, links = getSubredditPageJsonInfoResult
if ygst >= first: # if latest stored post is at same age or older than the latest downloaded post, then we are up-to-date
@@ -101,62 +104,151 @@ def process_subreddit(subreddit):
about=jsonPage['subredditAboutInfo'][srid],
flair=jsonPage['postFlair'][srid],
)
with open(os.path.join(srp, 'subreddit.json'), 'w') as f:
f.write(json.dumps(srdt, sort_keys=True, indent=2))
if jsonPageSr is not None:
with open(os.path.join(srp, 'meta.json'), 'w') as f:
f.write(json.dumps(jsonPageSr, sort_keys=True, indent=2))
srdt['links'] = list(filter(lambda a: len(a['datakey']) < 20, srdt['links']))
srdt['links'] = sorted(srdt['links'], key=lambda a: -a['timestamp'])
return (subreddit, srp, srdt, jsonPageSr)
def main():
build_summary()
print('Building summary...')
srs, srsi, srf = build_summary()
print('Download...')
subreddits = sorted(filter(lambda sr: os.path.isdir(
os.path.join(wdir, 'r', sr)), os.listdir(os.path.join(wdir, 'r'))))
with PoolExecutor(MAX_WORKERS) as pe:
q = list()
for subreddit in subreddits:
job = pe.submit(process_subreddit, subreddit)
q.append(job)
for job in q:
job.result()
build_summary()
print('Opening process pool...')
with PoolExecutor(MAX_WORKERS) as pe2:
def process_subreddit_done_callback_inner(job):
(subreddit, srp, srdt, jsonPageSr) = job.result()
del job
process_subreddit_done_callback(subreddit, srp, srdt, jsonPageSr, pe2, srs, srsi)
return
with PoolExecutor(MAX_WORKERS) as pe:
print('Opened process pool')
for subreddit in subreddits:
if subreddit not in srs:
srs[subreddit] = getEmptySubredditData(subreddit)
if subreddit not in srsi:
srsi[subreddit] = None
job = pe.submit(
process_subreddit,
subreddit,
srs[subreddit],
srsi[subreddit],
)
job.add_done_callback(process_subreddit_done_callback_inner)
print('Closing process pool...')
print('Closed process pool')
print('Writing summary...')
write_summary(srs, srsi, srf)
print('Done')
def process_subreddit_done_callback(subreddit, srp, srdt, jsonPageSr, pe, srs, srsi):
srs[subreddit] = srdt
srsi[subreddit] = jsonPageSr
print(clrlib.stylize(f' @> Writing /r/{subreddit}', [
clrlib.fg('light_cyan'),
]))
job = pe.submit(
post_processing_saver,
subreddit, srp, srdt, jsonPageSr
)
def post_processing_saver(subreddit, srp, srdt, jsonPageSr):
write_json(Path(os.path.join(srp, 'subreddit.json')), srdt, sort_keys=True)
if jsonPageSr is not None:
write_json(Path(os.path.join(srp, 'meta.json')), jsonPageSr, sort_keys=True)
print(clrlib.stylize(f' @> Written /r/{subreddit}', [
clrlib.fg('light_green'),
]))
def build_summary():
rjpath = Path(wdir, 'r.json')
rijpath = Path(wdir, 'ri.json')
rfpath = Path(wdir, 'rf.json')
oldsrs = dict()
oldsrsi = dict()
if rjpath.exists():
oldsrs = json.loads(rjpath.read_text())
oldsrs = json.loads(rjpath.read_bytes())
if rijpath.exists():
oldsrsi = json.loads(rijpath.read_text())
oldsrsi = json.loads(rijpath.read_bytes())
srs = dict()
srsi = dict()
for srp in Path(wdir, 'r').glob('*/subreddit.json'):
sr = srp.parent.name.lower()
srip = srp.parent.joinpath('meta.json')
try:
srs[sr] = json.loads(srp.read_text())
except json.decoder.JSONDecodeError:
if sr not in oldsrs:
raise
nodownloadfilter = dict()
nosfwfilter = dict()
nonsfwfilter = dict()
wallpaperfilter = dict()
with PoolExecutor(MAX_WORKERS) as pe:
def on_data_read(job):
(sr, srp, srip, srd, srid, sripe) = job.result()
if srd is not None:
srs[sr] = srd
else:
print('Restoring old data for corrupted subrredit %r' % sr)
srs[sr] = oldsrs[sr]
srp.write_text(json.dumps(oldsrs[sr], indent=1))
if srip.exists():
try:
srsi[sr] = json.loads(srip.read_text())
except json.decoder.JSONDecodeError:
if sr not in oldsrsi:
raise
if sr not in oldsrs:
srp.unlink()
else:
print('Restoring old data for corrupted subrredit %r' % sr)
srsi[sr] = oldsrsi[sr]
srip.write_text(json.dumps(oldsrsi[sr], indent=1))
rjpath.write_text(json.dumps(srs, indent=1))
rijpath.write_text(json.dumps(srsi, indent=1))
srs[sr] = oldsrs[sr]
srp.write_text(json.dumps(oldsrs[sr], indent=1))
if sripe:
if srid is not None:
srsi[sr] = srid
else:
if sr not in oldsrsi:
srip.unlink()
else:
print('Restoring old data for corrupted subrredit %r' % sr)
srsi[sr] = oldsrsi[sr]
srip.write_text(json.dumps(oldsrsi[sr], indent=1))
for srp in Path(wdir, 'r').glob('*/subreddit.json'):
sr = srp.parent.name.lower()
nodownloadfilter[sr] = srp.parent.joinpath('nodownload.flag').exists()
nosfwfilter[sr] = srp.parent.joinpath('nosfw.flag').exists()
nonsfwfilter[sr] = srp.parent.joinpath('nonsfw.flag').exists()
wallpaperfilter[sr] = srp.parent.joinpath('wallpaper.flag').exists()
srip = srp.parent.joinpath('meta.json')
job = pe.submit(read_disk_summary, sr, srp, srip)
job.add_done_callback(on_data_read)
srf = dict(
no_download=nodownloadfilter,
no_sfw=nosfwfilter,
no_nsfw=nonsfwfilter,
wallpaper=wallpaperfilter,
)
return srs, srsi, srf
def read_disk_summary(sr, srp, srip):
srd = None
srid = None
sripe = srip.exists()
try:
srd = json.loads(srp.read_bytes())
except json.decoder.JSONDecodeError:
pass
if sripe:
try:
srid = json.loads(srip.read_bytes())
except json.decoder.JSONDecodeError:
pass
return (sr, srp, srip, srd, srid, sripe)
def write_summary(srs, srsi, srf):
rjpath = Path(wdir, 'r.json')
rijpath = Path(wdir, 'ri.json')
rfpath = Path(wdir, 'rf.json')
with PoolExecutor(MAX_WORKERS) as pe:
pe.submit(write_json, rjpath, srs)
pe.submit(write_json, rijpath, srsi)
pe.submit(write_json, rfpath, srf)
def write_json(path, data, **kwargs):
path.write_text(json.dumps(data, indent=1, **kwargs))
if __name__ == '__main__':


reddit_imgs/system/cmdline_parser.py (+47 -18)

@@ -4,6 +4,7 @@
import re
from inspect import _empty, getfullargspec, signature
from typing import Callable, Dict, List, Optional, Set, Type, TypeVar
import colored as clrlib
from .table_fmt import table_fmt
@@ -40,24 +41,39 @@ def parse_cmdline(func: Callable[..., T], encoded_args: str) -> Optional[T]:
if k not in func_args}
if encoded_args == 'help' or len(unknown_args) > 0:
if len(unknown_args) > 0:
print('Unknown arguments found:')
print(clrlib.stylize('Unknown arguments found:', [
clrlib.fg('light_red'),
clrlib.attr('bold'),
]))
for k, v in unknown_args.items():
print(f' {k}: {repr(v)}')
print(clrlib.stylize(f' {k}: {repr(v)}', [
clrlib.fg('light_red'),
]))
print()
print(f'Usage help for: {func.__module__}.{func.__name__}')
print(clrlib.stylize(f'Usage help for: {func.__module__}.{func.__name__}', [
clrlib.fg('light_cyan'),
clrlib.attr('bold'),
]))
tbl = list()
for name, parameter in sig.parameters.items():
annotation = parameter.annotation if parameter.annotation != _empty else str
tbl.append((
str(name),
repr(annotation),
repr(parameter.default) if parameter.default != _empty else '-unset-',
))
print(table_fmt(
'name,type,default'.split(','),
tbl,
alignment='^'*3,
))
if len(sig.parameters) <= 0:
print(clrlib.stylize(' ' * 4 + 'No arguments accepted', [
clrlib.fg('light_cyan'),
]))
else:
for name, parameter in sig.parameters.items():
annotation = parameter.annotation if parameter.annotation != _empty else str
tbl.append((
str(name),
repr(annotation),
repr(parameter.default) if parameter.default != _empty else '-unset-',
))
print(clrlib.stylize(space_left_pad_text(4, table_fmt(
'name,type,default'.split(','),
tbl,
alignment='^'*3,
)), [
clrlib.fg('light_cyan'),
]))
return None
kwargs = dict()
for key in str_args:
@@ -65,12 +81,21 @@ def parse_cmdline(func: Callable[..., T], encoded_args: str) -> Optional[T]:
func_annotations.get(key, str),
str_args[key]
)
print(f'Calling {func.__module__}.{func.__name__} with arguments:')
print(clrlib.stylize(f'Calling {func.__module__}.{func.__name__} with arguments:', [
clrlib.fg('light_gray'),
clrlib.attr('dim'),
]))
if len(kwargs) <= 0:
print(' --- no arguments given ---')
print(clrlib.stylize(' --- no arguments given ---', [
clrlib.fg('light_gray'),
clrlib.attr('dim'),
]))
else:
for k, v in kwargs.items():
print(f' {k}: {repr(v)}')
print(clrlib.stylize(f' {k}: {repr(v)}