
big refactoring

Adler Neves 9 months ago
commit 4ce83f0c5f
42 changed files with 3189 additions and 354 deletions
  1. +3 -0 .gitignore
  2. +9 -0 hash_thumbnailer_distributed/.gitignore
  3. +17 -0 hash_thumbnailer_distributed/Makefile
  4. +21 -0 hash_thumbnailer_distributed/manage.py
  5. +0 -0 hash_thumbnailer_distributed/webproj/__init__.py
  6. +30 -0 hash_thumbnailer_distributed/webproj/adminModelRegister.py
  7. +16 -0 hash_thumbnailer_distributed/webproj/asgi.py
  8. +129 -0 hash_thumbnailer_distributed/webproj/settings.py
  9. +24 -0 hash_thumbnailer_distributed/webproj/stackOverflowSnippets.py
  10. +0 -0 hash_thumbnailer_distributed/webproj/thumbnailer/__init__.py
  11. +6 -0 hash_thumbnailer_distributed/webproj/thumbnailer/admin.py
  12. +5 -0 hash_thumbnailer_distributed/webproj/thumbnailer/apps.py
  13. +28 -0 hash_thumbnailer_distributed/webproj/thumbnailer/management/commands/dumpresults.py
  14. +33 -0 hash_thumbnailer_distributed/webproj/thumbnailer/management/commands/loadhashes.py
  15. +50 -0 hash_thumbnailer_distributed/webproj/thumbnailer/migrations/0001_initial.py
  16. +0 -0 hash_thumbnailer_distributed/webproj/thumbnailer/migrations/__init__.py
  17. +57 -0 hash_thumbnailer_distributed/webproj/thumbnailer/models.py
  18. +3 -0 hash_thumbnailer_distributed/webproj/thumbnailer/tests.py
  19. +13 -0 hash_thumbnailer_distributed/webproj/thumbnailer/urls.py
  20. +66 -0 hash_thumbnailer_distributed/webproj/thumbnailer/views.py
  21. +24 -0 hash_thumbnailer_distributed/webproj/urls.py
  22. +16 -0 hash_thumbnailer_distributed/webproj/wsgi.py
  23. +275 -0 hash_thumbnailer_distributed/worker.py
  24. +422 -0 hash_thumbnailer_distributed/worker_thumbnailer.py
  25. +7 -0 prunedownloads.py
  26. +238 -0 reddit_imgs/condensate_hashes.py
  27. +333 -104 reddit_imgs/display_fetch_futures.py
  28. +176 -0 reddit_imgs/download_pruner.py
  29. +31 -25 reddit_imgs/fetch.py
  30. +604 -144 reddit_imgs/fetch2.py
  31. +41 -0 reddit_imgs/get_firefox_cookies.sh
  32. +173 -0 reddit_imgs/hashit2.py
  33. +83 -55 reddit_imgs/runner.py
  34. +44 -0 reddit_imgs/suggest_subreddits_from_links.py
  35. +60 -12 reddit_imgs/sync.py
  36. +83 -0 reddit_imgs/system/cmdline_parser.py
  37. +17 -6 reddit_imgs/system/downloader/cache.py
  38. +13 -0 reddit_imgs/system/flattener.py
  39. +18 -0 reddit_imgs/system/format_file_size.py
  40. +2 -4 reddit_imgs/system/hexhashof.py
  41. +3 -3 reddit_imgs/system/table_fmt.py
  42. +16 -1 reddit_imgs/system/urlmatcher.py

+3 -0 .gitignore

@@ -11,6 +11,8 @@ w
w/**
i_gdl*
i_gdl*/**
r_gdl*
r_gdl*/**
i_c
i_c/**
i_c.json
@@ -19,6 +21,7 @@ fetch_missing.json
i_he.json
i_c_h.json
most_repeated_hashes.json
display_fetch_futures.trace
i_h
i_h/**
i_h.json


+9 -0 hash_thumbnailer_distributed/.gitignore

@@ -0,0 +1,9 @@
/.mypy_cache
/.mypy_cache/**
/.vscode
/.vscode/**
/static
/static/**
/db.sqlite3
/dumped*.json
/hashes*.txt

+17 -0 hash_thumbnailer_distributed/Makefile

@@ -0,0 +1,17 @@
devserver:
	-@mkdir -p static
	python manage.py makemigrations
	python manage.py migrate
	python manage.py createcachetable
	yes yes | python manage.py collectstatic
	python manage.py runserver 0.0.0.0:8000

server:
	python manage.py migrate
	uvicorn webproj.asgi:application --lifespan off --host 0.0.0.0 --workers 12

prepare:
	python manage.py loadhashes

dumpresults:
	python manage.py dumpresults
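
(Note: the devserver target pipes `yes` into collectstatic to auto-confirm static-file overwrites; the server target applies migrations and then serves the ASGI app with 12 uvicorn workers.)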

+21 -0 hash_thumbnailer_distributed/manage.py

@@ -0,0 +1,21 @@
#!/usr/bin/env python
"""Django's command-line utility for administrative tasks."""
import os
import sys


def main():
    os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'webproj.settings')
    try:
        from django.core.management import execute_from_command_line
    except ImportError as exc:
        raise ImportError(
            "Couldn't import Django. Are you sure it's installed and "
            "available on your PYTHONPATH environment variable? Did you "
            "forget to activate a virtual environment?"
        ) from exc
    execute_from_command_line(sys.argv)


if __name__ == '__main__':
    main()

+0 -0 hash_thumbnailer_distributed/webproj/__init__.py


+30 -0 hash_thumbnailer_distributed/webproj/adminModelRegister.py

@@ -0,0 +1,30 @@
from sys import stderr

from django.core import exceptions
from django.db import models

from .stackOverflowSnippets import classesInModule


def onlyModels(userMadeModels):
    return [model for model in userMadeModels if models.Model in model.__mro__]


def isAbstract(clazz):
    return clazz._meta.abstract


def discardAbstractModels(userMadeModels):
    return [model for model in userMadeModels if not isAbstract(model)]


def registrableModelsInModule(module):
    return discardAbstractModels(onlyModels(classesInModule(module)))


def registerForMe(admin, models_module):
    for model in registrableModelsInModule(models_module):
        try:
            admin.site.register(model)
        except exceptions.ImproperlyConfigured:
            pass
        except BaseException as e:
            print(str(e.__class__) + ': ' + str(e), file=stderr)

+16 -0 hash_thumbnailer_distributed/webproj/asgi.py

@@ -0,0 +1,16 @@
"""
ASGI config for webproj project.

It exposes the ASGI callable as a module-level variable named ``application``.

For more information on this file, see
https://docs.djangoproject.com/en/3.0/howto/deployment/asgi/
"""
import os

from django.core.asgi import get_asgi_application

os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'webproj.settings')

application = get_asgi_application()

+129 -0 hash_thumbnailer_distributed/webproj/settings.py

@@ -0,0 +1,129 @@
"""
Django settings for webproj project.

Generated by 'django-admin startproject' using Django 3.0.8.

For more information on this file, see
https://docs.djangoproject.com/en/3.0/topics/settings/

For the full list of settings and their values, see
https://docs.djangoproject.com/en/3.0/ref/settings/
"""
import os
from pathlib import Path

import psycopg2.extensions

# Build paths inside the project like this: os.path.join(BASE_DIR, ...)
BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))

# Quick-start development settings - unsuitable for production
# See https://docs.djangoproject.com/en/3.0/howto/deployment/checklist/

# SECURITY WARNING: keep the secret key used in production secret!
SECRET_KEY = 'o$fohw1$^ber9jn^t2l@+$a5kb5ys%xm^^9)0&#n9--zs5zc@k'

# SECURITY WARNING: don't run with debug turned on in production!
DEBUG = True

ALLOWED_HOSTS = ['*']

# Application definition
INSTALLED_APPS = [
    'django.contrib.admin',
    'django.contrib.auth',
    'django.contrib.contenttypes',
    'django.contrib.sessions',
    'django.contrib.messages',
    'django.contrib.staticfiles',
    'webproj.thumbnailer',
]

MIDDLEWARE = [
    'django.middleware.security.SecurityMiddleware',
    'django.contrib.sessions.middleware.SessionMiddleware',
    'django.middleware.common.CommonMiddleware',
    'django.middleware.csrf.CsrfViewMiddleware',
    'django.contrib.auth.middleware.AuthenticationMiddleware',
    'django.contrib.messages.middleware.MessageMiddleware',
    'django.middleware.clickjacking.XFrameOptionsMiddleware',
]

ROOT_URLCONF = 'webproj.urls'

TEMPLATES = [
    {
        'BACKEND': 'django.template.backends.django.DjangoTemplates',
        'DIRS': [],
        'APP_DIRS': True,
        'OPTIONS': {
            'context_processors': [
                'django.template.context_processors.debug',
                'django.template.context_processors.request',
                'django.contrib.auth.context_processors.auth',
                'django.contrib.messages.context_processors.messages',
            ],
        },
    },
]

WSGI_APPLICATION = 'webproj.wsgi.application'

# Database
# https://docs.djangoproject.com/en/3.0/ref/settings/#databases
DATABASES = {
    'default': {
        'ENGINE': 'django.db.backends.postgresql_psycopg2',
        'NAME': 'test',
        'USER': 'test',
        'PASSWORD': 'test',
        'HOST': 'localhost',
        'PORT': '5432',
    }
}

# Password validation
# https://docs.djangoproject.com/en/3.0/ref/settings/#auth-password-validators
AUTH_PASSWORD_VALIDATORS = [
    {
        'NAME': 'django.contrib.auth.password_validation.UserAttributeSimilarityValidator',
    },
    {
        'NAME': 'django.contrib.auth.password_validation.MinimumLengthValidator',
    },
    {
        'NAME': 'django.contrib.auth.password_validation.CommonPasswordValidator',
    },
    {
        'NAME': 'django.contrib.auth.password_validation.NumericPasswordValidator',
    },
]

# Internationalization
# https://docs.djangoproject.com/en/3.0/topics/i18n/
LANGUAGE_CODE = 'en-us'

TIME_ZONE = 'UTC'

USE_I18N = True

USE_L10N = True

USE_TZ = True

# Static files (CSS, JavaScript, Images)
# https://docs.djangoproject.com/en/3.0/howto/static-files/
STATIC_URL = '/static/'

STATIC_ROOT = str(Path(__file__).absolute().parent.parent.joinpath('static'))

+24 -0 hash_thumbnailer_distributed/webproj/stackOverflowSnippets.py

@@ -0,0 +1,24 @@
class StackOverflowCopypaste:
    __doc__ = None
    __author__ = None
    __license__ = 'CC BY-SA 3.0'

    def __str__(self): return str(self.__call__)

    def __call__(self, module): pass


class stackoverflow_a_21563930(StackOverflowCopypaste):
    __doc__ = 'https://stackoverflow.com/a/21563930'
    __author__ = 'piRSquared'

    def __call__(self, module):
        moduleDict = module.__dict__
        return [
            definedClass for definedClass in moduleDict.values() if (
                isinstance(definedClass, type)
                and definedClass.__module__ == module.__name__
            )
        ]


classesInModule = stackoverflow_a_21563930()
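
A minimal usage sketch (assumes a configured Django environment, since the target module defines models; any imported module works):

# Lists the classes defined directly in a module -- the same set that
# adminModelRegister.registerForMe() below filters and registers.
import webproj.thumbnailer.models as models_module
from webproj.stackOverflowSnippets import classesInModule

for clazz in classesInModule(models_module):
    print(clazz.__name__)  # e.g. Job, Log, ErrorLog, PerformanceLog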

+0 -0 hash_thumbnailer_distributed/webproj/thumbnailer/__init__.py


+6 -0 hash_thumbnailer_distributed/webproj/thumbnailer/admin.py

@@ -0,0 +1,6 @@
from django.contrib import admin

import webproj.thumbnailer.models
from webproj.adminModelRegister import registerForMe

registerForMe(admin, webproj.thumbnailer.models)

+5 -0 hash_thumbnailer_distributed/webproj/thumbnailer/apps.py

@@ -0,0 +1,5 @@
from django.apps import AppConfig


class ThumbnailerConfig(AppConfig):
    name = 'webproj.thumbnailer'

+28 -0 hash_thumbnailer_distributed/webproj/thumbnailer/management/commands/dumpresults.py

@@ -0,0 +1,28 @@
from pathlib import Path

from django.core.management.base import BaseCommand, no_translations
from django.core.serializers.json import DjangoJSONEncoder

from webproj.thumbnailer.models import (ErrorLog, Job, Log, PerformanceLog,
                                        ToJsonableMixin)


class Command(BaseCommand):
    def add_arguments(self, parser):
        return

    def handle(self, *args, **options):
        # print(sorted(dir(self.style)))
        self.stdout.write(self.style.HTTP_INFO('Reading table entries...'))
        jobs = list(map(ToJsonableMixin.to_jsonable_nested,
                        Job.objects.order_by('taken_at').all()))
        err_log = list(map(ToJsonableMixin.to_jsonable_nested,
                           ErrorLog.objects.order_by('id').all()))
        perf_log = list(map(ToJsonableMixin.to_jsonable_nested,
                            PerformanceLog.objects.order_by('id').all()))
        outdata = dict(jobs=jobs, logs=dict(error=err_log, performance=perf_log))
        self.stdout.write(self.style.HTTP_INFO('Serializing to JSON...'))
        jsoned = DjangoJSONEncoder(indent=1).encode(outdata)
        self.stdout.write(self.style.HTTP_INFO('Saving file...'))
        Path('dumped.json').write_text(jsoned)
        self.stdout.write(self.style.SUCCESS('Saved into "dumped.json".'))

+33 -0 hash_thumbnailer_distributed/webproj/thumbnailer/management/commands/loadhashes.py

@@ -0,0 +1,33 @@
from pathlib import Path

from django.core.management.base import BaseCommand, no_translations

from webproj.thumbnailer.models import ErrorLog, Job, Log, PerformanceLog


class Command(BaseCommand):
    def add_arguments(self, parser):
        return

    def handle(self, *args, **options):
        # print(sorted(dir(self.style)))
        self.stdout.write(self.style.HTTP_INFO(
            'Truncating all relevant tables...'))
        Job.truncate_table()
        ErrorLog.truncate_table()
        PerformanceLog.truncate_table()
        self.stdout.write(self.style.SUCCESS('Tables truncated...'))
        self.stdout.write(self.style.HTTP_INFO('Reading "hashes.txt"...'))
        hashes = dict(map(lambda a: a.split('|', 1),
                          Path('hashes.txt').read_text().splitlines()))
        hashes_len = len(hashes)
        self.stdout.write(self.style.HTTP_INFO('Adding entries to list...'))
        jobs = list()
        for seq, (hsh, file) in enumerate(hashes.items()):
            jobs.append(Job(hsh=hsh, file=file))
            if seq % 100000 == 0:
                print(f'Added {seq} of {hashes_len}')
        self.stdout.write(self.style.HTTP_INFO(
            'Bulk-creating jobs into database...'))
        Job.objects.bulk_create(jobs)
        self.stdout.write(self.style.SUCCESS('Hashes loaded.'))

+50 -0 hash_thumbnailer_distributed/webproj/thumbnailer/migrations/0001_initial.py

@@ -0,0 +1,50 @@
# Generated by Django 3.0.8 on 2020-07-18 13:16
from django.db import migrations, models

import webproj.thumbnailer.models


class Migration(migrations.Migration):

    initial = True

    dependencies = [
    ]

    operations = [
        migrations.CreateModel(
            name='ErrorLog',
            fields=[
                ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
                ('sender', models.TextField()),
                ('content', models.TextField()),
            ],
            options={
                'abstract': False,
            },
            bases=(models.Model, webproj.thumbnailer.models.TruncatableMixin, webproj.thumbnailer.models.ToJsonableMixin),
        ),
        migrations.CreateModel(
            name='Job',
            fields=[
                ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
                ('hsh', models.CharField(max_length=255)),
                ('file', models.TextField()),
                ('taken_at', models.DateTimeField(blank=True, default=None, null=True)),
                ('result', models.TextField(blank=True, default=None, null=True)),
            ],
            bases=(models.Model, webproj.thumbnailer.models.TruncatableMixin, webproj.thumbnailer.models.ToJsonableMixin),
        ),
        migrations.CreateModel(
            name='PerformanceLog',
            fields=[
                ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
                ('sender', models.TextField()),
                ('content', models.TextField()),
            ],
            options={
                'abstract': False,
            },
            bases=(models.Model, webproj.thumbnailer.models.TruncatableMixin, webproj.thumbnailer.models.ToJsonableMixin),
        ),
    ]

+0 -0 hash_thumbnailer_distributed/webproj/thumbnailer/migrations/__init__.py


+57 -0 hash_thumbnailer_distributed/webproj/thumbnailer/models.py

@@ -0,0 +1,57 @@
import json
from typing import List

from django.db import connection, models


# Create your models here.
class TruncatableMixin:
    @classmethod
    def truncate_table(cls):
        cursor = connection.cursor()
        cursor.execute('TRUNCATE TABLE "{0}"'.format(cls._meta.db_table))
        cursor.execute('ALTER SEQUENCE {0}_id_seq RESTART WITH 1'.format(
            cls._meta.db_table.lower()))


class ToJsonableMixin:
    NESTED_JSON_FIELDS: List[str] = []

    def to_jsonable(self):
        internal_dict = self.__dict__.copy()
        if '_state' in internal_dict:
            del internal_dict['_state']
        return internal_dict

    def to_jsonable_nested(self):
        data = self.to_jsonable()
        for nested_json_field in type(self).NESTED_JSON_FIELDS:
            field = data[nested_json_field]
            if field is not None:
                data[nested_json_field] = json.loads(data[nested_json_field])
        return data


class Job(models.Model, TruncatableMixin, ToJsonableMixin):
    NESTED_JSON_FIELDS = ['result']

    hsh = models.CharField(max_length=255, null=False, blank=False)
    file = models.TextField(null=False, blank=False)
    taken_at = models.DateTimeField(default=None, null=True, blank=True)
    result = models.TextField(default=None, null=True, blank=True)


class Log(models.Model, TruncatableMixin, ToJsonableMixin):
    NESTED_JSON_FIELDS = ['sender', 'content']

    sender = models.TextField(null=False, blank=False)
    content = models.TextField(null=False, blank=False)

    class Meta:
        abstract = True


class ErrorLog(Log):
    pass


class PerformanceLog(Log):
    pass
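
A minimal sketch of how the two mixins compose (assumes a configured Django environment and a reachable database; the hash and path below are made-up values):

from webproj.thumbnailer.models import Job

Job.truncate_table()  # empties the table and resets the id sequence
job = Job(hsh='d41d8cd98f00b204', file='i_gdl/example.jpg')  # hypothetical values
job.result = '{"status": "Complete"}'
print(job.to_jsonable_nested())  # 'result' comes back decoded as a dict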

+3 -0 hash_thumbnailer_distributed/webproj/thumbnailer/tests.py

@@ -0,0 +1,3 @@
from django.test import TestCase
# Create your tests here.

+13 -0 hash_thumbnailer_distributed/webproj/thumbnailer/urls.py

@@ -0,0 +1,13 @@
from pathlib import Path

from django.conf.urls.static import static
from django.urls import path

from . import views

urlpatterns = [
    path('job', views.JobsView.as_view(), name='jobs'),
    path('job/<int:job_id>', views.JobView.as_view(), name='job'),
    path('log/error', views.LogErrorView.as_view(), name='log_error'),
    path('log/performance', views.LogPerformanceView.as_view(), name='log_performance'),
] + static('i_gdl/',
           document_root=str(Path(__file__).absolute()
                             .parent.parent.parent.parent.joinpath('i_gdl')),
           show_indexes=True)

+66 -0 hash_thumbnailer_distributed/webproj/thumbnailer/views.py

@@ -0,0 +1,66 @@
import json

from django.http import HttpResponse, JsonResponse
from django.http.request import HttpRequest
from django.shortcuts import get_object_or_404, render
from django.utils import timezone
from django.utils.decorators import method_decorator
from django.views.decorators.csrf import csrf_exempt
from django.views.generic import View

from webproj.thumbnailer.models import (ErrorLog, Job, Log, PerformanceLog,
                                        ToJsonableMixin)


# Create your views here.
@method_decorator(csrf_exempt, name='dispatch')
class JobsView(View):
    def get(self, request: HttpRequest):
        job: Job = Job.objects.filter(taken_at=None, result=None).first()
        if job is None:
            job = Job.objects.filter(result=None).order_by('taken_at').first()
        if job is None:
            return HttpResponse('done')
        job.taken_at = timezone.now()
        job.save()
        return HttpResponse(f'{job.pk}')


@method_decorator(csrf_exempt, name='dispatch')
class JobView(View):
    def get(self, request: HttpRequest, job_id: int):
        job = get_object_or_404(Job, id=job_id)
        job.taken_at = timezone.now()
        job.save()
        return JsonResponse(job.to_jsonable(), safe=False)

    def post(self, request, job_id):
        job = get_object_or_404(Job, id=job_id)
        job.result = json.dumps(json.loads(request.body))
        job.save()
        return HttpResponse('ok')


class LogView(View):
    model = Log

    def post(self, request: HttpRequest):
        Logging = type(self).model
        logging = Logging()
        logging.sender = json.dumps(json.loads(request.POST['sender']))
        logging.content = json.dumps(json.loads(request.POST['content']))
        logging.save()
        return JsonResponse(logging.to_jsonable(), safe=False)

    def get(self, request: HttpRequest):
        return JsonResponse(
            list(map(ToJsonableMixin.to_jsonable, type(self).model.objects.all())),
            safe=False)


@method_decorator(csrf_exempt, name='dispatch')
class LogErrorView(LogView):
    model = ErrorLog


@method_decorator(csrf_exempt, name='dispatch')
class LogPerformanceView(LogView):
    model = PerformanceLog
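
A sketch of the claim/submit cycle these views implement, from a worker's point of view (the host is a made-up example; the routes come from thumbnailer/urls.py above):

import requests

base = 'http://localhost:8000/'
job_id = requests.get(base + 'job').text           # claim: oldest unfinished job, or 'done'
job = requests.get(f'{base}job/{job_id}').json()   # fetch the job row as JSON
requests.post(f'{base}job/{job_id}',               # store the result payload on the job
              json={'status': 'Complete'})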

+24 -0 hash_thumbnailer_distributed/webproj/urls.py

@@ -0,0 +1,24 @@
"""webproj URL Configuration

The `urlpatterns` list routes URLs to views. For more information please see:
    https://docs.djangoproject.com/en/3.0/topics/http/urls/
Examples:
Function views
    1. Add an import:  from my_app import views
    2. Add a URL to urlpatterns:  path('', views.home, name='home')
Class-based views
    1. Add an import:  from other_app.views import Home
    2. Add a URL to urlpatterns:  path('', Home.as_view(), name='home')
Including another URLconf
    1. Import the include() function: from django.urls import include, path
    2. Add a URL to urlpatterns:  path('blog/', include('blog.urls'))
"""
from django.contrib import admin
from django.urls import include, path

import webproj.thumbnailer.urls

urlpatterns = [
    path('admin/', admin.site.urls),
    path('', include(webproj.thumbnailer.urls)),
]

+16 -0 hash_thumbnailer_distributed/webproj/wsgi.py

@@ -0,0 +1,16 @@
"""
WSGI config for webproj project.

It exposes the WSGI callable as a module-level variable named ``application``.

For more information on this file, see
https://docs.djangoproject.com/en/3.0/howto/deployment/wsgi/
"""
import os

from django.core.wsgi import get_wsgi_application

os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'webproj.settings')

application = get_wsgi_application()

+275 -0 hash_thumbnailer_distributed/worker.py

@@ -0,0 +1,275 @@
#!/usr/bin/env python3
# -*- encoding: utf-8 -*-
import bz2
import hashlib
import json
import lzma
import multiprocessing
import socket
import sys
import time
import traceback
import zlib
from concurrent.futures import Future, ProcessPoolExecutor
from io import BytesIO, StringIO
from pathlib import Path
from typing import Any, Callable, Dict, List, Tuple, Type

import requests
import zstandard as zstd

COMPRESSORS: List[Tuple[str, Callable[[bytes], bytes]]] = [
    ('uncompressed', lambda uncompressed: uncompressed),
    ('zlib', lambda uncompressed: zlib.compress(uncompressed, 9)),
    ('bz2', lambda uncompressed: bz2.compress(uncompressed, 9)),
    ('lzma', lzma.compress),
    ('zstd', lambda uncompressed: zstd.ZstdCompressor(
        level=22).compress(uncompressed)),
]


class HashMismatch(Exception):
    pass


def check_best_compressions(uncompressed: bytes) -> Dict[str, Tuple[int, float]]:
    algos = dict()
    for name, callback in COMPRESSORS:
        time_start = time.time()
        compressed_size = len(callback(uncompressed))
        time_end = time.time()
        algos[name] = (compressed_size, time_end - time_start)
    return algos
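
A usage sketch: rank the compressors by compressed size for one payload (the input file is a stand-in):

sample = Path('example.bin').read_bytes()  # hypothetical input file
for name, (size, seconds) in sorted(check_best_compressions(sample).items(),
                                    key=lambda kv: kv[1][0]):
    print(f'{name}: {size} bytes in {seconds:.3f}s')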
def hexhashof(bts: bytes, using: Callable[[], Any]) -> str:
    m = using()
    m.update(bts)
    return m.hexdigest()


def upload_log(url, sender, **content):
    sent = False
    while not sent:
        try:
            requests.post(url, data={
                'sender': json.dumps(sender),
                'content': json.dumps(content),
            }).raise_for_status()
            sent = True
        except:
            traceback.print_exc()


def upload_job(url, **content):
    sent = False
    while not sent:
        try:
            requests.post(url, json=content).raise_for_status()
            sent = True
        except:
            traceback.print_exc()


def do_work(base_address: str, worker_id: str):
    while True:
        try:
            tick_set = time.time()
            job_id = None
            try:
                with requests.get(f'{base_address}job') as response:
                    response.raise_for_status()
                    job_id = response.text
            except KeyboardInterrupt:
                raise
            except:
                pass
            if job_id is None:
                continue
            elif job_id == 'done':
                break
            else:
                tick_downloading_job_started = time.time()
                tick_downloading_job_retry_started = tick_downloading_job_started
                tick_downloading_job_retry_count = 0
                job = None
                while job is None:
                    try:
                        tick_downloading_job_retry_started = time.time()
                        with requests.get(f'{base_address}job/{job_id}') as response:
                            response.raise_for_status()
                            job = response.json()
                    except KeyboardInterrupt:
                        raise
                    except:
                        tick_downloading_job_retry_count += 1
                        sio = StringIO()
                        traceback.print_exc(file=sio)
                        formatted_exception = sio.getvalue()
                        print(formatted_exception, file=sys.stderr)
                        upload_log(
                            f'{base_address}log/error',
                            worker_id,
                            during='JobDownload',
                            tick_set=tick_set,
                            traceback=formatted_exception,
                            job_id=job_id,
                            tick_downloading_job_started=tick_downloading_job_started,
                            tick_downloading_job_retry_started=tick_downloading_job_retry_started,
                            tick_downloading_job_retry_count=tick_downloading_job_retry_count,
                        )
                tick_downloading_job_ended = time.time()
                tick_downloading_image_started = time.time()
                tick_downloading_image_retry_started = tick_downloading_image_started
                tick_downloading_image_retry_count = 0
                tick_downloading_image_retry_mismatch = 0
                image_bytes = None
                while image_bytes is None:
                    try:
                        tick_downloading_job_retry_started = time.time()
                        with requests.get(f'{base_address}{job["file"]}') as response:
                            if response.status_code == 404:
                                break
                            response.raise_for_status()
                            response.raw.decode_content = True
                            if hexhashof(response.content, hashlib.sha256) == job['hsh']:
                                image_bytes = response.content
                            else:
                                raise HashMismatch()
                    except KeyboardInterrupt:
                        raise
                    except BaseException as exception:
                        tick_downloading_image_retry_count += 1
                        if isinstance(exception, HashMismatch):
                            tick_downloading_image_retry_mismatch += 1
                        sio = StringIO()
                        traceback.print_exc(file=sio)
                        formatted_exception = sio.getvalue()
                        print(formatted_exception, file=sys.stderr)
                        upload_log(
                            f'{base_address}log/error',
                            worker_id,
                            during='ImageDownload',
                            tick_set=tick_set,
                            traceback=formatted_exception,
                            job_id=job_id,
                            file=job["file"],
                            hash=job["hsh"],
                            tick_downloading_job_started=tick_downloading_job_started,
                            tick_downloading_job_retry_started=tick_downloading_job_retry_started,
                            tick_downloading_job_retry_count=tick_downloading_job_retry_count,
                            tick_downloading_job_ended=tick_downloading_job_ended,
                            tick_downloading_image_started=tick_downloading_image_started,
                            tick_downloading_image_retry_started=tick_downloading_image_retry_started,
                            tick_downloading_image_retry_count=tick_downloading_image_retry_count,
                            tick_downloading_image_retry_mismatch=tick_downloading_image_retry_mismatch,
                        )
                        if tick_downloading_image_retry_mismatch >= 10:
                            break
                tick_downloading_image_ended = time.time()
                if image_bytes is None:
                    upload_job(
                        f'{base_address}job/{job_id}',
                        status='NoValidImageData',
                        tick_set=tick_set,
                        job_id=job_id,
                        file=job["file"],
                        hash=job["hsh"],
                        tick_downloading_job_started=tick_downloading_job_started,
                        tick_downloading_job_retry_started=tick_downloading_job_retry_started,
                        tick_downloading_job_retry_count=tick_downloading_job_retry_count,
                        tick_downloading_job_ended=tick_downloading_job_ended,
                        tick_downloading_image_started=tick_downloading_image_started,
                        tick_downloading_image_retry_started=tick_downloading_image_retry_started,
                        tick_downloading_image_retry_count=tick_downloading_image_retry_count,
                        tick_downloading_image_retry_mismatch=tick_downloading_image_retry_mismatch,
                        tick_downloading_image_ended=tick_downloading_image_ended,
                    )
                else:
                    tick_image_compress_start = time.time()
                    compressions = check_best_compressions(image_bytes)
                    tick_image_compress_ended = time.time()
                    tick_uploading_started = time.time()
                    upload_job(
                        f'{base_address}job/{job_id}',
                        status='Complete',
                        worker=worker_id,
                        tick_set=tick_set,
                        job_id=job_id,
                        file=job["file"],
                        hash=job["hsh"],
                        compressions=compressions,
                        tick_downloading_job_started=tick_downloading_job_started,
                        tick_downloading_job_retry_started=tick_downloading_job_retry_started,
                        tick_downloading_job_retry_count=tick_downloading_job_retry_count,
                        tick_downloading_job_ended=tick_downloading_job_ended,
                        tick_downloading_image_started=tick_downloading_image_started,
                        tick_downloading_image_retry_started=tick_downloading_image_retry_started,
                        tick_downloading_image_retry_count=tick_downloading_image_retry_count,
                        tick_downloading_image_retry_mismatch=tick_downloading_image_retry_mismatch,
                        tick_downloading_image_ended=tick_downloading_image_ended,
                        tick_image_compress_start=tick_image_compress_start,
                        tick_image_compress_ended=tick_image_compress_ended,
                    )
                    tick_uploading_ended = time.time()
                    upload_log(
                        f'{base_address}log/performance',
                        worker_id,
                        file=job["file"],
                        hash=job["hsh"],
                        tick_downloading_job_started=tick_downloading_job_started,
                        tick_downloading_job_retry_started=tick_downloading_job_retry_started,
                        tick_downloading_job_retry_count=tick_downloading_job_retry_count,
                        tick_downloading_job_ended=tick_downloading_job_ended,
                        tick_downloading_image_started=tick_downloading_image_started,
                        tick_downloading_image_retry_started=tick_downloading_image_retry_started,
                        tick_downloading_image_retry_count=tick_downloading_image_retry_count,
                        tick_downloading_image_retry_mismatch=tick_downloading_image_retry_mismatch,
                        tick_downloading_image_ended=tick_downloading_image_ended,
                        tick_image_compress_start=tick_image_compress_start,
                        tick_image_compress_ended=tick_image_compress_ended,
                        tick_uploading_started=tick_uploading_started,
                        tick_uploading_ended=tick_uploading_ended,
                    )
                    tick_uploading_ended = time.time()
                    print(f"Done: {job['hsh']}")
        except KeyboardInterrupt:
            raise
        except:
            raise


def kickstart(base_address: str):
    job_count: int = multiprocessing.cpu_count() * 2
    # job_count = 1
    hostname: str = socket.gethostname()
    with ProcessPoolExecutor(job_count) as pe:
        for job_seq in range(job_count):
            job_id = f'{hostname}-{job_seq}'

            def on_completed(job: Future):
                job.result()

            pe.submit(
                do_work,
                worker_id=job_id,
                base_address=base_address,
            ).add_done_callback(on_completed)
    print('Ready')


def main():
    if len(sys.argv) == 2:
        base_address = sys.argv[1]
        if not base_address.startswith('http'):
            base_address = 'http://' + base_address
        if not base_address.endswith('/'):
            base_address += '/'
        kickstart(base_address)
    else:
        print(f'Usage:\n  {sys.argv[0]} <ip_address:port>')


if __name__ == "__main__":
    main()
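
(Usage, per the argument handling above: `python worker.py 192.168.0.10:8000`, where the address is whichever host serves the Django project; the scheme and trailing slash are filled in automatically. The host here is a made-up example.)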

+422 -0 hash_thumbnailer_distributed/worker_thumbnailer.py

@@ -0,0 +1,422 @@
#!/usr/bin/env python3
# -*- encoding: utf-8 -*-
import hashlib
import json
import multiprocessing
import socket
import sys
import time
import traceback
from concurrent.futures import Future, ProcessPoolExecutor
from io import BytesIO, StringIO
from pathlib import Path
from typing import Any, Callable, Dict, List, Tuple, Type

import requests
from PIL import Image, ImageDraw, ImageOps


class HashMismatch(Exception):
    pass


START = -(sys.maxsize // 2) + 0
END = -(sys.maxsize // 2) + 1
CENTER = -(sys.maxsize // 2) + 2

POS_TOP_LEFT = (START, START)
POS_CENTER_LEFT = (START, CENTER)
POS_BOTTOM_LEFT = (START, END)
POS_TOP_CENTER = (CENTER, START)
POS_CENTER_CENTER = (CENTER, CENTER)
POS_BOTTOM_CENTER = (CENTER, END)
POS_TOP_RIGHT = (END, START)
POS_CENTER_RIGHT = (END, CENTER)
POS_BOTTOM_RIGHT = (END, END)

P_TL = POS_TOP_LEFT
P_CL = POS_CENTER_LEFT
P_BL = POS_BOTTOM_LEFT
P_TC = POS_TOP_CENTER
P_CC = POS_CENTER_CENTER
P_BC = POS_BOTTOM_CENTER
P_TR = POS_TOP_RIGHT
P_CR = POS_CENTER_RIGHT
P_BR = POS_BOTTOM_RIGHT


def edge_propagation_scaling(image: Image.Image,
                             desired_size: Tuple[int, int],
                             paste_position: Tuple[int, int] = P_CC,
                             ) -> Image.Image:
    image = image.copy()
    scaled = Image.new(image.mode, desired_size)
    new_placement_: List[int] = list()
    for isz, ssz, pp in zip(image.size, scaled.size, paste_position,):
        if pp == START:
            new_placement_.append(0)
        elif pp == END:
            new_placement_.append(ssz - isz)
        elif pp == CENTER:
            new_placement_.append((ssz - isz) // 2)
        else:
            new_placement_.append(pp)
    new_placement: Tuple[int, int] = new_placement_[0], new_placement_[1]
    del new_placement_
    scaled.paste(image, new_placement)
    parts = dict(
        # left, upper, right, lower
        t=image.copy().crop(
            (0, 0, image.size[0], 1)),
        b=image.copy().crop(
            (0, image.size[1] - 1, image.size[0], image.size[1])),
        l=image.copy().crop(
            (0, 0, 1, image.size[1])),
        r=image.copy().crop(
            (image.size[0] - 1, 0, image.size[0], image.size[1])),
    )
    if (sz := new_placement[1]) > 0:
        part = parts['t'].copy()
        resized = part.resize((part.size[0], sz))
        scaled.paste(resized, (new_placement[0], 0))
    if (sz := scaled.size[1] - (dsp := new_placement[1] + image.size[1])) > 0:
        part = parts['b'].copy()
        resized = part.resize((part.size[0], sz))
        scaled.paste(resized, (new_placement[0], dsp))
    if (sz := new_placement[0]) > 0:
        part = parts['l'].copy()
        resized = part.resize((sz, part.size[1]))
        scaled.paste(resized, (0, new_placement[1]))
    if (sz := scaled.size[0] - (dsp := new_placement[0] + image.size[0])) > 0:
        part = parts['r'].copy()
        resized = part.resize((sz, part.size[1]))
        scaled.paste(resized, (dsp, new_placement[1]))
    del parts
    corners = dict(
        tl=image.getpixel((0, 0)),
        tr=image.getpixel((image.size[0] - 1, 0)),
        bl=image.getpixel((0, image.size[1] - 1)),
        br=image.getpixel((image.size[0] - 1, image.size[1] - 1)),
    )
    draw: ImageDraw.ImageDraw = ImageDraw.Draw(scaled)
    szt = new_placement[1]
    szb = scaled.size[1] - (dspb := new_placement[1] + image.size[1])
    szl = new_placement[0]
    szr = scaled.size[0] - (dspr := new_placement[0] + image.size[0])
    if szt > 0 and szl > 0:
        draw.rectangle(((0, 0), (szl - 1, szt - 1)), corners['tl'])
    if szt > 0 and szr > 0:
        draw.rectangle(((dspr, 0), (scaled.size[0], szt - 1)), corners['tr'])
    if szb > 0 and szl > 0:
        draw.rectangle(((0, dspb), (szl - 1, scaled.size[1])), corners['bl'])
    if szb > 0 and szr > 0:
        draw.rectangle(((dspr, dspb), scaled.size), corners['br'])
    del dspr
    del dspb
    del szt
    del szb
    del szl
    del szr
    return scaled
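
A small sketch of the intended effect (values are arbitrary): the source image is pasted onto a larger canvas, its border rows and columns are stretched outward to fill the margins, and the corner regions are flood-filled with the corner pixels.

from PIL import Image

img = Image.new('RGB', (2, 2), '#FF0000')    # hypothetical 2x2 source
padded = edge_propagation_scaling(img, (6, 6))
assert padded.size == (6, 6)                 # content centered, edges propagated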
def calculate_thumbnail_hashes(image: Image.Image) -> Dict[str, Dict[str, Dict[str, Dict[str, str]]]]:
    out_dict: Dict[str, Dict[str, Dict[str, Dict[str, str]]]] = dict()
    max_dimen = max(image.size)
    for filling in [True, False]:
        transparent_square = None
        if filling:
            transparent_square = edge_propagation_scaling(
                image, (max_dimen, max_dimen))
        else:
            transparent_square = Image.new('RGBA', (max_dimen, max_dimen))
            transparent_square.paste(image, (
                (max_dimen - image.size[0]) // 2,
                (max_dimen - image.size[1]) // 2,
            ))
        backgrounds: Dict[str, Dict[str, Dict[str, str]]] = dict()
        for background in ['#000000', '#FFFFFF']:
            backgrounded = Image.new('RGB', transparent_square.size, background)
            backgrounded.paste(transparent_square)
            sizes: Dict[str, Dict[str, str]] = dict()
            for size in [4, 8, 16, 24, 32, 48, 64, 72, 96, 128]:
                resized = backgrounded.copy()
                resized = resized.resize((size, size))
                bit_depths: Dict[str, str] = dict()
                for bit_depth in range(1, 9):
                    posterized: Image.Image = resized.copy()
                    posterized = ImageOps.posterize(posterized, bit_depth)
                    bio = BytesIO()
                    posterized.save(bio, format='BMP')
                    hashsum = hexhashof(bio.getvalue(), hashlib.md5)
                    bit_depths[str(bit_depth)] = hashsum
                sizes[str(size)] = bit_depths
            backgrounds[background] = sizes
        out_dict['fill' if filling else 'center'] = backgrounds
    return out_dict
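
The nested result is keyed as fill mode -> background color -> thumbnail size -> bit depth, each leaf an MD5 hex digest of the posterized BMP. A sketch (the input image is arbitrary):

from PIL import Image

hashes = calculate_thumbnail_hashes(Image.new('RGBA', (10, 20)))
print(hashes['fill']['#000000']['32']['8'])  # one of 2*2*10*8 = 320 digests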
def hexhashof(bts: bytes, using: Callable[[], Any]) -> str:
    m = using()
    m.update(bts)
    return m.hexdigest()


def upload_log(url, sender, **content):
    sent = False
    while not sent:
        try:
            requests.post(url, data={
                'sender': json.dumps(sender),
                'content': json.dumps(content),
            }).raise_for_status()
            sent = True
        except:
            traceback.print_exc()


def upload_job(url, **content):
    sent = False
    while not sent:
        try:
            requests.post(url, json=content).raise_for_status()
            sent = True
        except:
            traceback.print_exc()


def do_work(base_address: str, worker_id: str):
    while True:
        try:
            tick_set = time.time()
            job_id = None
            try:
                job_id = requests.get(f'{base_address}job').text
            except KeyboardInterrupt:
                raise
            except:
                pass
            if job_id is None:
                continue
            elif job_id == 'done':
                break
            else:
                tick_downloading_job_started = time.time()
                tick_downloading_job_retry_started = tick_downloading_job_started
                tick_downloading_job_retry_count = 0
                job = None
                while job is None:
                    try:
                        tick_downloading_job_retry_started = time.time()
                        with requests.get(f'{base_address}job/{job_id}') as response:
                            response.raise_for_status()
                            job = response.json()
                    except KeyboardInterrupt:
                        raise
                    except:
                        tick_downloading_job_retry_count += 1
                        sio = StringIO()
                        traceback.print_exc(file=sio)
                        formatted_exception = sio.getvalue()
                        print(formatted_exception, file=sys.stderr)
                        upload_log(
                            f'{base_address}log/error',
                            worker_id,
                            during='JobDownload',
                            tick_set=tick_set,
                            traceback=formatted_exception,
                            job_id=job_id,
                            tick_downloading_job_started=tick_downloading_job_started,
                            tick_downloading_job_retry_started=tick_downloading_job_retry_started,
                            tick_downloading_job_retry_count=tick_downloading_job_retry_count,
                        )
                tick_downloading_job_ended = time.time()
                tick_downloading_image_started = time.time()
                tick_downloading_image_retry_started = tick_downloading_image_started
                tick_downloading_image_retry_count = 0
                tick_downloading_image_retry_mismatch = 0
                image_bytes = None
                while image_bytes is None:
                    try:
                        tick_downloading_job_retry_started = time.time()
                        with requests.get(f'{base_address}{job["file"]}') as response:
                            if response.status_code == 404:
                                break
                            response.raise_for_status()
                            response.raw.decode_content = True
                            if hexhashof(response.content, hashlib.sha256) == job['hsh']:
                                image_bytes = response.content
                            else:
                                raise HashMismatch()
                    except KeyboardInterrupt:
                        raise
                    except BaseException as exception:
                        tick_downloading_image_retry_count += 1
                        if isinstance(exception, HashMismatch):
                            tick_downloading_image_retry_mismatch += 1
                        sio = StringIO()
                        traceback.print_exc(file=sio)
                        formatted_exception = sio.getvalue()
                        print(formatted_exception, file=sys.stderr)
                        upload_log(
                            f'{base_address}log/error',
                            worker_id,
                            during='ImageDownload',
                            tick_set=tick_set,
                            traceback=formatted_exception,
                            job_id=job_id,
                            file=job["file"],
                            hash=job["hsh"],
                            tick_downloading_job_started=tick_downloading_job_started,
                            tick_downloading_job_retry_started=tick_downloading_job_retry_started,
                            tick_downloading_job_retry_count=tick_downloading_job_retry_count,
                            tick_downloading_job_ended=tick_downloading_job_ended,
                            tick_downloading_image_started=tick_downloading_image_started,
                            tick_downloading_image_retry_started=tick_downloading_image_retry_started,
                            tick_downloading_image_retry_count=tick_downloading_image_retry_count,
                            tick_downloading_image_retry_mismatch=tick_downloading_image_retry_mismatch,
                        )
                        if tick_downloading_image_retry_mismatch >= 10:
                            break
                tick_downloading_image_ended = time.time()
                if image_bytes is None:
                    upload_job(
                        f'{base_address}job/{job_id}',
                        status='NoValidImageData',
                        tick_set=tick_set,
                        job_id=job_id,
                        file=job["file"],
                        hash=job["hsh"],
                        tick_downloading_job_started=tick_downloading_job_started,
                        tick_downloading_job_retry_started=tick_downloading_job_retry_started,
                        tick_downloading_job_retry_count=tick_downloading_job_retry_count,
                        tick_downloading_job_ended=tick_downloading_job_ended,
                        tick_downloading_image_started=tick_downloading_image_started,
                        tick_downloading_image_retry_started=tick_downloading_image_retry_started,
                        tick_downloading_image_retry_count=tick_downloading_image_retry_count,
                        tick_downloading_image_retry_mismatch=tick_downloading_image_retry_mismatch,
                        tick_downloading_image_ended=tick_downloading_image_ended,
                    )
                else:
                    tick_image_decoding_start = time.time()
                    image = None
                    try:
                        image = Image.open(BytesIO(image_bytes)).copy()
                    except KeyboardInterrupt:
                        raise
                    except:
                        pass
                    tick_image_decoding_ended = time.time()
                    if image is None:
                        upload_job(
                            f'{base_address}job/{job_id}',
                            status='ImageIsBroken',
                            tick_set=tick_set,
                            job_id=job_id,
                            file=job["file"],
                            hash=job["hsh"],
                            tick_downloading_job_started=tick_downloading_job_started,
                            tick_downloading_job_retry_started=tick_downloading_job_retry_started,
                            tick_downloading_job_retry_count=tick_downloading_job_retry_count,
                            tick_downloading_job_ended=tick_downloading_job_ended,
                            tick_downloading_image_started=tick_downloading_image_started,
                            tick_downloading_image_retry_started=tick_downloading_image_retry_started,
                            tick_downloading_image_retry_count=tick_downloading_image_retry_count,
                            tick_downloading_image_retry_mismatch=tick_downloading_image_retry_mismatch,
                            tick_downloading_image_ended=tick_downloading_image_ended,
                            tick_image_decoding_start=tick_image_decoding_start,
                            tick_image_decoding_ended=tick_image_decoding_ended,
                        )
                    else:
                        tick_image_thumbnailing_start = time.time()
                        calculated_thumbnail_hashes = calculate_thumbnail_hashes(image)
                        tick_image_thumbnailing_ended = time.time()
                        tick_uploading_started = time.time()
                        upload_job(
                            f'{base_address}job/{job_id}',
                            status='Complete',
                            tick_set=tick_set,
                            job_id=job_id,
                            file=job["file"],
                            hash=job["hsh"],
                            calculated_thumbnail_hashes=calculated_thumbnail_hashes,
                            tick_downloading_job_started=tick_downloading_job_started,
                            tick_downloading_job_retry_started=tick_downloading_job_retry_started,
                            tick_downloading_job_retry_count=tick_downloading_job_retry_count,
                            tick_downloading_job_ended=tick_downloading_job_ended,
                            tick_downloading_image_started=tick_downloading_image_started,
                            tick_downloading_image_retry_started=tick_downloading_image_retry_started,
                            tick_downloading_image_retry_count=tick_downloading_image_retry_count,
                            tick_downloading_image_retry_mismatch=tick_downloading_image_retry_mismatch,
                            tick_downloading_image_ended=tick_downloading_image_ended,
                            tick_image_decoding_start=tick_image_decoding_start,
                            tick_image_decoding_ended=tick_image_decoding_ended,
                            tick_image_thumbnailing_start=tick_image_thumbnailing_start,
                            tick_image_thumbnailing_ended=tick_image_thumbnailing_ended,
                        )
                        tick_uploading_ended = time.time()
                        upload_log(
                            f'{base_address}log/performance',
                            worker_id,
                            file=job["file"],
                            hash=job["hsh"],
                            tick_downloading_job_started=tick_downloading_job_started,
                            tick_downloading_job_retry_started=tick_downloading_job_retry_started,
                            tick_downloading_job_retry_count=tick_downloading_job_retry_count,
                            tick_downloading_job_ended=tick_downloading_job_ended,
                            tick_downloading_image_started=tick_downloading_image_started,
                            tick_downloading_image_retry_started=tick_downloading_image_retry_started,
                            tick_downloading_image_retry_count=tick_downloading_image_retry_count,
                            tick_downloading_image_retry_mismatch=tick_downloading_image_retry_mismatch,
                            tick_downloading_image_ended=tick_downloading_image_ended,
                            tick_image_decoding_start=tick_image_decoding_start,
                            tick_image_decoding_ended=tick_image_decoding_ended,
                            tick_image_thumbnailing_start=tick_image_thumbnailing_start,
                            tick_image_thumbnailing_ended=tick_image_thumbnailing_ended,
                            tick_uploading_started=tick_uploading_started,
                            tick_uploading_ended=tick_uploading_ended,
                        )
                        tick_uploading_ended = time.time()
                        print(f"Done: {job['hsh']}")
        except KeyboardInterrupt:
            raise
        except:
            raise


def kickstart(base_address: str):
    job_count: int = multiprocessing.cpu_count() * 2
    # job_count = 1
    hostname: str = socket.gethostname()
    with ProcessPoolExecutor(job_count) as pe:
        for job_seq in range(job_count):
            job_id = f'{hostname}-{job_seq}'

            def on_completed(job: Future):
                job.result()

            pe.submit(
                do_work,
                worker_id=job_id,
                base_address=base_address,
            ).add_done_callback(on_completed)


def main():
    if len(sys.argv) == 2:
        base_address = sys.argv[1]
        if not base_address.startswith('http'):
            base_address = 'http://' + base_address
        if not base_address.endswith('/'):
            base_address += '/'
        kickstart(base_address)
    else:
        print(f'Usage:\n  {sys.argv[0]} <ip_address:port>')


if __name__ == "__main__":
    main()

+7 -0 prunedownloads.py

@@ -0,0 +1,7 @@
#!/usr/bin/env python3
# -*- encoding: utf-8 -*-
from reddit_imgs.download_pruner import main

if __name__ == '__main__':
    main()

+238 -0 reddit_imgs/condensate_hashes.py

@@ -0,0 +1,238 @@
#!/usr/bin/env python3
# -*- encoding: utf-8 -*-
import datetime
import json
import multiprocessing
from pathlib import Path
from typing import Any, Collection, Dict, FrozenSet, List, Optional, Tuple

import colored as clrlib

from .system.cmdline_parser import parse_cmdline
from .system.flattener import flatten_generator
from .system.format_file_size import format_power10

HISTORICAL_EXPORT = False
EXTENSION_FILTER = None
NSFW_NESS_FILTER = None
SUBREDDIT_FILTER = None


def cmdline(encoded_args: str = None):
    if encoded_args is None:
        return run_with_config()
    else:
        return parse_cmdline(run_with_config, encoded_args)


def run_with_config(historical: bool = False,
                    nsfw_ness_filter: bool = None,
                    extension_filter: list = None,
                    subreddit_filter: frozenset = None):
    global HISTORICAL_EXPORT
    global EXTENSION_FILTER
    global NSFW_NESS_FILTER
    global SUBREDDIT_FILTER
    EXTENSION_FILTER = extension_filter
    HISTORICAL_EXPORT = historical
    NSFW_NESS_FILTER = nsfw_ness_filter
    SUBREDDIT_FILTER = (None
                        if subreddit_filter is None else
                        frozenset(sr.lower() for sr in subreddit_filter))
    return main()


class ExtensionFilter(multiprocessing.Process):
    def __init__(self,
                 allowed_extensions: Collection[str],
                 input_queue: multiprocessing.Queue,
                 output_queue: multiprocessing.Queue):
        multiprocessing.Process.__init__(self)
        self.allowed_extensions = allowed_extensions
        self.input_queue = input_queue
        self.output_queue = output_queue

    def run(self):
        while True:
            next_item: str = self.input_queue.get()
            if next_item is None:
                self.output_queue.put(None)
                break
            if self.allowed_extensions is None:
                self.output_queue.put(next_item)
            elif Path(next_item).suffix in self.allowed_extensions:
                self.output_queue.put(next_item)


class NsfwNessFilter(multiprocessing.Process):
    def __init__(self,
                 nsfw_ness: Optional[bool],
                 file2link: Dict[str, List[str]],
                 link2post: Dict[str, List[str]],
                 post_info: Dict[str, Dict[str, Any]],
                 input_queue: multiprocessing.Queue,
                 output_queue: multiprocessing.Queue):
        multiprocessing.Process.__init__(self)
        self.nsfw_ness = nsfw_ness
        self.file2link = file2link
        self.link2post = link2post
        self.post_info = post_info
        self.input_queue = input_queue
        self.output_queue = output_queue

    def run(self):
        while True:
            next_item: str = self.input_queue.get()
            if next_item is None:
                self.output_queue.put(None)
                break
            if (
                    (self.nsfw_ness is None)
                    or
                    all(map(
                        lambda post: (
                            self
                            .post_info
                            .get(post, {})
                            .get('nsfw', None)
                            ==
                            self.nsfw_ness
                        ),
                        flatten_generator(map(
                            lambda link: self.link2post.get(link, []),
                            self.file2link.get(next_item, [])
                        ))
                    ))
            ):
                self.output_queue.put(next_item)


class SubredditFilter(multiprocessing.Process):
    def __init__(self,
                 subreddits: Optional[FrozenSet[str]],
                 file2link: Dict[str, List[str]],
                 link2post: Dict[str, List[str]],
                 post_info: Dict[str, Dict[str, Any]],
                 input_queue: multiprocessing.Queue,
                 output_queue: multiprocessing.Queue):
        multiprocessing.Process.__init__(self)
        self.subreddits = subreddits
        self.file2link = file2link
        self.link2post = link2post
        self.post_info = post_info
        self.input_queue = input_queue
        self.output_queue = output_queue

    def run(self):
        while True:
            next_item: str = self.input_queue.get()
            if next_item is None:
                self.output_queue.put(None)
                break
            if (
                    (self.subreddits is None)
                    or
                    all(map(
                        lambda subreddit: subreddit in self.subreddits,
                        flatten_generator(map(
                            lambda post: (
                                self
                                .post_info
                                .get(post, {})
                                .get('subreddits', [])
                            ),
                            flatten_generator(map(
                                lambda link: self.link2post.get(link, []),
                                self.file2link.get(next_item, [])
                            ))
                        ))
                    ))
            ):
                self.output_queue.put(next_item)


class FileExistsFilter(multiprocessing.Process):
    def __init__(self,
                 input_queue: multiprocessing.Queue,
                 output_queue: multiprocessing.Queue):
        multiprocessing.Process.__init__(self)
        self.input_queue = input_queue
        self.output_queue = output_queue

    def run(self):
        while True:
            next_item: str = self.input_queue.get()
            if next_item is None:
                self.output_queue.put(None)
                break
            if Path(next_item).exists():
                self.output_queue.put(next_item)


def main():
    exported_name = ('h_gdl.txt'
                     if not HISTORICAL_EXPORT else
                     f'h_gdl_{datetime.datetime.now()}.txt')
    exported_path = Path(exported_name)
    exported_path.write_text('')
    hashes_list: List[Tuple[str, str]] = list(map(
        lambda a: (a[1], a[0]),
        map(
            lambda a: a.split('|', 1),
            Path('i_gdl_hashes.txt').read_text().splitlines())))
    hashes_dict: Dict[str, str] = dict(hashes_list)
    r_gdl_p = json.loads(Path('r_gdl_p.json').read_text())
    r_gdl_lp = json.loads(Path('r_gdl_lp.json').read_text())
    i_gdl_lff = json.loads(Path('i_gdl_lff.json').read_text())
    file_sizes = json.loads(Path('i_gdl_fsz.json').read_text())
    general_size = 0
    general_count = 0
    existing_files_queue = multiprocessing.Queue()
    intermediary_queue = multiprocessing.Queue()
    intermediary_queue2 = multiprocessing.Queue()
    remaining_queue = multiprocessing.Queue()
    ffp = ExtensionFilter(EXTENSION_FILTER, existing_files_queue,
                          intermediary_queue)
    nfp = NsfwNessFilter(NSFW_NESS_FILTER, i_gdl_lff, r_gdl_lp, r_gdl_p,
                         intermediary_queue, intermediary_queue2)
    srp = SubredditFilter(SUBREDDIT_FILTER, i_gdl_lff, r_gdl_lp, r_gdl_p,
                          intermediary_queue2, remaining_queue)
    # fep = FileExistsFilter(intermediary_queue, remaining_queue)
    ffp.start()
    nfp.start()
    srp.start()
    # fep.start()
    for file, _ in hashes_list:
        existing_files_queue.put(file)
    existing_files_queue.put(None)
    known_hashes = set()
    with exported_path.open('at') as fd:
        while True:
            next_item: str = remaining_queue.get()
            if next_item is None:
                break
            else:
                hsh = hashes_dict[next_item]
                known_hashes.add(hsh)
                fd.write(f'{hsh}|{next_item}\n')
                general_size += file_sizes.get(next_item, 0)
                general_count += 1
    ffp.join()
    nfp.join()
    srp.join()
    # fep.join()
    existing_files_queue.close()
    existing_files_queue.join_thread()
    intermediary_queue.close()
    intermediary_queue.join_thread()
    intermediary_queue2.close()
    intermediary_queue2.join_thread()
    remaining_queue.close()
    remaining_queue.join_thread()
    print(f'Found {general_count} files')
    print(f'Found {len(known_hashes)} unique hashes')
    print(f'Size: {general_size} bytes ({format_power10(general_size)})')


if __name__ == "__main__":
    HISTORICAL_EXPORT = True
    main()

+333 -104 reddit_imgs/display_fetch_futures.py

@@ -6,13 +6,20 @@ import importlib
import shutil
import time
import traceback
from io import StringIO
from pathlib import Path
from time import sleep
from typing import Dict, List, Union
import colored
from .system.format_file_size import Numeric, format_power2, format_power10