#!/usr/bin/env python3
|
|
# -*- encoding: utf-8 -*-
|
|
|
|
import json
|
|
import subprocess
|
|
from concurrent.futures import ProcessPoolExecutor as PoolExecutor
|
|
from pathlib import Path
|
|
from typing import Dict, Union
|
|
import hashlib
|
|
import sys
|
|
from io import BytesIO
|
|
|
|
import PIL.Image
|
|
|
|
import reddit_imgs.hashit
|
|
|
|
from .hashit import hash2pathsegments
|
|
from .system.table_fmt import table_fmt
|
|
|
|
|
|
POOL_SIZE = 32  # number of worker processes for the conversion ProcessPoolExecutor
|
|
|
|
|
|
def hexhashof(bts: bytes, using) -> str:
    """Return the hexadecimal digest of *bts*.

    *using* is a zero-argument hash constructor such as ``hashlib.sha256``.
    """
    hasher = using()
    hasher.update(bts)
    return hasher.hexdigest()
|
|
|
|
|
|
def main():
    """Normalize every hashed image into a BMP copy under ``i_h_n/``.

    Loads the hash->extension map from ``i_he.json`` (building it first via
    ``reddit_imgs.hashit.main()`` if absent), prints a per-extension summary
    table, then fans the per-image conversion out to a process pool.
    Results accumulate in ``converted`` and are checkpointed to
    ``i_h_n.json`` every 1000 completed jobs and once more at the end.
    """
    print("Caution: you'll need a lot of disk space.")
    input("Press ENTER to continue. ")
    image_hashes_extensions_path = Path('i_he.json')
    # The hash->extension map is a prerequisite; build it if missing.
    if not image_hashes_extensions_path.exists():
        print("Executing prerrequisite...")
        reddit_imgs.hashit.main()
    image_hashes_extensions = json.loads(
        image_hashes_extensions_path.read_text()
    )
    # Tally how many hashes use each extension, for the summary table below.
    used_extensions = {i: 0 for i in set(image_hashes_extensions.values())}
    for e in image_hashes_extensions.values():
        used_extensions[e] += 1
    image_hashed_path = Path('i_h')
    image_hashed_normalized_path = Path('i_h_n')
    image_hashed_normalized_path.mkdir(parents=True, exist_ok=True)
    print(table_fmt(
        ['ext', 'hashes'],
        sorted(used_extensions.items(), key=lambda t: (t[1], t[0]))+[
            ('total', sum(used_extensions.values())),
        ],
        'Extensions',
        alignment='^>',
        divide_last_line=True,
    ))
    # Resume support: reload previously converted entries so their jobs
    # become cheap no-ops inside normalize_to_bmp.
    converted_path = Path('i_h_n.json')
    converted = dict()
    if converted_path.exists():
        converted = json.loads(converted_path.read_text())
    with PoolExecutor(POOL_SIZE) as pe:
        totalcnt = len(image_hashes_extensions)
        on_finished_count = 0

        def on_finished(job):
            # Completion callback (runs in the parent process): merge the
            # worker's result and checkpoint every 1000 completions.
            # NOTE(review): job.result() re-raises worker exceptions here,
            # inside the callback — failures surface via the executor, not
            # the main loop.
            nonlocal on_finished_count
            on_finished_count += 1
            res = job.result()
            converted[res['hash']] = res
            if on_finished_count % 1000 == 0:
                converted_path.write_text(json.dumps(
                    converted,
                    indent=1,
                ))
        for seq, (hsh, ext) in enumerate(image_hashes_extensions.items()):
            job = pe.submit(
                normalize_to_bmp,
                seq,
                totalcnt,
                # Source sample file lives at i_h/<hash segments>/sample.
                image_hashed_path.joinpath(
                    *hash2pathsegments(hsh),
                    'sample'
                ),
                # Output directory mirrors the hash layout under i_h_n/.
                image_hashed_normalized_path.joinpath(
                    *hash2pathsegments(hsh),
                ),
                ext,
                hsh,
                # Pass any previous result so finished work is skipped.
                converted.get(hsh, dict(hash=hsh)),
            )
            job.add_done_callback(on_finished)
            # return
    # Final checkpoint after the pool has drained.
    converted_path.write_text(json.dumps(
        converted,
        indent=1,
    ))
|
|
|
|
|
|
def normalize_to_bmp(seq: int,
                     total: int,
                     pathinfile: Path,
                     pathoutdir: Path,
                     knownext: str,
                     hsh: str,
                     lastout: Dict[str, Union[str, int]],
                     ) -> Dict[str, Union[str, int]]:
    """Ensure one hashed image has an aspect-preserving BMP copy plus metadata.

    Worker function for the process pool in ``main``. If *lastout* already
    carries all required keys, returns it unchanged; otherwise converts
    *pathinfile* to ``aspect.bmp`` under *pathoutdir* (via the external
    ``ffmpegthumbnailer`` tool) and records its sha256, byte size and
    dimensions into *lastout*.

    :param seq: zero-based job index (for progress display only).
    :param total: total job count (for progress display only).
    :param pathinfile: source sample file for the hash.
    :param pathoutdir: destination directory for ``aspect.bmp``.
    :param knownext: file extension recorded for this hash (logging only).
    :param hsh: the image hash (logging only).
    :param lastout: previous result for this hash, or ``dict(hash=hsh)``;
        mutated in place and returned.
    :raises FileNotFoundError: if *pathinfile* is missing.
    :raises ValueError: if the external conversion fails or emits no data.
    """
    needed_info = (
        'aspect', 'aspect_size', 'aspect_dimens',
    )
    if len(set(needed_info).difference(lastout.keys())) > 0:
        progress_of = '%06d of %06d' % (seq+1, total)
        print(f'{progress_of} - Ensuring that "{knownext}" entry exists: {hsh}')
        if not pathinfile.exists():
            raise FileNotFoundError(pathinfile)
        if not (pathoutfile := pathoutdir.joinpath('aspect.bmp')).exists():
            print(f'{progress_of} - Converting to BMP keeping aspect')
            # '-s 0' keeps the original size; PNG goes to stdout and is
            # re-encoded to BMP by Pillow below.
            r = subprocess.run(
                ['ffmpegthumbnailer',
                 '-i', str(pathinfile),
                 '-t', '10%',
                 '-s', '0',
                 '-c', 'png',
                 '-o', '-',
                 ],
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
            )
            # An empty stdout with a zero exit code is still a failure.
            if r.returncode or len(r.stdout) == 0:
                sys.stdout.write(r.stderr.decode(errors='ignore'))
                raise ValueError('Conversion failed for %r [%s]' % (pathinfile, knownext))
            pathoutdir.mkdir(parents=True, exist_ok=True)
            # Context manager closes the decoder promptly instead of
            # leaking the image handle.
            with PIL.Image.open(BytesIO(r.stdout)) as thumb:
                thumb.save(pathoutfile)
        if len(set(['aspect', 'aspect_size', 'aspect_dimens']).difference(lastout.keys())) > 0:
            print(f'{progress_of} - Extacting BMP data from aspect')
            pathoutfile = pathoutdir.joinpath('aspect.bmp')
            bts = pathoutfile.read_bytes()
            aspectsha256 = hexhashof(bts, hashlib.sha256)
            pathoutdir.joinpath('aspect.bmp.sha256').write_text(aspectsha256)
            lastout['aspect'] = aspectsha256
            lastout['aspect_size'] = len(bts)
            # Context manager avoids leaking the lazily-opened file handle.
            with PIL.Image.open(pathoutfile) as img:
                lastout['aspect_dimens'] = img.size
    return lastout
|
|
|
|
|
|
# Allow running this module directly as a script.
if __name__ == "__main__":
    main()
|