#!/usr/bin/env python3
# -*- encoding: utf-8 -*-
"""Convert hashed sample files to BMP and record their SHA-256, size and dimensions.

Reads the hash -> extension mapping from ``i_he.json``, renders each
``i_h/<hash segments>/sample`` file into ``i_h_n/<hash segments>/aspect.bmp``
and stores the collected metadata in ``i_h_n.json``.
"""

import hashlib
import json
import subprocess
import sys
from collections import Counter
from concurrent.futures import ProcessPoolExecutor as PoolExecutor
from io import BytesIO
from pathlib import Path
from typing import Dict, Union

import PIL.Image

import reddit_imgs.hashit

from .hashit import hash2pathsegments
from .system.table_fmt import table_fmt

POOL_SIZE = 32


def hexhashof(bts: bytes, using) -> str:
    """Return the hexadecimal digest of *bts*.

    Args:
        bts: The data to hash.
        using: A zero-argument hash constructor (e.g. ``hashlib.sha256``)
            whose result offers ``update()`` and ``hexdigest()``.
    """
    digest = using()
    digest.update(bts)
    return digest.hexdigest()


def main() -> None:
    """Normalize every known hash to BMP using a pool of worker processes.

    Waits for the user to press ENTER, loads ``i_he.json`` (running
    ``reddit_imgs.hashit.main()`` first when the file is missing), prints a
    per-extension tally, then submits one :func:`normalize_to_bmp` job per
    hash.  Results are merged into the mapping loaded from ``i_h_n.json``,
    which is rewritten after every 1000 finished jobs and once more at the end.
    """
    print("Caution: you'll need a lot of disk space.")
    input("Press ENTER to continue. ")
    image_hashes_extensions_path = Path('i_he.json')
    if not image_hashes_extensions_path.exists():
        # Presumably hashit.main() generates i_he.json; it is read right after.
        print("Executing prerrequisite...")
        reddit_imgs.hashit.main()
    image_hashes_extensions = json.loads(
        image_hashes_extensions_path.read_text()
    )
    used_extensions = Counter(image_hashes_extensions.values())
    image_hashed_path = Path('i_h')
    image_hashed_normalized_path = Path('i_h_n')
    image_hashed_normalized_path.mkdir(parents=True, exist_ok=True)
    print(table_fmt(
        ['ext', 'hashes'],
        sorted(used_extensions.items(), key=lambda t: (t[1], t[0])) + [
            ('total', sum(used_extensions.values())),
        ],
        'Extensions',
        alignment='^>',
        divide_last_line=True,
    ))
    converted_path = Path('i_h_n.json')
    # Resume from the previous run's results when they exist.
    converted = (
        json.loads(converted_path.read_text())
        if converted_path.exists()
        else dict()
    )

    def save_converted() -> None:
        """Write the current results to ``i_h_n.json``."""
        converted_path.write_text(json.dumps(
            converted,
            indent=1,
        ))

    with PoolExecutor(POOL_SIZE) as pe:
        totalcnt = len(image_hashes_extensions)
        on_finished_count = 0

        def on_finished(job):
            """Store a finished job's result and checkpoint periodically."""
            nonlocal on_finished_count
            on_finished_count += 1
            # result() re-raises the job's exception; concurrent.futures logs
            # exceptions raised by done-callbacks and ignores them, so a
            # failed conversion is reported without aborting the run.
            res = job.result()
            converted[res['hash']] = res
            if on_finished_count % 1000 == 0:
                save_converted()

        for seq, (hsh, ext) in enumerate(image_hashes_extensions.items()):
            job = pe.submit(
                normalize_to_bmp,
                seq,
                totalcnt,
                image_hashed_path.joinpath(
                    *hash2pathsegments(hsh),
                    'sample'
                ),
                image_hashed_normalized_path.joinpath(
                    *hash2pathsegments(hsh),
                ),
                ext,
                hsh,
                converted.get(hsh, dict(hash=hsh)),
            )
            job.add_done_callback(on_finished)
    # Leaving the ``with`` block waits for all jobs; persist the final state.
    save_converted()


def _render_aspect_bmp(pathinfile: Path, pathoutfile: Path, knownext: str) -> None:
    """Render *pathinfile* through ffmpegthumbnailer and save it as *pathoutfile*.

    Raises:
        ValueError: If ffmpegthumbnailer exits with a non-zero status or
            writes nothing to stdout; its stderr is echoed to stdout first.
    """
    # The rendered image is read from the tool's stdout ('-o', '-').
    r = subprocess.run(
        ['ffmpegthumbnailer',
         '-i', str(pathinfile),
         '-t', '10%',
         '-s', '0',
         '-c', 'png',
         '-o', '-',
         ],
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
    )
    if r.returncode or not r.stdout:
        sys.stdout.write(r.stderr.decode(errors='ignore'))
        raise ValueError(f'Conversion failed for {pathinfile!r} [{knownext}]')
    pathoutfile.parent.mkdir(parents=True, exist_ok=True)
    # Context manager so the decoded image is released as soon as it is saved.
    with PIL.Image.open(BytesIO(r.stdout)) as image:
        image.save(pathoutfile)


def normalize_to_bmp(seq: int,
                     total: int,
                     pathinfile: Path,
                     pathoutdir: Path,
                     knownext: str,
                     hsh: str,
                     lastout: Dict[str, Union[str, int]],
                     ) -> Dict[str, Union[str, int]]:
    """Ensure ``aspect.bmp`` exists for one hash and collect its metadata.

    Args:
        seq: Zero-based position of this job, used only for progress output.
        total: Total number of jobs, used only for progress output.
        pathinfile: Source file to convert.
        pathoutdir: Directory receiving ``aspect.bmp`` and ``aspect.bmp.sha256``.
        knownext: Extension recorded for the source; used only in messages.
        hsh: Hash identifying the entry; used only in messages.
        lastout: Previously collected metadata for this hash.

    Returns:
        *lastout*, updated in place with ``aspect`` (SHA-256 hex digest of the
        BMP), ``aspect_size`` (BMP size in bytes) and ``aspect_dimens`` (the
        ``size`` attribute PIL reports for the BMP).  Returned untouched when
        all three keys are already present.

    Raises:
        FileNotFoundError: If *pathinfile* does not exist.
        ValueError: If the conversion to BMP fails.
    """
    # NOTE: a padded-square variant ('square.bmp' with the keys 'square',
    # 'square_size' and 'square_dimens') existed here as commented-out code
    # and is not produced.
    needed_info = {'aspect', 'aspect_size', 'aspect_dimens'}
    if needed_info.issubset(lastout.keys()):
        return lastout
    progress_of = f'{seq + 1:06d} of {total:06d}'
    print(f'{progress_of} - Ensuring that "{knownext}" entry exists: {hsh}')
    if not pathinfile.exists():
        raise FileNotFoundError(pathinfile)
    pathoutfile = pathoutdir.joinpath('aspect.bmp')
    if not pathoutfile.exists():
        print(f'{progress_of} - Converting to BMP keeping aspect')
        _render_aspect_bmp(pathinfile, pathoutfile, knownext)
    print(f'{progress_of} - Extacting BMP data from aspect')
    bts = pathoutfile.read_bytes()
    aspectsha256 = hexhashof(bts, hashlib.sha256)
    pathoutdir.joinpath('aspect.bmp.sha256').write_text(aspectsha256)
    lastout['aspect'] = aspectsha256
    lastout['aspect_size'] = len(bts)
    # Close the image file right after reading its dimensions.
    with PIL.Image.open(pathoutfile) as image:
        lastout['aspect_dimens'] = image.size
    return lastout


if __name__ == "__main__":
    main()