#!/usr/bin/env python3
# -*- encoding: utf-8 -*-
"""Groups downloaded files by content hash (SHA-256), builds a symlink
index under i_h/ with one directory per hash, detects each file's
extension from its magic bytes, and reports the most repeated hashes."""

import json
import shutil
from pathlib import Path
from typing import List

import filetype

import reddit_imgs.cachedhash
import reddit_imgs.fetch


def hash2pathsegments(sha256: str) -> List[str]:
    # Split a 64-char hex digest into 3/3/58 path segments,
    # e.g. 'abcdef12...' -> ['abc', 'def', '12...'],
    # so no single directory accumulates too many entries.
    return [sha256[0:3], sha256[3:6], sha256[6:]]


def main():
    # Run the prerequisite pipeline stages if their outputs are missing.
    if not Path("i_c_h.json").exists():
        print("Executing prerequisite...")
        reddit_imgs.cachedhash.main()
    if not Path("i_c.json").exists():
        print("Executing prerequisite...")
        reddit_imgs.fetch.main()

    # Hashes known to be bad; their cache entries are dropped below.
    errorHashesFile = Path('error_hashes.txt')
    errorHashes = (list()
                   if not errorHashesFile.exists() else
                   list(filter(len, map(str.strip,
                                        errorHashesFile.read_text().splitlines()))))

    imageCacheSaved = json.loads(Path("i_c.json").read_text())
    imageHashCache = json.loads(Path("i_c_h.json").read_text())
    # Posts with the most downloaded files first; ties broken by age and key.
    imageCache = sorted(
        imageCacheSaved,
        key=lambda c: (-len(c['downloaded']), c['timestamp'], c['datakey']))

    # Resume from previous runs when the intermediate files are present.
    hashCache = dict()       # hash -> list of post metadata dicts
    hashStore = dict()       # hash -> list of downloaded file paths
    hashExtensions = dict()  # hash -> detected file extension
    if Path('i_hc.json').exists():
        hashCache = json.loads(Path('i_hc.json').read_text())
    if Path('i_hs.json').exists():
        hashStore = json.loads(Path('i_hs.json').read_text())
    if Path('i_he.json').exists():
        hashExtensions = json.loads(Path('i_he.json').read_text())
    for errorHash in errorHashes:
        for d in (hashCache, hashStore, hashExtensions):
            if errorHash in d:
                del d[errorHash]

    # Group every downloaded file and its post metadata under its hash.
    imageCacheSize = len(imageCache)
    for seq, metadata in enumerate(imageCache):
        print(f"Handling post {seq+1} of {imageCacheSize}... ", end='')
        postdata = metadata.copy()
        downloaded = metadata['downloaded']
        del postdata['downloaded']
        print(f'{len(downloaded)} files found.')
        if len(downloaded) == 0:
            continue
        postdata['album'] = len(downloaded) > 1
        for entry in downloaded:
            imagehash = imageHashCache[entry['file']]
            if imagehash in errorHashes:
                continue
            if imagehash not in hashStore:
                hashStore[imagehash] = list()
                hashCache[imagehash] = list()
            if entry['file'] not in hashStore[imagehash]:
                hashStore[imagehash].append(entry['file'])
            if postdata not in hashCache[imagehash]:
                hashCache[imagehash].append(postdata)

    # Hashes that occur in more than one file, most repeated first.
    most_repeated_hashes = sorted(
        filter(lambda a: len(a[1]) > 1, hashStore.items()),
        key=lambda a: (-len(a[1]), a[0]))
    Path('most_repeated_hashes.json').write_text(
        json.dumps(most_repeated_hashes, indent=1))
    most_repeated_hashes_exhibition = list(
        map(lambda mrh: (len(mrh[1]), mrh[0]), most_repeated_hashes))

    # Build the on-disk hash index: i_h/<3>/<3>/<58>/{occurrences,sample,ext}
    hashCacheSize = len(hashCache)
    for seq, sha256 in enumerate(hashCache.keys()):
        print(f"Hash {seq+1} of {hashCacheSize}...")
        if hashExtensions.get(sha256, None) is None:
            hash_path = Path('i_h', *hash2pathsegments(sha256))
            hash_occur_path = hash_path.joinpath('occurrences')
            if not hash_occur_path.exists():
                hash_occur_path.mkdir(parents=True, exist_ok=True)
            # One symlink per occurrence, named after the flattened source
            # path (capped at 255 chars to satisfy filename length limits).
            for path in hashStore[sha256]:
                reffn = path.split('/', 1)[1].replace('/', '_')[-255:]
                reffp = hash_occur_path.joinpath(reffn)
                if not reffp.exists():
                    reffp.symlink_to('../../../../../' + path)
            hash_sample_path = hash_path.joinpath('sample')
            if not hash_sample_path.exists():
                if not hash_sample_path.is_symlink():
                    hash_sample_path.symlink_to(
                        '../../../../' + hashStore[sha256][0])
                else:
                    # The symlink exists but points to a broken location:
                    # wipe everything derived from it and force a re-fetch.
                    shutil.rmtree(hash_path)
                    for hashed_instance in hashStore[sha256]:
                        shutil.rmtree(Path(hashed_instance).parent)
                    for k in ['i_c.json', 'i_c_h.json', 'i_h.json',
                              'i_hc.json', 'i_hs.json', 'i_he.json']:
                        if (fl := Path(k)).exists():
                            fl.unlink()
                    raise Exception(
                        'Cannot process broken path.\n' +
                        'Re-run the pipeline from "fetch".\n' +
                        f'{hash_path}\n' +
                        "\n".join([f" - {Path(k).parent}"
                                   for k in hashStore[sha256]]))
            hash_ext_path = hash_path.joinpath('ext')
            if not hash_ext_path.exists():
                # Sniff the extension from the file's magic bytes; passing
                # raw bytes instead of the file object keeps this working
                # across `filetype` versions (file-object support is newer).
                with hash_sample_path.open('rb') as f:
                    ext = filetype.guess_extension(f.read(8192))
                ext = ext if ext else 'unk'
                hash_ext_path.write_text(ext)
                hashExtensions[sha256] = ext
            else:
                # Extension was detected on a previous run; reload it.
                hashExtensions[sha256] = hash_ext_path.read_text()

    # Persist the combined index and the per-map intermediates.
    Path('i_h.json').write_text(json.dumps(dict(
        posts=hashCache,
        files=hashStore,
        extensions=hashExtensions,
    ), indent=1))
    Path('i_hc.json').write_text(json.dumps(hashCache, indent=1))
    Path('i_hs.json').write_text(json.dumps(hashStore, indent=1))
    Path('i_he.json').write_text(json.dumps(hashExtensions, indent=1))

    # Print the duplicate summary, least repeated first.
    divider = '+------+' + '-'*66 + '+' + '-'*7 + '+'
    header = '| Hits |' + ' '*31 + 'Hash' + ' '*31 + '|  ext  |'
    print(divider)
    print(header)
    print(divider)
    for mrhe in reversed(most_repeated_hashes_exhibition):
        print('| {0:>4} | {1:64} | {2:^5} |'.format(
            *mrhe, hashExtensions[mrhe[1]]))
    print(divider)
    print(header)
    print(divider)
    print("Check 'most_repeated_hashes.json' for more details.")


if __name__ == "__main__":
    main()
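
# Resulting on-disk layout per hash, sketched from the logic above (the
# digest 'abcdef...' is illustrative; paths are relative to the working
# directory shared with the "fetch" and "cachedhash" stages):
#
#   i_h/abc/def/<remaining 58 hex chars>/
#       occurrences/<flattened source paths>   symlinks back to each download
#       sample                                 symlink to the first download
#       ext                                    cached detected extension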