reddit-image-wall-getter/reddit_imgs/_hashit.py

#!/usr/bin/env python3
# -*- encoding: utf-8 -*-
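"""Group downloaded files by their SHA-256 hashes.

Reads the download cache (i_c.json) and the per-file hash cache (i_c_h.json),
builds hash-to-files and hash-to-posts indexes, materializes one sharded
directory per hash under i_h/ (holding 'occurrences' symlinks, a 'sample'
symlink and an 'ext' type marker), then persists everything to i_h.json,
i_hc.json, i_hs.json and i_he.json and prints the most repeated hashes.
"""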
import json
import shutil
from pathlib import Path
from typing import List

import filetype

import reddit_imgs.cachedhash
import reddit_imgs.fetch


def hash2pathsegments(sha256: str) -> List[str]:
    """Shard a SHA-256 hex digest into three path segments: 3 + 3 + 58 chars."""
    return [sha256[0:3], sha256[3:6], sha256[6:]]
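
# Illustrative doctest-style sketch (made-up digest, not from any dataset);
# the two 3-hex-digit prefixes cap the fanout at 16**3 = 4096 entries per
# directory level instead of one flat directory holding every hash:
#
#     >>> hash2pathsegments('abc' + 'def' + '0' * 58)
#     ['abc', 'def', '0000000000000000000000000000000000000000000000000000000000']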


def main():
    if not Path("i_c_h.json").exists():
        print("Executing prerequisite...")
        reddit_imgs.cachedhash.main()
    if not Path("i_c.json").exists():
        print("Executing prerequisite...")
        reddit_imgs.fetch.main()
    errorHashesFile = Path('error_hashes.txt')
    errorHashes = list() if not errorHashesFile.exists() else list(
        filter(len, map(str.strip, errorHashesFile.read_text().splitlines())))
    imageCacheSaved = json.loads(Path("i_c.json").read_text())
    imageHashCache = json.loads(Path("i_c_h.json").read_text())
    # Most-downloaded posts first; ties broken by timestamp, then by datakey.
    imageCache = sorted(imageCacheSaved, key=lambda c: (
        -len(c['downloaded']), c['timestamp'], c['datakey']))
    hashCache = dict()       # hash -> posts that contain it
    hashStore = dict()       # hash -> downloaded file paths
    hashExtensions = dict()  # hash -> detected file extension
    if Path('i_hc.json').exists():
        hashCache = json.loads(Path('i_hc.json').read_text())
    if Path('i_hs.json').exists():
        hashStore = json.loads(Path('i_hs.json').read_text())
    if Path('i_he.json').exists():
        hashExtensions = json.loads(Path('i_he.json').read_text())
    # Purge hashes that previous runs flagged as broken.
    for errorHash in errorHashes:
        for d in (hashCache, hashStore, hashExtensions):
            if errorHash in d:
                del d[errorHash]
    imageCacheSize = len(imageCache)
    for seq, metadata in enumerate(imageCache):
        print(f"Handling post {seq+1} of {imageCacheSize}... ", end='')
        postdata = metadata.copy()
        downloaded = metadata['downloaded']
        del postdata['downloaded']
        print(f'{len(downloaded)} files found.')
        if len(downloaded) == 0:
            continue
        postdata['album'] = len(downloaded) > 1
        # A post may carry several files; index each file under its hash.
        for entry in downloaded:
            imagehash = imageHashCache[entry['file']]
            if imagehash in errorHashes:
                continue
            if imagehash not in hashStore:
                hashStore[imagehash] = list()
                hashCache[imagehash] = list()
            if entry['file'] not in hashStore[imagehash]:
                hashStore[imagehash].append(entry['file'])
            if postdata not in hashCache[imagehash]:
                hashCache[imagehash].append(postdata)
    # Hashes seen in more than one file, most repeated first.
    most_repeated_hashes = sorted(
        filter(lambda a: len(a[1]) > 1, hashStore.items()),
        key=lambda a: (-len(a[1]), a[0]))
    Path('most_repeated_hashes.json').write_text(
        json.dumps(most_repeated_hashes, indent=1))
    most_repeated_hashes_exhibition = list(
        map(lambda mrh: (len(mrh[1]), mrh[0]), most_repeated_hashes))
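    # most_repeated_hashes.json is a list of [hash, [file, ...]] pairs, e.g.
    # (hypothetical values):
    #   [["3a7f...", ["i/abc123/0.jpg", "i/def456/0.jpg"]], ...]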
    hashCacheSize = len(hashCache)
    for seq, sha256 in enumerate(hashCache.keys()):
        print(f"Hash {seq+1} of {hashCacheSize}...")
        if hashExtensions.get(sha256, None) is None:
            hash_path = Path('i_h', *hash2pathsegments(sha256))
            hash_occur_path = hash_path.joinpath('occurrences')
            if not hash_occur_path.exists():
                hash_occur_path.mkdir(parents=True, exist_ok=True)
            # Link every occurrence of this hash back to its downloaded file.
            for path in hashStore[sha256]:
                reffn = path.split('/', 1)[1].replace('/', '_')[-255:]
                reffp = hash_occur_path.joinpath(reffn)
                if not reffp.exists():
                    reffp.symlink_to('../../../../../' + path)
            hash_sample_path = hash_path.joinpath('sample')
            if not hash_sample_path.exists():
                if not hash_sample_path.is_symlink():
                    hash_sample_path.symlink_to(
                        '../../../../' + hashStore[sha256][0])
                else:  # symlink is there, but pointing to a broken location
                    shutil.rmtree(hash_path)
                    for hashed_instance in hashStore[sha256]:
                        shutil.rmtree(Path(hashed_instance).parent)
                    for k in ['i_c.json', 'i_c_h.json', 'i_h.json',
                              'i_hc.json', 'i_hs.json', 'i_he.json']:
                        if (fl := Path(k)).exists():
                            fl.unlink()
                    raise Exception(
                        'Cannot process broken path.\n' +
                        'Re-run pipeline since "fetch".\n' +
                        f'{hash_path}\n' +
                        "\n".join(f" - {Path(k).parent}"
                                  for k in hashStore[sha256]))
            hash_ext_path = hash_path.joinpath('ext')
            if not hash_ext_path.exists():
                # Sniff the type from the sample's magic bytes; filetype only
                # needs the first 261 bytes, passed as a bytes object.
                with hash_sample_path.open('rb') as f:
                    ext = filetype.guess_extension(f.read(261))
                ext = ext if ext else 'unk'
                hash_ext_path.write_text(ext)
                hashExtensions[sha256] = ext
            elif hashExtensions.get(sha256, None) is None:
                hashExtensions[sha256] = hash_ext_path.read_text()
    Path('i_h.json').write_text(json.dumps(dict(
        posts=hashCache, files=hashStore, extensions=hashExtensions),
        indent=1))
    Path('i_hc.json').write_text(json.dumps(hashCache, indent=1))
    Path('i_hs.json').write_text(json.dumps(hashStore, indent=1))
    Path('i_he.json').write_text(json.dumps(hashExtensions, indent=1))
    # Summary table, printed least-repeated-first so the top hits land next
    # to the shell prompt.
    table_rule = '+------+' + '-' * 66 + '+' + '-' * 7 + '+'
    table_head = '| Hits |' + ' ' * 31 + 'Hash' + ' ' * 31 + '|  ext  |'
    print(table_rule)
    print(table_head)
    print(table_rule)
    for mrhe in reversed(most_repeated_hashes_exhibition):
        print('| {0:>4} | {1:64} | {2:^5} |'.format(*mrhe, hashExtensions[mrhe[1]]))
    print(table_rule)
    print(table_head)
    print(table_rule)
    print("Check 'most_repeated_hashes.json' for more details.")


if __name__ == "__main__":
    main()