2020-05-13 21:07:05 +00:00
|
|
|
#!/usr/bin/env python3
|
|
|
|
# -*- encoding: utf-8 -*-
|
|
|
|
|
2020-06-01 03:20:23 +00:00
|
|
|
import json
|
|
|
|
import shutil
|
2020-05-13 21:07:05 +00:00
|
|
|
from pathlib import Path
|
2020-06-01 03:20:23 +00:00
|
|
|
from typing import List
|
2020-05-13 21:07:05 +00:00
|
|
|
|
|
|
|
import filetype
|
2020-06-01 03:20:23 +00:00
|
|
|
|
2020-05-13 21:07:05 +00:00
|
|
|
import reddit_imgs.cachedhash
|
2020-06-01 03:20:23 +00:00
|
|
|
import reddit_imgs.fetch
|
|
|
|
|
|
|
|
|
|
|
|
def hash2pathsegments(sha256: str) -> List[str]:
    """Split a SHA-256 hex digest into three path components.

    The layout is 3 chars / 3 chars / remainder, used to shard the
    per-hash directories so no single directory grows too large.
    """
    head, tail = sha256[:3], sha256[3:]
    return [head, tail[:3], tail[3:]]
|
2020-05-13 21:07:05 +00:00
|
|
|
|
|
|
|
|
|
|
|
def main() -> None:
    """Index downloaded images by their SHA-256 content hash.

    Reads the caches produced by earlier pipeline stages
    (``i_c.json`` from ``reddit_imgs.fetch``, ``i_c_h.json`` from
    ``reddit_imgs.cachedhash``), builds hash->posts, hash->files and
    hash->extension mappings, materializes a symlink farm under
    ``i_h/`` keyed by hash, and prints a report of the most repeated
    hashes.  All state is persisted as JSON files in the current
    working directory.

    Raises:
        Exception: when a previously created ``sample`` symlink is
            broken; the inconsistent state is deleted first so the
            pipeline can be re-run from ``fetch``.
    """
    # Run the prerequisite stages if their output files are missing.
    if not Path("i_c_h.json").exists():
        print("Executing prerrequisite...")
        reddit_imgs.cachedhash.main()
    if not Path("i_c.json").exists():
        print("Executing prerrequisite...")
        reddit_imgs.fetch.main()
    # Optional manual blacklist: one hash per line, blanks ignored.
    errorHashesFile = Path('error_hashes.txt')
    errorHashes = list() if not errorHashesFile.exists() else list(filter(len, map(str.strip, errorHashesFile.read_text().splitlines())))
    # i_c.json: list of post dicts, each carrying a 'downloaded' file list
    # (presumably produced by fetch — schema inferred from usage below).
    # i_c_h.json: maps a downloaded file path -> its sha256 hex digest.
    imageCacheSaved = json.loads(Path("i_c.json").read_text())
    imageHashCache = json.loads(Path("i_c_h.json").read_text())
    # Process posts with the most files first, then by timestamp/datakey.
    imageCache = sorted(imageCacheSaved, key=lambda c: (-len(c['downloaded']), c['timestamp'], c['datakey']))
    hashCache = dict()       # sha256 -> list of post metadata dicts
    hashStore = dict()       # sha256 -> list of file paths with that content
    hashExtensions = dict()  # sha256 -> guessed file extension
    # Resume from a previous partial run when its caches exist.
    if Path('i_hc.json').exists():
        hashCache = json.loads(Path('i_hc.json').read_text())
    if Path('i_hs.json').exists():
        hashStore = json.loads(Path('i_hs.json').read_text())
    if Path('i_he.json').exists():
        hashExtensions = json.loads(Path('i_he.json').read_text())
    # Purge blacklisted hashes from any previously saved state.
    for errorHash in errorHashes:
        for d in (hashCache, hashStore, hashExtensions):
            if errorHash in d:
                del d[errorHash]
    imageCacheSize = len(imageCache)
    for seq, metadata in enumerate(imageCache):
        print(f"Handling post {seq+1} of {imageCacheSize}... ", end='')
        # postdata is the post's metadata minus the file list itself.
        postdata = metadata.copy()
        downloaded = metadata['downloaded']
        del postdata['downloaded']
        print(f'{len(downloaded)} files found.')
        if len(metadata['downloaded']) == 0:
            continue
        postdata['album'] = len(metadata['downloaded']) > 1
        for entry in downloaded:
            imagehash = imageHashCache[entry['file']]
            if imagehash in errorHashes:
                continue
            # First sighting of this hash: create both parallel lists.
            if imagehash not in hashStore:
                hashStore[imagehash] = list()
                hashCache[imagehash] = list()
            # Deduplicate files and posts per hash.
            if entry['file'] not in hashStore[imagehash]:
                hashStore[imagehash].append(entry['file'])
            if postdata not in hashCache[imagehash]:
                hashCache[imagehash].append(postdata)
    # Hashes appearing in more than one file, most repeated first,
    # ties broken by hash for a deterministic order.
    most_repeated_hashes = sorted(list(filter(lambda a: len(a[1])>1, hashStore.items())), key=lambda a:(-len(a[1]), a[0]))
    Path('most_repeated_hashes.json').write_text(json.dumps(most_repeated_hashes, indent=1))
    # (occurrence_count, sha256) pairs for the report table below.
    most_repeated_hashes_exhibition = list(map(lambda mrh: (len(mrh[1]), mrh[0]), most_repeated_hashes))
    hashCacheSize = len(hashCache)
    for seq, sha256 in enumerate(hashCache.keys()):
        print(f"Hash {seq+1} of {hashCacheSize}...")
        # Only hashes whose extension is still unknown need disk work;
        # a known extension implies the i_h tree was built previously.
        if hashExtensions.get(sha256, None) is None:
            # i_h/<3>/<3>/<rest>/ is this hash's directory.
            hash_path = Path('i_h', *hash2pathsegments(sha256))
            hash_occur_path = hash_path.joinpath('occurrences')
            if not hash_occur_path.exists():
                hash_occur_path.mkdir(parents=True, exist_ok=True)
            # One symlink per file that had this content.  The link name
            # is the file path past its first component, flattened with
            # '_' and truncated to the last 255 chars (filename limit).
            for path in hashStore[sha256]:
                reffn = path.split('/', 1)[1].replace('/', '_')[-255:]
                reffp = hash_occur_path.joinpath(reffn)
                if not reffp.exists():
                    # Relative target: five levels up escapes
                    # i_h/<3>/<3>/<rest>/occurrences back to the CWD.
                    reffp.symlink_to('../../../../../'+path)
            # 'sample' points at the first file with this content.
            hash_sample_path = hash_path.joinpath('sample')
            if not hash_sample_path.exists():
                if not hash_sample_path.is_symlink():
                    # Four levels up escapes i_h/<3>/<3>/<rest>/.
                    hash_sample_path.symlink_to('../../../../'+hashStore[sha256][0])
                else: # symlink is there, but pointing to a broken location
                    # Downloaded data is inconsistent: remove this hash's
                    # tree, the download directories that referenced it,
                    # and every derived cache file, then abort so the
                    # pipeline gets re-run from "fetch".
                    shutil.rmtree(hash_path)
                    for hashed_instance in hashStore[sha256]:
                        shutil.rmtree(Path(hashed_instance).parent)
                    for k in ['i_c.json', 'i_c_h.json', 'i_h.json', 'i_hc.json', 'i_hs.json', 'i_he.json']:
                        if (fl := Path(k)).exists():
                            fl.unlink()
                    raise Exception(
                        'Cannot proccess broken path.\n' +
                        'Re-run pipeline since "fetch".\n' +
                        f'{hash_path}\n' +
                        "\n".join([f" - {Path(k).parent}" for k in hashStore[sha256]])
                    )
            # 'ext' caches the extension guessed from the sample's bytes.
            hash_ext_path = hash_path.joinpath('ext')
            if not hash_ext_path.exists():
                # NOTE(review): a file object is passed to
                # filetype.guess_extension; confirm the installed
                # filetype version accepts file-like input.
                with hash_sample_path.open('rb') as f:
                    ext = filetype.guess_extension(f)
                ext = ext if ext else 'unk'
                hash_ext_path.write_text(ext)
                hashExtensions[sha256] = ext
            elif hashExtensions.get(sha256, None) is None:
                hashExtensions[sha256] = hash_ext_path.read_text()
    # Persist the combined mapping and the three individual caches.
    Path('i_h.json').write_text(json.dumps(dict(posts=hashCache, files=hashStore, extensions=hashExtensions), indent=1))
    Path('i_hc.json').write_text(json.dumps(hashCache, indent=1))
    Path('i_hs.json').write_text(json.dumps(hashStore, indent=1))
    Path('i_he.json').write_text(json.dumps(hashExtensions, indent=1))
    # ASCII report table; reversed so the most repeated hash prints last
    # (closest to the shell prompt).
    print('+------+'+'-'*66+'+'+'-'*7+'+')
    print('| Hits |'+' '*31+'Hash'+' '*31+'| ext |')
    print('+------+'+'-'*66+'+'+'-'*7+'+')
    for mrhe in reversed(most_repeated_hashes_exhibition):
        print('| {0:>4} | {1:64} | {2:^5} |'.format(*mrhe, hashExtensions[mrhe[1]]))
    print('+------+'+'-'*66+'+'+'-'*7+'+')
    print('| Hits |'+' '*31+'Hash'+' '*31+'| ext |')
    print('+------+'+'-'*66+'+'+'-'*7+'+')
    print("Check 'most_repeated_hashes.json' for more details.")
|
|
|
|
|
|
|
|
|
|
|
|
# Allow running this pipeline stage standalone as a script.
if __name__ == "__main__":
    main()
|