reddit-image-wall-getter/reddit_imgs/fetch.py

#!/usr/bin/env python3
# -*- encoding: utf-8 -*-
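"""Fetch step: download every link gathered under r/<subreddit>/subreddit.json.

Links are grouped by domain and handed to the matching downloader from
.system.downloader; domains without a downloader are tallied and reported
at the end. Files land in i/<datakey>/ and are deduplicated through the
i_c/ cache (see the layout notes above replicate_from_cache).

Assumes the working directory holds the r/ tree and that the script is run
as a module (e.g. python3 -m reddit_imgs.fetch) so the relative import resolves.
"""
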
from .system import downloader as downloaderModule
from pathlib import Path
import json
import os
import hashlib
downloaderGetter = downloaderModule.getDownloader
wdir = os.path.abspath('.')


def _isImageDirectLink(s):
    return s.endswith('.jpg') or s.endswith('.png') or s.endswith('.gif') or s.endswith('.webp') or s.endswith('.mp4')


def isImageDirectLink(s):
    # Also match direct links that carry a query string,
    # e.g. 'https://example.com/pic.jpg?width=640'.
    return _isImageDirectLink(s) or _isImageDirectLink(s.split('?', 1)[0])


def main():
    links = list()
    subreddits = sorted(filter(lambda sr: os.path.isdir(os.path.join(wdir, 'r', sr)),
                               os.listdir(os.path.join(wdir, 'r'))))
    for subreddit in subreddits:
        srf = os.path.abspath(os.path.join(wdir, 'r', subreddit, 'subreddit.json'))
        links2 = list()
        try:
            with open(srf) as f:
                links2 = json.loads(f.read())['links']
        except (OSError, ValueError, KeyError):
            # Subreddits without a readable subreddit.json contribute no links.
            pass
        links += links2
        del links2
        del srf
        del subreddit
    del subreddits
    links.sort(key=lambda link: link['timestamp'])
    medias = dict((('direct_link', list()),))
    for link in links:
        if isImageDirectLink(link['link']):
            medias['direct_link'].append(link)
            continue
        if link['domain'] not in medias:
            medias[link['domain']] = list()
        medias[link['domain']].append(link)
        del link
    del links
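
    # First pass: drop every domain that has no downloader, keeping a
    # (hit count, domain) tally so the biggest gaps can be reported at the end.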
    priorities = list()
    for source, links in sorted(medias.items()):
        downloaderClass = downloaderGetter(source)
        if downloaderClass is None:
            print('No downloader for: {0:<35} | {1:>5} links dropped'.format(source, len(links)))
            priorities.append((len(links), source))
            del medias[source]
            continue
    priorities.sort(reverse=True)
    top_priorities = priorities[:10]
    prioremain = sum(count for count, _ in priorities[10:])
    priolen = len(priorities)
    del priorities
    ignorelist = list()
    if os.path.isfile(os.path.join(wdir, 'ignored.txt')):
        with open(os.path.join(wdir, 'ignored.txt')) as f:
            ignorelist = f.read().splitlines()
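
    # Second pass: one downloader instance per remaining domain; links whose
    # exact URL appears in ignored.txt (one URL per line) are skipped.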
    for source, links in sorted(medias.items()):
        print('Changing downloader for next %d links on %s' % (len(links), source))
        # if not source.endswith('.deviantart.com'): continue
        # if source != 'i.reddituploads.com': continue
        downloaderClass = downloaderGetter(source)
        downloader = downloaderClass()
        for seq, link in enumerate(links):
            prnmsg = 'Downloading link #%05d of %05d: %s << %s' % (seq + 1, len(links), link['link'], link['datakey'])
            if link['link'] in ignorelist:
                print('Skipping' + prnmsg[11:])
                continue
            else:
                print(prnmsg)
            if not downloader.recognizes(link['link']):
                continue
            target = os.path.join(wdir, 'i', link['datakey'])
            if not os.path.exists(target):
                # Prefer replaying the i_c/ cache; only hit the network when
                # the cache misses, then push the fresh download into it.
                if not replicate_from_cache(target, link['link']):
                    downloader.download(link['link']).into(target)
                    fix_cache(target, link['link'])
    print()
    print('=' * 47)
    print('| {0:^43} |'.format('Missing downloaders'))
    print('=' * 47)
    print('| {0:^30} | {1:^10} |'.format('Domain', 'Hits'))
    print('-' * 47)
    for priority in top_priorities:
        print('| {0:^30} | {1:^10} |'.format(*list(reversed(priority))))
        del priority
    del top_priorities
    if priolen > 10:
        print('|' + '.' * 32 + '|' + '.' * 12 + '|')
        print('| {0:^30} | {1:^10} |'.format('...and %d more domains' % (priolen - 10), prioremain))
    del priolen
    print('=' * 47)
    print()
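

# On-disk layout shared by the cache helpers below:
#   i/<datakey>/meta.json                        list of {'dname', 'ext', 'link'}
#                                                entries, one per file (0000.<ext>, ...)
#   i_c/<link without scheme>/_meta.json         either
#       {'type': 'file', 'link': ..., 'ext': ..., 'disk': 'file.<ext>'}   or
#       {'type': 'album', 'link': [<member links>, ...]}
#   i_c/<link without scheme>/file.<ext>         the cached bytes
#   i_c/<link without scheme>/file.<ext>.sha256  hex digest of those bytes
# Path components are truncated to 255 characters (presumably to respect
# filesystem name limits) and files under i/ become symlinks into i_c/.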
def replicate_from_cache(download_target, link):
    """Recreate i/<datakey>/ from the i_c/ cache; return False on a cache miss."""
    if link.startswith('/r/'):
        link = 'https://www.reddit.com' + link
    download_path = Path(download_target)
    target = Path('i_c').joinpath(link.split('://', 1)[1])
    target = Path(*[part[:255] for part in target.parts])
    target_meta = target.joinpath('_meta.json')
    if not target_meta.exists():
        return False
    target_metajson = json.loads(target_meta.read_text())
    if target_metajson['type'] == 'file':
        download_path.mkdir(parents=True, exist_ok=True)
        ext = target_metajson['ext']
        ffl = f"0000.{ext}"
        target_file = target.joinpath(target_metajson['disk'])
        download_path.joinpath(ffl).symlink_to(f'../../{str(target_file)}')
        download_path.joinpath('meta.json').write_text(json.dumps([{
            'dname': ffl,
            'ext': ext,
            'link': link,
        }], sort_keys=True, indent=2))
    elif target_metajson['type'] == 'album':
        download_path.mkdir(parents=True, exist_ok=True)
        files = list()
        for i, lnk in enumerate(target_metajson['link']):
            fltarget = Path('i_c').joinpath(lnk.split('://', 1)[1])
            fltarget = Path(*[part[:255] for part in fltarget.parts])
            fltarget_meta = fltarget.joinpath('_meta.json')
            fltarget_metajson = json.loads(fltarget_meta.read_text())
            ext = fltarget_metajson['ext']
            ffl = '%04d.%s' % (i, ext)
            fltarget_file = fltarget.joinpath(fltarget_metajson['disk'])
            download_path.joinpath(ffl).symlink_to(f'../../{str(fltarget_file)}')
            files.append({
                'dname': ffl,
                'ext': ext,
                'link': lnk,
            })
        download_path.joinpath('meta.json').write_text(json.dumps(files, sort_keys=True, indent=2))
    else:
        raise ValueError("type field cannot be %r" % target_metajson['type'])
    return True


def fix_cache(download_target, link):
    """Copy a finished download from i/<datakey>/ into the i_c/ cache."""
    if link.startswith('/r/'):
        link = 'https://www.reddit.com' + link
    download_path = Path(download_target)
    download_meta = download_path.joinpath('meta.json')
    downloads = json.loads(download_meta.read_text())
    target = Path('i_c').joinpath(link.split('://', 1)[1])
    target = Path(*[part[:255] for part in target.parts])
    target.mkdir(parents=True, exist_ok=True)
    protocolless_link = link.split('://', 1)[1]
    if len(downloads) == 1 and downloads[0]['link'].split('://', 1)[1] == protocolless_link:
        # A single file downloaded from the requested link itself.
        fix_cache_relocate_single_file_from_download(download_path, downloads[0], target)
    else:
        # An album: record the member links, then cache each member under
        # its own i_c/ entry.
        target_meta = target.joinpath('_meta.json')
        if not target_meta.exists():
            target_meta.write_text(json.dumps({
                'type': 'album',
                'link': [download['link'] for download in downloads],
            }, sort_keys=True, indent=2))
        for download in downloads:
            fix_cache_relocate_single_file_from_download(
                download_path,
                download,
                Path('i_c').joinpath(download['link'].split('://', 1)[1])
            )


def fix_cache_relocate_single_file_from_download(download_path, download, target):
    """Move one downloaded file into i_c/, leave a symlink behind, and hash it."""
    target = Path(*[part[:255] for part in target.parts])
    target_meta = target.joinpath('_meta.json')
    ext = download['ext']
    target_file = target.joinpath(f"file.{ext}")
    target_hashfile = target.joinpath(f"file.{ext}.sha256")
    downloaded_file = download_path.joinpath(download['dname'])
    if not downloaded_file.is_symlink():
        # Not cached yet: write the metadata, copy the bytes into the cache,
        # and replace the original file with a symlink pointing at it.
        target_meta.parent.mkdir(parents=True, exist_ok=True)
        target_meta.write_text(json.dumps({
            'type': 'file',
            'link': download['link'].split('://', 1)[1],
            'ext': ext,
            'disk': target_file.name,
        }, sort_keys=True, indent=2))
        target_file.write_bytes(downloaded_file.read_bytes())
        if target_hashfile.exists():
            target_hashfile.unlink()
        downloaded_file.unlink()
        downloaded_file.symlink_to(f'../../{str(target_file)}')
    if not target_hashfile.exists():
        m = hashlib.sha256()
        m.update(target_file.read_bytes())
        target_hashfile.write_text(m.hexdigest())


if __name__ == '__main__':
    main()