#!/usr/bin/env python3
# -*- encoding: utf-8 -*-
"""Download every link collected under r/<subreddit>/subreddit.json.

Links are grouped by source domain, handed to the matching downloader from
.system.downloader, and stored under i/<datakey>/.  Downloaded files are
deduplicated through the i_c/ content cache (see replicate_from_cache and
fix_cache below).
"""
from .system import downloader as downloaderModule
from pathlib import Path
import json
import os
import hashlib

downloaderGetter = downloaderModule.getDownloader

wdir = os.path.abspath('.')


def _isImageDirectLink(s):
    # Direct media links (video included) need no site-specific downloader.
    return s.endswith('.jpg') or s.endswith('.png') or s.endswith('.gif') or s.endswith('.webp') or s.endswith('.mp4')


def isImageDirectLink(s):
    # Also accept direct links that carry a query string.
    return _isImageDirectLink(s) or _isImageDirectLink(s.split('?', 1)[0])


def main():
    # Collect the link lists of every subreddit fetched so far.
    links = list()
    subreddits = sorted(filter(
        lambda sr: os.path.isdir(os.path.join(wdir, 'r', sr)),
        os.listdir(os.path.join(wdir, 'r'))
    ))
    for subreddit in subreddits:
        srf = os.path.abspath(os.path.join(wdir, 'r', subreddit, 'subreddit.json'))
        links2 = list()
        try:
            with open(srf) as f:
                links2 = json.loads(f.read())['links']
        except (OSError, ValueError, KeyError):
            # subreddit.json missing or malformed; skip this subreddit.
            pass
        links += links2
        del links2
        del srf
    del subreddit
    del subreddits
    links.sort(key=lambda link: link['timestamp'])

    # Group links by source domain; direct media links share one bucket.
    medias = {'direct_link': list()}
    for link in links:
        if isImageDirectLink(link['link']):
            medias['direct_link'].append(link)
            continue
        if link['domain'] not in medias:
            medias[link['domain']] = list()
        medias[link['domain']].append(link)
    del link
    del links

    # Drop domains we have no downloader for, keeping a tally for the summary.
    priorities = list()
    for source, links in sorted(medias.items()):
        downloaderClass = downloaderGetter(source)
        if downloaderClass is None:
            print('No downloader for: {0:<35} | {1:>5} links dropped'.format(source, len(links)))
            priorities.append((len(links), source))
            del medias[source]
            continue
    top_priorities = list(reversed(sorted(priorities)))[:10]
    prioremain = sum(map(lambda a: a[0], list(reversed(sorted(priorities)))[10:]))
    priolen = len(priorities)
    del priorities

    # Links listed in ignored.txt (one URL per line) are skipped entirely.
    ignorelist = list()
    if os.path.isfile(os.path.join(wdir, 'ignored.txt')):
        with open(os.path.join(wdir, 'ignored.txt')) as f:
            ignorelist = f.read().splitlines()

    for source, links in sorted(medias.items()):
        print('Changing downloader for next %d links on %s' % (len(links), source))
        # if not source.endswith('.deviantart.com'): continue
        # if source != 'i.reddituploads.com': continue
        downloaderClass = downloaderGetter(source)
        downloader = downloaderClass()
        for seq, link in enumerate(links):
            prnmsg = 'Downloading link #%05d of %05d: %s << %s' % (seq + 1, len(links), link['link'], link['datakey'])
            if link['link'] in ignorelist:
                # Reuse the tail of the message ("link #... << ...") for the skip notice.
                print('Skipping' + prnmsg[11:])
                continue
            else:
                print(prnmsg)
            if not downloader.recognizes(link['link']):
                continue
            target = os.path.join(wdir, 'i', link['datakey'])
            if not os.path.exists(target):
                # Prefer re-linking from the content cache; only hit the network
                # on a cache miss, then register the fresh download in the cache.
                if not replicate_from_cache(target, link['link']):
                    downloader.download(link['link']).into(target)
                    fix_cache(target, link['link'])

    # Summary table of the domains that still lack a downloader.
    print()
    print('=' * 47)
    print('| {0:^43} |'.format('Missing downloaders'))
    print('=' * 47)
    print('| {0:^30} | {1:^10} |'.format('Domain', 'Hits'))
    print('-' * 47)
    for priority in top_priorities:
        print('| {0:^30} | {1:^10} |'.format(*list(reversed(priority))))
    del priority
    del top_priorities
    if priolen > 10:
        print('|' + '.' * 32 + '|' + '.' * 12 + '|')
        print('| {0:^30} | {1:^10} |'.format('...and %d more domains' % (priolen - 10), prioremain))
    del priolen
    print('=' * 47)
    print()


def replicate_from_cache(download_target, link):
    """Recreate download_target from the i_c/ cache; return False on a cache miss."""
    if link.startswith('/r/'):
        link = 'https://www.reddit.com' + link
    download_path = Path(download_target)
    download_meta = download_path.joinpath('meta.json')
    # Cache entries live under i_c/<protocol-less URL>, with each path part
    # capped at 255 characters to stay within filesystem name limits.
    target = Path('i_c').joinpath(link.split('://', 1)[1])
    target = Path(*[part[:255] for part in target.parts])
    target_meta = target.joinpath('_meta.json')
    if not target_meta.exists():
        return False
    else:
        target_metajson = json.loads(target_meta.read_text())
        if target_metajson['type'] == 'file':
            # Single cached file: symlink it in as 0000.<ext> and write meta.json.
            download_path.mkdir(parents=True, exist_ok=True)
            ext = target_metajson['ext']
            ffl = f"0000.{ext}"
            target_file = target.joinpath(target_metajson['disk'])
            download_path.joinpath(ffl).symlink_to(f'../../{str(target_file)}')
            download_path.joinpath('meta.json').write_text(json.dumps([{
                'dname': ffl,
                'ext': ext,
                'link': link,
            }], sort_keys=True, indent=2))
        elif target_metajson['type'] == 'album':
            # Album: symlink every cached member file in order as NNNN.<ext>.
            download_path.mkdir(parents=True, exist_ok=True)
            files = list()
            for i, lnk in enumerate(target_metajson['link']):
                fltarget = Path('i_c').joinpath(lnk.split('://', 1)[1])
                fltarget = Path(*[part[:255] for part in fltarget.parts])
                fltarget_meta = fltarget.joinpath('_meta.json')
                fltarget_metajson = json.loads(fltarget_meta.read_text())
                ext = fltarget_metajson['ext']
                ffl = '%04d.%s' % (i, ext)
                fltarget_file = fltarget.joinpath(fltarget_metajson['disk'])
                download_path.joinpath(ffl).symlink_to(f'../../{str(fltarget_file)}')
                files.append({
                    'dname': ffl,
                    'ext': ext,
                    'link': lnk,
                })
            download_path.joinpath('meta.json').write_text(json.dumps(files, sort_keys=True, indent=2))
        else:
            raise ValueError("type field cannot be %r" % target_metajson['type'])
        return True


def fix_cache(download_target, link):
    """Move a fresh download into the i_c/ cache and replace it with symlinks."""
    if link.startswith('/r/'):
        link = 'https://www.reddit.com' + link
    download_path = Path(download_target)
    download_meta = download_path.joinpath('meta.json')
    downloads = json.loads(download_meta.read_text())
    target = Path('i_c').joinpath(link.split('://', 1)[1])
    target = Path(*[part[:255] for part in target.parts])
    target.mkdir(parents=True, exist_ok=True)
    protocolless_link = link.split('://', 1)[1]
    if len(downloads) == 1 and downloads[0]['link'].split('://', 1)[1] == protocolless_link:
        # A single file whose URL matches the requested link: cache it directly.
        fix_cache_relocate_single_file_from_download(download_path, downloads[0], target)
    else:
        # An album: record the member links, then cache each file under its own URL.
        target_meta = target.joinpath('_meta.json')
        if not target_meta.exists():
            target_meta.write_text(json.dumps({
                'type': 'album',
                'link': [download['link'] for download in downloads],
            }, sort_keys=True, indent=2))
        for download in downloads:
            fix_cache_relocate_single_file_from_download(
                download_path,
                download,
                Path('i_c').joinpath(download['link'].split('://', 1)[1])
            )


def fix_cache_relocate_single_file_from_download(download_path, download, target):
    """Copy one downloaded file into the cache, symlink it back, and record its hash."""
    target = Path(*[part[:255] for part in target.parts])
    target_meta = target.joinpath('_meta.json')
    ext = download['ext']
    target_file = target.joinpath(f"file.{ext}")
    target_hashfile = target.joinpath(f"file.{ext}.sha256")
    downloaded_file = download_path.joinpath(download['dname'])
    if not downloaded_file.is_symlink():
        # Not cached yet: write the metadata, copy the bytes into the cache,
        # then replace the original download with a symlink into i_c/.
        target_meta.parent.mkdir(parents=True, exist_ok=True)
        target_meta.write_text(json.dumps({
            'type': 'file',
            'link': download['link'].split('://', 1)[1],
            'ext': ext,
            'disk': target_file.name,
        }, sort_keys=True, indent=2))
        target_file.write_bytes(downloaded_file.read_bytes())
        if target_hashfile.exists():
            target_hashfile.unlink()
        downloaded_file.unlink()
        downloaded_file.symlink_to(f'../../{str(target_file)}')
    if not target_hashfile.exists():
        # Store a SHA-256 digest next to the cached file for integrity checks.
        m = hashlib.sha256()
        m.update(target_file.read_bytes())
        target_hashfile.write_text(m.hexdigest())


if __name__ == '__main__':
    main()
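
# Working-directory layout, as inferred from the code above. The <placeholders>
# are illustrative only, not names taken from real data:
#
#   r/<subreddit>/subreddit.json                input read by main(): {"links": [...]}
#   ignored.txt                                 optional; one URL per line to skip
#   i/<datakey>/meta.json                       per-post download manifest
#   i/<datakey>/0000.<ext>, 0001.<ext>, ...     symlinks into the content cache
#   i_c/<protocol-less URL>/_meta.json          {"type": "file"|"album", ...}
#   i_c/<protocol-less URL>/file.<ext>          cached bytes
#   i_c/<protocol-less URL>/file.<ext>.sha256   SHA-256 hex digest of the cached bytes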