2020-05-13 21:07:05 +00:00
|
|
|
#!/usr/bin/env python3
|
|
|
|
# -*- encoding: utf-8 -*-
|
|
|
|
|
|
|
|
from pathlib import Path
|
|
|
|
import json
|
|
|
|
import hashlib
|
|
|
|
import shutil
|
|
|
|
|
|
|
|
def get_normalized_link(link: str) -> str:
    """Return *link* as an absolute URL.

    Site-relative reddit paths (``/r/...`` subreddit links and
    ``/user/...`` profile links) are prefixed with
    ``https://www.reddit.com``; every other link is returned unchanged.
    """
    # Both relative forms resolve against the same host, so one
    # startswith() call with a tuple of prefixes replaces the two
    # duplicated if-blocks of the original.
    if link.startswith(('/r/', '/user/')):
        link = 'https://www.reddit.com' + link
    return link
|
|
|
|
|
|
|
|
|
|
|
|
def limit_filename_lenght(target: Path) -> Path:
    """Clamp every component of *target* to 255 characters.

    Most filesystems reject path components longer than 255 bytes;
    truncating each part keeps the path creatable on disk.
    (Name keeps the historical spelling used by callers.)
    """
    clipped = tuple(component[:255] for component in target.parts)
    return Path(*clipped)
|
|
|
|
|
|
|
|
|
|
|
|
def get_domain(link: str) -> str:
    """Return the host component of *link*.

    Derived from the cache path: the first component after the ``i_c``
    cache root is the link's domain.
    """
    cache_path = get_path_for_caching(link)
    return cache_path.parts[1]
|
|
|
|
|
|
|
|
|
|
|
|
def get_path_for_caching(link: str) -> Path:
    """Map *link* to its on-disk cache directory under ``i_c/``.

    The URL scheme is stripped and the remainder of the link becomes
    the directory hierarchy; every component is clamped to a
    filesystem-safe length.
    """
    normalized = get_normalized_link(link)
    without_scheme = normalized.split('://', 1)[1]
    raw_path = Path('i_c') / without_scheme
    return limit_filename_lenght(raw_path)
|
|
|
|
|
2020-06-01 03:20:23 +00:00
|
|
|
def has_file_cache(cached: Path) -> bool:
    """Return True when *cached* holds a complete single-file cache entry.

    A valid entry is a directory containing a ``_meta.json`` whose
    ``type`` is ``'file'`` and whose ``disk`` field names an existing
    payload file inside that directory.
    """
    if not cached.exists():
        return False
    metafile = cached.joinpath('_meta.json')
    if not metafile.exists():
        return False
    # Fix: reuse the already-built metafile path instead of re-joining
    # '_meta.json' a second time (the original rebuilt the same path).
    meta = json.loads(metafile.read_text())
    if meta['type'] != 'file':
        return False
    return cached.joinpath(meta['disk']).exists()
|
|
|
|
|
2020-05-13 21:07:05 +00:00
|
|
|
|
|
|
|
def read_file_from_cache(cached: Path) -> bytes:
    """Return the payload bytes of the single-file cache entry *cached*.

    Raises:
        ValueError: if the entry directory, its ``_meta.json``, or the
            payload file is missing, or if the entry is an album rather
            than a single file.
    """
    if not cached.exists():
        raise ValueError("Cannot read from non-existing cache: %r" % cached)
    metafile = cached.joinpath('_meta.json')
    if not metafile.exists():
        raise ValueError("Cannot read from broken cache: %r" % metafile)
    # Fix: reuse the already-built metafile path instead of re-joining
    # '_meta.json' a second time (the original rebuilt the same path).
    meta = json.loads(metafile.read_text())
    if meta['type'] != 'file':
        raise ValueError("Cannot read a gallery as single file: %r" % cached)
    file = cached.joinpath(meta['disk'])
    if not file.exists():
        raise ValueError("Cannot locate missing file: %r" % file)
    return file.read_bytes()
|
|
|
|
|
|
|
|
|
|
|
|
def uncache_download(link):
    """Delete the cache directory for *link*, if one exists."""
    cache_dir = get_path_for_caching(link)
    if cache_dir.exists():
        shutil.rmtree(cache_dir)
|
|
|
|
|
|
|
|
|
|
|
|
def _symlink_cached_file(download_path, cache_file, dname):
    """Create (or refresh) a relative symlink *dname* inside
    *download_path* pointing at *cache_file* in the cache tree."""
    sl = download_path.joinpath(dname)
    if sl.exists():
        sl.unlink()
    # Download dirs appear to live two levels below the repo root,
    # hence the fixed '../../' prefix — TODO confirm against callers.
    sl.symlink_to(f'../../{str(cache_file)}')


def replicate_from_cache(download_target, link):
    """Rebuild the download directory for *link* from the cache.

    Creates *download_target* with symlinks into the ``i_c`` cache tree
    and a ``meta.json`` describing each file.

    Returns:
        True when the link was cached and replicated, False when no
        cache entry exists for it.

    Raises:
        ValueError: if the cache entry's ``type`` field is neither
            ``'file'`` nor ``'album'``.
    """
    link = get_normalized_link(link)
    download_path = Path(download_target)
    target = get_path_for_caching(link)
    target_meta = target.joinpath('_meta.json')
    if not target_meta.exists():
        return False
    target_metajson = json.loads(target_meta.read_text())
    if target_metajson['type'] == 'file':
        download_path.mkdir(parents=True, exist_ok=True)
        ext = target_metajson['ext']
        ffl = f"0000.{ext}"
        target_file = target.joinpath(target_metajson['disk'])
        _symlink_cached_file(download_path, target_file, ffl)
        download_path.joinpath('meta.json').write_text(json.dumps([{
            'dname': ffl,
            'ext': ext,
            'link': link,
        }], sort_keys=True, indent=2))
    elif target_metajson['type'] == 'album':
        download_path.mkdir(parents=True, exist_ok=True)
        files = []
        for i, lnk in enumerate(target_metajson['link']):
            # Consistency fix: call the shared limit_filename_lenght()
            # helper instead of the original's inline copy of its logic.
            fltarget = limit_filename_lenght(
                Path('i_c').joinpath(lnk.split('://', 1)[1]))
            fltarget_metajson = json.loads(
                fltarget.joinpath('_meta.json').read_text())
            ext = fltarget_metajson['ext']
            ffl = '%04d.%s' % (i, ext)
            fltarget_file = fltarget.joinpath(fltarget_metajson['disk'])
            _symlink_cached_file(download_path, fltarget_file, ffl)
            files.append({
                'dname': ffl,
                'ext': ext,
                'link': lnk,
            })
        download_path.joinpath('meta.json').write_text(
            json.dumps(files, sort_keys=True, indent=2))
    else:
        raise ValueError("type field cannot be %r" % target_metajson['type'])
    return True
|
|
|
|
|
|
|
|
|
|
|
|
def fix_cache(download_target, link):
    """Relocate the downloaded file(s) for *link* into the cache tree.

    Reads the download directory's ``meta.json``, moves each file into
    the ``i_c`` cache (leaving symlinks behind), and writes an album
    ``_meta.json`` when the download holds several files.

    Returns a list of thumbs-dicts, one per relocated file.
    """
    link = get_normalized_link(link)
    download_path = Path(download_target)
    downloads = json.loads(download_path.joinpath('meta.json').read_text())
    target = get_path_for_caching(link)
    target.mkdir(parents=True, exist_ok=True)
    protocolless_link = link.split('://', 1)[1]

    is_single_file = (
        len(downloads) == 1
        and downloads[0]['link'].split('://', 1)[1] == protocolless_link
    )
    if is_single_file:
        # A lone download whose link matches the page link: cache it
        # directly as a 'file' entry.
        relocated = fix_cache_relocate_single_file_from_download(
            download_path, downloads[0], target)
        return [relocated]

    # Multiple files (or a differing link): record an 'album' entry that
    # points at one per-file cache entry per download.
    target_meta = target.joinpath('_meta.json')
    if not target_meta.exists():
        target_meta.write_text(json.dumps({
            'type': 'album',
            'link': [dl['link'] for dl in downloads],
        }, sort_keys=True, indent=2))
    results = []
    for dl in downloads:
        per_file_target = Path('i_c').joinpath(dl['link'].split('://', 1)[1])
        results.append(fix_cache_relocate_single_file_from_download(
            download_path, dl, per_file_target))
    return results
|
|
|
|
|
|
|
|
|
|
|
|
def fix_cache_relocate_single_file_from_download(download_path, download, target):
    """Move one downloaded file into the cache at *target* and replace
    the original download with a symlink into the cache.

    Args (shapes inferred from this file's callers — TODO confirm):
        download_path: Path of the download directory holding the file.
        download: dict with 'dname', 'ext' and 'link' keys (one entry of
            the download dir's meta.json).
        target: cache directory for this file (created if needed).

    Returns:
        dict with key 'file' mapping to the cached file's path string.

    Raises:
        Exception: if, after relocation, the cached file still does not
            exist; the invalid cache entry (and any download dir holding
            broken symlinks into it) is removed first.
    """
    thumbs_dict = dict()
    target = limit_filename_lenght(target)
    target_meta = target.joinpath('_meta.json')
    ext = download['ext']
    target_file = target.joinpath(f"file.{ext}",)
    target_hashfile = target.joinpath(f"file.{ext}.sha256",)
    downloaded_file = download_path.joinpath(download['dname'])
    # A symlinked download was already relocated on a previous run; only
    # copy real files into the cache.
    if not downloaded_file.is_symlink():
        target_meta.parent.mkdir(parents=True, exist_ok=True)
        target_meta.write_text(json.dumps({
            'type': 'file',
            'link': download['link'].split('://', 1)[1],
            'ext': ext,
            'disk': target_file.name,
        }, sort_keys=True, indent=2))
        target_file.write_bytes(downloaded_file.read_bytes())
        # The payload just changed, so any previous hash sidecar is stale.
        if target_hashfile.exists():
            target_hashfile.unlink()
        # Replace the download with a relative symlink into the cache.
        downloaded_file.unlink()
        downloaded_file.symlink_to(f'../../{str(target_file)}')
    if not target_file.exists():
        shutil.rmtree(target) # cache is invalid; remove it
        for fl in download_path.glob('*'):
            if fl.is_symlink(): # download has a broken symlink into cache
                shutil.rmtree(download_path)
                break
        raise Exception("Specified cached file does not exist.\n" +
                        f"Download path: {repr(download_path)}\n" +
                        f"Target: {repr(target)}")
    # Compute the sha256 sidecar lazily, only when missing.
    if not target_hashfile.exists():
        m = hashlib.sha256()
        m.update(target_file.read_bytes())
        target_hashfile.write_text(m.hexdigest())
    thumbs_dict['file'] = str(target_file)
    return thumbs_dict
|