reddit-image-wall-getter/reddit_imgs/_fetch.py

#!/usr/bin/env python3
# -*- encoding: utf-8 -*-
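"""Fetch every catalogued Reddit link into the local image wall.

Reads each r/<subreddit>/subreddit.json, groups links by the downloader able
to handle them, downloads each into i/<datakey>/, and keeps the i_c.json
catalog (plus the i_cde.json retry checkpoint) up to date.
"""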
import json
import os
from pathlib import Path
from .system import downloader as downloaderModule
from .system.downloader import cache

downloaderGetter = downloaderModule.getDownloader
wdir = os.path.abspath('.')


def _isImageDirectLink(s):
    # Match common direct-media file extensions (including .mp4 video).
    return s.endswith(('.jpg', '.png', '.gif', '.webp', '.mp4'))


def isImageDirectLink(s):
    # Also accept direct links that carry a query string, e.g. '.../a.jpg?w=640'.
    return _isImageDirectLink(s) or _isImageDirectLink(s.split('?', 1)[0])


def retry():
    main(True)

def main(retryEmptyAlbums=False):
    """Download every known link and update the image catalog.

    With retryEmptyAlbums=True (see retry()), catalog entries whose previous
    download produced zero files are attempted again.
    """
    links = list()
    subreddits = sorted(filter(
        lambda sr: os.path.isdir(os.path.join(wdir, 'r', sr)),
        os.listdir(os.path.join(wdir, 'r'))))
    for subreddit in subreddits:
        srf = os.path.abspath(os.path.join(wdir, 'r', subreddit, 'subreddit.json'))
        links2 = list()
        try:
            with open(srf) as f:
                links2 = json.loads(f.read())['links']
            for lnk in links2:
                lnk['subreddit'] = subreddit
                del lnk
        except (OSError, ValueError, KeyError):
            # Subreddit not fetched yet, or its subreddit.json is malformed.
            pass
        links += links2
        del links2
        del srf
        del subreddit
    del subreddits
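    # Order links chronologically and bucket them by the downloader that will
    # handle them: direct file links, gallery pages, or per-domain handlers.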
    links.sort(key=lambda link: link['timestamp'])
    links = list(filter(lambda l: '<!--' not in l['link'], links))
    gallery_link_downloader = downloaderGetter('gallery_link')
    gallery_link_downloader_inst = gallery_link_downloader()
    medias = dict(direct_link=list(), gallery_link=list())
    for link in links:
        if isImageDirectLink(link['link']):
            medias['direct_link'].append(link)
            continue
        elif link['domain'] not in medias:
            medias[link['domain']] = list()
        medias[link['domain']].append(link)
        del link
    del links
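    # For domains with no dedicated downloader, salvage whatever the generic
    # gallery handler recognizes and tally the rest as missing-downloader stats.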
    priorities_d = dict()
    priorities = list()
    for source, links in sorted(medias.items()):
        downloaderClass = downloaderGetter(source)
        if downloaderClass is None:
            toEliminate = list()
            for seq, link in enumerate(links):
                if gallery_link_downloader_inst.recognizes(link['link']):
                    medias['gallery_link'].append(link)
                    toEliminate.append(seq)
            for i in reversed(toEliminate):
                del links[i]
            if len(links) <= 0:
                del medias[source]
                continue
            msg = 'No downloader for: {0:<35} | {1:>5} links dropped'.format(source, len(links))
            print(msg)
            priorities.append((len(links), source))
            k = fix_domain_for_display(source)
            if k not in priorities_d:
                priorities_d[k] = 0
            priorities_d[k] += len(links)
            del medias[source]
    for linkset in medias.values():
        linkset.sort(key=lambda a: (
            fix_domain_for_display(cache.get_domain(cache.get_normalized_link(a['link']))),
            a['timestamp'],
            a['datakey'],
        ))
    # sort_keys=True already orders the JSON output by domain, so the counter
    # dict can be dumped directly.
    Path('fetch_missing.json').write_text(json.dumps(priorities_d, sort_keys=True, indent=1))
    top_priorities = sorted(priorities, reverse=True)[:10]
    prioremain = sum(a[0] for a in sorted(priorities, reverse=True)[10:])
    priolen = len(priorities)
    del priorities
    ignorelist = list()
    if os.path.isfile(os.path.join(wdir, 'ignored.txt')):
        with open(os.path.join(wdir, 'ignored.txt')) as f:
            ignorelist = f.read().splitlines()
    image_catalog_file = Path('i_c.json')
    if not image_catalog_file.exists():
        image_catalog_file.write_text('[]')
    image_cde_file = Path('i_cde.json')
    if not image_cde_file.exists():
        image_cde_file.write_text('[]')
    image_cde = list()
    if retryEmptyAlbums:
        image_cde = json.loads(image_cde_file.read_text())
    image_catalog = json.loads(image_catalog_file.read_text())
    image_catalog_set = {a['datakey'] for a in image_catalog}
    image_catalog_keyed = {a['datakey']: a for a in image_catalog}
    downloadcount = 0
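    # Main download pass: one downloader instance per source bucket.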
    for source, links in sorted(medias.items()):
        print('Changing downloader for next %d links on %s' % (len(links), source))
        # if not source.endswith('.deviantart.com'): continue
        # if source != 'i.reddituploads.com': continue
        downloaderClass = downloaderGetter(source)
        downloader = downloaderClass()
        # Walk the links back-to-front to precompute, for every position, how
        # many links remain before the display domain changes and what follows.
        untilDomainsChange = list()
        prevDomain = None
        currDomain = None
        lastSeq = 0
        for seq, link in enumerate(reversed(links)):
            domain = fix_domain_for_display(cache.get_domain(cache.get_normalized_link(link['link'])))
            if domain != currDomain:
                prevDomain = currDomain
                currDomain = domain
                lastSeq = seq
            passCount = seq - lastSeq
            pastDomain = dict(
                nextDomain=prevDomain if prevDomain is not None else '-',
                count=passCount
            )
            pastDomain['nextDomainAbbrv'] = (
                "\u2026" + pastDomain['nextDomain'][-14:]
                if len(pastDomain['nextDomain']) > 15 else
                pastDomain['nextDomain']
            )
            untilDomainsChange.append(pastDomain)
        untilDomainsChange.reverse()
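        # Download each link in order, checkpointing the catalog as it goes.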
        for seq, link in enumerate(links):
            if len(link['datakey']) > 20:
                continue
            # Skip links already in the catalog, unless we are retrying entries
            # whose album came back empty on a previous run.
            if link['datakey'] in image_catalog_set and not (
                retryEmptyAlbums
                and len(image_catalog_keyed.get(link['datakey'], {'downloaded': [1]})['downloaded']) == 0
            ):
                continue
            prnmsg = 'Downloading link #%05d of %05d: %s << %s | %03d>>%s' % (
                seq + 1,
                len(links),
                link['link'],
                link['datakey'],
                untilDomainsChange[seq]['count'] + 1,
                untilDomainsChange[seq]['nextDomainAbbrv']
            )
            if link['link'] in ignorelist:
                print('Skipping' + prnmsg[11:])
                continue
            else:
                print(prnmsg)
            if not downloader.recognizes(link['link']):
                continue
            target = os.path.join(wdir, 'i', link['datakey'])
            targetJson = os.path.join(target, 'meta.json')
            if not os.path.exists(targetJson) or (retryEmptyAlbums and link['datakey'] not in image_cde):
                # Prefer replicating from the shared cache before hitting the network.
                if not cache.replicate_from_cache(target, link['link']) or (retryEmptyAlbums and link['datakey'] not in image_cde):
                    downloader.download(link['link']).into(target)
                    cache.uncache_download(link['link'])
                    if retryEmptyAlbums:
                        downloadcount += 1
                        image_cde.append(link['datakey'])
                        if downloadcount > 9:
                            print("Saving checkpoint... ", end='\r')
                            image_catalog_file.write_text(json.dumps(image_catalog, indent=1))
                            image_cde_file.write_text(json.dumps(image_cde, indent=1))
                            downloadcount = 0
            thumbhashdata = cache.fix_cache(target, link['link'])
            if retryEmptyAlbums and len(thumbhashdata) > 0:
                print(f"Profit! Got {len(thumbhashdata)} files. ")
            metadata = link.copy()
            del metadata['link']
            del metadata['domain']
            metadata['downloaded'] = thumbhashdata
            if metadata['sharer'] == '[deleted]':
                metadata['sharer'] = None
            if retryEmptyAlbums:
                found = None
                if link['datakey'] in image_catalog_set:
                    for seqcat, savedEntry in enumerate(image_catalog):
                        if link['datakey'] == savedEntry['datakey']:
                            found = seqcat
                            break
                if found is None:
                    image_catalog.append(metadata)
                else:
                    image_catalog[found] = metadata
            else:
                image_catalog.append(metadata)
            image_catalog_set.add(link['datakey'])
            image_catalog_keyed[link['datakey']] = metadata
            image_catalog_file.write_text(json.dumps(image_catalog, indent=1))
    if retryEmptyAlbums:
        image_cde_file.unlink()
    if (pth := Path('latest_put_image.url')).exists():
        pth.unlink()
    if (pth := Path('latest_put_image.file')).exists():
        pth.unlink()
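    # Render a summary table of the most-requested domains without downloaders.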
    print()
    print('=' * 47)
    print('| {0:^43} |'.format('Missing downloaders'))
    print('=' * 47)
    print('| {0:^30} | {1:^10} |'.format('Domain', 'Hits'))
    print('-' * 47)
    for priority in top_priorities:
        print('| {0:^30} | {1:^10} |'.format(*reversed(priority)))
        del priority
    del top_priorities
    if priolen > 10:
        print('|' + '.' * 32 + '|' + '.' * 12 + '|')
        print('| {0:^30} | {1:^10} |'.format('...and %d more domains' % (priolen - 10), prioremain))
    del priolen
    print('=' * 47)
    print()

def fix_domain_for_display(domain):
    # Collapse mirror/CDN/subdomain prefixes and per-user subdomains so that
    # statistics aggregate under a single display name per site.
    for prefix in ('www.', 'ww2.', 'www2.', 'old.', 'new.', 'beta.', 'test.',
                   'preview.', 'prev.', 'self.', 'cdn.', 'cdn2.', 'i.', 'v.', 'r.'):
        if domain.startswith(prefix):
            return fix_domain_for_display(domain[len(prefix):])
    for site in ('deviantart.com', 'tumblr.com', 'blogspot.com'):
        if domain.endswith(site) and domain != site:
            return fix_domain_for_display(site)
    if domain.endswith('.e-hentai.org'):
        return fix_domain_for_display('e-hentai.org')
    return domain

if __name__ == '__main__':
    main()
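
# Note: the relative imports above mean this file must run as a module, e.g.
# `python3 -m reddit_imgs._fetch` from the repository root (assuming the
# working directory holds the r/ and i/ trees this module expects).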