#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import json
import os
from pathlib import Path

# The relative imports below mean this script runs as part of its package
# (e.g. via `python -m ...`), not as a standalone file.
from .system import downloader as downloaderModule
from .system.downloader import cache

downloaderGetter = downloaderModule.getDownloader

wdir = os.path.abspath('.')
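
# Expected working-directory layout (inferred from the paths used below):
#   r/<subreddit>/subreddit.json  - per-subreddit link dumps
#   i/<datakey>/                  - download target for each link (meta.json inside)
#   ignored.txt                   - optional list of raw links to skip
#   i_c.json, i_cde.json          - image catalog and retry bookkeeping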

def _isImageDirectLink(s):
    # Direct media links are recognized purely by file extension.
    return s.endswith(('.jpg', '.png', '.gif', '.webp', '.mp4'))

def isImageDirectLink(s):
    # Also accept direct links carrying a query string (check the part before '?').
    return _isImageDirectLink(s) or _isImageDirectLink(s.split('?', 1)[0])

def retry():
    # Re-run the full pass, retrying albums that previously yielded no files.
    main(True)

def main(retryEmptyAlbums=False):
    # Gather every link dump under r/<subreddit>/subreddit.json, tagging each
    # link with the subreddit it came from.
    links = list()
    subreddits = sorted(filter(lambda sr: os.path.isdir(os.path.join(wdir, 'r', sr)), os.listdir(os.path.join(wdir, 'r'))))
    for subreddit in subreddits:
        srf = os.path.abspath(os.path.join(wdir, 'r', subreddit, 'subreddit.json'))
        links2 = list()
        try:
            with open(srf) as f:
                links2 = json.loads(f.read())['links']
            for lnk in links2:
                lnk['subreddit'] = subreddit
            del lnk
        except Exception:
            # A subreddit without a readable dump is simply skipped.
            pass
        links += links2
        del links2
        del srf
    del subreddit
    del subreddits

    # Oldest first, and drop links that still contain HTML comment markers.
    links.sort(key=lambda link: link['timestamp'])
    links = list(filter(lambda l: '<!--' not in l['link'], links))
    gallery_link_downloader = downloaderGetter('gallery_link')
    gallery_link_downloader_inst = gallery_link_downloader()

    # Bucket links by source: direct media links together, the rest by domain.
    medias = dict(direct_link=list(), gallery_link=list())
    for link in links:
        if isImageDirectLink(link['link']):
            medias['direct_link'].append(link)
            continue
        elif link['domain'] not in medias:
            medias[link['domain']] = list()
        medias[link['domain']].append(link)
    del link
    del links
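
    # Weed-out pass: for domains with no dedicated downloader, rescue links the
    # generic gallery downloader recognizes and tally the rest as missing.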
    priorities_d = dict()  # display domain -> dropped-link count (for fetch_missing.json)
    priorities = list()    # (dropped-link count, source) pairs for the summary table
    for source, links in sorted(medias.items()):
        downloaderClass = downloaderGetter(source)
        if downloaderClass is None:
            toEliminate = list()
            for seq, link in enumerate(links):
                if gallery_link_downloader_inst.recognizes(link['link']):
                    medias['gallery_link'].append(link)
                    toEliminate.append(seq)
            for i in reversed(toEliminate):
                del links[i]
            if len(links) <= 0:
                del medias[source]
                continue
            print('No downloader for: {0:<35} | {1:>5} links dropped'.format(source, len(links)))
            priorities.append((len(links), source))
            k = fix_domain_for_display(source)
            if k not in priorities_d:
                priorities_d[k] = 0
            priorities_d[k] += len(links)
            del medias[source]
            continue
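
    # Order each bucket by display domain, then oldest first, so downloads sweep
    # one site at a time.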
    for linkset in medias.values():
        linkset.sort(key=lambda a: (
            fix_domain_for_display(cache.get_domain(cache.get_normalized_link(a['link']))),
            a['timestamp'],
            a['datakey'],
        ))

    # Persist the missing-downloader tally; sort_keys=True already orders the keys.
    Path('fetch_missing.json').write_text(json.dumps(priorities_d, sort_keys=True, indent=1))
    top_priorities = sorted(priorities, reverse=True)[:10]
    prioremain = sum(count for count, _ in sorted(priorities, reverse=True)[10:])
    priolen = len(priorities)
    del priorities
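
    # Optional manual skip list: one raw link per line.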
    ignorelist = list()
    if os.path.isfile(os.path.join(wdir, 'ignored.txt')):
        with open(os.path.join(wdir, 'ignored.txt')) as f:
            ignorelist = f.read().splitlines()

    # Image catalog (i_c.json) and list of albums already retried (i_cde.json).
    image_catalog_file = Path('i_c.json')
    if not image_catalog_file.exists():
        image_catalog_file.write_text('[]')
    image_cde_file = Path('i_cde.json')
    if not image_cde_file.exists():
        image_cde_file.write_text('[]')

    image_cde = list()
    if retryEmptyAlbums:
        image_cde = json.loads(image_cde_file.read_text())
    image_catalog = json.loads(image_catalog_file.read_text())
    image_catalog_set = set(map(lambda a: a['datakey'], image_catalog))
    image_catalog_keyed = dict(map(lambda a: (a['datakey'], a), image_catalog))
    downloadcount = 0
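
    # Main download pass. For the progress line we precompute, for every position
    # in a bucket, how many links remain before the display domain changes and
    # which domain comes next.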
    for source, links in sorted(medias.items()):
        print('Changing downloader for next %d links on %s' % (len(links), source))
        #if not source.endswith('.deviantart.com'): continue
        #if source != 'i.reddituploads.com': continue
        downloaderClass = downloaderGetter(source)
        downloader = downloaderClass()
        # Walk the bucket backwards so each entry knows the distance to the next
        # domain change and the domain that follows it.
        untilDomainsChange = list()
        prevDomain = None
        currDomain = None
        lastSeq = 0
        for seq, link in enumerate(reversed(links)):
            domain = fix_domain_for_display(cache.get_domain(cache.get_normalized_link(link['link'])))
            if domain != currDomain:
                prevDomain = currDomain
                currDomain = domain
                lastSeq = seq
            passCount = seq - lastSeq
            pastDomain = dict(
                nextDomain=prevDomain if prevDomain is not None else '-',
                count=passCount,
            )
            # Abbreviate long domains to at most 15 characters for the progress line.
            pastDomain['nextDomainAbbrv'] = (
                "\u2026" + pastDomain['nextDomain'][-14:]
                if len(pastDomain['nextDomain']) > 15 else
                pastDomain['nextDomain']
            )
            untilDomainsChange.append(pastDomain)
        untilDomainsChange.reverse()

        for seq, link in enumerate(links):
            # Skip links with overlong datakeys.
            if len(link['datakey']) > 20:
                continue
            # Skip links already catalogued, unless we are retrying albums whose
            # previous run downloaded zero files.
            if link['datakey'] in image_catalog_set and not (retryEmptyAlbums and len(image_catalog_keyed.get(link['datakey'], {'downloaded': [1]})['downloaded']) == 0):
                continue
            prnmsg = 'Downloading link #%05d of %05d: %s << %s | %03d>>%s' % (
                seq + 1,
                len(links),
                link['link'],
                link['datakey'],
                untilDomainsChange[seq]['count'] + 1,
                untilDomainsChange[seq]['nextDomainAbbrv'],
            )
            if link['link'] in ignorelist:
                print('Skipping' + prnmsg[11:])
                continue
            print(prnmsg)
            if not downloader.recognizes(link['link']):
                continue
            target = os.path.join(wdir, 'i', link['datakey'])
            targetJson = os.path.join(target, 'meta.json')
            if not os.path.exists(targetJson) or (retryEmptyAlbums and link['datakey'] not in image_cde):
                # Prefer replaying a cached copy; otherwise download for real.
                if not cache.replicate_from_cache(target, link['link']) or (retryEmptyAlbums and link['datakey'] not in image_cde):
                    downloader.download(link['link']).into(target)
                    cache.uncache_download(link['link'])
                    if retryEmptyAlbums:
                        downloadcount += 1
                        image_cde.append(link['datakey'])
                        if downloadcount > 9:
                            # Checkpoint every 10 fresh downloads on a retry run.
                            print("Saving checkpoint... ", end='\r')
                            image_catalog_file.write_text(json.dumps(image_catalog, indent=1))
                            image_cde_file.write_text(json.dumps(image_cde, indent=1))
                            downloadcount = 0
            # Record what actually landed on disk alongside the link's metadata.
            thumbhashdata = cache.fix_cache(target, link['link'])
            if retryEmptyAlbums and len(thumbhashdata) > 0:
                print(f"Profit! Got {len(thumbhashdata)} files.")
            metadata = link.copy()
            del metadata['link']
            del metadata['domain']
            metadata['downloaded'] = thumbhashdata
            if metadata['sharer'] == '[deleted]':
                metadata['sharer'] = None
            if retryEmptyAlbums:
                # On retry runs, replace the existing catalog entry in place.
                found = None
                if link['datakey'] in image_catalog_set:
                    for seqcat, savedEntry in enumerate(image_catalog):
                        if link['datakey'] == savedEntry['datakey']:
                            found = seqcat
                            break
                if found is None:
                    image_catalog.append(metadata)
                else:
                    image_catalog[found] = metadata
            else:
                image_catalog.append(metadata)
            image_catalog_set.add(link['datakey'])
            image_catalog_keyed[link['datakey']] = metadata
            image_catalog_file.write_text(json.dumps(image_catalog, indent=1))

    # All buckets processed: clear retry bookkeeping and leftover temp files.
    if retryEmptyAlbums:
        image_cde_file.unlink()
    if (pth := Path('latest_put_image.url')).exists():
        pth.unlink()
    if (pth := Path('latest_put_image.file')).exists():
        pth.unlink()
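
    # Fixed-width summary table of the domains that still lack downloaders.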
    print()
    print('=' * 47)
    print('| {0:^43} |'.format('Missing downloaders'))
    print('=' * 47)
    print('| {0:^30} | {1:^10} |'.format('Domain', 'Hits'))
    print('-' * 47)
    for priority in top_priorities:
        print('| {1:^30} | {0:^10} |'.format(*priority))
    del priority
    del top_priorities
    if priolen > 10:
        # Only print the overflow row when more than the top 10 were dropped.
        print('|' + '.' * 32 + '|' + '.' * 12 + '|')
        print('| {0:^30} | {1:^10} |'.format('...and %d more domains' % (priolen - 10), prioremain))
    del priolen
    print('=' * 47)
    print()

def fix_domain_for_display(domain):
    # Collapse common mirror/CDN-style subdomains so hit counts aggregate per site.
    for prefix in ('www.', 'ww2.', 'www2.', 'old.', 'new.', 'beta.', 'test.',
                   'preview.', 'prev.', 'self.', 'cdn.', 'cdn2.', 'i.', 'v.', 'r.'):
        if domain.startswith(prefix):
            return fix_domain_for_display(domain[len(prefix):])
    # Fold per-user subdomains of a few big hosts into the parent site.
    for site in ('deviantart.com', 'tumblr.com', 'blogspot.com'):
        if domain.endswith(site) and domain != site:
            return fix_domain_for_display(site)
    if domain.endswith('.e-hentai.org'):
        return fix_domain_for_display('e-hentai.org')
    return domain


if __name__ == '__main__':
    main()