reddit-image-wall-getter/reddit_imgs/fetch2.py

#!/usr/bin/env python3
# -*- encoding: utf-8 -*-
import json
import math
import shutil
import sys
import traceback
from concurrent.futures import ProcessPoolExecutor as PoolExecutor
from pathlib import Path
from typing import List, Optional, Tuple
import colored as clrlib
import gallery_dl
import gallery_dl.config
import gallery_dl.exception
import gallery_dl.extractor
import gallery_dl.job
import gallery_dl.option
import gallery_dl.output
import gallery_dl.postprocessor.common
import reddit_imgs.sync
from .system.downloader.cache import get_normalized_link, get_path_for_caching
from .system.urlmatcher import search_urls

MAX_WORKERS = 12
SPLIT_WORKER_AFTER_N_LINKS = 1000
FORBIDDEN_WORKER_SPLITS = {
    'deviantart',
}
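

# Worker state files (written under i_gdl_w/) appear to follow the layout
# "status:total_links:remaining_links:downloaded_bytes:downloaded_files";
# this is inferred from the write_text() calls in main() and
# download_link_list() below.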
def main():
    subreddit_data_path = Path('r.json')
    if not subreddit_data_path.exists():
        print("Executing prerequisite...")
        reddit_imgs.sync.main()
    print('Loading posts...')
    workers_state_path = Path('i_gdl_w')
    workers_state_path.mkdir(exist_ok=True, parents=True)
    for wsp in workers_state_path.iterdir():
        wsp.unlink()
    subreddit_data = json.loads(subreddit_data_path.read_text())
    links = dict()
    postsl = [
        {**post, 'subreddit': subreddit}
        for subreddit, srdt in subreddit_data.items()
        for post in srdt['links']
    ]
    postsd = dict()
    # Merge posts that share the same datakey across subreddits.
    for post in postsl:
        dk = post['datakey']
        if dk not in postsd:
            postsd[dk] = post.copy()
            postsd[dk]['subreddits'] = list()
            postsd[dk]['links'] = list()
            del postsd[dk]['subreddit']
            del postsd[dk]['link']
            del postsd[dk]['domain']
        if (sr := post['subreddit']) not in (srs := postsd[dk]['subreddits']):
            srs.append(sr)
        if (lnk := get_normalized_link(post['link'])) not in (lnks := postsd[dk]['links']):
            lnks.append(lnk)
    posts = postsd
    del postsl
    del postsd
    print(f'{len(posts)} posts identified.')
    print('Identifying alternative trivial links...')
    for post in posts.values():
        dk = post['datakey']
        post_links = post['links']
        has_added_any_link = True
        while has_added_any_link:
            has_added_any_link = False
            for link in post_links:
                linkcopy = link
                while linkcopy.endswith('/') or linkcopy.endswith('#') or linkcopy.endswith('?'):
                    linkcopy = linkcopy[:-1]
                if linkcopy not in post_links:
                    post_links.append(linkcopy)
                    has_added_any_link = True
                if '<!--' in link:
                    for linkcopy in search_urls(link):
                        if linkcopy not in post_links:
                            post_links.append(linkcopy)
                            has_added_any_link = True
    Path('r_gdl_p.json').write_text(json.dumps(posts, indent=1))
    print('Grouping links with the posts they show up in...')
    for dk, post in posts.items():
        for link in post['links']:
            if link not in links:
                links[link] = list()
            links[link].append(dk)
    Path('r_gdl_lp.json').write_text(json.dumps(links, indent=1))
    print(f'{len(links)} links found')
    print('Checking if there is an extractor for each link...')
    r_gdl_le_path = Path('r_gdl_le.json')
    link_extractors = dict()
    if r_gdl_le_path.exists():
        link_extractors = json.loads(r_gdl_le_path.read_text())
    for link in links.keys():
        if link not in link_extractors or link_extractors[link] == '':
            ext = None
            try:
                ext = gallery_dl.extractor.find(link)
            except gallery_dl.exception.NotFoundError:
                pass
            link_extractors[link] = type(
                ext).category if ext is not None else ''
    r_gdl_le_path.write_text(json.dumps(link_extractors, indent=1))
    links_by_extractor = {
        extractor: list()
        for extractor in list(set(link_extractors.values()))
    }
    for link, extractor in link_extractors.items():
        links_by_extractor[extractor].append(link)
    undownloadable_posts = links_by_extractor.get('', [])
    Path('i_undownloadable.json').write_text(
        json.dumps(undownloadable_posts, indent=1))
    if '' in links_by_extractor:
        del links_by_extractor['']
    print(f'{len(links)-len(undownloadable_posts)} downloadable links found')
    print(f'{len(undownloadable_posts)} undownloadable links found')
    print(f'{len(links_by_extractor)} extractors found')
    Path('r_gdl_lbe.json').write_text(json.dumps(links_by_extractor, indent=1))
    ignored_links = set()
    if (p := Path('i_gdl_ignored.txt')).exists():
        ignored_links = set(list(filter(len, p.read_text().splitlines())))
    max_expected_jobs_for_extractor = 0
    for extractor, links in links_by_extractor.items():
        links = [link for link in links if link not in ignored_links]
        workers = math.ceil(len(links)/SPLIT_WORKER_AFTER_N_LINKS)
        if workers <= 1 or extractor in FORBIDDEN_WORKER_SPLITS:
            workers = 1
        max_expected_jobs_for_extractor = max(
            max_expected_jobs_for_extractor,
            workers
        )
    worker_by_seq = [list() for _ in range(max_expected_jobs_for_extractor)]
    links_to_worker = dict()
    # Split each extractor's links into batches of up to
    # SPLIT_WORKER_AFTER_N_LINKS links, except for extractors that
    # must not be split across workers.
    for extractor, links in links_by_extractor.items():
        links = [link for link in links if link not in ignored_links]
        workers = math.ceil(len(links)/SPLIT_WORKER_AFTER_N_LINKS)
        if workers <= 1 or extractor in FORBIDDEN_WORKER_SPLITS:
            links_to_worker[extractor] = links
            worker_by_seq[0].append(extractor)
        else:
            digits = math.ceil(math.log10(max(1, workers+1)))
            fmt = "%%0%dd" % digits
            for worker_no in range(workers):
                lowerlimit = (worker_no+0)*SPLIT_WORKER_AFTER_N_LINKS
                upperlimit = (worker_no+1)*SPLIT_WORKER_AFTER_N_LINKS
                thisrange = links[lowerlimit:upperlimit]
                worker_nm = extractor + ':' + (fmt % (worker_no))
                links_to_worker[worker_nm] = thisrange
                worker_by_seq[worker_no].append(worker_nm)
    for w in worker_by_seq:
        w.sort()
    workers_nicely_grouped = [
        worker
        for workergroup in worker_by_seq
        for worker in workergroup
        if worker != ''
    ]
    print(f'{len(links_to_worker)} workers to be spawned')
    configure_gdl()
    gallery_dl.output.select = lambda: ColoredLineOutput(False)
    files_from_links = dict()
    totalfiles = 0
    thread_ids = workers_nicely_grouped.copy()
    for line, thread_id in enumerate(thread_ids):
        workers_state_path.joinpath(thread_id+'=line').write_text(str(line))
        linkcount = len(links_to_worker[thread_id])
        workers_state_path.joinpath(thread_id).write_text(
            f'waiting:{linkcount}:{linkcount}:0:0')
    do_fancy_multithreading_panel = False
    thread_id_count = len(thread_ids)
    with PoolExecutor(min(MAX_WORKERS, thread_id_count)) as pe:
        if do_fancy_multithreading_panel:
            print('\033[2J', end='', flush=True)
            print('\033[0;0H', end='', flush=True)
        print('Downloading...', flush=True)
        if do_fancy_multithreading_panel:
            print('\033[0;0H', end='', flush=True)
        largest_tid_size = max(map(len, thread_ids))
        line2tid = dict()

        def done_callback_generator(line):
            nonlocal totalfiles

            def done_callback(job):
                # Runs in the parent process once this worker's future completes.
                nonlocal totalfiles
                thread_id = line2tid[line]
                links_list = links_to_worker[thread_id]
                workers_state_path.joinpath(thread_id).write_text(
                    f'finished:{len(links_list)}:0:0:0')
                print(clrlib.stylize(
                    f'Received job #{line}: {thread_id}', [
                        clrlib.fg('white'),
                        clrlib.bg('green'),
                        clrlib.attr('bold'),
                    ]
                ))
                totalbytes = 0
                thisfiles = 0
                generator = list()
                try:
                    generator = job.result()
                except:
                    with workers_state_path.joinpath(thread_id+'=exc').open('wt') as f:
                        traceback.print_exc(file=f)
                    traceback.print_exc()
                    sys.exit(255)
                for link, files in generator:
                    files_from_links[link] = files
                    lenfiles = len(files)
                    totalfiles += lenfiles
                    for file in files:
                        st = Path(file).stat()
                        totalbytes += st.st_size
                    thisfiles += lenfiles
                    workers_state_path.joinpath(thread_id).write_text(
                        f'finished:{len(links_list)}:0:{totalbytes}:{thisfiles}')
            return done_callback
        for line, thread_id in enumerate(thread_ids):
            line2tid[line] = thread_id
            links_list = links_to_worker[thread_id]
            workers_state_path.joinpath(thread_id).write_text(
                f'enqueued:{len(links_list)}:{len(links_list)}:0:0')
            print(clrlib.stylize(f'Starting job #{line}: {thread_id}', [
                clrlib.fg('white'),
                clrlib.bg('light_red'),
                clrlib.attr('bold'),
            ]))
            jobstartedmsg = clrlib.stylize(f'Starting job #{line}: {thread_id}', [
                clrlib.fg('black'),
                clrlib.bg('light_yellow'),
                clrlib.attr('bold'),
            ])
            thread_id_nmsz = len(thread_id)
            thread_id_display = thread_id + ' ' * \
                (largest_tid_size - thread_id_nmsz)
            job = pe.submit(
                download_link_list,
                links_list,
                thread_id_display,
                line+3 if do_fancy_multithreading_panel else None,
                jobstartedmsg,
                workers_state_path.joinpath(thread_id),
            )
            job.add_done_callback(done_callback_generator(line))
    Path('i_gdl_ffl.json').write_text(json.dumps(files_from_links, indent=1))
    if (p := Path('latest_image_download.txt')).exists():
        p.unlink()
    if workers_state_path.exists():
        for p in workers_state_path.glob('*'):
            p.unlink()
        shutil.rmtree(workers_state_path)
    print(f'Downloaded {totalfiles} files')


def download_link_list(links: List[str],
                       thread_id: str,
                       line: Optional[int] = None,
                       job_started_msg: Optional[str] = None,
                       thread_state_path: Optional[Path] = None,
                       ) -> List[Tuple[str, List[str]]]:
    '''Downloads a link list inside a ProcessPoolExecutor'''
    if job_started_msg is not None:
        print(job_started_msg)
    has_its_own_line = line is not None
    link_count = len(links)
    remaining_links = link_count
    configure_gdl()
    if thread_state_path is not None:
        thread_state_path.write_text(
            f'running:{link_count}:{remaining_links}:0:0')

    def get_printer():
        return ColoredLineOutput(
            has_its_own_line,
            prefix=(f'\033[{line};0H' if has_its_own_line else '') +
            clrlib.stylize('% 9d' % remaining_links, [clrlib.fg('light_cyan')]) +
            clrlib.stylize('@', [clrlib.fg('light_red')]) +
            clrlib.stylize(thread_id, [clrlib.fg('yellow')]) +
            clrlib.stylize('=', [clrlib.fg('dark_gray')]),
            suffix=('\033[K\033[0;0H' if has_its_own_line else ''),
            prefixsz=len(('% 9d' % 0)+' '+thread_id),
            suffixsz=0,
            write_successes_to=Path('latest_image_download.txt'),
        )
    gallery_dl.output.select = get_printer
    result = list()
    totalbytes = 0
    totalfiles = 0
    try:
        for link in links:
            scrubbing = True
            cachedir = get_path_for_caching(link, Path('i_gdl_c'))
            cachedir.mkdir(parents=True, exist_ok=True)
            metafile = cachedir.joinpath('_gdl_meta.json')
            meta = dict()
            # Reuse the cached result when every file recorded for this link
            # still exists on disk.
            link_already_downloaded = False
            if metafile.exists():
                meta = json.loads(metafile.read_text())
                if link in meta:
                    link_already_downloaded = True
                    for fl in meta[link]:
                        pth = Path(fl)
                        if not pth.exists():
                            link_already_downloaded = False
                            break
            if not link_already_downloaded:
                scrubbing = False
                if thread_state_path is not None:
                    thread_state_path.write_text(
                        f'running:{link_count}:{remaining_links}:{totalbytes}:{totalfiles}')
                job = DownloadJobWithCallSaverPostProcessor(link)
                job.out = get_printer()
                job.out.message(link, clrlib.fg('light_magenta'))
                job.run()
                files = list(map(lambda a: a[0], job.cspp.calls['run_final']))
                files = list(filter(lambda a: Path(a).exists(), files))
                meta[link] = files
                metafile.write_text(json.dumps(meta, indent=1))
            for fl in meta[link]:
                pth = Path(fl)
                if not pth.exists():
                    raise FileNotFoundError(
                        (link, link_already_downloaded, meta[link]))
                st = pth.stat()
                totalbytes += st.st_size
                totalfiles += 1
            result.append((link, meta[link]))
            remaining_links -= 1
            if thread_state_path is not None:
                scrubbing_running = 'scrubbing' if scrubbing else 'running'
                thread_state_path.write_text(
                    f'{scrubbing_running}:{link_count}:{remaining_links}:{totalbytes}:{totalfiles}')
    finally:
        print((f'\033[{line};0H' if has_its_own_line else '') +
              clrlib.stylize(thread_id.strip(), [clrlib.fg('yellow'), clrlib.attr('bold')]) +
              clrlib.stylize('#', [clrlib.fg('light_red')]) +
              clrlib.stylize('Done', [clrlib.fg('light_green')]) +
              ('\033[K' if has_its_own_line else '')
              )
    return result


def configure_gdl():
    '''Configures Gallery-DL for usage.'''
    parser = gallery_dl.option.build_parser()
    args = parser.parse_args([
        '--download-archive=i_gdl/archive.db',
        '--dest=i_gdl',
        '--write-metadata',
        '--write-tags',
        # '--write-log=i_gdl_log.txt',
        # '--write-unsupported=i_gdl_unsupported.txt',
        '--quiet',
        '--retries=15',
        # '--limit-rate=1500k',
    ])
    gallery_dl.output.initialize_logging(args.loglevel)
    # configuration
    if args.load_config:
        gallery_dl.config.load()
    if args.cfgfiles:
        gallery_dl.config.load(args.cfgfiles, strict=True)
    if args.yamlfiles:
        gallery_dl.config.load(args.yamlfiles, strict=True, fmt="yaml")
    if args.postprocessors:
        gallery_dl.config.set((), "postprocessors", args.postprocessors)
    if args.abort:
        gallery_dl.config.set((), "skip", "abort:" + str(args.abort))
    for opts in args.options:
        gallery_dl.config.set(*opts)
    # loglevels
    gallery_dl.output.configure_logging(args.loglevel)
    gallery_dl.output.select = ColoredLineOutput


class DownloadJobWithCallSaverPostProcessor(gallery_dl.job.DownloadJob):
    '''DownloadJob that records post-processor calls so the final file paths
    can be recovered after the job has run.'''

    def __init__(self, url, parent=None):
        super().__init__(url, parent)
        self.cspp = CallSaverPostProcessor(self)

    def initialize(self, kwdict=None):
        super().initialize(kwdict)
        self.postprocessors.append(self.cspp)


class ColoredLineOutput(gallery_dl.output.TerminalOutput):
    def __init__(self, sameline=False, prefix="", suffix="",
                 prefixsz=0, suffixsz=0, write_successes_to=None):
        super().__init__()
        self.sameline = sameline
        self.eol = '\r' if sameline else '\n'
        self.prefix = prefix
        self.suffix = suffix
        self.prefixsz = prefixsz
        self.suffixsz = suffixsz
        self.write_successes_to = write_successes_to
        self._termsize_update()

    def start(self, path):
        self.message(path,
                     clrlib.fg("light_yellow"),
                     )

    def skip(self, path):
        self.message(path,
                     clrlib.attr('dim'),
                     )

    def success(self, path, tries):
        self.message(path,
                     clrlib.attr('bold'),
                     clrlib.fg('light_green'),
                     )
        if self.write_successes_to is not None:
            self.write_successes_to.write_text(path)

    def message(self, txt: str, *attrs: List[str], do_print: bool = True) -> str:
        """Prints a message with given formatters"""
        clrtxt = clrlib.stylize(self.shorten(txt), attrs)
        fmtd = f"{self.prefix}{clrtxt}{self.suffix}"
        if do_print:
            print(fmtd, flush=True, end=self.eol)
        return fmtd

    def shorten(self, txt):
        self._termsize_update()
        self.width = self.termsize - self.prefixsz - self.suffixsz - 1
        return super().shorten(txt)

    def _termsize_update(self):
        self.termsize = shutil.get_terminal_size().columns


class CallSaverPostProcessor(gallery_dl.postprocessor.common.PostProcessor):
    def __init__(self, job):
        super().__init__(job)
        self.calls = dict(
            prepare=list(),
            run=list(),
            run_metadata=list(),
            run_after=list(),
            run_final=list(),
        )

    def prepare(self, pathfmt):
        """Update file paths, etc."""
        self.calls['prepare'].append((pathfmt.path,))

    def run(self, pathfmt):
        """Execute the postprocessor for a file"""
        self.calls['run'].append((pathfmt.path,))

    def run_metadata(self, pathfmt):
        """Execute the postprocessor for a file"""
        self.calls['run_metadata'].append((pathfmt.path,))

    def run_after(self, pathfmt):
        """Execute postprocessor after moving a file to its target location"""
        self.calls['run_after'].append((pathfmt.path,))

    def run_final(self, pathfmt, status):
        """Postprocessor finalization after all files have been downloaded"""
        self.calls['run_final'].append((pathfmt.path, status))
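

# Note: this module relies on relative imports, so the entry point below only
# works when it is executed as a module (e.g. "python -m reddit_imgs.fetch2"),
# not as a standalone script.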
if __name__ == "__main__":
    main()