#!/usr/bin/env python3
# -*- encoding: utf-8 -*-
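'''
Fetches every link collected by reddit_imgs.sync through gallery-dl.

Pipeline overview (the artifact files are the ones actually written below):
 1. posts from r.json are deduplicated by datakey into r_gdl_p.json;
 2. links are grouped per post (r_gdl_lp.json) and matched to gallery-dl
    extractors (r_gdl_le.json, r_gdl_lbe.json, i_undownloadable.json);
 3. links are chunked into per-extractor worker jobs that run in a
    ProcessPoolExecutor, reporting progress through files in i_gdl_w/;
 4. downloaded file lists are cached in i_gdl_c/ and summarized in
    i_gdl_ffl.json.
'''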
import json
import math
import shutil
import sys
import traceback
from concurrent.futures import ProcessPoolExecutor as PoolExecutor
from pathlib import Path
from typing import List, Optional, Tuple

import colored as clrlib
import gallery_dl
import gallery_dl.config
import gallery_dl.exception
import gallery_dl.extractor
import gallery_dl.job
import gallery_dl.option
import gallery_dl.output
import gallery_dl.postprocessor.common

import reddit_imgs.sync

from .system.downloader.cache import get_normalized_link, get_path_for_caching
from .system.urlmatcher import search_urls

MAX_WORKERS = 12
SPLIT_WORKER_AFTER_N_LINKS = 1000
FORBIDDEN_WORKER_SPLITS = {
    'deviantart',
}
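
# MAX_WORKERS caps the process pool size; SPLIT_WORKER_AFTER_N_LINKS chunks an
# extractor's link list into jobs of at most that many links; extractors named
# in FORBIDDEN_WORKER_SPLITS always run as a single job (presumably so a site
# such as DeviantArt is never hit by several processes at once).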


def main():
    subreddit_data_path = Path('r.json')
    if not subreddit_data_path.exists():
        print("Executing prerequisite...")
        reddit_imgs.sync.main()
    print('Loading posts...')
    workers_state_path = Path('i_gdl_w')
    workers_state_path.mkdir(exist_ok=True, parents=True)
    for wsp in workers_state_path.iterdir():
        wsp.unlink()
    subreddit_data = json.loads(subreddit_data_path.read_text())
    links = dict()
    postsl = [
        {**post, 'subreddit': subreddit}
        for subreddit, srdt in subreddit_data.items()
        for post in srdt['links']
    ]
    postsd = dict()
    for post in postsl:
        dk = post['datakey']
        if dk not in postsd:
            postsd[dk] = post.copy()
            postsd[dk]['subreddits'] = list()
            postsd[dk]['links'] = list()
            del postsd[dk]['subreddit']
            del postsd[dk]['link']
            del postsd[dk]['domain']
        if (sr := post['subreddit']) not in (srs := postsd[dk]['subreddits']):
            srs.append(sr)
        if (lnk := get_normalized_link(post['link'])) not in (lnks := postsd[dk]['links']):
            lnks.append(lnk)
    posts = postsd
    del postsl
    del postsd
    print(f'{len(posts)} posts identified.')
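    # Each entry of `posts` is now keyed by datakey and merges every
    # occurrence of the same post across subreddits, e.g. (illustrative
    # values; the other fields of the original post record are kept as-is):
    #   't3_abc123': {'datakey': 't3_abc123',
    #                 'subreddits': ['aww', 'pics'],
    #                 'links': ['https://i.imgur.com/example.jpg']}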
    print('Identifying alternative trivial links...')
    for post in posts.values():
        dk = post['datakey']
        post_links = post['links']
        has_added_any_link = True
        while has_added_any_link:
            has_added_any_link = False
            for link in post_links:
                linkcopy = link
                while linkcopy.endswith('/') or linkcopy.endswith('#') or linkcopy.endswith('?'):
                    linkcopy = linkcopy[:-1]
                if linkcopy not in post_links:
                    post_links.append(linkcopy)
                    has_added_any_link = True
                if '<!--' in link:
                    for linkcopy in search_urls(link):
                        if linkcopy not in post_links:
                            post_links.append(linkcopy)
                            has_added_any_link = True
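    # The loop above expands each link into its trivial variants until the
    # set is stable: 'https://i.imgur.com/example.jpg/' also gains the bare
    # 'https://i.imgur.com/example.jpg', and URLs hiding inside HTML comments
    # are pulled out via search_urls (the example URL is illustrative).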
    Path('r_gdl_p.json').write_text(json.dumps(posts, indent=1))
    print('Grouping links with the posts they show up in...')
    for dk, post in posts.items():
        for link in post['links']:
            if link not in links:
                links[link] = list()
            links[link].append(dk)
    Path('r_gdl_lp.json').write_text(json.dumps(links, indent=1))
    print(f'{len(links)} links found')
    print('Checking if there is an extractor for each link...')
    r_gdl_le_path = Path('r_gdl_le.json')
    link_extractors = dict()
    if r_gdl_le_path.exists():
        link_extractors = json.loads(r_gdl_le_path.read_text())
    for link in links.keys():
        if link not in link_extractors or link_extractors[link] == '':
            ext = None
            try:
                ext = gallery_dl.extractor.find(link)
            except gallery_dl.exception.NotFoundError:
                pass
            link_extractors[link] = type(ext).category if ext is not None else ''
    r_gdl_le_path.write_text(json.dumps(link_extractors, indent=1))
    links_by_extractor = {
        extractor: list()
        for extractor in set(link_extractors.values())
    }
    for link, extractor in link_extractors.items():
        links_by_extractor[extractor].append(link)
    undownloadable_posts = links_by_extractor.get('', [])
    Path('i_undownloadable.json').write_text(
        json.dumps(undownloadable_posts, indent=1))
    if '' in links_by_extractor:
        del links_by_extractor['']
    print(f'{len(links) - len(undownloadable_posts)} downloadable links found')
    print(f'{len(undownloadable_posts)} undownloadable links found')
    print(f'{len(links_by_extractor)} extractors found')
    Path('r_gdl_lbe.json').write_text(json.dumps(links_by_extractor, indent=1))
    ignored_links = set()
    if (p := Path('i_gdl_ignored.txt')).exists():
        ignored_links = set(filter(len, p.read_text().splitlines()))
    max_expected_jobs_for_extractor = 0
    for extractor, extractor_links in links_by_extractor.items():
        extractor_links = [link for link in extractor_links
                           if link not in ignored_links]
        workers = math.ceil(len(extractor_links) / SPLIT_WORKER_AFTER_N_LINKS)
        if workers <= 1 or extractor in FORBIDDEN_WORKER_SPLITS:
            workers = 1
        max_expected_jobs_for_extractor = max(
            max_expected_jobs_for_extractor,
            workers
        )
    worker_by_seq = [list() for _ in range(max_expected_jobs_for_extractor)]
    links_to_worker = dict()
    for extractor, extractor_links in links_by_extractor.items():
        extractor_links = [link for link in extractor_links
                           if link not in ignored_links]
        workers = math.ceil(len(extractor_links) / SPLIT_WORKER_AFTER_N_LINKS)
        if workers <= 1 or extractor in FORBIDDEN_WORKER_SPLITS:
            links_to_worker[extractor] = extractor_links
            worker_by_seq[0].append(extractor)
        else:
            digits = math.ceil(math.log10(max(1, workers + 1)))
            fmt = "%%0%dd" % digits
            for worker_no in range(workers):
                lowerlimit = (worker_no + 0) * SPLIT_WORKER_AFTER_N_LINKS
                upperlimit = (worker_no + 1) * SPLIT_WORKER_AFTER_N_LINKS
                thisrange = extractor_links[lowerlimit:upperlimit]
                worker_nm = extractor + ':' + (fmt % worker_no)
                links_to_worker[worker_nm] = thisrange
                worker_by_seq[worker_no].append(worker_nm)
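    # Example of the split: an extractor 'imgur' holding 2500 links with
    # SPLIT_WORKER_AFTER_N_LINKS = 1000 becomes three jobs named 'imgur:0',
    # 'imgur:1' and 'imgur:2' of 1000, 1000 and 500 links respectively
    # ('imgur' is illustrative; names come from gallery-dl's categories).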
    for w in worker_by_seq:
        w.sort()
    workers_nicely_grouped = [
        worker
        for workergroup in worker_by_seq
        for worker in workergroup
        if worker != ''
    ]
    print(f'{len(links_to_worker)} workers to be spawned')
    configure_gdl()
    gallery_dl.output.select = lambda: ColoredLineOutput(False)
    files_from_links = dict()
    totalfiles = 0
    thread_ids = workers_nicely_grouped.copy()
    for line, thread_id in enumerate(thread_ids):
        workers_state_path.joinpath(thread_id + '=line').write_text(str(line))
        linkcount = len(links_to_worker[thread_id])
        workers_state_path.joinpath(thread_id).write_text(
            f'waiting:{linkcount}:{linkcount}:0:0')
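    # Worker state files under i_gdl_w/ hold a colon-separated record:
    # 'phase:total_links:remaining_links:bytes_downloaded:files_downloaded',
    # where phase is one of: waiting, enqueued, running, scrubbing, finished.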
    do_fancy_multithreading_panel = False
    thread_id_count = len(thread_ids)
    with PoolExecutor(min(MAX_WORKERS, thread_id_count)) as pe:
        if do_fancy_multithreading_panel:
            print('\033[2J', end='', flush=True)
            print('\033[0;0H', end='', flush=True)
        print('Downloading...', flush=True)
        if do_fancy_multithreading_panel:
            print('\033[0;0H', end='', flush=True)
        largest_tid_size = max(map(len, thread_ids))
        line2tid = dict()

        def done_callback_generator(line):
            nonlocal totalfiles

            def done_callback(job):
                nonlocal totalfiles
                thread_id = line2tid[line]
                links_list = links_to_worker[thread_id]
                workers_state_path.joinpath(thread_id).write_text(
                    f'finished:{len(links_list)}:0:0:0')
                print(clrlib.stylize(
                    f'Received job #{line}: {thread_id}', [
                        clrlib.fg('white'),
                        clrlib.bg('green'),
                        clrlib.attr('bold'),
                    ]
                ))
                totalbytes = 0
                thisfiles = 0
                generator = list()
                try:
                    generator = job.result()
                except Exception:
                    with workers_state_path.joinpath(thread_id + '=exc').open('wt') as f:
                        traceback.print_exc(file=f)
                    traceback.print_exc()
                    sys.exit(255)
                for link, files in generator:
                    files_from_links[link] = files
                    lenfiles = len(files)
                    totalfiles += lenfiles
                    for file in files:
                        st = Path(file).stat()
                        totalbytes += st.st_size
                    thisfiles += lenfiles
                workers_state_path.joinpath(thread_id).write_text(
                    f'finished:{len(links_list)}:0:{totalbytes}:{thisfiles}')
            return done_callback

        for line, thread_id in enumerate(thread_ids):
            line2tid[line] = thread_id
            links_list = links_to_worker[thread_id]
            workers_state_path.joinpath(thread_id).write_text(
                f'enqueued:{len(links_list)}:{len(links_list)}:0:0')
            print(clrlib.stylize(f'Starting job #{line}: {thread_id}', [
                clrlib.fg('white'),
                clrlib.bg('light_red'),
                clrlib.attr('bold'),
            ]))
            job_started_msg = clrlib.stylize(f'Starting job #{line}: {thread_id}', [
                clrlib.fg('black'),
                clrlib.bg('light_yellow'),
                clrlib.attr('bold'),
            ])
            thread_id_display = thread_id.ljust(largest_tid_size)
            job = pe.submit(
                download_link_list,
                links_list,
                thread_id_display,
                line + 3 if do_fancy_multithreading_panel else None,
                job_started_msg,
                workers_state_path.joinpath(thread_id),
            )
            job.add_done_callback(done_callback_generator(line))
    Path('i_gdl_ffl.json').write_text(json.dumps(files_from_links, indent=1))
    if (p := Path('latest_image_download.txt')).exists():
        p.unlink()
    if workers_state_path.exists():
        for p in workers_state_path.glob('*'):
            p.unlink()
        shutil.rmtree(workers_state_path)
    print(f'Downloaded {totalfiles} files')
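

# A minimal sketch of how an external monitor could consume the worker state
# files written above; this helper is illustrative only and is not called by
# the pipeline (its name and return shape are made up here).
def read_worker_states(state_dir: Path = Path('i_gdl_w')) -> dict:
    """Parses each 'phase:total:remaining:bytes:files' state file."""
    states = dict()
    if not state_dir.is_dir():
        return states
    for state_file in state_dir.glob('*'):
        if '=' in state_file.name:  # skip the '=line' and '=exc' side files
            continue
        phase, total, remaining, nbytes, nfiles = state_file.read_text().split(':')
        states[state_file.name] = dict(
            phase=phase,
            total=int(total),
            remaining=int(remaining),
            bytes=int(nbytes),
            files=int(nfiles),
        )
    return states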


def download_link_list(links: List[str],
                       thread_id: str,
                       line: Optional[int] = None,
                       job_started_msg: Optional[str] = None,
                       thread_state_path: Optional[Path] = None,
                       ) -> List[Tuple[str, List[str]]]:
    '''Downloads a link list inside a ProcessPoolExecutor.'''
    if job_started_msg is not None:
        print(job_started_msg)
    has_its_own_line = line is not None
    link_count = len(links)
    remaining_links = link_count
    configure_gdl()
    if thread_state_path is not None:
        thread_state_path.write_text(
            f'running:{link_count}:{remaining_links}:0:0')

    def get_printer():
        return ColoredLineOutput(
            has_its_own_line,
            prefix=((f'\033[{line};0H' if has_its_own_line else '') +
                    clrlib.stylize('% 9d' % remaining_links, [clrlib.fg('light_cyan')]) +
                    clrlib.stylize('@', [clrlib.fg('light_red')]) +
                    clrlib.stylize(thread_id, [clrlib.fg('yellow')]) +
                    clrlib.stylize('=', [clrlib.fg('dark_gray')])),
            suffix=('\033[K\033[0;0H' if has_its_own_line else ''),
            prefixsz=len(('% 9d' % 0) + ' ' + thread_id),
            suffixsz=0,
            write_successes_to=Path('latest_image_download.txt'),
        )

    gallery_dl.output.select = get_printer
    result = list()
    totalbytes = 0
    totalfiles = 0
    try:
        for link in links:
            scrubbing = True
            cachedir = get_path_for_caching(link, Path('i_gdl_c'))
            cachedir.mkdir(parents=True, exist_ok=True)
            metafile = cachedir.joinpath('_gdl_meta.json')
            meta = dict()
            link_already_downloaded = False
            if metafile.exists():
                meta = json.loads(metafile.read_text())
                if link in meta:
                    link_already_downloaded = True
                    for fl in meta[link]:
                        pth = Path(fl)
                        if not pth.exists():
                            link_already_downloaded = False
                            break
            if not link_already_downloaded:
                scrubbing = False
                if thread_state_path is not None:
                    thread_state_path.write_text(
                        f'running:{link_count}:{remaining_links}:{totalbytes}:{totalfiles}')
                job = DownloadJobWithCallSaverPostProcessor(link)
                job.out = get_printer()
                job.out.message(link, clrlib.fg('light_magenta'))
                job.run()
                files = list(map(lambda a: a[0], job.cspp.calls['run_final']))
                files = list(filter(lambda a: Path(a).exists(), files))
                meta[link] = files
                metafile.write_text(json.dumps(meta, indent=1))
            for fl in meta[link]:
                pth = Path(fl)
                if not pth.exists():
                    raise FileNotFoundError(
                        (link, link_already_downloaded, meta[link]))
                st = pth.stat()
                totalbytes += st.st_size
                totalfiles += 1
            result.append((link, meta[link]))
            remaining_links -= 1
            if thread_state_path is not None:
                scrubbing_running = 'scrubbing' if scrubbing else 'running'
                thread_state_path.write_text(
                    f'{scrubbing_running}:{link_count}:{remaining_links}:{totalbytes}:{totalfiles}')
    finally:
        print((f'\033[{line};0H' if has_its_own_line else '') +
              clrlib.stylize(thread_id.strip(), [clrlib.fg('yellow'), clrlib.attr('bold')]) +
              clrlib.stylize('#', [clrlib.fg('light_red')]) +
              clrlib.stylize('Done', [clrlib.fg('light_green')]) +
              ('\033[K' if has_its_own_line else ''))
    return result
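

# Cache layout note: each link gets a directory under i_gdl_c/ (derived by
# get_path_for_caching) whose _gdl_meta.json maps the link to the files that
# gallery-dl produced for it, e.g. (illustrative paths only):
#   {"https://i.imgur.com/example.jpg": ["i_gdl/imgur/example.jpg"]}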


def configure_gdl():
    '''Configures gallery-dl for usage.'''
    parser = gallery_dl.option.build_parser()
    args = parser.parse_args([
        '--download-archive=i_gdl/archive.db',
        '--dest=i_gdl',
        '--write-metadata',
        '--write-tags',
        # '--write-log=i_gdl_log.txt',
        # '--write-unsupported=i_gdl_unsupported.txt',
        '--quiet',
        '--retries=15',
        # '--limit-rate=1500k',
    ])
    gallery_dl.output.initialize_logging(args.loglevel)
    # configuration
    if args.load_config:
        gallery_dl.config.load()
    if args.cfgfiles:
        gallery_dl.config.load(args.cfgfiles, strict=True)
    if args.yamlfiles:
        gallery_dl.config.load(args.yamlfiles, strict=True, fmt="yaml")
    if args.postprocessors:
        gallery_dl.config.set((), "postprocessors", args.postprocessors)
    if args.abort:
        gallery_dl.config.set((), "skip", "abort:" + str(args.abort))
    for opts in args.options:
        gallery_dl.config.set(*opts)
    # loglevels
    gallery_dl.output.configure_logging(args.loglevel)
    gallery_dl.output.select = ColoredLineOutput
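
# gallery_dl.output.select is used by gallery-dl's job machinery as a
# zero-argument factory for the output object, so assigning the
# ColoredLineOutput class here (or a lambda, as main() does) makes every
# subsequent job pick up this custom output.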


class DownloadJobWithCallSaverPostProcessor(gallery_dl.job.DownloadJob):
    '''DownloadJob that records postprocessor calls via CallSaverPostProcessor.'''

    def __init__(self, url, parent=None):
        super().__init__(url, parent)
        self.cspp = CallSaverPostProcessor(self)

    def initialize(self, kwdict=None):
        super().initialize(kwdict)
        self.postprocessors.append(self.cspp)


class ColoredLineOutput(gallery_dl.output.TerminalOutput):
    def __init__(self, sameline=False, prefix="", suffix="",
                 prefixsz=0, suffixsz=0, write_successes_to=None):
        super().__init__()
        self.sameline = sameline
        self.eol = '\r' if sameline else '\n'
        self.prefix = prefix
        self.suffix = suffix
        self.prefixsz = prefixsz
        self.suffixsz = suffixsz
        self.write_successes_to = write_successes_to
        self._termsize_update()

    def start(self, path):
        self.message(path,
                     clrlib.fg("light_yellow"),
                     )

    def skip(self, path):
        self.message(path,
                     clrlib.attr('dim'),
                     )

    def success(self, path, tries):
        self.message(path,
                     clrlib.attr('bold'),
                     clrlib.fg('light_green'),
                     )
        if self.write_successes_to is not None:
            self.write_successes_to.write_text(path)

    def message(self, txt: str, *attrs: str, do_print: bool = True) -> str:
        """Prints a message with the given formatters."""
        clrtxt = clrlib.stylize(self.shorten(txt), attrs)
        fmtd = f"{self.prefix}{clrtxt}{self.suffix}"
        if do_print:
            print(fmtd, flush=True, end=self.eol)
        return fmtd

    def shorten(self, txt):
        self._termsize_update()
        self.width = self.termsize - self.prefixsz - self.suffixsz - 1
        return super().shorten(txt)

    def _termsize_update(self):
        self.termsize = shutil.get_terminal_size().columns


class CallSaverPostProcessor(gallery_dl.postprocessor.common.PostProcessor):
    def __init__(self, job):
        super().__init__(job)
        self.calls = dict(
            prepare=list(),
            run=list(),
            run_metadata=list(),
            run_after=list(),
            run_final=list(),
        )

    def prepare(self, pathfmt):
        """Update file paths, etc."""
        self.calls['prepare'].append((pathfmt.path,))

    def run(self, pathfmt):
        """Execute the postprocessor for a file"""
        self.calls['run'].append((pathfmt.path,))

    def run_metadata(self, pathfmt):
        """Execute the postprocessor for a file's metadata"""
        self.calls['run_metadata'].append((pathfmt.path,))

    def run_after(self, pathfmt):
        """Execute postprocessor after moving a file to its target location"""
        self.calls['run_after'].append((pathfmt.path,))

    def run_final(self, pathfmt, status):
        """Postprocessor finalization after all files have been downloaded"""
        self.calls['run_final'].append((pathfmt.path, status))
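
# CallSaverPostProcessor performs no postprocessing of its own; it only
# records the arguments of every hook invocation, and download_link_list
# reads calls['run_final'] afterwards to learn which files a job produced.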


if __name__ == "__main__":
    main()