|
|
@@ -2,11 +2,13 @@
# -*- encoding: utf-8 -*-

import json
from concurrent.futures import Future
import math
import shutil
import sys
import traceback
from concurrent.futures import ProcessPoolExecutor as PoolExecutor
from pathlib import Path
from typing import List, Optional, Tuple

import colored as clrlib
import gallery_dl

@@ -20,6 +22,12 @@ import reddit_imgs.sync
from .system.downloader.cache import get_normalized_link, get_path_for_caching
from .system.urlmatcher import search_urls

MAX_WORKERS = 12
SPLIT_WORKER_AFTER_N_LINKS = 1000
FORBIDDEN_WORKER_SPLITS = {
    'deviantart',
}
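# Commentary (added; not part of the original changeset): MAX_WORKERS caps how
# many downloader processes run at once. SPLIT_WORKER_AFTER_N_LINKS shards one
# extractor's queue into chunks of 1000 links so several processes can drain
# it in parallel. FORBIDDEN_WORKER_SPLITS names extractors that must stay in a
# single process, presumably because parallel sessions against those sites
# would break logins or trip rate limiting.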
|
|
|
|
|
|
|
|
|
|
|
def main():
    subreddit_data_path = Path('r.json')

@@ -27,6 +35,10 @@ def main():
    print("Executing prerequisite...")
    reddit_imgs.sync.main()
    print('Loading posts...')
    workers_state_path = Path('i_gdl_w')
    workers_state_path.mkdir(exist_ok=True, parents=True)
    for wsp in workers_state_path.iterdir():
        wsp.unlink()
    subreddit_data = json.loads(subreddit_data_path.read_text())
    links = dict()
    postsl = [

@@ -85,7 +97,7 @@ def main():
    link_extractors = dict()
    if r_gdl_le_path.exists():
        link_extractors = json.loads(r_gdl_le_path.read_text())
    for link in links.keys():
        if link not in link_extractors or link_extractors[link] == '':
            ext = None
            try:

@@ -112,6 +124,50 @@ def main():
    print(f'{len(links_by_extractor)} extractors found')
    Path('r_gdl_lbe.json').write_text(json.dumps(links_by_extractor, indent=1))

    ignored_links = set()
    if (p := Path('i_gdl_ignored.txt')).exists():
        ignored_links = set(list(filter(len, p.read_text().splitlines())))
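    # Commentary (added): i_gdl_ignored.txt is read as one URL per line;
    # filter(len, ...) drops the empty lines, and the membership tests below
    # exclude every ignored link before workers are sized or spawned.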
|
|
|
|
|
|
|
    max_expected_jobs_for_extractor = 0
    for extractor, links in links_by_extractor.items():
        links = [link for link in links if link not in ignored_links]
        workers = math.ceil(len(links)/SPLIT_WORKER_AFTER_N_LINKS)
        if workers <= 1 or extractor in FORBIDDEN_WORKER_SPLITS:
            workers = 1
        max_expected_jobs_for_extractor = max(
            max_expected_jobs_for_extractor,
            workers
        )
    worker_by_seq = [list() for _ in range(max_expected_jobs_for_extractor)]
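    # Commentary (added): with SPLIT_WORKER_AFTER_N_LINKS = 1000, an extractor
    # holding 2500 links needs math.ceil(2500/1000) == 3 workers, while one
    # holding 800 links, or any extractor in FORBIDDEN_WORKER_SPLITS, gets
    # exactly 1. worker_by_seq allocates one bucket per "round", so every
    # extractor's first shard lands in round 0, its second in round 1, etc.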
|
|
|
|
|
|
|
    links_to_worker = dict()
    for extractor, links in links_by_extractor.items():
        links = [link for link in links if link not in ignored_links]
        workers = math.ceil(len(links)/SPLIT_WORKER_AFTER_N_LINKS)
        if workers <= 1 or extractor in FORBIDDEN_WORKER_SPLITS:
            links_to_worker[extractor] = links
            worker_by_seq[0].append(extractor)
        else:
            digits = math.ceil(math.log10(max(1, workers+1)))
            fmt = "%%0%dd" % digits
            for worker_no in range(workers):
                lowerlimit = (worker_no+0)*SPLIT_WORKER_AFTER_N_LINKS
                upperlimit = (worker_no+1)*SPLIT_WORKER_AFTER_N_LINKS
                thisrange = links[lowerlimit:upperlimit]
                worker_nm = extractor + ':' + (fmt % (worker_no))
                links_to_worker[worker_nm] = thisrange
                worker_by_seq[worker_no].append(worker_nm)
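    # Commentary (added): above, `digits` is the decimal width of the largest
    # shard index (math.ceil(math.log10(workers+1))) and `fmt` becomes e.g.
    # "%02d", so shard names such as "imgur:00" ... "imgur:24" sort
    # lexicographically in the same order as numerically.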
|
|
|
    for w in worker_by_seq:
        w.sort()
    workers_nicely_grouped = [
        worker
        for workergroup in worker_by_seq
        for worker in workergroup
        if worker != ''
    ]
    print(f'{len(links_to_worker)} workers to be spawned')

    configure_gdl()

    gallery_dl.output.select = lambda: ColoredLineOutput(False)
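    # Commentary (added): gallery-dl looks up its output handler through
    # gallery_dl.output.select(); replacing it with this lambda makes every
    # download in the current process render through ColoredLineOutput
    # instead of the stock terminal output.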
|
|
@@ -120,40 +176,110 @@
    totalfiles = 0

    thread_ids = workers_nicely_grouped.copy()
    for line, thread_id in enumerate(thread_ids):
        workers_state_path.joinpath(thread_id+'=line').write_text(str(line))
        linkcount = len(links_to_worker[thread_id])
        workers_state_path.joinpath(thread_id).write_text(f'waiting:{linkcount}:{linkcount}:0:0')
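    # Commentary (added): each file under i_gdl_w/ holds one colon-separated
    # record:
    #   status:total_links:remaining_links:bytes_downloaded:files_downloaded
    # e.g. "waiting:1000:1000:0:0" means queued with nothing processed yet.
    # Status moves waiting -> enqueued -> running/scrubbing -> finished, so an
    # external monitor can poll these files for progress.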
|
|
|
    do_fancy_multithreading_panel = False
    jobs: List[Future] = list()
    thread_id_count = len(thread_ids)
    with PoolExecutor(min(MAX_WORKERS, thread_id_count)) as pe:
        if do_fancy_multithreading_panel:
            print(f'\033[2J', end='', flush=True)  # clear the whole screen
            print(f'\033[0;0H', end='', flush=True)  # move cursor to top-left
        print('Downloading...', flush=True)
        if do_fancy_multithreading_panel:
            print(f'\033[0;0H', end='', flush=True)  # move cursor to top-left
        largest_tid_size = max(map(len, thread_ids))
        line2tid = dict()

        def done_callback_generator(line):
            nonlocal totalfiles

            def done_callback(job):
                nonlocal totalfiles
                thread_id = line2tid[line]
                links_list = links_to_worker[thread_id]
                workers_state_path.joinpath(thread_id).write_text(f'finished:{len(links_list)}:0:0:0')
                print(clrlib.stylize(
                    f'Received job #{line}: {thread_id}', [
                        clrlib.fg('white'),
                        clrlib.bg('green'),
                        clrlib.attr('bold'),
                    ]
                ))
                totalbytes = 0
                thisfiles = 0
                generator = list()
                try:
                    generator = job.result()
                except Exception:
                    with workers_state_path.joinpath(thread_id+'=exc').open('wt') as f:
                        traceback.print_exc(file=f)
                    traceback.print_exc()
                    sys.exit(255)
                for link, files in generator:
                    files_from_links[link] = files
                    lenfiles = len(files)
                    totalfiles += lenfiles
                    for file in files:
                        st = Path(file).stat()
                        totalbytes += st.st_size
                    thisfiles += lenfiles
                    workers_state_path.joinpath(thread_id).write_text(f'finished:{len(links_list)}:0:{totalbytes}:{thisfiles}')
            return done_callback
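        # Commentary (added): Future.add_done_callback() fires in this parent
        # process once a job finishes; generating the callback through
        # done_callback_generator(line) captures each job's `line` by value
        # instead of sharing a late-bound loop variable.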
|
|
|
        for line, thread_id in enumerate(thread_ids):
            line2tid[line] = thread_id
            links_list = links_to_worker[thread_id]
            workers_state_path.joinpath(thread_id).write_text(f'enqueued:{len(links_list)}:{len(links_list)}:0:0')
            jobstartedmsg = clrlib.stylize(f'Starting job #{line}: {thread_id}', [
                clrlib.fg('black'),
                clrlib.bg('light_yellow'),
                clrlib.attr('bold'),
            ])
            thread_id_nmsz = len(thread_id)
            thread_id_display = thread_id + ' ' * \
                (largest_tid_size - thread_id_nmsz)
            job = pe.submit(
                download_link_list,
                links_list,
                thread_id_display,
                line+3 if do_fancy_multithreading_panel else None,
                jobstartedmsg,
                workers_state_path.joinpath(thread_id),
            )
            jobs.append(job)
            job.add_done_callback(done_callback_generator(line))
    Path('i_gdl_ffl.json').write_text(json.dumps(files_from_links, indent=1))
    if (p := Path('latest_image_download.txt')).exists():
        p.unlink()
    if workers_state_path.exists():
        for p in workers_state_path.glob('*'):
            p.unlink()
        shutil.rmtree(workers_state_path)
    print(f'Downloaded {totalfiles} files')


def download_link_list(links: List[str],
                       thread_id: str,
                       line: Optional[int] = None,
                       job_started_msg: Optional[str] = None,
                       thread_state_path: Optional[Path] = None,
                       ) -> List[Tuple[str, List[str]]]:
    '''Downloads a link list inside a ProcessPoolExecutor.'''
    if job_started_msg is not None:
        print(job_started_msg)
    has_its_own_line = line is not None
    link_count = len(links)
    remaining_links = link_count
    configure_gdl()  # each pool process must configure its own gallery-dl
    if thread_state_path is not None:
        thread_state_path.write_text(f'running:{link_count}:{remaining_links}:0:0')

    def get_printer():
        return ColoredLineOutput(

@@ -164,33 +290,59 @@ def download_link_list(links: List[str],
            clrlib.stylize(thread_id, [clrlib.fg('yellow')]) +
            clrlib.stylize('=', [clrlib.fg('dark_gray')]),
            suffix=('\033[K\033[0;0H' if has_its_own_line else ''),
            prefixsz=len(('% 9d' % 0)+' '+thread_id),
            suffixsz=0,
            write_successes_to=Path('latest_image_download.txt'),
        )

    gallery_dl.output.select = get_printer
    result = list()
    totalbytes = 0
    totalfiles = 0
    try:
        for link in links:
            scrubbing = True
            cachedir = get_path_for_caching(link, Path('i_gdl_c'))
            cachedir.mkdir(parents=True, exist_ok=True)
            metafile = cachedir.joinpath('_gdl_meta.json')
            meta = dict()
            link_already_downloaded = False
            if metafile.exists():
                meta = json.loads(metafile.read_text())
            if link in meta:
                link_already_downloaded = True
                for fl in meta[link]:
                    pth = Path(fl)
                    if not pth.exists():
                        link_already_downloaded = False
                        break
            if not link_already_downloaded:
                scrubbing = False
                if thread_state_path is not None:
                    thread_state_path.write_text(f'running:{link_count}:{remaining_links}:{totalbytes}:{totalfiles}')
                job = DownloadJobWithCallSaverPostProcessor(link)
                job.out = get_printer()
                job.out.message(link, clrlib.fg('light_magenta'))
                job.run()
                files = list(map(lambda a: a[0], job.cspp.calls['run_final']))
                files = list(filter(lambda a: Path(a).exists(), files))
                meta[link] = files
                metafile.write_text(json.dumps(meta, indent=1))
            for fl in meta[link]:
                pth = Path(fl)
                if not pth.exists():
                    raise FileNotFoundError((link, link_already_downloaded, meta[link]))
                st = pth.stat()
                totalbytes += st.st_size
                totalfiles += 1
            result.append((link, meta[link]))
            remaining_links -= 1
            if thread_state_path is not None:
                scrubbing_running = 'scrubbing' if scrubbing else 'running'
                thread_state_path.write_text(f'{scrubbing_running}:{link_count}:{remaining_links}:{totalbytes}:{totalfiles}')
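        # Commentary (added): "scrubbing" status means the current link was
        # served entirely from the _gdl_meta.json cache and is only being
        # re-verified on disk; "running" means gallery-dl actually had to
        # download something for it.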
|
|
|
    finally:
        print((f'\033[{line};0H' if has_its_own_line else '') +
              clrlib.stylize(thread_id.strip(), [clrlib.fg('yellow'), clrlib.attr('bold')]) +
              clrlib.stylize('#', [clrlib.fg('light_red')]) +
              clrlib.stylize('Done', [clrlib.fg('light_green')]) +
              ('\033[K' if has_its_own_line else '')

@@ -199,15 +351,18 @@


def configure_gdl():
    '''Configures Gallery-DL for usage.'''
    parser = gallery_dl.option.build_parser()
    args = parser.parse_args([
        '--download-archive=i_gdl/archive.db',
        '--dest=i_gdl',
        '--write-metadata',
        '--write-tags',
        # '--write-log=i_gdl_log.txt',
        # '--write-unsupported=i_gdl_unsupported.txt',
        '--quiet',
        '--retries=15',
        # '--limit-rate=1500k',
    ])
    gallery_dl.output.initialize_logging(args.loglevel)
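    # Commentary (added): --download-archive records every completed download
    # in a database so re-runs skip finished work; this changeset moved it
    # from i_gdl.sqlite3 to i_gdl/archive.db, alongside the downloads, and
    # disabled the --write-log/--write-unsupported flags in favor of --quiet.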
|
|
|
|
|
|
@@ -242,7 +397,7 @@ class DownloadJobWithCallSaverPostProcessor(gallery_dl.job.DownloadJob):


class ColoredLineOutput(gallery_dl.output.TerminalOutput):
    def __init__(self, sameline=False, prefix="", suffix="", prefixsz=0,
                 suffixsz=0, write_successes_to=None):
        super().__init__()
        self.sameline = sameline
        self.eol = '\r' if sameline else '\n'

@@ -250,24 +405,42 @@ class ColoredLineOutput(gallery_dl.output.TerminalOutput):
        self.suffix = suffix
        self.prefixsz = prefixsz
        self.suffixsz = suffixsz
        self.write_successes_to = write_successes_to
        self._termsize_update()

    def start(self, path):
        self.message(path,
                     clrlib.fg("light_yellow"),
                     )

    def skip(self, path):
        self.message(path,
                     clrlib.attr('dim'),
                     )

    def success(self, path, tries):
        self.message(path,
                     clrlib.attr('bold'),
                     clrlib.fg('light_green'),
                     )
        if self.write_successes_to is not None:
            self.write_successes_to.write_text(path)
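    # Commentary (added): Path.write_text() truncates before writing, so
    # latest_image_download.txt only ever holds the single most recent
    # success; it is a cheap liveness probe, not an append-only log.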
|
|
|
|
|
|
|
    def message(self, txt: str, *attrs: str, do_print: bool = True) -> str:
        """Prints a message with the given formatters."""
        clrtxt = clrlib.stylize(self.shorten(txt), attrs)
        fmtd = f"{self.prefix}{clrtxt}{self.suffix}"
        if do_print:
            print(fmtd, flush=True, end=self.eol)
        return fmtd

    def shorten(self, txt):
        self._termsize_update()
        self.width = self.termsize - self.prefixsz - self.suffixsz - 1
        return super().shorten(txt)

    def _termsize_update(self):
        self.termsize = shutil.get_terminal_size().columns
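    # Commentary (added): re-reading the terminal width on every shorten()
    # call keeps truncation correct if the terminal is resized while
    # downloads are still running.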
|
|
|
|
|
|
|
|
|
|
|
class CallSaverPostProcessor(gallery_dl.postprocessor.common.PostProcessor): |
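    # Commentary (added; inferred from the usage in download_link_list above):
    # this post-processor records the arguments of gallery-dl's post-processor
    # hooks (e.g. run_final) into a `calls` dict, which is how the caller
    # discovers the file paths a DownloadJob produced.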
|
|
|