307 lines
11 KiB
Python
307 lines
11 KiB
Python
|
#!/usr/bin/env python3
|
||
|
# -*- encoding: utf-8 -*-
|
||
|
|
||
|
import json
|
||
|
from concurrent.futures import Future
|
||
|
from concurrent.futures import ProcessPoolExecutor as PoolExecutor
|
||
|
from pathlib import Path
|
||
|
import shutil
|
||
|
from typing import Any, List, Optional, Tuple
|
||
|
|
||
|
import colored as clrlib
|
||
|
import gallery_dl
|
||
|
import gallery_dl.config
|
||
|
import gallery_dl.extractor
|
||
|
import gallery_dl.job
|
||
|
import gallery_dl.postprocessor.common
|
||
|
|
||
|
import reddit_imgs.sync
|
||
|
|
||
|
from .system.downloader.cache import get_normalized_link, get_path_for_caching
|
||
|
from .system.urlmatcher import search_urls
|
||
|
|
||
|
|
||
|
def main():
    """Download every media link found in r.json via gallery-dl.

    Pipeline: deduplicate posts by datakey -> expand trivial link variants ->
    group links by post -> resolve a gallery-dl extractor per link (cached in
    r_gdl_le.json) -> download per-extractor link lists in a process pool.
    Writes r_gdl_p.json, r_gdl_lp.json, r_gdl_le.json, r_gdl_lbe.json,
    i_undownloadable.json and i_gdl_ffl.json as side effects.
    """
    subreddit_data_path = Path('r.json')
    if not subreddit_data_path.exists():
        # r.json is produced by the sync step; run it on demand.
        print("Executing prerrequisite...")
        reddit_imgs.sync.main()
    print('Loading posts...')
    subreddit_data = json.loads(subreddit_data_path.read_text())
    links = dict()  # link -> list of datakeys of posts containing it
    # Flatten {subreddit: {'links': [post, ...]}} into one list, tagging
    # each post with the subreddit it came from.
    postsl = [
        {**post, 'subreddit': subreddit}
        for subreddit, srdt in subreddit_data.items()
        for post in srdt['links']
    ]
    postsd = dict()
    # Merge duplicate posts (cross-posts share a 'datakey'): collect all
    # subreddits and all normalized links under a single entry.
    for post in postsl:
        dk = post['datakey']
        if dk not in postsd:
            postsd[dk] = post.copy()
            postsd[dk]['subreddits'] = list()
            postsd[dk]['links'] = list()
            # Per-occurrence fields are replaced by the plural lists above.
            del postsd[dk]['subreddit']
            del postsd[dk]['link']
            del postsd[dk]['domain']
        if (sr := post['subreddit']) not in (srs := postsd[dk]['subreddits']):
            srs.append(sr)
        if (lnk := get_normalized_link(post['link'])) not in (lnks := postsd[dk]['links']):
            lnks.append(lnk)
    posts = postsd
    del postsl
    del postsd
    print(f'{len(posts)} posts identified.')
    print(f'Identifying alternative trivial links...')
    # Fixed-point expansion: keep adding stripped/extracted variants until a
    # full pass adds nothing new.
    for post in posts.values():
        dk = post['datakey']
        post_links = post['links']
        has_added_any_link = True
        while has_added_any_link:
            has_added_any_link = False
            for link in post_links:
                linkcopy = link
                # Strip trailing separators one at a time, recording each
                # intermediate variant.
                # NOTE(review): source formatting was lost; this `if` may
                # originally have sat AFTER the inner while (recording only the
                # fully-stripped form). Both converge to the same final link,
                # but intermediates would differ — confirm against history.
                while linkcopy.endswith('/') or linkcopy.endswith('#') or linkcopy.endswith('?'):
                    linkcopy = linkcopy[:-1]
                    if linkcopy not in post_links:
                        post_links.append(linkcopy)
                        has_added_any_link = True
                if '<!--' in link:
                    # Looks like embedded HTML; mine it for plain URLs.
                    for linkcopy in search_urls(link):
                        if linkcopy not in post_links:
                            post_links.append(linkcopy)
                            has_added_any_link = True
    Path('r_gdl_p.json').write_text(json.dumps(posts, indent=1))
    print(f'Grouping links with the posts they show up in...')
    for dk, post in posts.items():
        for link in post['links']:
            if link not in links:
                links[link] = list()
            links[link].append(dk)
    Path('r_gdl_lp.json').write_text(json.dumps(links, indent=1))
    print(f'{len(links)} links found')
    print(f'Checking if there is an extractor for each link...')
    r_gdl_le_path = Path('r_gdl_le.json')
    link_extractors = dict()
    if r_gdl_le_path.exists():
        # Resume from the cached link->extractor map; only unknown or
        # previously-unresolved ('') links are re-checked.
        link_extractors = json.loads(r_gdl_le_path.read_text())
    for seq, link in enumerate(links.keys()):
        if link not in link_extractors or link_extractors[link] == '':
            ext = None
            try:
                ext = gallery_dl.extractor.find(link)
            except gallery_dl.exception.NotFoundError:
                pass
            # '' marks "no extractor known" so the link lands in the
            # undownloadable bucket below.
            link_extractors[link] = type(
                ext).category if ext is not None else ''
    r_gdl_le_path.write_text(json.dumps(link_extractors, indent=1))
    links_by_extractor = {
        extractor: list()
        for extractor in list(set(link_extractors.values()))
    }
    for link, extractor in link_extractors.items():
        links_by_extractor[extractor].append(link)
    undownloadable_posts = links_by_extractor.get('', [])
    Path('i_undownloadable.json').write_text(
        json.dumps(undownloadable_posts, indent=1))
    if '' in links_by_extractor:
        del links_by_extractor['']

    print(f'{len(links)-len(undownloadable_posts)} downloadable links found')
    print(f'{len(undownloadable_posts)} undownloadable links found')
    print(f'{len(links_by_extractor)} extractors found')
    Path('r_gdl_lbe.json').write_text(json.dumps(links_by_extractor, indent=1))

    configure_gdl()

    # Default printer for any output gallery_dl selects outside the workers.
    gallery_dl.output.select = lambda: ColoredLineOutput(False)

    files_from_links = dict()  # link -> list of downloaded file paths

    totalfiles = 0

    # One worker process per extractor category; sorted for stable ordering.
    thread_ids = sorted(list(links_by_extractor.keys()))
    do_fancy_multithreading_panel = False
    with PoolExecutor(len(thread_ids)) as pe:
        jobs: List[Future] = list()
        if do_fancy_multithreading_panel:
            # Clear screen and home the cursor for the status panel.
            print(f'\033[2J', end='', flush=True)
            print(f'\033[0;0H', end='', flush=True)
        print('Downloading...', flush=True)
        if do_fancy_multithreading_panel:
            print(f'\033[0;0H', end='', flush=True)
        for line, thread_id in enumerate(thread_ids):
            job = pe.submit(
                download_link_list,
                links_by_extractor[thread_id],
                thread_id,
                # +3 skips the header lines when the panel is enabled.
                line+3 if do_fancy_multithreading_panel else None,
            )
            jobs.append(job)
        for job in jobs:
            generator = job.result()
            for k, v in generator:
                files_from_links[k] = v
                totalfiles += len(v)
    Path('i_gdl_ffl.json').write_text(json.dumps(files_from_links, indent=1))
    print(f'Downloaded {totalfiles} files')
|
||
|
|
||
|
|
||
|
def download_link_list(links: List[str],
                       thread_id: str,
                       line: Optional[int] = None,
                       ) -> List[Tuple[str, List[str]]]:
    """Download every link in *links* with gallery-dl, using a metadata cache.

    Runs inside a worker process (see main). Each link's downloaded file list
    is cached in <cachedir>/_gdl_meta.json so a re-run skips completed links.

    :param links: links to hand to gallery-dl, all of one extractor category.
    :param thread_id: extractor category name, used as the progress label.
    :param line: terminal row for the fancy status panel, or None for plain
        sequential output.
    :return: (link, downloaded-file-paths) pairs, one per input link.
    """
    has_its_own_line = line is not None
    remaining_links = len(links)
    # Worker processes need their own gallery_dl global configuration.
    configure_gdl()

    def get_printer():
        # Fresh printer per job; the prefix snapshots remaining_links at the
        # moment of the call, so each new job shows the updated count.
        return ColoredLineOutput(
            has_its_own_line,
            prefix=(f'\033[{line};0H' if has_its_own_line else '') +
            clrlib.stylize('% 9d' % remaining_links, [clrlib.fg('light_cyan')]) +
            clrlib.stylize('@', [clrlib.fg('light_red')]) +
            clrlib.stylize(thread_id, [clrlib.fg('yellow')]) +
            clrlib.stylize('=', [clrlib.fg('dark_gray')]),
            suffix=('\033[K\033[0;0H' if has_its_own_line else ''),
            prefixsz=len(('% 9d'%0)+' '+thread_id),
            suffixsz=0,
        )

    gallery_dl.output.select = get_printer
    result = list()
    try:
        for link in links:
            cachedir = get_path_for_caching(link, Path('i_gdl_c'))
            cachedir.mkdir(parents=True, exist_ok=True)
            metafile = cachedir.joinpath('_gdl_meta.json')
            meta = dict()
            if metafile.exists():
                meta = json.loads(metafile.read_text())
            if link not in meta:
                # Not cached yet: run the actual download job. The attached
                # CallSaverPostProcessor records the final path of each file.
                job = DownloadJobWithCallSaverPostProcessor(link)
                job.out = get_printer()
                job.out.start(clrlib.stylize(link, [clrlib.fg('light_magenta')]))
                job.run()
                # run_final calls are (path, status) tuples; keep the paths.
                files = list(map(lambda a: a[0], job.cspp.calls['run_final']))
                meta[link] = files
                # Persist immediately so a crash doesn't lose progress.
                metafile.write_text(json.dumps(meta, indent=1))
            result.append((link, meta[link]))
            remaining_links -= 1
    finally:
        # Always print the per-thread completion marker, even on error.
        print((f'\033[{line};0H' if has_its_own_line else '') +
              clrlib.stylize(thread_id, [clrlib.fg('yellow')]) +
              clrlib.stylize('#', [clrlib.fg('light_red')]) +
              clrlib.stylize('Done', [clrlib.fg('light_green')]) +
              ('\033[K' if has_its_own_line else '')
              )
    return result
|
||
|
|
||
|
|
||
|
def configure_gdl():
    """Configure gallery_dl's global state as if invoked from its own CLI.

    Reuses gallery_dl's argument parser with a fixed argument list so that
    config loading, logging and option handling behave exactly like the
    command-line tool. Must be called in every process that runs download
    jobs (gallery_dl configuration is process-global).
    """
    parser = gallery_dl.option.build_parser()
    args = parser.parse_args([
        '--download-archive=i_gdl.sqlite3',
        '--dest=i_gdl',
        '--write-metadata',
        '--write-tags',
        '--write-log=i_gdl_log.txt',
        '--write-unsupported=i_gdl_unsupported.txt',
        '--quiet',
    ])
    gallery_dl.output.initialize_logging(args.loglevel)

    # configuration
    if args.load_config:
        gallery_dl.config.load()
    if args.cfgfiles:
        gallery_dl.config.load(args.cfgfiles, strict=True)
    if args.yamlfiles:
        gallery_dl.config.load(args.yamlfiles, strict=True, fmt="yaml")
    if args.postprocessors:
        gallery_dl.config.set((), "postprocessors", args.postprocessors)
    if args.abort:
        gallery_dl.config.set((), "skip", "abort:" + str(args.abort))
    for opts in args.options:
        gallery_dl.config.set(*opts)

    # loglevels
    gallery_dl.output.configure_logging(args.loglevel)

    # Default to the colored printer; callers may override with a factory
    # (see download_link_list / main).
    gallery_dl.output.select = ColoredLineOutput
|
||
|
|
||
|
|
||
|
class DownloadJobWithCallSaverPostProcessor(gallery_dl.job.DownloadJob):
    """DownloadJob that records postprocessor callbacks via ``self.cspp``.

    The attached CallSaverPostProcessor logs every hook invocation, letting
    callers read back the final paths of downloaded files (see
    download_link_list, which inspects ``job.cspp.calls['run_final']``).
    """

    def __init__(self, url, parent=None):
        super().__init__(url, parent)
        # Created here but only attached in initialize(), after the base
        # class has built its own postprocessor list.
        self.cspp = CallSaverPostProcessor(self)

    def initialize(self, kwdict=None):
        super().initialize(kwdict)
        # Append (not replace) so configured postprocessors still run.
        self.postprocessors.append(self.cspp)
|
||
|
|
||
|
|
||
|
class ColoredLineOutput(gallery_dl.output.TerminalOutput):
    """Terminal output that wraps each reported path in ANSI styling.

    Every line is printed as ``prefix + styled(path) + suffix``; when
    *sameline* is true, lines end with a carriage return so they overwrite
    each other in place. *prefixsz*/*suffixsz* give the visible widths of the
    decorations so ``shorten`` can fit the path to the terminal.
    """

    def __init__(self, sameline=False, prefix="", suffix="", prefixsz=0, suffixsz=0):
        super().__init__()
        self.sameline = sameline
        # Overwrite-in-place mode uses '\r'; normal mode a newline.
        if sameline:
            self.eol = '\r'
        else:
            self.eol = '\n'
        self.prefix, self.suffix = prefix, suffix
        self.prefixsz, self.suffixsz = prefixsz, suffixsz
        # Snapshot the terminal width once at construction time.
        self.termsize = shutil.get_terminal_size().columns

    def start(self, path):
        # New download starting: light-yellow path.
        body = clrlib.stylize(self.shorten(path), [clrlib.fg("light_yellow")])
        print(self.prefix + body + self.suffix, flush=True, end=self.eol)

    def skip(self, path):
        # Already-downloaded file: dimmed path.
        line = self.prefix + "\033[2m" + self.shorten(path) + "\033[0m" + self.suffix
        print(line, flush=True, end=self.eol)

    def success(self, path, tries):
        # Completed download: bold green path.
        line = self.prefix + "\033[1;32m" + self.shorten(path) + "\033[0m" + self.suffix
        print(line, flush=True, end=self.eol)

    def shorten(self, txt):
        # Recompute the usable width, reserving room for the decorations
        # plus one spare column, then defer to the base implementation.
        self.width = self.termsize - (self.prefixsz + self.suffixsz + 1)
        return super().shorten(txt)
|
||
|
|
||
|
|
||
|
|
||
|
class CallSaverPostProcessor(gallery_dl.postprocessor.common.PostProcessor):
    """Postprocessor that does nothing but log every hook invocation.

    Each hook appends its arguments (as a tuple) to ``self.calls[<hook>]``,
    so callers can inspect after the job which files reached which stage —
    notably ``calls['run_final']`` holds (path, status) pairs.
    """

    def __init__(self, job):
        super().__init__(job)
        # One call log per hook name; every entry is an argument tuple.
        self.calls = {
            hook: []
            for hook in ('prepare', 'run', 'run_metadata', 'run_after', 'run_final')
        }

    def prepare(self, pathfmt):
        """Update file paths, etc."""
        self.calls['prepare'].append((pathfmt.path,))

    def run(self, pathfmt):
        """Execute the postprocessor for a file"""
        self.calls['run'].append((pathfmt.path,))

    def run_metadata(self, pathfmt):
        """Execute the postprocessor for a file"""
        self.calls['run_metadata'].append((pathfmt.path,))

    def run_after(self, pathfmt):
        """Execute postprocessor after moving a file to its target location"""
        self.calls['run_after'].append((pathfmt.path,))

    def run_final(self, pathfmt, status):
        """Postprocessor finalization after all files have been downloaded"""
        self.calls['run_final'].append((pathfmt.path, status))
|
||
|
|
||
|
|
||
|
# Script entry point: run the full download pipeline when executed directly.
if __name__ == "__main__":
    main()
|