diff --git a/.gitignore b/.gitignore
index 7b311c3..439afb3 100644
--- a/.gitignore
+++ b/.gitignore
@@ -14,6 +14,9 @@ i_h/**
 i_h.json
 i_hc.json
 i_hs.json
+i_h_n
+i_h_n/**
+i_h_n.json
 i_t
 i_t/**
 **/*.pyc
@@ -23,3 +26,5 @@ ignored.txt
 *.cookie.txt
 .vscode
 .vscode/**
+.mypy_cache
+.mypy_cache/**
diff --git a/moresubredditsabout.py b/moresubredditsabout.py
new file mode 100755
index 0000000..f891c4c
--- /dev/null
+++ b/moresubredditsabout.py
@@ -0,0 +1,8 @@
+#!/usr/bin/env python3
+# -*- encoding: utf-8 -*-
+
+from reddit_imgs.search_for_subreddits import main
+
+if __name__ == '__main__':
+    main()
+
diff --git a/reddit_imgs/display_current_download.py b/reddit_imgs/display_current_download.py
new file mode 100644
index 0000000..8882540
--- /dev/null
+++ b/reddit_imgs/display_current_download.py
@@ -0,0 +1,68 @@
+#!/usr/bin/env python3
+# -*- encoding: utf-8 -*-
+
+import traceback
+from io import BytesIO
+from pathlib import Path
+from typing import Optional
+
+from PIL import Image
+
+from .system.downloader.downloadedData import MainApp
+
+millisamount = 10
+
+AnyInt = Optional[int]
+
+
+def updateImage(tk: MainApp, old_url_modified: AnyInt, old_file_modified: AnyInt, old_file_sz: AnyInt):
+    try:
+        path_file = Path('latest_put_image.file')
+        path_url = Path('latest_put_image.url')
+        new_url_modified = None
+        new_file_modified = None
+        new_file_sz = None
+        try:
+            st = path_url.stat()
+            new_url_modified = st.st_mtime_ns
+        except BaseException:
+            pass
+        try:
+            st = path_file.stat()
+            new_file_modified = st.st_mtime_ns
+            new_file_sz = st.st_size
+        except BaseException:
+            pass
+        tk.after(millisamount, updateImage, tk,
+                 new_url_modified, new_file_modified, new_file_sz)
+        if old_url_modified != new_url_modified or old_file_modified != new_file_modified or old_file_sz != new_file_sz:
+            url = None
+            bts = None
+            try:
+                url = path_url.read_text()
+                bts = path_file.read_bytes()
+            except BaseException:
+                pass
+            if url is not None and bts is not None:
+                try:
+                    tk.update_image(Image.open(BytesIO(bts)), url)
+                except BaseException:
+                    print()
+                    print("Exception on link %r" % url)
+                    traceback.print_exc()
+        if ((old_url_modified is not None or old_file_modified is not None) and (new_url_modified is None and new_file_modified is None)):
+            tk.destroy()
+    except BaseException:
+        print()
+        traceback.print_exc()
+        tk.destroy()
+
+
+def main():
+    tk = MainApp()
+    tk.after(1, updateImage, tk, None, None, None)
+    tk.mainloop()
+
+
+if __name__ == '__main__':
+    main()
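Editor's note: the viewer above never blocks on the downloader; updateImage re-arms itself through tk.after every 10 ms and only reloads the picture when the modification time or size of latest_put_image.file / latest_put_image.url changes. A minimal, self-contained sketch of that mtime-polling pattern (the watched file name and interval are illustrative, not part of the patch):

    # Minimal sketch of the mtime-polling pattern used by updateImage.
    import tkinter
    from pathlib import Path
    from typing import Optional

    WATCHED = Path('latest_put_image.url')  # handshake file, as in the patch
    INTERVAL_MS = 10

    def poll(root: tkinter.Tk, last_mtime: Optional[int]) -> None:
        mtime = WATCHED.stat().st_mtime_ns if WATCHED.exists() else None
        if mtime is not None and mtime != last_mtime:
            print('changed:', WATCHED.read_text())
        # re-arm the timer; tkinter's after() keeps the UI loop responsive
        root.after(INTERVAL_MS, poll, root, mtime)

    if __name__ == '__main__':
        root = tkinter.Tk()
        root.after(1, poll, root, None)
        root.mainloop()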
diff --git a/reddit_imgs/fetch.py b/reddit_imgs/fetch.py
index 2638177..24f3257 100755
--- a/reddit_imgs/fetch.py
+++ b/reddit_imgs/fetch.py
@@ -164,7 +164,8 @@ def main(retryEmptyAlbums = False):
             if not downloader.recognizes(link['link']):
                 continue
             target = os.path.join(wdir,'i',link['datakey'])
-            if not os.path.exists(target) or (retryEmptyAlbums and link['datakey'] not in image_cde):
+            targetJson = os.path.join(target, 'meta.json')
+            if not os.path.exists(targetJson) or (retryEmptyAlbums and link['datakey'] not in image_cde):
                 if not cache.replicate_from_cache(target, link['link']) or (retryEmptyAlbums and link['datakey'] not in image_cde):
                     downloader.download(link['link']).into(target)
                     cache.uncache_download(link['link'])
@@ -203,6 +204,10 @@
     image_catalog_file.write_text(json.dumps(image_catalog, indent=1))
     if retryEmptyAlbums:
         image_cde_file.unlink()
+    if (pth := Path('latest_put_image.url')).exists():
+        pth.unlink()
+    if (pth := Path('latest_put_image.file')).exists():
+        pth.unlink()
 
     print()
     print('='*47)
@@ -257,6 +262,8 @@ def fix_domain_for_display(domain):
         return fix_domain_for_display('tumblr.com')
     elif domain.endswith('blogspot.com') and domain != 'blogspot.com':
         return fix_domain_for_display('blogspot.com')
+    elif domain.endswith('.e-hentai.org'):
+        return fix_domain_for_display('e-hentai.org')
     else:
         return domain
 
diff --git a/reddit_imgs/hashit.py b/reddit_imgs/hashit.py
index b649c21..ff5c56e 100644
--- a/reddit_imgs/hashit.py
+++ b/reddit_imgs/hashit.py
@@ -1,12 +1,19 @@
 #!/usr/bin/env python3
 # -*- encoding: utf-8 -*-
 
+import json
+import shutil
 from pathlib import Path
+from typing import List
 
 import filetype
-import reddit_imgs.fetch
+
 import reddit_imgs.cachedhash
-import json
+import reddit_imgs.fetch
+
+
+def hash2pathsegments(sha256: str) -> List[str]:
+    return [sha256[0:3], sha256[3:6], sha256[6:]]
 
 
 def main():
@@ -62,7 +69,7 @@ def main():
     for seq, sha256 in enumerate(hashCache.keys()):
         print(f"Hash {seq+1} of {hashCacheSize}...")
         if hashExtensions.get(sha256, None) is None:
-            hash_path = Path('i_h', sha256[0:3], sha256[3:6], sha256[6:])
+            hash_path = Path('i_h', *hash2pathsegments(sha256))
             hash_occur_path = hash_path.joinpath('occurrences')
             if not hash_occur_path.exists():
                 hash_occur_path.mkdir(parents=True, exist_ok=True)
@@ -73,7 +80,21 @@ def main():
                     reffp.symlink_to('../../../../../'+path)
             hash_sample_path = hash_path.joinpath('sample')
             if not hash_sample_path.exists():
-                hash_sample_path.symlink_to('../../../../'+hashStore[sha256][0])
+                if not hash_sample_path.is_symlink():
+                    hash_sample_path.symlink_to('../../../../'+hashStore[sha256][0])
+                else:  # symlink is there, but pointing to a broken location
+                    shutil.rmtree(hash_path)
+                    for hashed_instance in hashStore[sha256]:
+                        shutil.rmtree(Path(hashed_instance).parent)
+                    for k in ['i_c.json', 'i_c_h.json', 'i_h.json', 'i_hc.json', 'i_hs.json', 'i_he.json']:
+                        if (fl := Path(k)).exists():
+                            fl.unlink()
+                    raise Exception(
+                        'Cannot process broken path.\n' +
+                        'Re-run the pipeline from "fetch" onwards.\n' +
+                        f'{hash_path}\n' +
+                        "\n".join([f" - {Path(k).parent}" for k in hashStore[sha256]])
+                    )
             hash_ext_path = hash_path.joinpath('ext')
             if not hash_ext_path.exists():
                 with hash_sample_path.open('rb') as f:
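Editor's note: hash2pathsegments shards a SHA-256 hex digest into a three-level directory fan-out (3 + 3 + 58 characters), which keeps any single directory under i_h/ from accumulating millions of entries. A quick, self-contained illustration of the resulting layout (the hashed payload is arbitrary):

    # Illustration of the 3/3/rest path sharding used by reddit_imgs.hashit.
    import hashlib
    from pathlib import Path

    def hash2pathsegments(sha256: str):
        # same split as the helper added in hashit.py
        return [sha256[0:3], sha256[3:6], sha256[6:]]

    digest = hashlib.sha256(b'example payload').hexdigest()
    print(Path('i_h', *hash2pathsegments(digest)))
    # -> i_h/<3 hex chars>/<3 hex chars>/<remaining 58 hex chars>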
diff --git a/reddit_imgs/normalizetobmp.py b/reddit_imgs/normalizetobmp.py
new file mode 100644
index 0000000..97c396f
--- /dev/null
+++ b/reddit_imgs/normalizetobmp.py
@@ -0,0 +1,160 @@
+#!/usr/bin/env python3
+# -*- encoding: utf-8 -*-
+
+import json
+import subprocess
+from concurrent.futures import ProcessPoolExecutor as PoolExecutor
+from pathlib import Path
+from typing import Dict, Union
+import hashlib
+import sys
+from io import BytesIO
+
+import PIL.Image
+
+import reddit_imgs.hashit
+
+from .hashit import hash2pathsegments
+from .system.table_fmt import table_fmt
+
+
+POOL_SIZE = 32
+
+
+def hexhashof(bts: bytes, using) -> str:
+    m = using()
+    m.update(bts)
+    return m.hexdigest()
+
+
+def main():
+    image_hashes_extensions_path = Path('i_he.json')
+    if not image_hashes_extensions_path.exists():
+        print("Executing prerequisite...")
+        reddit_imgs.hashit.main()
+    image_hashes_extensions = json.loads(
+        image_hashes_extensions_path.read_text()
+    )
+    used_extensions = {i: 0 for i in set(image_hashes_extensions.values())}
+    for e in image_hashes_extensions.values():
+        used_extensions[e] += 1
+    image_hashed_path = Path('i_h')
+    image_hashed_normalized_path = Path('i_h_n')
+    image_hashed_normalized_path.mkdir(parents=True, exist_ok=True)
+    print(table_fmt(
+        ['ext', 'hashes'],
+        sorted(used_extensions.items(), key=lambda t: (t[1], t[0]))+[
+            ('total', sum(used_extensions.values())),
+        ],
+        'Extensions',
+        alignment='^>',
+        divide_last_line=True,
+    ))
+    converted_path = Path('i_h_n.json')
+    converted = dict()
+    if converted_path.exists():
+        converted = json.loads(converted_path.read_text())
+    with PoolExecutor(POOL_SIZE) as pe:
+        totalcnt = len(image_hashes_extensions)
+        on_finished_count = 0
+        def on_finished(job):
+            nonlocal on_finished_count
+            on_finished_count += 1
+            res = job.result()
+            converted[res['hash']] = res
+            if on_finished_count % 1000 == 0:
+                converted_path.write_text(json.dumps(
+                    converted,
+                    indent=1,
+                ))
+        for seq, (hsh, ext) in enumerate(image_hashes_extensions.items()):
+            job = pe.submit(
+                normalize_to_bmp,
+                seq,
+                totalcnt,
+                image_hashed_path.joinpath(
+                    *hash2pathsegments(hsh),
+                    'sample'
+                ),
+                image_hashed_normalized_path.joinpath(
+                    *hash2pathsegments(hsh),
+                ),
+                ext,
+                hsh,
+                converted.get(hsh, dict(hash=hsh)),
+            )
+            job.add_done_callback(on_finished)
+    # return
+    converted_path.write_text(json.dumps(
+        converted,
+        indent=1,
+    ))
+
+
+def normalize_to_bmp(seq: int,
+                     total: int,
+                     pathinfile: Path,
+                     pathoutdir: Path,
+                     knownext: str,
+                     hsh: str,
+                     lastout: Dict[str, Union[str, int]],
+                     ) -> Dict[str, Union[str, int]]:
+    needed_info = (
+        'square', 'square_size', 'square_dimens',
+        'aspect', 'aspect_size', 'aspect_dimens',
+    )
+    if len(set(needed_info).difference(lastout.keys())) > 0:
+        progress_of = '%06d of %06d' % (seq+1, total)
+        print(f'{progress_of} - Ensuring that "{knownext}" entry exists: {hsh}')
+        if not pathinfile.exists():
+            raise FileNotFoundError(pathinfile)
+        if not (pathoutfile := pathoutdir.joinpath('aspect.bmp')).exists():
+            print(f'{progress_of} - Converting to BMP keeping aspect')
+            r = subprocess.run(
+                ['ffmpegthumbnailer',
+                 '-i', str(pathinfile),
+                 '-t', '10%',
+                 '-s', '0',
+                 '-c', 'png',
+                 '-o', '-',
+                 ],
+                stdout=subprocess.PIPE,
+                stderr=subprocess.PIPE,
+            )
+            if r.returncode or len(r.stdout) == 0:
+                sys.stdout.write(r.stderr.decode(errors='ignore'))
+                raise ValueError('Conversion failed for %r' % pathinfile)
+            r.check_returncode()
+            pathoutdir.mkdir(parents=True, exist_ok=True)
+            PIL.Image.open(BytesIO(r.stdout)).save(pathoutfile)
+        if len(set(['aspect', 'aspect_size', 'aspect_dimens']).difference(lastout.keys())) > 0:
+            print(f'{progress_of} - Extracting BMP data from aspect')
+            pathoutfile = pathoutdir.joinpath('aspect.bmp')
+            bts = pathoutfile.read_bytes()
+            aspectsha256 = hexhashof(bts, hashlib.sha256)
+            pathoutdir.joinpath('aspect.bmp.sha256').write_text(aspectsha256)
+            lastout['aspect'] = aspectsha256
+            lastout['aspect_size'] = len(bts)
+            lastout['aspect_dimens'] = PIL.Image.open(pathoutfile).size
+        if not (pathoutfile := pathoutdir.joinpath('square.bmp')).exists():
+            print(f'{progress_of} - Converting to BMP square')
+            im = PIL.Image.open(pathoutdir.joinpath('aspect.bmp'))
+            ax, ay = im.size
+            am = max(ax, ay)
+            sq = PIL.Image.new('RGB', (am, am))
+            sq.paste(im, ((am-ax)//2, (am-ay)//2))
+            sq.save(pathoutfile)
+        if len(set(['square', 'square_size', 'square_dimens']).difference(lastout.keys())) > 0:
+            print(f'{progress_of} - Extracting BMP data from square')
+            pathoutfile = pathoutdir.joinpath('square.bmp')
+            bts = pathoutfile.read_bytes()
+            aspectsha256 = hexhashof(bts, hashlib.sha256)
+            pathoutdir.joinpath('square.bmp.sha256').write_text(aspectsha256)
+            lastout['square'] = aspectsha256
+            lastout['square_size'] = len(bts)
+            lastout['square_dimens'] = PIL.Image.open(pathoutfile).size
+    return lastout
+
+
+if __name__ == "__main__":
+    main()
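Editor's note: the square.bmp step above letterboxes the aspect-preserving image onto a centred square canvas before hashing it again. The same few lines work in isolation with PIL only; the file names below are placeholders, not paths used by the patch:

    # Sketch of the centred square-padding performed by normalize_to_bmp.
    import PIL.Image

    def pad_to_square(src_path: str, dst_path: str, fill=(0, 0, 0)) -> None:
        im = PIL.Image.open(src_path).convert('RGB')
        side = max(im.size)                              # edge of the square canvas
        canvas = PIL.Image.new('RGB', (side, side), fill)
        ax, ay = im.size
        canvas.paste(im, ((side - ax) // 2, (side - ay) // 2))  # centre the original
        canvas.save(dst_path)

    # pad_to_square('aspect.bmp', 'square.bmp')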
diff --git a/reddit_imgs/normalizetorgbpng.py b/reddit_imgs/normalizetorgbpng.py
deleted file mode 100644
index 27d0a14..0000000
--- a/reddit_imgs/normalizetorgbpng.py
+++ /dev/null
@@ -1,14 +0,0 @@
-#!/usr/bin/env python3
-# -*- encoding: utf-8 -*-
-
-import PIL.Image
-import json
-from pathlib import Path
-
-
-def main():
-    pass
-
-
-if __name__ == "__main__":
-    main()
diff --git a/reddit_imgs/runner.py b/reddit_imgs/runner.py
index ecca01d..eed4d66 100755
--- a/reddit_imgs/runner.py
+++ b/reddit_imgs/runner.py
@@ -7,7 +7,7 @@ import reddit_imgs.reorganize
 import reddit_imgs.wallpapers
 import reddit_imgs.thumbnailize
 import reddit_imgs.hashit
-import reddit_imgs.normalizetorgbpng
+import reddit_imgs.normalizetobmp
 import reddit_imgs.cachedhash
 
 import os
@@ -162,7 +162,7 @@ def cmdline():
         ('fetchretryingemptyalbuns', reddit_imgs.fetch.retry),
         ('cachedhash', reddit_imgs.cachedhash.main),
         ('hashit', reddit_imgs.hashit.main),
-        ('normalizetorgbpng', reddit_imgs.normalizetorgbpng.main),
+        ('normalizetobmp', reddit_imgs.normalizetobmp.main),
         ('thumbnailize', reddit_imgs.thumbnailize.main),
         ('reorganize', reddit_imgs.reorganize.main),
         ('wallpapers', reddit_imgs.wallpapers.main),
diff --git a/reddit_imgs/search_for_subreddits.py b/reddit_imgs/search_for_subreddits.py
new file mode 100644
index 0000000..7530dd9
--- /dev/null
+++ b/reddit_imgs/search_for_subreddits.py
@@ -0,0 +1,203 @@
+#!/usr/bin/env python3
+# -*- encoding: utf-8 -*-
+
+import sys
+import urllib.parse
+from pathlib import Path
+from typing import AnyStr, Callable, Dict, List, Optional, Tuple
+
+import colored as clrlib
+import html2text as html2textlib
+from bs4 import BeautifulSoup
+
+from .system import simpleDownloader
+
+
+def html2text(html, withMd=True, limit=65535):
+    h = html2textlib.HTML2Text(baseurl="", bodywidth=limit)
+    if not withMd:
+        h.ignore_emphasis = True
+        h.ignore_images = True
+        h.ignore_links = True
+        h.ignore_tables = True
+    return h.handle(html)
+
+
+def extract_subreddits_from_page(pagebs: BeautifulSoup) -> Tuple[AnyStr, List[Dict[str, str]]]:
+    nextbutton = pagebs.find(class_='nav-buttons')
+    if nextbutton:
+        nextbutton = nextbutton.find(class_='next-button')
+    if nextbutton:
+        nextbutton = nextbutton.find('a')
+    if nextbutton:
+        nextbutton = nextbutton['href']
+    srs = list()
+    srtbs = pagebs.find(id='siteTable')
+    # print(srtbs)
+    for srbs in srtbs.find_all(class_='subreddit'):
+        isNsfw = srbs.find('span', alt='NSFW') is not None
+        titlebs = srbs.find('a', class_='title')
+        descriptionbs = srbs.find(class_='description')
+        # descriptionPrettyHtml = descriptionbs.prettify()
+        link = titlebs['href']
+        if '/r/' not in link:
+            continue
+        name = titlebs.text
+        subreddit = name.split(':', 1)[0].split('/', 1)[-1].lower()
+        title = name.split(':', 1)[1][1:]
+        description = html2text(str(descriptionbs), False, 60).strip()
+        srs.append(dict(
+            isNsfw=isNsfw,
+            link=link,
+            subreddit=subreddit,
+            title=title,
+            description=description,
+        ))
+        # print(isNsfw)
+        # print(subreddit)
+        # print(title)
+        # print(link)
+        # print(description)
+        # print()
+        # print('-'*79)
+        # print()
+        # raise Exception()
+    return (nextbutton, srs)
+
+
+def pad_text_block(s: str, wth: str) -> str:
+    return '\n'.join(list(map(
+        lambda l: f'{wth}{l}',
+        s.splitlines()
+    )))
+
+
+def findmany(text: str, terms: List[str]) -> Tuple[int, str]:
+    if len(terms) <= 0:
+        return -1, None
+    else:
+        incidences = dict()
+        for term in terms:
+            pos = text.find(term)
+            if pos >= 0:
+                incidences[pos] = term
+        if len(incidences) <= 0:
+            return -1, None
+        else:
+            m = min(incidences.keys())
+            return m, incidences[m]
+
+
+def highlight_search_term(terms: List[str], text: str, styler: Callable[[str], str], case_insensitive: bool = True) -> str:
+    termso = terms
+    texto = text
+    textl = text.lower() if case_insensitive else text
+    termsl = list(map(str.lower, terms)) if case_insensitive else terms
+    buffo = ''
+    while True:
+        matchpos, matchtrm = findmany(textl, termsl)
+        if matchpos < 0:
+            buffo += texto
+            break
+        else:
+            buffo += texto[:matchpos]
+            buffo += styler(texto[matchpos:matchpos+len(matchtrm)])
+            texto = texto[matchpos+len(matchtrm):]
+            textl = textl[matchpos+len(matchtrm):]
+    return buffo
+
+
+def do_search(term: str, include_nsfw: bool = True, colored: Optional[bool] = True) -> List[Dict[str, str]]:
+    simpleDownloader.cleanCookies()
+    simpleDownloader.setCookies({'over18': 1})
+    next_page_url = (
+        'https://old.reddit.com/subreddits/search?' +
+        ('include_over_18=on&' if include_nsfw else '') +
+        'q=' + urllib.parse.quote_plus(term)
+    )
+    srs = list()
+    srlst = list()
+    nothing_new = True
+    while next_page_url:
+        pagebts = simpleDownloader.getUrlBytes(next_page_url)
+        pagebs = BeautifulSoup(pagebts, 'html5lib')
+        next_page_url, nsrs = extract_subreddits_from_page(pagebs)
+        srs += nsrs
+        for sr in srs:
+            if (nm := sr['subreddit']) in srlst:
+                continue
+            else:
+                srlst.append(nm)
+                iw = Path('r', sr['subreddit']).exists()
+                nothing_new = nothing_new and iw
+                if colored is not None:
+                    ds = '@' if iw else '#'
+                    srn = sr['subreddit']
+                    isw = sr['isNsfw']
+                    sfw = 'nsfw' if isw else 'sfw'
+                    sfw = f'[{sfw}]'
+                    srt = sr['title']
+                    srd = pad_text_block(sr['description'], ' '*8)
+                    srl = sr['link'].replace('//old.', '//www.')
+                    if colored:
+                        ds = clrlib.stylize(
+                            ds,
+                            [clrlib.fg('light_green' if iw else 'light_red')]
+                        )
+                        srn = clrlib.stylize(
+                            srn,
+                            [clrlib.fg('light_cyan')]
+                        )
+                        sfw = clrlib.stylize(
+                            sfw,
+                            [clrlib.fg('light_green' if not isw else 'light_red')]
+                        )
+                        srl = clrlib.stylize(
+                            srl,
+                            [clrlib.fg('light_blue')]
+                        )
+                        srt = clrlib.stylize(
+                            srt,
+                            [clrlib.fg('cyan')]
+                        )
+                        srd = '\n'.join(list(map(
+                            lambda srdl: clrlib.stylize(
+                                srdl,
+                                [clrlib.fg('dark_gray' if iw else 'light_gray')]
+                            ),
+                            srd.splitlines()
+                        )))
+                        termssplit = term.split()
+                        def highligher(t):
+                            clrlibobj = clrlib.colored('')
+                            bgreset = clrlibobj.ESC+'49'+clrlibobj.END
+                            return clrlib.bg('red') + t + bgreset
+                        srn = highlight_search_term(termssplit, srn, highligher)
+                        srt = highlight_search_term(termssplit, srt, highligher)
+                        srd = highlight_search_term(termssplit, srd, highligher)
+                    print(f"{ds} {srn} {sfw} {srl}")
+                    print(f"    {srt}")
+                    print(srd)
+                    print()
+    if nothing_new:
+        if colored is not None:
+            msg = "> Nothing new... move on!"
+            if colored:
+                msg = clrlib.stylize(msg, [clrlib.fg('yellow')])
+            print(msg)
+    simpleDownloader.cleanCookies()
+    return srs
+
+
+def main():
+    search_term = (
+        ' '.join(list(map(str.strip, map(str, sys.argv[1:]))))
+    ).strip()
+    if len(search_term) <= 0:
+        print(f'Usage:\n  {sys.argv[0]} <search term>')
+    else:
+        do_search(search_term)
+
+
+if __name__ == '__main__':
+    main()
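Editor's note: highlight_search_term walks the text once and always styles the earliest remaining match reported by findmany, so overlapping terms never double-style the same characters. A usage sketch with a plain-text styler (bracket markers instead of ANSI colours); it assumes the repository and its dependencies (colored, bs4, html2text) are importable:

    # Usage sketch for the helpers introduced above.
    from reddit_imgs.search_for_subreddits import findmany, highlight_search_term

    text = 'Furry art and furry artists'
    terms = ['furry', 'art']

    print(findmany(text.lower(), terms))   # (0, 'furry'): earliest match wins
    print(highlight_search_term(terms, text, lambda m: f'[{m}]'))
    # [Furry] [art] and [furry] [art]ists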
diff --git a/reddit_imgs/sync.py b/reddit_imgs/sync.py
index 9aad27b..5524a40 100755
--- a/reddit_imgs/sync.py
+++ b/reddit_imgs/sync.py
@@ -1,12 +1,19 @@
 #!/usr/bin/env python3
 # -*- encoding: utf-8 -*-
 
-import os
-from bs4 import BeautifulSoup as _BS
-from .system import simpleDownloader
-from .system.subredditTools import getEmptySubredditData, getSubredditPageJsonInfo, build_gateway_link, GATEWAY_LINK_ARGS
 import json
+import os
+from concurrent.futures import ProcessPoolExecutor as PoolExecutor
 from pathlib import Path
+from urllib.error import ContentTooShortError, HTTPError, URLError
+
+from bs4 import BeautifulSoup as _BS
+
+from .system import simpleDownloader
+from .system.subredditTools import (GATEWAY_LINK_ARGS, build_gateway_link,
+                                    getEmptySubredditData,
+                                    getSubredditPageJsonInfo)
+
 
 
 def BeautifulSoup(data): return _BS(data, 'html5lib')
@@ -14,54 +21,93 @@ simpleDownloader.setCookies({'over18':1})
 
 wdir = os.path.abspath('.')
 
+
+def process_subreddit(subreddit):
+    simpleDownloader.setCookies({'over18':1})
+    srp = os.path.abspath(os.path.join(wdir, 'r', subreddit))
+    #if subreddit!='yiff': continue
+    nextpage = build_gateway_link(subreddit)
+    srdt = getEmptySubredditData(subreddit)
+    try:
+        with open(os.path.join(srp, 'subreddit.json')) as f:
+            srdt = json.loads(f.read())
+    except: pass
+    #srdt = getEmptySubredditData(subreddit)
+    pageno = 0
+    ygst = srdt['date_first']
+    jsonPageSr = None
+    while nextpage:
+        pageno+=1
+        print(('/r/{0:<20} loading page #%05d'%pageno).format(subreddit))
+        print(' >> %s'%(nextpage.replace(GATEWAY_LINK_ARGS, '[...]'),))
+        redditBytes = None
+        try:
+            redditBytes = simpleDownloader.getUrlBytes(nextpage)
+        except (HTTPError, URLError, ContentTooShortError) as e:
+            print(" >> HTTP Error with code %s: Skipping..." % getattr(e, 'code', 'unknown'))
+            break
+        if redditBytes is None:
+            print(" >> HTTP Error: Skipping...")
+            break
+        # bs = BeautifulSoup(redditBytes)
+        jsonPage = json.loads(redditBytes)
+        first, last, nextpage, links = getSubredditPageJsonInfo(jsonPage, subreddit, pageno)
+        if ygst >= first: #if latest stored post is at same age or older than the latest downloaded post, then we are up-to-date
+            nextpage = None
+        srdt['date_first'] = max(first, srdt['date_first'])
+        srdt['date_last'] = min(last, srdt['date_last'])
+        for link in links[::-1]:
+            if link not in srdt['links']:
+                srdt['links'].append(link)
+        srid = next(iter(set.intersection(
+            set(jsonPage['subreddits'].keys()),
+            set(jsonPage['postFlair'].keys()),
+            set(jsonPage['subredditAboutInfo'].keys())
+        )))
+        jsonPageSr = dict(
+            id=srid,
+            name=subreddit,
+            definition=jsonPage['subreddits'][srid],
+            about=jsonPage['subredditAboutInfo'][srid],
+            flair=jsonPage['postFlair'][srid],
+        )
+    with open(os.path.join(srp,'subreddit.json'),'w') as f:
+        f.write(json.dumps(srdt, sort_keys=True, indent=2))
+    if jsonPageSr is not None:
+        with open(os.path.join(srp,'meta.json'),'w') as f:
+            f.write(json.dumps(jsonPageSr, sort_keys=True, indent=2))
+
+
 def main():
     build_summary()
     subreddits = sorted(filter(lambda sr: os.path.isdir(os.path.join(wdir,'r',sr)), os.listdir(os.path.join(wdir,'r'))))
-    for subreddit in subreddits:
-        srp = os.path.abspath(os.path.join(wdir,'r',subreddit))
-        #if subreddit!='yiff': continue
-        nextpage = build_gateway_link(subreddit)
-        srdt = getEmptySubredditData(subreddit)
-        try:
-            with open(os.path.join(srp,'subreddit.json')) as f:
-                srdt = json.loads(f.read())
-        except: pass
-        #srdt = getEmptySubredditData(subreddit)
-        pageno = 0
-        ygst = srdt['date_first']
-        while nextpage:
-            pageno+=1
-            print(('/r/{0:<20} loading page #%05d'%pageno).format(subreddit))
-            print(' >> %s'%(nextpage.replace(GATEWAY_LINK_ARGS, '[...]'),))
-            redditBytes = None
-            try:
-                redditBytes = simpleDownloader.getUrlBytes(nextpage)
-            except BaseException as e:
-                print(" >> HTTP Error with code: Skipping...")
-                break
-            if redditBytes is None:
-                print(" >> HTTP Error: Skipping...")
-                break
-            # bs = BeautifulSoup(redditBytes)
-            jsonPage = json.loads(redditBytes)
-            first, last, nextpage, links = getSubredditPageJsonInfo(jsonPage, subreddit, pageno)
-            if ygst >= first: #if latest stored post is at same age or older than the latest downloaded post, then we are up-to-date
-                nextpage = None
-            srdt['date_first'] = max(first, srdt['date_first'])
-            srdt['date_last'] = min(last, srdt['date_last'])
-            for link in links[::-1]:
-                if link not in srdt['links']:
-                    srdt['links'].append(link)
-        with open(os.path.join(srp,'subreddit.json'),'w') as f:
-            f.write(json.dumps(srdt ,sort_keys=True, indent=2))
+    with PoolExecutor(16) as pe:
+        q = list()
+        for subreddit in subreddits:
+            job = pe.submit(process_subreddit, subreddit)
+            q.append(job)
+        for job in q:
+            job.result()
     build_summary()
 
 
 def build_summary():
+    rjpath = Path(wdir, 'r.json')
+    oldsrs = dict()
+    if rjpath.exists():
+        oldsrs = json.loads(rjpath.read_text())
     srs = dict()
     for srp in Path(wdir, 'r').glob('*/subreddit.json'):
-        srs[srp.parent.name.lower()] = json.loads(srp.read_text())
-    Path(wdir, 'r.json').write_text(json.dumps(srs, indent=1))
+        sr = srp.parent.name.lower()
+        try:
+            srs[sr] = json.loads(srp.read_text())
+        except json.decoder.JSONDecodeError:
+            if sr not in oldsrs:
+                raise
+            else:
+                print('Restoring old data for corrupted subreddit %r' % sr)
+                srs[sr] = oldsrs[sr]
+                srp.write_text(json.dumps(oldsrs[sr], indent=1))
+    rjpath.write_text(json.dumps(srs, indent=1))
 
 
 if __name__ == '__main__':
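Editor's note: with the per-subreddit work moved into process_subreddit, main fans the subreddits out over a 16-process pool and then drains job.result() for every future; that second loop is what re-raises any exception that happened inside a worker instead of silently dropping it. The same submit-then-drain shape in isolation (the worker body is a stand-in, not project code):

    # Stand-alone sketch of the submit-then-drain pattern used in sync.main.
    from concurrent.futures import ProcessPoolExecutor as PoolExecutor

    def process_one(name: str) -> str:         # stand-in for process_subreddit
        return name.upper()

    def run_all(names):
        with PoolExecutor(16) as pe:
            jobs = [pe.submit(process_one, name) for name in names]
            # result() blocks until each job is done and re-raises worker exceptions here
            return [job.result() for job in jobs]

    if __name__ == '__main__':                 # required for process pools on some platforms
        print(run_all(['yiff', 'furry']))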
diff --git a/reddit_imgs/system/downloader/cache.py b/reddit_imgs/system/downloader/cache.py
index 263db17..d4d2c13 100644
--- a/reddit_imgs/system/downloader/cache.py
+++ b/reddit_imgs/system/downloader/cache.py
@@ -27,14 +27,31 @@ def get_path_for_caching(link: str) -> Path:
     target = Path('i_c').joinpath(link.split('://', 1)[1])
     return limit_filename_lenght(target)
 
+
+def has_file_cache(cached: Path) -> bool:
+    if not cached.exists():
+        return False
+    metafile = cached.joinpath('_meta.json')
+    if not metafile.exists():
+        return False
+    meta = json.loads(cached.joinpath('_meta.json').read_text())
+    if meta['type'] != 'file':
+        return False
+    file = cached.joinpath(meta['disk'])
+    return file.exists()
+
+
 def read_file_from_cache(cached: Path) -> bytes:
     if not cached.exists():
         raise ValueError("Cannot read from non-existing cache: %r" % cached)
+    metafile = cached.joinpath('_meta.json')
+    if not metafile.exists():
+        raise ValueError("Cannot read from broken cache: %r" % metafile)
     meta = json.loads(cached.joinpath('_meta.json').read_text())
     if meta['type'] != 'file':
         raise ValueError("Cannot read a gallery as single file: %r" % cached)
     file = cached.joinpath(meta['disk'])
+    if not file.exists():
+        raise ValueError("Cannot locate missing file: %r" % file)
     return file.read_bytes()
@@ -142,8 +159,14 @@ def fix_cache_relocate_single_file_from_download(download_path, download, target
         downloaded_file.unlink()
         downloaded_file.symlink_to(f'../../{str(target_file)}')
     if not target_file.exists():
-        shutil.rmtree(target)
-        raise Exception("Specified cached file does not exist.")
+        shutil.rmtree(target)  # cache is invalid; remove it
+        for fl in download_path.glob('*'):
+            if fl.is_symlink():  # download has a broken symlink into cache
+                shutil.rmtree(download_path)
+                break
+        raise Exception("Specified cached file does not exist.\n" +
+                        f"Download path: {repr(download_path)}\n" +
+                        f"Target: {repr(target)}")
     if not target_hashfile.exists():
         m = hashlib.sha256()
         m.update(target_file.read_bytes())
diff --git a/reddit_imgs/system/downloader/downloadedData.py b/reddit_imgs/system/downloader/downloadedData.py
index 7f10472..182f02d 100644
--- a/reddit_imgs/system/downloader/downloadedData.py
+++ b/reddit_imgs/system/downloader/downloadedData.py
@@ -7,6 +7,7 @@ import shutil
 import tkinter
 from concurrent.futures import ThreadPoolExecutor
 from io import BytesIO
+from pathlib import Path
 
 import filetype
 from PIL import Image, ImageTk
@@ -60,6 +61,10 @@ class DownloadedData(object):
                 rootWindow.update_image(Image.open(BytesIO(downloaded)), link)
             except:
                 pass
+        if (pth := Path('latest_put_image.file')).exists():
+            pth.unlink()
+        Path('latest_put_image.url').write_text(link)
+        Path('latest_put_image.file').write_bytes(downloaded)
 
     def remove(self, directory):
         directory = os.path.abspath(directory)
@@ -119,7 +124,9 @@ class MainApp(tkinter.Tk):
         self._resize_image2()
 
     def _resize_image2(self):
-        self.display_photo = self.photo.thumbnail((self.known_width, self.known_height))
+        size_tuple = (self.known_width, self.known_height)
+        self.display_photo = self.photo.copy()
+        self.display_photo.thumbnail(size_tuple)
         if self.display_photo is None:
             self.display_photo = self.photo
         self.image = ImageTk.PhotoImage(self.display_photo)
diff --git a/reddit_imgs/system/downloader/modules/_cacheable.py b/reddit_imgs/system/downloader/modules/_cacheable.py
new file mode 100644
index 0000000..281aed9
--- /dev/null
+++ b/reddit_imgs/system/downloader/modules/_cacheable.py
@@ -0,0 +1,17 @@
+#!/usr/bin/env python3
+# -*- encoding: utf-8 -*-
+
+from typing import Any, Dict, List, Optional
+
+from ... import simpleDownloader
+from .. import cache
+
+
+def get_link_bytes(link: str, *args: List[Any], **kwargs: Dict[str, Any]) -> Optional[bytes]:
+    bts = None
+    cached = cache.get_path_for_caching(link)
+    if cache.has_file_cache(cached):
+        bts = cache.read_file_from_cache(cached)
+    else:
+        bts = simpleDownloader.getUrlBytes(link, *args, **kwargs)
+    return bts
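Editor's note: get_link_bytes gives every downloader module the same cache-first behaviour: serve the bytes from the i_c/ cache when a complete single-file entry exists, otherwise fall back to simpleDownloader.getUrlBytes. A hedged usage sketch from a sibling module's point of view (download_one and the error handling are illustrative, not part of the patch):

    # Usage sketch inside a module under reddit_imgs/system/downloader/modules/.
    from ._cacheable import get_link_bytes

    def download_one(link: str) -> bytes:
        bts = get_link_bytes(link)      # cache hit means no network round-trip
        if bts is None:                 # download failed and nothing was cached
            raise RuntimeError('no data for %r' % link)
        return bts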
diff --git a/reddit_imgs/system/downloader/modules/direct_link.py b/reddit_imgs/system/downloader/modules/direct_link.py
index 6ca1f32..2b9037d 100644
--- a/reddit_imgs/system/downloader/modules/direct_link.py
+++ b/reddit_imgs/system/downloader/modules/direct_link.py
@@ -3,6 +3,7 @@
 from ..downloadedData import DownloadedData
 from ... import simpleDownloader
+from ._cacheable import get_link_bytes
 
 
 def works_on(domain):
     return domain=='direct_link'
@@ -58,6 +59,16 @@ class DirectLink(object):
                 'pornreactor.cc' in link
                 or
                 'u18chan.com' in link
+                or
+                's3.amazonaws.com/' in link
+                or
+                '//data.tumblr.com/' in link
+                or
+                '//ve.media.tumblr.com/' in link
+                or
+                '//redgifs.com/' in link
+                or
+                '.redgifs.com/' in link
         ):
             return True
         return False
@@ -65,7 +76,7 @@ class DirectLink(object):
     def download(self, link):
         dd = DownloadedData()
         simpleDownloader.cleanCookies()
-        bts = simpleDownloader.getUrlBytes(link, self.needsPromiscuity(link))
+        bts = get_link_bytes(link, self.needsPromiscuity(link))
         simpleDownloader.cleanCookies()
         if bts is not None:
             dd.put(link,bts)
diff --git a/reddit_imgs/system/downloader/modules/gallerydl.py b/reddit_imgs/system/downloader/modules/gallerydl.py
index 7dc8651..22dbb9e 100644
--- a/reddit_imgs/system/downloader/modules/gallerydl.py
+++ b/reddit_imgs/system/downloader/modules/gallerydl.py
@@ -3,6 +3,7 @@
 import subprocess
 
+from ._cacheable import get_link_bytes
 from ... import simpleDownloader
 from ..downloadedData import DownloadedData
 from .direct_link import DirectLink
@@ -81,13 +82,10 @@ class GalleryDlWrapper(DirectLink):
             print('\r', end='', flush=True)
         # print('\n'+link)
         cached = cache.get_path_for_caching(link)
-        if cached.exists():
-            bts = cache.read_file_from_cache(cached)
-        else:
-            simpleDownloader.cleanCookies()
-            bts = simpleDownloader.getUrlBytes(
-                link, self.needsPromiscuity(link))
-            simpleDownloader.cleanCookies()
+        simpleDownloader.cleanCookies()
+        bts = get_link_bytes(
+            link, self.needsPromiscuity(link))
+        simpleDownloader.cleanCookies()
         if bts is not None:
             dd.put(link, bts)
         if Path("gallery_dl_tmp.txt").exists():
diff --git a/reddit_imgs/system/downloader/modules/i_redd_it.py b/reddit_imgs/system/downloader/modules/i_redd_it.py
index 7ee525e..5892f69 100644
--- a/reddit_imgs/system/downloader/modules/i_redd_it.py
+++ b/reddit_imgs/system/downloader/modules/i_redd_it.py
@@ -5,6 +5,7 @@ import re
 import json
 import filetype
 from bs4 import BeautifulSoup as _BS
+from ._cacheable import get_link_bytes
 from ..downloadedData import DownloadedData
 from ... import simpleDownloader
 
@@ -52,7 +53,7 @@ class IReddIt(object):
             post_media_content = next(filter(notnone, [post.get('media')]), dict()).get('content', None)
             imgloc = next(filter(notnone, [post_source_url, post_media_content]), None)
             if imgloc is not None:
-                data = simpleDownloader.getUrlBytes(imgloc)
+                data = get_link_bytes(imgloc)
                 if data is not None:
                     dd.put(imgloc, data, filetype.guess_extension(data))
             elif matchRedditUploads(link):
diff --git a/reddit_imgs/system/downloader/modules/imgur_com.py b/reddit_imgs/system/downloader/modules/imgur_com.py
index 7582883..8a0d121 100644
--- a/reddit_imgs/system/downloader/modules/imgur_com.py
+++ b/reddit_imgs/system/downloader/modules/imgur_com.py
@@ -7,6 +7,7 @@ import json
 import shutil
 import filetype
 from pathlib import Path
+from ._cacheable import get_link_bytes
 from ..downloadedData import DownloadedData
 from ... import simpleDownloader
 
@@ -29,9 +30,9 @@ class ImgurCom(object):
         simpleDownloader.setCookie('over18', '1')
         bts = b''
         if '/a/' not in link and '.gifv' not in link and '.webm' not in link:
-            bts = simpleDownloader.getUrlBytes(link)
+            bts = get_link_bytes(link)
         elif link.endswith('.gifv'):
-            bts = simpleDownloader.getUrlBytes(link[:-4]+'mp4')
+            bts = get_link_bytes(link[:-4]+'mp4')
         if bts is not None:
             ext = filetype.guess_extension(bts)
             if ext is not None:
@@ -133,7 +134,7 @@ class ImgurCom(object):
                     durl = 'http://i.imgur.com/'+img['hash']+img['ext']
                     if durl in dd.storedLinks():
                         continue
-                    imb = simpleDownloader.getUrlBytes(durl)
+                    imb = get_link_bytes(durl)
                     if imb is None:
                         print()
                         print('Album part failed')
diff --git a/reddit_imgs/system/table_fmt.py b/reddit_imgs/system/table_fmt.py
new file mode 100644
index 0000000..accdbef
--- /dev/null
+++ b/reddit_imgs/system/table_fmt.py
@@ -0,0 +1,86 @@
+#!/usr/bin/env python3
+# -*- encoding: utf-8 -*-
+
+from typing import Any, List, Optional, Union
+
+
+def table_fmt(labels: List[str],
+              table: List[List[Any]],
+              title: Optional[str] = None,
+              alignment: Optional[Union[str, List[str]]] = None,
+              divide_last_line: bool = False
+              ) -> str:
+    lbl: List[str] = list(map(str, labels))
+    tbl: List[List[str]] = list(map(lambda l: list(map(str, l)), table))
+    lbl_sz = len(lbl)
+    for line in tbl:
+        line_sz = len(line)
+        if line_sz != lbl_sz:
+            raise ValueError(
+                "A table line does not match its cell count to the number of labels."
+            )
+    algnmt: List[str] = list()
+    if alignment is None:
+        algnmt = ['<']*lbl_sz
+    else:
+        if len(alignment) != lbl_sz:
+            raise ValueError(
+                "Table alignment does not match its rule count to the number of labels."
+            )
+        algnmt = list(alignment)
+    acceptable_rules = ('<', '^', '>')
+    for rl in algnmt:
+        if rl not in acceptable_rules:
+            raise ValueError(
+                "Table alignment rule %r is not one of %r" %
+                (rl, acceptable_rules)
+            )
+    tbl_sz = len(tbl)
+    tbl_szs: List[List[int]] = (
+        [list(map(len, lbl))] +
+        list(map(lambda l: list(map(len, l)), tbl))
+    )
+    row_widths: List[int] = list(map(max, zip(*tbl_szs)))
+    print(row_widths)
+    labels_tpl = (
+        '| ' +
+        ' | '.join([f'{{{e}:^{l}}}' for e, l in enumerate(row_widths)]) +
+        ' |'
+    )
+    data_tpl = (
+        '| ' +
+        ' | '.join([f'{{{e}:{algnmt[e]}{l}}}' for e, l in enumerate(row_widths)]) +
+        ' |'
+    )
+    lastlinesep = (
+        '|.' +
+        '.|.'.join(['.'*w for w in row_widths]) +
+        '.|'
+    )
+    title_tpl = '| {0:^%d} |\n' % (sum(row_widths)+3*(lbl_sz-1))
+    linesep_total_sz = (4+sum(row_widths)+3*(lbl_sz-1))
+    hugelinesep = '='*linesep_total_sz
+    linesep = '-'*linesep_total_sz
+    s = ''
+    # title section
+    s += hugelinesep + '\n'
+    if title:
+        s += title_tpl.format(title)
+        s += hugelinesep + '\n'
+    # row label section
+    s += labels_tpl.format(*lbl) + '\n'
+    s += linesep + '\n'
+    # data section
+    for seq, line in enumerate(tbl):
+        if seq+1 == tbl_sz and divide_last_line:
+            s += lastlinesep + '\n'
+        s += data_tpl.format(*line) + '\n'
+    # row label section
+    s += linesep + '\n'
+    s += labels_tpl.format(*lbl) + '\n'
+    # title section
+    s += hugelinesep + '\n'
+    if title:
+        s += title_tpl.format(title)
+        s += hugelinesep + '\n'
+    return s
diff --git a/showcurrentdownload.py b/showcurrentdownload.py
new file mode 100755
index 0000000..d5f9901
--- /dev/null
+++ b/showcurrentdownload.py
@@ -0,0 +1,8 @@
+#!/usr/bin/env python3
+# -*- encoding: utf-8 -*-
+
+from reddit_imgs.display_current_download import main
+
+
+if __name__ == '__main__':
+    main()
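Editor's note: for reference, this is how the new table_fmt helper is meant to be called (normalizetobmp above uses the same shape for its extension summary). The data values here are made up; the keyword names match the signature introduced in table_fmt.py:

    # Usage sketch for the new table_fmt helper (values are made up).
    from reddit_imgs.system.table_fmt import table_fmt

    print(table_fmt(
        ['ext', 'hashes'],
        [('png', 120), ('jpg', 345), ('total', 465)],
        title='Extensions',
        alignment='^>',          # centre the first column, right-align the second
        divide_last_line=True,   # draws the dotted separator before the totals row
    ))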