
Added normalization stage to pipeline

keep-around/926d9e08ad2d0a3eb7e68386c5f31019b8e6aa6b
Adler Neves committed 11 months ago
commit 553cda3ceb
19 changed files with 734 additions and 78 deletions
1. .gitignore (+5, -0)
2. moresubredditsabout.py (+8, -0)
3. reddit_imgs/display_current_download.py (+68, -0)
4. reddit_imgs/fetch.py (+8, -1)
5. reddit_imgs/hashit.py (+25, -4)
6. reddit_imgs/normalizetobmp.py (+160, -0)
7. reddit_imgs/normalizetorgbpng.py (+0, -14)
8. reddit_imgs/runner.py (+2, -2)
9. reddit_imgs/search_for_subreddits.py (+203, -0)
10. reddit_imgs/sync.py (+88, -42)
11. reddit_imgs/system/downloader/cache.py (+25, -2)
12. reddit_imgs/system/downloader/downloadedData.py (+8, -1)
13. reddit_imgs/system/downloader/modules/_cacheable.py (+17, -0)
14. reddit_imgs/system/downloader/modules/direct_link.py (+12, -1)
15. reddit_imgs/system/downloader/modules/gallerydl.py (+5, -7)
16. reddit_imgs/system/downloader/modules/i_redd_it.py (+2, -1)
17. reddit_imgs/system/downloader/modules/imgur_com.py (+4, -3)
18. reddit_imgs/system/table_fmt.py (+86, -0)
19. showcurrentdownload.py (+8, -0)

.gitignore (+5, -0)

@ -14,6 +14,9 @@ i_h/**
i_h.json
i_hc.json
i_hs.json
i_h_n
i_h_n/**
i_h_n.json
i_t
i_t/**
**/*.pyc
@ -23,3 +26,5 @@ ignored.txt
*.cookie.txt
.vscode
.vscode/**
.mypy_cache
.mypy_cache/**

moresubredditsabout.py (+8, -0)

@ -0,0 +1,8 @@
#!/usr/bin/env python3
# -*- encoding: utf-8 -*-
from reddit_imgs.search_for_subreddits import main
if __name__ == '__main__':
main()

reddit_imgs/display_current_download.py (+68, -0)

@ -0,0 +1,68 @@
#!/usr/bin/env python3
# -*- encoding: utf-8 -*-
import traceback
from io import BytesIO
from pathlib import Path
from typing import Optional
from PIL import Image
from .system.downloader.downloadedData import MainApp
millisamount = 10
AnyInt = Optional[int]
def updateImage(tk: MainApp, old_url_modified: AnyInt, old_file_modified: AnyInt, old_file_sz: AnyInt):
try:
path_file = Path('latest_put_image.file')
path_url = Path('latest_put_image.url')
new_url_modified = None
new_file_modified = None
new_file_sz = None
try:
st = path_url.stat()
new_url_modified = st.st_mtime_ns
except BaseException:
pass
try:
st = path_file.stat()
new_file_modified = st.st_mtime_ns
new_file_sz = st.st_size
except BaseException:
pass
tk.after(millisamount, updateImage, tk,
new_url_modified, new_file_modified, new_file_sz)
if old_url_modified != new_url_modified or old_file_modified != new_file_modified or old_file_sz != new_file_sz:
url = None
bts = None
try:
url = path_url.read_text()
bts = path_file.read_bytes()
except BaseException:
pass
if url is not None and bts is not None:
try:
tk.update_image(Image.open(BytesIO(bts)), url)
except BaseException:
print()
print("Exception on link %r" % url)
traceback.print_exc()
if ((old_url_modified is not None or old_file_modified is not None) and (new_url_modified is None and new_file_modified is None)):
tk.destroy()
except BaseException:
print()
traceback.print_exc()
tk.destroy()
def main():
tk = MainApp()
tk.after(1, updateImage, tk, None, None, None)
tk.mainloop()
if __name__ == '__main__':
main()
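
The viewer polls rather than watches: updateImage re-schedules itself with tk.after every millisamount milliseconds and only reloads the preview when the modification time or size of latest_put_image.url / latest_put_image.file changes; once both files disappear after having existed, it closes the window. A minimal standalone sketch of that polling pattern (the callback name, interval and file name here are illustrative, not part of the repository):

import tkinter
from pathlib import Path

POLL_MS = 10  # same order of magnitude as millisamount above

def poll(root: tkinter.Tk, last_mtime):
    path = Path('latest_put_image.file')
    mtime = path.stat().st_mtime_ns if path.exists() else None
    root.after(POLL_MS, poll, root, mtime)  # re-schedule first, then react
    if mtime is not None and mtime != last_mtime:
        print('preview file changed; reload and display it here')

if __name__ == '__main__':
    root = tkinter.Tk()
    root.after(1, poll, root, None)
    root.mainloop()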

reddit_imgs/fetch.py (+8, -1)

@ -164,7 +164,8 @@ def main(retryEmptyAlbums = False):
if not downloader.recognizes(link['link']):
continue
target = os.path.join(wdir,'i',link['datakey'])
if not os.path.exists(target) or (retryEmptyAlbums and link['datakey'] not in image_cde):
targetJson = os.path.join(target, 'meta.json')
if not os.path.exists(targetJson) or (retryEmptyAlbums and link['datakey'] not in image_cde):
if not cache.replicate_from_cache(target, link['link']) or (retryEmptyAlbums and link['datakey'] not in image_cde):
downloader.download(link['link']).into(target)
cache.uncache_download(link['link'])
@ -203,6 +204,10 @@ def main(retryEmptyAlbums = False):
image_catalog_file.write_text(json.dumps(image_catalog, indent=1))
if retryEmptyAlbums:
image_cde_file.unlink()
if (pth := Path('latest_put_image.url')).exists():
pth.unlink()
if (pth := Path('latest_put_image.file')).exists():
pth.unlink()
print()
print('='*47)
@ -257,6 +262,8 @@ def fix_domain_for_display(domain):
return fix_domain_for_display('tumblr.com')
elif domain.endswith('blogspot.com') and domain != 'blogspot.com':
return fix_domain_for_display('blogspot.com')
elif domain.endswith('.e-hentai.org'):
return fix_domain_for_display('e-hentai.org')
else:
return domain


reddit_imgs/hashit.py (+25, -4)

@ -1,12 +1,19 @@
#!/usr/bin/env python3
# -*- encoding: utf-8 -*-
import json
import shutil
from pathlib import Path
from typing import List
import filetype
import reddit_imgs.fetch
import reddit_imgs.cachedhash
import json
import reddit_imgs.fetch
def hash2pathsegments(sha256: str) -> List[str]:
return [sha256[0:3], sha256[3:6], sha256[6:]]
def main():
@ -62,7 +69,7 @@ def main():
for seq, sha256 in enumerate(hashCache.keys()):
print(f"Hash {seq+1} of {hashCacheSize}...")
if hashExtensions.get(sha256, None) is None:
hash_path = Path('i_h', sha256[0:3], sha256[3:6], sha256[6:])
hash_path = Path('i_h', *hash2pathsegments(sha256))
hash_occur_path = hash_path.joinpath('occurrences')
if not hash_occur_path.exists():
hash_occur_path.mkdir(parents=True, exist_ok=True)
@ -73,7 +80,21 @@ def main():
reffp.symlink_to('../../../../../'+path)
hash_sample_path = hash_path.joinpath('sample')
if not hash_sample_path.exists():
hash_sample_path.symlink_to('../../../../'+hashStore[sha256][0])
if not hash_sample_path.is_symlink():
hash_sample_path.symlink_to('../../../../'+hashStore[sha256][0])
else: # symlink is there, but pointing to a broken location
shutil.rmtree(hash_path)
for hashed_instance in hashStore[sha256]:
shutil.rmtree(Path(hashed_instance).parent)
for k in ['i_c.json', 'i_c_h.json', 'i_h.json', 'i_hc.json', 'i_hs.json', 'i_he.json']:
if (fl := Path(k)).exists():
fl.unlink()
raise Exception(
'Cannot process broken path.\n' +
'Re-run the pipeline from "fetch" onwards.\n' +
f'{hash_path}\n' +
"\n".join([f" - {Path(k).parent}" for k in hashStore[sha256]])
)
hash_ext_path = hash_path.joinpath('ext')
if not hash_ext_path.exists():
with hash_sample_path.open('rb') as f:

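The new hash2pathsegments helper shards a SHA-256 hex digest into a three-level path (3 + 3 + 58 hex characters), so no single directory under i_h/ accumulates an enormous number of entries; the same helper is reused by normalizetobmp.py below. A tiny illustration with a made-up digest:

from pathlib import Path
from reddit_imgs.hashit import hash2pathsegments

digest = 'a1b2c3' + 'd' * 58  # hypothetical digest, for illustration only
print(Path('i_h', *hash2pathsegments(digest)))
# i_h/a1b/2c3/dddddddd...ddd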

reddit_imgs/normalizetobmp.py (+160, -0)

@ -0,0 +1,160 @@
#!/usr/bin/env python3
# -*- encoding: utf-8 -*-
import json
import subprocess
from concurrent.futures import ProcessPoolExecutor as PoolExecutor
from pathlib import Path
from typing import Dict, Union
import hashlib
import sys
from io import BytesIO
import PIL.Image
import reddit_imgs.hashit
from .hashit import hash2pathsegments
from .system.table_fmt import table_fmt
POOL_SIZE = 32
def hexhashof(bts: bytes, using) -> str:
m = using()
m.update(bts)
return m.hexdigest()
def main():
image_hashes_extensions_path = Path('i_he.json')
if not image_hashes_extensions_path.exists():
print("Executing prerrequisite...")
reddit_imgs.hashit.main()
image_hashes_extensions = json.loads(
image_hashes_extensions_path.read_text()
)
used_extensions = {i: 0 for i in set(image_hashes_extensions.values())}
for e in image_hashes_extensions.values():
used_extensions[e] += 1
image_hashed_path = Path('i_h')
image_hashed_normalized_path = Path('i_h_n')
image_hashed_normalized_path.mkdir(parents=True, exist_ok=True)
print(table_fmt(
['ext', 'hashes'],
sorted(used_extensions.items(), key=lambda t: (t[1], t[0]))+[
('total', sum(used_extensions.values())),
],
'Extensions',
alignment='^>',
divide_last_line=True,
))
converted_path = Path('i_h_n.json')
converted = dict()
if converted_path.exists():
converted = json.loads(converted_path.read_text())
with PoolExecutor(POOL_SIZE) as pe:
totalcnt = len(image_hashes_extensions)
on_finished_count = 0
def on_finished(job):
nonlocal on_finished_count
on_finished_count += 1
res = job.result()
converted[res['hash']] = res
if on_finished_count % 1000 == 0:
converted_path.write_text(json.dumps(
converted,
indent=1,
))
for seq, (hsh, ext) in enumerate(image_hashes_extensions.items()):
job = pe.submit(
normalize_to_bmp,
seq,
totalcnt,
image_hashed_path.joinpath(
*hash2pathsegments(hsh),
'sample'
),
image_hashed_normalized_path.joinpath(
*hash2pathsegments(hsh),
),
ext,
hsh,
converted.get(hsh, dict(hash=hsh)),
)
job.add_done_callback(on_finished)
# return
converted_path.write_text(json.dumps(
converted,
indent=1,
))
def normalize_to_bmp(seq: int,
total: int,
pathinfile: Path,
pathoutdir: Path,
knownext: str,
hsh: str,
lastout: Dict[str, Union[str, int]],
) -> Dict[str, Union[str, int]]:
needed_info = (
'square', 'square_size', 'square_dimens',
'aspect', 'aspect_size', 'aspect_dimens',
)
if len(set(needed_info).difference(lastout.keys())) > 0:
progress_of = '%06d of %06d' % (seq+1, total)
print(f'{progress_of} - Ensuring that "{knownext}" entry exists: {hsh}')
if not pathinfile.exists():
raise FileNotFoundError(pathinfile)
if not (pathoutfile := pathoutdir.joinpath('aspect.bmp')).exists():
print(f'{progress_of} - Converting to BMP keeping aspect')
r = subprocess.run(
['ffmpegthumbnailer',
'-i', str(pathinfile),
'-t', '10%',
'-s', '0',
'-c', 'png',
'-o', '-',
],
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
)
if r.returncode or len(r.stdout)==0:
sys.stdout.write(r.stderr.decode(errors='ignore'))
raise ValueError('Conversion failed for %r' % pathinfile)
r.check_returncode()
pathoutdir.mkdir(parents=True, exist_ok=True)
PIL.Image.open(BytesIO(r.stdout)).save(pathoutfile)
if len(set(['aspect', 'aspect_size', 'aspect_dimens']).difference(lastout.keys())) > 0:
print(f'{progress_of} - Extracting BMP data from aspect')
pathoutfile = pathoutdir.joinpath('aspect.bmp')
bts = pathoutfile.read_bytes()
aspectsha256 = hexhashof(bts, hashlib.sha256)
pathoutdir.joinpath('aspect.bmp.sha256').write_text(aspectsha256)
lastout['aspect'] = aspectsha256
lastout['aspect_size'] = len(bts)
lastout['aspect_dimens'] = PIL.Image.open(pathoutfile).size
if not (pathoutfile := pathoutdir.joinpath('square.bmp')).exists():
print(f'{progress_of} - Converting to BMP square')
im = PIL.Image.open(pathoutdir.joinpath('aspect.bmp'))
ax, ay = im.size
am = max(ax, ay)
sq = PIL.Image.new('RGB', (am, am))
sq.paste(im, ((am-ax)//2, (am-ay)//2))
sq.save(pathoutfile)
if len(set(['square', 'square_size', 'square_dimens']).difference(lastout.keys())) > 0:
print(f'{progress_of} - Extracting BMP data from square')
pathoutfile = pathoutdir.joinpath('square.bmp')
bts = pathoutfile.read_bytes()
aspectsha256 = hexhashof(bts, hashlib.sha256)
pathoutdir.joinpath('square.bmp.sha256').write_text(aspectsha256)
lastout['square'] = aspectsha256
lastout['square_size'] = len(bts)
lastout['square_dimens'] = PIL.Image.open(pathoutfile).size
return lastout
if __name__ == "__main__":
main()
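
Each hash ends up with two derived images: aspect.bmp, rendered by ffmpegthumbnailer at the source's own aspect ratio, and square.bmp, which pastes that image centered onto a black square sized to its larger dimension. A standalone sketch of the square-padding step (the input and output paths are placeholders):

import PIL.Image

def pad_to_square(im: PIL.Image.Image) -> PIL.Image.Image:
    ax, ay = im.size
    am = max(ax, ay)                     # side of the square canvas
    sq = PIL.Image.new('RGB', (am, am))  # black background, as in normalize_to_bmp
    sq.paste(im, ((am - ax) // 2, (am - ay) // 2))  # center the original
    return sq

# pad_to_square(PIL.Image.open('aspect.bmp')).save('square.bmp')  # placeholder paths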

reddit_imgs/normalizetorgbpng.py (+0, -14)

@ -1,14 +0,0 @@
#!/usr/bin/env python3
# -*- encoding: utf-8 -*-
import PIL.Image
import json
from pathlib import Path
def main():
pass
if __name__ == "__main__":
main()

reddit_imgs/runner.py (+2, -2)

@ -7,7 +7,7 @@ import reddit_imgs.reorganize
import reddit_imgs.wallpapers
import reddit_imgs.thumbnailize
import reddit_imgs.hashit
import reddit_imgs.normalizetorgbpng
import reddit_imgs.normalizetobmp
import reddit_imgs.cachedhash
import os
@ -162,7 +162,7 @@ def cmdline():
('fetchretryingemptyalbuns', reddit_imgs.fetch.retry),
('cachedhash', reddit_imgs.cachedhash.main),
('hashit', reddit_imgs.hashit.main),
('normalizetorgbpng', reddit_imgs.normalizetorgbpng.main),
('normalizetobmp', reddit_imgs.normalizetobmp.main),
('thumbnailize', reddit_imgs.thumbnailize.main),
('reorganize', reddit_imgs.reorganize.main),
('wallpapers', reddit_imgs.wallpapers.main),


reddit_imgs/search_for_subreddits.py (+203, -0)

@ -0,0 +1,203 @@
#!/usr/bin/env python3
# -*- encoding: utf-8 -*-
import sys
import urllib.parse
from pathlib import Path
from typing import AnyStr, Callable, Dict, List, Optional, Tuple
import colored as clrlib
import html2text as html2textlib
from bs4 import BeautifulSoup
from .system import simpleDownloader
def html2text(html, withMd=True, limit=65535):
h = html2textlib.HTML2Text(baseurl="", bodywidth=limit)
if not withMd:
h.ignore_emphasis = True
h.ignore_images = True
h.ignore_links = True
h.ignore_tables = True
return h.handle(html)
def extract_subreddits_from_page(pagebs: BeautifulSoup) -> Tuple[AnyStr, List[Dict[str, str]]]:
nextbutton = pagebs.find(class_='nav-buttons')
if nextbutton:
nextbutton = nextbutton.find(class_='next-button')
if nextbutton:
nextbutton = nextbutton.find('a')
if nextbutton:
nextbutton = nextbutton['href']
srs = list()
srtbs = pagebs.find(id='siteTable')
# print(srtbs)
for srbs in srtbs.find_all(class_='subreddit'):
isNsfw = srbs.find('span', alt='NSFW') is not None
titlebs = srbs.find('a', class_='title')
descriptionbs = srbs.find(class_='description')
# descriptionPrettyHtml = descriptionbs.prettify()
link = titlebs['href']
if '/r/' not in link:
continue
name = titlebs.text
subreddit = name.split(':', 1)[0].split('/', 1)[-1].lower()
title = name.split(':', 1)[1][1:]
description = html2text(str(descriptionbs), False, 60).strip()
srs.append(dict(
isNsfw=isNsfw,
link=link,
subreddit=subreddit,
title=title,
description=description,
))
# print(isNsfw)
# print(subreddit)
# print(title)
# print(link)
# print(description)
# print()
# print('-'*79)
# print()
# raise Exception()
return (nextbutton, srs)
def pad_text_block(s: str, wth: str) -> str:
return '\n'.join(list(map(
lambda l: f'{wth}{l}',
s.splitlines()
)))
def findmany(text: str, terms: List[str]) -> Tuple[int, str]:
if len(terms) <= 0:
return -1, None
else:
incidences = dict()
for term in terms:
pos = text.find(term)
if pos >= 0:
incidences[pos] = term
if len(incidences) <= 0:
return -1, None
else:
m = min(incidences.keys())
return m, incidences[m]
def highlight_search_term(terms: List[str], text: str, styler: Callable[[str], str], case_insensitive: bool = True) -> str:
termso = terms
texto = text
textl = text.lower() if case_insensitive else text
termsl = list(map(str.lower, terms)) if case_insensitive else terms
buffo = ''
while True:
matchpos, matchtrm = findmany(textl, termsl)
if matchpos < 0:
buffo += texto
break
else:
buffo += texto[:matchpos]
buffo += styler(texto[matchpos:matchpos+len(matchtrm)])
texto = texto[matchpos+len(matchtrm):]
textl = textl[matchpos+len(matchtrm):]
return buffo
def do_search(term: str, include_nsfw: bool = True, colored: Optional[bool] = True) -> List[Dict[str, str]]:
simpleDownloader.cleanCookies()
simpleDownloader.setCookies({'over18': 1})
next_page_url = (
'https://old.reddit.com/subreddits/search?' +
('include_over_18=on&' if include_nsfw else '') +
'q=' + urllib.parse.quote_plus(term)
)
srs = list()
srlst = list()
nothing_new = True
while next_page_url:
pagebts = simpleDownloader.getUrlBytes(next_page_url)
pagebs = BeautifulSoup(pagebts, 'html5lib')
next_page_url, nsrs = extract_subreddits_from_page(pagebs)
srs += nsrs
for sr in srs:
if (nm := sr['subreddit']) in srlst:
continue
else:
srlst.append(nm)
iw = Path('r', sr['subreddit']).exists()
nothing_new = nothing_new and iw
if colored is not None:
ds = '@' if iw else '#'
srn = sr['subreddit']
isw = sr['isNsfw']
sfw = 'nsfw' if isw else 'sfw'
sfw = f'[{sfw}]'
srt = sr['title']
srd = pad_text_block(sr['description'], ' '*8)
srl = sr['link'].replace('//old.', '//www.')
if colored:
ds = clrlib.stylize(
ds,
[clrlib.fg('light_green' if iw else 'light_red')]
)
srn = clrlib.stylize(
srn,
[clrlib.fg('light_cyan')]
)
sfw = clrlib.stylize(
sfw,
[clrlib.fg('light_green' if not isw else 'light_red')]
)
srl = clrlib.stylize(
srl,
[clrlib.fg('light_blue')]
)
srt = clrlib.stylize(
srt,
[clrlib.fg('cyan')]
)
srd = '\n'.join(list(map(
lambda srdl: clrlib.stylize(
srdl,
[clrlib.fg('dark_gray' if iw else 'light_gray')]
),
srd.splitlines()
)))
termssplit = term.split()
def highligher(t):
clrlibobj = clrlib.colored('')
bgreset = clrlibobj.ESC+'49'+clrlibobj.END
return clrlib.bg('red') + t + bgreset
srn = highlight_search_term(termssplit, srn, highligher)
srt = highlight_search_term(termssplit, srt, highligher)
srd = highlight_search_term(termssplit, srd, highligher)
print(f"{ds} {srn} {sfw} {srl}")
print(f" {srt}")
print(srd)
print()
if nothing_new:
if colored is not None:
msg = "> Nothing new... move on!"
if colored:
msg = clrlib.stylize(msg, [clrlib.fg('yellow')])
print(msg)
simpleDownloader.cleanCookies()
return srs
def main():
search_term = (
' '.join(list(map(str.strip, map(str, sys.argv[1:]))))
).strip()
if len(search_term) <= 0:
print(f'Usage:\n {sys.argv[0]} <search_term>')
else:
do_search(search_term)
if __name__ == '__main__':
main()
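
highlight_search_term scans the text once, repeatedly wrapping the earliest (case-insensitive) occurrence of any search term, as located by findmany, with the styler callback. A quick sanity check with a plain-text styler instead of the ANSI red background used above (the input string is made up):

from reddit_imgs.search_for_subreddits import highlight_search_term

marked = highlight_search_term(
    ['fur', 'art'],
    'Furry art and furaffinity links',
    lambda m: f'[{m}]',  # plain-text styler, for demonstration only
)
print(marked)  # [Fur]ry [art] and [fur]affinity links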

reddit_imgs/sync.py (+88, -42)

@ -1,12 +1,19 @@
#!/usr/bin/env python3
# -*- encoding: utf-8 -*-
import json
import os
from concurrent.futures import ProcessPoolExecutor as PoolExecutor
from pathlib import Path
from urllib.error import ContentTooShortError, HTTPError, URLError
from bs4 import BeautifulSoup as _BS
from .system import simpleDownloader
from .system.subredditTools import getEmptySubredditData, getSubredditPageJsonInfo, build_gateway_link, GATEWAY_LINK_ARGS
import json
from pathlib import Path
from .system.subredditTools import (GATEWAY_LINK_ARGS, build_gateway_link,
getEmptySubredditData,
getSubredditPageJsonInfo)
def BeautifulSoup(data): return _BS(data, 'html5lib')
@ -14,54 +21,93 @@ simpleDownloader.setCookies({'over18':1})
wdir = os.path.abspath('.')
def process_subreddit(subreddit):
simpleDownloader.setCookies({'over18':1})
srp = os.path.abspath(os.path.join(wdir, 'r', subreddit))
#if subreddit!='yiff': continue
nextpage = build_gateway_link(subreddit)
srdt = getEmptySubredditData(subreddit)
try:
with open(os.path.join(srp, 'subreddit.json')) as f:
srdt = json.loads(f.read())
except: pass
#srdt = getEmptySubredditData(subreddit)
pageno = 0
ygst = srdt['date_first']
jsonPageSr = None
while nextpage:
pageno+=1
print(('/r/{0:<20} loading page #%05d'%pageno).format(subreddit))
print(' >> %s'%(nextpage.replace(GATEWAY_LINK_ARGS, '[...]'),))
redditBytes = None
try:
redditBytes = simpleDownloader.getUrlBytes(nextpage)
except (HTTPError, URLError, ContentTooShortError) as e:
print(" >> HTTP Error with code: Skipping...")
break
if redditBytes is None:
print(" >> HTTP Error: Skipping...")
break
# bs = BeautifulSoup(redditBytes)
jsonPage = json.loads(redditBytes)
first, last, nextpage, links = getSubredditPageJsonInfo(jsonPage, subreddit, pageno)
if ygst >= first: #if latest stored post is at same age or older than the latest downloaded post, then we are up-to-date
nextpage = None
srdt['date_first'] = max(first, srdt['date_first'])
srdt['date_last'] = min(last, srdt['date_last'])
for link in links[::-1]:
if link not in srdt['links']:
srdt['links'].append(link)
srid = next(iter(set.intersection(
set(jsonPage['subreddits'].keys()),
set(jsonPage['postFlair'].keys()),
set(jsonPage['subredditAboutInfo'].keys())
)))
jsonPageSr = dict(
id=srid,
name=subreddit,
definition=jsonPage['subreddits'][srid],
about=jsonPage['subredditAboutInfo'][srid],
flair=jsonPage['postFlair'][srid],
)
with open(os.path.join(srp,'subreddit.json'),'w') as f:
f.write(json.dumps(srdt, sort_keys=True, indent=2))
if jsonPageSr is not None:
with open(os.path.join(srp,'meta.json'),'w') as f:
f.write(json.dumps(jsonPageSr, sort_keys=True, indent=2))
def main():
build_summary()
subreddits = sorted(filter(lambda sr: os.path.isdir(os.path.join(wdir,'r',sr)), os.listdir(os.path.join(wdir,'r'))))
for subreddit in subreddits:
srp = os.path.abspath(os.path.join(wdir,'r',subreddit))
#if subreddit!='yiff': continue
nextpage = build_gateway_link(subreddit)
srdt = getEmptySubredditData(subreddit)
try:
with open(os.path.join(srp,'subreddit.json')) as f:
srdt = json.loads(f.read())
except: pass
#srdt = getEmptySubredditData(subreddit)
pageno = 0
ygst = srdt['date_first']
while nextpage:
pageno+=1
print(('/r/{0:<20} loading page #%05d'%pageno).format(subreddit))
print(' >> %s'%(nextpage.replace(GATEWAY_LINK_ARGS, '[...]'),))
redditBytes = None
try:
redditBytes = simpleDownloader.getUrlBytes(nextpage)
except BaseException as e:
print(" >> HTTP Error with code: Skipping...")
break
if redditBytes is None:
print(" >> HTTP Error: Skipping...")
break
# bs = BeautifulSoup(redditBytes)
jsonPage = json.loads(redditBytes)
first, last, nextpage, links = getSubredditPageJsonInfo(jsonPage, subreddit, pageno)
if ygst >= first: #if latest stored post is at same age or older than the latest downloaded post, then we are up-to-date
nextpage = None
srdt['date_first'] = max(first, srdt['date_first'])
srdt['date_last'] = min(last, srdt['date_last'])
for link in links[::-1]:
if link not in srdt['links']:
srdt['links'].append(link)
with open(os.path.join(srp,'subreddit.json'),'w') as f:
f.write(json.dumps(srdt ,sort_keys=True, indent=2))
with PoolExecutor(16) as pe:
q = list()
for subreddit in subreddits:
job = pe.submit(process_subreddit, subreddit)
q.append(job)
for job in q:
job.result()
build_summary()
def build_summary():
rjpath = Path(wdir, 'r.json')
oldsrs = dict()
if rjpath.exists():
oldsrs = json.loads(rjpath.read_text())
srs = dict()
for srp in Path(wdir, 'r').glob('*/subreddit.json'):
srs[srp.parent.name.lower()] = json.loads(srp.read_text())
Path(wdir, 'r.json').write_text(json.dumps(srs, indent=1))
sr = srp.parent.name.lower()
try:
srs[sr] = json.loads(srp.read_text())
except json.decoder.JSONDecodeError:
if sr not in oldsrs:
raise
else:
print('Restoring old data for corrupted subreddit %r' % sr)
srs[sr] = oldsrs[sr]
srp.write_text(json.dumps(oldsrs[sr], indent=1))
rjpath.write_text(json.dumps(srs, indent=1))
if __name__ == '__main__':

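The crawl loop that used to live inline in main() is now process_subreddit, so main() can fan the subreddits out over a 16-process pool; submitting every job first and then draining job.result() in order also re-raises any worker exception in the parent. The same pattern in isolation (the worker body and input list are placeholders):

from concurrent.futures import ProcessPoolExecutor as PoolExecutor

def process_one(subreddit: str) -> str:  # stand-in for process_subreddit
    return subreddit.lower()

if __name__ == '__main__':
    with PoolExecutor(16) as pe:
        jobs = [pe.submit(process_one, sr) for sr in ['aww', 'EarthPorn']]
        for job in jobs:
            job.result()  # blocks until finished; re-raises worker exceptions here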

reddit_imgs/system/downloader/cache.py (+25, -2)

@ -27,14 +27,31 @@ def get_path_for_caching(link: str) -> Path:
target = Path('i_c').joinpath(link.split('://', 1)[1])
return limit_filename_lenght(target)
def has_file_cache(cached: Path) -> bool:
if not cached.exists():
return False
metafile = cached.joinpath('_meta.json')
if not metafile.exists():
return False
meta = json.loads(cached.joinpath('_meta.json').read_text())
if meta['type'] != 'file':
return False
file = cached.joinpath(meta['disk'])
return file.exists()
def read_file_from_cache(cached: Path) -> bytes:
if not cached.exists():
raise ValueError("Cannot read from non-existing cache: %r" % cached)
metafile = cached.joinpath('_meta.json')
if not metafile.exists():
raise ValueError("Cannot read from broken cache: %r" % metafile)
meta = json.loads(cached.joinpath('_meta.json').read_text())
if meta['type'] != 'file':
raise ValueError("Cannot read a gallery as single file: %r" % cached)
file = cached.joinpath(meta['disk'])
if not file.exists():
raise ValueError("Cannot locate missing file: %r" % file)
return file.read_bytes()
@ -142,8 +159,14 @@ def fix_cache_relocate_single_file_from_download(download_path, download, target
downloaded_file.unlink()
downloaded_file.symlink_to(f'../../{str(target_file)}')
if not target_file.exists():
shutil.rmtree(target)
raise Exception("Specified cached file does not exist.")
shutil.rmtree(target) # cache is invalid; remove it
for fl in download_path.glob('*'):
if fl.is_symlink(): # download has a broken symlink into cache
shutil.rmtree(download_path)
break
raise Exception("Specified cached file does not exist.\n" +
f"Download path: {repr(download_path)}\n" +
f"Target: {repr(target)}")
if not target_hashfile.exists():
m = hashlib.sha256()
m.update(target_file.read_bytes())

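Both new helpers gate cache reads on the _meta.json descriptor: the entry must exist, be of type 'file', and its 'disk' key must point at a file that is still present. A usage sketch, assuming a cache entry previously written by the downloader (the URL is hypothetical):

from reddit_imgs.system.downloader import cache

link = 'https://example.com/picture.png'   # hypothetical URL
cached = cache.get_path_for_caching(link)  # i_c/<host>/<path>, length-limited
if cache.has_file_cache(cached):
    data = cache.read_file_from_cache(cached)  # raises ValueError on a broken entry
else:
    data = None  # fall back to an actual download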

reddit_imgs/system/downloader/downloadedData.py (+8, -1)

@ -7,6 +7,7 @@ import shutil
import tkinter
from concurrent.futures import ThreadPoolExecutor
from io import BytesIO
from pathlib import Path
import filetype
from PIL import Image, ImageTk
@ -60,6 +61,10 @@ class DownloadedData(object):
rootWindow.update_image(Image.open(BytesIO(downloaded)), link)
except:
pass
if (pth := Path('latest_put_image.file')).exists():
pth.unlink()
Path('latest_put_image.url').write_text(link)
Path('latest_put_image.file').write_bytes(downloaded)
def remove(self, directory):
directory = os.path.abspath(directory)
@ -119,7 +124,9 @@ class MainApp(tkinter.Tk):
self._resize_image2()
def _resize_image2(self):
self.display_photo = self.photo.thumbnail((self.known_width, self.known_height))
size_tuple = (self.known_width, self.known_height)
self.display_photo = self.photo.copy()
self.display_photo.thumbnail(size_tuple)
if self.display_photo is None:
self.display_photo = self.photo
self.image = ImageTk.PhotoImage(self.display_photo)

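The _resize_image2 change works around PIL's Image.thumbnail(), which resizes in place and returns None; the old assignment therefore always left self.display_photo as None. Copying first keeps the full-size photo intact and gives thumbnail() a copy to shrink. In isolation:

from PIL import Image

original = Image.new('RGB', (1920, 1080))  # stand-in for the downloaded image

# Buggy pattern: thumbnail() works in place and returns None.
assert original.copy().thumbnail((320, 240)) is None

# Fixed pattern, as in _resize_image2: copy, then shrink the copy.
display = original.copy()
display.thumbnail((320, 240))
print(original.size, display.size)  # (1920, 1080) (320, 180)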

reddit_imgs/system/downloader/modules/_cacheable.py (+17, -0)

@ -0,0 +1,17 @@
#!/usr/bin/env python3
# -*- encoding: utf-8 -*-
from typing import Any, Dict, List, Optional
from ... import simpleDownloader
from .. import cache
def get_link_bytes(link: str, *args: List[Any], **kwargs: Dict[str, Any]) -> Optional[bytes]:
bts = None
cached = cache.get_path_for_caching(link)
if cache.has_file_cache(cached):
bts = cache.read_file_from_cache(cached)
else:
bts = simpleDownloader.getUrlBytes(link, *args, **kwargs)
return bts
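
get_link_bytes is the single cache-aware fetch the downloader modules now share: it returns the bytes from the i_c/ file cache when a valid entry exists and falls back to simpleDownloader otherwise, passing any extra arguments straight through. The call sites changed in direct_link.py, gallerydl.py, i_redd_it.py and imgur_com.py below all reduce to this shape (the URL is a placeholder):

from reddit_imgs.system.downloader.modules._cacheable import get_link_bytes

bts = get_link_bytes('https://example.com/image.jpg')  # hypothetical URL
if bts is not None:
    print(f'got {len(bts)} bytes (from cache or from the network)')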

reddit_imgs/system/downloader/modules/direct_link.py (+12, -1)

@ -3,6 +3,7 @@
from ..downloadedData import DownloadedData
from ... import simpleDownloader
from ._cacheable import get_link_bytes
def works_on(domain):
return domain=='direct_link'
@ -58,6 +59,16 @@ class DirectLink(object):
'pornreactor.cc' in link
or
'u18chan.com' in link
or
's3.amazonaws.com/' in link
or
'//data.tumblr.com/' in link
or
'//ve.media.tumblr.com/' in link
or
'//redgifs.com/' in link
or
'.redgifs.com/' in link
):
return True
return False
@ -65,7 +76,7 @@ class DirectLink(object):
def download(self, link):
dd = DownloadedData()
simpleDownloader.cleanCookies()
bts = simpleDownloader.getUrlBytes(link, self.needsPromiscuity(link))
bts = get_link_bytes(link, self.needsPromiscuity(link))
simpleDownloader.cleanCookies()
if bts is not None:
dd.put(link,bts)


reddit_imgs/system/downloader/modules/gallerydl.py (+5, -7)

@ -3,6 +3,7 @@
import subprocess
from ._cacheable import get_link_bytes
from ... import simpleDownloader
from ..downloadedData import DownloadedData
from .direct_link import DirectLink
@ -81,13 +82,10 @@ class GalleryDlWrapper(DirectLink):
print('\r', end='', flush=True)
# print('\n'+link)
cached = cache.get_path_for_caching(link)
if cached.exists():
bts = cache.read_file_from_cache(cached)
else:
simpleDownloader.cleanCookies()
bts = simpleDownloader.getUrlBytes(
link, self.needsPromiscuity(link))
simpleDownloader.cleanCookies()
simpleDownloader.cleanCookies()
bts = get_link_bytes(
link, self.needsPromiscuity(link))
simpleDownloader.cleanCookies()
if bts is not None:
dd.put(link, bts)
if Path("gallery_dl_tmp.txt").exists():


reddit_imgs/system/downloader/modules/i_redd_it.py (+2, -1)

@ -5,6 +5,7 @@ import re
import json
import filetype
from bs4 import BeautifulSoup as _BS
from ._cacheable import get_link_bytes
from ..downloadedData import DownloadedData
from ... import simpleDownloader
@ -52,7 +53,7 @@ class IReddIt(object):
post_media_content = next(filter(notnone, [post.get('media')]), dict()).get('content', None)
imgloc = next(filter(notnone, [post_source_url, post_media_content]), None)
if imgloc is not None:
data = simpleDownloader.getUrlBytes(imgloc)
data = get_link_bytes(imgloc)
if data is not None:
dd.put(imgloc, data, filetype.guess_extension(data))
elif matchRedditUploads(link):


reddit_imgs/system/downloader/modules/imgur_com.py (+4, -3)

@ -7,6 +7,7 @@ import json
import shutil
import filetype
from pathlib import Path
from ._cacheable import get_link_bytes
from ..downloadedData import DownloadedData
from ... import simpleDownloader
@ -29,9 +30,9 @@ class ImgurCom(object):
simpleDownloader.setCookie('over18', '1')
bts = b''
if '/a/' not in link and '.gifv' not in link and '.webm' not in link:
bts = simpleDownloader.getUrlBytes(link)
bts = get_link_bytes(link)
elif link.endswith('.gifv'):
bts = simpleDownloader.getUrlBytes(link[:-4]+'mp4')
bts = get_link_bytes(link[:-4]+'mp4')
if bts is not None:
ext = filetype.guess_extension(bts)
if ext is not None:
@ -133,7 +134,7 @@ class ImgurCom(object):
durl = 'http://i.imgur.com/'+img['hash']+img['ext']
if durl in dd.storedLinks():
continue
imb = simpleDownloader.getUrlBytes(durl)
imb = get_link_bytes(durl)
if imb is None:
print()
print('Album part failed')


reddit_imgs/system/table_fmt.py (+86, -0)

@ -0,0 +1,86 @@
#!/usr/bin/env python3
# -*- encoding: utf-8 -*-
from typing import Any, List, Optional, Union
def table_fmt(labels: List[str],
table: List[List[Any]],
title: Optional[str] = None,
alignment: Optional[Union[str, List[str]]] = None,
divide_last_line: bool = False
) -> str:
lbl: List[str] = list(map(str, labels))
tbl: List[List[str]] = list(map(lambda l: list(map(str, l)), table))
lbl_sz = len(lbl)
for line in tbl:
line_sz = len(line)
if line_sz != lbl_sz:
raise ValueError(
"A table line does not match its cell count to the number of labels."
)
algnmt: List[str] = list()
if alignment is None:
algnmt = ['<']*lbl_sz
else:
if len(alignment) != lbl_sz:
raise ValueError(
"Table alignment does not match its rule count to the number of labels."
)
algnmt = list(alignment)
acceptable_rules = ('<', '^', '>')
for rl in algnmt:
if rl not in acceptable_rules:
raise ValueError(
"Table alignment rule %r is not one of %r" %
(rl, acceptable_rules)
)
tbl_sz = len(tbl)
tbl_szs: List[List[int]] = (
[list(map(len, lbl))] +
list(map(lambda l: list(map(len, l)), tbl))
)
row_widths: List[int] = list(map(max, zip(*tbl_szs)))
print(row_widths)
labels_tpl = (
'| ' +
' | '.join([f'{{{e}:^{l}}}' for e, l in enumerate(row_widths)]) +
' |'
)
data_tpl = (
'| ' +
' | '.join([f'{{{e}:{algnmt[e]}{l}}}' for e, l in enumerate(row_widths)]) +
' |'
)
lastlinesep = (
'|.' +
'.|.'.join(['.'*w for w in row_widths]) +
'.|'
)
title_tpl = '| {0:^%d} |\n' % (sum(row_widths)+3*(lbl_sz-1))
linesep_total_sz = (4+sum(row_widths)+3*(lbl_sz-1))
hugelinesep = '='*linesep_total_sz
linesep = '-'*linesep_total_sz
s = ''
# title section
s += hugelinesep + '\n'
if title:
s += title_tpl.format(title)
s += hugelinesep + '\n'
# row label section
s += labels_tpl.format(*lbl) + '\n'
s += linesep + '\n'
# data section
for seq, line in enumerate(tbl):
if seq+1 == tbl_sz and divide_last_line:
s+= lastlinesep + '\n'
s += data_tpl.format(*line) + '\n'
# row label section
s += linesep + '\n'
s += labels_tpl.format(*lbl) + '\n'
# title section
s += hugelinesep + '\n'
if title:
s += title_tpl.format(title)
s += hugelinesep + '\n'
return s
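
table_fmt renders a labelled ASCII table with one alignment rule per column ('<', '^' or '>') and an optional dotted separator before the final row, which normalizetobmp.py uses for its totals line. A small call in the same style (the counts are made up):

from reddit_imgs.system.table_fmt import table_fmt

print(table_fmt(
    ['ext', 'hashes'],
    [('png', 12), ('jpg', 30), ('total', 42)],  # made-up sample counts
    'Extensions',
    alignment='^>',          # center the 'ext' column, right-align 'hashes'
    divide_last_line=True,   # dotted separator before the totals row
))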

showcurrentdownload.py (+8, -0)

@ -0,0 +1,8 @@
#!/usr/bin/env python3
# -*- encoding: utf-8 -*-
from reddit_imgs.display_current_download import main
if __name__ == '__main__':
main()
