diff --git a/.gitignore b/.gitignore
index 1615e04..2f62f08 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,5 @@
+tmp
+tmp/**
 **/*.pyc
 **/__pycache__
 **/__pycache__/**
diff --git a/d/About.md b/d/About.md
index 05844bd..56d8b27 100644
--- a/d/About.md
+++ b/d/About.md
@@ -1 +1 @@
-This folder contains all images grouped by discussions.
+This folder contains all images grouped by subreddit.
diff --git a/reddit_imgs/reorganize.py b/reddit_imgs/reorganize.py
index 2deb27d..dd5cdf7 100755
--- a/reddit_imgs/reorganize.py
+++ b/reddit_imgs/reorganize.py
@@ -2,16 +2,13 @@
 # -*- encoding: utf-8 -*-
 
 import os
-import re
 import json
 import shutil
-import datetime
+from .system import subredditTools
 
 def readAllFile(s):
     with open(s) as f:
         return f.read()
 
-def slugify(dat):
-    return re.sub(r'[^\w\s\.\-\(\)\[\]]', '-', dat)
 
 wdir = os.path.abspath('.')
@@ -39,6 +36,10 @@
             )
         ))
 
+    linksDown = 0
+    linksNotDown = 0
+    linksErr = 0
+
     copyfiles = list()
 
     print('\r'+' '*79+'\r'+'Calculating changes...',end='')
@@ -48,29 +49,19 @@
         for link in links:
             imgd = os.path.join(idir, link['datakey'])
             meta = os.path.join(imgd, 'meta.json')
-            if os.path.exists(meta):
+            if not os.path.exists(meta):
+                linksNotDown+=1
+            else:
                 files = json.loads(readAllFile(meta))
-                for seq, file in enumerate(files):
-                    imgfrom = os.path.join(imgd, file['dname'])
-                    imgfn = ''
-                    imgfn+= subreddit
-                    imgfn+= '__'
-                    imgfn+= datetime.datetime.fromtimestamp(int(link['timestamp'])).isoformat().replace('T','_').replace(':','-')
-                    imgfn+= '_'
-                    imgfn+= 'nsfw' if link['nsfw'] else 'safe'
-                    imgfn+= '___'
-                    imgfn+= '-' if link['flair'] is None else slugify(link['flair'])
-                    imgfn+= '___'
-                    imgfn+= '-' if link['sharer'] is None else slugify(link['sharer'])
-                    imgfn+= '___'
-                    imgfn+= slugify(link['title'][:50])
-                    imgfn+= '___'
-                    imgfn+= slugify(link['datakey'])
-                    imgfn+= '___'
-                    imgfn+= str('%04d'%seq)
-                    imgfn+= '.'+file['ext']
-                    imgto = os.path.join(sdir,imgfn)
-                    copyfiles.append((imgfrom,imgto))
+                if len(files)>0:
+                    linksDown+=1
+                    for seq, file in enumerate(files):
+                        imgfrom = os.path.join(imgd, file['dname'])
+                        imgfn = subredditTools.assembleFileName(subreddit, link, seq, file['ext'])
+                        imgto = os.path.join(sdir,imgfn)
+                        copyfiles.append((imgfrom,imgto))
+                else:
+                    linksErr+=1
         del links
 
     lcf = len(copyfiles)
@@ -89,9 +80,14 @@
             print('\r'+' '*79+'\r'+'Aborted safely',end='')
             print()
             raise e
-
-    print('\r'+' '*79+'\r'+'Done.',end='')
+    print('\r'+' '*79+'\r'+'100% copied')
+    print('%05d files successfully downloaded'%lcf)
+    print('%05d links downloaded'%linksDown)
+    print('%05d links errored'%linksErr)
+    print('%05d links ignored'%linksNotDown)
+    print('%05d links total'%(linksDown+linksNotDown+linksErr))
     print()
+    print('Done.')
 
 if __name__ == '__main__':
     main()
diff --git a/reddit_imgs/runner.py b/reddit_imgs/runner.py
index 1608d03..ab86600 100755
--- a/reddit_imgs/runner.py
+++ b/reddit_imgs/runner.py
@@ -4,12 +4,16 @@
 import reddit_imgs.sync
 import reddit_imgs.fetch
 import reddit_imgs.reorganize
+import reddit_imgs.wallpapers
 import os
+import sys
 import shutil
 
 wdir = os.path.abspath('.')
 
 def ensureFolderAvailability():
+    if not os.path.exists(os.path.join(wdir,'w')):
+        os.makedirs(os.path.join(wdir,'w'))
     if not os.path.exists(os.path.join(wdir,'d')):
         os.makedirs(os.path.join(wdir,'d'))
     if not os.path.exists(os.path.join(wdir,'i')):
@@ -27,6 +31,8 @@
         print('1) List monitored subreddits')
         print('2) Add monitored subreddit')
         print('3) Remove monitored subreddit')
+        print('4) Set as wallpaper source')
+        print('5) Unset as wallpaper source')
         print()
         print('0) Back')
         print('----------------------------------------------')
@@ -39,10 +45,14 @@
         subreddits_dir = os.path.join(wdir,'r')
         subreddits_isfolder = lambda sr: os.path.isdir(os.path.join(subreddits_dir,sr))
         subreddits = sorted(filter(subreddits_isfolder, os.listdir(subreddits_dir)))
-        if i=='1' or i=='3':
+        if i in ['1', '3', '4', '5']:
             print('Subreddits monitored:')
             for sr in subreddits:
-                print('/r/%s'%sr)
+                print('/r/{0}'.format(sr),end='')
+                if os.path.isfile(os.path.join(subreddits_dir,sr,'wallpaper.flag')):
+                    print('\t\t(wallpaper)')
+                else:
+                    print()
             print()
             if i=='1':
                 print('Press enter to continue')
@@ -65,6 +75,37 @@
             print('Done.')
             print('Press enter to continue')
             input()
+        elif i=='4':
+            print('Enter the subreddit you want to set as wallpaper source:')
+            add = input('/r/')
+            try:
+                dd = os.path.join(subreddits_dir,add)
+                if not os.path.exists(dd):
+                    os.makedirs(dd)
+                f = open(os.path.join(dd, 'wallpaper.flag'),'w')
+                f.write('')
+                f.close()
+            except: pass
+            print()
+            print('Done.')
+            print('Press enter to continue')
+            input()
+        elif i=='5':
+            print('Enter the subreddit you want to unset as wallpaper source:')
+            add = input('/r/')
+            try:
+                dd = os.path.join(subreddits_dir,add)
+                if not os.path.exists(dd):
+                    os.makedirs(dd)
+                f = open(os.path.join(dd, 'wallpaper.flag'),'w')
+                f.write('')
+                f.close()
+                os.remove(os.path.join(dd, 'wallpaper.flag'))
+            except: pass
+            print()
+            print('Done.')
+            print('Press enter to continue')
+            input()
 
 def mainmenu():
     i = ''
@@ -77,6 +118,7 @@
         print('2) Get link list to be downloaded from reddit')
         print('3) Download grabbed links')
         print('4) Group and put nice names on downloaded data')
+        print('5) Separate wallpapers')
         print()
         print('0) Quit')
         print('----------------------------------------------')
@@ -92,10 +134,28 @@
             reddit_imgs.fetch.main()
         elif i=='4':
             reddit_imgs.reorganize.main()
+        elif i=='5':
+            reddit_imgs.wallpapers.main()
 
 def main():
     ensureFolderAvailability()
-    mainmenu()
+    if len(sys.argv)>1:
+        cmdline()
+    else:
+        mainmenu()
+
+def cmdline():
+    cmd = sys.argv[1]
+    if cmd == 'sync':
+        reddit_imgs.sync.main()
+    elif cmd == 'fetch':
+        reddit_imgs.fetch.main()
+    elif cmd == 'reorganize':
+        reddit_imgs.reorganize.main()
+    elif cmd == 'wallpapers':
+        reddit_imgs.wallpapers.main()
+    else:
+        print('Usage: {0} [sync/fetch/reorganize/wallpapers]'.format(sys.argv[0]))
 
 if __name__ == '__main__':
     main()
diff --git a/reddit_imgs/sync.py b/reddit_imgs/sync.py
index e66d065..e6f1687 100755
--- a/reddit_imgs/sync.py
+++ b/reddit_imgs/sync.py
@@ -26,6 +26,7 @@
     except: pass #srdt = getEmptySubredditData(subreddit)
     pageno = 0
 
+    ygst = srdt['date_first']
     while nextpage:
         pageno+=1
         print(('/r/{0:<20} loading page #%05d'%pageno).format(subreddit))
@@ -33,7 +34,7 @@
         redditBytes = simpleDownloader.getUrlBytes(nextpage)
         bs = BeautifulSoup(redditBytes)
         first, last, nextpage, links = getSubredditPageInfo(bs)
-        if srdt['date_last'] <= last:
+        if ygst >= first: #if the newest stored post is at least as recent as the newest post on this page, we are up-to-date
             nextpage = None
         srdt['date_first'] = max(first, srdt['date_first'])
         srdt['date_last'] = min(last, srdt['date_last'])
diff --git a/reddit_imgs/system/downloader/downloadedData.py b/reddit_imgs/system/downloader/downloadedData.py
index 5dc687c..87f687d 100644
--- a/reddit_imgs/system/downloader/downloadedData.py
+++ b/reddit_imgs/system/downloader/downloadedData.py
@@ -7,9 +7,24 @@
 import json
 import os
 class DownloadedData(object):
-    def __init__(self):
+    def __init__(self, loadfrom = None):
+        self.initialize()
+        self.loadfrom(loadfrom)
+    def initialize(self):
         self.control = list()
         self.fb = dict()
+    def loadfrom(self,loadfrom):
+        if loadfrom:
+            with open(os.path.join(loadfrom,'meta.json')) as f:
+                self.control = json.loads(f.read())
+            for ctrl in self.control:
+                fnm = ctrl['dname']
+                cnt = b''
+                with open(os.path.join(loadfrom,fnm),'rb') as f:
+                    cnt = f.read()
+                self.fb[fnm] = cnt
+    def storedLinks(self):
+        return [ctrl['link'] for ctrl in self.control]
     def put(self, link, downloaded, ext=None):
         if ext is None:
             try:
diff --git a/reddit_imgs/system/downloader/modules/direct_link.py b/reddit_imgs/system/downloader/modules/direct_link.py
index dc62d5f..440294e 100644
--- a/reddit_imgs/system/downloader/modules/direct_link.py
+++ b/reddit_imgs/system/downloader/modules/direct_link.py
@@ -17,14 +17,31 @@
             link.startswith('http://dl.dropboxusercontent.com')
             or
             link.startswith('https://dl.dropboxusercontent.com')
+            or
+            link.startswith('http://pawsru.org')
+            or
+            link.startswith('https://pawsru.org')
         ):
             return False
         return True
 
+    def needsPromiscuity(self, link):
+        if (
+            link.startswith('http://cdn.discordapp.com')
+            or
+            link.startswith('https://cdn.discordapp.com')
+            or
+            link.startswith('http://www.weasyl.com')
+            or
+            link.startswith('https://www.weasyl.com')
+        ):
+            return True
+        return False
+
     def download(self, link):
         dd = DownloadedData()
         simpleDownloader.cleanCookies()
-        bts = simpleDownloader.getUrlBytes(link)
+        bts = simpleDownloader.getUrlBytes(link, self.needsPromiscuity(link))
         simpleDownloader.cleanCookies()
         if bts is not None:
             dd.put(link,bts)
diff --git a/reddit_imgs/system/downloader/modules/imgur_com.py b/reddit_imgs/system/downloader/modules/imgur_com.py
index 3a299ab..1a840ba 100644
--- a/reddit_imgs/system/downloader/modules/imgur_com.py
+++ b/reddit_imgs/system/downloader/modules/imgur_com.py
@@ -1,8 +1,10 @@
 #!/usr/bin/env python3
 # -*- encoding: utf-8 -*-
 
+import os
 import re
 import json
+import shutil
 import filetype
 from ..downloadedData import DownloadedData
 from ... import simpleDownloader
@@ -62,6 +64,20 @@
             imgs = [albnfo]
             if 'album_images' in albnfo:
                 imgs = albnfo['album_images']['images']
+            intermediarySaves = len(imgs)>=10
+            if intermediarySaves:
+                if not os.path.isdir('tmp'):
+                    os.makedirs('tmp')
+                if os.path.isfile('tmp/link.url'):
+                    with open('tmp/link.url') as f:
+                        svdlnk = f.read()
+                    if svdlnk == link:
+                        dd.loadfrom('tmp')
+                    else:
+                        shutil.rmtree('tmp')
+                        os.makedirs('tmp')
+                with open('tmp/link.url', 'w') as f:
+                    f.write(link)
             for seq, img in enumerate(imgs):
                 print(' '*50,end='')
                 print('\r',end='')
@@ -70,6 +86,8 @@
                 if img['ext'] == '.gifv':
                     img['ext'] = '.mp4'
                 durl = 'http://i.imgur.com/'+img['hash']+img['ext']
+                if durl in dd.storedLinks():
+                    continue
                 imb = simpleDownloader.getUrlBytes(durl)
                 if imb is None:
                     print()
@@ -78,7 +96,11 @@
                     simpleDownloader.cleanCookies()
                     return None
                 dd.put(durl, imb, img['ext'][1:])
+                if intermediarySaves and seq%10 == 0:
+                    dd.into('tmp')
                 print('\r',end='')
+            if os.path.isdir('tmp'):
+                shutil.rmtree('tmp')
         else:
             dd.put(link, bts, ext)
         simpleDownloader.cleanCookies()
diff --git a/reddit_imgs/system/simpleDownloader.py b/reddit_imgs/system/simpleDownloader.py
index bca81b2..a8bdd0c 100644
--- a/reddit_imgs/system/simpleDownloader.py
+++ b/reddit_imgs/system/simpleDownloader.py
@@ -33,7 +33,7 @@
 def setCookies(newCookies):
     cleanCookies()
     patchCookies(newCookies)
-def getUrlBytes(url):
+def getUrlBytes(url, giveUpOn403=False):
     global cookie
     request = urllib.request.Request(url)
     request.add_header('User-Agent', 'Mozilla/5.0 (X11; Linux x86_64) '+
@@ -45,7 +45,7 @@
     request.add_header("Cookie", '; '.join(map(lambda a: '='.join(a), cookie.items())))
     response = None
     try:
-        response = urllib.request.urlopen(request, timeout=15)
+        response = urllib.request.urlopen(request, timeout=30)
     except urllib.error.HTTPError as e:
         if e.code == 429:
             print('[URL] Got 429 (Too Many Requests): sleeping for 5 seconds')
@@ -57,12 +57,28 @@
             print(' @ %s'%url)
             time.sleep(5)
             return getUrlBytes(url)
+        if e.code == 403 and giveUpOn403:
+            print('[URL] Got 403 (Forbidden): assuming "Not Found"')
+            print(' @ %s'%url)
+            return None
+        elif e.code == 500:
+            print('[URL] Got 500 (Server Error): assuming "Not Found"')
+            return None
         elif e.code == 404:
             return None
         elif e.code == 400:
             return None
         raise e
     except urllib.error.URLError as e:
+        if str(e.reason).startswith('EOF occurred in violation of protocol ('):
+            print('Server doesn\'t know how to use HTTP properly - assuming "Not Found"')
+            return None
+        if str(e.reason).startswith('[SSL: CERTIFICATE'):
+            print('Their SSL certificate is screwed up - assuming "Not Found"')
+            return None
+        if str(e.reason).startswith('[Errno -5]'):
+            print('Their DNS server is screwed up - assuming "Not Found"')
+            return None
         if str(e.reason).startswith('[Errno -2]'):
             return None
         if str(e.reason).startswith('[Errno -3]'):
diff --git a/reddit_imgs/system/subredditTools.py b/reddit_imgs/system/subredditTools.py
index 96de9b2..6f4ef2f 100644
--- a/reddit_imgs/system/subredditTools.py
+++ b/reddit_imgs/system/subredditTools.py
@@ -1,8 +1,10 @@
 #!/usr/bin/env python3
 # -*- encoding: utf-8 -*-
 
+import datetime
 import dateutil.parser
 from .limits import minint, maxint
+from .textTools import slugify
 
 def getInfoFromRedditItem(bs):
     nsfw = 'over18' in bs['class']
@@ -52,3 +54,23 @@
     except: pass
     structured_links = list(map(getInfoFromRedditItem, links))
     return first, last, nextpage, structured_links
+
+def assembleFileName(subreddit,link,seq,ext):
+    imgfn = ''
+    imgfn+= subreddit
+    imgfn+= '__'
+    imgfn+= datetime.datetime.fromtimestamp(int(link['timestamp'])).isoformat().replace('T','_').replace(':','-')
+    imgfn+= '_'
+    imgfn+= 'nsfw' if link['nsfw'] else 'safe'
+    imgfn+= '___'
+    imgfn+= '-' if link['flair'] is None else slugify(link['flair'])
+    imgfn+= '___'
+    imgfn+= '-' if link['sharer'] is None else slugify(link['sharer'])
+    imgfn+= '___'
+    imgfn+= slugify(link['title'][:50])
+    imgfn+= '___'
+    imgfn+= slugify(link['datakey'])
+    imgfn+= '___'
+    imgfn+= str('%04d'%seq)
+    imgfn+= '.'+ext
+    return imgfn
diff --git a/reddit_imgs/system/textTools.py b/reddit_imgs/system/textTools.py
new file mode 100644
index 0000000..1bfd56c
--- /dev/null
+++ b/reddit_imgs/system/textTools.py
@@ -0,0 +1,7 @@
+#!/usr/bin/env python3
+# -*- encoding: utf-8 -*-
+
+import re
+
+def slugify(dat):
+    return re.sub(r'[^\w\s\.\-\(\)\[\]]', '-', dat)
diff --git a/reddit_imgs/wallpapers.py b/reddit_imgs/wallpapers.py
new file mode 100644
index 0000000..a4541f7
--- /dev/null
+++ b/reddit_imgs/wallpapers.py
@@ -0,0 +1,130 @@
+#!/usr/bin/env python3
+# -*- encoding: utf-8 -*-
+
+import os
+import json
+import shutil
+import filetype
+import PIL.Image
+from .system import subredditTools
+
+def readAllFile(d):
+    with open(d) as f:
+        return f.read()
+
+proportion = (
+    5/4,
+    21/9
+)
+minPixels = 1000**2
+
+wdir = os.path.abspath('.')
+
+idir = os.path.join(wdir, 'i')
+pdir = os.path.join(wdir, 'w')
+rdir = os.path.join(wdir, 'r')
+
+def main():
+    subreddits = (
+        map(
+            lambda sr: json.loads(readAllFile(os.path.join(rdir,sr,'subreddit.json'))),
+            sorted(
+                filter(
+                    lambda sr:
+                        os.path.isdir(os.path.join(rdir,sr))
+                        and
+                        os.path.isfile(os.path.join(rdir,sr,'subreddit.json'))
+                        and
+                        os.path.isfile(os.path.join(rdir,sr,'wallpaper.flag')),
+                    os.listdir(rdir)
+                )
+            )
+        ))
+    pass
+
+    copyfiles = list()
+
+    linksDown = 0
+    linksNotDown = 0
+    linksErr = 0
+
+    print('Listing files...')
+
+    for subreddit in subreddits:
+        sdir = pdir
+        for link in subreddit['links']:
+            imgd = os.path.join(idir, link['datakey'])
+            meta = os.path.join(imgd, 'meta.json')
+            if not os.path.exists(meta):
+                linksNotDown+=1
+            else:
+                files = json.loads(readAllFile(meta))
+                if len(files)<=0:
+                    linksErr+=1
+                else:
+                    linksDown+=1
+                    for seq, file in enumerate(files):
+                        imgfrom = os.path.join(imgd, file['dname'])
+                        ext = filetype.guess_extension(imgfrom)
+                        imgfn = subredditTools.assembleFileName(
+                            subreddit['subreddit'],
+                            link,
+                            seq,
+                            ext
+                        )
+                        nsfwsafe = 'nsfw' if link['nsfw'] else 'safe'
+                        imgto = os.path.join(sdir,nsfwsafe,imgfn)
+                        copyfiles.append((imgfrom,imgto))
+
+    print('Creating folders...')
+
+    lcf = len(copyfiles)
+    for (cnt, (src, dst)) in enumerate(copyfiles):
+        container = os.path.dirname(os.path.abspath(dst))
+        if not os.path.exists(container):
+            os.makedirs(container)
+
+    print('Ensuring minimum resolution and proportion...')
+    ignored=0
+    kept=0
+
+    lcf = len(copyfiles)
+    print('\r'+' '*79+'\r'+'%03d%% processed: %05d of %05d'%(0, 0, lcf),end='')
+    for (cnt, (src, dst)) in reversed(list(enumerate(copyfiles))):
+        if os.path.exists(dst): continue
+        print('\r'+' '*79+'\r'+'%03d%% processed: %05d of %05d'%((((lcf-cnt)/lcf)*100)//1, lcf-cnt, lcf),end='')
+        with PIL.Image.open(src) as img:
+            width, height = img.size
+        prop = width/height
+        pxls = width*height
+        if not (pxls >= minPixels and prop >= proportion[0] and prop <= proportion[1]):
+            ignored+=1
+            del copyfiles[cnt]
+        else:
+            kept+=1
+    print()
+
+    print('Copying files...')
+
+    lcf = len(copyfiles)
+    print('\r'+' '*79+'\r'+'%03d%% copied: %05d of %05d'%(0, 0, lcf),end='')
+    for (cnt, (src, dst)) in enumerate(copyfiles):
+        if os.path.exists(dst): continue
+        print('\r'+' '*79+'\r'+'%03d%% copied: %05d of %05d'%((((cnt+1)/lcf)*100)//1, cnt+1, lcf),end='')
+        try:
+            shutil.copyfile(src, dst)
+        except KeyboardInterrupt as e:
+            print()
+            print('\r'+' '*79+'\r'+'Deleting interrupted file...',end='')
+            os.remove(dst)
+            print('\r'+' '*79+'\r'+'Aborted safely',end='')
+            print()
+            raise e
+    print()
+    print()
+    print('{0:>5} files were kept'.format(kept))
+    print('{0:>5} files were ignored'.format(ignored))
+
+if __name__ == '__main__':
+    main()
+    
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
index f2c89b8..134ab90 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,2 +1,3 @@
 filetype==1.0.0
 beautifulsoup4==4.6.0
+Pillow==5.0.0
diff --git a/w/.gitignore b/w/.gitignore
new file mode 100644
index 0000000..b66ba8a
--- /dev/null
+++ b/w/.gitignore
@@ -0,0 +1,3 @@
+**
+!About.md
+!.gitignore
diff --git a/w/About.md b/w/About.md
new file mode 100644
index 0000000..4625178
--- /dev/null
+++ b/w/About.md
@@ -0,0 +1 @@
+This folder contains all wallpapers grouped by safety.