reddit-image-wall-getter/reddit_imgs/system/downloader/modules/_user_tumblr_com.py

#!/usr/bin/env python3
# -*- encoding: utf-8 -*-

import filetype
from bs4 import BeautifulSoup as _BS
from ..downloadedData import DownloadedData
from ... import simpleDownloader

def BeautifulSoup(data):
    """Parse *data* with bs4, always selecting the ``html5lib`` parser."""
    parser_name = 'html5lib'
    return _BS(data, parser_name)

def works_on(domain):
    """Tell whether *domain* is a subdomain of ``tumblr.com``.

    Only names ending in ``.tumblr.com`` (leading dot included) qualify, so
    the bare ``tumblr.com`` host is not accepted.
    """
    handled_suffix = '.tumblr.com'
    is_handled = domain.endswith(handled_suffix)
    return is_handled

class UserTumblrCom(object):
    """Downloader for image posts hosted on ``*.tumblr.com`` pages.

    The page is fetched and parsed, the element holding the post is located
    by trying a list of known ids/classes, and every image found in it (or
    in its photoset iframe) is downloaded into a ``DownloadedData``.
    """

    # ``find`` keyword/value pairs tried one at a time, in priority order,
    # to locate the element that wraps the post. The first hit wins.
    _CONTAINER_LOOKUPS = (
        ('class_', 'entries'),
        ('id', 'post'),
        ('id', 'Post'),
        ('class_', 'content'),
        ('class_', 'Photo'),
        ('class_', 'photo'),
        ('class_', 'stat-photo'),
        ('id', 'posts'),
        ('id', 'base-container'),
        ('class_', 'posts'),
        ('class_', 'post-wrap'),
        ('class_', 'text-post'),
        ('class_', 'entry'),
        ('class_', 'grid'),
        ('class_', 'stat-answer'),
        ('class_', 'article-content'),
        ('id', 'content'),
        ('id', 'stuff'),
        ('class_', 'audio-post'),
        ('class_', 'video-post'),
        ('class_', 'media-post'),
        ('class_', 'post-media'),
        ('class_', 'p'),
        ('class_', 'PhotoPost'),
        ('class_', 'tmblr-full'),
        ('class_', 'PhotoSet'),
        ('class_', 'bigthings'),
        ('class_', 'photoset-post'),
        ('class_', 'text_wrap'),
        ('class_', 'post-photo'),
        ('class_', 'photo-post-photo'),
        ('class_', 'photo-post'),
        ('class_', 'wholeimage'),
        ('class_', 'image'),
        ('class_', 'zoombox'),
    )

    # CSS classes whose elements are handed to ``extractimage`` when the
    # post has no photoset iframe; ``<figure>`` elements are scanned as well.
    _IMAGE_HOLDER_CLASSES = (
        'photo-wrapper',
        'PhotoWrapper',
        'ThePhoto',
        'photo',
        'photo-data',
        'post',
        'box',
        'media',
        'full-image',
        'full-media',
        'image',
        'imgback',
    )

    def recognizes(self, link):
        """Accept every link; this class performs no per-link filtering."""
        return True

    @staticmethod
    def _status(message):
        """Overwrite the current console line with *message*."""
        # Blank the line, return to column 0, write the message, and return
        # again so the next status update overwrites this one.
        print(' ' * 50 + '\r' + message + '\r', end='')

    def _find_post_container(self, pagebs):
        """Return the first element matching ``_CONTAINER_LOOKUPS``, or None."""
        for keyword, value in self._CONTAINER_LOOKUPS:
            found = pagebs.find(**{keyword: value})
            if found is not None:
                return found
        return None

    def _inline_image_urls(self, postbs):
        """Collect unique image URLs from the known holder elements of *postbs*.

        URLs are returned in first-seen order so the download sequence is
        the same on every run.
        """
        holders = []
        for css_class in self._IMAGE_HOLDER_CLASSES:
            holders.extend(postbs.find_all(class_=css_class))
        holders.extend(postbs.find_all('figure'))
        urls = (
            found['href']
            for found in map(extractimage, holders)
            if found is not None
        )
        # dict.fromkeys de-duplicates while keeping insertion order.
        return list(dict.fromkeys(urls))

    def download(self, link):
        """Download every image of the post at *link*.

        Returns:
            A ``DownloadedData`` holding one entry per downloaded image. It
            is returned empty when the page cannot be fetched, when the page
            contains the ``safemode_actions_display`` marker, when no known
            post container is present, or when the photoset listing cannot
            be fetched. ``None`` is returned when the download of an
            individual image fails.
        """
        dd = DownloadedData()
        self._status(' `--> Fetching image list link')
        pagebytes = simpleDownloader.getUrlBytes(link)
        if pagebytes is None:
            return dd
        if b'safemode_actions_display' in pagebytes:
            return dd
        postbs = self._find_post_container(BeautifulSoup(pagebytes))
        if postbs is None:
            # Unknown page layout: nothing to extract, rather than failing
            # on a None dereference.
            return dd
        iframebs = postbs.find('iframe', class_='photoset')
        if iframebs is not None:
            iframesrc = iframebs['src']
            if iframesrc.startswith('/'):
                iframesrc = 'https://www.tumblr.com' + iframesrc
            self._status(' `--> Fetching image list')
            iframebytes = simpleDownloader.getUrlBytes(iframesrc)
            if iframebytes is None:
                # Same outcome as a failed page fetch above.
                return dd
            photoset = BeautifulSoup(iframebytes)
            urls = [
                anchor['href']
                for anchor in photoset.find_all('a', class_='photoset_photo')
            ]
        else:
            urls = self._inline_image_urls(postbs)
        total = len(urls)
        for seq, url in enumerate(urls, 1):
            self._status(' `--> Album image #%03d of %03d' % (seq, total))
            data = simpleDownloader.getUrlBytes(url)
            if data is None:
                return None
            dd.put(url, data, filetype.guess_extension(data))
        return dd

# Substrings that mark a URL as pointing at an image file.
_IMAGE_URL_MARKERS = ('.jpg', '.png', '.gif', '.bmp')


def _looks_like_image_url(url):
    """Return True when *url* contains one of ``_IMAGE_URL_MARKERS``."""
    return any(marker in url for marker in _IMAGE_URL_MARKERS)


def extractimage(el):
    """Pick the image URL carried by *el*, if any.

    *el* is searched for an ``<a>`` and an ``<img>``; *el* itself is used
    when it is one of those tags and contains no nested one.

    Returns:
        An object whose ``['href']`` is a URL containing ``.jpg``, ``.png``,
        ``.gif`` or ``.bmp``: either the matched anchor or a fresh
        ``{'href': ...}`` dict built from the image's ``src`` (or the first
        entry of its ``srcset``). ``None`` when no such URL is found.
    """
    link = el.find('a')
    if link is None and el.name == 'a':
        link = el
    # An anchor without href carries no URL; treat it as absent instead of
    # failing on the missing attribute.
    if link is not None and link.get('href') is None:
        link = None

    img = el.find('img')
    if img is None and el.name == 'img':
        img = el

    # NOTE(review): the <img> source replaces the anchor when the anchor
    # already points at an image, while an anchor pointing elsewhere is kept
    # here and discarded below. Kept as is; confirm this is intended.
    if img is not None and (link is None or _looks_like_image_url(link['href'])):
        src = img.get('src')
        if src is None:
            srcset = img.get('srcset')
            if srcset is not None:
                # First candidate of the srcset list, without its descriptor.
                src = srcset.split(' ')[0].split(',')[0]
        # With neither attribute present, keep whatever link was found.
        if src is not None:
            link = {'href': src}

    if link is not None and not _looks_like_image_url(link['href']):
        link = None
    return link

def get_class():
    """Return the downloader class defined in this module."""
    downloader_class = UserTumblrCom
    return downloader_class