167 lines
6.6 KiB
Python
167 lines
6.6 KiB
Python
#!/usr/bin/env python3
|
|
# -*- encoding: utf-8 -*-
|
|
|
|
import filetype
|
|
from bs4 import BeautifulSoup as _BS
|
|
from ..downloadedData import DownloadedData
|
|
from ... import simpleDownloader
|
|
|
|
def BeautifulSoup(data):
    """Parse *data* with the ``html5lib`` tree builder.

    Thin wrapper around the imported ``_BS`` constructor so that every
    caller in this module uses the same parser.

    Args:
        data: Markup to parse; passed to ``_BS`` unchanged.

    Returns:
        Whatever ``_BS(data, 'html5lib')`` returns (the parsed document).
    """
    parser_name = 'html5lib'
    return _BS(data, parser_name)
|
|
|
|
def works_on(domain):
    """Tell whether *domain* is a sub-domain of ``tumblr.com``.

    Host names are case-insensitive, so the check is done on the
    lowercased domain; ``Blog.Tumblr.COM`` is accepted just like
    ``blog.tumblr.com``.

    Args:
        domain: Host name (string) taken from the link being examined.

    Returns:
        True when the domain ends with ``.tumblr.com``, otherwise False.
        The bare ``tumblr.com`` host does not match because the leading
        dot is part of the suffix.
    """
    return domain.lower().endswith('.tumblr.com')
|
|
|
|
class UserTumblrCom(object):
    """Downloader for image posts hosted on ``*.tumblr.com`` blogs.

    The page is fetched, the element holding the post is located by probing
    a list of theme-specific ids/classes, and every image found inside it is
    downloaded into a ``DownloadedData`` container.
    """

    # Keyword arguments for ``soup.find``; probed in order, first hit wins.
    # The order is significant: generic names come after specific ones.
    _POST_CONTAINER_SELECTORS = (
        {'class_': 'entries'},
        {'id': 'post'},
        {'id': 'Post'},
        {'class_': 'content'},
        {'class_': 'Photo'},
        {'class_': 'photo'},
        {'class_': 'stat-photo'},
        {'id': 'posts'},
        {'id': 'base-container'},
        {'class_': 'posts'},
        {'class_': 'post-wrap'},
        {'class_': 'text-post'},
        {'class_': 'entry'},
        {'class_': 'grid'},
        {'class_': 'stat-answer'},
        {'class_': 'article-content'},
        {'id': 'content'},
        {'id': 'stuff'},
        {'class_': 'audio-post'},
        {'class_': 'video-post'},
        {'class_': 'media-post'},
        {'class_': 'post-media'},
        {'class_': 'p'},
        {'class_': 'PhotoPost'},
        {'class_': 'tmblr-full'},
        {'class_': 'PhotoSet'},
        {'class_': 'bigthings'},
        {'class_': 'photoset-post'},
        {'class_': 'text_wrap'},
        {'class_': 'post-photo'},
        {'class_': 'photo-post-photo'},
        {'class_': 'photo-post'},
        {'class_': 'wholeimage'},
        {'class_': 'image'},
        {'class_': 'zoombox'},
    )

    # CSS classes of elements that may wrap a single image inside a post.
    _IMAGE_WRAPPER_CLASSES = (
        'photo-wrapper',
        'PhotoWrapper',
        'ThePhoto',
        'photo',
        'photo-data',
        'post',
        'box',
        'media',
        'full-image',
        'full-media',
        'image',
        'imgback',
    )

    def recognizes(self, link):
        """Accept every link; always returns True."""
        return True

    def download(self, link):
        """Download all images of the Tumblr post at *link*.

        Args:
            link: URL of the post page.

        Returns:
            A ``DownloadedData`` holding one entry per image (stored with
            ``put(url, bytes, extension)``). The container is empty when the
            page cannot be fetched, when the page contains the
            ``safemode_actions_display`` marker, or when no post container
            is found. ``None`` is returned when fetching one of the images
            fails, so a partial album is never handed back.
        """
        dd = DownloadedData()
        self._progress(' `--> Fetching image list link')
        pagebytes = simpleDownloader.getUrlBytes(link)
        if pagebytes is None:
            return dd
        # NOTE(review): presumably the marker of Tumblr's safe-mode
        # interstitial, which carries no post content -- confirm.
        if b'safemode_actions_display' in pagebytes:
            return dd

        postbs = self._find_post_container(BeautifulSoup(pagebytes))
        if postbs is None:
            # Unknown theme: nothing to search in, so report "no images"
            # instead of failing on ``None.find``.
            return dd

        iframebs = postbs.find('iframe', class_='photoset')
        if iframebs is not None:
            hrefs = self._photoset_hrefs(iframebs)
        else:
            hrefs = self._inline_hrefs(postbs)

        total = len(hrefs)
        for seq, href in enumerate(hrefs, start=1):
            self._progress(' `--> Album image #%03d of %03d' % (seq, total))
            data = simpleDownloader.getUrlBytes(href)
            if data is None:
                # Kept as-is: a failed image aborts the whole album.
                return None
            dd.put(href, data, filetype.guess_extension(data))
        return dd

    @staticmethod
    def _progress(message):
        """Overwrite the current terminal line with *message*."""
        # Blank 50 columns, return the carriage, write, return again so the
        # next status line overwrites this one.
        print(' ' * 50 + '\r' + message + '\r', end='')

    @classmethod
    def _find_post_container(cls, pagebs):
        """Return the first element matching a known post selector, or None."""
        for selector in cls._POST_CONTAINER_SELECTORS:
            found = pagebs.find(**selector)
            if found is not None:
                return found
        return None

    @staticmethod
    def _absolute_iframe_src(src):
        """Turn a relative or protocol-relative iframe ``src`` into a URL."""
        if src.startswith('//'):
            # Protocol-relative: already names its host, only needs a scheme.
            return 'https:' + src
        if src.startswith('/'):
            return 'https://www.tumblr.com' + src
        return src

    def _photoset_hrefs(self, iframebs):
        """Fetch the photoset iframe and list the image URLs it links to."""
        iframesrc = iframebs.get('src')
        if not iframesrc:
            return []
        self._progress(' `--> Fetching image list')
        iframebytes = simpleDownloader.getUrlBytes(
            self._absolute_iframe_src(iframesrc))
        if iframebytes is None:
            # The iframe could not be fetched; there is nothing to parse.
            return []
        anchors = BeautifulSoup(iframebytes).find_all(
            'a', class_='photoset_photo')
        return [a.get('href') for a in anchors if a.get('href')]

    def _inline_hrefs(self, postbs):
        """List unique image URLs embedded directly in the post element."""
        wrappers = []
        for css_class in self._IMAGE_WRAPPER_CLASSES:
            wrappers.extend(postbs.find_all(class_=css_class))
        wrappers.extend(postbs.find_all('figure'))

        hrefs = []
        for wrapper in wrappers:
            found = extractimage(wrapper)
            if found is not None:
                hrefs.append(found['href'])
        # The same image is usually matched by several selectors; drop the
        # repeats while keeping document order so downloads are stable.
        return list(dict.fromkeys(hrefs))
|
|
|
|
def extractimage(el):
    """Pick the downloadable image reference out of one wrapper element.

    *el* must offer ``find(tag_name)``, a ``name`` attribute and ``get(attr)``
    the way a BeautifulSoup tag does. The first ``<a>`` and first ``<img>``
    inside *el* (or *el* itself when it is one of those) are considered.

    Selection rules:
      * An ``<img>`` is used when there is no usable link, or when the link
        itself already points at an image; its ``src`` wins, with the first
        ``srcset`` candidate as fallback.
      * A link that points to something other than an image is rejected even
        if an ``<img>`` is present.
      * Whatever was chosen must contain ``.jpg``, ``.png``, ``.gif`` or
        ``.bmp`` somewhere in its URL.

    Returns:
        The original ``<a>`` element when it links to an image and no image
        URL could be taken from an ``<img>``; a ``{'href': url}`` dict when
        the URL came from an ``<img>``; ``None`` when nothing qualifies.
        Either non-None result can be indexed with ``['href']``.
    """
    image_markers = ('.jpg', '.png', '.gif', '.bmp')

    def is_image_url(url):
        # Substring (not suffix) match, so URLs with a query string pass.
        return bool(url) and any(marker in url for marker in image_markers)

    link = el.find('a')
    if link is None and el.name == 'a':
        link = el
    if link is not None and not link.get('href'):
        # An anchor without href cannot be downloaded; treat it as absent
        # rather than failing on the missing attribute.
        link = None

    img = el.find('img')
    if img is None and el.name == 'img':
        img = el

    if img is not None and (link is None or is_image_url(link.get('href'))):
        src = img.get('src')
        if not src:
            # srcset is "url descriptor, url descriptor, ..."; take the URL
            # of the first candidate.
            first_candidate = (img.get('srcset') or '').split(',')[0].split()
            src = first_candidate[0] if first_candidate else None
        if src:
            link = {'href': src}

    if link is not None and not is_image_url(link.get('href')):
        return None
    return link
|
|
|
|
def get_class():
    """Return the downloader class defined by this module.

    Returns:
        The ``UserTumblrCom`` class object itself, not an instance.
    """
    downloader_class = UserTumblrCom
    return downloader_class
|