python-ebooker/ebooker/downloaders.py

#!/usr/bin/env python3
# -*- encoding: utf-8 -*-
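"""Downloaders that turn online stories into cached e-book source material.

Each downloader subclasses AbstractBookDownloader: ``downloads`` reports
whether it can handle a link, ``fetch`` scrapes the story into an on-disk
CachedBook, and ``neutralize`` turns the cached chapters into output markup.
"""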
import re
import json
import requests
import filetype
from hashlib import md5
from slugify import slugify
from bs4 import BeautifulSoup
from bs4 import BeautifulStoneSoup
from urllib.parse import urljoin
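
# Patterns for the Wattpad scraper: wattpadmatch1 recognises story URLs,
# wattpadpages and wattpadpagetext pull the page count and the t.wattpad.com
# text endpoint out of a chapter page's embedded JSON, and wattpadreplace1
# strips data-* attributes from the downloaded markup.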
wattpadmatch1 = re.compile(
    r'https?:\/\/(?:www\.)?wattpad\.com\/story\/(\d+)(?:-[-\w\d%&?@]*)?'
)
wattpadpages = re.compile(
    r'\"pages\"\s*:\s*(\d+)'
)
wattpadpagetext = re.compile(
    r'\"text\"\s*:\s*\"(https:\/\/t\.wattpad\.com\/text-[^"]+)\"'
)
wattpadreplace1 = re.compile(
    r'\ data\-[^\=]*\=\"[^\"]*\"'
)

wattpadcss = '''/* CSS for texts extracted from Wattpad */
img {
    max-width: 100%;
    max-height: 100%;
}
'''
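

# Recursive subclass discovery: AllSubclasses._all_subclasses() lets callers
# enumerate every downloader simply by subclassing AbstractBookDownloader
# (presumably how the rest of the package picks a downloader for a link).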
def all_subclasses(cls):
    return set(cls.__subclasses__()).union(
        [s for c in cls.__subclasses__() for s in all_subclasses(c)]
    )


class AllSubclasses:
    @classmethod
    def _all_subclasses(cls):
        return all_subclasses(cls)
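

# On-disk representation of a fetched book: title.txt, the raw cover bytes in
# 'cover', chapters.json (a list of [name, html] pairs), images.json (source
# URL -> md5 digest) plus one blob file per digest, and a 'cached.flag' marker.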
class CachedBook:
    def __init__(self, cache_dir):
        self._cache_dir = cache_dir

    def is_cached(self):
        return self._cache_dir.joinpath('cached.flag').exists()

    def store(self, title, cover_img, chapters, images):
        self._cache_dir.joinpath('title.txt').write_text(title)
        self._cache_dir.joinpath('cover').write_bytes(cover_img)
        self._cache_dir.joinpath('chapters.json').write_text(json.dumps(chapters))
        imgs = dict()
        for image_k, image_v in images.items():
            image_d = md5(image_v).hexdigest()
            imgs[image_k] = image_d
            if not self._cache_dir.joinpath(image_d).exists():
                self._cache_dir.joinpath(image_d).write_bytes(image_v)
        self._cache_dir.joinpath('images.json').write_text(json.dumps(imgs))
        self._cache_dir.joinpath('cached.flag').touch()

    def load(self):
        title = self._cache_dir.joinpath('title.txt').read_text()
        cover_img = self._cache_dir.joinpath('cover').read_bytes()
        chapters = json.loads(self._cache_dir.joinpath('chapters.json').read_text())
        imgs = json.loads(self._cache_dir.joinpath('images.json').read_text())
        imgs_files = {k: self._cache_dir.joinpath(k).read_bytes() for k in imgs.values()}
        return title, cover_img, chapters, imgs, imgs_files
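

# Base class for all downloaders.  Subclasses override downloads() to claim a
# link, fetch() to populate a CachedBook, and neutralize() to turn the cached
# chapters into chapter documents plus a stylesheet.  Cached data lives under
# <output dir>/cache/<downloader slug>/<md5 of the link>/.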
class AbstractBookDownloader(AllSubclasses):
    @classmethod
    def downloads(cls, link):
        return False

    def __init__(self, dirout):
        self._base_dir_out = dirout

    def fetch(self, link, ignore_cache=False):
        return None, list(), dict()

    def _cache_key(self):
        return slugify(type(self).__name__)

    def _cache_hash(self, link):
        return slugify(md5(link.encode()).hexdigest())

    def _cache_dir(self, link):
        cd = self._base_dir_out.joinpath('cache').joinpath(self._cache_key()).joinpath(self._cache_hash(link))
        cd.mkdir(parents=True, exist_ok=True)
        return cd

    def _cache(self, link):
        return CachedBook(self._cache_dir(link))

    def is_cached(self, link):
        return self._cache(link).is_cached()

    def neutralize(self, cached):
        _, __, chapters, ___, ____ = cached.load()
        return [ch[1] for ch in chapters], ''
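

# Wattpad downloader: scrapes the story landing page for the title, cover and
# table of contents, then fetches each chapter's text from the t.wattpad.com
# endpoints (one per page) together with any inline images.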
class Wattpad(AbstractBookDownloader):
    @classmethod
    def downloads(cls, link):
        return wattpadmatch1.match(link)

    def fetch(self, link, ignore_cache=False):
        if not ignore_cache:
            if self.is_cached(link):
                return self._cache(link)
        with requests.session() as sess:
            sess.headers = {'User-Agent': 'Mozilla/5.0'}
            main_raw = sess.get(link).text
            main_bs = BeautifulSoup(main_raw, 'html5lib')
            # Cover image and title come from the story's landing page.
            image_link = urljoin(
                'https://www.wattpad.com/',
                main_bs.select_one('div.cover-lg img')['src']
            )
            title = main_bs.select_one('header h1').text.strip()
            image = sess.get(image_link).content
            chapters = list()
            images = dict()
            for chapter_link_bs in main_bs.select('ul.table-of-contents li a'):
                chapter_name = chapter_link_bs.text.strip()
                chapter_link = urljoin(
                    'https://www.wattpad.com/',
                    chapter_link_bs['href']
                )
                chapter_mainpg_raw = sess.get(chapter_link).text
                # The chapter page embeds the t.wattpad.com text URL for page 1
                # and the total page count; further pages use a "-<n>" suffix.
                chapter_pages_lks = [wattpadpagetext.search(chapter_mainpg_raw).group(1)]
                chapter_pages_no = int(wattpadpages.search(chapter_mainpg_raw).group(1))
                if chapter_pages_no > 1:
                    for page in range(2, chapter_pages_no + 1):
                        bef, aft = chapter_pages_lks[0].split('?', 1)
                        chapter_pages_lks.append(bef + f'-{page}?' + aft)
                chapter_segments = ['\n'.join(sess.get(page_link).text.splitlines()) for page_link in chapter_pages_lks]
                chapter = '\n'.join(chapter_segments)
                # Download every image referenced by the chapter markup.
                for image_ in BeautifulSoup(chapter, 'html5lib').find_all('img', src=True):
                    images[image_['src']] = sess.get(image_['src']).content
                chapters.append((chapter_name, chapter))
        cache = self._cache(link)
        cache.store(title, image, chapters, images)
        return cache

    def neutralize(self, cached):
        title, _, chapters, imgs, imgs_files = cached.load()
        chapter_names = [ch[0] for ch in chapters]
        chapters = [ch[1] for ch in chapters]
        for i in range(len(chapters)):
            # Point image references at the cached files under ../images/ and
            # strip Wattpad's data-* attributes.
            for k, v in imgs.items():
                chapters[i] = chapters[i].replace(k, f'../images/{v}.{filetype.guess(imgs_files[v]).extension}')
            chapters[i] = wattpadreplace1.sub('', chapters[i])
            # Re-parse as XML to build an XHTML document with a proper head.
            xhtml = BeautifulSoup(BeautifulSoup(chapters[i], 'html5lib').prettify(), 'xml')
            xhtml.head.append(xhtml.new_tag('title'))
            xhtml.title.string = title
            metatag = xhtml.new_tag('meta')
            metatag.attrs['http-equiv'] = 'Content-Type'
            metatag.attrs['content'] = 'text/html; charset=utf-8'
            xhtml.head.append(metatag)
            linktag = xhtml.new_tag('link')
            linktag.attrs['rel'] = 'stylesheet'
            linktag.attrs['type'] = 'text/css'
            linktag.attrs['href'] = '../stylesheet.css'
            xhtml.head.append(linktag)
            chapnametag = xhtml.new_tag('h2')
            chapnametag.string = chapter_names[i]
            xhtml.body.insert(0, chapnametag)
            xhtml.html['xmlns'] = "http://www.w3.org/1999/xhtml"
            chapters[i] = xhtml.prettify()
        return chapters, wattpadcss
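

# Placeholder downloaders for other sites; downloads() still returns False, so
# they are never selected.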
class SpiritFanFiction(AbstractBookDownloader):
    pass


class Tapas(AbstractBookDownloader):
    pass


class FanFiction(AbstractBookDownloader):
    pass
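

# Illustrative usage sketch (not part of the original module): how a caller
# might select a downloader for a link and build the chapter documents.
# Assumes the output directory is a pathlib.Path; the URL is hypothetical.
#
#     from pathlib import Path
#     link = 'https://www.wattpad.com/story/123456-example'
#     candidates = [d for d in AbstractBookDownloader._all_subclasses()
#                   if d.downloads(link)]
#     downloader = candidates[0](Path('out'))
#     cached = downloader.fetch(link)
#     chapters, css = downloader.neutralize(cached)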