#!/usr/bin/env python3
|
|
# -*- encoding: utf-8 -*-
|
|
|
|
import re
|
|
import json
|
|
import requests
|
|
import filetype
|
|
from hashlib import md5
|
|
from slugify import slugify
|
|
from bs4 import BeautifulSoup
|
|
from bs4 import BeautifulStoneSoup
|
|
from urllib.parse import urljoin
|
|
|
|
# Matches a Wattpad story URL and captures the numeric story id in group 1.
# Fix: the original pattern required a "www." prefix (with an unescaped dot
# that matched any character); "www." is now optional and the domain dots
# are escaped, so both wattpad.com and www.wattpad.com story links match.
wattpadmatch1 = re.compile(
    r'https?:\/\/(?:www\.)?wattpad\.com\/story\/(\d+)(?:-[-\w\d%&?@]*)?'
)

# Extracts the page count ("pages": N) from the JSON embedded in a chapter page.
wattpadpages = re.compile(
    r'\"pages\"\s*:\s*(\d+)'
)

# Extracts the text-delivery URL ("text": "https://t.wattpad.com/text-...").
wattpadpagetext = re.compile(
    r'\"text\"\s*:\s*\"(https:\/\/t\.wattpad\.com\/text-[^"]+)\"'
)

# Matches a leading-space-prefixed data-* attribute (e.g. ` data-p-id="..."`)
# so Wattpad's tracking attributes can be stripped from downloaded HTML.
wattpadreplace1 = re.compile(
    r'\ data\-[^\=]*\=\"[^\"]*\"'
)
# Minimal stylesheet bundled with every extracted book; constrains images
# so they never overflow the e-reader page.  Returned by neutralize().
wattpadcss = '''/* CSS for texts extracted from Wattpad */
img {
max-width: 100%;
max-height: 100%;
}
'''
def all_subclasses(cls):
    """Return the set of every direct and indirect subclass of *cls*."""
    direct = cls.__subclasses__()
    found = set(direct)
    for child in direct:
        found |= all_subclasses(child)
    return found
class AllSubclasses:
    """Mixin that lets a class enumerate all of its transitive subclasses."""

    @classmethod
    def _all_subclasses(cls):
        """Return the set of every direct and indirect subclass of this class."""
        found = set()
        pending = list(cls.__subclasses__())
        while pending:
            sub = pending.pop()
            if sub not in found:
                found.add(sub)
                pending.extend(sub.__subclasses__())
        return found
class CachedBook:
    """On-disk cache for one downloaded book.

    Layout inside *cache_dir*:
      title.txt      - book title (UTF-8 text)
      cover          - raw cover image bytes
      chapters.json  - list of (name, html) chapter pairs
      images.json    - maps original image URL -> md5 digest of its bytes
      <md5 digest>   - raw bytes of each image, stored under its digest
      cached.flag    - marker; its existence means the cache is complete
    """

    def __init__(self, cache_dir):
        # cache_dir: a pathlib.Path-like directory assumed to already exist.
        self._cache_dir = cache_dir

    def is_cached(self):
        """Return True when a complete book has been stored in this directory."""
        return self._cache_dir.joinpath('cached.flag').exists()

    def store(self, title, cover_img, chapters, images):
        """Persist *title*, *cover_img* (bytes), *chapters* and *images*.

        The completion flag is written last so an interrupted store is never
        mistaken for a finished cache.  Text files are written with an explicit
        UTF-8 encoding (the previous locale-dependent default could fail on
        non-ASCII titles/chapters on some platforms).
        """
        self._cache_dir.joinpath('title.txt').write_text(title, encoding='utf-8')
        self._cache_dir.joinpath('cover').write_bytes(cover_img)
        self._cache_dir.joinpath('chapters.json').write_text(
            json.dumps(chapters), encoding='utf-8')
        imgs = {}
        for image_url, image_bytes in images.items():
            digest = md5(image_bytes).hexdigest()
            imgs[image_url] = digest
            # Content-addressed storage: identical images are written once.
            if not self._cache_dir.joinpath(digest).exists():
                self._cache_dir.joinpath(digest).write_bytes(image_bytes)
        self._cache_dir.joinpath('images.json').write_text(
            json.dumps(imgs), encoding='utf-8')
        self._cache_dir.joinpath('cached.flag').touch()

    def load(self):
        """Return (title, cover bytes, chapters, url->digest map, digest->bytes map).

        Note: chapters stored as tuples come back as lists (JSON round-trip).
        """
        title = self._cache_dir.joinpath('title.txt').read_text(encoding='utf-8')
        cover_img = self._cache_dir.joinpath('cover').read_bytes()
        chapters = json.loads(
            self._cache_dir.joinpath('chapters.json').read_text(encoding='utf-8'))
        imgs = json.loads(
            self._cache_dir.joinpath('images.json').read_text(encoding='utf-8'))
        imgs_files = {k: self._cache_dir.joinpath(k).read_bytes() for k in imgs.values()}
        return title, cover_img, chapters, imgs, imgs_files
class AbstractBookDownloader(AllSubclasses):
    """Shared cache plumbing for every site-specific downloader.

    Subclasses override ``downloads`` (link recognition) and ``fetch``
    (actual scraping); cache naming and lookup are inherited from here.
    """

    @classmethod
    def downloads(cls, link):
        """Report whether this downloader can handle *link*; base: never."""
        return False

    def __init__(self, dirout):
        # Root output directory; cached books live under <dirout>/cache/.
        self._base_dir_out = dirout

    def fetch(self, link, ignore_cache = False):
        """Download *link*.  The base implementation yields an empty result."""
        return None, [], {}

    def _cache_key(self):
        # One cache namespace per downloader class, slugified for the filesystem.
        return slugify(type(self).__name__)

    def _cache_hash(self, link):
        # Stable, filesystem-safe directory name derived from the link.
        return slugify(md5(link.encode()).hexdigest())

    def _cache_dir(self, link):
        # <dirout>/cache/<downloader-slug>/<link-hash>, created on demand.
        cd = self._base_dir_out.joinpath('cache', self._cache_key(), self._cache_hash(link))
        cd.mkdir(parents=True, exist_ok=True)
        return cd

    def _cache(self, link):
        return CachedBook(self._cache_dir(link))

    def is_cached(self, link):
        """True when *link* has already been fully downloaded and stored."""
        return self._cache(link).is_cached()

    def neutralize(self, cached):
        """Turn a cached book into (chapter bodies, css); base adds no styling."""
        chapters = cached.load()[2]
        return [body for _, body in chapters], ''
class Wattpad(AbstractBookDownloader):
    """Downloader for wattpad.com stories."""

    @classmethod
    def downloads(cls, link):
        # Truthy re.Match when *link* is a Wattpad story URL, else None.
        return wattpadmatch1.match(link)

    def fetch(self, link, ignore_cache = False):
        """Download the story at *link* (or reuse its cache) and return a CachedBook."""
        if not ignore_cache:
            if self.is_cached(link):
                return self._cache(link)
        with requests.session() as sess:
            # NOTE(review): presumably Wattpad rejects requests without a
            # browser-like User-Agent — confirm before changing.
            sess.headers = {'User-Agent': 'Mozilla/5.0'}
            main_raw = sess.get(link).text
            main_bs = BeautifulSoup(main_raw, 'html5lib')
            # Cover image and title scraped from the story's landing page.
            image_link = urljoin(
                'https://www.wattpad.com/',
                main_bs.select_one('div.cover-lg img')['src']
            )
            title = main_bs.select_one('header h1').text.strip()
            image = sess.get(image_link).content
            chapters = list()        # list of (chapter name, chapter HTML)
            images = dict()          # image URL -> raw image bytes
            for chapter_link_bs in main_bs.select('ul.table-of-contents li a'):
                chapter_name = chapter_link_bs.text.strip()
                chapter_link = urljoin(
                    'https://www.wattpad.com/',
                    chapter_link_bs['href']
                )
                chapter_mainpg_raw = sess.get(chapter_link).text
                # First text-delivery URL and page count, both pulled out of the
                # JSON embedded in the chapter page.
                chapter_pages_lks = [wattpadpagetext.search(chapter_mainpg_raw).group(1)]
                chapter_pages_no = int(wattpadpages.search(chapter_mainpg_raw).group(1))
                if chapter_pages_no > 1:
                    # Page N's URL is page 1's URL with "-N" inserted just before
                    # the query string (assumed t.wattpad.com convention — TODO
                    # confirm against a multi-page chapter).
                    for page in range(2, chapter_pages_no+1):
                        bef, aft = chapter_pages_lks[0].split('?', 1)
                        chapter_pages_lks.append(bef+f'-{page}?'+aft)
                # Fetch every page and normalize line endings via splitlines/join.
                chapter_segments = ['\n'.join(sess.get(link).text.splitlines()) for link in chapter_pages_lks]
                chapter = '\n'.join(chapter_segments)
                # Download every image referenced by the chapter body.
                for image_ in BeautifulSoup(chapter, 'html5lib').find_all('img', src=True):
                    images[image_['src']] = sess.get(image_['src']).content
                chapters.append((chapter_name, chapter))
            cache = self._cache(link)
            cache.store(title, image, chapters, images)
            return cache

    def neutralize(self, cached):
        """Convert a cached story into per-chapter XHTML documents plus its CSS.

        Returns (list of XHTML chapter strings, wattpadcss).
        """
        title, _, chapters, imgs, imgs_files = cached.load()
        chapter_names = [ch[0] for ch in chapters]
        chapters = [ch[1] for ch in chapters]
        for i in range(len(chapters)):
            # Point every image URL at its locally cached copy; the extension is
            # guessed from the image bytes via filetype.
            for k, v in imgs.items():
                chapters[i] = chapters[i].replace(k, f'../images/{v}.{filetype.guess(imgs_files[v]).extension}')
            # Strip Wattpad's data-* attributes.
            chapters[i] = wattpadreplace1.sub('', chapters[i])
            # Parse as HTML first (tolerant), then re-parse the prettified output
            # as XML to obtain a well-formed XHTML tree.
            xhtml = BeautifulSoup(BeautifulSoup(chapters[i], 'html5lib').prettify(), 'xml')
            xhtml.head.append(xhtml.new_tag('title'))
            xhtml.title.string = title
            # Declare the charset for EPUB-style readers.
            metatag = xhtml.new_tag('meta')
            metatag.attrs['http-equiv'] = 'Content-Type'
            metatag.attrs['content'] = 'text/html; charset=utf-8'
            xhtml.head.append(metatag)
            # Link the shared stylesheet (written elsewhere as stylesheet.css).
            linktag = xhtml.new_tag('link')
            linktag.attrs['rel'] = 'stylesheet'
            linktag.attrs['type'] = 'text/css'
            linktag.attrs['href'] = '../stylesheet.css'
            xhtml.head.append(linktag)
            # Prepend the chapter name as a heading.
            chapnametag = xhtml.new_tag('h2')
            chapnametag.string = chapter_names[i]
            xhtml.body.insert(0, chapnametag)
            xhtml.html['xmlns'] = "http://www.w3.org/1999/xhtml"
            chapters[i] = xhtml.prettify()
        return chapters, wattpadcss
class SpiritFanFiction(AbstractBookDownloader):
    # Placeholder downloader — not implemented yet; inherits the base's
    # "handles nothing" behavior.
    pass
class Tapas(AbstractBookDownloader):
    # Placeholder downloader — not implemented yet; inherits the base's
    # "handles nothing" behavior.
    pass
class FanFiction(AbstractBookDownloader):
    # Placeholder downloader — not implemented yet; inherits the base's
    # "handles nothing" behavior.
    pass