#!/usr/bin/env python3
# -*- encoding: utf-8 -*-

import re
import json
from hashlib import md5
from urllib.parse import urljoin

import requests
import filetype
from slugify import slugify
from bs4 import BeautifulSoup

# Regexes for recognising Wattpad story URLs and for scraping chapter pages:
# the page count and the t.wattpad.com text-API links embedded in the chapter
# HTML, plus a cleaner that strips Wattpad's data-* attributes.
wattpadmatch1 = re.compile(r'https?://(?:www\.)?wattpad\.com/story/(\d+)(?:-[-\w\d%&?@]*)?')
wattpadpages = re.compile(r'"pages"\s*:\s*(\d+)')
wattpadpagetext = re.compile(r'"text"\s*:\s*"(https://t\.wattpad\.com/text-[^"]+)"')
wattpadreplace1 = re.compile(r' data-[^=]*="[^"]*"')

wattpadcss = '''/* CSS for texts extracted from Wattpad */
img {
    max-width: 100%;
    max-height: 100%;
}
'''


def all_subclasses(cls):
    """Return every direct and indirect subclass of ``cls``."""
    return set(cls.__subclasses__()).union(
        s for c in cls.__subclasses__() for s in all_subclasses(c)
    )


class AllSubclasses:
    @classmethod
    def _all_subclasses(cls):
        return all_subclasses(cls)


class CachedBook:
    """On-disk cache of a downloaded book: title, cover, chapters and images."""

    def __init__(self, cache_dir):
        self._cache_dir = cache_dir

    def is_cached(self):
        return self._cache_dir.joinpath('cached.flag').exists()

    def store(self, title, cover_img, chapters, images):
        self._cache_dir.joinpath('title.txt').write_text(title)
        self._cache_dir.joinpath('cover').write_bytes(cover_img)
        self._cache_dir.joinpath('chapters.json').write_text(json.dumps(chapters))
        # Deduplicate images by content hash; images.json maps URL -> hash.
        imgs = dict()
        for image_k, image_v in images.items():
            image_d = md5(image_v).hexdigest()
            imgs[image_k] = image_d
            if not self._cache_dir.joinpath(image_d).exists():
                self._cache_dir.joinpath(image_d).write_bytes(image_v)
        self._cache_dir.joinpath('images.json').write_text(json.dumps(imgs))
        self._cache_dir.joinpath('cached.flag').touch()

    def load(self):
        title = self._cache_dir.joinpath('title.txt').read_text()
        cover_img = self._cache_dir.joinpath('cover').read_bytes()
        chapters = json.loads(self._cache_dir.joinpath('chapters.json').read_text())
        imgs = json.loads(self._cache_dir.joinpath('images.json').read_text())
        imgs_files = {k: self._cache_dir.joinpath(k).read_bytes() for k in imgs.values()}
        return title, cover_img, chapters, imgs, imgs_files


class AbstractBookDownloader(AllSubclasses):
    """Base class for site-specific downloaders, discovered via _all_subclasses()."""

    @classmethod
    def downloads(cls, link):
        """Return truthy if this downloader can handle ``link``."""
        return False

    def __init__(self, dirout):
        self._base_dir_out = dirout

    def fetch(self, link, ignore_cache=False):
        """Download the book behind ``link`` and return a CachedBook."""
        raise NotImplementedError

    def _cache_key(self):
        return slugify(type(self).__name__)

    def _cache_hash(self, link):
        return slugify(md5(link.encode()).hexdigest())

    def _cache_dir(self, link):
        cd = (self._base_dir_out.joinpath('cache')
              .joinpath(self._cache_key())
              .joinpath(self._cache_hash(link)))
        cd.mkdir(parents=True, exist_ok=True)
        return cd

    def _cache(self, link):
        return CachedBook(self._cache_dir(link))

    def is_cached(self, link):
        return self._cache(link).is_cached()

    def neutralize(self, cached):
        """Turn a CachedBook into (list of chapter bodies, stylesheet) for packaging."""
        _, _, chapters, _, _ = cached.load()
        return [ch[1] for ch in chapters], ''


class Wattpad(AbstractBookDownloader):
    @classmethod
    def downloads(cls, link):
        return wattpadmatch1.match(link)

    def fetch(self, link, ignore_cache=False):
        if not ignore_cache and self.is_cached(link):
            return self._cache(link)
        with requests.Session() as sess:
            sess.headers['User-Agent'] = 'Mozilla/5.0'
            main_raw = sess.get(link).text
            main_bs = BeautifulSoup(main_raw, 'html5lib')
            image_link = urljoin('https://www.wattpad.com/',
                                 main_bs.select_one('div.cover-lg img')['src'])
            title = main_bs.select_one('header h1').text.strip()
            image = sess.get(image_link).content
            chapters = list()
            images = dict()
            for chapter_link_bs in main_bs.select('ul.table-of-contents li a'):
                chapter_name = chapter_link_bs.text.strip()
                chapter_link = urljoin('https://www.wattpad.com/',
                                       chapter_link_bs['href'])
                chapter_mainpg_raw = sess.get(chapter_link).text
                # The chapter page embeds the text-API URL of its first page
                # and the total page count; pages 2..n reuse the first URL
                # with a "-<page>" suffix inserted before the query string.
                chapter_pages_lks = [wattpadpagetext.search(chapter_mainpg_raw).group(1)]
                chapter_pages_no = int(wattpadpages.search(chapter_mainpg_raw).group(1))
                for page in range(2, chapter_pages_no + 1):
                    bef, aft = chapter_pages_lks[0].split('?', 1)
                    chapter_pages_lks.append(bef + f'-{page}?' + aft)
                # Fetch every page and normalise line endings before joining.
                chapter_segments = ['\n'.join(sess.get(page_link).text.splitlines())
                                    for page_link in chapter_pages_lks]
                chapter = '\n'.join(chapter_segments)
                for image_ in BeautifulSoup(chapter, 'html5lib').find_all('img', src=True):
                    images[image_['src']] = sess.get(image_['src']).content
                chapters.append((chapter_name, chapter))
        cache = self._cache(link)
        cache.store(title, image, chapters, images)
        return cache

    def neutralize(self, cached):
        title, _, chapters, imgs, imgs_files = cached.load()
        chapter_names = [ch[0] for ch in chapters]
        chapters = [ch[1] for ch in chapters]
        for i in range(len(chapters)):
            # Point <img> tags at the cached image files and drop Wattpad's
            # data-* attributes before converting the chapter to XHTML.
            for k, v in imgs.items():
                ext = filetype.guess(imgs_files[v]).extension
                chapters[i] = chapters[i].replace(k, f'../images/{v}.{ext}')
            chapters[i] = wattpadreplace1.sub('', chapters[i])
            xhtml = BeautifulSoup(BeautifulSoup(chapters[i], 'html5lib').prettify(), 'xml')
            xhtml.head.append(xhtml.new_tag('title'))
            xhtml.title.string = title
            metatag = xhtml.new_tag('meta')
            metatag.attrs['http-equiv'] = 'Content-Type'
            metatag.attrs['content'] = 'text/html; charset=utf-8'
            xhtml.head.append(metatag)
            linktag = xhtml.new_tag('link')
            linktag.attrs['rel'] = 'stylesheet'
            linktag.attrs['type'] = 'text/css'
            linktag.attrs['href'] = '../stylesheet.css'
            xhtml.head.append(linktag)
            chapnametag = xhtml.new_tag('h2')
            chapnametag.string = chapter_names[i]
            xhtml.body.insert(0, chapnametag)
            xhtml.html['xmlns'] = 'http://www.w3.org/1999/xhtml'
            chapters[i] = xhtml.prettify()
        return chapters, wattpadcss


class SpiritFanFiction(AbstractBookDownloader):
    pass


class Tapas(AbstractBookDownloader):
    pass


class FanFiction(AbstractBookDownloader):
    pass
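

# A minimal usage sketch, not part of the original module: it assumes the
# intended flow is to pick the downloader whose downloads() matches a link,
# fetch into the cache, and neutralize the result. The CLI arguments and the
# printed summary below are illustrative assumptions, not the original CLI.
if __name__ == '__main__':
    import sys
    from pathlib import Path

    url = sys.argv[1]  # e.g. a https://www.wattpad.com/story/... link
    out = Path(sys.argv[2]) if len(sys.argv) > 2 else Path('.')

    # Subclass discovery is what AllSubclasses appears to exist for:
    # try each concrete downloader until one claims the link.
    for dl_cls in AbstractBookDownloader._all_subclasses():
        if dl_cls.downloads(url):
            downloader = dl_cls(out)
            cached = downloader.fetch(url)
            chapters, css = downloader.neutralize(cached)
            print(f'Fetched {len(chapters)} chapter(s)')
            break
    else:
        print('No downloader matches', url, file=sys.stderr)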