python-ebooker/ebooker/downloaders.py

#!/usr/bin/env python3
# -*- encoding: utf-8 -*-
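"""Downloaders that turn online stories into cached e-book source material.

Each downloader subclasses AbstractBookDownloader: ``downloads`` reports
whether it can handle a link, ``fetch`` scrapes the story into an on-disk
CachedBook, and ``neutralize`` turns the cached chapters into output markup.
"""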
import re
import json
import requests
import filetype
from hashlib import md5
from slugify import slugify
from bs4 import BeautifulSoup
from bs4 import BeautifulStoneSoup
from urllib.parse import urljoin
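
# Patterns for the Wattpad scraper: wattpadmatch1 recognises story URLs,
# wattpadpages and wattpadpagetext pull the page count and the t.wattpad.com
# text endpoint out of a chapter page's embedded JSON, and wattpadreplace1
# strips data-* attributes from the downloaded markup.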
wattpadmatch1 = re.compile(
    r'https?:\/\/(?:www\.)?wattpad\.com\/story\/(\d+)(?:-[-\w\d%&?@]*)?'
)
wattpadpages = re.compile(
    r'\"pages\"\s*:\s*(\d+)'
)
wattpadpagetext = re.compile(
    r'\"text\"\s*:\s*\"(https:\/\/t\.wattpad\.com\/text-[^"]+)\"'
)
wattpadreplace1 = re.compile(
    r'\ data\-[^\=]*\=\"[^\"]*\"'
)

wattpadcss = '''/* CSS for texts extracted from Wattpad */
img {
    max-width: 100%;
    max-height: 100%;
}
'''
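

# Recursive subclass discovery: AllSubclasses._all_subclasses() lets callers
# enumerate every downloader simply by subclassing AbstractBookDownloader
# (presumably how the rest of the package picks a downloader for a link).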
def all_subclasses(cls):
    return set(cls.__subclasses__()).union(
        [s for c in cls.__subclasses__() for s in all_subclasses(c)]
    )


class AllSubclasses:
    @classmethod
    def _all_subclasses(cls):
        return all_subclasses(cls)
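

# On-disk representation of a fetched book: title.txt, the raw cover bytes in
# 'cover', chapters.json (a list of [name, html] pairs), images.json (source
# URL -> md5 digest) plus one blob file per digest, and a 'cached.flag' marker.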
class CachedBook:
    def __init__(self, cache_dir):
        self._cache_dir = cache_dir

    def is_cached(self):
        return self._cache_dir.joinpath('cached.flag').exists()

    def store(self, title, cover_img, chapters, images):
        self._cache_dir.joinpath('title.txt').write_text(title)
        self._cache_dir.joinpath('cover').write_bytes(cover_img)
        self._cache_dir.joinpath('chapters.json').write_text(json.dumps(chapters))
        imgs = dict()
        for image_k, image_v in images.items():
            image_d = md5(image_v).hexdigest()
            imgs[image_k] = image_d
            if not self._cache_dir.joinpath(image_d).exists():
                self._cache_dir.joinpath(image_d).write_bytes(image_v)
        self._cache_dir.joinpath('images.json').write_text(json.dumps(imgs))
        self._cache_dir.joinpath('cached.flag').touch()

    def load(self):
        title = self._cache_dir.joinpath('title.txt').read_text()
        cover_img = self._cache_dir.joinpath('cover').read_bytes()
        chapters = json.loads(self._cache_dir.joinpath('chapters.json').read_text())
        imgs = json.loads(self._cache_dir.joinpath('images.json').read_text())
        imgs_files = {k: self._cache_dir.joinpath(k).read_bytes() for k in imgs.values()}
        return title, cover_img, chapters, imgs, imgs_files
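

# Base class for all downloaders.  Subclasses override downloads() to claim a
# link, fetch() to populate a CachedBook, and neutralize() to turn the cached
# chapters into chapter documents plus a stylesheet.  Cached data lives under
# <output dir>/cache/<downloader slug>/<md5 of the link>/.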
class AbstractBookDownloader(AllSubclasses):
    @classmethod
    def downloads(cls, link):
        return False

    def __init__(self, dirout):
        self._base_dir_out = dirout

    def fetch(self, link, ignore_cache=False):
        return None, list(), dict()

    def _cache_key(self):
        return slugify(type(self).__name__)

    def _cache_hash(self, link):
        return slugify(md5(link.encode()).hexdigest())

    def _cache_dir(self, link):
        cd = self._base_dir_out.joinpath('cache').joinpath(self._cache_key()).joinpath(self._cache_hash(link))
        cd.mkdir(parents=True, exist_ok=True)
        return cd

    def _cache(self, link):
        return CachedBook(self._cache_dir(link))

    def is_cached(self, link):
        return self._cache(link).is_cached()

    def neutralize(self, cached):
        _, __, chapters, ___, ____ = cached.load()
        return [ch[1] for ch in chapters], ''
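

# Wattpad downloader: scrapes the story landing page for the title, cover and
# table of contents, then fetches each chapter's text from the t.wattpad.com
# endpoints (one per page) together with any inline images.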
class Wattpad(AbstractBookDownloader):
    @classmethod
    def downloads(cls, link):
        return wattpadmatch1.match(link)

    def fetch(self, link, ignore_cache=False):
        if not ignore_cache:
            if self.is_cached(link):
                return self._cache(link)
        with requests.session() as sess:
            sess.headers = {'User-Agent': 'Mozilla/5.0'}
            main_raw = sess.get(link).text
            main_bs = BeautifulSoup(main_raw, 'html5lib')
            # Cover image and title come from the story's landing page.
            image_link = urljoin(
                'https://www.wattpad.com/',
                main_bs.select_one('div.cover-lg img')['src']
            )
            title = main_bs.select_one('header h1').text.strip()
            image = sess.get(image_link).content
            chapters = list()
            images = dict()
            for chapter_link_bs in main_bs.select('ul.table-of-contents li a'):
                chapter_name = chapter_link_bs.text.strip()
                chapter_link = urljoin(
                    'https://www.wattpad.com/',
                    chapter_link_bs['href']
                )
                chapter_mainpg_raw = sess.get(chapter_link).text
                # The chapter page embeds the t.wattpad.com text URL for page 1
                # and the total page count; further pages use a "-<n>" suffix.
                chapter_pages_lks = [wattpadpagetext.search(chapter_mainpg_raw).group(1)]
                chapter_pages_no = int(wattpadpages.search(chapter_mainpg_raw).group(1))
                if chapter_pages_no > 1:
                    for page in range(2, chapter_pages_no + 1):
                        bef, aft = chapter_pages_lks[0].split('?', 1)
                        chapter_pages_lks.append(bef + f'-{page}?' + aft)
                chapter_segments = ['\n'.join(sess.get(page_link).text.splitlines()) for page_link in chapter_pages_lks]
                chapter = '\n'.join(chapter_segments)
                # Download every image referenced by the chapter markup.
                for image_ in BeautifulSoup(chapter, 'html5lib').find_all('img', src=True):
                    images[image_['src']] = sess.get(image_['src']).content
                chapters.append((chapter_name, chapter))
        cache = self._cache(link)
        cache.store(title, image, chapters, images)
        return cache

    def neutralize(self, cached):
        title, _, chapters, imgs, imgs_files = cached.load()
        chapter_names = [ch[0] for ch in chapters]
        chapters = [ch[1] for ch in chapters]
        for i in range(len(chapters)):
            # Point image references at the cached files under ../images/ and
            # strip Wattpad's data-* attributes.
            for k, v in imgs.items():
                chapters[i] = chapters[i].replace(k, f'../images/{v}.{filetype.guess(imgs_files[v]).extension}')
            chapters[i] = wattpadreplace1.sub('', chapters[i])
            # Re-parse as XML to build an XHTML document with a proper head.
            xhtml = BeautifulSoup(BeautifulSoup(chapters[i], 'html5lib').prettify(), 'xml')
            xhtml.head.append(xhtml.new_tag('title'))
            xhtml.title.string = title
            metatag = xhtml.new_tag('meta')
            metatag.attrs['http-equiv'] = 'Content-Type'
            metatag.attrs['content'] = 'text/html; charset=utf-8'
            xhtml.head.append(metatag)
            linktag = xhtml.new_tag('link')
            linktag.attrs['rel'] = 'stylesheet'
            linktag.attrs['type'] = 'text/css'
            linktag.attrs['href'] = '../stylesheet.css'
            xhtml.head.append(linktag)
            chapnametag = xhtml.new_tag('h2')
            chapnametag.string = chapter_names[i]
            xhtml.body.insert(0, chapnametag)
            xhtml.html['xmlns'] = "http://www.w3.org/1999/xhtml"
            chapters[i] = xhtml.prettify()
        return chapters, wattpadcss
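

# Placeholder downloaders for other sites; downloads() still returns False, so
# they are never selected.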
class SpiritFanFiction(AbstractBookDownloader):
    pass


class Tapas(AbstractBookDownloader):
    pass


class FanFiction(AbstractBookDownloader):
    pass
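

# Illustrative usage sketch (not part of the original module): how a caller
# might select a downloader for a link and build the chapter documents.
# Assumes the output directory is a pathlib.Path; the URL is hypothetical.
#
#     from pathlib import Path
#     link = 'https://www.wattpad.com/story/123456-example'
#     candidates = [d for d in AbstractBookDownloader._all_subclasses()
#                   if d.downloads(link)]
#     downloader = candidates[0](Path('out'))
#     cached = downloader.fetch(link)
#     chapters, css = downloader.neutralize(cached)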