#!/usr/bin/env python3
# -*- encoding: utf-8 -*-
"""Parsers that turn CSV or HTML text obtained from a reader into ``Table`` objects."""

import csv
from io import StringIO
from typing import List, Optional, Tuple

import bs4

from .reader import AbstractReader
from .table import Table


class AbstractTableParser:
    """Base class for parsers that build a ``Table`` from a reader's content."""

    def __init__(self, reader: AbstractReader):
        """Store *reader*; nothing is read until ``get_read`` is first called."""
        self._reader = reader
        # Cached result of ``reader.read()``; stays ``None`` until requested.
        self._read: Optional[str] = None

    def get_read(self) -> str:
        """Return the reader's content, reading it on first use and caching it."""
        if self._read is None:
            self._read = self._reader.read()
        return self._read

    def parse(self, transpose: bool) -> Table:
        """Parse the content into a ``Table``; subclasses must override this."""
        raise NotImplementedError()


class CsvTableParser(AbstractTableParser):
    """Parser for CSV text."""

    def parse(self, transpose: bool) -> Table:
        """Parse the content as CSV and wrap every field in a cell tuple.

        Each field becomes ``(field, '', '')``, the same 3-tuple shape the
        HTML parsers in this module produce.

        NOTE(review): ``transpose`` is accepted but not applied here, unlike
        in the HTML parsers -- confirm whether that is intentional.
        """
        rows: List[List[str]] = list(csv.reader(StringIO(self.get_read())))
        cells = [[(field, '', '') for field in row] for row in rows]
        return Table(cells)


class HtmlTableParser(AbstractTableParser):
    """Parser for the first ``<table>`` element of an HTML document."""

    @staticmethod
    def _parse_cell(cell: bs4.element.Tag) -> Tuple[str, str, str]:
        """Build the ``(text, data-content, '')`` tuple for one td/th element."""
        text = cell.get_text(separator=" \n", strip=True)
        # Only descendants of the cell are searched; when none carries a
        # ``data-content`` attribute the second slot is the empty string.
        holder = cell.select_one('[data-content]')
        content = '' if holder is None else holder['data-content'].strip()
        return (text, content, '')

    def _parse_tbl(self,
                   tbl: bs4.element.Tag,
                   transpose: bool) -> List[List[Tuple[str, str, str]]]:
        """Convert a ``<table>`` tag into a matrix of cell tuples.

        Every ``<tr>`` found under *tbl* becomes one row and every
        ``<td>``/``<th>`` found under that row becomes one cell.

        :param tbl: the table element to convert.
        :param transpose: when true, rows and columns are swapped.
        :return: list of rows, each a list of ``(text, data-content, '')``.
        """
        matrix = [
            [self._parse_cell(cell) for cell in row.find_all(['td', 'th'])]
            for row in tbl.find_all('tr')
        ]
        if transpose:
            # NOTE: zip() stops at the shortest row, so when rows have
            # different lengths the surplus cells are dropped here.
            matrix = [list(column) for column in zip(*matrix)]
        return matrix

    def parse(self, transpose: bool) -> Table:
        """Parse the first ``<table>`` of the HTML content into a ``Table``.

        :raises ValueError: if the document contains no ``<table>`` element.
        """
        soup = bs4.BeautifulSoup(self.get_read(), 'html5lib')
        first_table = soup.find('table')
        if first_table is None:
            # Fail with a clear message instead of an AttributeError raised
            # from deep inside _parse_tbl.
            raise ValueError('no <table> element found in the HTML input')
        return Table(self._parse_tbl(first_table, transpose))


class HtmlMultiTableParser(HtmlTableParser):
    """Parser that merges every ``<table>`` element of an HTML document."""

    def parse(self, transpose: bool) -> Table:
        """Parse all tables in the HTML content and merge them into one ``Table``.

        The non-empty text of the tags preceding each table (its previous
        siblings, in document order, joined by newlines) is prepended to the
        text of the cell at row 1, column 0 of that table. If the parsed
        table has no such cell, the extra text is not attached.
        """
        soup = bs4.BeautifulSoup(self.get_read(), 'html5lib')
        tables: List[Table] = []
        for table_tag in soup.find_all('table'):
            # find_previous_siblings() lists the nearest sibling first;
            # reverse it to restore document order.
            preceding = reversed(table_tag.find_previous_siblings())
            stripped = (sibling.get_text().strip() for sibling in preceding)
            context = '\n'.join(text for text in stripped if text)
            tbl = self._parse_tbl(table_tag, transpose)
            # Guard the target cell: a table with fewer than two rows (or an
            # empty second row) must not abort the parsing of all tables.
            if context and len(tbl) > 1 and tbl[1]:
                text, content, _ = tbl[1][0]
                tbl[1][0] = (f'{context} {text}', content, '')
            tables.append(Table(tbl))
        return Table.merge_from(tables)