75 lines
2.3 KiB
Python
75 lines
2.3 KiB
Python
|
#!/usr/bin/env python3
|
||
|
# -*- encoding: utf-8 -*-
|
||
|
|
||
|
import csv
|
||
|
from io import StringIO
|
||
|
from typing import List, Optional, Tuple
|
||
|
|
||
|
import bs4
|
||
|
|
||
|
from .reader import AbstractReader
|
||
|
from .table import Table
|
||
|
|
||
|
|
||
|
class AbstractTableParser:
    """Base class for parsers that turn a reader's text into a ``Table``.

    The reader's content is fetched lazily on the first ``get_read`` call
    and cached on the instance for later calls.
    """

    def __init__(self, reader: AbstractReader):
        """Store *reader*; nothing is read until ``get_read`` is called."""
        self._reader = reader
        # Cached reader content; ``None`` means "not read yet".
        self._read: Optional[str] = None

    def get_read(self) -> str:
        """Return the reader's content, calling ``reader.read()`` only when
        no non-``None`` value has been cached yet."""
        cached = self._read
        if cached is not None:
            return cached
        cached = self._reader.read()
        self._read = cached
        return cached

    def parse(self, transpose: bool) -> Table:
        """Parse the content into a ``Table``; subclasses must override.

        Raises:
            NotImplementedError: always, in this base class.
        """
        raise NotImplementedError()
|
||
|
|
||
|
|
||
|
class CsvTableParser(AbstractTableParser):
    """Parser for CSV text; every CSV field becomes one table cell."""

    def parse(self, transpose: bool) -> Table:
        """Parse the reader's CSV content into a ``Table``.

        Args:
            transpose: when true, rows and columns are swapped before the
                table is built. Rows shorter than the longest row are
                padded with empty fields first so no field is dropped.

        Returns:
            A ``Table`` built from ``(field_text, '', '')`` tuples, one per
            CSV field.
        """
        rows: List[List[str]] = list(csv.reader(StringIO(self.get_read())))
        if transpose:
            # Previously this flag was silently ignored for CSV input.
            # Pad to a rectangle first: a bare zip(*rows) would truncate
            # every column to the length of the shortest row.
            width = max(map(len, rows), default=0)
            padded = [row + [''] * (width - len(row)) for row in rows]
            rows = [list(column) for column in zip(*padded)]
        cells = [[(field, '', '') for field in row] for row in rows]
        return Table(cells)
|
||
|
|
||
|
|
||
|
class HtmlTableParser(AbstractTableParser):
    """Parser for the first ``<table>`` element of an HTML document."""

    @staticmethod
    def _parse_cell(cell: bs4.element.Tag) -> Tuple[str, str, str]:
        """Convert one ``<td>``/``<th>`` tag into a ``(text, extra, '')`` tuple.

        ``text`` is the cell's stripped text with fragments joined by
        ``" \\n"``. ``extra`` is the stripped ``data-content`` attribute of
        the first descendant carrying one, or ``''`` when there is none.
        The third slot is always the empty string.
        """
        text = cell.get_text(separator=" \n", strip=True) or ''
        annotated = cell.select_one('[data-content]')
        extra = '' if annotated is None else annotated['data-content'].strip()
        return (text, extra, '')

    def _parse_tbl(self, tbl: bs4.element.Tag, transpose: bool) -> List[List[Tuple[str, str, str]]]:
        """Extract a matrix of cell tuples from a ``<table>`` tag.

        Args:
            tbl: the table tag; every ``<tr>`` found inside it becomes a
                row and every ``<td>``/``<th>`` inside a row becomes a cell.
            transpose: when true, rows and columns are swapped. Short rows
                are padded with empty ``('', '', '')`` cells first, so
                ragged tables keep all of their cells.

        Returns:
            A list of rows, each a list of ``(text, extra, '')`` tuples.
        """
        rows = [
            [self._parse_cell(cell) for cell in row.find_all(['td', 'th'])]
            for row in tbl.find_all('tr')
        ]
        if not transpose:
            return rows
        # A bare zip(*rows) stops at the shortest row and would silently
        # drop the trailing cells of longer rows; pad to a rectangle first.
        width = max(map(len, rows), default=0)
        padded = [row + [('', '', '')] * (width - len(row)) for row in rows]
        return [list(column) for column in zip(*padded)]

    def parse(self, transpose: bool) -> Table:
        """Parse the first ``<table>`` of the reader's HTML into a ``Table``.

        Args:
            transpose: forwarded to ``_parse_tbl``.

        Raises:
            ValueError: if the document contains no ``<table>`` element.
        """
        soup = bs4.BeautifulSoup(self.get_read(), 'html5lib')
        first_table = soup.find('table')
        if first_table is None:
            # find() returns None on no match; fail with a clear message
            # instead of an AttributeError deep inside _parse_tbl.
            raise ValueError('no <table> element found in the HTML input')
        return Table(self._parse_tbl(first_table, transpose))
|
||
|
|
||
|
|
||
|
class HtmlMultiTableParser(HtmlTableParser):
    """Parser that reads every ``<table>`` of an HTML document and merges them."""

    def parse(self, transpose: bool) -> Table:
        """Parse all ``<table>`` elements and merge them into one ``Table``.

        For each table, the stripped, non-empty text of the sibling tags
        preceding it is joined with newlines and prepended (followed by a
        space) to the text of the first cell of the table's second row.
        Tables without a second row, or whose second row has no cells,
        are kept as parsed and the context is not attached.

        Args:
            transpose: forwarded to ``_parse_tbl`` for every table.

        Returns:
            The result of ``Table.merge_from`` over the per-table ``Table``
            objects, in the order ``find_all`` yields the tables.
        """
        soup = bs4.BeautifulSoup(self.get_read(), 'html5lib')
        tables: List[Table] = []
        for table_tag in soup.find_all('table'):
            # find_previous_siblings() is the current name of the deprecated
            # fetchPreviousSiblings() alias. The result is reversed, as
            # before - presumably to restore document order (confirm).
            # NOTE(review): no filter is applied, so earlier sibling tables
            # contribute their text to the context as well - confirm intended.
            sibling_texts = (
                sibling.get_text().strip()
                for sibling in reversed(table_tag.find_previous_siblings())
            )
            context = '\n'.join(text for text in sibling_texts if text)
            rows = self._parse_tbl(table_tag, transpose)
            # Guard the rows[1][0] access: a table with a single row (or an
            # empty second row) used to raise IndexError here.
            if context and len(rows) > 1 and rows[1]:
                first_cell = rows[1][0]
                rows[1][0] = (f'{context} {first_cell[0]}', first_cell[1], '')
            tables.append(Table(rows))
        return Table.merge_from(tables)
|