furmeet_events_self/timetable_parser/tableparser.py

75 lines
2.3 KiB
Python

#!/usr/bin/env python3
# -*- encoding: utf-8 -*-
import csv
from io import StringIO
from typing import List, Optional, Tuple
import bs4
from .reader import AbstractReader
from .table import Table
class AbstractTableParser:
    """Base class for parsers that turn a reader's text into a ``Table``.

    The text is pulled from the reader lazily and remembered, so repeated
    calls to :meth:`get_read` invoke ``reader.read()`` again only while the
    remembered value is still ``None``.
    """

    def __init__(self, reader: AbstractReader):
        # Nothing has been read yet; ``None`` marks the cache as empty.
        self._read: Optional[str] = None
        self._reader = reader

    def get_read(self) -> str:
        """Return the reader's content, reading it on first use."""
        cached = self._read
        if cached is None:
            cached = self._read = self._reader.read()
        return cached

    def parse(self, transpose: bool) -> Table:
        """Build a ``Table`` from the content; subclasses must override.

        Raises:
            NotImplementedError: always, in this base class.
        """
        raise NotImplementedError()
class CsvTableParser(AbstractTableParser):
    """Parse CSV text into a ``Table``."""

    def parse(self, transpose: bool) -> Table:
        """Read the CSV content and wrap it in a ``Table``.

        Every CSV field becomes a ``(text, '', '')`` tuple, i.e. the same
        3-tuple cell shape that ``HtmlTableParser._parse_tbl`` produces,
        with the two trailing slots left empty.

        Args:
            transpose: when true, swap rows and columns before building the
                table. ``zip`` stops at the shortest row, so cells beyond
                that length are dropped for ragged input (the same
                behaviour as the HTML parser's transposition).

        Returns:
            A ``Table`` built from the (optionally transposed) cell matrix.
        """
        rows: List[List[str]] = list(csv.reader(StringIO(self.get_read())))
        cells: List[List[Tuple[str, str, str]]] = [
            [(value, '', '') for value in row] for row in rows
        ]
        # Previously ``transpose`` was accepted but silently ignored here,
        # unlike in the HTML parsers.
        if transpose:
            cells = [list(column) for column in zip(*cells)]
        return Table(cells)
class HtmlTableParser(AbstractTableParser):
    """Parse the first ``<table>`` of an HTML document into a ``Table``."""

    def _parse_tbl(
        self, tbl: bs4.element.Tag, transpose: bool
    ) -> List[List[Tuple[str, str, str]]]:
        """Convert a ``<table>`` tag into a matrix of cell tuples.

        Each ``<td>``/``<th>`` found under each ``<tr>`` yields a 3-tuple:

        1. the cell's stripped text, with text fragments joined by ``" \\n"``;
        2. the stripped ``data-content`` attribute of the first descendant
           that carries one, or ``''`` if there is none;
        3. always ``''``.

        Args:
            tbl: the table tag to walk.
            transpose: when true, swap rows and columns. ``zip`` stops at
                the shortest row, so ragged tables are truncated.

        Returns:
            The list of rows, each row a list of cell tuples.
        """
        matrix: List[List[Tuple[str, str, str]]] = []
        for row in tbl.find_all('tr'):
            cells: List[Tuple[str, str, str]] = []
            for cell in row.find_all(['td', 'th']):
                text = cell.get_text(separator=" \n", strip=True)
                # select_one() returns None when no descendant matches.
                annotated = cell.select_one('[data-content]')
                if annotated is None:
                    extra = ''
                else:
                    extra = annotated['data-content'].strip()
                cells.append((text, extra, ''))
            matrix.append(cells)
        if transpose:
            matrix = [list(column) for column in zip(*matrix)]
        return matrix

    def parse(self, transpose: bool) -> Table:
        """Parse the document's first ``<table>`` into a ``Table``.

        Args:
            transpose: forwarded to :meth:`_parse_tbl`.

        Returns:
            A ``Table`` built from the first table's cell matrix.

        Raises:
            ValueError: if the document contains no ``<table>`` element.
        """
        soup = bs4.BeautifulSoup(self.get_read(), 'html5lib')
        table_tag = soup.find('table')
        if table_tag is None:
            # find() returns None on no match; fail with a clear message
            # instead of an AttributeError deep inside _parse_tbl.
            raise ValueError('no <table> element found in the HTML input')
        return Table(self._parse_tbl(table_tag, transpose))
class HtmlMultiTableParser(HtmlTableParser):
    """Parse every ``<table>`` of an HTML document and merge the results."""

    def parse(self, transpose: bool) -> Table:
        """Parse all tables and combine them via ``Table.merge_from``.

        For each table, the stripped text of its preceding sibling elements
        (in document order, blanks skipped, joined by newlines) is prepended
        to the text of the cell at row 1, column 0.
        Presumably row 0 is a header row, which is why row 1 is the target;
        confirm against the consumers of ``Table``.

        NOTE(review): *all* preceding siblings are collected, so when
        several tables share a parent, a later table's context also contains
        the text of the earlier tables. Verify that this is intended.

        Args:
            transpose: forwarded to ``_parse_tbl`` for every table.

        Returns:
            The result of ``Table.merge_from`` over the per-table ``Table``s.
        """
        soup = bs4.BeautifulSoup(self.get_read(), 'html5lib')
        tables: List[Table] = []
        for table_tag in soup.find_all('table'):
            # find_previous_siblings() is the supported name for the
            # deprecated fetchPreviousSiblings() alias. It yields the
            # nearest sibling first, so reverse to restore document order.
            preceding = table_tag.find_previous_siblings()[::-1]
            snippets = (sibling.get_text().strip() for sibling in preceding)
            context = '\n'.join(snippet for snippet in snippets if snippet)
            matrix = self._parse_tbl(table_tag, transpose)
            # Only inject the context when the target cell exists; a table
            # with fewer than two rows (or an empty second row) used to
            # raise IndexError here.
            if context and len(matrix) > 1 and matrix[1]:
                text, extra, _ = matrix[1][0]
                matrix[1][0] = (f'{context} {text}', extra, '')
            tables.append(Table(matrix))
        return Table.merge_from(tables)