conventionschedule-android/data/parse_tables.py

#!/usr/bin/env python3

import re
import json
from pathlib import Path
from bs4 import BeautifulSoup as _BS


def BeautifulSoup(markup):
    """Parse *markup* with bs4, always using the ``html5lib`` tree builder."""
    return _BS(markup, features='html5lib')


# Month names, indexed by month number minus one (index 0 is January).
# Each row lists the Portuguese, English and Spanish spelling, in that order.
monthdb = [
    ['janeiro', 'january', 'enero'],
    ['fevereiro', 'february', 'febrero'],
    ['março', 'march', 'marzo'],
    ['abril', 'april', 'abril'],
    ['maio', 'may', 'mayo'],
    ['junho', 'june', 'junio'],
    ['julho', 'july', 'julio'],
    ['agosto', 'august', 'agosto'],
    ['setembro', 'september', 'septiembre'],
    ['outubro', 'october', 'octubre'],
    ['novembro', 'november', 'noviembre'],
    ['dezembro', 'december', 'diciembre'],
]


def guess_month(name):
    """Return the 1-based number of the first month whose name occurs in *name*.

    The lookup is a case-insensitive substring search against every spelling
    in ``monthdb``; months are tried from January to December and the first
    month with a matching spelling wins.

    Raises:
        KeyError: when no known month name occurs in *name*; the exception
            carries the lowercased text.
    """
    lowered = name.lower()
    for number, spellings in enumerate(monthdb, start=1):
        if any(spelling.lower() in lowered for spelling in spellings):
            return number
    raise KeyError(lowered)


def capitalize_each_word(string, capitalization_reasons=' !?.;*/()\\[]}{='):
    """Return *string* with every word capitalized and all other letters lowercased.

    A character is uppercased when it is the first character of *string* or
    when the character right before it is contained in
    *capitalization_reasons*; every other character is lowercased.
    """
    pieces = _capitalize_each_word(string, capitalization_reasons)
    return ''.join(pieces)


def _capitalize_each_word(string, capitalization_reasons):
    """Yield the characters of *string* one by one, re-cased for ``capitalize_each_word``."""
    at_word_start = True
    for char in string:
        if at_word_start:
            yield char.upper()
        else:
            yield char.lower()
        # The character just emitted decides how the next one is cased.
        at_word_start = char in capitalization_reasons


def get_hour_line(tr):
    """Convert one schedule table row into a time-slot dictionary.

    The first ``td`` of the row holds the clock (``HH:MM``); each following
    ``td`` holds the activity of one room.  When a cell contains a ``span``,
    its first ``span`` is used in place of the cell, both for the text and for
    the ``style`` lookup.  A cell whose ``style`` contains ``#ff0000`` is
    reported as an empty string (presumably red marks a slot that must not be
    shown -- confirm against the source pages).

    Returns:
        ``None`` when the row has no ``td`` cells or when every room is empty;
        otherwise a dict with the keys ``clock`` (raw cell text), ``hour``,
        ``minute`` (both ``int``) and ``activities`` (one string per room,
        ``''`` for an empty room).

    Raises:
        ValueError: when the clock text is not two integers separated by ``:``.
    """
    cols = [col.select_one('span') if col.find('span') else col for col in tr.select('td')]
    if not cols:
        # Rows without any data cell carry no time slot.
        return None
    clock = cols[0].get_text()
    rooms = []
    for col in cols[1:]:
        style = col.get('style')
        if style is not None and '#ff0000' in style:
            rooms.append('')
        else:
            rooms.append(col.get_text(' ', strip=True).replace('\n', ' '))
    # Skip the row when no room has anything scheduled.  (The previous check
    # mapped the rooms to booleans instead of filtering them, so it only
    # triggered for rows without room cells.)
    if not any(rooms):
        return None
    hour, minute = map(int, clock.split(':', 1))
    return {
        'clock': clock,
        'hour': hour,
        'minute': minute,
        'activities': rooms
    }


def get_day_table(section):
    """Build the schedule dictionary for one day panel.

    The day heading is read from ``div.fusion-toggle-heading``.  From it come
    the weekday (the text before the first comma), the day number (all digits
    of the heading joined together) and the month (via ``guess_month``).
    Room names are the ``thead th`` cells after the first one, and the events
    are the ``tbody tr`` rows for which ``get_hour_line`` returns a value.

    Raises:
        ValueError: when the heading contains no digit.
        KeyError: when the heading contains no known month name.
    """
    heading = section.select_one('div.fusion-toggle-heading')
    fullday = heading.get_text('', strip=True).replace('\n', ' ')

    weekday = capitalize_each_word(fullday.partition(',')[0].strip())
    day = int(re.sub(r'[^0-9]', '', fullday))
    month = guess_month(fullday)

    rooms = []
    for header in section.select('thead th')[1:]:
        room_name = header.get_text('', strip=True).replace('\n', ' ')
        rooms.append(capitalize_each_word(room_name))

    events = []
    for row in section.select('tbody tr'):
        slot = get_hour_line(row)
        if slot is not None:
            events.append(slot)

    return {
        'date': fullday,
        'weekday': weekday,
        'day': day,
        'month': month,
        'rooms': rooms,
        'events': events,
    }


def get_days_tables(document):
    """Build the schedule dictionary for a whole parsed page.

    The title is the text of ``.post-content .title``, the year is made of
    all digits found in that title, and every ``div.panel-default`` inside
    ``.post-content`` becomes one entry of ``days`` (see ``get_day_table``).

    Raises:
        ValueError: when the title contains no digit.
    """
    title_node = document.select_one('.post-content .title')
    title = title_node.get_text('', strip=True).replace('\n', ' ')
    year = int(re.sub(r'([^0-9])', '', title))

    days = []
    for section in document.select('.post-content div.panel-default'):
        days.append(get_day_table(section))

    return {
        'title': title,
        'year': year,
        'days': days,
    }


# Parse every saved schedule page in "pages/" and dump the combined result to
# "table.json", keyed by the stem of each file name.
pages = dict()
pages_path = Path('pages')
# glob() yields files in a filesystem-dependent order; sorting keeps the key
# order of table.json stable between runs and machines.
for page_path in sorted(pages_path.glob('*.html')):
    language = page_path.stem
    # Decode explicitly instead of relying on the locale's default encoding,
    # which would garble accented text on non-UTF-8 systems.
    # NOTE(review): assumes the saved pages are UTF-8 -- confirm.
    pages[language] = get_days_tables(BeautifulSoup(page_path.read_text(encoding='utf-8')))

# json.dumps escapes non-ASCII characters by default, so the output is plain
# ASCII regardless of the encoding used by write_text().
Path('table.json').write_text(json.dumps(pages, indent=4))