conventionschedule-android/data/parse_tables.py

144 lines
3.4 KiB
Python

#!/usr/bin/env python3
import re
import json
from pathlib import Path
from bs4 import BeautifulSoup as _BS
def BeautifulSoup(markup):
    """Parse *markup* into a soup tree with the ``html5lib`` parser.

    Thin wrapper around the bs4 constructor (imported as ``_BS``) so that
    every call in this script uses the same parser.
    """
    parser_name = 'html5lib'
    soup = _BS(markup, parser_name)
    return soup
# Month-name lookup table: row ``i`` holds the known spellings of month
# ``i + 1`` (Portuguese, English, Spanish, in that order).
monthdb = [
    ['janeiro', 'january', 'enero'],
    ['fevereiro', 'february', 'febrero'],
    ['março', 'march', 'marzo'],
    ['abril', 'april', 'abril'],
    ['maio', 'may', 'mayo'],
    ['junho', 'june', 'junio'],
    ['julho', 'july', 'julio'],
    ['agosto', 'august', 'agosto'],
    ['setembro', 'september', 'septiembre'],
    ['outubro', 'october', 'octubre'],
    ['novembro', 'november', 'noviembre'],
    ['dezembro', 'december', 'diciembre'],
]
def guess_month(name):
    """Return the 1-based month number whose name occurs inside *name*.

    The match is a case-insensitive substring test against every spelling
    in ``monthdb``; months are tried in calendar order and the first hit
    wins.

    Raises:
        KeyError: no known month name occurs in *name* (the lowercased
            text is carried as the exception argument).
    """
    haystack = name.lower()
    for month_number, spellings in enumerate(monthdb, start=1):
        if any(spelling.lower() in haystack for spelling in spellings):
            return month_number
    raise KeyError(haystack)
def capitalize_each_word(string, capitalization_reasons=' !?.;*/()\\[]}{='):
return ''.join(_capitalize_each_word(string, capitalization_reasons))
def _capitalize_each_word(string, capitalization_reasons):
lst = list(string)
capitalize = True
for char in lst:
yield char.upper() if capitalize else char.lower()
capitalize = char in capitalization_reasons
yield from iter('')
def get_hour_line(tr):
    """Convert one schedule table row into a time-slot record.

    The first ``<td>`` holds the clock text (``HH:MM``); each remaining
    ``<td>`` is one room's activity. When a cell contains a ``<span>``, the
    first such span is used in place of the cell itself.

    Args:
        tr: a BeautifulSoup tag for the ``<tr>`` row.

    Returns:
        A dict with keys ``clock`` (raw text of the first cell), ``hour``,
        ``minute`` (ints) and ``activities`` (one string per room, ``''``
        for a blank or blanked-out cell), or ``None`` when the row has no
        cells or none of its rooms has an activity.

    Raises:
        ValueError: the clock text is not of the form ``<int>:<int>``.
    """
    cols = []
    for cell in tr.select('td'):
        span = cell.find('span')
        cols.append(cell if span is None else span)
    if not cols:
        # A row without <td> cells carries no slot; skip it instead of
        # failing on cols[0].
        return None

    clock = cols[0].get_text()

    rooms = []
    for cell in cols[1:]:
        style = cell.get('style') or ''
        if '#ff0000' in style:
            # Cells styled with this colour are blanked out
            # (presumably they mark cancelled slots -- confirm).
            rooms.append('')
        else:
            rooms.append(cell.get_text(' ', strip=True).replace('\n', ' '))

    # Skip rows where every room is empty. The previous check used map()
    # where filter() was meant, which produced one boolean per room and so
    # never detected an all-empty row.
    if not any(rooms):
        return None

    hour, minute = map(int, clock.split(':', 1))
    return {
        'clock': clock,
        'hour': hour,
        'minute': minute,
        'activities': rooms
    }
def get_day_table(section):
    """Build the schedule record for one day panel.

    Args:
        section: a BeautifulSoup tag for the day's panel; it must contain a
            ``div.fusion-toggle-heading`` with the date text and a table
            with ``thead th`` / ``tbody tr`` elements.

    Returns:
        A dict with keys ``date`` (heading text), ``weekday`` (text before
        the first comma, capitalized), ``day`` (int), ``month`` (1-12),
        ``rooms`` (capitalized header texts, first header skipped) and
        ``events`` (records from ``get_hour_line``, ``None`` rows dropped).

    Raises:
        ValueError: the heading contains no digits.
        KeyError: the heading contains no known month name.
    """
    heading = section.select_one('div.fusion-toggle-heading')
    fullday = heading.get_text('', strip=True).replace('\n', ' ')

    # Prefer a standalone 1-2 digit number as the day of the month.
    # Concatenating every digit in the heading (the previous behaviour)
    # gives a wrong day as soon as the text also carries another number,
    # e.g. "14 de julho de 2017" -> 142017. Fall back to the old behaviour
    # when no such number exists.
    day_match = re.search(r'(?<![0-9])[0-9]{1,2}(?![0-9])', fullday)
    if day_match is not None:
        day_digits = day_match.group(0)
    else:
        day_digits = re.sub(r'[^0-9]', '', fullday)

    # The first header cell is skipped (presumably the time column).
    room_names = [
        capitalize_each_word(room.get_text('', strip=True).replace('\n', ' '))
        for room in section.select('thead th')[1:]
    ]
    hour_lines = (get_hour_line(line) for line in section.select('tbody tr'))

    return {
        'date': fullday,
        'weekday': capitalize_each_word(fullday.split(',')[0].strip()),
        'day': int(day_digits),
        'month': guess_month(fullday),
        'rooms': room_names,
        'events': [event for event in hour_lines if event is not None],
    }
def get_days_tables(document):
    """Extract the whole schedule from one parsed page.

    Args:
        document: the BeautifulSoup tree of a schedule page; it must
            contain a ``.post-content .title`` element and one
            ``.post-content div.panel-default`` per day.

    Returns:
        A dict with keys ``title`` (title text), ``year`` (int) and
        ``days`` (one ``get_day_table`` record per day panel).

    Raises:
        ValueError: the title contains no digits.
    """
    title_tag = document.select_one('.post-content .title')
    title = title_tag.get_text('', strip=True).replace('\n', ' ')

    # Prefer a standalone 4-digit number as the year. Concatenating every
    # digit in the title (the previous behaviour) breaks when the title
    # carries another number, e.g. an edition number. Fall back to the old
    # behaviour when no 4-digit number exists.
    year_match = re.search(r'(?<![0-9])[0-9]{4}(?![0-9])', title)
    if year_match is not None:
        year_digits = year_match.group(0)
    else:
        year_digits = re.sub(r'([^0-9])', '', title)

    day_sections = document.select('.post-content div.panel-default')
    return {
        'title': title,
        'year': int(year_digits),
        'days': [get_day_table(section) for section in day_sections],
    }
# Script driver: parse every ``pages/*.html`` file and dump the combined
# result to ``table.json``, keyed by the file's stem (used as the language).
pages = {}
pages_path = Path('pages')
# Sorted so the key order in table.json is the same on every run;
# glob() yields files in arbitrary filesystem order.
for page_path in sorted(pages_path.glob('*.html')):
    language = page_path.stem
    # Decode explicitly rather than with the platform default, which is not
    # UTF-8 everywhere (assumes the saved pages are UTF-8 -- confirm).
    pages[language] = get_days_tables(
        BeautifulSoup(page_path.read_text(encoding='utf-8'))
    )
Path('table.json').write_text(json.dumps(pages, indent=4), encoding='utf-8')