#!/usr/bin/env python3
|
||
|
|
||
|
import re
|
||
|
import json
|
||
|
from pathlib import Path
|
||
|
from bs4 import BeautifulSoup as _BS
|
||
|
|
||
|
|
||
|
def BeautifulSoup(markup):
    """Parse ``markup`` with bs4 using the ``html5lib`` tree builder.

    Thin convenience wrapper so callers do not have to repeat the parser
    name on every call.
    """
    parser_name = 'html5lib'
    return _BS(markup, parser_name)
|
||
|
|
||
|
|
||
|
# Month-name lookup table: row ``i`` holds the known spellings of month
# number ``i + 1`` (Portuguese, English, Spanish, in that order).
monthdb = [
    ['janeiro', 'january', 'enero'],
    ['fevereiro', 'february', 'febrero'],
    ['março', 'march', 'marzo'],
    ['abril', 'april', 'abril'],
    ['maio', 'may', 'mayo'],
    ['junho', 'june', 'junio'],
    ['julho', 'july', 'julio'],
    ['agosto', 'august', 'agosto'],
    ['setembro', 'september', 'septiembre'],
    ['outubro', 'october', 'octubre'],
    ['novembro', 'november', 'noviembre'],
    ['dezembro', 'december', 'diciembre'],
]
|
||
|
|
||
|
|
||
|
def guess_month(name):
    """Return the 1-based month number whose name occurs inside ``name``.

    The text is lower-cased and the rows of ``monthdb`` are tried in order;
    the first row with any spelling contained in the text wins.

    Raises:
        KeyError: if no known month name is a substring of the text. The
            exception argument is the lower-cased text.
    """
    haystack = name.lower()
    for number, spellings in enumerate(monthdb, start=1):
        # Longest spelling first, mirroring the table's lookup order.
        candidates = sorted(spellings, key=len, reverse=True)
        if any(candidate.lower() in haystack for candidate in candidates):
            return number
    raise KeyError(haystack)
|
||
|
|
||
|
|
||
|
def capitalize_each_word(string, capitalization_reasons=' !?.;*/()\\[]}{='):
    """Return ``string`` with the first character of every word upper-cased.

    A character is upper-cased when it is the first character of the string
    or directly follows a character contained in ``capitalization_reasons``;
    every other character is lower-cased.

    Args:
        string: Text to transform.
        capitalization_reasons: Characters after which the next character
            is upper-cased.

    Returns:
        The transformed text as a new ``str``.
    """
    return ''.join(_capitalize_each_word(string, capitalization_reasons))


def _capitalize_each_word(string, capitalization_reasons):
    """Yield the characters of ``string`` with per-word capitalization applied.

    Generator behind ``capitalize_each_word``; yields one (possibly
    multi-character, e.g. for ``'ß'.upper()``) string per input character.
    """
    # The very first character always counts as the start of a word.
    capitalize = True
    for char in string:
        yield char.upper() if capitalize else char.lower()
        # The decision for the next character depends on the original
        # (untransformed) current character.
        capitalize = char in capitalization_reasons
|
||
|
|
||
|
|
||
|
def get_hour_line(tr):
    """Convert one schedule table row into an event dict.

    The first ``<td>`` holds the clock text (``HH:MM``); every following
    ``<td>`` holds the activity for one room. When a cell contains a
    ``<span>``, the span is used in place of the cell. A cell whose
    ``style`` attribute contains ``#ff0000`` is treated as invalid and
    contributes an empty string.

    Args:
        tr: A bs4 element for the row (must support ``select``).

    Returns:
        ``None`` when the row has no cells or no non-empty activity;
        otherwise a dict with keys ``clock``, ``hour``, ``minute`` and
        ``activities`` (one string per room column, ``''`` for empty or
        invalid cells).

    Raises:
        ValueError: if the clock text of a row with activities is not of
            the form ``<int>:<int>``.
    """
    cols = [
        col.select_one('span') if col.find('span') else col
        for col in tr.select('td')
    ]
    # A row without any <td> carries no event.
    if not cols:
        return None
    clock = cols[0].get_text()
    rooms = []
    for col in cols[1:]:
        style = col.get('style') or ''
        if '#ff0000' in style:
            # Cells marked with this colour are skipped but keep their
            # column position so indexes still line up with the rooms.
            rooms.append('')
        else:
            rooms.append(col.get_text(' ', strip=True).replace('\n', ' '))
    # Skip rows where every room is empty. The original built this check
    # with map() instead of filter(), so it could never trigger.
    if not any(rooms):
        return None
    hour, minute = map(int, clock.split(':', 1))
    return {
        'clock': clock,
        'hour': hour,
        'minute': minute,
        'activities': rooms,
    }
|
||
|
|
||
|
|
||
|
def get_day_table(section):
    """Build the schedule dict for one day section of the page.

    Reads the heading text from ``div.fusion-toggle-heading``, derives the
    weekday (text before the first comma), the day number (all digits in
    the heading joined together) and the month, then collects the room
    names from ``thead th`` (skipping the first header) and one event per
    ``tbody tr`` row that ``get_hour_line`` does not reject.

    Returns:
        A dict with keys ``date``, ``weekday``, ``day``, ``month``,
        ``rooms`` and ``events``.
    """
    heading = section.select_one('div.fusion-toggle-heading')
    fullday = heading.get_text('', strip=True).replace('\n', ' ')

    weekday = capitalize_each_word(fullday.split(',')[0].strip())
    day = int(re.sub(r'[^0-9]', '', fullday))
    month = guess_month(fullday)

    rooms = []
    for header in section.select('thead th')[1:]:
        label = header.get_text('', strip=True).replace('\n', ' ')
        rooms.append(capitalize_each_word(label))

    events = []
    for row in section.select('tbody tr'):
        entry = get_hour_line(row)
        if entry is not None:
            events.append(entry)

    return {
        'date': fullday,
        'weekday': weekday,
        'day': day,
        'month': month,
        'rooms': rooms,
        'events': events,
    }
|
||
|
|
||
|
|
||
|
def get_days_tables(document):
    """Extract the title, year and per-day schedules from a parsed page.

    The title is the text of ``.post-content .title``; the year is the
    integer formed by every digit found in that title. Each
    ``.post-content div.panel-default`` element is converted with
    ``get_day_table``.

    Returns:
        A dict with keys ``title``, ``year`` and ``days``.
    """
    title_node = document.select_one('.post-content .title')
    title = title_node.get_text('', strip=True).replace('\n', ' ')
    year = int(re.sub(r'([^0-9])', '', title))

    days = []
    for section in document.select('.post-content div.panel-default'):
        days.append(get_day_table(section))

    return {
        'title': title,
        'year': year,
        'days': days,
    }
|
||
|
|
||
|
|
||
|
# Parse every saved page under ./pages and dump the combined result to
# table.json, keyed by the file stem (stored in ``language``; presumably a
# language code -- confirm against how the pages are saved).
pages = {}
pages_path = Path('pages')
# Path.glob() yields matches in an unspecified order; iterating in sorted
# order keeps the key order of table.json stable across runs and machines.
for page_path in sorted(pages_path.glob('*.html')):
    language = page_path.stem
    pages[language] = get_days_tables(BeautifulSoup(page_path.read_text()))

Path('table.json').write_text(json.dumps(pages, indent=4))
|