#!/usr/bin/env python3
"""Convert saved schedule pages into a single JSON document.

Every ``pages/*.html`` file is parsed with BeautifulSoup (html5lib) and the
extracted title, year and per-day tables are stored in ``table.json`` under
the file's stem, which the script treats as the page language.
"""
import re
import json
from pathlib import Path
from bs4 import BeautifulSoup as _BS


def BeautifulSoup(markup):
    """Parse *markup* with the ``html5lib`` parser and return the soup."""
    return _BS(markup, 'html5lib')


# One entry per calendar month (index 0 is month 1); each entry lists the
# month name in the languages the pages are written in.
monthdb = [
    ['janeiro', 'january', 'enero'],
    ['fevereiro', 'february', 'febrero'],
    ['março', 'march', 'marzo'],
    ['abril', 'april', 'abril'],
    ['maio', 'may', 'mayo'],
    ['junho', 'june', 'junio'],
    ['julho', 'july', 'julio'],
    ['agosto', 'august', 'agosto'],
    ['setembro', 'september', 'septiembre'],
    ['outubro', 'october', 'octubre'],
    ['novembro', 'november', 'noviembre'],
    ['dezembro', 'december', 'diciembre'],
]


def guess_month(name):
    """Return the 1-based month number whose name occurs in *name*.

    Matching is a case-insensitive substring search over ``monthdb``; months
    are tried in calendar order and the first hit wins.

    Raises:
        KeyError: if no known month name occurs in *name*. The exception
            argument is the lower-cased input.
    """
    lowered = name.lower()
    for month_number, month_names in enumerate(monthdb, start=1):
        if any(month.lower() in lowered for month in month_names):
            return month_number
    raise KeyError(lowered)


def capitalize_each_word(string, capitalization_reasons=' !?.;*/()\\[]}{='):
    """Return *string* with the first character and every character that
    follows one of *capitalization_reasons* upper-cased, all others
    lower-cased."""
    return ''.join(_capitalize_each_word(string, capitalization_reasons))


def _capitalize_each_word(string, capitalization_reasons):
    """Yield the re-cased characters of *string* one at a time."""
    capitalize = True  # the very first character is always upper-cased
    for char in string:
        yield char.upper() if capitalize else char.lower()
        # The next character is capitalized only when this one is a separator.
        capitalize = char in capitalization_reasons


def get_hour_line(tr):
    """Turn one table row into a time-slot record.

    The first ``<td>`` holds the clock text (``HH:MM``); each remaining cell
    is one room's activity. When a cell contains a ``<span>``, the span is
    used in place of the cell for both its text and its ``style`` attribute.
    Cells whose style contains ``#ff0000`` are treated as invalid and
    reported as an empty string.

    Returns:
        A dict with ``clock``, ``hour``, ``minute`` and ``activities`` (one
        string per room, in column order), or ``None`` when the row has no
        cells or none of its activities is non-empty.

    Raises:
        ValueError: if the clock text is not two ``:``-separated integers.
    """
    cells = [
        cell.select_one('span') if cell.find('span') else cell
        for cell in tr.select('td')
    ]
    if not cells:
        # A row without any <td> carries no time slot; skip it.
        return None
    clock = cells[0].get_text()

    activities = []
    for cell in cells[1:]:
        style = cell.get('style')
        if style is None:
            style = ''
        if '#ff0000' in style:
            activities.append('')
        else:
            text = cell.get_text(' ', strip=True).replace('\n', ' ')
            activities.append(text)

    # Skip rows where every room is blank. (The previous map()-based check
    # produced one boolean per room and therefore never dropped such rows.)
    if not any(activities):
        return None

    hour, minute = map(int, clock.split(':', 1))
    return {
        'clock': clock,
        'hour': hour,
        'minute': minute,
        'activities': activities,
    }


def get_day_table(section):
    """Extract one day's schedule from a panel *section*.

    The heading text (``div.fusion-toggle-heading``) supplies the date: the
    part before the first comma becomes ``weekday``, all digits in the
    heading joined together become ``day``, and ``month`` comes from
    ``guess_month``. Room names are the ``thead th`` cells after the first,
    and ``events`` are the ``tbody tr`` rows that ``get_hour_line`` keeps.
    """
    heading = section.select_one('div.fusion-toggle-heading')
    fullday = heading.get_text('', strip=True).replace('\n', ' ')
    rooms = [
        capitalize_each_word(th.get_text('', strip=True).replace('\n', ' '))
        for th in section.select('thead th')[1:]
    ]
    hour_lines = (get_hour_line(row) for row in section.select('tbody tr'))
    return {
        'date': fullday,
        'weekday': capitalize_each_word(fullday.split(',')[0].strip()),
        'day': int(re.sub(r'[^0-9]', '', fullday)),
        'month': guess_month(fullday),
        'rooms': rooms,
        'events': [line for line in hour_lines if line is not None],
    }


def get_days_tables(document):
    """Extract the title, year and all day tables from a parsed page.

    ``year`` is every digit of the ``.post-content .title`` text joined
    together; ``days`` holds one ``get_day_table`` result per
    ``.post-content div.panel-default`` element.
    """
    title_node = document.select_one('.post-content .title')
    title = title_node.get_text('', strip=True).replace('\n', ' ')
    return {
        'title': title,
        'year': int(re.sub(r'([^0-9])', '', title)),
        'days': [
            get_day_table(section)
            for section in document.select('.post-content div.panel-default')
        ],
    }


pages = dict()
pages_path = Path('pages')
for page_path in pages_path.glob('*.html'):
    language = page_path.stem
    # Read as UTF-8 explicitly: the pages contain accented text and the
    # platform default encoding is not reliable for that.
    markup = page_path.read_text(encoding='utf-8')
    pages[language] = get_days_tables(BeautifulSoup(markup))
Path('table.json').write_text(json.dumps(pages, indent=4), encoding='utf-8')