conventionschedule-android/data/parse_tables.py

#!/usr/bin/env python3

import re
import json
from pathlib import Path
from bs4 import BeautifulSoup as _BS


def BeautifulSoup(markup):
    """Parse *markup* with bs4, always selecting the ``html5lib`` parser."""
    parser_name = 'html5lib'
    return _BS(markup, parser_name)


# Month-name lookup table: the row at index i holds the names of month i + 1,
# spelled in Portuguese, English and Spanish (in that order).
monthdb = [
    ['janeiro', 'january', 'enero'],
    ['fevereiro', 'february', 'febrero'],
    ['março', 'march', 'marzo'],
    ['abril', 'april', 'abril'],
    ['maio', 'may', 'mayo'],
    ['junho', 'june', 'junio'],
    ['julho', 'july', 'julio'],
    ['agosto', 'august', 'agosto'],
    ['setembro', 'september', 'septiembre'],
    ['outubro', 'october', 'octubre'],
    ['novembro', 'november', 'noviembre'],
    ['dezembro', 'december', 'diciembre'],
]


def guess_month(name):
    """Return the 1-based number of the first month whose name occurs in *name*.

    The match is a case-insensitive substring search against every spelling
    stored in ``monthdb``; rows are tried in calendar order.

    Raises:
        KeyError: no known month name occurs in *name*; the exception carries
            the lower-cased text that was searched.
    """
    haystack = name.lower()
    for number, spellings in enumerate(monthdb, start=1):
        # Any spelling of the row identifies the same month, so the order in
        # which the spellings are tried cannot change the answer.
        if any(spelling.lower() in haystack for spelling in spellings):
            return number
    raise KeyError(haystack)


def capitalize_each_word(string, capitalization_reasons=' !?.;*/()\\[]}{='):
    return ''.join(_capitalize_each_word(string, capitalization_reasons))


def _capitalize_each_word(string, capitalization_reasons):
    lst = list(string)
    capitalize = True
    for char in lst:
        yield char.upper() if capitalize else char.lower()
        capitalize = char in capitalization_reasons
    yield from iter('')


def get_hour_line(tr):
    """Convert one schedule table row into an event dict.

    The first ``td`` of the row holds the clock time and every following
    ``td`` holds the activity of one room. When a cell contains a ``span``,
    that span is used in place of the cell, both for its text and for the
    ``style`` lookup. A cell whose ``style`` attribute contains ``#ff0000``
    is reported as an empty activity.

    Args:
        tr: Row element exposing ``select('td')`` (presumably a bs4 ``Tag``).

    Returns:
        A dict with the keys ``'clock'`` (raw text of the first cell),
        ``'hour'``, ``'minute'`` (integers parsed from the clock) and
        ``'activities'`` (one string per room cell, ``''`` where nothing is
        scheduled), or ``None`` when the row has no cells or no room has a
        non-empty activity.

    Raises:
        ValueError: The clock text is not two integers separated by ``':'``.
    """
    cells = [
        cell.select_one('span') if cell.find('span') else cell
        for cell in tr.select('td')
    ]
    if not cells:
        # A row without any <td> carries neither a time nor activities.
        return None

    clock = cells[0].get_text()
    activities = []
    for cell in cells[1:]:
        style = cell.get('style') or ''
        if '#ff0000' in style:
            activities.append('')
        else:
            activities.append(cell.get_text(' ', strip=True).replace('\n', ' '))

    # Look at the contents, not the length: there is one entry per room
    # column, so only a non-empty string means something is scheduled.
    if not any(activities):
        return None

    hour, minute = map(int, clock.split(':', 1))
    return {
        'clock': clock,
        'hour': hour,
        'minute': minute,
        'activities': activities,
    }


def get_day_table(section):
    """Build the schedule dict for one day panel.

    The heading text is read from ``div.fusion-toggle-heading``. From it come
    the weekday (the text before the first comma, re-capitalized), the day
    (all digits of the heading joined and parsed as one integer) and the
    month (via ``guess_month``). Room names are taken from every ``thead th``
    except the first; events are the ``tbody tr`` rows for which
    ``get_hour_line`` returned something other than ``None``.
    """
    heading = section.select_one('div.fusion-toggle-heading')
    fullday = heading.get_text('', strip=True).replace('\n', ' ')

    weekday = capitalize_each_word(fullday.partition(',')[0].strip())
    day = int(re.sub(r'[^0-9]', '', fullday))
    month = guess_month(fullday)

    rooms = []
    for header in section.select('thead th')[1:]:
        label = header.get_text('', strip=True).replace('\n', ' ')
        rooms.append(capitalize_each_word(label))

    events = [
        event
        for event in (get_hour_line(row) for row in section.select('tbody tr'))
        if event is not None
    ]

    return {
        'date': fullday,
        'weekday': weekday,
        'day': day,
        'month': month,
        'rooms': rooms,
        'events': events,
    }


def get_days_tables(document):
    """Build the schedule dict for a whole parsed page.

    The title is the text of ``.post-content .title``; the year is every
    digit of that title joined and parsed as one integer; ``days`` holds one
    ``get_day_table`` result per ``.post-content div.panel-default`` element.
    """
    title_node = document.select_one('.post-content .title')
    title = title_node.get_text('', strip=True).replace('\n', ' ')
    year = int(re.sub(r'([^0-9])', '', title))
    days = [
        get_day_table(panel)
        for panel in document.select('.post-content div.panel-default')
    ]
    return {'title': title, 'year': year, 'days': days}


# Parse every saved page under ./pages and dump the combined result to
# ./table.json, keyed by the page's file stem (bound to `language` below).
pages = {}
pages_path = Path('pages')
# Sorted so the key order of the generated JSON does not depend on the order
# in which the file system happens to list the directory.
for page_path in sorted(pages_path.glob('*.html')):
    language = page_path.stem
    # Decode explicitly: without an encoding, read_text() falls back to the
    # locale default, which mis-decodes non-ASCII text on non-UTF-8 systems.
    # NOTE(review): assumes the saved pages are UTF-8 — confirm.
    pages[language] = get_days_tables(
        BeautifulSoup(page_path.read_text(encoding='utf-8'))
    )

Path('table.json').write_text(json.dumps(pages, indent=4), encoding='utf-8')
many works in progress 2018-07-13 21:09:43 +00:00			`#!/usr/bin/env python3`

			`import re`
			`import json`
			`from pathlib import Path`
			`from bs4 import BeautifulSoup as _BS`


			`def BeautifulSoup(markup):`
			`return _BS(markup, 'html5lib')`


			`monthdb = [`
			`[`
			`'janeiro',`
			`'january',`
			`'enero',`
			`],`
			`[`
			`'fevereiro',`
			`'february',`
			`'febrero',`
			`],`
			`[`
			`'março',`
			`'march',`
			`'marzo',`
			`],`
			`[`
			`'abril',`
			`'april',`
			`'abril',`
			`],`
			`[`
			`'maio',`
			`'may',`
			`'mayo',`
			`],`
			`[`
			`'junho',`
			`'june',`
			`'junio',`
			`],`
			`[`
			`'julho',`
			`'july',`
			`'julio',`
			`],`
			`[`
			`'agosto',`
			`'august',`
			`'agosto',`
			`],`
			`[`
			`'setembro',`
			`'september',`
			`'septiembre',`
			`],`
			`[`
			`'outubro',`
			`'october',`
			`'octubre',`
			`],`
			`[`
			`'novembro',`
			`'november',`
			`'noviembre',`
			`],`
			`[`
			`'dezembro',`
			`'december',`
			`'diciembre',`
			`],`
			`]`


			`def guess_month(name):`
			`name = name.lower()`
			`for monthno, monthnms in enumerate(monthdb):`
			`for month in sorted(monthnms, key=lambda a: -len(a)):`
			`if month.lower() in name:`
			`return monthno+1`
			`raise KeyError(name)`


			`def capitalize_each_word(string, capitalization_reasons=' !?.;*/()\\[]}{='):`
			`return ''.join(_capitalize_each_word(string, capitalization_reasons))`


			`def _capitalize_each_word(string, capitalization_reasons):`
			`lst = list(string)`
			`capitalize = True`
			`for char in lst:`
			`yield char.upper() if capitalize else char.lower()`
			`capitalize = char in capitalization_reasons`
			`yield from iter('')`


			`def get_hour_line(tr):`
			`cols = [col if not col.find('span') else col.select_one('span') for col in tr.select('td')]`
			`clock = cols[0].get_text()`
			`invalid = ['#ff0000' in ('' if col.get('style') is None else col.get('style')) for col in cols[1:]]`
			`rooms = ['' if invalid[e] else v.get_text(' ', strip=True).replace('\n', ' ') for e, v in enumerate(cols[1:])]`
			`filtered_rooms = list(map(lambda a: a != '', rooms))`
			`if len(filtered_rooms) <= 0:`
			`return None`
			`hour, minute = tuple(map(int, clock.split(':', 1)))`
			`return {`
			`'clock': clock,`
			`'hour': hour,`
			`'minute': minute,`
			`'activities': rooms`
			`}`


			`def get_day_table(section):`
			`fullday = section.select_one('div.fusion-toggle-heading').get_text('', strip=True).replace('\n', ' ')`
			`return {`
			`'date': fullday,`
			`'weekday': capitalize_each_word(fullday.split(',')[0].strip()),`
			`'day': int(re.sub(r'[^0-9]', '', fullday)),`
			`'month': guess_month(fullday),`
			`'rooms': [capitalize_each_word(room.get_text('', strip=True).replace('\n', ' ')) for room in section.select('thead th')[1:]],`
			`'events': list(filter(lambda a: a is not None, [get_hour_line(line) for line in section.select('tbody tr')])),`
			`}`


			`def get_days_tables(document):`
			`title = document.select_one('.post-content .title').get_text('', strip=True).replace('\n', ' ')`
			`return {`
			`'title': title,`
			`'year': int(re.sub(r'([^0-9])', '', title)),`
			`'days': [get_day_table(section) for section in document.select('.post-content div.panel-default')],`
			`}`


			`pages = dict()`
			`pages_path = Path('pages')`
			`for page_path in pages_path.glob('*.html'):`
			`language = page_path.stem`
			`pages[language] = get_days_tables(BeautifulSoup(page_path.read_text()))`

			`Path('table.json').write_text(json.dumps(pages, indent=4))`