corpustagger-webservice/resources/ptbr/create_prevalency_charts.py

#!/usr/bin/env python3
# -*- encoding: utf-8 -*-

import numpy as np
import matplotlib.pyplot as plt

def getMatrixColumn(mtx, col):
    """Return a list holding element `col` of every row in `mtx`."""
    column = []
    for row in mtx:
        column.append(row[col])
    return column
def avg(a):
    """Return the arithmetic mean of `a` (its sum divided by its length)."""
    return sum(a) / len(a)

# Aggregation that normalize_prevalency applies to the three prevalence values
# of a label (train, test, tagger) to obtain that label's sort key.
plot_sort_key_func = max

# Sample per-group values. NOTE(review): these module-level dicts do not appear
# to be read anywhere in the visible script (the functions below take
# parameters with the same names) -- confirm before removing.
treino_dict = dict(G1=20, G2=35, G3=30, G4=35, G5=27)
teste_dict = dict(G1=25, G2=32, G3=34, G4=20, G5=25)

def normalize_prevalency(treino_dict, teste_dict, etiq_dict):
    """
    Merge three label->prevalence mappings into one sorted table.

    Every label present in at least one mapping yields a row
    ``(label, treino, teste, etiq)``; a label missing from a mapping counts
    as 0 there. Rows are ordered from highest to lowest
    ``plot_sort_key_func`` of the three values, with ties resolved by
    comparing the rows themselves (also descending).
    """
    labels = set(treino_dict) | set(teste_dict) | set(etiq_dict)

    rows = []
    for label in labels:
        rows.append((
            label,
            treino_dict.get(label, 0),
            teste_dict.get(label, 0),
            etiq_dict.get(label, 0),
        ))

    # Labels are unique, so no two sort keys are equal and a single descending
    # sort gives the same order as "sort ascending, then reverse".
    return sorted(
        rows,
        key=lambda row: (plot_sort_key_func(row[1:]), *row),
        reverse=True,
    )

def make_plot(treino_dict, teste_dict, etiq_dict, max_labels=15):
    """
    Build a grouped bar chart comparing label prevalence across three sources.

    Parameters:
        treino_dict: label -> prevalence mapping drawn in red ('Treino').
        teste_dict: label -> prevalence mapping drawn in blue ('Teste').
        etiq_dict: label -> prevalence mapping drawn in green ('Etiquetador').
        max_labels: how many of the top rows returned by
            normalize_prevalency are plotted (default 15, the value that
            used to be hard-coded). ``None`` plots every label.

    Returns:
        The matplotlib figure. It is created through pyplot, so the caller
        is responsible for closing it (``plt.close(fig)``) when done.
    """
    # Local import keeps this block self-contained; matplotlib is already a
    # dependency of this file.
    from matplotlib.ticker import FuncFormatter

    master_tbl = normalize_prevalency(treino_dict, teste_dict, etiq_dict)[:max_labels]

    labels = getMatrixColumn(master_tbl, 0)
    treino_values = getMatrixColumn(master_tbl, 1)
    teste_values = getMatrixColumn(master_tbl, 2)
    etiquetador_values = getMatrixColumn(master_tbl, 3)

    ind = np.arange(len(master_tbl))  # the x locations for the groups
    width = 0.25                      # the width of the bars

    fig, ax = plt.subplots()
    # Three bars per label, centred at ind + width/2 (the middle bar).
    rects1 = ax.bar(ind - 0.5*width, treino_values, width, color='r')
    rects2 = ax.bar(ind + 0.5*width, teste_values, width, color='b')
    rects3 = ax.bar(ind + 1.5*width, etiquetador_values, width, color='g')

    # Axis labels and x ticks (one tick per label, under the middle bar).
    ax.set_xlabel('Etiquetas')
    ax.set_ylabel('Prevalência')
    ax.set_xticks(ind + width / 2)
    ax.set_xticklabels(labels, rotation=90, ha='center')

    # Format y ticks as percentages with a decimal comma. A formatter is used
    # instead of set_yticklabels(get_yticks()) so the text always matches the
    # ticks actually drawn, even if the locator picks new ones after
    # tight_layout() or a later resize.
    ax.yaxis.set_major_formatter(FuncFormatter(
        lambda value, _pos: '{:3.2f}%'.format(value*100).replace('.', ',')
    ))

    ax.legend(
        [rects1[0], rects2[0], rects3[0]],
        ['Treino', 'Teste', 'Etiquetador'],
    )

    fig.tight_layout()

    return fig

def readAllFile(loc):
    """Open the file at `loc` in text mode and return its whole content."""
    with open(loc) as handle:
        content = handle.read()
    return content

def writeAllFile(loc, cnt):
    """
    Write the string `cnt` to the file at `loc`, replacing any previous content.

    Returns whatever the file object's ``write`` call returns (the number of
    characters written).
    """
    with open(loc, 'w') as handle:
        written = handle.write(cnt)
    return written

def cleanEntry(entry):
    """
    Split a ``head/tag`` entry into a ``(head, tag)`` tuple.

    The head is the text before the first '/'. The tag is the second
    '/'-separated field, truncated at its first '+' and then at its first ':'.
    Raises IndexError when `entry` contains no '/'.
    """
    fields = entry.split('/')
    head = fields[0]
    raw_tag = fields[1]
    # Keep only what precedes the first '+', then what precedes the first ':'.
    tag = raw_tag.partition('+')[0].partition(':')[0]
    return (head, tag)

def notEmpty(string):
    """Return True when `string` still has characters after stripping whitespace."""
    return bool(string.strip())

def pickLabels(entry):
    """Return the element at index 1 of `entry` (the tag of a (head, tag) pair)."""
    label = entry[1]
    return label

def makeFrequencyDict(lst):
    """Count how many times each item occurs in `lst`; returns an item -> count dict."""
    counts = {}
    for item in lst:
        if item in counts:
            counts[item] += 1
        else:
            counts[item] = 1
    return counts

def frequencyDict2prevalencyDict(fd):
    """
    Convert an item -> count dict into an item -> fraction-of-total dict.

    Each count is divided by the sum of all counts. An empty `fd` yields an
    empty dict.
    """
    total = sum(fd.values())
    return {item: count / total for item, count in fd.items()}

def formatFloat(num):
    """
    Format `num` rounded to two decimals using the '% 6.02f' pattern.

    The pattern reserves a leading space for the sign of non-negative numbers
    and pads the result to at least six characters.
    """
    rounded = round(num, 2)
    return '% 6.02f' % (rounded,)

def matrix2tabular(mtx, percent=None):
    """
    Render the rows of `mtx` as the source of a LaTeX ``tabular`` environment.

    The column spec has one centred column per cell of the first row. ``None``
    cells are left blank. When `percent` is truthy, int/float cells are
    rendered through formatFloat followed by an escaped percent sign, and are
    multiplied by 100 first when `percent` equals 100. All other cells are
    rendered with ``str``. Every row ends with a line break and ``\\hline``.
    """
    ncols = len(mtx[0])
    as_percent = bool(percent)

    pieces = [r'\begin{tabular}{', '|'.join('c' * ncols), '|}\n']
    for row in mtx:
        for idx, cell in enumerate(row):
            if cell is None:
                pass  # blank cell: emit nothing but the separator below
            elif as_percent and isinstance(cell, (int, float)):
                scaled = cell * 100 if percent == 100 else cell
                pieces.append(formatFloat(scaled))
                pieces.append(r'\%')
            else:
                pieces.append(str(cell))
            # The separator is driven by the width of the first row, not by
            # the length of the current row.
            if idx + 1 < ncols:
                pieces.append(' & ')
        pieces.append(r' \\ \hline' + '\n')
    pieces.append(r'\end{tabular}' + '\n')
    return ''.join(pieces)

# (source tag, target tag) rename pairs, first group.
# NOTE(review): "Ans" presumably refers to the answer-sheet tagset -- confirm.
convertAns = [
    ('PROP', 'N'), ('VAUX', 'V'),
    ('PP', 'PREP'), ('PRP', 'PREP'),
    ('DET', 'PRON'), ('INTJ', 'INTERJ'),
]

# (source tag, target tag) rename pairs, second group.
# NOTE(review): "Gss" presumably refers to the tagger-guess tagset -- confirm.
convertGss = [
    ('A', 'ADJ'),
    ('X', 'N'), ('SIGL', 'N'), ('ABREV', 'N'),
    ('PRO', 'PRON'),
    ('PREPXDET', 'PREP'), ('PREPXPRO', 'PREP'),
    ('PONCT', '???'),
]

# Single lookup table built from both groups of pairs.
convertMap = {source: target for source, target in convertAns + convertGss}


def converttag(t):
    """Return the replacement for tag `t`, or `t` itself when it has no entry in convertMap."""
    return convertMap.get(t, t)

def makePrevalencyDict(lst):
    """Return an item -> fraction-of-total dict for the items of `lst`."""
    counts = makeFrequencyDict(lst)
    return frequencyDict2prevalencyDict(counts)

# One list of tab-separated fields per line of locations.txt (read at import time).
locations = [
    row.split('\t')
    for row in readAllFile('locations.txt').strip().splitlines()
]


def getcorpus(filedir):
    """
    Load the corpus file at `filedir` as a list of (head, tag) tuples.

    The file content is stripped and split into lines; lines that are blank
    after stripping are skipped and the rest go through cleanEntry.
    """
    raw_lines = readAllFile(filedir).strip().splitlines()
    return [cleanEntry(line) for line in raw_lines if notEmpty(line)]

# Script body: for every (location, alias, needsconv) row of locations.txt,
# compute the tag prevalence of the train answers, the test answers and the
# tagger guesses, then write <alias>_prev.pdf, <alias>_prev.svg and
# <alias>_prev.tex into the current directory.
for (location, alias, needsconv) in locations:
    # The third field is an integer flag; any non-zero value enables tag conversion.
    needsconv = bool(int(needsconv))
    traincorpus = getcorpus(location+'/unitexable_train/corpus.txt.answersheet.txt')
    testcorpus = getcorpus(location+'/unitexable_test/corpus.answers_final.txt')
    guesscorpus = getcorpus(location+'/unitexable_test/corpus.guesses_final.txt')
    trainlbls = list(map(pickLabels, traincorpus))
    testlbls = list(map(pickLabels, testcorpus))
    guesslbls = list(map(pickLabels, guesscorpus))
    if needsconv:
        # Map the tags through converttag so the three sources share tag names.
        trainlbls = list(map(converttag, trainlbls))
        testlbls = list(map(converttag, testlbls))
        guesslbls = list(map(converttag, guesslbls))
    trainpd = makePrevalencyDict(trainlbls)
    testpd = makePrevalencyDict(testlbls)
    guesspd = makePrevalencyDict(guesslbls)
    fig = make_plot(trainpd, testpd, guesspd)
    try:
        fig.savefig(alias+'_prev.pdf', format='pdf')
        fig.savefig(alias+'_prev.svg', format='svg')
    finally:
        # Figures created through pyplot stay registered until closed; close
        # each one so they do not pile up across locations.
        plt.close(fig)
    # The table lists every label, not only the ones shown in the chart.
    prevtbl = [(None,'Treino', 'Teste', 'Etiquetador')]+normalize_prevalency(trainpd, testpd, guesspd)
    prevtex = matrix2tabular(prevtbl,100)
    writeAllFile(alias+'_prev.tex', prevtex)