corpustagger-webservice/resources/ptbr/create_prevalency_charts.py

190 lines
5.7 KiB
Python

#!/usr/bin/env python3
# -*- encoding: utf-8 -*-
import numpy as np
import matplotlib.pyplot as plt
def getMatrixColumn(mtx, col):
    """Return a list with the element at index ``col`` of every row in ``mtx``.

    Rows are visited in order, so the result has one entry per row.
    """
    column = []
    for row in mtx:
        column.append(row[col])
    return column
def avg(a):
    """Return the arithmetic mean of the values in ``a``."""
    return sum(a) / len(a)


# Aggregate applied to a label's three values to obtain its sort key in
# normalize_prevalency.
plot_sort_key_func = max

# Module-level sample dictionaries. NOTE(review): nothing visible in this file
# reads them (the functions below take parameters with the same names) --
# presumably leftover example data; confirm before removing.
treino_dict = {
    'G1': 20,
    'G2': 35,
    'G3': 30,
    'G4': 35,
    'G5': 27,
}
teste_dict = {
    'G1': 25,
    'G2': 32,
    'G3': 34,
    'G4': 20,
    'G5': 25,
}
def normalize_prevalency(treino_dict, teste_dict, etiq_dict):
    """Merge three label -> value dictionaries into one ranked table.

    Every key present in any of the dictionaries yields one row
    ``(key, treino value, teste value, etiq value)``; a key missing from a
    dictionary contributes ``0`` for that column.

    Rows are returned in descending order of ``plot_sort_key_func`` applied to
    the three values; rows with the same sort key are ordered by the remaining
    tuple fields (key first), also descending.
    """
    labels = set(treino_dict) | set(teste_dict) | set(etiq_dict)

    decorated = []
    for label in labels:
        row = (
            label,
            treino_dict.get(label, 0),
            teste_dict.get(label, 0),
            etiq_dict.get(label, 0),
        )
        # Prefix the row with its sort key so plain tuple ordering ranks it.
        decorated.append((plot_sort_key_func(row[1:]), *row))

    ranked = sorted(decorated)[::-1]
    # Drop the sort key again before handing the rows back.
    return [entry[1:] for entry in ranked]
def make_plot(treino_dict, teste_dict, etiq_dict, *, max_labels=15, label_bars=False):
    """Build a grouped bar chart comparing label values from three sources.

    The dictionaries are merged and ranked by ``normalize_prevalency``; the
    first ``max_labels`` rows are drawn as one group of three bars per label
    (red = ``treino_dict``, blue = ``teste_dict``, green = ``etiq_dict``).

    Args:
        treino_dict: Mapping label -> value, drawn as the 'Treino' series.
        teste_dict: Mapping label -> value, drawn as the 'Teste' series.
        etiq_dict: Mapping label -> value, drawn as the 'Etiquetador' series.
        max_labels: Maximum number of labels drawn (default 15, the value that
            used to be hard-coded). ``None`` draws every label.
        label_bars: When true, write each bar's height above it. Disabled by
            default, which matches the previous output (the labelling code
            existed but was short-circuited and never ran).

    Returns:
        The matplotlib figure containing the chart. The figure stays
        registered with pyplot until the caller closes it.

    Note:
        Y tick labels show the value multiplied by 100 followed by '%', with a
        comma as decimal separator, so the inputs are presumably fractions
        (e.g. prevalences in [0, 1]) -- confirm against the caller.
    """
    # Local import keeps this block self-contained; matplotlib is already a
    # dependency of the file through pyplot.
    from matplotlib.ticker import FuncFormatter

    master_tbl = normalize_prevalency(treino_dict, teste_dict, etiq_dict)[:max_labels]
    labels = getMatrixColumn(master_tbl, 0)
    treino_values = getMatrixColumn(master_tbl, 1)
    teste_values = getMatrixColumn(master_tbl, 2)
    etiquetador_values = getMatrixColumn(master_tbl, 3)

    ind = np.arange(len(master_tbl))  # the x locations for the groups
    width = 0.25  # the width of the bars

    fig, ax = plt.subplots()
    rects1 = ax.bar(ind - 0.5 * width, treino_values, width, color='r')
    rects2 = ax.bar(ind + 0.5 * width, teste_values, width, color='b')
    rects3 = ax.bar(ind + 1.5 * width, etiquetador_values, width, color='g')

    # Axis labels and ticks; each tick sits under the middle bar of its group.
    ax.set_xlabel('Etiquetas')
    ax.set_ylabel('Prevalência')
    ax.set_xticks(ind + width / 2)
    ax.set_xticklabels(labels, rotation=90, ha='center')

    def percent_label(value, _position):
        # Same text the fixed labels used to carry, e.g. 0.25 -> '25,00%'.
        return ('{:3.2f}%'.format(value * 100)).replace('.', ',')

    # A formatter is evaluated for whatever ticks end up being drawn. Fixed
    # labels set through set_yticklabels() were bound to tick *positions in
    # the list*, so they could mislabel the axis once tight_layout() or the
    # final draw recomputed the automatic ticks.
    ax.yaxis.set_major_formatter(FuncFormatter(percent_label))

    # With no rows there are no bars to use as legend handles.
    if master_tbl:
        ax.legend(
            [rects1[0], rects2[0], rects3[0]],
            ['Treino', 'Teste', 'Etiquetador'],
        )

    if label_bars:
        # Attach a text label above each bar displaying its height.
        for rects in (rects1, rects2, rects3):
            for rect in rects:
                height = rect.get_height()
                ax.text(rect.get_x() + rect.get_width() / 2., height,
                        ('%.04f' % height).replace('.', ','),
                        ha='center', va='bottom', backgroundcolor='#FFFFFF88')

    fig.tight_layout()
    return fig
def readAllFile(loc):
    """Return the complete text content of the file at ``loc``.

    The file is decoded as UTF-8 explicitly (the encoding this script itself
    declares) instead of the platform's locale default, so non-ASCII text is
    read the same way on every machine.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the content is not valid UTF-8.
    """
    with open(loc, encoding='utf-8') as f:
        return f.read()
def writeAllFile(loc, cnt):
    """Write the string ``cnt`` to the file at ``loc``, replacing any content.

    The text is encoded as UTF-8 explicitly rather than with the platform's
    locale default, so the output bytes do not depend on the machine.

    Returns:
        The number of characters written, as reported by ``file.write``.

    Raises:
        OSError: If the file cannot be opened for writing.
    """
    with open(loc, 'w', encoding='utf-8') as f:
        return f.write(cnt)
def cleanEntry(entry):
    """Split a ``head/tag`` entry into a ``(head, tag)`` tuple.

    Only the first two '/'-separated fields are used. The tag is cut at its
    first '+' and then at its first ':', so any suffix after either character
    is discarded.

    Raises:
        IndexError: If ``entry`` contains no '/'.
    """
    fields = entry.split('/')
    head = fields[0]
    raw_tag = fields[1]
    tag = raw_tag.partition('+')[0].partition(':')[0]
    return (head, tag)
def notEmpty(string):
    """Return True when ``string`` has at least one non-whitespace character."""
    stripped = string.strip()
    return bool(stripped)
def pickLabels(entry):
    """Return the second element of ``entry`` (the tag of a ``(head, tag)`` pair)."""
    label = entry[1]
    return label
def makeFrequencyDict(lst):
    """Count how many times each item occurs in ``lst``.

    Returns a plain dict mapping item -> number of occurrences, with keys in
    order of first appearance.
    """
    counts = {}
    for element in lst:
        if element in counts:
            counts[element] += 1
        else:
            counts[element] = 1
    return counts
def frequencyDict2prevalencyDict(fd):
    """Convert a key -> count dict into a key -> share-of-total dict.

    Each value is divided by the sum of all values. An empty input yields an
    empty dict.
    """
    total = sum(fd.values())
    return {key: count / total for key, count in fd.items()}
def formatFloat(num):
    """Format ``num`` rounded to two decimals.

    The result is padded to a minimum width of six characters, and a
    non-negative number gets a leading space where a minus sign would go.
    """
    rounded = round(num, 2)
    return '% 6.02f' % (rounded,)
def matrix2tabular(mtx, percent=None):
    """Render a matrix (sequence of rows) as a LaTeX ``tabular`` environment.

    The column spec has one centred column per cell of the first row. Every
    row ends with a line break followed by a horizontal rule.

    Cell rendering:
        * ``None`` becomes an empty cell.
        * When ``percent`` is truthy, int/float cells are formatted with
          ``formatFloat`` and followed by an escaped percent sign; if
          ``percent`` equals 100 the value is multiplied by 100 first.
        * Everything else is rendered with ``str``.

    Column separators are emitted based on the width of the first row.

    Raises:
        IndexError: If ``mtx`` is empty.
    """
    column_count = len(mtx[0])
    as_percent = bool(percent)

    rendered_rows = []
    for row in mtx:
        pieces = []
        for position, value in enumerate(row):
            if value is None:
                text = ''
            elif as_percent and isinstance(value, (int, float)):
                scaled = value * 100 if percent == 100 else value
                text = formatFloat(scaled) + r'\%'
            else:
                text = str(value)
            pieces.append(text)
            if position + 1 < column_count:
                pieces.append(' & ')
        rendered_rows.append(''.join(pieces) + r' \\' + r' \hline' + '\n')

    opening = r'\begin{tabular}{' + '|'.join('c' * column_count) + '|}\n'
    closing = r'\end{tabular}' + '\n'
    return opening + ''.join(rendered_rows) + closing
# Two lists of (original tag, replacement tag) pairs. Both are merged into
# convertMap below.
convertAns = [
    ('PROP', 'N'), ('VAUX', 'V'), ('PP', 'PREP'),
    ('PRP', 'PREP'), ('DET', 'PRON'), ('INTJ', 'INTERJ'),
]
convertGss = [
    ('A', 'ADJ'), ('X', 'N'), ('SIGL', 'N'), ('ABREV', 'N'),
    ('PRO', 'PRON'), ('PREPXDET', 'PREP'), ('PREPXPRO', 'PREP'),
    ('PONCT', '???'),
]

# Single lookup table; on a duplicate key a convertGss pair would win because
# it is applied last.
convertMap = {}
convertMap.update(convertAns)
convertMap.update(convertGss)


def converttag(t):
    """Return the replacement for tag ``t``, or ``t`` itself if unmapped."""
    return convertMap.get(t, t)


def makePrevalencyDict(lst):
    """Return a dict mapping each item of ``lst`` to its share of the list."""
    return frequencyDict2prevalencyDict(makeFrequencyDict(lst))
# locations.txt holds one corpus per line as three tab-separated fields:
# the corpus directory, the alias used as output file prefix, and an integer
# flag telling whether the tags must go through converttag. Blank lines are
# skipped so a stray empty line cannot break the three-way unpacking below.
locations = [
    line.split('\t')
    for line in readAllFile('locations.txt').strip().splitlines()
    if notEmpty(line)
]


def getcorpus(filedir):
    """Read the file at ``filedir`` and return its entries as (head, tag) pairs.

    Blank lines are ignored; every other line is parsed with ``cleanEntry``.
    """
    lines = readAllFile(filedir).strip().splitlines()
    return [cleanEntry(line) for line in lines if notEmpty(line)]


for (location, alias, needsconv) in locations:
    needsconv = bool(int(needsconv))

    # Load the three tagged corpora for this location.
    traincorpus = getcorpus(location+'/unitexable_train/corpus.txt.answersheet.txt')
    testcorpus = getcorpus(location+'/unitexable_test/corpus.answers_final.txt')
    guesscorpus = getcorpus(location+'/unitexable_test/corpus.guesses_final.txt')

    # Keep only the tag of every entry.
    trainlbls = [pickLabels(entry) for entry in traincorpus]
    testlbls = [pickLabels(entry) for entry in testcorpus]
    guesslbls = [pickLabels(entry) for entry in guesscorpus]
    if needsconv:
        trainlbls = [converttag(label) for label in trainlbls]
        testlbls = [converttag(label) for label in testlbls]
        guesslbls = [converttag(label) for label in guesslbls]

    trainpd = makePrevalencyDict(trainlbls)
    testpd = makePrevalencyDict(testlbls)
    guesspd = makePrevalencyDict(guesslbls)

    fig = make_plot(trainpd, testpd, guesspd)
    try:
        fig.savefig(alias+'_prev.pdf', format='pdf')
        fig.savefig(alias+'_prev.svg', format='svg')
    finally:
        # pyplot keeps every figure alive until it is closed; without this,
        # one figure would accumulate per location.
        plt.close(fig)

    # Same data as a LaTeX table; 100 makes matrix2tabular scale the
    # fractions to percentages.
    prevtbl = [(None,'Treino', 'Teste', 'Etiquetador')]+normalize_prevalency(trainpd, testpd, guesspd)
    prevtex = matrix2tabular(prevtbl,100)
    writeAllFile(alias+'_prev.tex', prevtex)