190 lines
5.7 KiB
Python
190 lines
5.7 KiB
Python
#!/usr/bin/env python3
|
|
# -*- encoding: utf-8 -*-
|
|
|
|
import numpy as np
|
|
import matplotlib.pyplot as plt
|
|
|
|
def getMatrixColumn(mtx, col):
    """Return a list with the element at index *col* of every row in *mtx*."""
    column = []
    for row in mtx:
        column.append(row[col])
    return column
|
|
def avg(a):
    """Return the arithmetic mean of the values in the sized iterable *a*."""
    total = sum(a)
    return total / len(a)
|
|
|
|
# Function applied by normalize_prevalency to the three prevalence values of a
# row to obtain that row's primary sort key; with `max`, labels are ranked by
# the highest prevalence they reach in any of the three sources.
plot_sort_key_func = max
|
|
|
|
# Sample label -> value mapping for the training set (presumably demo data;
# the functions below receive their own dictionaries as arguments).
treino_dict = dict(G1=20, G2=35, G3=30, G4=35, G5=27)
|
|
# Sample label -> value mapping for the test set (presumably demo data;
# the functions below receive their own dictionaries as arguments).
teste_dict = dict(G1=25, G2=32, G3=34, G4=20, G5=25)
|
|
|
|
def normalize_prevalency(treino_dict, teste_dict, etiq_dict):
    """Merge three label -> value mappings into one table sorted descending.

    Every label present in at least one mapping yields a row
    ``(label, treino, teste, etiq)``; a label missing from a mapping gets 0
    in that column. Rows are ordered from highest to lowest by
    ``plot_sort_key_func`` of the three values, ties being broken by the
    row contents themselves (label first).

    Returns:
        A list of 4-tuples.
    """
    labels = set(treino_dict) | set(teste_dict) | set(etiq_dict)
    rows = [
        (
            label,
            treino_dict.get(label, 0),
            teste_dict.get(label, 0),
            etiq_dict.get(label, 0),
        )
        for label in labels
    ]
    # Ascending sort followed by a reversal, so the largest key comes first.
    ranked = sorted(rows, key=lambda row: (plot_sort_key_func(row[1:]), row))
    ranked.reverse()
    return ranked
|
|
|
|
def make_plot(treino_dict, teste_dict, etiq_dict, max_labels=15, show_values=False):
    """Draw a grouped bar chart comparing label prevalence in three sources.

    The three mappings are merged and ranked by ``normalize_prevalency``;
    only the first ``max_labels`` rows are plotted. Each label gets three
    adjacent bars: training (red), test (blue) and tagger output (green).
    The y axis shows each value multiplied by 100 with a percent sign and a
    decimal comma, so values are expected to be fractions.

    Args:
        treino_dict: label -> prevalence mapping for the training set.
        teste_dict: label -> prevalence mapping for the test set.
        etiq_dict: label -> prevalence mapping for the tagger output.
        max_labels: maximum number of labels (rows) to plot.
        show_values: when true, write each bar's height above it. Off by
            default, matching the previous behaviour where the labelling
            code was never reached.

    Returns:
        The matplotlib figure holding the chart.
    """
    # Imported locally because only this function needs the formatter.
    from matplotlib.ticker import FuncFormatter

    master_tbl = normalize_prevalency(treino_dict, teste_dict, etiq_dict)[:max_labels]

    ind = np.arange(len(master_tbl))  # the x locations for the groups
    width = 0.25  # the width of the bars

    fig, ax = plt.subplots()

    # (legend text, bar colour, offset in bar widths, column in master_tbl)
    series = (
        ('Treino', 'r', -0.5, 1),
        ('Teste', 'b', 0.5, 2),
        ('Etiquetador', 'g', 1.5, 3),
    )
    bar_groups = [
        ax.bar(ind + offset * width, getMatrixColumn(master_tbl, column), width, color=color)
        for _legend, color, offset, column in series
    ]

    # Axis labels and ticks; the x tick sits under the middle bar of a group.
    ax.set_xlabel('Etiquetas')
    ax.set_ylabel('Prevalência')
    ax.set_xticks(ind + width / 2)
    ax.set_xticklabels(getMatrixColumn(master_tbl, 0), rotation=90, ha='center')

    # Format ticks through a formatter instead of freezing label strings for
    # the current tick positions: frozen strings can end up on the wrong
    # ticks if the locator picks different positions later.
    ax.yaxis.set_major_formatter(FuncFormatter(
        lambda value, _pos: '{:3.2f}%'.format(value * 100).replace('.', ',')
    ))

    ax.legend(
        [group[0] for group in bar_groups],
        [legend for legend, _color, _offset, _column in series],
    )

    if show_values:
        # Attach a text label above each bar displaying its height.
        for group in bar_groups:
            for rect in group:
                height = rect.get_height()
                ax.text(
                    rect.get_x() + rect.get_width() / 2., height,
                    ('%.04f' % height).replace('.', ','),
                    ha='center', va='bottom', backgroundcolor='#FFFFFF88',
                )

    fig.tight_layout()

    return fig
|
|
|
|
def readAllFile(loc):
    """Return the whole text content of the file at path *loc*."""
    with open(loc) as source:
        contents = source.read()
    return contents
|
|
|
|
def writeAllFile(loc, cnt):
    """Replace the content of the file at *loc* with the text *cnt*.

    Returns whatever the file object's ``write`` call returns.
    """
    with open(loc, 'w') as sink:
        written = sink.write(cnt)
    return written
|
|
|
|
def cleanEntry(entry):
    """Split a ``word/tag`` entry into a ``(word, tag)`` tuple.

    The word is the text before the first ``/``. The tag is the second
    ``/``-separated field, truncated at its first ``+`` and then at its
    first ``:``. An entry without ``/`` raises ``IndexError``.
    """
    fields = entry.split('/')
    word = fields[0]
    tag = fields[1].partition('+')[0].partition(':')[0]
    return (word, tag)
|
|
|
|
def notEmpty(string):
    """Return True when *string* still has characters after stripping whitespace."""
    stripped = string.strip()
    return bool(stripped)
|
|
|
|
def pickLabels(entry):
    """Return the second element of *entry* (the tag of a ``(word, tag)`` pair)."""
    label = entry[1]
    return label
|
|
|
|
def makeFrequencyDict(lst):
    """Count the occurrences of each item of *lst*.

    Returns a dict mapping item -> number of occurrences, with keys in
    order of first appearance.
    """
    counts = {}
    for element in lst:
        if element in counts:
            counts[element] += 1
        else:
            counts[element] = 1
    return counts
|
|
|
|
def frequencyDict2prevalencyDict(fd):
    """Turn a key -> count mapping into a key -> share-of-total mapping.

    Each value is divided by the sum of all values. An empty mapping gives
    an empty dict.
    """
    total = sum(fd.values())
    return {label: count / total for label, count in fd.items()}
|
|
|
|
def formatFloat(num):
    """Format *num* rounded to two decimals, padded to at least six characters.

    A space is reserved in the sign position of non-negative numbers.
    """
    rounded = round(num, 2)
    return '{: 6.2f}'.format(rounded)
|
|
|
|
def matrix2tabular(mtx, percent=None):
    r"""Render the rows of *mtx* as a LaTeX ``tabular`` environment.

    The column count is taken from the first row. ``None`` cells are left
    blank. When *percent* is truthy, int and float cells are passed through
    ``formatFloat`` and suffixed with ``\%``; they are multiplied by 100
    first only when *percent* equals 100. All other cells are rendered with
    ``str``. Each row ends with ``\\ \hline``.

    Returns:
        The LaTeX source as a string ending in a newline.
    """
    column_count = len(mtx[0])
    chunks = [r'\begin{tabular}{' + '|'.join(['c'] * column_count) + '|}\n']
    for row in mtx:
        for position, value in enumerate(row):
            if value is not None:
                if percent and isinstance(value, (int, float)):
                    scaled = value * 100 if percent == 100 else value
                    chunks.append(formatFloat(scaled))
                    chunks.append(r'\%')
                else:
                    chunks.append(str(value))
            # The separator is emitted for blank cells too, so columns stay aligned.
            if position + 1 < column_count:
                chunks.append(' & ')
        chunks.append(r' \\')
        chunks.append(r' \hline')
        chunks.append('\n')
    chunks.append(r'\end{tabular}' + '\n')
    return ''.join(chunks)
|
|
|
|
# (source tag, replacement tag) pairs; merged into convertMap below.
# NOTE(review): presumably the tag names used by the answer sheets - confirm.
convertAns = [
    ('PROP', 'N'), ('VAUX', 'V'),
    ('PP', 'PREP'), ('PRP', 'PREP'),
    ('DET', 'PRON'), ('INTJ', 'INTERJ'),
]
|
|
|
|
# (source tag, replacement tag) pairs; merged into convertMap below.
# NOTE(review): presumably the tag names produced by the tagger's guesses - confirm.
convertGss = [
    ('A', 'ADJ'), ('X', 'N'),
    ('SIGL', 'N'), ('ABREV', 'N'),
    ('PRO', 'PRON'), ('PREPXDET', 'PREP'),
    ('PREPXPRO', 'PREP'), ('PONCT', '???'),
]
|
|
|
|
# Single lookup table built from both pair lists; on a duplicate source tag
# the entry from convertGss wins because it is applied last.
convertMap = {}
convertMap.update(convertAns)
convertMap.update(convertGss)
|
|
|
|
def converttag(t):
    """Return the replacement for tag *t* from convertMap, or *t* itself if unmapped."""
    if t in convertMap:
        return convertMap[t]
    return t
|
|
|
|
def makePrevalencyDict(lst):
    """Map each distinct item of *lst* to its share of all items in *lst*."""
    frequencies = makeFrequencyDict(lst)
    return frequencyDict2prevalencyDict(frequencies)
|
|
|
|
# One entry per line of locations.txt, each split on tabs; the loop below
# unpacks every entry as (location, alias, needsconv).
locations = [
    row.split('\t')
    for row in readAllFile('locations.txt').strip().splitlines()
]
|
|
|
|
def getcorpus(filedir):
    """Read the corpus file at *filedir* into a list of ``(word, tag)`` tuples.

    Lines that are blank after stripping are skipped; every other line is
    parsed with ``cleanEntry``.
    """
    rows = readAllFile(filedir).strip().splitlines()
    return [cleanEntry(row) for row in rows if notEmpty(row)]
|
|
|
|
# For every configured location: load the three corpora, compute the tag
# prevalence of each, then write a bar chart (PDF and SVG) and a LaTeX table
# named after the location's alias.
for (location, alias, needsconv) in locations:
    # The flag is read from the file as text ("0"/"1").
    needsconv = bool(int(needsconv))

    traincorpus = getcorpus(location+'/unitexable_train/corpus.txt.answersheet.txt')
    testcorpus = getcorpus(location+'/unitexable_test/corpus.answers_final.txt')
    guesscorpus = getcorpus(location+'/unitexable_test/corpus.guesses_final.txt')

    trainlbls = list(map(pickLabels, traincorpus))
    testlbls = list(map(pickLabels, testcorpus))
    guesslbls = list(map(pickLabels, guesscorpus))

    if needsconv:
        # Rewrite tags through convertMap so the three sources are comparable.
        trainlbls = list(map(converttag, trainlbls))
        testlbls = list(map(converttag, testlbls))
        guesslbls = list(map(converttag, guesslbls))

    trainpd = makePrevalencyDict(trainlbls)
    testpd = makePrevalencyDict(testlbls)
    guesspd = makePrevalencyDict(guesslbls)

    fig = make_plot(trainpd, testpd, guesspd)
    fig.savefig(alias+'_prev.pdf', format='pdf')
    fig.savefig(alias+'_prev.svg', format='svg')
    # pyplot keeps every figure it creates until it is closed; release it so
    # open figures do not pile up, one per location.
    plt.close(fig)

    prevtbl = [(None,'Treino', 'Teste', 'Etiquetador')]+normalize_prevalency(trainpd, testpd, guesspd)
    prevtex = matrix2tabular(prevtbl,100)
    writeAllFile(alias+'_prev.tex', prevtex)
|
|
|
|
|