ufes-mestrado-projetopesqui.../docRefNetCreator/word_count.py

42 lines
1.2 KiB
Python
Raw Normal View History

2019-06-25 03:12:25 +00:00
#!/usr/bin/env python3
# -*- encoding: utf-8 -*-
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
ignoreList = [
*',.;/\\|?:~^`´[{(<>)}]=+-_&¨¬%$#@!"\'\r\n\b',
]
def list_cos_sim(a, b):
return cosine_similarity(np.array([a]), np.array([b]))
class WordCounter:
def __init__(self, text, ignore=ignoreList):
wf = dict()
self._wordFreq = wf
for exp in ignoreList:
text = text.replace(exp, ' ')
for word in text.split(' '):
if len(word) <= 0:
continue
lc = word.lower()
if lc not in wf:
wf[lc] = 0
wf[lc] += 1
def unionKeySets(self, other):
return sorted(list(set(list(self._wordFreq.keys())+list(other._wordFreq.keys()))))
def populateFrequency(self, word_vector):
return [self._wordFreq.get(word, 0) for word in word_vector]
def vectorSimilarity(self, other, function=list_cos_sim):
resultingVectorKeys = self.unionKeySets(other)
thisVector = self.populateFrequency(resultingVectorKeys)
thatVector = other.populateFrequency(resultingVectorKeys)
return function(thisVector, thatVector)