ufes-mestrado-projetopesqui.../docRefNetCreator/word_count.py

#!/usr/bin/env python3
# -*- encoding: utf-8 -*-

from sklearn.metrics.pairwise import cosine_similarity
import numpy as np


# Characters treated as word separators: WordCounter replaces each of them
# with a space before splitting the text into words.  The literal is a
# regular (non-raw) string, so the trailing "\r", "\n" and "\b" are the
# carriage-return, line-feed and backspace control characters.
ignoreList = list(',.;/\\|?:~^`´[{(<>)}]=+-_&¨¬%$#@!"\'\r\n\b')


def list_cos_sim(a, b):
    """Return the cosine similarity between two flat sequences of numbers.

    Each sequence is wrapped into a single-row 2-D array, because
    ``cosine_similarity`` operates on matrices of row vectors.  The value is
    returned exactly as ``cosine_similarity`` produces it (a matrix holding
    the pairwise similarities of the rows), not as a bare float.
    """
    row_a = np.array([a])
    row_b = np.array([b])
    return cosine_similarity(row_a, row_b)


class WordCounter:
    """Case-insensitive word-frequency table built from a piece of text.

    The counts live in ``_wordFreq`` (lower-cased word -> number of
    occurrences).  Two counters can be compared with ``vectorSimilarity``,
    which projects both tables onto a shared, sorted vocabulary.
    """

    def __init__(self, text, ignore=ignoreList):
        """Count the words of *text*.

        Args:
            text: The string to analyse.
            ignore: Iterable of substrings treated as separators; every
                occurrence of each one is replaced by a space before the
                text is split into words.  Defaults to the module-level
                ``ignoreList``.
        """
        # Use the caller-supplied separators; the module-level list is only
        # the default value of the parameter.
        for token in ignore:
            # An empty token would make str.replace insert a space between
            # every character of the text, so it is skipped.
            if token:
                text = text.replace(token, ' ')

        frequencies = {}
        # split() with no argument breaks on any run of whitespace (spaces,
        # tabs, ...) and never yields empty strings.
        for word in text.split():
            key = word.lower()
            frequencies[key] = frequencies.get(key, 0) + 1
        self._wordFreq = frequencies

    def unionKeySets(self, other):
        """Return the sorted list of words present in this or *other*'s table."""
        return sorted(set(self._wordFreq) | set(other._wordFreq))

    def populateFrequency(self, word_vector):
        """Return this counter's count for each word in *word_vector*.

        Words that do not occur in this counter contribute 0, so the result
        always has the same length and order as *word_vector*.
        """
        return [self._wordFreq.get(word, 0) for word in word_vector]

    def vectorSimilarity(self, other, function=list_cos_sim):
        """Compare this counter with *other* over their combined vocabulary.

        Args:
            other: Another ``WordCounter``.
            function: Callable taking the two aligned frequency lists and
                returning the similarity; defaults to ``list_cos_sim``.

        Returns:
            Whatever *function* returns for the two frequency vectors.
        """
        vocabulary = self.unionKeySets(other)
        thisVector = self.populateFrequency(vocabulary)
        thatVector = other.populateFrequency(vocabulary)
        return function(thisVector, thatVector)