42 lines
1.2 KiB
Python
42 lines
1.2 KiB
Python
|
#!/usr/bin/env python3
|
|||
|
# -*- encoding: utf-8 -*-
|
|||
|
|
|||
|
from sklearn.metrics.pairwise import cosine_similarity
|
|||
|
import numpy as np
|
|||
|
|
|||
|
|
|||
|
# Single-character separators that WordCounter blanks out (replaces with a
# space) before splitting text into words: punctuation, quotes, a few accent
# marks, and the control characters CR, LF and backspace.
ignoreList = list(',.;/\\|?:~^`´[{(<>)}]=+-_&¨¬%$#@!"\'\r\n\b')
|
|||
|
|
|||
|
|
|||
|
def list_cos_sim(a, b):
    """Compute the cosine similarity between two numeric sequences.

    Each sequence is wrapped in an outer list so that it becomes a
    single-row 2-D array, then both rows are handed to scikit-learn's
    ``cosine_similarity``.

    Args:
        a: First vector, as a sequence of numbers.
        b: Second vector, as a sequence of numbers.

    Returns:
        The value returned by ``cosine_similarity`` for the two rows
        (a 2-D array rather than a bare float, per scikit-learn's docs).
    """
    row_a = np.array([a])
    row_b = np.array([b])
    return cosine_similarity(row_a, row_b)
|
|||
|
|
|||
|
|
|||
|
class WordCounter:
    """Case-insensitive word-frequency table built from a piece of text.

    The table is stored in ``_wordFreq`` as a plain ``dict`` mapping each
    lower-cased word to the number of times it occurs. Two counters can be
    compared by projecting both onto the union of their vocabularies and
    feeding the resulting frequency vectors to a similarity function.
    """

    def __init__(self, text, ignore=None):
        """Count the words in *text*.

        Args:
            text: The string to analyse.
            ignore: Iterable of substrings that are each replaced by a
                single space before the text is split into words. When
                omitted (or ``None``) the module-level ``ignoreList`` is
                used.
        """
        if ignore is None:
            # Resolved at call time: avoids binding the shared module-level
            # list as a mutable default argument.
            ignore = ignoreList

        # Honour the caller-supplied list; previously the global ignoreList
        # was always used and the ``ignore`` argument had no effect.
        for exp in ignore:
            text = text.replace(exp, ' ')

        wf = {}
        # Splitting on a single space produces empty strings for runs of
        # consecutive spaces; those are not words and are skipped.
        for word in text.split(' '):
            if not word:
                continue
            lc = word.lower()
            wf[lc] = wf.get(lc, 0) + 1
        self._wordFreq = wf

    def unionKeySets(self, other):
        """Return the sorted list of words known to *self* or *other*.

        Args:
            other: Another ``WordCounter``.

        Returns:
            A sorted list containing every distinct word of both counters.
        """
        return sorted(self._wordFreq.keys() | other._wordFreq.keys())

    def populateFrequency(self, word_vector):
        """Return this counter's frequency for each word in *word_vector*.

        Args:
            word_vector: Iterable of words defining the vector layout.

        Returns:
            A list of counts in the same order as *word_vector*; words this
            counter has never seen contribute ``0``.
        """
        return [self._wordFreq.get(word, 0) for word in word_vector]

    def vectorSimilarity(self, other, function=None):
        """Compare this counter with *other* using *function*.

        Both counters are projected onto the sorted union of their
        vocabularies, so the two frequency vectors have the same length and
        the same word at each position.

        Args:
            other: Another ``WordCounter``.
            function: Callable taking the two frequency lists (self first,
                other second). When omitted (or ``None``) the module-level
                ``list_cos_sim`` is used.

        Returns:
            Whatever *function* returns for the two frequency vectors.
        """
        if function is None:
            # Looked up lazily so the class definition does not depend on
            # list_cos_sim already existing at import time.
            function = list_cos_sim

        resultingVectorKeys = self.unionKeySets(other)
        thisVector = self.populateFrequency(resultingVectorKeys)
        thatVector = other.populateFrequency(resultingVectorKeys)
        return function(thisVector, thatVector)
|