corpusslayer/plugins/concord/searcher.py

371 lines
12 KiB
Python

#!/usr/bin/env python3
# -*- encoding: utf-8 -*-
# Copyright (c) 2017 Adler Neves <adlerosn@gmail.com>
#
# MIT License
#
# Permission is hereby granted, free of charge, to any person obtaining
# a copy of this software and associated documentation files (the
# "Software"), to deal in the Software without restriction, including
# without limitation the rights to use, copy, modify, merge, publish,
# distribute, sublicense, and/or sell copies of the Software, and to
# permit persons to whom the Software is furnished to do so, subject to
# the following conditions:
#
# The above copyright notice and this permission notice shall be
# included in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
import sys
def mocksearch(query, taggedSentences):
    """Return a fixed, canned result list; neither argument is read.

    The returned list holds the *same* result dict twice (two references
    to one object), with a ``'sentence'`` of [word, tag] pairs and the
    ``'match'`` pairs taken from it.
    """
    sentence = [['A', 'ART'], ['casa', 'N'], ['é', 'V'], ['vermelha', 'N']]
    canned = dict(sentence=sentence, match=[['casa', 'N']])
    return [canned] * 2
class SearchException(Exception):
    """Base class for the exceptions defined by this module."""
class SearchQueryException(SearchException):
    """Search error caused by the query itself (e.g. an unknown part type)."""
import math
#direct translation from JS code; forgive me if the code stinks
def backslashUnescaper(stri):
    """Return *stri* with backslash escapes resolved.

    Each backslash is dropped and the character right after it is kept
    literally (so ``\\\\`` yields one backslash). A backslash that is the
    last character of the string is simply dropped.
    """
    pieces = []
    chars = iter(stri)
    for ch in chars:
        if ch == '\\':
            # Swap the backslash for whatever follows it (nothing at the end).
            ch = next(chars, '')
        pieces.append(ch)
    return ''.join(pieces)
#direct translation from JS code; forgive me if the code stinks
def backslashRemovingEscaped(stri):
    """Return *stri* without its backslashes and without the escaped characters.

    Every backslash is removed together with the single character that
    follows it; only characters that were never escaped remain.
    """
    kept = []
    chars = iter(stri)
    for ch in chars:
        if ch == '\\':
            # Discard the escaped character as well (if there is one).
            next(chars, None)
            continue
        kept.append(ch)
    return ''.join(kept)
def parseInt(var=None):
    """Convert *var* with ``int()``, returning NaN when it cannot be converted.

    Args:
        var: value handed to ``int()``; defaults to ``None``.

    Returns:
        The integer, or ``math.nan`` when ``int(var)`` fails with
        ``TypeError``, ``ValueError`` or ``OverflowError``.

    Only conversion failures are turned into NaN; unrelated exceptions
    (``KeyboardInterrupt``, ``SystemExit``, errors raised by a custom
    ``__int__``...) propagate instead of being silently swallowed.
    """
    try:
        return int(var)
    except (TypeError, ValueError, OverflowError):
        return math.nan
def isNaN(var=None):
    """Return True when *var* is ``None`` or a NaN number, otherwise False.

    Any value other than ``None`` is passed to ``math.isnan``.
    """
    return var is None or math.isnan(var)
#direct translation from JS code; forgive me if the code stinks
def interpretRange(stri, conc=None):
    """Turn a brace-wrapped range string into a skip descriptor.

    The first and last characters of *stri* are dropped without being
    checked; the remainder is split on ',' and at most the first two
    pieces are converted with ``parseInt``.

    Args:
        stri: text such as ``'{3}'``, ``'{1,3}'``, ``'{,3}'`` or ``'{2,}'``.
        conc: optional dict to fill in place; a new dict is used when omitted.

    Returns:
        The dict (``conc`` itself when given) with ``'type'`` set to
        ``'skip'`` for a single value or ``'skiprange'`` for two bounds,
        and ``'text'`` set to the bounds. Pieces that are not integers
        become ``'any'``, except a lower bound, which becomes ``0``.
        Examples: ``'{3}'`` -> ``[3]``; ``'{1,3}'`` -> ``[1, 3]``;
        ``'{,3}'`` -> ``[0, 3]``; ``'{2,}'`` -> ``[2, 'any']``;
        ``'{}'`` -> ``[0, 'any']``; ``'{3,3}'`` -> ``[3]``.
    """
    conclusion = conc if conc is not None else {'type': 'fragment', 'text': []}
    conclusion['type'] = 'skip'

    bounds = [parseInt(piece) for piece in stri[1:-1].split(',')[:2]]

    if len(bounds) > 1:
        lower, upper = bounds[0], bounds[1]
        # Two identical bounds, or two unparsable ones, collapse to one value.
        if lower == upper or (isNaN(lower) and isNaN(upper)):
            bounds = [lower]

    # A lone unparsable value means "zero or more".
    if len(bounds) == 1 and isNaN(bounds[0]):
        bounds = [0, bounds[0]]

    if len(bounds) > 1:
        conclusion['type'] += 'range'

    bounds = ['any' if isNaN(bound) else bound for bound in bounds]

    # An open lower bound starts at zero.
    if len(bounds) > 1 and bounds[0] == 'any':
        bounds[0] = 0

    conclusion['text'] = bounds
    return conclusion
import re
# '?U' or '?L', optionally followed by a braced group made of digits,
# commas, asterisks and spaces. No executable code in the reviewed part of
# this file uses this pattern.
regex_ul = re.compile(r'\?([UL])({[0-9,* ]+})?')
# Leading run of non-dot characters immediately followed by '..'.
regex_be = re.compile(r'^([^.]+)\.\.')
# Run of non-dot characters enclosed between two '..' markers.
regex_mi = re.compile(r'\.\.([^.]+)\.\.')
# Trailing run of non-dot characters immediately preceded by '..'.
regex_en = re.compile(r'\.\.([^.]+)$')
#direct translation from JS code; forgive me if the code stinks
def parseQuery(typed):
    """Split a typed query into parts and classify the head of each part.

    The query is split on single spaces (empty tokens are dropped) and
    every token is split on ``'__'``. The first piece of a token is its
    head; the remaining pieces are kept unchanged after it.

    Args:
        typed: the query string.

    Returns:
        A list with one list per token. Element 0 of each inner list is a
        dict ``{'type': ..., 'text': ...}`` describing the head:

        * ``'ignore'`` - the head is empty; ``'text'`` is ``[]``.
        * ``'fragment'`` - the head contains ``'..'``; ``'text'`` lists
          ``['starts with' | 'contains' | 'ends with', value]`` pairs.
        * ``'skip'`` / ``'skiprange'`` - the head, once escaped characters
          are removed, starts with ``'{'`` and ends with ``'}'``; the dict
          is the one built by ``interpretRange``.
        * ``'plain'`` - anything else; ``'text'`` holds the unescaped head.

    NOTE: ``'?U'`` / ``'?L'`` markers get no special treatment; a head
    containing them is classified by the rules above like any other text.
    """
    fragment_checks = (
        ('starts with', regex_be),
        ('contains', regex_mi),
        ('ends with', regex_en),
    )
    parts = [token.split('__') for token in typed.split(' ') if token != '']
    for part in parts:
        text = part[0]
        # Braces only count as range delimiters when they are not escaped.
        visible = backslashRemovingEscaped(text)
        if text == '':
            conclusion = {'type': 'ignore', 'text': []}
        elif '..' in text:
            checks = []
            for label, pattern in fragment_checks:
                found = pattern.search(text)
                if found:
                    checks.append([label, backslashUnescaper(found.group(1))])
            conclusion = {'type': 'fragment', 'text': checks}
        elif visible.startswith('{') and visible.endswith('}'):
            conclusion = interpretRange(visible)
        else:
            conclusion = {'type': 'plain', 'text': [backslashUnescaper(text)]}
        part[0] = conclusion
    return parts
def aux_skipAdd(val1, val2):
    """Add two skip bounds; the result is ``'any'`` if either bound is ``'any'``."""
    return 'any' if 'any' in (val1, val2) else val1 + val2
def aux_skipNormalize(val):
    """Return a one-element skip ``[x]`` as a new list ``[x, x]``.

    Any other input is returned as the very same object, not a copy.
    """
    return [val[0]] * 2 if len(val) == 1 else val
def aux_skipToRange(val, padding, limit):
    """Build the ``range`` of positions described by a skip, shifted and clamped.

    Args:
        val: skip bounds, either ``[x]`` (treated as ``[x, x]``) or
            ``[lower, upper]`` where ``upper`` may be ``'any'``.
        padding: offset added to both bounds.
        limit: cap applied to both ends; also the value used for an
            ``'any'`` upper bound.

    Returns:
        ``range(min(padding + lower, limit), min(padding + upper, limit))``.

    *val* is never modified. Writing ``limit`` into the caller's list would
    permanently replace an ``'any'`` bound with the limit of the first call,
    so later calls with a larger limit would be capped by the stale value.

    NOTE(review): because the upper end is exclusive, equal bounds (a
    single-value skip such as ``[2]``) produce an empty range - confirm
    that this is the intended meaning of a fixed skip.
    """
    lower = val[0]
    upper = val[0] if len(val) == 1 else val[1]
    if upper == 'any':
        upper = limit
    return range(min(padding + lower, limit), min(padding + upper, limit))
def aux_skipReduce(val):
    """Collapse a ``[x, x]`` pair into a new list ``[x]``.

    A pair with different bounds is returned as the same object.
    """
    low, high = val[0], val[1]
    return [low] if low == high else val
def aux_sumSkips(skip1, skip2):
    """Combine two skip descriptors into one by adding their bounds.

    Both ``'text'`` lists are normalized to pairs, added bound by bound
    with ``aux_skipAdd`` and reduced again with ``aux_skipReduce``.

    Returns:
        A new dict whose ``'type'`` is ``'skiprange'`` when two distinct
        bounds remain and ``'skip'`` otherwise, with the bounds in ``'text'``.
    """
    first = aux_skipNormalize(skip1['text'])
    second = aux_skipNormalize(skip2['text'])
    total = aux_skipReduce([aux_skipAdd(a, b) for a, b in zip(first, second)])
    kind = 'skiprange' if len(total) > 1 else 'skip'
    return {'type': kind, 'text': total}
def optimizeSkips(parsed):
    """Merge every run of adjacent skip parts into a single part, in place.

    Two neighbours whose head type starts with ``'skip'`` are replaced by
    one part containing only the combined descriptor from ``aux_sumSkips``
    (anything after the head in either part is dropped).

    Returns:
        The same *parsed* list that was passed in.
    """
    def is_skip(part):
        return part[0]['type'].startswith('skip')

    index = 0
    while index + 1 < len(parsed):
        if is_skip(parsed[index]) and is_skip(parsed[index + 1]):
            merged = aux_sumSkips(parsed[index][0], parsed[index + 1][0])
            parsed[index:index + 2] = [[merged]]
            # Stay on this index: the merged part may merge with the next one.
            continue
        index += 1
    return parsed
class ResultRanges(object):
    """Collects ``(start, end)`` position pairs.

    ``push(v)`` records the pair ``(currentPos, v)``, where ``currentPos``
    is the value most recently given to ``setCurrentPos``.

    The state lives on the instance. A class-level ``positions`` list would
    be shared by every instance until ``reset()`` is called, so a ``push``
    on one object would show up in all the others.
    """

    def __init__(self):
        # Recorded (currentPos, v) pairs, in push order.
        self.positions = []
        # First element of the pairs recorded by push().
        self.currentPos = 0

    def setCurrentPos(self, v):
        """Set the start position used by subsequent ``push`` calls."""
        self.currentPos = v

    def push(self, v):
        """Record the pair ``(currentPos, v)``."""
        self.positions.append((self.currentPos, v))

    def reset(self):
        """Drop all recorded pairs by starting a fresh list."""
        self.positions = []
class StepFinderIgnore(object):
    """One step of a query chain; this base step accepts any word.

    A step examines the sentence at a given position. Its ``_getTests``
    returns callables that each yield either the index of the last token
    the step consumed, or ``None``/``False`` for "no match". A match that
    also passes the tag check is handed to the next step at ``index + 1``,
    or, on the last step, recorded as the end of a result.

    "No match" is signalled with ``None`` rather than ``0`` because index 0
    is a valid position: using 0 as the sentinel makes it impossible to
    match anything at the first token of a sentence.

    Args:
        _next: the following step, or ``None`` for the last step.
        headcheck: step-specific data used by ``_getTests`` (unused here).
        tagcheck: accepted tags; an empty list accepts every tag.
    """

    def __init__(self, _next=None, headcheck=None, tagcheck=None):
        self._next = _next
        self.ranges = ResultRanges()
        self.headcheck = [] if headcheck is None else headcheck
        self.tagcheck = [] if tagcheck is None else tagcheck

    def test(self, taggedSentence):
        """Run the chain from every start position of *taggedSentence*.

        Returns:
            The list of ``(start, end)`` pairs recorded during this call.
        """
        self.ranges.reset()
        for start in range(len(taggedSentence)):
            self.ranges.setCurrentPos(start)
            self._test(taggedSentence, self.ranges, start)
        return self.results

    def _test(self, taggedSentence, manager, current):
        """Try this step at *current* and continue the chain on success."""
        if current not in range(len(taggedSentence)):
            return
        for candidate in self._getTests(taggedSentence, current):
            pos = candidate()
            # Only None/False mean failure; 0 is a legitimate index.
            if pos is None or pos is False:
                continue
            if not self._checkTags(pos, taggedSentence):
                continue
            if self._next:
                self._next._test(taggedSentence, manager, pos + 1)
            else:
                manager.push(pos + 1)

    def _checkTags(self, pos, taggedSentence):
        """Return True when the token at *pos* exists and has an accepted tag."""
        if pos not in range(len(taggedSentence)):
            return False
        if self.tagcheck:
            return taggedSentence[pos][1] in self.tagcheck
        return True

    def _getTests(self, taggedSentence, current):
        """Return the candidate tests for *current*; here: always match it."""
        # Must yield the position itself: a bare True would be used as
        # index 1 no matter where the step is being tried.
        return [returnsGiven(current)]

    @property
    def results(self):
        """The ``(start, end)`` pairs recorded by the last ``test`` call."""
        return self.ranges.positions


class StepFinderPlain(StepFinderIgnore):
    """Step that matches a word equal to ``headcheck[0]``."""

    def _getTests(self, taggedSentence, current):
        def matches():
            if self.headcheck[0] == taggedSentence[current][0]:
                return current
            return None
        return [matches]


class StepFinderFragment(StepFinderIgnore):
    """Step that matches a word satisfying every ``[kind, value]`` check.

    ``headcheck`` holds pairs whose kind is ``'starts with'``,
    ``'contains'`` or ``'ends with'``; an empty list matches any word.
    """

    def _getTests(self, taggedSentence, current):
        predicates = {
            'starts with': lambda word, piece: word.startswith(piece),
            'contains': lambda word, piece: piece in word,
            'ends with': lambda word, piece: word.endswith(piece),
        }

        def matches():
            word = taggedSentence[current][0]
            if all(predicates[check[0]](word, check[1])
                   for check in self.headcheck):
                return current
            return None
        return [matches]


class returnsGiven:
    """Callable that returns the value it was constructed with.

    Used instead of a lambda inside a loop so that each callable keeps its
    own value rather than the loop variable's final one.
    """

    def __init__(self, toSave):
        self.saved = toSave

    def __call__(self):
        return self.saved


class StepFinderSkip(StepFinderIgnore):
    """Step that consumes a variable number of tokens.

    ``headcheck`` holds the skip bounds; one candidate is produced for each
    position in ``aux_skipToRange(bounds, current, len(sentence))``.
    """

    def _getTests(self, taggedSentence, current):
        # Hand over a copy so the stored bounds can never be altered by the
        # helper and stay valid for sentences of any length.
        bounds = list(self.headcheck)
        positions = aux_skipToRange(bounds, current, len(taggedSentence))
        return [returnsGiven(pos) for pos in positions]
def buildClasses(query):
    """Build the chain of step finders for a parsed query.

    The parts are processed last to first so that every step can be given
    its successor at construction time.

    Args:
        query: list of parts; element 0 of a part is a dict with ``'type'``
            and ``'text'``, the remaining elements are passed to the step
            as its tag filter.

    Returns:
        The step for the first part, or ``None`` when *query* is empty.

    Raises:
        SearchQueryException: a part has a type that is not ``'ignore'``,
            ``'plain'``, ``'fragment'`` or something starting with ``'skip'``.
    """
    curr = None
    for part in query[::-1]:
        head = part[0]
        kind = head['type']
        args = (curr, head['text'], part[1:])
        if kind == 'ignore':
            curr = StepFinderIgnore(*args)
        elif kind == 'plain':
            curr = StepFinderPlain(*args)
        elif kind == 'fragment':
            curr = StepFinderFragment(*args)
        elif kind.startswith('skip'):
            curr = StepFinderSkip(*args)
        else:
            # The type lives in the head dict; indexing the part (a list)
            # with 'type' would raise TypeError instead of this error.
            raise SearchQueryException('Couldn\'t locate type: ' + str(kind))
    return curr
def search(query, taggedSentences, context=-1):
    """Search tagged sentences for a query and describe every match.

    Args:
        query: the query string, parsed with ``parseQuery``.
        taggedSentences: iterable of sentences, each a sequence of
            ``(word, tag)`` pairs.
        context: size of the hinted fragment. ``0`` keeps only the matched
            tokens, a negative value keeps the whole sentence, a positive
            value keeps that many tokens on each side of the match.

    Returns:
        A list with one dict per match:

        * ``'excerpt'`` - the ``(start, end)`` pair (end exclusive);
        * ``'excerptInc'`` - ``[start + 1, end + 1]``;
        * ``'sentence'`` - the sentence the match was found in;
        * ``'match'`` - the matched slice of the sentence;
        * ``'hintedSentence'`` - ``[word, tag, is_in_match]`` per token;
        * ``'fragsize'`` - the *context* argument;
        * ``'hintedFragment'`` - the part of the hinted sentence selected
          by *context*.

        An empty query, or one made only of spaces, yields ``[]``.
    """
    if not query:
        return []
    searcher = buildClasses(optimizeSkips(parseQuery(query)))
    if searcher is None:
        # The query held nothing but spaces, so there is no step to run.
        return []

    results = []
    for taggedSentence in taggedSentences:
        for rng in searcher.test(taggedSentence):
            start, end = rng[0], rng[1]
            if start >= end:
                continue
            matches = taggedSentence[start:end]
            if len(matches) <= 0:
                continue
            hintedSentence = [
                [word, tag, start <= i < end]
                for i, (word, tag) in enumerate(taggedSentence)
            ]
            if context == 0:
                hintedFragment = [[word, tag, True] for word, tag in matches]
            elif context < 0:
                hintedFragment = hintedSentence
            else:
                # Widen the match by `context` tokens, staying in the sentence.
                lower = max(0, start - context)
                upper = min(end + context, len(hintedSentence))
                hintedFragment = hintedSentence[lower:upper]
            results.append({
                'excerpt': rng,
                'excerptInc': [start + 1, end + 1],
                'sentence': taggedSentence,
                'match': matches,
                'hintedSentence': hintedSentence,
                'fragsize': context,
                'hintedFragment': hintedFragment,
            })
    return results