unitex-webservice/unitexActions.py

#!/usr/bin/env python3
# -*- encoding: utf-8 -*-
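"""Driver for the Unitex/GramLab command-line tools used by the web service.

Each UnitexActions instance owns a numbered workspace volume that holds the
corpus, a private copy of the language resources, and the JSON files produced
by the preprocessing, text-FST and sequence-automaton pipelines.
"""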
__author__ = "Adler Neves"
__email__ = "adlerosn@gmail.com"
__title__ = None
__description__ = None
__license__ = 'MIT'
__copyright__ = 'Copyright 2017 Adler O. S. Neves'
__version__ = '0.0.1'
import re
import os
import json
import time
import errno
import shutil
import traceback
import pyunitex2
from commandlineLogger import CommandLogger
import unitexOutputParser as uniparser
class UnitexActions(object):
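    """Manage one corpus workspace ("volume") and run Unitex pipelines on it.

    Results are written as JSON files inside the volume directory and are
    exposed through the `status` and `_jsonable` properties.
    """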
    unitex = None
    logger = None
    workspace = "work"
    lang = None
    _executionPlan = list()
    corpus_content = ""
    corpus_diskname = "corpus"
    corpus_directory = ''
    corpus_file = ''
    corpus_sentences = ''
    corpus_wdir = ''
    volume_id = 0
    langNameFs = {
        'ar': 'Arabic',
        'en': 'English',
        'fi': 'Finnish',
        'fr': 'French',
        'ka': 'Georgian (Ancient)',
        'de': 'German',
        'ela': 'Greek (Ancient)',
        'el': 'Greek (Modern)',
        'it': 'Italian',
        'ko': 'Korean',
        'la': 'Latin',
        'mg': 'Malagasy',
        'nb': 'Norwegian (Bokmal)',
        'nn': 'Norwegian (Nynorsk)',
        'po': 'Polish',
        'ptb': 'Portuguese (Brazil)',
        'pt': 'Portuguese (Portugal)',
        'ru': 'Russian',
        'src': 'Serbian-Cyrillic',
        'sr': 'Serbian-Latin',
        'es': 'Spanish',
        'th': 'Thai',
    }
    assets = {
        'snt_grf': '/Graphs/Preprocessing/Sentence/Sentence.grf',
        'snt_fst2': '/Graphs/Preprocessing/Sentence/Sentence.fst2',  # autoderived from above
        'norm_txt': '/Norm.txt',
        'norm_grf': '/Graphs/Normalization/Norm.grf',
        'norm_fst2': '/Graphs/Normalization/Norm.fst2',  # autoderived from above
        'replace_grf': '/Graphs/Preprocessing/Replace/Replace.grf',
        'replace_fst2': '/Graphs/Preprocessing/Replace/Replace.fst2',  # autoderived from above
        'alp_txt': '/Alphabet.txt',
        'alp_sort_txt': '/Alphabet_sort.txt',
        'aec_grf': '/Graphs/auto-ex-corpus.grf',  # output
        # 'regexp_txt': '/regexp.txt',  # user input
        # 'regexp_grf': '/regexp.grf',  # autoderived from above
        # 'regexp_fst2': '/regexp.fst2',  # autoderived from above
        'tagger_bin': '/Dela/tagger_data_cat.bin',
        'tagger_inf': '/Dela/tagger_data_cat.inf',
        'tagger_def': '/Elag/tagset.def',
        'dela_bin_dics': [],
    }
    reopening = False
    @property
    def id(self):
        return self.volume_id
    def __init__(self,
                 corpus_content = None,
                 lang = None,
                 volume_id = None,
                 workspace = None,
                 custom_assets = None,
                 dumbMode = False
                 ):
        if workspace is not None:
            self.workspace = workspace
        if volume_id is not None:
            self.volume_id = volume_id
        else:
            self.volume_id = self._nextVolumeId
        self.lang = lang
        del workspace
        del volume_id
        del lang
        # Copy the mutable class-level defaults so instances do not share state.
        self._executionPlan = list()
        self.assets = dict(self.assets)
        if not dumbMode:
            self.__assert_existance()
        if not dumbMode:
            self.__create_unitex_instance()
            self.__setup_working_directory()
            self.__update_supported_languages()
        if not dumbMode:
            self.__provide_default_linguistic_resources()
            self.__update_asset_definitions()
            self.__setup_corpus_filenames()
            self.__write_corpus_file(corpus_content)
            self.__install_custom_assets(custom_assets)
    def __assert_existance(self):
        if self.lang is None and not os.path.exists(self._workspace_volume(self.volume_id)):
            raise FileNotFoundError(
                errno.ENOENT,
                os.strerror(errno.ENOENT),
                os.path.abspath(self._workspace_volume(self.volume_id))
            )
    def __install_custom_assets(self, custom_assets):
        # Placeholder: installing user-supplied assets is not implemented yet.
        if custom_assets is not None:
            for custom_asset in custom_assets:
                pass
    def __update_supported_languages(self):
        # Placeholder: the supported language list is currently static (langNameFs).
        pass
    def __update_asset_definitions(self):
        self.__update_dela_dics_list()
    def __write_corpus_file(self, corpus_content):
        if corpus_content is None and not os.path.exists(self.corpus_file):
            corpus_content = ''
        if corpus_content is not None:
            with open(self.corpus_file, 'w') as f:
                f.writelines(re.split(r'(\r\n|\r|\n)', corpus_content))
    def __setup_corpus_filenames(self):
        self.corpus_file = self.corpus_directory+self.corpus_diskname+".txt"
        self.corpus_sentences = self.corpus_directory+self.corpus_diskname+".snt"
        self.corpus_wdir = self.corpus_directory+self.corpus_diskname+"_snt"+os.path.sep
        if not os.path.exists(self.corpus_wdir):
            os.makedirs(self.corpus_wdir)
    def __provide_default_linguistic_resources(self):
        if not os.path.exists(self.corpus_workspace):
            shutil.copytree(self.langWorkspaceSource, self.corpus_workspace)
    def __update_dela_dics_list(self):
        self.assets['dela_bin_dics'] = [
            f for f in
            [
                os.path.abspath(self.corpus_workspace+'/Dela/'+file)
                for file in os.listdir(os.path.abspath(self.corpus_workspace+'/Dela'))
            ]
            if os.path.isfile(f) and f.lower().endswith('.bin')
        ]
    def __setup_working_directory(self):
        self.corpus_directory = (
            self._workspaces_container +
            str(self.volume_id) +
            os.path.sep
        )
        if not os.path.exists(self.corpus_directory):
            os.makedirs(self.corpus_directory)
        else:
            self.reopening = True
        self.langWorkspaceSource = (
            os.path.abspath('lang') +
            os.path.sep +
            self.langNameFs.get(self.lang, self.langNameFs['en'])
        )
        self.corpus_workspace = (
            self.corpus_directory +
            "workspace" +
            os.path.sep
        )
        self.__configure_logger()
    def __configure_logger(self):
        self.logger = CommandLogger(
            self.corpus_directory+"log",
            dontTouchPid = self.reopening
        )
    def __create_unitex_instance(self):
        self.unitex = pyunitex2.Unitex(
            None,
            self.__context_dir +
            os.path.sep +
            'UnitexTool'
        )
    @property
    def __context_dir(self):
        return os.path.dirname(os.path.abspath(__file__))
    @property
    def _workspaces_container(self):
        return (
            self.__context_dir +
            os.path.sep +
            str(self.workspace) +
            os.path.sep
        )
    def _workspace_volume(self, num):
        return self._workspaces_container+str(num)
    @property
    def _nextVolumeId(self):
        i = 1
        while os.path.exists(self._workspace_volume(i)):
            i += 1
        return i
    def _preprocess(self):
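        """Normalize, sentence-split and tokenize the corpus, apply the DELA
        dictionaries when any are installed, and write wordlist.json,
        wordfreq.json and sentences.json into the volume directory."""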
        # Preprocessing
        log = self.logger.task("preprocessing_normalize_input")
        log.began = True
        log.result = self.unitex.Normalize(
            self.corpus_file,
            '-r'+os.path.abspath(self.corpus_workspace+self.assets['norm_txt']),
            '--output_offsets='+self.corpus_wdir+'normalize.out.offsets',
            '-qutf8-no-bom'
        )
        log = self.logger.task("preprocessing_converting_graph")
        log.began = True
        log.result = self.unitex.Grf2Fst2(
            os.path.abspath(self.corpus_workspace+self.assets['snt_grf']),
            '-y',
            '--alphabet='+os.path.abspath(self.corpus_workspace+self.assets['alp_txt']),
            '-qutf8-no-bom'
        )
        log = self.logger.task("preprocessing_flattening_graph")
        log.began = True
        log.result = self.unitex.Flatten(
            os.path.abspath(self.corpus_workspace+self.assets['snt_fst2']),
            '--rtn',
            '-d5',
            '-qutf8-no-bom'
        )
        log = self.logger.task("preprocessing_plainifying_graph")
        log.began = True
        log.result = self.unitex.Fst2Txt(
            '-t'+self.corpus_sentences,
            os.path.abspath(self.corpus_workspace+self.assets['snt_fst2']),
            '-a'+os.path.abspath(self.corpus_workspace+self.assets['alp_txt']),
            '-M',
            '--input_offsets='+self.corpus_wdir+'normalize.out.offsets',
            '--output_offsets='+self.corpus_wdir+'normalize.out.offsets',
            '-qutf8-no-bom'
        )
        log = self.logger.task("preprocessing_converting_graph_again")
        log.began = True
        log.result = self.unitex.Grf2Fst2(
            os.path.abspath(self.corpus_workspace+self.assets['replace_grf']),
            '-y',
            '--alphabet='+os.path.abspath(self.corpus_workspace+self.assets['alp_txt']),
            '-qutf8-no-bom'
        )
        log = self.logger.task("preprocessing_flattening_graph_again")
        log.began = True
        log.result = self.unitex.Fst2Txt(
            '-t'+self.corpus_sentences,
            os.path.abspath(self.corpus_workspace+self.assets['replace_fst2']),
            '-a'+os.path.abspath(self.corpus_workspace+self.assets['alp_txt']),
            '-R',
            '--input_offsets='+self.corpus_wdir+'normalize.out.offsets',
            '--output_offsets='+self.corpus_wdir+'normalize.out.offsets',
            '-qutf8-no-bom'
        )
        log = self.logger.task("preprocessing_extracting_tokens")
        log.began = True
        log.result = self.unitex.Tokenize(
            self.corpus_sentences,
            '-a'+os.path.abspath(self.corpus_workspace+self.assets['alp_txt']),
            '--input_offsets='+self.corpus_wdir+'normalize.out.offsets',
            '--output_offsets='+self.corpus_wdir+'tokenize.out.offsets',
            '-qutf8-no-bom'
        )
        if len(self.assets['dela_bin_dics']) > 0:
            log = self.logger.task("preprocessing_using_dictionary")
            log.began = True
            log.result = self.unitex.Dico(*[
                '-t'+self.corpus_sentences,
                '-a'+os.path.abspath(self.corpus_workspace+self.assets['alp_txt']),
            ]+self.assets['dela_bin_dics']+[
                '-qutf8-no-bom'
            ])
            log = self.logger.task("preprocessing_sorting_text_simple")
            log.began = True
            log.result = self.unitex.SortTxt(
                self.corpus_wdir+'dlf',
                '-l'+self.corpus_wdir+'dlf.n',
                '-o'+os.path.abspath(self.corpus_workspace+self.assets['alp_sort_txt']),
                '-qutf8-no-bom'
            )
            log = self.logger.task("preprocessing_sorting_text_compound")
            log.began = True
            log.result = self.unitex.SortTxt(
                self.corpus_wdir+'dlc',
                '-l'+self.corpus_wdir+'dlc.n',
                '-o'+os.path.abspath(self.corpus_workspace+self.assets['alp_sort_txt']),
                '-qutf8-no-bom'
            )
            log = self.logger.task("preprocessing_sorting_text_unknown")
            log.began = True
            log.result = self.unitex.SortTxt(
                self.corpus_wdir+'err',
                '-l'+self.corpus_wdir+'err.n',
                '-o'+os.path.abspath(self.corpus_workspace+self.assets['alp_sort_txt']),
                '-qutf8-no-bom'
            )
            log = self.logger.task("preprocessing_sorting_text_unknown_unique")
            log.began = True
            log.result = self.unitex.SortTxt(
                self.corpus_wdir+'tags_err',
                '-l'+self.corpus_wdir+'tags_err.n',
                '-o'+os.path.abspath(self.corpus_workspace+self.assets['alp_sort_txt']),
                '-qutf8-no-bom'
            )
            log = self.logger.task("preprocessing_wordlist_parsing")
            log.began = True
            with open(self.corpus_wdir+'dlf') as fdlf:
                with open(self.corpus_wdir+'dlc') as fdlc:
                    with open(self.corpus_wdir+'err') as ferr:
                        dlf = fdlf.read().strip().splitlines()
                        dlc = fdlc.read().strip().splitlines()
                        err = ferr.read().strip().splitlines()
                        dlf = [uniparser.parseDelaf(k) for k in dlf]
                        dlc = [uniparser.parseDelaf(k) for k in dlc]
                        with open(self.corpus_directory+'wordlist.json', 'w') as fw:
                            fw.write(
                                json.dumps(
                                    {
                                        'simples': dlf,
                                        'composto': dlc,
                                        'naoReconhecido': err,
                                    }
                                )
                            )
            log.completed = True
        else:
            log = self.logger.task("preprocessing_wordlist_parsing")
            log.began = True
            with open(self.corpus_wdir+'tokens.txt') as fr:
                with open(self.corpus_directory+'wordlist.json', 'w') as fw:
                    fw.write(
                        json.dumps(
                            {
                                'simples': [],
                                'composto': [],
                                'naoReconhecido': sorted(fr.read().splitlines()[1:]),
                            }
                        )
                    )
            log.completed = True
        log = self.logger.task("preprocessing_wordfreq_parsing")
        log.began = True
        with open(self.corpus_wdir+'tok_by_freq.txt') as fr:
            keyPair = [line.split('\t') for line in fr.read().splitlines() if '\t' in line]
            with open(self.corpus_directory+'wordfreq.json', 'w') as fw:
                fw.write(
                    json.dumps(
                        {
                            word: int(freq)
                            for freq, word in keyPair
                        }
                    )
                )
        log.completed = True
        log = self.logger.task("preprocessing_sentences_parsing")
        log.began = True
        #sentenceCount = 0
        with open(self.corpus_sentences) as fr:
            sentences = [sentence.replace('\n', ' ').replace('\r', '') for sentence in fr.read().split('{S}')]
            #sentenceCount = len(sentences)
            with open(self.corpus_directory+'sentences.json', 'w') as fw:
                fw.write(json.dumps(sentences))
        log.completed = True
    def _fstText(self):
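        """Build the text automaton with Txt2Tfst, parse it into fsttext.json
        and tagfreq.json, then run the Tagger and write fsttexttagged.json and
        taggedfreq.json (null contents when tagging fails)."""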
        # FST text
        log = self.logger.task("converting_fst_graph")
        log.result = self.unitex.Grf2Fst2(
            os.path.abspath(self.corpus_workspace+self.assets['norm_grf']),
            '-y',
            '--alphabet='+os.path.abspath(self.corpus_workspace+self.assets['alp_txt']),
            '-qutf8-no-bom'
        )
        log = self.logger.task("building_fst_text")
        log.result = self.unitex.Txt2Tfst(
            self.corpus_sentences,
            '-a'+os.path.abspath(self.corpus_workspace+self.assets['alp_txt']),
            '--clean',
            '-n'+os.path.abspath(self.corpus_workspace+self.assets['norm_fst2']),
            '-qutf8-no-bom'
        )
        log = self.logger.task("parsing_fst_text")
        log.began = True
        with open(self.corpus_wdir+'tokens.txt') as fr2:
            tokens = fr2.read().splitlines()[1:]
            with open(self.corpus_wdir+'text.tfst') as fr:
                tfst = fr.read().splitlines()
                grafos = uniparser.segmentaTfst(tfst)
                grafosParsed = [uniparser.parseTfstSegment(grafo, tokens) for grafo in grafos]
                with open(self.corpus_directory+'fsttext.json', 'w') as fw:
                    fw.write(json.dumps(grafosParsed))
        log.completed = True
        log = self.logger.task("parsing_fst_tagfreq")
        log.began = True
        with open(self.corpus_wdir+'tfst_tags_by_freq.txt') as fr:
            tagfreq = [[x.split('\t')[-1], float(x.split('\t')[0])] for x in fr.read().splitlines()]
            for tag in tagfreq:
                if tag[0].startswith('{') and tag[0].endswith('}'):
                    tag[0] = uniparser.parseDelaf(tag[0][1:-1])
                else:
                    tag[0] = uniparser.delasToDelaf(uniparser.textToDelas(tag[0]))
            with open(self.corpus_directory+'tagfreq.json', 'w') as fw:
                fw.write(json.dumps(tagfreq))
        log.completed = True
        log = self.logger.task("tagging_fst_text")
        log.result = self.unitex.Tagger(
            os.path.abspath(self.corpus_wdir+'text.tfst'),
            '-d'+os.path.abspath(self.corpus_workspace+self.assets['tagger_bin']),
            '-t'+os.path.abspath(self.corpus_workspace+self.assets['tagger_def']),
            '-a'+os.path.abspath(self.corpus_workspace+self.assets['alp_txt']),
            '-qutf8-no-bom'
        )
        taggingFailed = log.errored
        log = self.logger.task("parsing_fst_text_tagged")
        log.began = True
        with open(self.corpus_wdir+'tokens.txt') as fr2:
            tokens = fr2.read().splitlines()[1:]
            with open(self.corpus_wdir+'text.tfst') as fr:
                grafosParsed = None
                if not taggingFailed:
                    tfst = fr.read().splitlines()
                    grafos = uniparser.segmentaTfst(tfst)
                    grafosParsed = [uniparser.parseTfstSegment(grafo, tokens) for grafo in grafos]
                with open(self.corpus_directory+'fsttexttagged.json', 'w') as fw:
                    fw.write(json.dumps(grafosParsed))
        log.completed = True
        log = self.logger.task("parsing_fst_taggedfreq")
        log.began = True
        with open(self.corpus_wdir+'tfst_tags_by_freq.txt') as fr:
            tagfreq = None
            if not taggingFailed:
                tagfreq = [[x.split('\t')[-1], float(x.split('\t')[0])] for x in fr.read().splitlines()]
                for tag in tagfreq:
                    if tag[0].startswith('{') and tag[0].endswith('}'):
                        tag[0] = uniparser.parseDelaf(tag[0][1:-1])
                    else:
                        tag[0] = uniparser.delasToDelaf(uniparser.textToDelas(tag[0]))
            with open(self.corpus_directory+'taggedfreq.json', 'w') as fw:
                fw.write(json.dumps(tagfreq))
        log.completed = True
    def _sequenceAutomata(self):
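        """Build the sequence automaton with Seq2Grf, parse the resulting .grf
        into aec.json; as the last planned step it also marks the logger as
        done."""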
        # Sequence automaton
        log = self.logger.task("seq_auto_process")
        log.result = self.unitex.Seq2Grf(
            self.corpus_file,
            '-o'+os.path.abspath(self.corpus_workspace+self.assets['aec_grf']),
            '-a'+os.path.abspath(self.corpus_workspace+self.assets['alp_txt']),
            '--b',
            '-qutf8-no-bom'
        )
        log = self.logger.task("seq_auto_parsing")
        log.began = True
        with open(self.corpus_workspace+self.assets['aec_grf']) as fr:
            with open(self.corpus_directory+'aec.json', 'w') as fw:
                fw.write(json.dumps(uniparser.parseGrf(fr.read().splitlines())))
        log.completed = True
        self.logger.done = True
    def planPreprocessing(self):
        self.logger.planTasks(10)
        if len(self.assets['dela_bin_dics']) > 0:
            self.logger.planTasks(5)
        self._executionPlan.append(self._preprocess)
    def planFstText(self):
        self.logger.planTasks(7)
        self._executionPlan.append(self._fstText)
    def planSequenceAutomata(self):
        self.logger.planTasks(2)
        self._executionPlan.append(self._sequenceAutomata)
    def planAll_shortcut(self):
        self.planPreprocessing()
        self.planFstText()
        self.planSequenceAutomata()
    def executePlanning(self):
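        """Run the planned steps in order; if a step raises, print the
        traceback and mark the logger as done."""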
        try:
            while len(self._executionPlan) > 0:
                call = self._executionPlan[0]
                del self._executionPlan[0]
                call()
        except Exception:
            traceback.print_exc()
            self.logger.done = True
    @staticmethod
    def get_dumb_one():
        return UnitexActions(dumbMode = True)
    @staticmethod
    def get_languages():
        return UnitexActions.get_dumb_one().langNameFs
    @staticmethod
    def _get_volumes():
        d = UnitexActions.get_dumb_one()
        c = d._workspaces_container
        return [v for v in os.listdir(c) if os.path.isdir(c+v)]
    @staticmethod
    def reopen(proj_id):
        return UnitexActions(volume_id=proj_id, lang=None)
    @staticmethod
    def reopen_all():
        return [UnitexActions.reopen(v) for v in UnitexActions._get_volumes()]
    @staticmethod
    def reopen_list():
        return [v for v in UnitexActions._get_volumes()]
    @property
    def status(self):
        s = dict()
        s['logs'] = self.logger._jsonable
        return s
    @property
    def _jsonable(self):
        j = dict()
        j['status'] = self.status
        j['results'] = {
            'aec': None,
            'fsttext': None,
            'fsttexttagged': None,
            'sentences': None,
            'wordfreq': None,
            'wordlist': None,
            'tagfreq': None,
            'taggedfreq': None,
        }
        for key in j['results'].keys():
            try:
                with open(self.corpus_directory+key+'.json') as f:
                    j['results'][key] = json.loads(f.read())
            except Exception:
                # Result file missing or unreadable; leave the entry as None.
                pass
        j['id'] = self.id
        j['age'] = self.newestAge
        return j
    @property
    def _fileList(self):
        return [
            os.path.join(dirname, filename)
            for dirname, dirnames, filenames in os.walk(self.corpus_directory)
            for filename in filenames
            if not os.path.join(dirname, filename).startswith(self.corpus_workspace)
        ]
    @property
    def oldestFile(self):
        return min(self._fileList, key=lambda fn: os.stat(fn).st_mtime)
    @property
    def newestFile(self):
        return max(self._fileList, key=lambda fn: os.stat(fn).st_mtime)
    @property
    def oldestAge(self):
        return time.time()-os.stat(self.oldestFile).st_mtime
    @property
    def newestAge(self):
        return time.time()-os.stat(self.newestFile).st_mtime
    def delete(self):
        shutil.rmtree(os.path.abspath(self._workspace_volume(self.volume_id)))
    def __repr__(self):
        return json.dumps(self._jsonable)
    def __str__(self):
        return os.path.abspath(self._workspaces_container)
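

if __name__ == '__main__':
    # Minimal usage sketch (illustrative only, not part of the web service):
    # it assumes the UnitexTool binary sits next to this file and that the
    # 'ptb' (Portuguese, Brazil) resources are installed under ./lang, as the
    # constructor expects. The sample sentence is an arbitrary placeholder.
    actions = UnitexActions(corpus_content='O rato roeu a roupa do rei de Roma.', lang='ptb')
    actions.planAll_shortcut()
    actions.executePlanning()
    # __repr__ dumps the accumulated JSON results and log status.
    print(repr(actions))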