#!/usr/bin/env python3
# -*- encoding: utf-8 -*-
"""Drives a Unitex pipeline (preprocessing, FST text construction and
tagging, sequence automata) over a corpus stored in a numbered
workspace volume, dumping each stage's results as JSON files."""

__author__ = "Adler Neves"
__email__ = "adlerosn@gmail.com"
__title__ = None
__description__ = None
__license__ = 'MIT'
__copyright__ = 'Copyright 2017 Adler O. S. Neves'
__version__ = '0.0.1'

import os
import json
import time
import errno
import shutil
import traceback

import pyunitex2
from commandlineLogger import CommandLogger
import unitexOutputParser as uniparser


class UnitexActions(object):
    unitex = None
    logger = None
    workspace = "work"
    lang = None
    corpus_content = ""
    corpus_diskname = "corpus"
    corpus_directory = ''
    corpus_file = ''
    corpus_sentences = ''
    corpus_wdir = ''
    volume_id = 0
    langNameFs = {
        'ar': 'Arabic',
        'en': 'English',
        'fi': 'Finnish',
        'fr': 'French',
        'ka': 'Georgian (Ancient)',
        'de': 'German',
        'ela': 'Greek (Ancient)',
        'el': 'Greek (Modern)',
        'it': 'Italian',
        'ko': 'Korean',
        'la': 'Latin',
        'mg': 'Malagasy',
        'nb': 'Norwegian (Bokmal)',
        'nn': 'Norwegian (Nynorsk)',
        'po': 'Polish',
        'ptb': 'Portuguese (Brazil)',
        'pt': 'Portuguese (Portugal)',
        'ru': 'Russian',
        'src': 'Serbian-Cyrillic',
        'sr': 'Serbian-Latin',
        'es': 'Spanish',
        'th': 'Thai',
    }
    assets = {
        'snt_grf': '/Graphs/Preprocessing/Sentence/Sentence.grf',
        'snt_fst2': '/Graphs/Preprocessing/Sentence/Sentence.fst2',  # auto-derived from the .grf above
        'norm_txt': '/Norm.txt',
        'norm_grf': '/Graphs/Normalization/Norm.grf',
        'norm_fst2': '/Graphs/Normalization/Norm.fst2',  # auto-derived from the .grf above
        'replace_grf': '/Graphs/Preprocessing/Replace/Replace.grf',
        'replace_fst2': '/Graphs/Preprocessing/Replace/Replace.fst2',  # auto-derived from the .grf above
        'alp_txt': '/Alphabet.txt',
        'alp_sort_txt': '/Alphabet_sort.txt',
        'aec_grf': '/Graphs/auto-ex-corpus.grf',  # output
        # 'regexp_txt': '/regexp.txt',    # user input
        # 'regexp_grf': '/regexp.grf',    # auto-derived from the .txt above
        # 'regexp_fst2': '/regexp.fst2',  # auto-derived from the .grf above
        'tagger_bin': '/Dela/tagger_data_cat.bin',
        'tagger_inf': '/Dela/tagger_data_cat.inf',
        'tagger_def': '/Elag/tagset.def',
        'dela_bin_dics': [],
    }
    reopening = False

    @property
    def id(self):
        return self.volume_id

    def __init__(self,
                 corpus_content=None,
                 lang=None,
                 volume_id=None,
                 workspace=None,
                 custom_assets=None,
                 dumbMode=False
                 ):
        if workspace is not None:
            self.workspace = workspace
        if volume_id is not None:
            self.volume_id = volume_id
        else:
            self.volume_id = self._nextVolumeId
        self.lang = lang
        del workspace
        del volume_id
        del lang
        # Per-instance copies: class-level mutables would otherwise be
        # shared (and mutated) across every instance.
        self._executionPlan = []
        self.assets = {**UnitexActions.assets, 'dela_bin_dics': []}
        # A "dumb" instance only exposes metadata (language list, workspace
        # paths) and must not touch the disk or spawn a Unitex instance.
        if not dumbMode:
            self.__assert_existence()
        if not dumbMode:
            self.__create_unitex_instance()
            self.__setup_working_directory()
            self.__update_supported_languages()
        if not dumbMode:
            self.__provide_default_linguistic_resources()
            self.__update_asset_definitions()
            self.__setup_corpus_filenames()
            self.__write_corpus_file(corpus_content)
        self.__install_custom_assets(custom_assets)

    def __assert_existence(self):
        # Reopening (lang=None) requires the volume to already exist on disk.
        if self.lang is None and not os.path.exists(self._workspace_volume(self.volume_id)):
            raise FileNotFoundError(
                errno.ENOENT,
                os.strerror(errno.ENOENT),
                os.path.abspath(self._workspace_volume(self.volume_id))
            )

    def __install_custom_assets(self, custom_assets):
        if custom_assets is not None:
            for custom_asset in custom_assets:
                pass  # placeholder: custom asset installation is not implemented yet

    def __update_supported_languages(self):
        pass  # placeholder: the language list is currently hardcoded in langNameFs

    def __update_asset_definitions(self):
        self.__update_dela_dics_list()

    def __write_corpus_file(self, corpus_content):
        if corpus_content is None and not os.path.exists(self.corpus_file):
            corpus_content = ''
        if corpus_content is not None:
            with open(self.corpus_file, 'w') as f:
                f.write(corpus_content)  # written verbatim; line endings are kept as-is

    def __setup_corpus_filenames(self):
        self.corpus_file = self.corpus_directory+self.corpus_diskname+".txt"
        self.corpus_sentences = self.corpus_directory+self.corpus_diskname+".snt"
        self.corpus_wdir = self.corpus_directory+self.corpus_diskname+"_snt"+os.path.sep
        if not os.path.exists(self.corpus_wdir):
            os.makedirs(self.corpus_wdir)

    def __provide_default_linguistic_resources(self):
        # Seed the volume with the stock resources for the chosen language.
        if not os.path.exists(self.corpus_workspace):
            shutil.copytree(self.langWorkspaceSource, self.corpus_workspace)

    def __update_dela_dics_list(self):
        # Collect every compiled DELA dictionary (*.bin) shipped in the volume.
        self.assets['dela_bin_dics'] = [
            f for f in [
                os.path.abspath(self.corpus_workspace+'/Dela/'+file)
                for file in os.listdir(os.path.abspath(self.corpus_workspace+'/Dela'))
            ]
            if os.path.isfile(f) and f.lower().endswith('.bin')
        ]

    def __setup_working_directory(self):
        self.corpus_directory = (
            self._workspaces_container
            + str(self.volume_id)
            + os.path.sep
        )
        if not os.path.exists(self.corpus_directory):
            os.makedirs(self.corpus_directory)
        else:
            self.reopening = True
        self.langWorkspaceSource = (
            os.path.abspath('lang')
            + os.path.sep
            + self.langNameFs.get(self.lang, self.langNameFs['en'])
        )
        self.corpus_workspace = (
            self.corpus_directory
            + "workspace"
            + os.path.sep
        )
        self.__configure_logger()

    def __configure_logger(self):
        self.logger = CommandLogger(
            self.corpus_directory+"log",
            dontTouchPid=self.reopening
        )

    def __create_unitex_instance(self):
        self.unitex = pyunitex2.Unitex(
            None,
            self.__context_dir+os.path.sep+'UnitexTool'
        )

    @property
    def __context_dir(self):
        return os.path.dirname(os.path.abspath(__file__))

    @property
    def _workspaces_container(self):
        return (
            self.__context_dir
            + os.path.sep
            + str(self.workspace)
            + os.path.sep
        )

    def _workspace_volume(self, num):
        return self._workspaces_container+str(num)

    @property
    def _nextVolumeId(self):
        # Lowest positive integer not yet used as a volume directory.
        i = 1
        while os.path.exists(self._workspace_volume(i)):
            i += 1
        return i

    def _preprocess(self):
        # Preprocessing: normalize, split into sentences, apply the Replace
        # graph, tokenize and (when dictionaries exist) look words up.
        log = self.logger.task("preprocessing_normalize_input")
        log.began = True
        log.result = self.unitex.Normalize(
            self.corpus_file,
            '-r'+os.path.abspath(self.corpus_workspace+self.assets['norm_txt']),
            '--output_offsets='+self.corpus_wdir+'normalize.out.offsets',
            '-qutf8-no-bom'
        )
        log = self.logger.task("preprocessing_converting_graph")
        log.began = True
        log.result = self.unitex.Grf2Fst2(
            os.path.abspath(self.corpus_workspace+self.assets['snt_grf']),
            '-y',
            '--alphabet='+os.path.abspath(self.corpus_workspace+self.assets['alp_txt']),
            '-qutf8-no-bom'
        )
        log = self.logger.task("preprocessing_flattening_graph")
        log.began = True
        log.result = self.unitex.Flatten(
            os.path.abspath(self.corpus_workspace+self.assets['snt_fst2']),
            '--rtn',
            '-d5',
            '-qutf8-no-bom'
        )
        log = self.logger.task("preprocessing_plainifying_graph")
        log.began = True
        log.result = self.unitex.Fst2Txt(
            '-t'+self.corpus_sentences,
            os.path.abspath(self.corpus_workspace+self.assets['snt_fst2']),
            '-a'+os.path.abspath(self.corpus_workspace+self.assets['alp_txt']),
            '-M',
            '--input_offsets='+self.corpus_wdir+'normalize.out.offsets',
            '--output_offsets='+self.corpus_wdir+'normalize.out.offsets',
            '-qutf8-no-bom'
        )
        log = self.logger.task("preprocessing_converting_graph_again")
        log.began = True
        log.result = self.unitex.Grf2Fst2(
            os.path.abspath(self.corpus_workspace+self.assets['replace_grf']),
            '-y',
            '--alphabet='+os.path.abspath(self.corpus_workspace+self.assets['alp_txt']),
            '-qutf8-no-bom'
        )
        log = self.logger.task("preprocessing_flattening_graph_again")
        log.began = True
        log.result = self.unitex.Fst2Txt(
            '-t'+self.corpus_sentences,
            os.path.abspath(self.corpus_workspace+self.assets['replace_fst2']),
            '-a'+os.path.abspath(self.corpus_workspace+self.assets['alp_txt']),
            '-R',
            '--input_offsets='+self.corpus_wdir+'normalize.out.offsets',
            '--output_offsets='+self.corpus_wdir+'normalize.out.offsets',
            '-qutf8-no-bom'
        )
        log = self.logger.task("preprocessing_extracting_tokens")
        log.began = True
        log.result = self.unitex.Tokenize(
            self.corpus_sentences,
            '-a'+os.path.abspath(self.corpus_workspace+self.assets['alp_txt']),
            '--input_offsets='+self.corpus_wdir+'normalize.out.offsets',
            '--output_offsets='+self.corpus_wdir+'tokenize.out.offsets',
            '-qutf8-no-bom'
        )
        if len(self.assets['dela_bin_dics']) > 0:
            log = self.logger.task("preprocessing_using_dictionary")
            log.began = True
            log.result = self.unitex.Dico(*[
                '-t'+self.corpus_sentences,
                '-a'+os.path.abspath(self.corpus_workspace+self.assets['alp_txt']),
            ]+self.assets['dela_bin_dics']+[
                '-qutf8-no-bom'
            ])
            log = self.logger.task("preprocessing_sorting_text_simple")
            log.began = True
            log.result = self.unitex.SortTxt(
                self.corpus_wdir+'dlf',
                '-l'+self.corpus_wdir+'dlf.n',
                '-o'+os.path.abspath(self.corpus_workspace+self.assets['alp_sort_txt']),
                '-qutf8-no-bom'
            )
            log = self.logger.task("preprocessing_sorting_text_compound")
            log.began = True
            log.result = self.unitex.SortTxt(
                self.corpus_wdir+'dlc',
                '-l'+self.corpus_wdir+'dlc.n',
                '-o'+os.path.abspath(self.corpus_workspace+self.assets['alp_sort_txt']),
                '-qutf8-no-bom'
            )
            log = self.logger.task("preprocessing_sorting_text_unknown")
            log.began = True
            log.result = self.unitex.SortTxt(
                self.corpus_wdir+'err',
                '-l'+self.corpus_wdir+'err.n',
                '-o'+os.path.abspath(self.corpus_workspace+self.assets['alp_sort_txt']),
                '-qutf8-no-bom'
            )
            log = self.logger.task("preprocessing_sorting_text_unknown_unique")
            log.began = True
            log.result = self.unitex.SortTxt(
                self.corpus_wdir+'tags_err',
                '-l'+self.corpus_wdir+'tags_err.n',
                '-o'+os.path.abspath(self.corpus_workspace+self.assets['alp_sort_txt']),
                '-qutf8-no-bom'
            )
            log = self.logger.task("preprocessing_wordlist_parsing")
            log.began = True
            # dlf/dlc/err hold the simple-word, compound-word and
            # unknown-word lists produced by Dico.
            with open(self.corpus_wdir+'dlf') as fdlf, \
                    open(self.corpus_wdir+'dlc') as fdlc, \
                    open(self.corpus_wdir+'err') as ferr:
                dlf = fdlf.read().strip().splitlines()
                dlc = fdlc.read().strip().splitlines()
                err = ferr.read().strip().splitlines()
                dlf = [uniparser.parseDelaf(k) for k in dlf]
                dlc = [uniparser.parseDelaf(k) for k in dlc]
                with open(self.corpus_directory+'wordlist.json', 'w') as fw:
                    fw.write(json.dumps({
                        'simples': dlf,
                        'composto': dlc,
                        'naoReconhecido': err,
                    }))
            log.completed = True
        else:
            # Without dictionaries, every token goes out as unrecognized.
            log = self.logger.task("preprocessing_wordlist_parsing")
            log.began = True
            with open(self.corpus_wdir+'tokens.txt') as fr:
                with open(self.corpus_directory+'wordlist.json', 'w') as fw:
                    fw.write(json.dumps({
                        'simples': [],
                        'composto': [],
                        'naoReconhecido': sorted(fr.read().splitlines()[1:]),
                    }))
            log.completed = True
        log = self.logger.task("preprocessing_wordfreq_parsing")
        log.began = True
        with open(self.corpus_wdir+'tok_by_freq.txt') as fr:
            keyPair = [
                line.split('\t')
                for line in fr.read().splitlines()
                if '\t' in line
            ]
            with open(self.corpus_directory+'wordfreq.json', 'w') as fw:
                fw.write(json.dumps({
                    word: int(freq)
                    for freq, word in keyPair
                }))
        log.completed = True
        log = self.logger.task("preprocessing_sentences_parsing")
        log.began = True
        # sentenceCount = 0
        with open(self.corpus_sentences) as fr:
            # Unitex marks sentence boundaries with the {S} token.
            sentences = [
                sentence.replace('\n', ' ').replace('\r', '')
                for sentence in fr.read().split('{S}')
            ]
            # sentenceCount = len(sentences)
            with open(self.corpus_directory+'sentences.json', 'w') as fw:
                fw.write(json.dumps(sentences))
        log.completed = True
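
    # Illustrative shapes of the JSON artifacts written by _preprocess above
    # (example values only; the exact entry structure produced by
    # uniparser.parseDelaf is defined in unitexOutputParser, not here):
    #   wordlist.json:  {"simples": [<parsed DELAF entries>],
    #                    "composto": [<parsed DELAF entries>],
    #                    "naoReconhecido": ["word", ...]}
    #   wordfreq.json:  {"word": 42, ...}         (token -> occurrence count)
    #   sentences.json: ["First sentence.", ...]  (one string per {S} segment)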
    def _fstText(self):
        # FST text: build the text automaton, parse it, tag it, parse again.
        log = self.logger.task("converting_fst_graph")
        log.result = self.unitex.Grf2Fst2(
            os.path.abspath(self.corpus_workspace+self.assets['norm_grf']),
            '-y',
            '--alphabet='+os.path.abspath(self.corpus_workspace+self.assets['alp_txt']),
            '-qutf8-no-bom'
        )
        log = self.logger.task("building_fst_text")
        log.result = self.unitex.Txt2Tfst(
            self.corpus_sentences,
            '-a'+os.path.abspath(self.corpus_workspace+self.assets['alp_txt']),
            '--clean',
            '-n'+os.path.abspath(self.corpus_workspace+self.assets['norm_fst2']),
            '-qutf8-no-bom'
        )
        log = self.logger.task("parsing_fst_text")
        log.began = True
        with open(self.corpus_wdir+'tokens.txt') as fr2:
            tokens = fr2.read().splitlines()[1:]
        with open(self.corpus_wdir+'text.tfst') as fr:
            tfst = fr.read().splitlines()
            grafos = uniparser.segmentaTfst(tfst)
            grafosParsed = [uniparser.parseTfstSegment(grafo, tokens) for grafo in grafos]
            with open(self.corpus_directory+'fsttext.json', 'w') as fw:
                fw.write(json.dumps(grafosParsed))
        log.completed = True
        log = self.logger.task("parsing_fst_tagfreq")
        log.began = True
        with open(self.corpus_wdir+'tfst_tags_by_freq.txt') as fr:
            tagfreq = [
                [x.split('\t')[-1], float(x.split('\t')[0])]
                for x in fr.read().splitlines()
            ]
            for tag in tagfreq:
                if tag[0].startswith('{') and tag[0].endswith('}'):
                    tag[0] = uniparser.parseDelaf(tag[0][1:-1])
                else:
                    tag[0] = uniparser.delasToDelaf(uniparser.textToDelas(tag[0]))
            with open(self.corpus_directory+'tagfreq.json', 'w') as fw:
                fw.write(json.dumps(tagfreq))
        log.completed = True
        log = self.logger.task("tagging_fst_text")
        log.result = self.unitex.Tagger(
            os.path.abspath(self.corpus_wdir+'text.tfst'),
            '-d'+os.path.abspath(self.corpus_workspace+self.assets['tagger_bin']),
            '-t'+os.path.abspath(self.corpus_workspace+self.assets['tagger_def']),
            '-a'+os.path.abspath(self.corpus_workspace+self.assets['alp_txt']),
            '-qutf8-no-bom'
        )
        taggingFailed = log.errored
        log = self.logger.task("parsing_fst_text_tagged")
        log.began = True
        with open(self.corpus_wdir+'tokens.txt') as fr2:
            tokens = fr2.read().splitlines()[1:]
        with open(self.corpus_wdir+'text.tfst') as fr:
            # When tagging failed, null is written instead of a parse.
            grafosParsed = None
            if not taggingFailed:
                tfst = fr.read().splitlines()
                grafos = uniparser.segmentaTfst(tfst)
                grafosParsed = [uniparser.parseTfstSegment(grafo, tokens) for grafo in grafos]
            with open(self.corpus_directory+'fsttexttagged.json', 'w') as fw:
                fw.write(json.dumps(grafosParsed))
        log.completed = True
        log = self.logger.task("parsing_fst_taggedfreq")
        log.began = True
        with open(self.corpus_wdir+'tfst_tags_by_freq.txt') as fr:
            tagfreq = None
            if not taggingFailed:
                tagfreq = [
                    [x.split('\t')[-1], float(x.split('\t')[0])]
                    for x in fr.read().splitlines()
                ]
                for tag in tagfreq:
                    if tag[0].startswith('{') and tag[0].endswith('}'):
                        tag[0] = uniparser.parseDelaf(tag[0][1:-1])
                    else:
                        tag[0] = uniparser.delasToDelaf(uniparser.textToDelas(tag[0]))
            with open(self.corpus_directory+'taggedfreq.json', 'w') as fw:
                fw.write(json.dumps(tagfreq))
        log.completed = True

    def _sequenceAutomata(self):
        # Sequence automaton
        log = self.logger.task("seq_auto_process")
        log.result = self.unitex.Seq2Grf(
            self.corpus_file,
            '-o'+os.path.abspath(self.corpus_workspace+self.assets['aec_grf']),
            '-a'+os.path.abspath(self.corpus_workspace+self.assets['alp_txt']),
            '--b',
            '-qutf8-no-bom'
        )
        log = self.logger.task("seq_auto_parsing")
        log.began = True
        with open(self.corpus_workspace+self.assets['aec_grf']) as fr:
            with open(self.corpus_directory+'aec.json', 'w') as fw:
                fw.write(json.dumps(uniparser.parseGrf(fr.read().splitlines())))
        log.completed = True
        self.logger.done = True
    def planPreprocessing(self):
        # Task counts must match the number of tasks each step will log.
        self.logger.planTasks(10)
        if len(self.assets['dela_bin_dics']) > 0:
            self.logger.planTasks(5)
        self._executionPlan.append(self._preprocess)

    def planFstText(self):
        self.logger.planTasks(7)
        self._executionPlan.append(self._fstText)

    def planSequenceAutomata(self):
        self.logger.planTasks(2)
        self._executionPlan.append(self._sequenceAutomata)

    def planAll_shortcut(self):
        self.planPreprocessing()
        self.planFstText()
        self.planSequenceAutomata()

    def executePlanning(self):
        try:
            while len(self._executionPlan) > 0:
                call = self._executionPlan.pop(0)
                call()
        except Exception:
            # Log the failure, but still mark the run as finished.
            traceback.print_exc()
        self.logger.done = True

    @staticmethod
    def get_dumb_one():
        return UnitexActions(dumbMode=True)

    @staticmethod
    def get_languages():
        return UnitexActions.get_dumb_one().langNameFs

    @staticmethod
    def _get_volumes():
        d = UnitexActions.get_dumb_one()
        c = d._workspaces_container
        return [v for v in os.listdir(c) if os.path.isdir(c+v)]

    @staticmethod
    def reopen(proj_id):
        return UnitexActions(volume_id=proj_id, lang=None)

    @staticmethod
    def reopen_all():
        return [UnitexActions.reopen(v) for v in UnitexActions._get_volumes()]

    @staticmethod
    def reopen_list():
        return UnitexActions._get_volumes()

    @property
    def status(self):
        s = dict()
        s['logs'] = self.logger._jsonable
        return s

    @property
    def _jsonable(self):
        j = dict()
        j['status'] = self.status
        j['results'] = {
            'aec': None,
            'fsttext': None,
            'fsttexttagged': None,
            'sentences': None,
            'wordfreq': None,
            'wordlist': None,
            'tagfreq': None,
            'taggedfreq': None,
        }
        for key in j['results'].keys():
            try:
                with open(self.corpus_directory+key+'.json') as f:
                    j['results'][key] = json.loads(f.read())
            except (OSError, ValueError):
                pass  # missing or unparsable result files are simply left as None
        j['id'] = self.id
        j['age'] = self.newestAge
        return j

    @property
    def _fileList(self):
        # Every file in the volume except the linguistic workspace itself.
        return [
            os.path.join(dirname, filename)
            for dirname, dirnames, filenames in os.walk(self.corpus_directory)
            for filename in filenames
            if not os.path.join(dirname, filename).startswith(self.corpus_workspace)
        ]

    @property
    def oldestFile(self):
        return min(self._fileList, key=lambda fn: os.stat(fn).st_mtime)

    @property
    def newestFile(self):
        return max(self._fileList, key=lambda fn: os.stat(fn).st_mtime)

    @property
    def oldestAge(self):
        return time.time()-os.stat(self.oldestFile).st_mtime

    @property
    def newestAge(self):
        return time.time()-os.stat(self.newestFile).st_mtime

    def delete(self):
        shutil.rmtree(os.path.abspath(self._workspace_volume(self.volume_id)))

    def __repr__(self):
        return json.dumps(self._jsonable)

    def __str__(self):
        return os.path.abspath(self._workspaces_container)
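

# A minimal usage sketch (not part of the original module): it assumes the
# runtime dependencies are deployed next to this file (the 'lang/' resource
# tree, the 'UnitexTool' binary, pyunitex2 and commandlineLogger), which this
# example cannot verify. It creates a fresh volume for an English corpus,
# queues the whole pipeline, runs it, and prints the collected JSON results.
if __name__ == '__main__':
    actions = UnitexActions(
        corpus_content="This is a sentence. This is another one.",
        lang='en'
    )
    actions.planAll_shortcut()  # queue preprocessing, FST text and sequence automata
    actions.executePlanning()   # run the queued steps, logging one task at a time
    print(json.dumps(actions._jsonable, indent=2))  # inspect logs and results
    # actions.delete()          # uncomment to remove the volume afterwards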