#!/usr/bin/env python3
# -*- encoding: utf-8 -*-
import json
import sqlite3
import networkx
import graphviz
import multiprocessing
from pathlib import Path
import matplotlib.pyplot as plt
from concurrent.futures import ProcessPoolExecutor
from .documents import fromFile as DocumentFromFile
from .document_finder import classes as docClasses
from .document_finder import find_references as referenceFinder
INFINITY = float('inf')  # sentinel for "unreachable"; currently unused in this module
EMPTY_ITER = iter(list())  # a permanently exhausted iterator: `yield from` it is a no-op

def find_rootdoc(rootdoc='rootdoc.txt'):
    # rootdoc.txt holds two lines: a docClasses key and a document identifier.
    rootsrc, rootname = Path(rootdoc).read_text().splitlines()
    docCchMgr = docClasses[rootsrc](rootname)
    docPath = docCchMgr.cached()
    currName = f"{docCchMgr.__class__.__name__}: {docCchMgr._identifier}"
    if docPath is not None:
        # [6:] presumably drops a fixed 6-character cache-directory prefix.
        currName = str(docPath)[6:]
    return {
        'name': currName,
        'generic_name': str(docCchMgr),
        'type': docCchMgr.__class__.__name__,
        'doc_id': docCchMgr._identifier,
        'filepath': str(docPath),
        'object': docCchMgr,
    }
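
# Illustrative sketch only (both values hypothetical, not from the repository):
# rootdoc.txt would contain something like
#
#     some_source_key
#     Some Document 1234
#
# so that find_rootdoc() instantiates docClasses['some_source_key']('Some Document 1234').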

def generate_graph(rootdoc='rootdoc.txt', grapfn='graph.json', keep_temporal_context=True):
    rootsrc, rootname = Path(rootdoc).read_text().splitlines()
    analyzedDocPaths = list()
    pendingDocCchMgr = [docClasses[rootsrc](rootname)]
    graph = dict()
    while len(pendingDocCchMgr) > 0:
        # Pop the most promising pending document (see the sort key below).
        docCchMgr, *pendingDocCchMgr = pendingDocCchMgr
        docPath = docCchMgr.cached()
        currName = f"{docCchMgr.__class__.__name__}: {docCchMgr._identifier}"
        if docPath is not None:
            currName = str(docPath)[6:]
        if currName not in graph:
            graph[currName] = {
                'name': currName,
                'generic_name': str(docCchMgr),
                'type': docCchMgr.__class__.__name__,
                'doc_id': docCchMgr._identifier,
                'monitored': False if docPath is None else docPath.exists(),
                'pub_date': docCchMgr.publication_date(docPath),
                'in_force': docCchMgr.is_in_force(docPath),
                'filepath': str(docPath),
                'mention_freq': dict(),
            }
        if docPath in analyzedDocPaths:
            continue
        analyzedDocPaths.append(docPath)
        docFF = DocumentFromFile(str(docPath))
        print(f"Document @ {currName}")
        if docFF is None:
            continue
        doc = docFF.parsed_from_cache(str(docPath)[6:])
        newReferences = referenceFinder(doc, docCchMgr.context(docPath))
        if not keep_temporal_context:
            # Method name spelled as defined in .document_finder.
            newReferences = list(map(lambda a: a.whithout_temporal_context(), newReferences))
        for newReference in newReferences:
            newDocPath = newReference.cached()
            newName = f"{newReference.__class__.__name__}: {newReference._identifier}"
            if newDocPath is not None:
                newName = str(newDocPath)[6:]
            graph[currName]['mention_freq'][newName] = graph[currName]['mention_freq'].get(newName, 0) + 1
        # Visit already-cached, then fastest-to-fetch documents first.
        pendingDocCchMgr = sorted(
            [*pendingDocCchMgr, *newReferences],
            key=lambda dcm: (not dcm.is_cached(), dcm.slowness(), dcm._identifier)
        )
    Path(grapfn).write_text(json.dumps(graph))
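
# Shape of one entry in the emitted JSON, as built above (values illustrative):
#
#     "cached/path/or/ClassName: id": {
#         "name": ..., "generic_name": ..., "type": ..., "doc_id": ...,
#         "monitored": ..., "pub_date": ..., "in_force": ..., "filepath": ...,
#         "mention_freq": {"<target name>": <times mentioned>}
#     }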

def dijkstra(graph, initial, hops_mode=False):
    # Textbook Dijkstra over the mention graph; edge weight is the mention
    # count, or 1 per edge when hops_mode is set.
    visited = {initial: 0}
    path = dict()
    nodes = set(graph.keys())
    mentions = {node: list(graph[node]['mention_freq'].items()) for node in nodes}
    while len(nodes) > 0:
        # Linear scan for the unsettled node with the smallest distance.
        min_node = None
        for node in nodes:
            if node in visited:
                if min_node is None:
                    min_node = node
                elif visited[node] < visited[min_node]:
                    min_node = node
        if min_node is None:
            break
        nodes.remove(min_node)
        current_weight = visited[min_node]
        for edge, possible_weight in mentions[min_node]:
            weight = current_weight + (1 if hops_mode else possible_weight)
            if edge not in visited or weight < visited[edge]:
                visited[edge] = weight
                path[edge] = min_node
    return visited, path

class Dijkstra:
    # Picklable callable wrapper so ProcessPoolExecutor.map can ship the graph
    # to worker processes (a lambda closure would not pickle).
    def __init__(self, graph, hops_mode=False):
        self._graph = graph
        self._hops_mode = hops_mode

    def __call__(self, initial):
        return dijkstra(self._graph, initial, self._hops_mode)

def dijkstra_min_path(dijkstra_tuple, initial, target):
    # Walks the predecessor map back from target to initial.
    visited, path = dijkstra_tuple
    min_path = list()
    current = target
    if current in path or current == initial:
        while current is not None:
            min_path.append(current)
            current = path.get(current)
        return (list(reversed(min_path)), visited[target])
    return ([], None)
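
# Commented usage sketch on toy data (only the 'mention_freq' key matters here):
#
#     toy = {
#         'a': {'mention_freq': {'b': 2, 'c': 5}},
#         'b': {'mention_freq': {'c': 1}},
#         'c': {'mention_freq': {}},
#     }
#     dt = dijkstra(toy, 'a')                # ({'a': 0, 'b': 2, 'c': 3}, {'b': 'a', 'c': 'b'})
#     dijkstra_min_path(dt, 'a', 'c')        # (['a', 'b', 'c'], 3)
#     dijkstra(toy, 'a', hops_mode=True)[0]  # {'a': 0, 'b': 1, 'c': 1}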

def embed_metrics(graph):
    metrics = dict()
    metrics['basic'] = dict()
    metrics['basic']['node_count'] = len(graph)
    # Despite the name, 'vertex_count' counts directed edges (distinct mentions).
    metrics['basic']['vertex_count'] = 0
    metrics['basic']['vertex_weight_sum'] = 0
    for node in graph.values():
        metrics['basic']['vertex_count'] += len(node['mention_freq'])
        metrics['basic']['vertex_weight_sum'] += sum(node['mention_freq'].values())
    metrics['matrix_labels'] = list(graph.keys())
    metrics['degree'] = dict()
    for key, node in graph.items():
        metric = dict()
        metric['degree_out'] = len(node['mention_freq'].values())
        metric['weight_out'] = sum(node['mention_freq'].values())
        metric['degree_in'] = 0
        metric['weight_in'] = 0
        # In-degree requires scanning every other node's outgoing mentions.
        for node2 in graph.values():
            count = node2['mention_freq'].get(key, 0)
            if count > 0:
                metric['degree_in'] += 1
                metric['weight_in'] += count
        metrics['degree'][key] = metric
    return metrics
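
# On the toy graph above, embed_metrics(toy) yields (hand-checked):
#     basic: node_count=3, vertex_count=3 (directed edges), vertex_weight_sum=8
#     degree['c']: degree_in=2, weight_in=6, degree_out=0, weight_out=0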

def embed_metrics_distance(graph, metrics):
    # All-pairs shortest paths via one Dijkstra per source node, fanned out
    # over a process pool; -1 marks unreachable targets.
    distance = dict()
    matrix_labels = metrics['matrix_labels']
    tpe = ProcessPoolExecutor(multiprocessing.cpu_count())
    print("Slow Dijkstra: Hops")
    dj = Dijkstra(graph, True)
    dijkstra = list(tpe.map(dj, matrix_labels))
    distance['distance_matrix_hops'] = [[
        dijkstra[pos][0].get(target, -1)
        for target in matrix_labels
    ] for pos, initial in enumerate(matrix_labels)]
    del dijkstra
    del dj
    print("Slow Dijkstra: Weight")
    dj = Dijkstra(graph, False)
    dijkstra = list(tpe.map(dj, matrix_labels))
    distance['distance_matrix_weight'] = [[
        dijkstra[pos][0].get(target, -1)
        for target in matrix_labels
    ] for pos, initial in enumerate(matrix_labels)]
    del dijkstra
    del dj
    tpe.shutdown()
    return distance
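
# For the toy graph, matrix_labels is ['a', 'b', 'c']; the hops row for 'a'
# is [0, 1, 1], and unreachable pairs come out as -1 (row for 'c': [-1, -1, 0]).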

def embed_metrics_connectivity(graph, metrics, g, namefield):
    # graph, metrics and namefield are currently unused; connectivity is
    # computed on the networkx DiGraph g.
    connectivity = dict()
    print('connectivity_edge')
    connectivity['connectivity_edge'] = networkx.edge_connectivity(g)
    print('connectivity_node')
    connectivity['connectivity_node'] = networkx.node_connectivity(g)
    return connectivity

def get_transition_map(graph):
    # Reverse adjacency: maps each target to the list of nodes that mention it.
    transitions = [(source, target) for source, nd in graph.items() for target in nd['mention_freq'].keys()]
    transmap = dict()
    for s, t in transitions:
        if t not in transmap:
            transmap[t] = list()
        transmap[t].append(s)
    return transmap
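
# On the toy graph, get_transition_map(toy) == {'b': ['a'], 'c': ['a', 'b']}:
# transmap[t] lists the nodes whose documents mention t.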

def find_all_paths(tm, initial, target, accumulator=None):
    # Depth-first enumeration of all simple paths from initial to target
    # along the reverse-transition map.
    if accumulator is None:
        accumulator = list()
    accumulator = [*accumulator, initial]
    if initial == target:
        yield accumulator
    else:
        # .get tolerates nodes that are never mentioned (absent from tm).
        for intermediate in tm.get(initial, []):
            if intermediate not in accumulator:
                yield from find_all_paths(tm, intermediate, target, accumulator)
    yield from EMPTY_ITER
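
# E.g. list(find_all_paths({'b': ['a'], 'c': ['a', 'b']}, 'c', 'a'))
# yields [['c', 'a'], ['c', 'b', 'a']]: each chain of mentions running from
# 'a' to 'c', read backwards.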

def find_all_loopy_paths(graph, node):
    # Enumerates every cycle through `node`, walking mention edges backwards.
    tm = get_transition_map(graph)
    accumulator = [node]
    for intermediate in tm.get(node, []):
        yield from find_all_paths(tm, intermediate, node, accumulator)
    yield from EMPTY_ITER

def get_reverse_transition_map(graph, sequential):
    # Same reverse adjacency as get_transition_map, but keyed by the node's
    # index in `sequential` and frozen into tuples.
    transitions = [
        (sequential.index(source), sequential.index(target))
        for source, nd in graph.items() for target in nd['mention_freq'].keys()
    ]
    revtransmap = [list() for _ in sequential]
    for s, t in transitions:
        revtransmap[t].append(s)
    return tuple([tuple(i) for i in revtransmap])
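
# Integer-indexed variant of the toy example: with sequential = ('a', 'b', 'c'),
# get_reverse_transition_map(toy, sequential) == ((), (0,), (0, 1)).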

def find_all_loopy_paths_reversedly(graph, node, sequential):
    revtransmap = get_reverse_transition_map(graph, sequential)
    accumulator = [sequential.index(node)]
    for intermediate in revtransmap[accumulator[0]]:
        yield from find_all_paths_reversedly(revtransmap, intermediate, accumulator[0], accumulator)
    yield from EMPTY_ITER

def find_all_paths_reversedly(revtransmap, initial, target, accumulator=None):
    # Like find_all_paths, but prepends to the accumulator, so the yielded
    # path already reads in forward mention order.
    if accumulator is None:
        accumulator = list()
    accumulator = [initial, *accumulator]
    if initial == target:
        yield accumulator
    else:
        for intermediate in revtransmap[initial]:
            if intermediate not in accumulator:
                yield from find_all_paths_reversedly(revtransmap, intermediate, target, accumulator)
    yield from EMPTY_ITER

def find_related_to_root(graph, root, sequential=None):
    # Prints and collects every cycle through the root document, mapping
    # node indices back to their names.
    print(root)
    lst = list()
    if sequential is None:
        sequential = list(graph.keys())
    sequential = tuple(sequential)
    for item in find_all_loopy_paths_reversedly(graph, root['name'], sequential):
        item = [sequential[i] for i in item]
        print(item)
        lst.append(item)
    print()
    print(lst)
    print()
    return lst

def draw_degree_quadrants(graph, degrees, key):
    # Scatter of (in, out) degree or weight per node, with the averages drawn
    # as red cross-hairs splitting the plane into four quadrants.
    points = [
        (degree[f'{key}_in'], degree[f'{key}_out'])
        for degree in degrees.values()
    ]
    xs, ys = list(zip(*points))
    maxx = max(xs)
    minx = min(xs)
    maxy = max(ys)
    miny = min(ys)
    avgx = sum(xs) / len(xs)
    avgy = sum(ys) / len(ys)
    plt.scatter(*list(zip(*points)), color='blue', alpha=.1)
    plt.plot([minx, maxx], [avgy, avgy], color='red', alpha=.5)
    plt.plot([avgx, avgx], [miny, maxy], color='red', alpha=.5)
    plt.text(0, -maxy/6, 'x=%.2f; y=%.2f' % (avgx, avgy), color='red')
    plt.xlabel(f"{key} in")
    plt.ylabel(f"{key} out")
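
# Commented usage sketch on the toy graph: the function only draws onto the
# global matplotlib state; the caller persists and resets it, as
# convert_outputs does below:
#
#     draw_degree_quadrants(toy, embed_metrics(toy)['degree'], 'degree')
#     plt.savefig('toy_quads.png')
#     plt.close()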

def convert_outputs(prefix, temporal_context):
    # Register this flavor in flavors.json, then (re)build each artifact
    # only if its file does not exist yet.
    Path("flavors.json").write_text(
        json.dumps([
            *json.loads(Path("flavors.json").read_text()),
            prefix
        ])
    )
    if not Path(f'{prefix}.json').exists():
        generate_graph(grapfn=f'{prefix}.json', keep_temporal_context=temporal_context)
    graph = json.loads(Path(f'{prefix}.json').read_text())
    if not Path(f'{prefix}_metrics.json').exists():
        Path(f'{prefix}_metrics.json').write_text(json.dumps(embed_metrics(graph), indent=2))
    metrics = json.loads(Path(f'{prefix}_metrics.json').read_text())
    if not Path(f'{prefix}_metrics_distances.json').exists():
        Path(f'{prefix}_metrics_distances.json').write_text(json.dumps(embed_metrics_distance(graph, metrics)))
    # to_networkx
    g = networkx.DiGraph()
    g.add_nodes_from(list(graph.keys()) if temporal_context else [node['generic_name'] for node in graph.values()])
    g.add_edges_from([
        ((node_source['name'], target) if temporal_context else (node_source['generic_name'], graph[target]['generic_name']))
        for node_source in graph.values()
        for target in node_source['mention_freq'].keys()
    ])
    networkx.write_graphml(g, f'{prefix}_unweighted.graphml')
    for src in graph.values():
        srcnm = src['name' if temporal_context else 'generic_name']
        for tgt, w in src['mention_freq'].items():
            tgtnm = graph[tgt]['name' if temporal_context else 'generic_name']
            g[srcnm][tgtnm]['weight'] = w
    networkx.write_graphml(g, f'{prefix}_weighted.graphml')
    # Round-trip sanity check: both GraphML files must parse back.
    g = networkx.DiGraph(networkx.read_graphml(f'{prefix}_unweighted.graphml'))
    g = networkx.DiGraph(networkx.read_graphml(f'{prefix}_weighted.graphml'))
    # to_sqlite
    if Path(f'{prefix}.db').exists():
        Path(f'{prefix}.db').unlink()
    sqldb = sqlite3.connect(f'{prefix}.db')
    cur = sqldb.cursor()
    cur.execute('''CREATE TABLE node (
        name VARCHAR(255),
        generic_name VARCHAR(255),
        type VARCHAR(255),
        doc_id VARCHAR(255),
        monitored bool,
        pub_date VARCHAR(255),
        in_force bool)''')
    cur.execute('''CREATE TABLE edge (
        node_src INTEGER,
        node_dst INTEGER,
        mentions INTEGER,
        FOREIGN KEY(node_src) REFERENCES node(rowid) ON UPDATE CASCADE ON DELETE CASCADE,
        FOREIGN KEY(node_dst) REFERENCES node(rowid) ON UPDATE CASCADE ON DELETE CASCADE)''')
    # Views flattening the graph into the id/label and source/target/weight
    # columns commonly expected by graph tools.
    cur.execute(f'''CREATE VIEW nodes AS
        SELECT
            rowid as id,
            {'name' if temporal_context else 'generic_name'} as label
        FROM node''')
    cur.execute('''CREATE VIEW edges AS
        SELECT
            rowid as id,
            node_src as source,
            node_dst as target,
            mentions as weight
        FROM edge''')
    node_name_to_id = dict()
    for node in graph.values():
        cur.execute(
            '''INSERT INTO node(
                name,
                generic_name,
                type,
                doc_id,
                monitored,
                pub_date,
                in_force
            ) VALUES(?,?,?,?,?,?,?)''',
            (
                node['name'],
                node['generic_name'],
                node['type'],
                node['doc_id'],
                node['monitored'],
                node['pub_date'],
                node['in_force']
            )
        )
        node_name_to_id[node['name']] = cur.lastrowid
    for node in graph.values():
        node_src_nm = node['name']
        node_src = node_name_to_id[node_src_nm]
        for node_dst_nm, frequency in node['mention_freq'].items():
            node_dst = node_name_to_id[node_dst_nm]
            cur.execute(
                '''INSERT INTO edge(node_src,node_dst,mentions) VALUES(?,?,?)''',
                (node_src, node_dst, frequency)
            )
    cur.close()
    sqldb.commit()
    Path(f'{prefix}.sql').write_text('\n'.join(sqldb.iterdump()))
    sqldb.close()
    # to_csv
    with open(f'{prefix}.csv', 'w') as file:
        file.write('%s,%s,%s\n' % ("source", "target", "weight"))
        for node in graph.values():
            node_src_nm = node['name']
            for node_dst_nm, frequency in node['mention_freq'].items():
                file.write('%s,%s,%d\n' % (
                    graph[node_src_nm]['name' if temporal_context else 'generic_name'],
                    graph[node_dst_nm]['name' if temporal_context else 'generic_name'],
                    frequency
                ))
    # to_graphviz
    gv = graphviz.Digraph()
    for node in graph.values():
        gv.node(
            str(node_name_to_id[node['name']]),
            label='\n'.join(list(map(str, filter(
                lambda a: a is not None,
                [node['type'], node['doc_id'], node['pub_date']]
            ))))
        )
    for node in graph.values():
        node_src_nm = node['name']
        node_src = node_name_to_id[node_src_nm]
        for node_dst_nm, frequency in node['mention_freq'].items():
            node_dst = node_name_to_id[node_dst_nm]
            gv.edge(str(node_src), str(node_dst), str(frequency))
    gv.save(f'{prefix}.gv')  # saved only: rendering takes "forever", "never" finishes
    # connectivity
    g = networkx.DiGraph(networkx.read_graphml(f'{prefix}_unweighted.graphml'))
    if not Path(f'{prefix}_metrics_connectivity.json').exists():
        Path(f'{prefix}_metrics_connectivity.json').write_text(json.dumps(
            embed_metrics_connectivity(graph, metrics, g, 'name' if temporal_context else 'generic_name'), indent=2))
    # matplotlib rendering
    if not Path(f'{prefix}_unweighted.pdf').exists() or not Path(f'{prefix}_unweighted.png').exists():
        g = networkx.DiGraph(networkx.read_graphml(f'{prefix}_unweighted.graphml'))
        networkx.draw(g)
        plt.savefig(f'{prefix}_unweighted.pdf')
        plt.savefig(f'{prefix}_unweighted.png')
        plt.close()
    if not Path(f'{prefix}_weighted.pdf').exists() or not Path(f'{prefix}_weighted.png').exists():
        g = networkx.DiGraph(networkx.read_graphml(f'{prefix}_weighted.graphml'))
        networkx.draw(g)
        plt.savefig(f'{prefix}_weighted.pdf')
        plt.savefig(f'{prefix}_weighted.png')
        plt.close()
    # Leave root document explicit
    if not Path(f'{prefix}_root.json').exists():
        Path(f'{prefix}_root.json').write_text(json.dumps(
            graph[find_rootdoc()['name']]
        ))
    # Plot quadrants
    for weight in [True, False]:
        desc = ('un' * int(not weight)) + 'weighted'
        # The `True or` forces the quadrant plots to be regenerated every run.
        if True or not Path(f'{prefix}_quads_{desc}.pdf').exists() or not Path(f'{prefix}_quads_{desc}.png').exists():
            draw_degree_quadrants(
                graph,
                metrics['degree'],
                'weight' if weight else 'degree'
            )
            plt.savefig(f'{prefix}_quads_{desc}.pdf')
            plt.savefig(f'{prefix}_quads_{desc}.png')
            plt.close()
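
# Artifacts produced per prefix by convert_outputs, as written above:
#     {prefix}.json, {prefix}_metrics.json, {prefix}_metrics_distances.json,
#     {prefix}_metrics_connectivity.json, {prefix}_root.json,
#     {prefix}_unweighted/_weighted in .graphml/.pdf/.png,
#     {prefix}.db, {prefix}.sql, {prefix}.csv, {prefix}.gv,
#     {prefix}_quads_weighted.* and {prefix}_quads_unweighted.*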

def main():
    # flavors.json records which graph "flavors" were produced: one keeping
    # the temporal context of references, one discarding it.
    Path("flavors.json").write_text("[]")
    convert_outputs('graph', True)
    convert_outputs('graph_noctx', False)
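
# Usage sketch (assumes the package is importable and rootdoc.txt exists in
# the working directory; the package's actual entry point is not shown here):
#
#     from docRefNetCreator import main
#     main()  # writes graph*.json/.db/.csv/... into the current directory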