#!/usr/bin/env python3
# -*- encoding: utf-8 -*-
"""Build, analyze and export a document-citation graph.

Pipeline: crawl documents starting from a root document, record which
documents mention which (``generate_graph``), compute graph metrics
(degree, distance matrices, connectivity), and export the result to
JSON / GraphML / SQLite / CSV / Graphviz / matplotlib renderings
(``convert_outputs``).
"""
import json
import sqlite3
import networkx
import graphviz
import multiprocessing
from pathlib import Path
import matplotlib.pyplot as plt
from concurrent.futures import ProcessPoolExecutor
from .documents import fromFile as DocumentFromFile
from .document_finder import classes as docClasses
from .document_finder import find_references as referenceFinder

INFINITY = float('inf')
# NOTE: an iterator over an empty list — always exhausted, kept only for
# backward compatibility with any external importers of this constant.
EMPTY_ITER = iter(list())
# Fill colors for quadrants 1..4 (index = quadrant - 1).
QUADRANT_COLOR = ['#7DB643', '#43B5B5', '#7C43B5', '#B54343']


def find_rootdoc(rootdoc='rootdoc.txt'):
    """Load the root document descriptor.

    ``rootdoc`` is a two-line text file: the document source (a key of
    ``docClasses``) and the document identifier.  Returns a dict with the
    node name used in the graph, plus the live cache-manager ``object``.
    """
    rootsrc, rootname = Path(rootdoc).read_text().splitlines()
    docCchMgr = docClasses[rootsrc](rootname)
    docPath = docCchMgr.cached()
    currName = f"{docCchMgr.__class__.__name__}: {docCchMgr._identifier}"
    if docPath is not None:
        # [6:] presumably strips a fixed "cache/"-like path prefix — TODO confirm.
        currName = str(docPath)[6:]
    return {
        'name': currName,
        'generic_name': str(docCchMgr),
        'type': docCchMgr.__class__.__name__,
        'doc_id': docCchMgr._identifier,
        'filepath': str(docPath),
        'object': docCchMgr,
    }


def generate_graph(rootdoc='rootdoc.txt', grapfn='graph.json', keep_temporal_context=True):
    """Crawl documents starting at the root and write the graph to ``grapfn``.

    The graph is a dict keyed by node name; each node records document
    metadata and ``mention_freq`` (cited node name -> citation count).
    Pending documents are processed cached-first, then by slowness, so
    cheap work happens early.
    """
    rootsrc, rootname = Path(rootdoc).read_text().splitlines()
    analyzedDocPaths = list()
    pendingDocCchMgr = [docClasses[rootsrc](rootname)]
    graph = dict()
    while len(pendingDocCchMgr) > 0:
        docCchMgr, *pendingDocCchMgr = pendingDocCchMgr
        docPath = docCchMgr.cached()
        currName = f"{docCchMgr.__class__.__name__}: {docCchMgr._identifier}"
        if docPath is not None:
            currName = str(docPath)[6:]
        if currName not in graph:
            graph[currName] = {
                'name': currName,
                'generic_name': str(docCchMgr),
                'type': docCchMgr.__class__.__name__,
                'doc_id': docCchMgr._identifier,
                'monitored': False if docPath is None else docPath.exists(),
                'pub_date': docCchMgr.publication_date(docPath),
                'in_force': docCchMgr.is_in_force(docPath),
                'filepath': str(docPath),
                'mention_freq': dict(),
            }
        if docPath in analyzedDocPaths:
            continue
        analyzedDocPaths.append(docPath)
        docFF = DocumentFromFile(str(docPath))
        print(f"Document @ {currName}")
        if docFF is None:
            continue
        doc = docFF.parsed_from_cache(str(docPath)[6:])
        newReferences = referenceFinder(doc, docCchMgr.context(docPath))
        if not keep_temporal_context:
            # sic: "whithout" is the project API's spelling.
            newReferences = list(map(lambda a: a.whithout_temporal_context(), newReferences))
        for newReference in newReferences:
            newDocPath = newReference.cached()
            newName = f"{newReference.__class__.__name__}: {newReference._identifier}"
            if newDocPath is not None:
                newName = str(newDocPath)[6:]
            graph[currName]['mention_freq'][newName] = \
                graph[currName]['mention_freq'].get(newName, 0) + 1
        # Re-sort the queue: cached docs first, then fastest sources.
        pendingDocCchMgr = sorted(
            [*pendingDocCchMgr, *newReferences],
            key=lambda dcm: (not dcm.is_cached(), dcm.slowness(), dcm._identifier)
        )
    Path(grapfn).write_text(json.dumps(graph))


def dijkstra(graph, initial, hops_mode=False):
    """Single-source shortest paths over ``mention_freq`` edges.

    ``hops_mode=True`` counts every edge as weight 1; otherwise the
    mention frequency is the edge weight.  Returns ``(visited, path)``:
    distance per reachable node, and predecessor per node.
    """
    visited = {initial: 0}
    path = dict()
    nodes = set(graph.keys())
    mentions = {node: list(graph[node]['mention_freq'].items()) for node in nodes}
    while len(nodes) > 0:
        # Linear scan for the unsettled node with the smallest distance.
        min_node = None
        for node in nodes:
            if node in visited:
                if min_node is None:
                    min_node = node
                elif visited[node] < visited[min_node]:
                    min_node = node
        if min_node is None:
            break  # remaining nodes are unreachable
        nodes.remove(min_node)
        current_weight = visited[min_node]
        for edge, possible_weight in mentions[min_node]:
            weight = current_weight + (1 if hops_mode else possible_weight)
            if edge not in visited or weight < visited[edge]:
                visited[edge] = weight
                path[edge] = min_node
    return visited, path


class Dijkstra:
    """Picklable callable wrapper so ``dijkstra`` can run in a process pool."""

    def __init__(self, graph, hops_mode=False):
        self._graph = graph
        self._hops_mode = hops_mode

    def __call__(self, initial):
        return dijkstra(self._graph, initial, self._hops_mode)


def dijkstra_min_path(dijkstra_tuple, initial, target):
    """Reconstruct the shortest path ``initial`` -> ``target``.

    ``dijkstra_tuple`` is the ``(visited, path)`` pair returned by
    ``dijkstra``.  Returns ``(node_list, distance)``, or ``([], None)``
    when the target is unreachable.
    """
    visited, path = dijkstra_tuple
    min_path = list()
    current = target
    if current in path or current == initial:
        while current is not None:
            min_path.append(current)
            current = path.get(current)
        return (list(reversed(min_path)), visited[target])
    return ([], None)


def embed_metrics(graph):
    """Compute basic counts plus in/out degree and weight per node."""
    metrics = dict()
    metrics['basic'] = dict()
    metrics['basic']['node_count'] = len(graph)
    metrics['basic']['vertex_count'] = 0
    metrics['basic']['vertex_weight_sum'] = 0
    for node in graph.values():
        metrics['basic']['vertex_count'] += len(node['mention_freq'])
        metrics['basic']['vertex_weight_sum'] += sum(node['mention_freq'].values())
    metrics['matrix_labels'] = list(graph.keys())
    metrics['degree'] = dict()
    for key, node in graph.items():
        metric = dict()
        metric['degree_out'] = len(node['mention_freq'])
        metric['weight_out'] = sum(node['mention_freq'].values())
        metric['degree_in'] = 0
        metric['weight_in'] = 0
        # Scan every node for mentions of `key` (O(V) per node).
        for node2 in graph.values():
            count = node2['mention_freq'].get(key, 0)
            if count > 0:
                metric['degree_in'] += 1
                metric['weight_in'] += count
        metrics['degree'][key] = metric
    return metrics


def embed_metrics_distance(graph, metrics):
    """All-pairs distance matrices (hop count and weighted) via Dijkstra.

    Runs one single-source Dijkstra per node across a process pool.
    Unreachable pairs are encoded as -1.
    """
    distance = dict()
    matrix_labels = metrics['matrix_labels']
    # Context-manage the pool so workers are reclaimed even on error.
    with ProcessPoolExecutor(multiprocessing.cpu_count()) as tpe:
        print("Slow Dijkstra: Hops")
        results = list(tpe.map(Dijkstra(graph, True), matrix_labels))
        distance['distance_matrix_hops'] = [[
            results[pos][0].get(target, -1) for target in matrix_labels
        ] for pos in range(len(matrix_labels))]
        del results
        print("Slow Dijkstra: Weight")
        results = list(tpe.map(Dijkstra(graph, False), matrix_labels))
        distance['distance_matrix_weight'] = [[
            results[pos][0].get(target, -1) for target in matrix_labels
        ] for pos in range(len(matrix_labels))]
        del results
    return distance


def embed_metrics_connectivity(graph, metrics, g, namefield):
    """Edge- and node-connectivity of the networkx graph ``g``.

    ``graph``, ``metrics`` and ``namefield`` are currently unused but kept
    for interface compatibility with the other ``embed_metrics_*`` helpers.
    """
    connectivity = dict()
    print('connectivity_edge')
    connectivity['connectivity_edge'] = networkx.edge_connectivity(g)
    print('connectivity_node')
    connectivity['connectivity_node'] = networkx.node_connectivity(g)
    return connectivity


def get_transition_map(graph):
    """Map each node to the list of nodes that mention it (reverse edges)."""
    transitions = [(source, target)
                   for source, nd in graph.items()
                   for target in nd['mention_freq'].keys()]
    transmap = dict()
    for s, t in transitions:
        if t not in transmap:
            transmap[t] = list()
        transmap[t].append(s)
    return transmap


def find_all_paths(tm, initial, target, accumulator=None):
    """Yield every simple path ``initial`` -> ``target`` through map ``tm``."""
    if accumulator is None:
        accumulator = list()
    accumulator = [*accumulator, initial]
    if initial == target:
        yield accumulator
    else:
        # .get(): nodes that are never mentioned have no entry in tm.
        for intermediate in tm.get(initial, ()):
            if intermediate not in accumulator:
                yield from find_all_paths(tm, intermediate, target, accumulator)


def find_all_loopy_paths(graph, node):
    """Yield every simple cycle that starts and ends at ``node``."""
    tm = get_transition_map(graph)
    accumulator = [node]
    for intermediate in tm.get(node, ()):
        yield from find_all_paths(tm, intermediate, node, accumulator)


def get_reverse_transition_map(graph, sequential):
    """Like ``get_transition_map`` but with nodes as indices into ``sequential``.

    Returns a tuple of tuples: entry ``t`` lists the indices of nodes
    that mention node ``t``.
    """
    # Precompute name -> index once instead of O(n) list.index per edge.
    index_of = {name: pos for pos, name in enumerate(sequential)}
    transitions = [
        (index_of[source], index_of[target])
        for source, nd in graph.items()
        for target in nd['mention_freq'].keys()
    ]
    revtransmap = [list() for _ in sequential]
    for s, t in transitions:
        revtransmap[t].append(s)
    return tuple([tuple(i) for i in revtransmap])


def find_all_loopy_paths_reversedly(graph, node, sequential):
    """Yield every simple cycle through ``node`` as lists of indices."""
    revtransmap = get_reverse_transition_map(graph, sequential)
    accumulator = [sequential.index(node)]
    for intermediate in revtransmap[accumulator[0]]:
        yield from find_all_paths_reversedly(revtransmap, intermediate,
                                             accumulator[0], accumulator)


def find_all_paths_reversedly(revtransmap, initial, target, accumulator=None):
    """Yield simple index paths, prepending at each step (reverse order)."""
    if accumulator is None:
        accumulator = list()
    accumulator = [initial, *accumulator]
    if initial == target:
        yield accumulator
    else:
        for intermediate in revtransmap[initial]:
            if intermediate not in accumulator:
                yield from find_all_paths_reversedly(revtransmap, intermediate,
                                                     target, accumulator)


def find_related_to_root(graph, root, sequential=None):
    """Collect (and print) every cycle through the root document."""
    print(root)
    lst = list()
    if sequential is None:
        sequential = list(graph.keys())
    sequential = tuple(sequential)
    for item in find_all_loopy_paths_reversedly(graph, root['name'], sequential):
        item = [sequential[i] for i in item]
        print(item)
        lst.append(item)
    print()
    print(lst)
    print()
    return lst


def get_quadrant(x, y, lx, ly):
    """Return the quadrant (1..4) of point (x, y) relative to cutoffs (lx, ly).

    Quadrant 1 is top-right (both >= cutoff), numbered counterclockwise;
    quadrant 3 is bottom-left (both below cutoff).
    """
    if x < lx and y < ly:
        return 3
    elif x >= lx and y < ly:
        return 4
    elif x >= lx and y >= ly:
        return 1
    else:
        return 2


def draw_degree_quadrants(graph, degrees, key):
    """Scatter-plot in-degree vs out-degree and annotate quadrant counts.

    ``key`` is ``'degree'`` or ``'weight'`` (selects ``{key}_in``/``{key}_out``
    from ``degrees``).  Draws onto the current matplotlib figure; the caller
    is responsible for saving/closing it.  Returns the centroid, the
    half-range cutoffs and per-quadrant point counts.
    """
    quadrants = dict()
    points = [
        (degree[f'{key}_in'], degree[f'{key}_out'])
        for degree in degrees.values()
    ]
    xs, ys = list(zip(*points))
    maxx = max(xs)
    minx = min(xs)
    maxy = max(ys)
    miny = min(ys)
    avgx = sum(xs) / len(xs)
    avgy = sum(ys) / len(ys)
    # Half of the value range (NOT the midpoint) — exported as 'halfrange'.
    midx = (maxx - minx) / 2
    midy = (maxy - miny) / 2
    quads = [0, 0, 0, 0]
    for point in points:
        quads[get_quadrant(*point, midx, midy) - 1] += 1
    plt.figure(figsize=(12, 9), dpi=300)
    plt.scatter(*list(zip(*points)), color='blue', alpha=.1)
    # Red cross = centroid; green cross = half-range cutoffs.
    plt.plot([minx, maxx], [avgy, avgy], color='red', alpha=.5)
    plt.plot([avgx, avgx], [miny, maxy], color='red', alpha=.5)
    plt.plot([minx, maxx], [midy, midy], color='green', alpha=.5)
    plt.plot([midx, midx], [miny, maxy], color='green', alpha=.5)
    plt.text(1.5 * midx, 1.5 * midy, str(quads[0]), color='green')
    plt.text(0.5 * midx, 1.5 * midy, str(quads[1]), color='green')
    plt.text(0.5 * midx, 0.5 * midy, str(quads[2]), color='green')
    plt.text(1.5 * midx, 0.5 * midy, str(quads[3]), color='green')
    plt.text(0, -maxy / 9, 'x=%.2f; y=%.2f' % (avgx, avgy), color='red')
    plt.text(1.5 * midx, -maxy / 9, 'x=%.2f; y=%.2f' % (midx, midy), color='green')
    plt.xlabel(f"{key} in")
    plt.ylabel(f"{key} out")
    quadrants['centroid'] = {'x': avgx, 'y': avgy}
    quadrants['halfrange'] = {'x': midx, 'y': midy}
    quadrants['halfrange_quadrants'] = quads
    return quadrants


def _write_quadrant_csv(filename, graph, metrics, label_key, key, hr, weight,
                        skip_src_quad3=False, skip_dst_quad3=False):
    """Write one quadrant-colored edge-list CSV.

    ``hr`` is the (x, y) half-range cutoff pair; ``weight`` adds a weight
    column; the ``skip_*_quad3`` flags drop edges whose source/destination
    falls in quadrant 3 (low in, low out).
    """
    with open(filename, 'w') as file:
        fmt = ','.join(['%s'] * (4 + int(weight))) + '\n'
        file.write(fmt % ("source", "target", *(["weight"] * int(weight)),
                          "source_color", "target_color"))
        for node in graph.values():
            node_src_nm = node['name']
            src_metric = metrics['degree'][node_src_nm]
            if skip_src_quad3 and get_quadrant(
                    src_metric[f'{key}_in'], src_metric[f'{key}_out'], *hr) == 3:
                continue
            for node_dst_nm, frequency in node['mention_freq'].items():
                dst_metric = metrics['degree'][node_dst_nm]
                if skip_dst_quad3 and get_quadrant(
                        dst_metric[f'{key}_in'], dst_metric[f'{key}_out'], *hr) == 3:
                    continue
                file.write(fmt % (
                    graph[node_src_nm][label_key],
                    graph[node_dst_nm][label_key],
                    *([frequency] * int(weight)),
                    QUADRANT_COLOR[get_quadrant(src_metric[f'{key}_in'],
                                                src_metric[f'{key}_out'], *hr) - 1],
                    QUADRANT_COLOR[get_quadrant(dst_metric[f'{key}_in'],
                                                dst_metric[f'{key}_out'], *hr) - 1],
                ))


def convert_outputs(prefix, temporal_context):
    """Generate (if needed) and export one graph "flavor" to every format.

    Outputs, all named ``{prefix}*``: graph + metrics JSON, GraphML
    (un/weighted), SQLite DB + SQL dump, CSV edge lists, Graphviz source,
    matplotlib renderings, root-node JSON and quadrant plots/CSVs.
    Appends ``prefix`` to flavors.json; most steps are skipped when their
    output file already exists.
    """
    Path("flavors.json").write_text(
        json.dumps(
            [
                *json.loads(Path("flavors.json").read_text()),
                prefix
            ],
            indent=4
        )
    )
    label_key = 'name' if temporal_context else 'generic_name'
    if not Path(f'{prefix}.json').exists():
        generate_graph(grapfn=f'{prefix}.json', keep_temporal_context=temporal_context)
    graph = json.loads(Path(f'{prefix}.json').read_text())
    if not Path(f'{prefix}_metrics.json').exists():
        Path(f'{prefix}_metrics.json').write_text(json.dumps(embed_metrics(graph), indent=2))
    metrics = json.loads(Path(f'{prefix}_metrics.json').read_text())
    if not Path(f'{prefix}_metrics_distances.json').exists():
        Path(f'{prefix}_metrics_distances.json').write_text(
            json.dumps(embed_metrics_distance(graph, metrics)))
    # to_networkx
    g = networkx.DiGraph()
    g.add_nodes_from([node[label_key] for node in graph.values()])
    g.add_edges_from([
        (node_source[label_key], graph[target][label_key])
        for node_source in graph.values()
        for target in node_source['mention_freq'].keys()
    ])
    networkx.write_graphml(g, f'{prefix}_unweighted.graphml')
    for src in graph.values():
        srcnm = src[label_key]
        for tgt, w in src['mention_freq'].items():
            tgtnm = graph[tgt][label_key]
            g[srcnm][tgtnm]['weight'] = w
    networkx.write_graphml(g, f'{prefix}_weighted.graphml')
    # Round-trip sanity check: both files must be readable again.
    g = networkx.DiGraph(networkx.read_graphml(f'{prefix}_unweighted.graphml'))
    g = networkx.DiGraph(networkx.read_graphml(f'{prefix}_weighted.graphml'))
    # to_sqlite
    if Path(f'{prefix}.db').exists():
        Path(f'{prefix}.db').unlink()
    sqldb = sqlite3.connect(f'{prefix}.db')
    cur = sqldb.cursor()
    cur.execute('''CREATE TABLE node (
        name VARCHAR(255),
        generic_name VARCHAR(255),
        type VARCHAR(255),
        doc_id VARCHAR(255),
        monitored bool,
        pub_date VARCHAR(255),
        in_force bool)''')
    cur.execute('''CREATE TABLE edge (
        node_src INTEGER,
        node_dst INTEGER,
        mentions INTEGER,
        FOREIGN KEY(node_src) REFERENCES node(rowid)
            ON UPDATE CASCADE ON DELETE CASCADE,
        FOREIGN KEY(node_dst) REFERENCES node(rowid)
            ON UPDATE CASCADE ON DELETE CASCADE)''')
    # Views match the id/label and source/target/weight schema Gephi expects.
    cur.execute(f'''CREATE VIEW nodes AS
        SELECT rowid as id, {label_key} as label FROM node''')
    cur.execute('''CREATE VIEW edges AS
        SELECT rowid as id, node_src as source, node_dst as target,
               mentions as weight FROM edge''')
    node_name_to_id = dict()
    for node in graph.values():
        cur.execute(
            '''INSERT INTO node(
                name, generic_name, type, doc_id, monitored, pub_date, in_force
            ) VALUES(?,?,?,?,?,?,?)''',
            (
                node['name'], node['generic_name'], node['type'],
                node['doc_id'], node['monitored'], node['pub_date'],
                node['in_force']
            )
        )
        node_name_to_id[node['name']] = cur.lastrowid
    for node in graph.values():
        node_src_nm = node['name']
        node_src = node_name_to_id[node_src_nm]
        for node_dst_nm, frequency in node['mention_freq'].items():
            node_dst = node_name_to_id[node_dst_nm]
            cur.execute(
                '''INSERT INTO edge(node_src,node_dst,mentions) VALUES(?,?,?)''',
                (node_src, node_dst, frequency)
            )
    cur.close()
    sqldb.commit()
    Path(f'{prefix}.sql').write_text('\n'.join(sqldb.iterdump()))
    sqldb.close()
    # to_csv
    with open(f'{prefix}.csv', 'w') as file:
        file.write('%s,%s,%s\n' % ("source", "target", "weight"))
        for node in graph.values():
            node_src_nm = node['name']
            for node_dst_nm, frequency in node['mention_freq'].items():
                file.write('%s,%s,%d\n' % (
                    graph[node_src_nm][label_key],
                    graph[node_dst_nm][label_key],
                    frequency
                ))
    # to_graphviz
    gv = graphviz.Digraph()
    for node in graph.values():
        gv.node(
            str(node_name_to_id[node['name']]),
            label='\n'.join(list(map(str, filter(
                lambda a: a is not None,
                [node['type'], node['doc_id'], node['pub_date']]
            ))))
        )
    for node in graph.values():
        node_src_nm = node['name']
        node_src = node_name_to_id[node_src_nm]
        for node_dst_nm, frequency in node['mention_freq'].items():
            node_dst = node_name_to_id[node_dst_nm]
            gv.edge(str(node_src), str(node_dst), str(frequency))
    gv.save(f'{prefix}.gv')  # takes "forever" to render, "never" finishes
    # connectivity
    g = networkx.DiGraph(networkx.read_graphml(f'{prefix}_unweighted.graphml'))
    if not Path(f'{prefix}_metrics_connectivity.json').exists():
        Path(f'{prefix}_metrics_connectivity.json').write_text(json.dumps(
            embed_metrics_connectivity(graph, metrics, g, label_key), indent=2))
    # matplotlib rendering
    if not Path(f'{prefix}_unweighted.pdf').exists() or not Path(f'{prefix}_unweighted.png').exists():
        g = networkx.DiGraph(networkx.read_graphml(f'{prefix}_unweighted.graphml'))
        networkx.draw(g)
        plt.savefig(f'{prefix}_unweighted.pdf')
        plt.savefig(f'{prefix}_unweighted.png')
        plt.close()
    if not Path(f'{prefix}_weighted.pdf').exists() or not Path(f'{prefix}_weighted.png').exists():
        g = networkx.DiGraph(networkx.read_graphml(f'{prefix}_weighted.graphml'))
        networkx.draw(g)
        plt.savefig(f'{prefix}_weighted.pdf')
        plt.savefig(f'{prefix}_weighted.png')
        plt.close()
    # Leave root document explicit
    if not Path(f'{prefix}_root.json').exists():
        Path(f'{prefix}_root.json').write_text(json.dumps(
            graph[find_rootdoc()['name']]
        ))
    # Plot quadrants
    for weight in [True, False]:
        desc = ('un' * int(not weight)) + 'weighted'
        if not Path(f'{prefix}_quads_{desc}.pdf').exists() or not Path(f'{prefix}_quads_{desc}.png').exists():
            key = 'weight' if weight else 'degree'
            dimen_cutoff = draw_degree_quadrants(graph, metrics['degree'], key)
            plt.savefig(f'{prefix}_quads_{desc}.pdf', bbox_inches='tight')
            plt.savefig(f'{prefix}_quads_{desc}.png', bbox_inches='tight')
            # FIX: was read_text(...), which never persisted the cutoffs the
            # CSV loop below depends on (and passed JSON as an encoding name).
            Path(f'{prefix}_quads_{desc}.json').write_text(json.dumps(dimen_cutoff, indent=4))
    for weight in [True, False]:
        desc = ('un' * int(not weight)) + 'weighted'
        # `if True or ...`: CSVs are deliberately always regenerated.
        if True or not Path(f'{prefix}_quads_{desc}.csv').exists():
            key = 'weight' if weight else 'degree'
            dimen_cutoff = json.loads(Path(f'{prefix}_quads_{desc}.json').read_text())
            hr = (dimen_cutoff['halfrange']['x'], dimen_cutoff['halfrange']['y'])
            _write_quadrant_csv(f'{prefix}_quads_{desc}.csv',
                                graph, metrics, label_key, key, hr, weight)
            _write_quadrant_csv(f'{prefix}_quads_{desc}_nodst3rdquad.csv',
                                graph, metrics, label_key, key, hr, weight,
                                skip_dst_quad3=True)
            _write_quadrant_csv(f'{prefix}_quads_{desc}_nosrc3rdquad.csv',
                                graph, metrics, label_key, key, hr, weight,
                                skip_src_quad3=True)
            _write_quadrant_csv(f'{prefix}_quads_{desc}_no3rdquad.csv',
                                graph, metrics, label_key, key, hr, weight,
                                skip_src_quad3=True, skip_dst_quad3=True)


def main():
    """Reset flavors.json and export both graph flavors."""
    Path("flavors.json").write_text("[]")
    convert_outputs('graph', True)
    convert_outputs('graph_noctx', False)