From 25d4843f9905febf4ab5a61e7c3dbd41ccb1d8fd Mon Sep 17 00:00:00 2001 From: cwy-p8d-u1 <sven-ove.haensel@stud.hs-hannover.de> Date: Mon, 5 Aug 2024 10:49:01 +0200 Subject: [PATCH] Memgraphs_subs_without_correct_comments --- ...atch_kind_oriented_using_Merge_no_label.py | 458 ++++++++++++++++++ .../Procedure_for_module_in_memgraph.py | 49 ++ .../sub_mem_whole_batch_using_unwind.py | 361 ++++++++++++++ ...hole_batch_using_unwind_edges_in_Thread.py | 369 ++++++++++++++ ..._mem_whole_batch_using_unwind_two_label.py | 356 ++++++++++++++ ...batch_using_unwind_using_procedure_node.py | 362 ++++++++++++++ ...atch_using_unwind_using_procedure_nodes.py | 364 ++++++++++++++ .../sub_mem_whole_batch_kind_oriented.py | 392 +++++++++++++++ ..._mem_whole_batch_kind_oriented_no_Label.py | 387 +++++++++++++++ ...mem_whole_batch_kind_oriented_no_Params.py | 409 ++++++++++++++++ ...mem_whole_batch_kind_oriented_two_label.py | 397 +++++++++++++++ .../sub_mem_whole_batch_subgraph_oriented.py | 446 +++++++++++++++++ ...le_batch_subgraph_oriented_using_Params.py | 430 ++++++++++++++++ ...ubgraph_oriented_using_Params_two_label.py | 428 ++++++++++++++++ 14 files changed, 5208 insertions(+) create mode 100644 code/infrastructure/streaming/clients/sub/memgraph/Merge/sub_mem_whole_batch_kind_oriented_using_Merge_no_label.py create mode 100644 code/infrastructure/streaming/clients/sub/memgraph/Procedure_for_module_in_memgraph.py create mode 100644 code/infrastructure/streaming/clients/sub/memgraph/Unwind/sub_mem_whole_batch_using_unwind.py create mode 100644 code/infrastructure/streaming/clients/sub/memgraph/Unwind/sub_mem_whole_batch_using_unwind_edges_in_Thread.py create mode 100644 code/infrastructure/streaming/clients/sub/memgraph/Unwind/sub_mem_whole_batch_using_unwind_two_label.py create mode 100644 code/infrastructure/streaming/clients/sub/memgraph/Unwind/sub_mem_whole_batch_using_unwind_using_procedure_node.py create mode 100644 code/infrastructure/streaming/clients/sub/memgraph/Unwind/sub_mem_whole_batch_using_unwind_using_procedure_nodes.py create mode 100644 code/infrastructure/streaming/clients/sub/memgraph/kind_oriented/sub_mem_whole_batch_kind_oriented.py create mode 100644 code/infrastructure/streaming/clients/sub/memgraph/kind_oriented/sub_mem_whole_batch_kind_oriented_no_Label.py create mode 100644 code/infrastructure/streaming/clients/sub/memgraph/kind_oriented/sub_mem_whole_batch_kind_oriented_no_Params.py create mode 100644 code/infrastructure/streaming/clients/sub/memgraph/kind_oriented/sub_mem_whole_batch_kind_oriented_two_label.py create mode 100644 code/infrastructure/streaming/clients/sub/memgraph/subgraph_oriented/sub_mem_whole_batch_subgraph_oriented.py create mode 100644 code/infrastructure/streaming/clients/sub/memgraph/subgraph_oriented/sub_mem_whole_batch_subgraph_oriented_using_Params.py create mode 100644 code/infrastructure/streaming/clients/sub/memgraph/subgraph_oriented/sub_mem_whole_batch_subgraph_oriented_using_Params_two_label.py diff --git a/code/infrastructure/streaming/clients/sub/memgraph/Merge/sub_mem_whole_batch_kind_oriented_using_Merge_no_label.py b/code/infrastructure/streaming/clients/sub/memgraph/Merge/sub_mem_whole_batch_kind_oriented_using_Merge_no_label.py new file mode 100644 index 0000000..a5b7d36 --- /dev/null +++ b/code/infrastructure/streaming/clients/sub/memgraph/Merge/sub_mem_whole_batch_kind_oriented_using_Merge_no_label.py @@ -0,0 +1,458 @@ +import datetime +import sys +import paho.mqtt.client as mqtt +import time +import json +from neo4j import 
GraphDatabase
+import os
+
+# Muster
+# CREATE ALL nodes (Using Param)
+# MATCH all lookups (not known yet)
+# CREATE all EDGES
+
+broker_hostname=str(os.getenv('mos_host',default="localhost"))
+broker_port = int(os.getenv('mos_port',default=1883))
+client = mqtt.Client(mqtt.CallbackAPIVersion.VERSION1,"memgraph")
+db_uri =str(os.getenv('mem_host',default="bolt://localhost:8687")) #local test port 8687, not the local 7687
+neo4j_auth=("","")
+abort_time_limit = int(os.getenv('abort_time_limit', default=99999))
+create_indices = True #os.getenv('create_indices', 'False').lower() in ['true', '1', 't', 'y', 'yes']
+on_disk = False
+analytic = True
+retries=int(os.getenv('lines_per_window',default=1000))
+
+def flatten_obj(key, val, target):
+    complexType = type(val)
+    if val is None:
+        return None
+    elif complexType is dict:
+        for k, v in val.items():
+            if "com.bbn.tc.schema.avro.cdm18.UUID" not in k: # node of an edge is not needed as an attribute
+                new_key = f"{key}_{k}" if key else k
+                flatten_obj(new_key, v, target)
+    elif complexType is list:
+        for i in range(len(val)):
+            v = val[i]
+            new_key = f"{key}_{i}" if key else str(i)
+            flatten_obj(new_key, v, target)
+    elif complexType is object:
+        for k, v in val.__dict__.items():
+            new_key = f"{key}_{k}" if key else k
+            flatten_obj(new_key, v, target)
+    else:
+        if "properties_map_arg_mem_flags" in key:
+            # string to list; make separate attribute/value pairs for the objects of the list
+            # (eval on incoming data is risky; ast.literal_eval would be the safer choice)
+            values_list = eval(val)
+            cleaned_values = [value.strip("'") for value in values_list]
+
+            index = 0
+            for value in cleaned_values:
+                index += 1
+                target[f"{key}_{index}"] = value
+        else:
+            target[key] = val
+
+
+# Data for the whole batch, kept in one object so it doesn't get copied when used in another function.
+# Holds a dictionary of all known identifiers to look up whether a node is already known,
+# the batch query parts, a counter to create distinct identifiers,
+# a list with the nodes to match and a list with the relations to add,
+# and manages the build of these lists.
+class batchDataHolder:
+    def __init__(self):
+        self.knownNodes = dict()
+        self.create_Nodes = []
+        self.identifierNumber = 0 #counter to create unique identifiers in the query
+        self.lookup_nodes = []
+        self.insert_relations = []
+        self.with_list_identifier = [] #identifiers used in WITH clauses (see append_with)
+
+    def add_entry_Node(self, key, value):
+        self.knownNodes[key] = value
+
+    def get_lookup_nodes(self):
+        return 'MERGE '.join(self.lookup_nodes)
+
+    def get_insert_relations(self):
+        return ', '.join(self.insert_relations)
+
+    def is_empty_lookup_nodes(self):
+        return not self.lookup_nodes
+
+    def is_empty_insert_relations(self):
+        return not self.insert_relations
+
+    def get_knownNodes(self):
+        return self.knownNodes
+
+    def get_Node_value(self, key):
+        return self.knownNodes[key]
+
+    def check_key(self, key):
+        return key in self.knownNodes
+
+    def append_with(self, ident):
+        self.with_list_identifier.append(ident)
+
+    def get_Identifier_Increment_add_Node(self, type, key): #create a new identifier
+        identifier = type + str(self.identifierNumber)
+        self.identifierNumber += 1
+        self.add_entry_Node(key, identifier)
+        return identifier
+
+    def append_create_Nodes(self, add):
+        self.create_Nodes.append(add)
+
+    def get_create_Nodes(self):
+        return ' '.join(self.create_Nodes)
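+    # Hedged sketch of the query this holder assembles per batch (hypothetical
+    # uuids and identifiers, not values from the data set):
+    #   MERGE (hu0:Node {uuid: "u-host"}) MERGE (_1:Node {uuid: "u-subj"})
+    #   ON CREATE SET _1.nodeType = 'Subject', ... ON MATCH SET ...
+    #   CREATE (_1) -[:runsOn]-> (hu0)
+    # Note that get_lookup_nodes() joins its entries with 'MERGE ', so the caller
+    # has to prepend the first MERGE keyword itself.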
+    def nodes_relations_to_cypher(self, val, newIdentifier, rel, type_letter, type): #type is either a label like ':Host'/':Subject' or '' when unknown;
+        #the parameter is currently unused: nodes are always merged with the generic :Node label
+        try:
+            if self.check_key(val): # node is already known, use the same identifier
+                ident = self.get_Node_value(val)
+                self.insert_relations.append(f'''({newIdentifier}) -[:{rel}]-> ({ident})''')
+            else:
+                ident = self.get_Identifier_Increment_add_Node(type_letter, val) # get an identifier and add it to knownNodes
+                s = f'({ident}:Node {{uuid: "{val}"}})' # can't match on the type, because the node may already exist without its type being known,
+                # so a MERGE with the type would create a second node with the same uuid
+                self.lookup_nodes.append(s)
+                self.insert_relations.append(f'''({newIdentifier}) -[:{rel}]-> ({ident})''')
+        except Exception:
+            print("nodes_relations_to_cypher failed for prefix", type_letter)
+
+
+nodes_count = 0
+
+def parse_json_to_cypher(data_list):
+    global nodes_count
+
+    if nodes_count % 2000 == 0:
+        print("Nodes: ", nodes_count, " Time: ", datetime.datetime.now().strftime("%H:%M:%S"))
+
+    batchData = batchDataHolder()
+    all_values = {}
+
+    for input_string in data_list: # run through the subgraphs of the batch
+
+        nodes_count = nodes_count +1
+
+        input = json.loads(input_string)
+        jsonType = list(input['datum'].keys())[0]
+        # short type string
+        nodeType = jsonType.rsplit(".", 1)[1]
+        # data of object
+        value = input["datum"][jsonType]
+
+        value_flat = {} # TODO value_flat holds the attributes of a single node, but the attributes of all new nodes have to be inserted
+        flatten_obj("",value,value_flat)
+        # newIdentifier is the central node of the current subgraph; each line of the batch is its own new subgraph
+        newIdentifier = batchData.get_Identifier_Increment_add_Node('_', value_flat['uuid'])
+        # makes sure distinct identifiers are used in WITH (one number counter for all kinds of nodes)
+
+        # prepare the ON ... SET values
+        formatted_attrs = []
+
+        # iterate over every attribute in the value of the current key
+        for attr_name, attr_value in value_flat.items():
+            if attr_name == 'uuid':
+                continue
+
+            if isinstance(attr_value, str):
+                formatted_attr = f"{newIdentifier}.{attr_name} = '{attr_value}'"
+            else:
+                formatted_attr = f"{newIdentifier}.{attr_name} = {attr_value}"
+            formatted_attrs.append(formatted_attr)
+
+        # build the final formatted string for the current element
+        all_values[newIdentifier] = f" {', '.join(formatted_attrs)}"
+
+        relations = dict(
+            runsOn=""
+            ,isGeneratedBy=""
+            ,affects=""
+            ,affects2=""
+            ,residesOn=""
+            ,isPartOf="" # not in data set
+            ,hasOwningPrincipal=""
+            ,hasTag="" # not in data set
+            ,hasParent=""
+            ,hasAccountOn=""
+            ,hasLocalPrincipal=""
+            )
+
+        # create relationships
+        try:
+            if nodeType == 'Subject':
+                if value['parentSubject'] != None:
+                    relations.update({'hasParent':value['parentSubject']["com.bbn.tc.schema.avro.cdm18.UUID"]})
+                if value['hostId'] != None:
+                    relations.update({'runsOn':value['hostId']})
+                if value['localPrincipal'] != None:
+                    relations.update({'hasLocalPrincipal':value['localPrincipal']})
+                # the relationship subject -[affects]-> event is missing; probably implicit through isGeneratedBy
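+                # e.g. for one Subject record the dict might end up as (hypothetical uuids):
+                #   {'hasParent': 'u-parent', 'runsOn': 'u-host', 'hasLocalPrincipal': 'u-princ'}
+                # with all remaining keys still ''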
+            elif nodeType == 'FileObject':
+                if value['baseObject'] != None:
+                    relations.update({"residesOn":value['baseObject']['hostId']})
+                    # relations.update({"isPartOf":})
+                if value['localPrincipal']:
+                    relations.update({"hasOwningPrincipal":value['localPrincipal']})
+                    # relations.update({"hasTag":})
+
+            # mapping of CDM fields to relationships of nodes
+            elif nodeType == 'Event':
+                # if value['hostId'] != None:
+                #     relations.update({'runsOn':value['hostId']})
+                if value['subject'] != None:
+                    relations.update({'isGeneratedBy':value['subject']['com.bbn.tc.schema.avro.cdm18.UUID']})
+                if value['predicateObject'] != None:
+                    relations.update({'affects':value['predicateObject']['com.bbn.tc.schema.avro.cdm18.UUID']})
+                if value['predicateObject2'] != None:
+                    relations.update({'affects2':value['predicateObject2']['com.bbn.tc.schema.avro.cdm18.UUID']})
+
+            elif nodeType == 'Principal':
+                if value['hostId'] != None:
+                    relations.update({'hasAccountOn':value['hostId']})
+
+            elif nodeType == 'UnnamedPipeObject':
+                if value['sourceUUID'] != None:
+                    relations.update({'affects':value['sourceUUID']['com.bbn.tc.schema.avro.cdm18.UUID']})
+                if value['sinkUUID'] != None:
+                    relations.update({'affects2':value["sinkUUID"]["com.bbn.tc.schema.avro.cdm18.UUID"]})
+                if value['baseObject'] != None:
+                    relations.update({'residesOn':value['baseObject']['hostId']})
+
+            elif nodeType == 'NetFlowObject':
+                if value['baseObject'] != None:
+                    relations.update({'residesOn':value['baseObject']['hostId']})
+
+            elif nodeType == 'SrcSinkObject':
+                if value['baseObject'] != None:
+                    relations.update({'residesOn':value['baseObject']['hostId']})
+
+            # look up existing nodes for the relations
+            for rel in relations.keys():
+                val = relations[rel]
+                if val != '':
+
+                    if rel == "residesOn":
+                        batchData.nodes_relations_to_cypher(val, newIdentifier, rel, 'he', ':Host')
+
+                    elif rel == "runsOn":
+                        batchData.nodes_relations_to_cypher(val, newIdentifier, rel, 'hu', ':Host')
+
+                    elif rel == "isGeneratedBy":
+                        batchData.nodes_relations_to_cypher(val, newIdentifier, rel, 's', ':Subject')
+
+                    elif rel == "hasOwningPrincipal":
+                        batchData.nodes_relations_to_cypher(val, newIdentifier, rel, 'p', ':Principal')
+
+                    elif rel == "affects":
+                        batchData.nodes_relations_to_cypher(val, newIdentifier, rel, 'a', '')
+
+                    elif rel == 'affects2':
+                        batchData.nodes_relations_to_cypher(val, newIdentifier, rel, 'a', '')
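+                    # affects and affects2 both use the prefix 'a'; the identifiers stay
+                    # distinct because get_Identifier_Increment_add_Node increments one
+                    # global counter for every new node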
+                    # the other relations below have no end-node type known in the data set
+                    elif rel == "hasParent":
+                        batchData.nodes_relations_to_cypher(val, newIdentifier, rel, 'not', '')
+
+                    elif rel == 'hasAccountOn':
+                        batchData.nodes_relations_to_cypher(val, newIdentifier, rel, 'not', '')
+
+                    elif rel == 'hasLocalPrincipal':
+                        batchData.nodes_relations_to_cypher(val, newIdentifier, rel, 'not', '')
+
+            batchData.append_create_Nodes(f"""MERGE ({newIdentifier}:Node {{uuid: '{value_flat['uuid']}'}}) ON CREATE SET {newIdentifier}.nodeType = '{nodeType}', {all_values[newIdentifier]} ON MATCH SET {newIdentifier}.nodeType = '{nodeType}', {all_values[newIdentifier]}""")
+        except Exception:
+            print('Exception while mapping relations')
+            print('input: ', input)
+            print('relations: ', relations)
+            break
+
+    if batchData.is_empty_lookup_nodes():
+        query = f""" {batchData.get_create_Nodes()} CREATE {batchData.get_insert_relations()}"""
+    else:
+        query = f""" MERGE {batchData.get_lookup_nodes()} {batchData.get_create_Nodes()} CREATE {batchData.get_insert_relations()}"""
+
+    return query, all_values
+
+
+def create_cypher_query_from_cdm(json):
+    '''
+    Create Cypher queries from a publisher message
+    '''
+    query, value = parse_json_to_cypher(json)
+    return query, value
+
+def on_message(client, userdata, message):
+    '''
+    The callback function for the message listener
+    '''
+    data = json.loads(message.payload.decode("utf-8"))
+    print("Received message from: ",message.topic)
+    q, value = create_cypher_query_from_cdm(data)
+    execute_query(q, value)
+
+first_connect = True
+
+def on_connect(client, userdata, flags, return_code):
+    '''
+    Connecting and subscribing to the Mosquitto topic
+    '''
+    global first_connect
+    if return_code == 0:
+        if (first_connect): #added so it doesn't overwrite the other prints
+            print("connected")
+            first_connect = False
+        client.subscribe("neo4j",qos=1)
+    else:
+        print("could not connect, return code:", return_code)
+        client.failed_connect = True
+
+def connect_to_db(uri,auth):
+    '''
+    Establish the db connection to Memgraph (via the neo4j driver)
+    '''
+    driver = GraphDatabase.driver(uri, auth=auth)
+    with driver.session() as session:
+        print("Cleanup existing data...")
+        session.run("MATCH (n) detach delete n")
+        session.run("RETURN 1 as result")
+        print("Successfully connected to DB...")
+
+        if (analytic):
+            session.run("STORAGE MODE IN_MEMORY_ANALYTICAL;")
+        if(on_disk):
+            session.run("STORAGE MODE ON_DISK_TRANSACTIONAL")
+        # create indices here ....
+        if (create_indices):
+            session.run("CREATE INDEX ON :Node(uuid);") # the MERGE statements match on uuid, so the index has to cover that property
+            session.run("CREATE CONSTRAINT ON (node:Node) ASSERT node.uuid IS UNIQUE;")
+
+    return driver
+
+total_retries = 0
+total_new_retries = 0
+total_cancel = 0
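+# A quick manual check for the storage-mode switch done in connect_to_db above
+# (run in Memgraph Lab or mgconsole, not part of this client):
+#   SHOW STORAGE INFO;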
+def execute_query(query:str, value):
+    '''
+    Execute a Neo4j/Memgraph query with retries.
+
+    Expected parameters:
+    query = query string,
+    value = attributes to be inserted
+    '''
+    global total_retries
+    global total_new_retries
+    global total_cancel
+    retry_delay = 10
+    summary = ''
+    for retry in range(retries):
+        try:
+            with driver.session() as session:
+                # execute the transaction
+                print(f"Count of total retries: {total_retries}, count of new retries: {total_new_retries} \n Total of canceled transactions: {total_cancel} ")
+
+                print("Before sending: ", datetime.datetime.now().strftime("%H:%M:%S"))
+                result = session.run(query+";")
+                #result = session.run(query+";", value)
+                summary = result.consume() # makes sure the query was actually run by the database
+                print("After sending: ", datetime.datetime.now().strftime("%H:%M:%S"))
+                # on success, leave the retry loop
+                break
+        except Exception as e:
+            if retry == 0:
+                total_new_retries = total_new_retries +1
+            print(f"Transaction failed (attempt {retry+1}): {e}")
+            if retry < retries - 1:
+                print(f"Waiting {retry_delay} seconds before the next attempt...")
+                time.sleep(retry_delay)
+                total_retries = total_retries +1
+            else:
+                total_cancel = total_cancel+1
+                raise Exception("Maximum number of retries reached, transaction failed.")
+
+    # with driver.session() as session:
+    #     result = session.run(query, value) # maybe result = session.run(query, **value)
+    return summary
+
+driver = connect_to_db(db_uri,neo4j_auth)
+
+# client.username_pw_set(username="user_name", password="password") # uncomment if you use password auth
+client.on_connect = on_connect
+client.on_message = on_message
+client.failed_connect = False
+
+client.connect(broker_hostname, broker_port,keepalive=3600*4)
+client.loop_start()
+
+# this try-finally block ensures that whenever we terminate the program early by hitting ctrl+c, it still exits gracefully
+try:
+    i = 0
+    while i < abort_time_limit: #and client.failed_connect == False:
+        time.sleep(1)
+        i += 1
+        if client.failed_connect == True:
+            print('Connection failed, exiting...')
+
+finally:
+    client.disconnect()
+    client.loop_stop()
+    driver.close()
+
diff --git a/code/infrastructure/streaming/clients/sub/memgraph/Procedure_for_module_in_memgraph.py b/code/infrastructure/streaming/clients/sub/memgraph/Procedure_for_module_in_memgraph.py
new file mode 100644
index 0000000..7703735
--- /dev/null
+++ b/code/infrastructure/streaming/clients/sub/memgraph/Procedure_for_module_in_memgraph.py
@@ -0,0 +1,49 @@
+# In Memgraph Lab open Query Modules, enter a name for a new module, create it,
+# and replace the auto-generated content with the code below.
+
+# MATCH (n {uuid: 'g'})
+# CALL <module_name>.bfs_non_only_host_neighbors(n, 3) YIELD result
+# RETURN result;
+
+# The parameters are the start node and the maximum search depth.
+
+import mgp
+
+@mgp.read_proc
+def bfs_non_only_host_neighbors(context: mgp.ProcCtx, start_node: mgp.Vertex, max_depth: int) -> mgp.Record(result=mgp.List[mgp.Vertex]):
+    queue = [(start_node, 0)]  # queue of tuples (node, current_depth)
+    visited = set()
+    non_host_neighbors = set()
+    non_host_neighbors.add(start_node)
+
+    while queue: # repeat while work is left
+        node, depth = queue.pop(0) # load the next node to process
+        if node in visited or depth > max_depth: # skip visited nodes and stop at the depth limit
+            continue
+
+        visited.add(node)
+        for edge in node.in_edges: # load edges where the neighbor points at the current node
+            neighbor = edge.from_vertex
+            if neighbor.properties.get('nodeType') == 'Host' or 'Host' in neighbor.labels: # the node has the label 'Host'
+                if neighbor not in visited:
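+                    # note: Host neighbors are recorded once here but never enqueued,
+                    # so the traversal stops expanding at Host nodes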
+                    non_host_neighbors.add(neighbor)
+                continue # skip neighbors with nodeType 'Host'
+
+            non_host_neighbors.add(neighbor)
+            queue.append((neighbor, depth + 1))
+
+        for edge in node.out_edges: # load edges that point from the current node to a neighbor
+            neighbor = edge.to_vertex
+            if neighbor.properties.get('nodeType') == 'Host' or 'Host' in neighbor.labels:
+                if neighbor not in visited:
+                    non_host_neighbors.add(neighbor)
+                continue # skip neighbors with nodeType 'Host'
+
+            non_host_neighbors.add(neighbor) # add the neighbor to the result
+            queue.append((neighbor, depth + 1)) # add the neighbor's neighbors to the todo list
+
+    return mgp.Record(result=list(non_host_neighbors))
\ No newline at end of file
diff --git a/code/infrastructure/streaming/clients/sub/memgraph/Unwind/sub_mem_whole_batch_using_unwind.py b/code/infrastructure/streaming/clients/sub/memgraph/Unwind/sub_mem_whole_batch_using_unwind.py
new file mode 100644
index 0000000..05ef1b4
--- /dev/null
+++ b/code/infrastructure/streaming/clients/sub/memgraph/Unwind/sub_mem_whole_batch_using_unwind.py
@@ -0,0 +1,361 @@
+import datetime
+import paho.mqtt.client as mqtt
+import time
+import json
+from neo4j import GraphDatabase
+import os
+
+# Muster
+# Unwind -> Do for each row in nodes
+# Create all nodes
+#
+# Unwind -> Do for each start_node_type and end_node_type combination
+# look up the nodes of the edge, with end_node_type where known, otherwise without
+# Create edge
+
+broker_hostname=str(os.getenv('mos_host',default="localhost"))
+broker_port = int(os.getenv('mos_port',default=1883))
+client = mqtt.Client(mqtt.CallbackAPIVersion.VERSION1,"memgraph")
+db_uri =str(os.getenv('mem_host',default="bolt://localhost:8687")) #local test port 8687, not the local 7687
+neo4j_auth=("","")
+abort_time_limit = int(os.getenv('abort_time_limit', default=99999))
+create_indices = True #os.getenv('create_indices', 'False').lower() in ['true', '1', 't', 'y', 'yes']
+
+def flatten_obj(key, val, target):
+    complexType = type(val)
+    if val is None:
+        return None
+    elif complexType is dict:
+        for k, v in val.items():
+            if "com.bbn.tc.schema.avro.cdm18.UUID" not in k: # node of an edge is not needed as an attribute
+                new_key = f"{key}_{k}" if key else k
+                flatten_obj(new_key, v, target)
+    elif complexType is list:
+        for i in range(len(val)):
+            v = val[i]
+            new_key = f"{key}_{i}" if key else str(i)
+            flatten_obj(new_key, v, target)
+    elif complexType is object:
+        for k, v in val.__dict__.items():
+            new_key = f"{key}_{k}" if key else k
+            flatten_obj(new_key, v, target)
+    else:
+        if "properties_map_arg_mem_flags" in key:
+            # string to list; make separate attribute/value pairs for the objects of the list
+            values_list = eval(val)
+            cleaned_values = [value.strip("'") for value in values_list]
+
+            index = 0
+            for value in cleaned_values:
+                index += 1
+                target[f"{key}_{index}"] = value
+        else:
+            target[key] = val
+
+
+class batchDataHolder:
+    def __init__(self):
+        self.nodes = {} # all values of a node ({"properties": value_flat}) grouped under node types
+        self.edges = {} # keys are the relation types; each value is another dictionary keyed by start node type, holding a list of all matching relations
+
+    def add_node_data(self, node_type, node_data):
+        if node_type not in self.nodes:
+            self.nodes[node_type] = []
+        self.nodes[node_type].append(node_data)
+
+    def get_Node_data(self):
+        return self.nodes
+
+    def add_edge_data(self, edge_type, start_type, edges_data):
+        if edge_type not in self.edges:
+            self.edges[edge_type] = {}
+        if start_type not in self.edges[edge_type]:
+            self.edges[edge_type][start_type] = []
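+        # illustrative shape after a few calls (hypothetical uuids):
+        #   {"isGeneratedBy": {"Event": [{"start_id": "u-evt", "end_id": "u-subj"}]},
+        #    "residesOn": {"FileObject": [{"start_id": "u-file", "end_id": "u-host"}]}}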
+        self.edges[edge_type][start_type].append(edges_data)
+
+    def get_edge_data(self):
+        return self.edges
+
+
+nodes_count = 0
+
+def parse_json_to_cypher(data_list):
+    global nodes_count
+
+    batchdata = batchDataHolder()
+
+    #{"sourceUuid": "1", "targetUuid": "2", "type": "RELATED_TO"}
+    if nodes_count % 100000 == 0:
+        print("Nodes: ", nodes_count, " Time: ", datetime.datetime.now().strftime("%H:%M:%S"))
+
+    for input_string in data_list: # run through the subgraphs of the batch
+
+        nodes_count = nodes_count +1
+
+        input = json.loads(input_string)
+        jsonType = list(input['datum'].keys())[0]
+        # short type string
+        nodeType = jsonType.rsplit(".", 1)[1]
+        # data of object
+        value = input["datum"][jsonType]
+
+        value_flat = {} # TODO value_flat holds the attributes of a single node, but the attributes of all new nodes have to be inserted
+        flatten_obj("",value,value_flat)
+
+        batchdata.add_node_data(nodeType, value_flat)
+        source_uuid = value_flat['uuid']
+
+        relations = dict( #save the uuid under the relation type
+            runsOn=""
+            ,isGeneratedBy=""
+            ,affects=""
+            ,affects2=""
+            ,residesOn=""
+            ,isPartOf="" # not in data set
+            ,hasOwningPrincipal=""
+            ,hasTag="" # not in data set
+            ,hasParent=""
+            ,hasAccountOn=""
+            ,hasLocalPrincipal=""
+            )
+
+        # create relationships
+        try:
+            if nodeType == 'Subject':
+                if value['parentSubject'] != None:
+                    relations.update({'hasParent':value['parentSubject']["com.bbn.tc.schema.avro.cdm18.UUID"]})
+                if value['hostId'] != None:
+                    relations.update({'runsOn':value['hostId']})
+                if value['localPrincipal'] != None:
+                    relations.update({'hasLocalPrincipal':value['localPrincipal']})
+                # the relationship subject -[affects]-> event is missing...
probably implicit through is generated by + + elif nodeType == 'FileObject': + if value['baseObject'] != None: + relations.update({"residesOn":value['baseObject']['hostId']}) + # relations.update({"isPartOf":}) + if value['localPrincipal']: + relations.update({"hasOwningPrincipal":value['localPrincipal']}) + # relations.update({"hasTag":}) + + # create relationships for host id + # mapping of cdm fields to relationships of nodes + elif nodeType == 'Event': + # if value['hostId'] != None: + # relations.update({'runsOn':value['hostId']}) + if value['subject'] != None: + relations.update({'isGeneratedBy':value['subject']['com.bbn.tc.schema.avro.cdm18.UUID']}) + if value['predicateObject'] != None: + relations.update({'affects':value['predicateObject']['com.bbn.tc.schema.avro.cdm18.UUID']}) + if value['predicateObject2'] != None: + relations.update({'affects2':value['predicateObject2']['com.bbn.tc.schema.avro.cdm18.UUID']}) + + elif nodeType == 'Principal': + if value['hostId'] != None: + relations.update({'hasAccountOn':value['hostId']}) + + elif nodeType == 'UnnamedPipeObject': + if value['sourceUUID'] != None: + relations.update({'affects':value['sourceUUID']['com.bbn.tc.schema.avro.cdm18.UUID']}) + if value['sinkUUID'] != None: + relations.update({'affects2':value["sinkUUID"]["com.bbn.tc.schema.avro.cdm18.UUID"]}) + if value['baseObject'] != None: + relations.update({'residesOn':value['baseObject']['hostId']}) + + elif nodeType == 'NetFlowObject': + if value['baseObject'] != None: + relations.update({'residesOn':value['baseObject']['hostId']}) + + elif nodeType == 'SrcSinkObject': + if value['baseObject'] != None: + relations.update({'residesOn':value['baseObject']['hostId']}) + + + + + # lookup existing nodes for relations + for rel in relations.keys(): + val = relations[rel] + if val != '': + + + + + batchdata.add_edge_data(rel, nodeType, {"start_id": source_uuid, "end_id": val}) + + + except: + print('Exception') + print('input: ', input) + print('relations: ', relations) + break + + + return batchdata.get_Node_data(), batchdata.get_edge_data() + +def create_nodes(tx, node_type, properties): + if properties: # check if list is not empty + query = ( + f"UNWIND $rows AS row " + f"CREATE (n:{node_type}) " + f"SET n += row" + ) + tx.run(query, rows=properties) + +def create_relationships(tx, relationship_type, start_node_type, relationships): + if relationships: # Check if list is not empty + + end_node_type = "" + if relationship_type =="residesOn": + end_node_type = "Host" + elif relationship_type == "runsOn": + end_node_type = "Host" + elif relationship_type =="isGeneratedBy": + end_node_type = "Subject" + elif relationship_type =="hasOwningPrincipal": + end_node_type = "Principal" + elif relationship_type == "hasParent": + end_node_type = "Subject" + elif relationship_type == 'hasAccountOn': + end_node_type = "Host" + elif relationship_type == 'hasLocalPrincipal': + end_node_type = "Principal" +# elif relationship_type in ["affects", 'affects2', "hasParent", 'hasAccountOn', 'hasLocalPrincipal']: # the correct end node Type not known with these relationtypse +# batchdata.add_edge_data(rel, {"start_node_type": nodeType, "start_id": source_uuid, "end_node_type": '', "end_id": val}) + + + + if end_node_type == "": + query = ( + f"UNWIND $edges AS edge " + f"MATCH (source: {start_node_type}""{uuid: edge.start_id}) " + "MATCH (target {uuid: edge.end_id}) " + f"CREATE (source)-[r:{relationship_type}]->(target)" + ) + else: + query = ( + f"UNWIND $edges AS edge " + f"MATCH (source: 
{start_node_type}""{uuid: edge.start_id}) " + f"MATCH (target: {end_node_type}"" {uuid: edge.end_id}) " + f"CREATE (source)-[r:{relationship_type}]->(target)" + ) + tx.run(query, edges=relationships) + + + + +def create_cypher_query_from_cdm(json): + ''' + Create Cypher Queries from publisher message + ''' + nodes, edges = parse_json_to_cypher(json) + return nodes, edges + +def on_message(client, userdata, message): + ''' + The callback function for message listener + ''' + data = json.loads(message.payload.decode("utf-8")) + print("Received message from: ",message.topic) + nodes, edges = create_cypher_query_from_cdm(data) + execute_query(nodes, edges) + +def on_connect(client, userdata, flags, return_code): + ''' + Connecting and subscribing to the Mosquitto topic + ''' + if return_code == 0: + print("connected") + client.subscribe("neo4j",qos=1) + else: + print("could not connect, return code:", return_code) + client.failed_connect = True + +def connect_to_db(uri,auth): + ''' + Establish db connection to neo4j + ''' + driver = GraphDatabase.driver(uri, auth=auth) + with driver.session() as session: + print("Cleanup existing data...") + session.run("MATCH (n) detach delete n") + session.run("RETURN 1 as result") + print("Successfully connected to DB...") + + # create indices here .... + if (create_indices): + session.run("CREATE INDEX ON :Subject(uuid);") + session.run("CREATE INDEX ON :Event(uuid);") + session.run("CREATE INDEX ON :Host(uuid);") + session.run("CREATE INDEX ON :FileObject(uuid);") + session.run("CREATE INDEX ON :NetFlowObject(uuid);") + session.run("CREATE INDEX ON :SrcSinkObject(uuid);") + session.run("CREATE INDEX ON :UnnamedPipeObject(uuid);") + session.run("CREATE INDEX ON :Principal(uuid);") + #session.run("CREATE INDEX ON :* (uuid);") # global index on all uuid, for the Match where target label unknown + + return driver + + +def execute_query(nodes, edges): + ''' + Execute any Neo4j Query. 
+
+    Expected parameters:
+    nodes = node rows grouped by node type,
+    edges = relation rows grouped by relation type and start node type
+    '''
+    print("Before sending: ", datetime.datetime.now().strftime("%H:%M:%S"))
+    with driver.session() as session:
+
+        for node_type, properties in nodes.items():
+            session.execute_write(create_nodes, node_type, properties)
+
+        for relationship_type, relationships_start_type in edges.items(): # resolve the dictionary keyed by relation type
+            for start_type, relationships in relationships_start_type.items(): # resolve the second dictionary keyed by start_node_type -> needed to use the node types while creating the relations;
+                # without it the MATCH would run over all nodes; better to run over the index of the one node type
+                # own query for each start node type and end type configuration (end node type encoded in relationship_type)
+                session.execute_write(create_relationships, relationship_type, start_type, relationships)
+    print("After sending: ", datetime.datetime.now().strftime("%H:%M:%S"))
+    #result = session.run(queryNode, nodes=nodes) # maybe result = session.run(query, **value)
+    #result2 = session.run(queryEdge, edges=edges)
+    #return result2.data()
+
+driver = connect_to_db(db_uri,neo4j_auth)
+
+# client.username_pw_set(username="user_name", password="password") # uncomment if you use password auth
+client.on_connect = on_connect
+client.on_message = on_message
+client.failed_connect = False
+
+client.connect(broker_hostname, broker_port,keepalive=3600*4)
+client.loop_start()
+
+# this try-finally block ensures that whenever we terminate the program early by hitting ctrl+c, it still exits gracefully
+try:
+    i = 0
+    while i < abort_time_limit: #and client.failed_connect == False:
+        time.sleep(1)
+        i += 1
+        if client.failed_connect == True:
+            print('Connection failed, exiting...')
+
+finally:
+    client.disconnect()
+    client.loop_stop()
+    driver.close()
+
diff --git a/code/infrastructure/streaming/clients/sub/memgraph/Unwind/sub_mem_whole_batch_using_unwind_edges_in_Thread.py b/code/infrastructure/streaming/clients/sub/memgraph/Unwind/sub_mem_whole_batch_using_unwind_edges_in_Thread.py
new file mode 100644
index 0000000..9d3f9bb
--- /dev/null
+++ b/code/infrastructure/streaming/clients/sub/memgraph/Unwind/sub_mem_whole_batch_using_unwind_edges_in_Thread.py
@@ -0,0 +1,369 @@
+import datetime
+import paho.mqtt.client as mqtt
+import time
+import json
+from neo4j import GraphDatabase
+import os
+import concurrent.futures
+
+# Muster
+# Unwind -> Do for each row in nodes
+# Create all nodes
+#
+# Unwind -> Do for each start_node_type and end_node_type combination
+# look up the nodes of the edge, with end_node_type where known, otherwise without
+# Create edge
+
+broker_hostname=str(os.getenv('mos_host',default="localhost"))
+broker_port = int(os.getenv('mos_port',default=1883))
+client = mqtt.Client(mqtt.CallbackAPIVersion.VERSION1,"memgraph")
+db_uri =str(os.getenv('mem_host',default="bolt://localhost:8687")) #local test port 8687, not the local 7687
+neo4j_auth=("","")
+abort_time_limit = int(os.getenv('abort_time_limit', default=99999))
+create_indices = True #os.getenv('create_indices', 'False').lower() in ['true', '1', 't', 'y', 'yes']
+
+def flatten_obj(key, val, target):
+    complexType = type(val)
+    if val is None:
+        return None
+    elif complexType is dict:
+        for k, v in val.items():
+            if "com.bbn.tc.schema.avro.cdm18.UUID" not in k: # node of an edge is not needed as an attribute
+                new_key = f"{key}_{k}" if key else k
+                flatten_obj(new_key, v, target)
+    elif complexType is
list: + for i in range(len(val)): + v = val[i] + new_key = f"{key}_{i}" if key else str(i) + flatten_obj(new_key, v, target) + elif complexType is object: + for k, v in val.__dict__.items(): + new_key = f"{key}_{k}" if key else k + flatten_obj(new_key, v, target) + else: + if "properties_map_arg_mem_flags" in key: + # String to list and make separate Attribute Values pairs for the Objects of list + values_list = eval(val) + cleaned_values = [value.strip("'") for value in values_list] + + index = 0 + for value in cleaned_values: + index += 1 + target[f"{key}_{index}"] = value + else: + target[key] = val + + + +class batchDataHolder: + def __init__(self): + self.nodes = {} # {"properties": value_flat} all Values of a Node sorted under Nodetypes + self.edges = {} # keys are the relationstype value another dictonary with keys being the start node type value a list with all fitting relations + + + def add_node_data(self, node_type, node_data): + if node_type not in self.nodes: + self.nodes[node_type] = [] + self.nodes[node_type].append(node_data) + + def get_Node_data(self): + return self.nodes + + def add_edge_data(self, edge_type, start_type, edges_data): + if edge_type not in self.edges: + self.edges[edge_type] = {} + if start_type not in self.edges[edge_type]: + self.edges[edge_type][start_type] = [] + self.edges[edge_type][start_type].append(edges_data) + + def get_edge_data(self): + return self.edges + + + + + +nodes_count = 0 + +def parse_json_to_cypher(data_list) : + global nodes_count + + batchdata = batchDataHolder() + + #{"sourceUuid": "1", "targetUuid": "2", "type": "RELATED_TO" + if nodes_count % 10000 == 0: + print("Nodes: ", nodes_count, " Time: ", datetime.datetime.now().strftime("%H:%M:%S")) + + for input_string in data_list: # run throuh subgraphs of the Batch + + nodes_count = nodes_count +1 + + input = json.loads(input_string) + jsonType = list(input['datum'].keys())[0] + # short type string + nodeType = jsonType.rsplit(".", 1)[1] + # data of object + value = input["datum"][jsonType] + + value_flat = {} # TODO value_flat sind die Attribute eines Knotens, jedoch müssen alle Attribute aller neuen Knoten eingefügt werden + flatten_obj("",value,value_flat) + # newIdentifier is the central node of the current Subgraph, each line of the Batch is own new Subgraph + + # makes sure in with are differen identifier used (one numbercount for al kind of nodes) + + batchdata.add_node_data(nodeType, value_flat) + #nodes.append({"nodeType": nodeType, "properties": value_flat}) + source_uuid = value_flat['uuid'] + + + relations = dict(#save the uuid to relationtype + runsOn="" + ,isGeneratedBy="" + ,affects="" + ,affects2="" + ,residesOn="" + ,isPartOf="" # not in data set + ,hasOwningPricipal="" + ,hasTag="" # not in data set + ,hasParent="" + ,hasAccountOn="" + ,hasLocalPrincipal="" + ) + + + + + # create relationships + try: + if nodeType == 'Subject': + if value['parentSubject'] != None: + relations.update({'hasParent':value['parentSubject']["com.bbn.tc.schema.avro.cdm18.UUID"]}) + if value['hostId'] != None: + relations.update({'runsOn':value['hostId']}) + if value['localPrincipal'] != None: + relations.update({'hasLocalPrincipal':value['localPrincipal']}) + # the relationship for subject -[affects]-> event is missing... 
probably implicit through is generated by + + elif nodeType == 'FileObject': + if value['baseObject'] != None: + relations.update({"residesOn":value['baseObject']['hostId']}) + # relations.update({"isPartOf":}) + if value['localPrincipal']: + relations.update({"hasOwningPrincipal":value['localPrincipal']}) + # relations.update({"hasTag":}) + + # create relationships for host id + # mapping of cdm fields to relationships of nodes + elif nodeType == 'Event': + # if value['hostId'] != None: + # relations.update({'runsOn':value['hostId']}) + if value['subject'] != None: + relations.update({'isGeneratedBy':value['subject']['com.bbn.tc.schema.avro.cdm18.UUID']}) + if value['predicateObject'] != None: + relations.update({'affects':value['predicateObject']['com.bbn.tc.schema.avro.cdm18.UUID']}) + if value['predicateObject2'] != None: + relations.update({'affects2':value['predicateObject2']['com.bbn.tc.schema.avro.cdm18.UUID']}) + + elif nodeType == 'Principal': + if value['hostId'] != None: + relations.update({'hasAccountOn':value['hostId']}) + + elif nodeType == 'UnnamedPipeObject': + if value['sourceUUID'] != None: + relations.update({'affects':value['sourceUUID']['com.bbn.tc.schema.avro.cdm18.UUID']}) + if value['sinkUUID'] != None: + relations.update({'affects2':value["sinkUUID"]["com.bbn.tc.schema.avro.cdm18.UUID"]}) + if value['baseObject'] != None: + relations.update({'residesOn':value['baseObject']['hostId']}) + + elif nodeType == 'NetFlowObject': + if value['baseObject'] != None: + relations.update({'residesOn':value['baseObject']['hostId']}) + + elif nodeType == 'SrcSinkObject': + if value['baseObject'] != None: + relations.update({'residesOn':value['baseObject']['hostId']}) + + + + + # lookup existing nodes for relations + for rel in relations.keys(): + val = relations[rel] + if val != '': + + + + + batchdata.add_edge_data(rel, nodeType, {"start_id": source_uuid, "end_id": val}) + + + except: + print('Exception') + print('input: ', input) + print('relations: ', relations) + break + + + + return batchdata.get_Node_data(), batchdata.get_edge_data() + +def create_nodes(tx, node_type, properties): + if properties: # check if list is not empty + query = ( + f"UNWIND $rows AS row " + f"CREATE (n:{node_type}) " + f"SET n += row" + ) + tx.run(query, rows=properties) + +def create_relationships(tx, relationship_type, start_node_type, relationships): + if relationships: # Check if list is not empty + + end_node_type = "" + if relationship_type =="residesOn": + end_node_type = "Host" + elif relationship_type == "runsOn": + end_node_type = "Host" + elif relationship_type =="isGeneratedBy": + end_node_type = "Subject" + elif relationship_type =="hasOwningPrincipal": + end_node_type = "Principal" + elif relationship_type == "hasParent": + end_node_type = "Subject" + elif relationship_type == 'hasAccountOn': + end_node_type = "Host" + elif relationship_type == 'hasLocalPrincipal': + end_node_type = "Principal" +# elif relationship_type in ["affects", 'affects2', "hasParent", 'hasAccountOn', 'hasLocalPrincipal']: # the correct end node Type not known with these relationtypse +# batchdata.add_edge_data(rel, {"start_node_type": nodeType, "start_id": source_uuid, "end_node_type": '', "end_id": val}) + + + + if end_node_type == "": + query = ( + f"UNWIND $edges AS edge " + f"MATCH (source: {start_node_type}""{uuid: edge.start_id}) " + "MATCH (target {uuid: edge.end_id}) " + f"CREATE (source)-[r:{relationship_type}]->(target)" + ) + else: + query = ( + f"UNWIND $edges AS edge " + f"MATCH (source: 
{start_node_type}""{uuid: edge.start_id}) " + f"MATCH (target: {end_node_type}"" {uuid: edge.end_id}) " + f"CREATE (source)-[r:{relationship_type}]->(target)" + ) + tx.run(query, edges=relationships) + + + + + + +def create_cypher_query_from_cdm(json): + ''' + Create Cypher Queries from publisher message + ''' + nodes, edges = parse_json_to_cypher(json) + return nodes, edges + +def on_message(client, userdata, message): + ''' + The callback function for message listener + ''' + data = json.loads(message.payload.decode("utf-8")) + print("Received message from: ",message.topic) + nodes, edges = create_cypher_query_from_cdm(data) + execute_query(nodes, edges) + +def on_connect(client, userdata, flags, return_code): + ''' + Connecting and subscribing to the Mosquitto topic + ''' + if return_code == 0: + print("connected") + client.subscribe("neo4j",qos=1) + else: + print("could not connect, return code:", return_code) + client.failed_connect = True + +def connect_to_db(uri,auth): + ''' + Establish db connection to neo4j + ''' + driver = GraphDatabase.driver(uri, auth=auth) + with driver.session() as session: + print("Cleanup existing data...") + session.run("MATCH (n) detach delete n") + session.run("RETURN 1 as result") + print("Successfully connected to DB...") + + # create indices here .... + if (create_indices): + session.run("CREATE INDEX ON :Subject(uuid);") + session.run("CREATE INDEX ON :Event(uuid);") + session.run("CREATE INDEX ON :Host(uuid);") + session.run("CREATE INDEX ON :FileObject(uuid);") + session.run("CREATE INDEX ON :NetFlowObject(uuid);") + session.run("CREATE INDEX ON :SrcSinkObject(uuid);") + session.run("CREATE INDEX ON :UnnamedPipeObject(uuid);") + session.run("CREATE INDEX ON :Principal(uuid);") + + return driver + + +def execute_query(nodes, edges): + ''' + Execute any Neo4j Query. 
+
+    Expected parameters:
+    nodes = node rows grouped by node type,
+    edges = relation rows grouped by relation type and start node type
+    '''
+    with driver.session() as session:
+
+        print("Before sending: ", datetime.datetime.now().strftime("%H:%M:%S"))
+        for node_type, properties in nodes.items():
+            session.execute_write(create_nodes, node_type, properties)
+
+        with concurrent.futures.ThreadPoolExecutor() as executor: # create thread pool
+            futures = []
+            for relationship_type, relationships_start_type in edges.items(): # resolve the dictionary keyed by relation type
+                for start_type, relationships in relationships_start_type.items(): # resolve the second dictionary keyed by start_node_type -> needed to use the node types while creating the relations
+                    # own query for each start node type and end type configuration (end node type encoded in relationship_type);
+                    # hand the callable and its arguments to submit() so execute_write runs inside the worker thread
+                    # (note: a neo4j session is not thread-safe, so truly concurrent writes would need one session per thread)
+                    future = executor.submit(session.execute_write, create_relationships, relationship_type, start_type, relationships)
+                    futures.append(future)
+
+            for future in concurrent.futures.as_completed(futures): # wait for the threads
+                pass
+            print("After sending: ", datetime.datetime.now().strftime("%H:%M:%S"))
+
+
+driver = connect_to_db(db_uri,neo4j_auth)
+
+# client.username_pw_set(username="user_name", password="password") # uncomment if you use password auth
+client.on_connect = on_connect
+client.on_message = on_message
+client.failed_connect = False
+
+client.connect(broker_hostname, broker_port,keepalive=3600*4)
+client.loop_start()
+
+# this try-finally block ensures that whenever we terminate the program early by hitting ctrl+c, it still exits gracefully
+try:
+    i = 0
+    while i < abort_time_limit: #and client.failed_connect == False:
+        time.sleep(1)
+        i += 1
+        if client.failed_connect == True:
+            print('Connection failed, exiting...')
+
+finally:
+    client.disconnect()
+    client.loop_stop()
+    driver.close()
+
diff --git a/code/infrastructure/streaming/clients/sub/memgraph/Unwind/sub_mem_whole_batch_using_unwind_two_label.py b/code/infrastructure/streaming/clients/sub/memgraph/Unwind/sub_mem_whole_batch_using_unwind_two_label.py
new file mode 100644
index 0000000..b47d7d9
--- /dev/null
+++ b/code/infrastructure/streaming/clients/sub/memgraph/Unwind/sub_mem_whole_batch_using_unwind_two_label.py
@@ -0,0 +1,356 @@
+import datetime
+import paho.mqtt.client as mqtt
+import time
+import json
+from neo4j import GraphDatabase
+import os
+
+# Muster
+# Unwind -> Do for each row in nodes
+# Create all nodes
+#
+# Unwind -> Do for each start_node_type and end_node_type combination
+# look up the nodes of the edge, with end_node_type where known, otherwise without
+# Create edge
+
+broker_hostname=str(os.getenv('mos_host',default="localhost"))
+broker_port = int(os.getenv('mos_port',default=1883))
+client = mqtt.Client(mqtt.CallbackAPIVersion.VERSION1,"memgraph")
+db_uri =str(os.getenv('mem_host',default="bolt://localhost:8687")) #local test port 8687, not the local 7687
+neo4j_auth=("","")
+abort_time_limit = int(os.getenv('abort_time_limit', default=99999))
+create_indices = True #os.getenv('create_indices', 'False').lower() in ['true', '1', 't', 'y', 'yes']
+second_label = 'Node'
+
+def flatten_obj(key, val, target):
+    complexType = type(val)
+    if val is None:
+        return None
+    elif complexType is dict:
+        for k, v in val.items():
+            if "com.bbn.tc.schema.avro.cdm18.UUID" not in k: # node of an edge is not needed as an attribute
+                new_key = f"{key}_{k}" if key else k
flatten_obj(new_key, v, target) + elif complexType is list: + for i in range(len(val)): + v = val[i] + new_key = f"{key}_{i}" if key else str(i) + flatten_obj(new_key, v, target) + elif complexType is object: + for k, v in val.__dict__.items(): + new_key = f"{key}_{k}" if key else k + flatten_obj(new_key, v, target) + else: + if "properties_map_arg_mem_flags" in key: + # String to list and make separate Attribute Values pairs for the Objects of list + values_list = eval(val) + cleaned_values = [value.strip("'") for value in values_list] + + index = 0 + for value in cleaned_values: + index += 1 + target[f"{key}_{index}"] = value + else: + target[key] = val + + + +class batchDataHolder: + def __init__(self): + self.nodes = {} # {"properties": value_flat} all Values of a Node sorted under Nodetypes + self.edges = {} # keys are the relationstype value another dictonary with keys being the start node type value a list with all fitting relations + + + def add_node_data(self, node_type, node_data): + if node_type not in self.nodes: + self.nodes[node_type] = [] + self.nodes[node_type].append(node_data) + + def get_Node_data(self): + return self.nodes + + def add_edge_data(self, edge_type, start_type, edges_data): + if edge_type not in self.edges: + self.edges[edge_type] = {} + if start_type not in self.edges[edge_type]: + self.edges[edge_type][start_type] = [] + self.edges[edge_type][start_type].append(edges_data) + + def get_edge_data(self): + return self.edges + + + + + +nodes_count = 0 + +def parse_json_to_cypher(data_list) : + global nodes_count + + batchdata = batchDataHolder() + + #{"sourceUuid": "1", "targetUuid": "2", "type": "RELATED_TO" + if nodes_count % 100000 == 0: + print("Nodes: ", nodes_count, " Time: ", datetime.datetime.now().strftime("%H:%M:%S")) + + for input_string in data_list: # run throuh subgraphs of the Batch + + nodes_count = nodes_count +1 + + input = json.loads(input_string) + jsonType = list(input['datum'].keys())[0] + # short type string + nodeType = jsonType.rsplit(".", 1)[1] + # data of object + value = input["datum"][jsonType] + + value_flat = {} # TODO value_flat sind die Attribute eines Knotens, jedoch müssen alle Attribute aller neuen Knoten eingefügt werden + flatten_obj("",value,value_flat) + # newIdentifier is the central node of the current Subgraph, each line of the Batch is own new Subgraph + + # makes sure in with are differen identifier used (one numbercount for al kind of nodes) + + batchdata.add_node_data(nodeType, value_flat) + #nodes.append({"nodeType": nodeType, "properties": value_flat}) + source_uuid = value_flat['uuid'] + + + relations = dict(#save the uuid to relationtype + runsOn="" + ,isGeneratedBy="" + ,affects="" + ,affects2="" + ,residesOn="" + ,isPartOf="" # not in data set + ,hasOwningPricipal="" + ,hasTag="" # not in data set + ,hasParent="" + ,hasAccountOn="" + ,hasLocalPrincipal="" + ) + + + + + # create relationships + try: + if nodeType == 'Subject': + if value['parentSubject'] != None: + relations.update({'hasParent':value['parentSubject']["com.bbn.tc.schema.avro.cdm18.UUID"]}) + if value['hostId'] != None: + relations.update({'runsOn':value['hostId']}) + if value['localPrincipal'] != None: + relations.update({'hasLocalPrincipal':value['localPrincipal']}) + # the relationship for subject -[affects]-> event is missing... 
probably implicit through is generated by + + elif nodeType == 'FileObject': + if value['baseObject'] != None: + relations.update({"residesOn":value['baseObject']['hostId']}) + # relations.update({"isPartOf":}) + if value['localPrincipal']: + relations.update({"hasOwningPrincipal":value['localPrincipal']}) + # relations.update({"hasTag":}) + + # create relationships for host id + # mapping of cdm fields to relationships of nodes + elif nodeType == 'Event': + # if value['hostId'] != None: + # relations.update({'runsOn':value['hostId']}) + if value['subject'] != None: + relations.update({'isGeneratedBy':value['subject']['com.bbn.tc.schema.avro.cdm18.UUID']}) + if value['predicateObject'] != None: + relations.update({'affects':value['predicateObject']['com.bbn.tc.schema.avro.cdm18.UUID']}) + if value['predicateObject2'] != None: + relations.update({'affects2':value['predicateObject2']['com.bbn.tc.schema.avro.cdm18.UUID']}) + + elif nodeType == 'Principal': + if value['hostId'] != None: + relations.update({'hasAccountOn':value['hostId']}) + + elif nodeType == 'UnnamedPipeObject': + if value['sourceUUID'] != None: + relations.update({'affects':value['sourceUUID']['com.bbn.tc.schema.avro.cdm18.UUID']}) + if value['sinkUUID'] != None: + relations.update({'affects2':value["sinkUUID"]["com.bbn.tc.schema.avro.cdm18.UUID"]}) + if value['baseObject'] != None: + relations.update({'residesOn':value['baseObject']['hostId']}) + + elif nodeType == 'NetFlowObject': + if value['baseObject'] != None: + relations.update({'residesOn':value['baseObject']['hostId']}) + + elif nodeType == 'SrcSinkObject': + if value['baseObject'] != None: + relations.update({'residesOn':value['baseObject']['hostId']}) + + + + + # lookup existing nodes for relations + for rel in relations.keys(): + val = relations[rel] + if val != '': + + + + + batchdata.add_edge_data(rel, nodeType, {"start_id": source_uuid, "end_id": val}) + + + except: + print('Exception') + print('input: ', input) + print('relations: ', relations) + break + + + + + return batchdata.get_Node_data(), batchdata.get_edge_data() + +def create_nodes(tx, node_type, properties): + if properties: # check if list is not empty + query = ( + f"UNWIND $rows AS row " + f"CREATE (n:{node_type}:{second_label}) " + f"SET n += row" + ) + tx.run(query, rows=properties) + +def create_relationships(tx, relationship_type, start_node_type, relationships): + if relationships: # Check if list is not empty + + end_node_type = "" + if relationship_type =="residesOn": + end_node_type = "Host" + elif relationship_type == "runsOn": + end_node_type = "Host" + elif relationship_type =="isGeneratedBy": + end_node_type = f"Subject:{second_label}" + elif relationship_type =="hasOwningPrincipal": + end_node_type = "Principal" + elif relationship_type == "hasParent": + end_node_type = f"Subject:{second_label}" + elif relationship_type == 'hasAccountOn': + end_node_type = "Host" + elif relationship_type == 'hasLocalPrincipal': + end_node_type = "Principal" + + if end_node_type == "": + query = ( + f"UNWIND $edges AS edge " + f"MATCH (source: {start_node_type}""{uuid: edge.start_id}) " + f"MATCH (target :{second_label}""{uuid: edge.end_id}) " + f"CREATE (source)-[r:{relationship_type}]->(target)" + ) + else: + query = ( + f"UNWIND $edges AS edge " + f"MATCH (source: {start_node_type}""{uuid: edge.start_id}) " + f"MATCH (target: {end_node_type}"" {uuid: edge.end_id}) " + f"CREATE (source)-[r:{relationship_type}]->(target)" + ) + tx.run(query, edges=relationships) + + +def 
create_cypher_query_from_cdm(json): + ''' + Create Cypher Queries from publisher message + ''' + nodes, edges = parse_json_to_cypher(json) + return nodes, edges + +def on_message(client, userdata, message): + ''' + The callback function for message listener + ''' + data = json.loads(message.payload.decode("utf-8")) + print("Received message from: ",message.topic) + nodes, edges = create_cypher_query_from_cdm(data) + execute_query(nodes, edges) + +def on_connect(client, userdata, flags, return_code): + ''' + Connecting and subscribing to the Mosquitto topic + ''' + if return_code == 0: + print("connected") + client.subscribe("neo4j",qos=1) + else: + print("could not connect, return code:", return_code) + client.failed_connect = True + +def connect_to_db(uri,auth): + ''' + Establish db connection to neo4j + ''' + driver = GraphDatabase.driver(uri, auth=auth) + with driver.session() as session: + print("Cleanup existing data...") + session.run("MATCH (n) detach delete n") + session.run("RETURN 1 as result") + print("Successfully connected to DB...") + + # create indices here .... + if (create_indices): + s = f"CREATE INDEX ON :{second_label}(uuid);" + session.run(s) + session.run("CREATE INDEX ON :Host(uuid);") + session.run("CREATE INDEX ON :FileObject(uuid);") + session.run("CREATE INDEX ON :NetFlowObject(uuid);") + session.run("CREATE INDEX ON :SrcSinkObject(uuid);") + session.run("CREATE INDEX ON :UnnamedPipeObject(uuid);") + session.run("CREATE INDEX ON :Principal(uuid);") + #session.run("CREATE INDEX ON :* (uuid);") # global index on all uuid, for the Match where target label unknown + + return driver + + +def execute_query(nodes, edges): + ''' + Execute any Neo4j Query. + + Expected Query Parameter: + query = Query String, + attributes = atttributes to be inserted + ''' + #print(query) + print("Before sending: ", datetime.datetime.now().strftime("%H:%M:%S")) + with driver.session() as session: + + for node_type, properties in nodes.items(): + session.execute_write(create_nodes, node_type, properties) + + for relationship_type, relationships_start_type in edges.items(): #dictonary resolve with the relation type + for start_type , relationships in relationships_start_type.items(): # dictonary 2 resolve with the start_node_type as key -> is needed to use the Node types while creating the relations + #if not needed then running over all Nodes or over an index over all nodes better would be run over index to the one nodetype + # own query for each start node type and end type configuration (end node type encoded in relationship_type) + session.execute_write(create_relationships, relationship_type, start_type, relationships) + print("After sending: ", datetime.datetime.now().strftime("%H:%M:%S")) + + +driver = connect_to_db(db_uri,neo4j_auth) + +# client.username_pw_set(username="user_name", password="password") # uncomment if you use password auth +client.on_connect = on_connect +client.on_message = on_message +client.failed_connect = False + +client.connect(broker_hostname, broker_port,keepalive=3600*4) +client.loop_start() + +# this try-finally block ensures that whenever we terminate the program earlier by hitting ctrl+c, it still gracefully exits +try: + i = 0 + while i < abort_time_limit: #and client.failed_connect == False: + time.sleep(1) + i += 1 + if client.failed_connect == True: + print('Connection failed, exiting...') + +finally: + client.disconnect() + client.loop_stop() + driver.close() + diff --git 
a/code/infrastructure/streaming/clients/sub/memgraph/Unwind/sub_mem_whole_batch_using_unwind_using_procedure_node.py b/code/infrastructure/streaming/clients/sub/memgraph/Unwind/sub_mem_whole_batch_using_unwind_using_procedure_node.py new file mode 100644 index 0000000..dd4e2dd --- /dev/null +++ b/code/infrastructure/streaming/clients/sub/memgraph/Unwind/sub_mem_whole_batch_using_unwind_using_procedure_node.py @@ -0,0 +1,362 @@ +import datetime +import paho.mqtt.client as mqtt +import time +import json +from neo4j import GraphDatabase +import os + +# Muster +# Unwind -> Do For each row in nodes +# Create All Nodes +# +# Unwind -> Do For each Start_node_type and end_node_type combination +# lookup Nodes of the Edge maybe with end_node_type or without +# Create Edge + +broker_hostname=str(os.getenv('mos_host',default="localhost")) +broker_port = int(os.getenv('mos_port',default=1883)) +client = mqtt.Client(mqtt.CallbackAPIVersion.VERSION1,"memgraph") +db_uri =str(os.getenv('mem_host',default="bolt://localhost:8687")) #local test Port 8687 not local 7687 +neo4j_auth=("","") +abort_time_limit = int(os.getenv('abort_time_limit', default=99999)) +create_indices = True #os.getenv('create_indices', 'False').lower() in ['true', '1', 't', 'y', 'yes'] + +def flatten_obj(key, val, target): + complexType = type(val) + if val is None: + return None + elif complexType is dict: + for k, v in val.items(): + if "com.bbn.tc.schema.avro.cdm18.UUID" not in k: # node for Edge not needed as Attribute + new_key = f"{key}_{k}" if key else k + flatten_obj(new_key, v, target) + elif complexType is list: + for i in range(len(val)): + v = val[i] + new_key = f"{key}_{i}" if key else str(i) + flatten_obj(new_key, v, target) + elif complexType is object: + for k, v in val.__dict__.items(): + new_key = f"{key}_{k}" if key else k + flatten_obj(new_key, v, target) + else: + if "properties_map_arg_mem_flags" in key: + # String to list and make separate Attribute Values pairs for the Objects of list + values_list = eval(val) + cleaned_values = [value.strip("'") for value in values_list] + + index = 0 + for value in cleaned_values: + index += 1 + target[f"{key}_{index}"] = value + else: + target[key] = val + + + +class batchDataHolder: + def __init__(self): + self.nodes = [] # {"properties": value_flat} all Values of a Node sorted under Nodetypes + self.edges = {} # keys are the relationstype value another dictonary with keys being the start node type value a list with all fitting relations + + + def add_node_data(self, node_type, node_data): + node = {"data": node_data, "type": node_type} + self.nodes.append(node) + + + def get_Node_data(self): + return self.nodes + + def add_edge_data(self, edge_type, start_type, edges_data): + if edge_type not in self.edges: + self.edges[edge_type] = {} + if start_type not in self.edges[edge_type]: + self.edges[edge_type][start_type] = [] + self.edges[edge_type][start_type].append(edges_data) + + def get_edge_data(self): + return self.edges + + + + + +nodes_count = 0 + +def parse_json_to_cypher(data_list) : + global nodes_count + + batchdata = batchDataHolder() + + #{"sourceUuid": "1", "targetUuid": "2", "type": "RELATED_TO" + if nodes_count % 100000 == 0: + print("Nodes: ", nodes_count, " Time: ", datetime.datetime.now().strftime("%H:%M:%S")) + + for input_string in data_list: # run throuh subgraphs of the Batch + + nodes_count = nodes_count +1 + + input = json.loads(input_string) + jsonType = list(input['datum'].keys())[0] + # short type string + nodeType = jsonType.rsplit(".", 1)[1] + # 
+        value = input["datum"][jsonType]
+
+        value_flat = {} # TODO: value_flat holds the attributes of a single node, but the attributes of all new nodes have to be inserted
+        flatten_obj("", value, value_flat)
+
+        batchdata.add_node_data(nodeType, value_flat)
+        source_uuid = value_flat['uuid']
+
+        relations = dict( # maps relationship type -> target uuid
+            runsOn=""
+            ,isGeneratedBy=""
+            ,affects=""
+            ,affects2=""
+            ,residesOn=""
+            ,isPartOf="" # not in data set
+            ,hasOwningPrincipal=""
+            ,hasTag="" # not in data set
+            ,hasParent=""
+            ,hasAccountOn=""
+            ,hasLocalPrincipal=""
+        )
+
+        # create relationships
+        try:
+            if nodeType == 'Subject':
+                if value['parentSubject'] is not None:
+                    relations.update({'hasParent': value['parentSubject']["com.bbn.tc.schema.avro.cdm18.UUID"]})
+                if value['hostId'] is not None:
+                    relations.update({'runsOn': value['hostId']})
+                if value['localPrincipal'] is not None:
+                    relations.update({'hasLocalPrincipal': value['localPrincipal']})
+                # the subject -[affects]-> event relationship is missing... it is probably implicit via isGeneratedBy
+
+            elif nodeType == 'FileObject':
+                if value['baseObject'] is not None:
+                    relations.update({"residesOn": value['baseObject']['hostId']})
+                    # relations.update({"isPartOf":})
+                if value['localPrincipal']:
+                    relations.update({"hasOwningPrincipal": value['localPrincipal']})
+                    # relations.update({"hasTag":})
+
+            # mapping of CDM fields to relationships of nodes
+            elif nodeType == 'Event':
+                # if value['hostId'] is not None:
+                #     relations.update({'runsOn': value['hostId']})
+                if value['subject'] is not None:
+                    relations.update({'isGeneratedBy': value['subject']['com.bbn.tc.schema.avro.cdm18.UUID']})
+                if value['predicateObject'] is not None:
+                    relations.update({'affects': value['predicateObject']['com.bbn.tc.schema.avro.cdm18.UUID']})
+                if value['predicateObject2'] is not None:
+                    relations.update({'affects2': value['predicateObject2']['com.bbn.tc.schema.avro.cdm18.UUID']})
+
+            elif nodeType == 'Principal':
+                if value['hostId'] is not None:
+                    relations.update({'hasAccountOn': value['hostId']})
+
+            elif nodeType == 'UnnamedPipeObject':
+                if value['sourceUUID'] is not None:
+                    relations.update({'affects': value['sourceUUID']['com.bbn.tc.schema.avro.cdm18.UUID']})
+                if value['sinkUUID'] is not None:
+                    relations.update({'affects2': value["sinkUUID"]["com.bbn.tc.schema.avro.cdm18.UUID"]})
+                if value['baseObject'] is not None:
+                    relations.update({'residesOn': value['baseObject']['hostId']})
+
+            elif nodeType == 'NetFlowObject':
+                if value['baseObject'] is not None:
+                    relations.update({'residesOn': value['baseObject']['hostId']})
+
+            elif nodeType == 'SrcSinkObject':
+                if value['baseObject'] is not None:
+                    relations.update({'residesOn': value['baseObject']['hostId']})
+
+            # collect an edge for every relation that actually has a target
+            for rel in relations.keys():
+                val = relations[rel]
+                if val != '':
+                    batchdata.add_edge_data(rel, nodeType, {"start_id": source_uuid, "end_id": val})
+
+        except Exception as e:
+            print('Exception:', e)
+            print('input: ', input)
+            print('relations: ', relations)
+            break
+
+    return batchdata.get_Node_data(), batchdata.get_edge_data()
+
+def create_nodes(tx, node_type, properties):
+    # unused in this variant; node creation goes through the create.node procedure in execute_query
+    if properties: # check that the list is not empty
+        query = (
+            f"UNWIND $rows AS row "
+            f"CREATE (n:{node_type}) "
+            f"SET n += row"
+        )
+        tx.run(query, rows=properties)
+
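+# For reference, a minimal sketch of what the custom `create.node` query module called in
+# execute_query below could look like. The names and the signature here are illustrative
+# assumptions written against Memgraph's `mgp` Python API, not the module actually shipped
+# with this change, so the block is kept commented out:
+#
+#   import mgp
+#
+#   @mgp.write_proc
+#   def node(ctx: mgp.ProcCtx,
+#            labels: mgp.List[str],
+#            props: mgp.Map) -> mgp.Record(node=mgp.Vertex):
+#       vertex = ctx.graph.create_vertex()   # create the node inside the write transaction
+#       for label in labels:
+#           vertex.add_label(label)
+#       for key, value in props.items():
+#           vertex.properties.set(key, value)
+#       return mgp.Record(node=vertex)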
+def create_relationships(tx, relationship_type, start_node_type, relationships):
+    if relationships: # check that the list is not empty
+
+        # resolve the end node label where the relationship type determines it
+        if relationship_type in ("residesOn", "runsOn", "hasAccountOn"):
+            end_node_type = "Host"
+        elif relationship_type in ("isGeneratedBy", "hasParent"):
+            end_node_type = "Subject"
+        elif relationship_type in ("hasOwningPrincipal", "hasLocalPrincipal"):
+            end_node_type = "Principal"
+        else:
+            # for affects/affects2 the end node type is not known, so the MATCH runs without a target label
+            end_node_type = ""
+
+        if end_node_type == "":
+            query = (
+                f"UNWIND $edges AS edge "
+                f"MATCH (source:{start_node_type} {{uuid: edge.start_id}}) "
+                f"MATCH (target {{uuid: edge.end_id}}) "
+                f"CREATE (source)-[r:{relationship_type}]->(target)"
+            )
+        else:
+            query = (
+                f"UNWIND $edges AS edge "
+                f"MATCH (source:{start_node_type} {{uuid: edge.start_id}}) "
+                f"MATCH (target:{end_node_type} {{uuid: edge.end_id}}) "
+                f"CREATE (source)-[r:{relationship_type}]->(target)"
+            )
+        tx.run(query, edges=relationships)
+
+
+def create_cypher_query_from_cdm(json):
+    '''
+    Create Cypher queries from a publisher message
+    '''
+    nodes, edges = parse_json_to_cypher(json)
+    return nodes, edges
+
+def on_message(client, userdata, message):
+    '''
+    The callback function for the message listener
+    '''
+    data = json.loads(message.payload.decode("utf-8"))
+    print("Received message from: ", message.topic)
+    nodes, edges = create_cypher_query_from_cdm(data)
+    execute_query(nodes, edges)
+
+def on_connect(client, userdata, flags, return_code):
+    '''
+    Connect and subscribe to the Mosquitto topic
+    '''
+    if return_code == 0:
+        print("connected")
+        client.subscribe("neo4j", qos=1)
+    else:
+        print("could not connect, return code:", return_code)
+        client.failed_connect = True
+
+def connect_to_db(uri, auth):
+    '''
+    Establish the db connection to Memgraph (via the neo4j driver)
+    '''
+    driver = GraphDatabase.driver(uri, auth=auth)
+    with driver.session() as session:
+        print("Cleanup existing data...")
+        session.run("MATCH (n) DETACH DELETE n")
+        session.run("RETURN 1 as result")
+        print("Successfully connected to DB...")
+
+        # create indices here ....
+        if create_indices:
+            session.run("CREATE INDEX ON :Subject(uuid);")
+            session.run("CREATE INDEX ON :Event(uuid);")
+            session.run("CREATE INDEX ON :Host(uuid);")
+            session.run("CREATE INDEX ON :FileObject(uuid);")
+            session.run("CREATE INDEX ON :NetFlowObject(uuid);")
+            session.run("CREATE INDEX ON :SrcSinkObject(uuid);")
+            session.run("CREATE INDEX ON :UnnamedPipeObject(uuid);")
+            session.run("CREATE INDEX ON :Principal(uuid);")
+            #session.run("CREATE INDEX ON :* (uuid);") # global index on all uuids, for the MATCH where the target label is unknown
+
+    return driver
+
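+# Small optional helper (not wired into the flow): this variant depends on the custom
+# `create.node` query module being loaded in Memgraph, so a check like this can fail fast.
+# `mg.procedures()` is Memgraph's built-in procedure listing; treat this as a sketch.
+def check_procedure_loaded(driver, name="create.node"):
+    with driver.session() as session:
+        result = session.run("CALL mg.procedures() YIELD name RETURN name")
+        available = {record["name"] for record in result}
+    if name not in available:
+        print(f"Warning: query module procedure '{name}' not found in Memgraph")
+    return name in available
+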
+def execute_query(nodes, edges):
+    '''
+    Execute the batched writes.
+
+    Expected parameters:
+    nodes = list of {"data": ..., "type": ...} entries,
+    edges = relationships grouped by relationship type and start node type
+    '''
+    print("Before sending: ", datetime.datetime.now().strftime("%H:%M:%S"))
+
+    with driver.session() as session:
+        # one procedure call per row; create.node is the custom query module loaded into Memgraph
+        # (note: a trailing RETURN after YIELD may be required depending on the Cypher dialect)
+        call = """UNWIND $nodes AS Node CALL create.node([Node.type], Node.data) YIELD node;"""
+
+        session.run(call, nodes=nodes)
+
+        for relationship_type, relationships_start_type in edges.items(): # outer dictionary is keyed by relationship type
+            for start_type, relationships in relationships_start_type.items(): # inner dictionary is keyed by start node type, which is needed to address the node labels while creating the relations
+                # one query per start node type and end type configuration (the end node type is encoded in relationship_type)
+                session.execute_write(create_relationships, relationship_type, start_type, relationships)
+    print("After sending: ", datetime.datetime.now().strftime("%H:%M:%S"))
+
+driver = connect_to_db(db_uri, neo4j_auth)
+
+# client.username_pw_set(username="user_name", password="password") # uncomment if you use password auth
+client.on_connect = on_connect
+client.on_message = on_message
+client.failed_connect = False
+
+client.connect(broker_hostname, broker_port, keepalive=3600*4)
+client.loop_start()
+
+# this try-finally block ensures that the program still exits gracefully when it is terminated early with ctrl+c
+try:
+    i = 0
+    while i < abort_time_limit: #and client.failed_connect == False:
+        time.sleep(1)
+        i += 1
+        if client.failed_connect:
+            print('Connection failed, exiting...')
+
+finally:
+    client.disconnect()
+    client.loop_stop()
+    driver.close()
+
diff --git a/code/infrastructure/streaming/clients/sub/memgraph/Unwind/sub_mem_whole_batch_using_unwind_using_procedure_nodes.py b/code/infrastructure/streaming/clients/sub/memgraph/Unwind/sub_mem_whole_batch_using_unwind_using_procedure_nodes.py
new file mode 100644
index 0000000..22af87e
--- /dev/null
+++ b/code/infrastructure/streaming/clients/sub/memgraph/Unwind/sub_mem_whole_batch_using_unwind_using_procedure_nodes.py
@@ -0,0 +1,364 @@
+import datetime
+import paho.mqtt.client as mqtt
+import time
+import json
+from neo4j import GraphDatabase
+import os
+
+# Pattern
+# Unwind -> for each row in nodes
+#   create all nodes
+#
+# Unwind -> for each start_node_type and end_node_type combination
+#   look up the nodes of the edge, with or without the end_node_type
+#   create the edge
+
+broker_hostname = str(os.getenv('mos_host', default="localhost"))
+broker_port = int(os.getenv('mos_port', default=1883))
+client = mqtt.Client(mqtt.CallbackAPIVersion.VERSION1, "memgraph")
+db_uri = str(os.getenv('mem_host', default="bolt://localhost:8687")) # local test uses port 8687, not the local 7687
+neo4j_auth = ("", "")
+abort_time_limit = int(os.getenv('abort_time_limit', default=99999))
+create_indices = True #os.getenv('create_indices', 'False').lower() in ['true', '1', 't', 'y', 'yes']
+
+def flatten_obj(key, val, target):
+    complexType = type(val)
+    if val is None:
+        return None
+    elif complexType is dict:
+        for k, v in val.items():
+            if "com.bbn.tc.schema.avro.cdm18.UUID" not in k: # the edge's target node is not needed as an attribute
+                new_key = f"{key}_{k}" if key else k
+                flatten_obj(new_key, v, target)
+    elif 
complexType is list: + for i in range(len(val)): + v = val[i] + new_key = f"{key}_{i}" if key else str(i) + flatten_obj(new_key, v, target) + elif complexType is object: + for k, v in val.__dict__.items(): + new_key = f"{key}_{k}" if key else k + flatten_obj(new_key, v, target) + else: + if "properties_map_arg_mem_flags" in key: + # String to list and make separate Attribute Values pairs for the Objects of list + values_list = eval(val) + cleaned_values = [value.strip("'") for value in values_list] + + index = 0 + for value in cleaned_values: + index += 1 + target[f"{key}_{index}"] = value + else: + target[key] = val + + + +class batchDataHolder: + def __init__(self): + self.nodes = {} # {"properties": value_flat} all Values of a Node sorted under Nodetypes + self.edges = {} # keys are the relationstype value another dictonary with keys being the start node type value a list with all fitting relations + + + def add_node_data(self, node_type, node_data): + if node_type not in self.nodes: + self.nodes[node_type] = [] + self.nodes[node_type].append(node_data) + + def get_Node_data(self): + return self.nodes + + def add_edge_data(self, edge_type, start_type, edges_data): + if edge_type not in self.edges: + self.edges[edge_type] = {} + if start_type not in self.edges[edge_type]: + self.edges[edge_type][start_type] = [] + self.edges[edge_type][start_type].append(edges_data) + + def get_edge_data(self): + return self.edges + + + + + +nodes_count = 0 + +def parse_json_to_cypher(data_list) : + global nodes_count + + batchdata = batchDataHolder() + + #{"sourceUuid": "1", "targetUuid": "2", "type": "RELATED_TO" + if nodes_count % 100000 == 0: + print("Nodes: ", nodes_count, " Time: ", datetime.datetime.now().strftime("%H:%M:%S")) + + for input_string in data_list: # run throuh subgraphs of the Batch + + nodes_count = nodes_count +1 + + input = json.loads(input_string) + jsonType = list(input['datum'].keys())[0] + # short type string + nodeType = jsonType.rsplit(".", 1)[1] + # data of object + value = input["datum"][jsonType] + + value_flat = {} # TODO value_flat sind die Attribute eines Knotens, jedoch müssen alle Attribute aller neuen Knoten eingefügt werden + flatten_obj("",value,value_flat) + # newIdentifier is the central node of the current Subgraph, each line of the Batch is own new Subgraph + + # makes sure in with are differen identifier used (one numbercount for al kind of nodes) + + batchdata.add_node_data(nodeType, value_flat) + #nodes.append({"nodeType": nodeType, "properties": value_flat}) + source_uuid = value_flat['uuid'] + + + relations = dict(#save the uuid to relationtype + runsOn="" + ,isGeneratedBy="" + ,affects="" + ,affects2="" + ,residesOn="" + ,isPartOf="" # not in data set + ,hasOwningPricipal="" + ,hasTag="" # not in data set + ,hasParent="" + ,hasAccountOn="" + ,hasLocalPrincipal="" + ) + + + + + # create relationships + try: + if nodeType == 'Subject': + if value['parentSubject'] != None: + relations.update({'hasParent':value['parentSubject']["com.bbn.tc.schema.avro.cdm18.UUID"]}) + if value['hostId'] != None: + relations.update({'runsOn':value['hostId']}) + if value['localPrincipal'] != None: + relations.update({'hasLocalPrincipal':value['localPrincipal']}) + # the relationship for subject -[affects]-> event is missing... 
probably implicit through is generated by + + elif nodeType == 'FileObject': + if value['baseObject'] != None: + relations.update({"residesOn":value['baseObject']['hostId']}) + # relations.update({"isPartOf":}) + if value['localPrincipal']: + relations.update({"hasOwningPrincipal":value['localPrincipal']}) + # relations.update({"hasTag":}) + + # create relationships for host id + # mapping of cdm fields to relationships of nodes + elif nodeType == 'Event': + # if value['hostId'] != None: + # relations.update({'runsOn':value['hostId']}) + if value['subject'] != None: + relations.update({'isGeneratedBy':value['subject']['com.bbn.tc.schema.avro.cdm18.UUID']}) + if value['predicateObject'] != None: + relations.update({'affects':value['predicateObject']['com.bbn.tc.schema.avro.cdm18.UUID']}) + if value['predicateObject2'] != None: + relations.update({'affects2':value['predicateObject2']['com.bbn.tc.schema.avro.cdm18.UUID']}) + + elif nodeType == 'Principal': + if value['hostId'] != None: + relations.update({'hasAccountOn':value['hostId']}) + + elif nodeType == 'UnnamedPipeObject': + if value['sourceUUID'] != None: + relations.update({'affects':value['sourceUUID']['com.bbn.tc.schema.avro.cdm18.UUID']}) + if value['sinkUUID'] != None: + relations.update({'affects2':value["sinkUUID"]["com.bbn.tc.schema.avro.cdm18.UUID"]}) + if value['baseObject'] != None: + relations.update({'residesOn':value['baseObject']['hostId']}) + + elif nodeType == 'NetFlowObject': + if value['baseObject'] != None: + relations.update({'residesOn':value['baseObject']['hostId']}) + + elif nodeType == 'SrcSinkObject': + if value['baseObject'] != None: + relations.update({'residesOn':value['baseObject']['hostId']}) + + + + + # lookup existing nodes for relations + for rel in relations.keys(): + val = relations[rel] + if val != '': + + + + + batchdata.add_edge_data(rel, nodeType, {"start_id": source_uuid, "end_id": val}) + + + except: + print('Exception') + print('input: ', input) + print('relations: ', relations) + break + + + return batchdata.get_Node_data(), batchdata.get_edge_data() + +def create_nodes(tx, node_type, properties): + if properties: # check if list is not empty + query = ( + f"UNWIND $rows AS row " + f"CREATE (n:{node_type}) " + f"SET n += row" + ) + tx.run(query, rows=properties) + +def create_relationships(tx, relationship_type, start_node_type, relationships): + if relationships: # Check if list is not empty + + end_node_type = "" + if relationship_type =="residesOn": + end_node_type = "Host" + elif relationship_type == "runsOn": + end_node_type = "Host" + elif relationship_type =="isGeneratedBy": + end_node_type = "Subject" + elif relationship_type =="hasOwningPrincipal": + end_node_type = "Principal" + elif relationship_type == "hasParent": + end_node_type = "Subject" + elif relationship_type == 'hasAccountOn': + end_node_type = "Host" + elif relationship_type == 'hasLocalPrincipal': + end_node_type = "Principal" +# elif relationship_type in ["affects", 'affects2', "hasParent", 'hasAccountOn', 'hasLocalPrincipal']: # the correct end node Type not known with these relationtypse +# batchdata.add_edge_data(rel, {"start_node_type": nodeType, "start_id": source_uuid, "end_node_type": '', "end_id": val}) + + + + if end_node_type == "": + query = ( + f"UNWIND $edges AS edge " + f"MATCH (source: {start_node_type}""{uuid: edge.start_id}) " + "MATCH (target {uuid: edge.end_id}) " + f"CREATE (source)-[r:{relationship_type}]->(target)" + ) + else: + query = ( + f"UNWIND $edges AS edge " + f"MATCH (source: 
{start_node_type}""{uuid: edge.start_id}) " + f"MATCH (target: {end_node_type}"" {uuid: edge.end_id}) " + f"CREATE (source)-[r:{relationship_type}]->(target)" + ) + tx.run(query, edges=relationships) + + + + +def create_cypher_query_from_cdm(json): + ''' + Create Cypher Queries from publisher message + ''' + nodes, edges = parse_json_to_cypher(json) + return nodes, edges + +def on_message(client, userdata, message): + ''' + The callback function for message listener + ''' + data = json.loads(message.payload.decode("utf-8")) + print("Received message from: ",message.topic) + nodes, edges = create_cypher_query_from_cdm(data) + execute_query(nodes, edges) + +def on_connect(client, userdata, flags, return_code): + ''' + Connecting and subscribing to the Mosquitto topic + ''' + if return_code == 0: + print("connected") + client.subscribe("neo4j",qos=1) + else: + print("could not connect, return code:", return_code) + client.failed_connect = True + +def connect_to_db(uri,auth): + ''' + Establish db connection to neo4j + ''' + driver = GraphDatabase.driver(uri, auth=auth) + with driver.session() as session: + print("Cleanup existing data...") + session.run("MATCH (n) detach delete n") + session.run("RETURN 1 as result") + print("Successfully connected to DB...") + + # create indices here .... + if (create_indices): + session.run("CREATE INDEX ON :Subject(uuid);") + session.run("CREATE INDEX ON :Event(uuid);") + session.run("CREATE INDEX ON :Host(uuid);") + session.run("CREATE INDEX ON :FileObject(uuid);") + session.run("CREATE INDEX ON :NetFlowObject(uuid);") + session.run("CREATE INDEX ON :SrcSinkObject(uuid);") + session.run("CREATE INDEX ON :UnnamedPipeObject(uuid);") + session.run("CREATE INDEX ON :Principal(uuid);") + #session.run("CREATE INDEX ON :* (uuid);") # global index on all uuid, for the Match where target label unknown + + return driver + + +def execute_query(nodes, edges): + ''' + Execute any Neo4j Query. 
+ + Expected Query Parameter: + query = Query String, + attributes = atttributes to be inserted + ''' + #print(query) + print("Before sending: ", datetime.datetime.now().strftime("%H:%M:%S")) + with driver.session() as session: + + for node_type, properties in nodes.items(): + properties_json = json.dumps(properties) + call = f"""CALL create.nodes($node_types, $properties_json) YIELD node""" + session.run(call, node_types=[node_type], properties_json=properties) + #session.execute_write(create_nodes, node_type, properties) + + for relationship_type, relationships_start_type in edges.items(): #dictonary resolve with the relation type + for start_type , relationships in relationships_start_type.items(): # dictonary 2 resolve with the start_node_type as key -> is needed to use the Node types while creating the relations + #if not needed then running over all Nodes or over an index over all nodes better would be run over index to the one nodetype + # own query for each start node type and end type configuration (end node type encoded in relationship_type) + session.execute_write(create_relationships, relationship_type, start_type, relationships) + print("After sending: ", datetime.datetime.now().strftime("%H:%M:%S")) + #result = session.run(queryNode, nodes=nodes) # maybe result = session.run(query, **value) + #result2 = session.run(queryEdge, edges=edges) + #return result2.data()12 24 + +driver = connect_to_db(db_uri,neo4j_auth) + +# client.username_pw_set(username="user_name", password="password") # uncomment if you use password auth +client.on_connect = on_connect +client.on_message = on_message +client.failed_connect = False + +client.connect(broker_hostname, broker_port,keepalive=3600*4) +client.loop_start() + +# this try-finally block ensures that whenever we terminate the program earlier by hitting ctrl+c, it still gracefully exits +try: + i = 0 + while i < abort_time_limit: #and client.failed_connect == False: + time.sleep(1) + i += 1 + if client.failed_connect == True: + print('Connection failed, exiting...') + +finally: + client.disconnect() + client.loop_stop() + driver.close() + diff --git a/code/infrastructure/streaming/clients/sub/memgraph/kind_oriented/sub_mem_whole_batch_kind_oriented.py b/code/infrastructure/streaming/clients/sub/memgraph/kind_oriented/sub_mem_whole_batch_kind_oriented.py new file mode 100644 index 0000000..e639b98 --- /dev/null +++ b/code/infrastructure/streaming/clients/sub/memgraph/kind_oriented/sub_mem_whole_batch_kind_oriented.py @@ -0,0 +1,392 @@ +import datetime +import paho.mqtt.client as mqtt +import time +import json +from neo4j import GraphDatabase +import os + +# Muster +# CREATE ALL nodes (Using Param) +# MATCH all lookups (not known yet) +# CREATE all EDGES + +broker_hostname=str(os.getenv('mos_host',default="localhost")) +broker_port = int(os.getenv('mos_port',default=1883)) +client = mqtt.Client(mqtt.CallbackAPIVersion.VERSION1,"memgraph") +db_uri =str(os.getenv('mem_host',default="bolt://localhost:8687")) #local test Port 8687 not local 7687 +neo4j_auth=("","") +abort_time_limit = int(os.getenv('abort_time_limit', default=99999)) +create_indices = True #os.getenv('create_indices', 'False').lower() in ['true', '1', 't', 'y', 'yes'] +on_disk = False +analytic = True + +def flatten_obj(key, val, target): + complexType = type(val) + if val is None: + return None + elif complexType is dict: + for k, v in val.items(): + if "com.bbn.tc.schema.avro.cdm18.UUID" not in k: # node for Edge not needed as Attribute + new_key = f"{key}_{k}" if key else k + 
flatten_obj(new_key, v, target) + elif complexType is list: + for i in range(len(val)): + v = val[i] + new_key = f"{key}_{i}" if key else str(i) + flatten_obj(new_key, v, target) + elif complexType is object: + for k, v in val.__dict__.items(): + new_key = f"{key}_{k}" if key else k + flatten_obj(new_key, v, target) + else: + if "properties_map_arg_mem_flags" in key: + # String to list and make separate Attribute Values pairs for the Objects of list + values_list = eval(val) + cleaned_values = [value.strip("'") for value in values_list] + + index = 0 + for value in cleaned_values: + index += 1 + target[f"{key}_{index}"] = value + else: + target[key] = val + + + + + +# Data for the whole Batch, so it doesnt get copied when used in another function +# Holds a List with all known identifier, a directory to look up if the Node is already known +# Holds the Batch query +# Holds a number to create diffrent idetifier + +# Holds a list with the Nodes to match and a list with relations to add +# manages the build of these lists +class batchDataHolder: + def __init__(self): + self.knownNodes = dict() + self.create_Nodes = [] + self.identifierNumber = 0 #number to create unique identifier in the query + self.lookup_nodes = [] + self.insert_relations = [] + + def add_entry_Node(self, key, value): + self.knownNodes[key] = value + + def get_lookup_nodes(self): + return ', '.join(self.lookup_nodes) + + def get_insert_relations(self): + return ', '.join(self.insert_relations) + + def is_empty_lookup_nodes(self): + if self.lookup_nodes: + return False + else: + return True + + def is_empty_insert_relations(self): + if self.insert_relations: + return False + else: + return True + + + def get_knownNodes(self): + return self.knownNodes + + def get_Node_value(self, key): + return self.knownNodes[key] + + def check_key(self, key): + if key in self.knownNodes: + return True + else: + return False + + def append_with(self, ident): + self.with_list_identifier.append(ident) + + def get_Identifier_Increment_add_Node(self, type, key):#create a new identifier + identifier = type + str(self.identifierNumber) + self.identifierNumber += 1 + self.add_entry_Node(key, identifier) + return identifier + + def append_create_Nodes(self, add): + self.create_Nodes.append(add) + + def get_create_Nodes(self): + return ', '.join(self.create_Nodes) + + + def nodes_relations_to_cypher( self, val, newIdentifier, rel, type_letter, type): #type either ':Host' 'Subject' .... 
or '' + try: + if self.check_key(val): # Node is already known use the same identifier + ident = self.get_Node_value(val) + self.insert_relations.append(f'''({newIdentifier}) -[:{rel}]-> ({ident})''') + else : + ident = self.get_Identifier_Increment_add_Node(type_letter, val) # get identifier and add to knownNode + s = f'({ident}{type} {{uuid: "{val}"}})' + self.lookup_nodes.append(s) + self.insert_relations.append(f'''({newIdentifier}) -[:{rel}]-> ({ident})''') + except: + print(type_letter) + + +nodes_count = 0 + +def parse_json_to_cypher(data_list) : + global nodes_count + + if nodes_count % 10000 == 0: + print("Nodes: ", nodes_count, " Time: ", datetime.datetime.now().strftime("%H:%M:%S")) + + #nodes_count = nodes_count + + #knownNodes = dict() #to check if Node is known and no Lookup is needed + batchData = batchDataHolder() + all_values = {} + + + for input_string in data_list: # run throuh subgraphs of the Batch + + nodes_count = nodes_count +1 + + input = json.loads(input_string) + jsonType = list(input['datum'].keys())[0] + # short type string + nodeType = jsonType.rsplit(".", 1)[1] + # data of object + value = input["datum"][jsonType] + + value_flat = {} # TODO value_flat sind die Attribute eines Knotens, jedoch müssen alle Attribute aller neuen Knoten eingefügt werden + flatten_obj("",value,value_flat) + # newIdentifier is the central node of the current Subgraph, each line of the Batch is own new Subgraph + newIdentifier = batchData.get_Identifier_Increment_add_Node('_', value_flat['uuid']) + # makes sure in with are differen identifier used (one numbercount for al kind of nodes) + all_values[newIdentifier] = value_flat + + relations = dict( + runsOn="" + ,isGeneratedBy="" + ,affects="" + ,affects2="" + ,residesOn="" + ,isPartOf="" # not in data set + ,hasOwningPricipal="" + ,hasTag="" # not in data set + ,hasParent="" + ,hasAccountOn="" + ,hasLocalPrincipal="" + ) + + + + + # create relationships + try: + if nodeType == 'Subject': + if value['parentSubject'] != None: + relations.update({'hasParent':value['parentSubject']["com.bbn.tc.schema.avro.cdm18.UUID"]}) + if value['hostId'] != None: + relations.update({'runsOn':value['hostId']}) + if value['localPrincipal'] != None: + relations.update({'hasLocalPrincipal':value['localPrincipal']}) + # the relationship for subject -[affects]-> event is missing... 
probably implicit through is generated by + + elif nodeType == 'FileObject': + if value['baseObject'] != None: + relations.update({"residesOn":value['baseObject']['hostId']}) + # relations.update({"isPartOf":}) + if value['localPrincipal']: + relations.update({"hasOwningPrincipal":value['localPrincipal']}) + # relations.update({"hasTag":}) + + # create relationships for host id + # mapping of cdm fields to relationships of nodes + elif nodeType == 'Event': + # if value['hostId'] != None: + # relations.update({'runsOn':value['hostId']}) + if value['subject'] != None: + relations.update({'isGeneratedBy':value['subject']['com.bbn.tc.schema.avro.cdm18.UUID']}) + if value['predicateObject'] != None: + relations.update({'affects':value['predicateObject']['com.bbn.tc.schema.avro.cdm18.UUID']}) + if value['predicateObject2'] != None: + relations.update({'affects2':value['predicateObject2']['com.bbn.tc.schema.avro.cdm18.UUID']}) + + elif nodeType == 'Principal': + if value['hostId'] != None: + relations.update({'hasAccountOn':value['hostId']}) + + elif nodeType == 'UnnamedPipeObject': + if value['sourceUUID'] != None: + relations.update({'affects':value['sourceUUID']['com.bbn.tc.schema.avro.cdm18.UUID']}) + if value['sinkUUID'] != None: + relations.update({'affects2':value["sinkUUID"]["com.bbn.tc.schema.avro.cdm18.UUID"]}) + if value['baseObject'] != None: + relations.update({'residesOn':value['baseObject']['hostId']}) + + elif nodeType == 'NetFlowObject': + if value['baseObject'] != None: + relations.update({'residesOn':value['baseObject']['hostId']}) + + elif nodeType == 'SrcSinkObject': + if value['baseObject'] != None: + relations.update({'residesOn':value['baseObject']['hostId']}) + + + + + # lookup existing nodes for relations + for rel in relations.keys(): + val = relations[rel] + if val != '': + + + + if rel =="residesOn": + batchData.nodes_relations_to_cypher(val, newIdentifier, rel, 'he', ':Host') + + elif rel == "runsOn": + batchData.nodes_relations_to_cypher(val, newIdentifier, rel, 'hu', ':Host') + + elif rel =="isGeneratedBy": + batchData.nodes_relations_to_cypher(val, newIdentifier, rel, 's', ':Subject') + + elif rel =="hasOwningPrincipal": + batchData.nodes_relations_to_cypher(val, newIdentifier, rel, 'p', ':Principal') + + elif rel =="affects": + batchData.nodes_relations_to_cypher(val, newIdentifier, rel, 'a', '') + + elif rel == 'affects2': + batchData.nodes_relations_to_cypher(val, newIdentifier, rel, 'a', '') + # ... 
other relations for Object not in data + elif rel =="hasParent": + batchData.nodes_relations_to_cypher(val, newIdentifier, rel, 'not', '') + + elif rel =='hasAccountOn': + batchData.nodes_relations_to_cypher(val, newIdentifier, rel, 'not', '') + + elif rel == 'hasLocalPrincipal': + batchData.nodes_relations_to_cypher(val, newIdentifier, rel, 'not', '') + + batchData.append_create_Nodes(f"""({newIdentifier}:{nodeType} ${newIdentifier})""") + except: + print('Exception') + print('input: ', input) + print('relations: ', relations) + break + + + if batchData.is_empty_lookup_nodes(): + query = f""" CREATE {batchData.get_create_Nodes()} CREATE {batchData.get_insert_relations()}""" + else: + query = f""" MATCH {batchData.get_lookup_nodes()} CREATE {batchData.get_create_Nodes()} CREATE {batchData.get_insert_relations()}""" + # print() + # print('query: ', query) + # print() + # print('attributes: ', all_values) + # print(nodes_count) + return query, all_values + + +def create_cypher_query_from_cdm(json): + ''' + Create Cypher Queries from publisher message + ''' + query, value = parse_json_to_cypher(json) + return query, value + +def on_message(client, userdata, message): + ''' + The callback function for message listener + ''' + data = json.loads(message.payload.decode("utf-8")) + print("Received message from: ",message.topic) + q, value = create_cypher_query_from_cdm(data) + execute_query(q, value) + +def on_connect(client, userdata, flags, return_code): + ''' + Connecting and subscribing to the Mosquitto topic + ''' + if return_code == 0: + print("connected") + client.subscribe("neo4j",qos=1) + else: + print("could not connect, return code:", return_code) + client.failed_connect = True + +def connect_to_db(uri,auth): + ''' + Establish db connection to neo4j + ''' + driver = GraphDatabase.driver(uri, auth=auth) + with driver.session() as session: + print("Cleanup existing data...") + session.run("MATCH (n) detach delete n") + session.run("RETURN 1 as result") + print("Successfully connected to DB...") + + if(on_disk): + session.run("STORAGE MODE ON_DISK_TRANSACTIONAL") + if (analytic): + session.run("STORAGE MODE IN_MEMORY_ANALYTICAL;") + # create indices here .... + if (create_indices): + session.run("CREATE INDEX ON :Subject(uuid);") + session.run("CREATE INDEX ON :Event(uuid);") + session.run("CREATE INDEX ON :Host(uuid);") + session.run("CREATE INDEX ON :FileObject(uuid);") + session.run("CREATE INDEX ON :NetFlowObject(uuid);") + session.run("CREATE INDEX ON :SrcSinkObject(uuid);") + session.run("CREATE INDEX ON :UnnamedPipeObject(uuid);") + session.run("CREATE INDEX ON :Principal(uuid);") + + return driver + + +def execute_query(query:str, value): + ''' + Execute any Neo4j Query. 
+ + Expected Query Parameter: + query = Query String, + attributes = atttributes to be inserted + ''' + #print(query) + print("Before sending: ", datetime.datetime.now().strftime("%H:%M:%S")) + with driver.session() as session: + result = session.run(query, value) + summary = result.consume()# makes sure that the query was run by the Databases + print("After sending: ", datetime.datetime.now().strftime("%H:%M:%S")) + # with driver.session() as session: + # result = session.run(query, value) # maybe result = session.run(query, **value) + return summary + +driver = connect_to_db(db_uri,neo4j_auth) + +# client.username_pw_set(username="user_name", password="password") # uncomment if you use password auth +client.on_connect = on_connect +client.on_message = on_message +client.failed_connect = False + +client.connect(broker_hostname, broker_port,keepalive=3600*4) +client.loop_start() + +# this try-finally block ensures that whenever we terminate the program earlier by hitting ctrl+c, it still gracefully exits +try: + i = 0 + while i < abort_time_limit: #and client.failed_connect == False: + time.sleep(1) + i += 1 + if client.failed_connect == True: + print('Connection failed, exiting...') + +finally: + client.disconnect() + client.loop_stop() + driver.close() + diff --git a/code/infrastructure/streaming/clients/sub/memgraph/kind_oriented/sub_mem_whole_batch_kind_oriented_no_Label.py b/code/infrastructure/streaming/clients/sub/memgraph/kind_oriented/sub_mem_whole_batch_kind_oriented_no_Label.py new file mode 100644 index 0000000..53eca40 --- /dev/null +++ b/code/infrastructure/streaming/clients/sub/memgraph/kind_oriented/sub_mem_whole_batch_kind_oriented_no_Label.py @@ -0,0 +1,387 @@ +import datetime +import paho.mqtt.client as mqtt +import time +import json +from neo4j import GraphDatabase +import os + +# Muster +# CREATE ALL nodes (Using Param) +# MATCH all lookups (not known yet) +# CREATE all EDGES + +broker_hostname=str(os.getenv('mos_host',default="localhost")) +broker_port = int(os.getenv('mos_port',default=1883)) +client = mqtt.Client(mqtt.CallbackAPIVersion.VERSION1,"memgraph") +db_uri =str(os.getenv('mem_host',default="bolt://localhost:8687")) #local test Port 8687 not local 7687 +neo4j_auth=("","") +abort_time_limit = int(os.getenv('abort_time_limit', default=99999)) +create_indices = True #os.getenv('create_indices', 'False').lower() in ['true', '1', 't', 'y', 'yes'] +on_disk = False +create_noteType = True # second index only if create_indices true + +def flatten_obj(key, val, target): + complexType = type(val) + if val is None: + return None + elif complexType is dict: + for k, v in val.items(): + if "com.bbn.tc.schema.avro.cdm18.UUID" not in k: # node for Edge not needed as Attribute + new_key = f"{key}_{k}" if key else k + flatten_obj(new_key, v, target) + elif complexType is list: + for i in range(len(val)): + v = val[i] + new_key = f"{key}_{i}" if key else str(i) + flatten_obj(new_key, v, target) + elif complexType is object: + for k, v in val.__dict__.items(): + new_key = f"{key}_{k}" if key else k + flatten_obj(new_key, v, target) + else: + if "properties_map_arg_mem_flags" in key: + # String to list and make separate Attribute Values pairs for the Objects of list + values_list = eval(val) + cleaned_values = [value.strip("'") for value in values_list] + + index = 0 + for value in cleaned_values: + index += 1 + target[f"{key}_{index}"] = value + else: + target[key] = val + + + + + +# Data for the whole Batch, so it doesnt get copied when used in another function +# Holds a 
List with all known identifier, a directory to look up if the Node is already known +# Holds the Batch query +# Holds a number to create diffrent idetifier + +# Holds a list with the Nodes to match and a list with relations to add +# manages the build of these lists +class batchDataHolder: + def __init__(self): + self.knownNodes = dict() + self.create_Nodes = [] + self.identifierNumber = 0 #number to create unique identifier in the query + self.lookup_nodes = [] + self.insert_relations = [] + + def add_entry_Node(self, key, value): + self.knownNodes[key] = value + + def get_lookup_nodes(self): + return ', '.join(self.lookup_nodes) + + def get_insert_relations(self): + return ', '.join(self.insert_relations) + + def is_empty_lookup_nodes(self): + if self.lookup_nodes: + return False + else: + return True + + def is_empty_insert_relations(self): + if self.insert_relations: + return False + else: + return True + + + def get_knownNodes(self): + return self.knownNodes + + def get_Node_value(self, key): + return self.knownNodes[key] + + def check_key(self, key): + if key in self.knownNodes: + return True + else: + return False + + def append_with(self, ident): + self.with_list_identifier.append(ident) + + def get_Identifier_Increment_add_Node(self, type, key):#create a new identifier + identifier = type + str(self.identifierNumber) + self.identifierNumber += 1 + self.add_entry_Node(key, identifier) + return identifier + + def append_create_Nodes(self, add): + self.create_Nodes.append(add) + + def get_create_Nodes(self): + return ', '.join(self.create_Nodes) + + + def nodes_relations_to_cypher( self, val, newIdentifier, rel, type_letter, type): #type either ':Host' 'Subject' .... or '' + try: + if self.check_key(val): # Node is already known use the same identifier + ident = self.get_Node_value(val) + self.insert_relations.append(f'''({newIdentifier}) -[:{rel}]-> ({ident})''') + else : + ident = self.get_Identifier_Increment_add_Node(type_letter, val) # get identifier and add to knownNode + s = f'({ident}:Node {{uuid: "{val}"}})' + self.lookup_nodes.append(s) + self.insert_relations.append(f'''({newIdentifier}) -[:{rel}]-> ({ident})''') + except: + print(type_letter) + + +nodes_count = 0 + +def parse_json_to_cypher(data_list) : + global nodes_count + + if nodes_count % 10000 == 0: + print("Nodes: ", nodes_count, " Time: ", datetime.datetime.now().strftime("%H:%M:%S")) + + #nodes_count = nodes_count + + #knownNodes = dict() #to check if Node is known and no Lookup is needed + batchData = batchDataHolder() + all_values = {} + + + for input_string in data_list: # run throuh subgraphs of the Batch + + nodes_count = nodes_count +1 + + input = json.loads(input_string) + jsonType = list(input['datum'].keys())[0] + # short type string + nodeType = jsonType.rsplit(".", 1)[1] + # data of object + value = input["datum"][jsonType] + + value_flat = {} # TODO value_flat sind die Attribute eines Knotens, jedoch müssen alle Attribute aller neuen Knoten eingefügt werden + flatten_obj("",value,value_flat) + # newIdentifier is the central node of the current Subgraph, each line of the Batch is own new Subgraph + newIdentifier = batchData.get_Identifier_Increment_add_Node('_', value_flat['uuid']) + # makes sure in with are differen identifier used (one numbercount for al kind of nodes) + value_flat['nodeType'] = nodeType #adding the NodeType as an value to add as Param + all_values[newIdentifier] = value_flat + + relations = dict( + runsOn="" + ,isGeneratedBy="" + ,affects="" + ,affects2="" + ,residesOn="" + 
,isPartOf="" # not in data set + ,hasOwningPricipal="" + ,hasTag="" # not in data set + ,hasParent="" + ,hasAccountOn="" + ,hasLocalPrincipal="" + ) + + + + + # create relationships + try: + if nodeType == 'Subject': + if value['parentSubject'] != None: + relations.update({'hasParent':value['parentSubject']["com.bbn.tc.schema.avro.cdm18.UUID"]}) + if value['hostId'] != None: + relations.update({'runsOn':value['hostId']}) + if value['localPrincipal'] != None: + relations.update({'hasLocalPrincipal':value['localPrincipal']}) + # the relationship for subject -[affects]-> event is missing... probably implicit through is generated by + + elif nodeType == 'FileObject': + if value['baseObject'] != None: + relations.update({"residesOn":value['baseObject']['hostId']}) + # relations.update({"isPartOf":}) + if value['localPrincipal']: + relations.update({"hasOwningPrincipal":value['localPrincipal']}) + # relations.update({"hasTag":}) + + # create relationships for host id + # mapping of cdm fields to relationships of nodes + elif nodeType == 'Event': + # if value['hostId'] != None: + # relations.update({'runsOn':value['hostId']}) + if value['subject'] != None: + relations.update({'isGeneratedBy':value['subject']['com.bbn.tc.schema.avro.cdm18.UUID']}) + if value['predicateObject'] != None: + relations.update({'affects':value['predicateObject']['com.bbn.tc.schema.avro.cdm18.UUID']}) + if value['predicateObject2'] != None: + relations.update({'affects2':value['predicateObject2']['com.bbn.tc.schema.avro.cdm18.UUID']}) + + elif nodeType == 'Principal': + if value['hostId'] != None: + relations.update({'hasAccountOn':value['hostId']}) + + elif nodeType == 'UnnamedPipeObject': + if value['sourceUUID'] != None: + relations.update({'affects':value['sourceUUID']['com.bbn.tc.schema.avro.cdm18.UUID']}) + if value['sinkUUID'] != None: + relations.update({'affects2':value["sinkUUID"]["com.bbn.tc.schema.avro.cdm18.UUID"]}) + if value['baseObject'] != None: + relations.update({'residesOn':value['baseObject']['hostId']}) + + elif nodeType == 'NetFlowObject': + if value['baseObject'] != None: + relations.update({'residesOn':value['baseObject']['hostId']}) + + elif nodeType == 'SrcSinkObject': + if value['baseObject'] != None: + relations.update({'residesOn':value['baseObject']['hostId']}) + + + + + # lookup existing nodes for relations + for rel in relations.keys(): + val = relations[rel] + if val != '': + + + + if rel =="residesOn": + batchData.nodes_relations_to_cypher(val, newIdentifier, rel, 'he', ':Host') + + elif rel == "runsOn": + batchData.nodes_relations_to_cypher(val, newIdentifier, rel, 'hu', ':Host') + + elif rel =="isGeneratedBy": + batchData.nodes_relations_to_cypher(val, newIdentifier, rel, 's', ':Subject') + + elif rel =="hasOwningPrincipal": + batchData.nodes_relations_to_cypher(val, newIdentifier, rel, 'p', ':Principal') + + elif rel =="affects": + batchData.nodes_relations_to_cypher(val, newIdentifier, rel, 'a', '') + + elif rel == 'affects2': + batchData.nodes_relations_to_cypher(val, newIdentifier, rel, 'a', '') + # ... 
other relations for Object not in data + elif rel =="hasParent": + batchData.nodes_relations_to_cypher(val, newIdentifier, rel, 'not', '') + + elif rel =='hasAccountOn': + batchData.nodes_relations_to_cypher(val, newIdentifier, rel, 'not', '') + + elif rel == 'hasLocalPrincipal': + batchData.nodes_relations_to_cypher(val, newIdentifier, rel, 'not', '') + + batchData.append_create_Nodes(f"""({newIdentifier}:Node ${newIdentifier})""") + except: + print('Exception') + print('input: ', input) + print('relations: ', relations) + break + + + if batchData.is_empty_lookup_nodes(): + query = f""" CREATE {batchData.get_create_Nodes()} CREATE {batchData.get_insert_relations()}""" + else: + query = f""" MATCH {batchData.get_lookup_nodes()} CREATE {batchData.get_create_Nodes()} CREATE {batchData.get_insert_relations()}""" + # print() + # print('query: ', query) + # print() + # print('attributes: ', all_values) + # print(nodes_count) + return query, all_values + + +def create_cypher_query_from_cdm(json): + ''' + Create Cypher Queries from publisher message + ''' + query, value = parse_json_to_cypher(json) + return query, value + +def on_message(client, userdata, message): + ''' + The callback function for message listener + ''' + data = json.loads(message.payload.decode("utf-8")) + print("Received message from: ",message.topic) + q, value = create_cypher_query_from_cdm(data) + execute_query(q, value) + +def on_connect(client, userdata, flags, return_code): + ''' + Connecting and subscribing to the Mosquitto topic + ''' + if return_code == 0: + print("connected") + client.subscribe("neo4j",qos=1) + else: + print("could not connect, return code:", return_code) + client.failed_connect = True + +def connect_to_db(uri,auth): + ''' + Establish db connection to neo4j + ''' + driver = GraphDatabase.driver(uri, auth=auth) + with driver.session() as session: + print("Cleanup existing data...") + session.run("MATCH (n) detach delete n") + session.run("RETURN 1 as result") + print("Successfully connected to DB...") + + if(on_disk): + session.run("STORAGE MODE ON_DISK_TRANSACTIONAL") + # create indices here .... + if (create_indices): + session.run("CREATE INDEX ON :Node(uuid);") + if(create_noteType): + session.run("CREATE INDEX ON :Node(nodeType);") + + + return driver + + +def execute_query(query:str, value): + ''' + Execute any Neo4j Query. 
+
+    Expected Query Parameter:
+    query = the query string,
+    attributes = the attributes to be inserted
+    '''
+    print("Before sending: ", datetime.datetime.now().strftime("%H:%M:%S"))
+    with driver.session() as session:
+        result = session.run(query, value)
+        summary = result.consume() # consume() makes sure the query was actually run by the database
+    print("After sending: ", datetime.datetime.now().strftime("%H:%M:%S"))
+    return summary
+
+driver = connect_to_db(db_uri, neo4j_auth)
+
+# client.username_pw_set(username="user_name", password="password") # uncomment if you use password auth
+client.on_connect = on_connect
+client.on_message = on_message
+client.failed_connect = False
+
+client.connect(broker_hostname, broker_port, keepalive=3600*4)
+client.loop_start()
+
+# this try-finally block ensures that the program still exits gracefully when it is terminated early with ctrl+c
+try:
+    i = 0
+    while i < abort_time_limit: #and client.failed_connect == False:
+        time.sleep(1)
+        i += 1
+        if client.failed_connect:
+            print('Connection failed, exiting...')
+
+finally:
+    client.disconnect()
+    client.loop_stop()
+    driver.close()
+
diff --git a/code/infrastructure/streaming/clients/sub/memgraph/kind_oriented/sub_mem_whole_batch_kind_oriented_no_Params.py b/code/infrastructure/streaming/clients/sub/memgraph/kind_oriented/sub_mem_whole_batch_kind_oriented_no_Params.py
new file mode 100644
index 0000000..f54e770
--- /dev/null
+++ b/code/infrastructure/streaming/clients/sub/memgraph/kind_oriented/sub_mem_whole_batch_kind_oriented_no_Params.py
@@ -0,0 +1,409 @@
+import datetime
+import paho.mqtt.client as mqtt
+import time
+import json
+from neo4j import GraphDatabase
+import os
+
+# Pattern
+# CREATE ALL nodes (attributes inlined in the query string instead of parameters)
+# MATCH all lookups (not known yet)
+# CREATE all EDGES
+
+# Error at rows 574,750 - 575,000: the data produces \' escape characters that make the
+# query invalid; as a quick-and-dirty workaround every \' is replaced by '.
+# Check later whether the data stays correct.
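+
+# Hedged sketch of the underlying problem: this variant inlines attribute values directly
+# into the Cypher string instead of passing parameters, so quotes and backslashes in the
+# data must be escaped by hand. A helper along these lines (illustrative only, not used
+# below) would be more robust than the global replace in execute_query:
+def escape_cypher_string(raw: str) -> str:
+    # escape backslashes first, then the quote character used to delimit the literal
+    return raw.replace("\\", "\\\\").replace("'", "\\'")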
+ +broker_hostname=str(os.getenv('mos_host',default="localhost")) +broker_port = int(os.getenv('mos_port',default=1883)) +client = mqtt.Client(mqtt.CallbackAPIVersion.VERSION1,"memgraph") +db_uri =str(os.getenv('mem_host',default="bolt://localhost:8687")) #local test Port 8687 not local 7687 +neo4j_auth=("","") +abort_time_limit = int(os.getenv('abort_time_limit', default=99999)) +create_indices = True #os.getenv('create_indices', 'False').lower() in ['true', '1', 't', 'y', 'yes'] +on_disk = False + +def flatten_obj(key, val, target): + complexType = type(val) + if val is None: + return None + elif complexType is dict: + for k, v in val.items(): + if "com.bbn.tc.schema.avro.cdm18.UUID" not in k: # node for Edge not needed as Attribute + new_key = f"{key}_{k}" if key else k + flatten_obj(new_key, v, target) + elif complexType is list: + for i in range(len(val)): + v = val[i] + new_key = f"{key}_{i}" if key else str(i) + flatten_obj(new_key, v, target) + elif complexType is object: + for k, v in val.__dict__.items(): + new_key = f"{key}_{k}" if key else k + flatten_obj(new_key, v, target) + else: + if "properties_map_arg_mem_flags" in key: + # String to list and make separate Attribute Values pairs for the Objects of list + values_list = eval(val) + cleaned_values = [value.strip("'") for value in values_list] + + index = 0 + for value in cleaned_values: + index += 1 + target[f"{key}_{index}"] = value + else: + target[key] = val + + + + + +# Data for the whole Batch, so it doesnt get copied when used in another function +# Holds a List with all known identifier, a directory to look up if the Node is already known +# Holds the Batch query +# Holds a number to create diffrent idetifier + +# Holds a list with the Nodes to match and a list with relations to add +# manages the build of these lists +class batchDataHolder: + def __init__(self): + self.knownNodes = dict() + self.create_Nodes = [] + self.identifierNumber = 0 #number to create unique identifier in the query + self.lookup_nodes = [] + self.insert_relations = [] + + def add_entry_Node(self, key, value): + self.knownNodes[key] = value + + def get_lookup_nodes(self): + return ', '.join(self.lookup_nodes) + + def get_insert_relations(self): + return ', '.join(self.insert_relations) + + def is_empty_lookup_nodes(self): + if self.lookup_nodes: + return False + else: + return True + + def is_empty_insert_relations(self): + if self.insert_relations: + return False + else: + return True + + + def get_knownNodes(self): + return self.knownNodes + + def get_Node_value(self, key): + return self.knownNodes[key] + + def check_key(self, key): + if key in self.knownNodes: + return True + else: + return False + + def append_with(self, ident): + self.with_list_identifier.append(ident) + + def get_Identifier_Increment_add_Node(self, type, key):#create a new identifier + identifier = type + str(self.identifierNumber) + self.identifierNumber += 1 + self.add_entry_Node(key, identifier) + return identifier + + def append_create_Nodes(self, add): + self.create_Nodes.append(add) + + def get_create_Nodes(self): + return ', '.join(self.create_Nodes) + + + def nodes_relations_to_cypher( self, val, newIdentifier, rel, type_letter, type): #type either ':Host' 'Subject' .... 
or '' + try: + if self.check_key(val): # Node is already known use the same identifier + ident = self.get_Node_value(val) + self.insert_relations.append(f'''({newIdentifier}) -[:{rel}]-> ({ident})''') + else : + ident = self.get_Identifier_Increment_add_Node(type_letter, val) # get identifier and add to knownNode + s = f'({ident}{type} {{uuid: "{val}"}})' + self.lookup_nodes.append(s) + self.insert_relations.append(f'''({newIdentifier}) -[:{rel}]-> ({ident})''') + except: + print(type_letter) + + +nodes_count = 0 + +def parse_json_to_cypher(data_list) : + global nodes_count + + if nodes_count % 10000 == 0: + print("Nodes: ", nodes_count, " Time: ", datetime.datetime.now().strftime("%H:%M:%S")) + + #nodes_count = nodes_count + + + + #knownNodes = dict() #to check if Node is known and no Lookup is needed + batchData = batchDataHolder() + all_values = {} + + + for input_string in data_list: # run throuh subgraphs of the Batch + + nodes_count = nodes_count +1 + + input = json.loads(input_string) + jsonType = list(input['datum'].keys())[0] + + # short type string + nodeType = jsonType.rsplit(".", 1)[1] + # data of object + value = input["datum"][jsonType] + + value_flat = {} # TODO value_flat sind die Attribute eines Knotens, jedoch müssen alle Attribute aller neuen Knoten eingefügt werden + flatten_obj("",value,value_flat) + # newIdentifier is the central node of the current Subgraph, each line of the Batch is own new Subgraph + newIdentifier = batchData.get_Identifier_Increment_add_Node('_', value_flat['uuid']) + # makes sure in with are differen identifier used (one numbercount for al kind of nodes) + all_values[newIdentifier] = value_flat + + relations = dict( + runsOn="" + ,isGeneratedBy="" + ,affects="" + ,affects2="" + ,residesOn="" + ,isPartOf="" # not in data set + ,hasOwningPricipal="" + ,hasTag="" # not in data set + ,hasParent="" + ,hasAccountOn="" + ,hasLocalPrincipal="" + ) + + + + + # create relationships + try: + if nodeType == 'Subject': + if value['parentSubject'] != None: + relations.update({'hasParent':value['parentSubject']["com.bbn.tc.schema.avro.cdm18.UUID"]}) + if value['hostId'] != None: + relations.update({'runsOn':value['hostId']}) + if value['localPrincipal'] != None: + relations.update({'hasLocalPrincipal':value['localPrincipal']}) + # the relationship for subject -[affects]-> event is missing... 
probably implicit through is generated by + + elif nodeType == 'FileObject': + if value['baseObject'] != None: + relations.update({"residesOn":value['baseObject']['hostId']}) + # relations.update({"isPartOf":}) + if value['localPrincipal']: + relations.update({"hasOwningPrincipal":value['localPrincipal']}) + # relations.update({"hasTag":}) + + # create relationships for host id + # mapping of cdm fields to relationships of nodes + elif nodeType == 'Event': + # if value['hostId'] != None: + # relations.update({'runsOn':value['hostId']}) + if value['subject'] != None: + relations.update({'isGeneratedBy':value['subject']['com.bbn.tc.schema.avro.cdm18.UUID']}) + if value['predicateObject'] != None: + relations.update({'affects':value['predicateObject']['com.bbn.tc.schema.avro.cdm18.UUID']}) + if value['predicateObject2'] != None: + relations.update({'affects2':value['predicateObject2']['com.bbn.tc.schema.avro.cdm18.UUID']}) + + elif nodeType == 'Principal': + if value['hostId'] != None: + relations.update({'hasAccountOn':value['hostId']}) + + elif nodeType == 'UnnamedPipeObject': + if value['sourceUUID'] != None: + relations.update({'affects':value['sourceUUID']['com.bbn.tc.schema.avro.cdm18.UUID']}) + if value['sinkUUID'] != None: + relations.update({'affects2':value["sinkUUID"]["com.bbn.tc.schema.avro.cdm18.UUID"]}) + if value['baseObject'] != None: + relations.update({'residesOn':value['baseObject']['hostId']}) + + elif nodeType == 'NetFlowObject': + if value['baseObject'] != None: + relations.update({'residesOn':value['baseObject']['hostId']}) + + elif nodeType == 'SrcSinkObject': + if value['baseObject'] != None: + relations.update({'residesOn':value['baseObject']['hostId']}) + + + + + # lookup existing nodes for relations + for rel in relations.keys(): + val = relations[rel] + if val != '': + + + + if rel =="residesOn": + batchData.nodes_relations_to_cypher(val, newIdentifier, rel, 'he', ':Host') + + elif rel == "runsOn": + batchData.nodes_relations_to_cypher(val, newIdentifier, rel, 'hu', ':Host') + + elif rel =="isGeneratedBy": + batchData.nodes_relations_to_cypher(val, newIdentifier, rel, 's', ':Subject') + + elif rel =="hasOwningPrincipal": + batchData.nodes_relations_to_cypher(val, newIdentifier, rel, 'p', ':Principal') + + elif rel =="affects": + batchData.nodes_relations_to_cypher(val, newIdentifier, rel, 'a', '') + + elif rel == 'affects2': + batchData.nodes_relations_to_cypher(val, newIdentifier, rel, 'a', '') + # ... 
other relations for Object not in data + elif rel =="hasParent": + batchData.nodes_relations_to_cypher(val, newIdentifier, rel, 'not', '') + + elif rel =='hasAccountOn': + batchData.nodes_relations_to_cypher(val, newIdentifier, rel, 'not', '') + + elif rel == 'hasLocalPrincipal': + batchData.nodes_relations_to_cypher(val, newIdentifier, rel, 'not', '') + + attribute_string = [] + #', '.join([f"{k}: '{v}'" for k, v in value_flat.items()]) + + for k, v in value_flat.items(): # Makes Sure only the Strings are handelded as Strings + if isinstance(v, str): + attribute_string.append(f"{k}: '{v}'") + else: + attribute_string.append(f"{k}: {v}") + + batchData.append_create_Nodes(f"""({newIdentifier}:{nodeType} {{{','.join(attribute_string)}}})""") + except: + print('Exception') + print('input: ', input) + print('relations: ', relations) + break + + + if batchData.is_empty_lookup_nodes(): + query = f""" CREATE {batchData.get_create_Nodes()} CREATE {batchData.get_insert_relations()}""" + else: + query = f""" MATCH {batchData.get_lookup_nodes()} CREATE {batchData.get_create_Nodes()} CREATE {batchData.get_insert_relations()}""" + # print() + # print('query: ', query) + # print() + # print('attributes: ', all_values) + # print(nodes_count) + return query, all_values + + +def create_cypher_query_from_cdm(json): + ''' + Create Cypher Queries from publisher message + ''' + query, value = parse_json_to_cypher(json) + return query, value + +def on_message(client, userdata, message): + ''' + The callback function for message listener + ''' + data = json.loads(message.payload.decode("utf-8")) + print("Received message from: ",message.topic) + q, value = create_cypher_query_from_cdm(data) + execute_query(q, value) + +def on_connect(client, userdata, flags, return_code): + ''' + Connecting and subscribing to the Mosquitto topic + ''' + if return_code == 0: + print("connected") + client.subscribe("neo4j",qos=1) + else: + print("could not connect, return code:", return_code) + client.failed_connect = True + +def connect_to_db(uri,auth): + ''' + Establish db connection to neo4j + ''' + driver = GraphDatabase.driver(uri, auth=auth) + with driver.session() as session: + print("Cleanup existing data...") + session.run("MATCH (n) detach delete n") + session.run("RETURN 1 as result") + print("Successfully connected to DB...") + + if(on_disk): + session.run("STORAGE MODE ON_DISK_TRANSACTIONAL") + # create indices here .... + if (create_indices): + session.run("CREATE INDEX ON :Subject(uuid);") + session.run("CREATE INDEX ON :Event(uuid);") + session.run("CREATE INDEX ON :Host(uuid);") + session.run("CREATE INDEX ON :FileObject(uuid);") + session.run("CREATE INDEX ON :NetFlowObject(uuid);") + session.run("CREATE INDEX ON :SrcSinkObject(uuid);") + session.run("CREATE INDEX ON :UnnamedPipeObject(uuid);") + session.run("CREATE INDEX ON :Principal(uuid);") + + return driver + + +def execute_query(query:str, value): + ''' + Execute any Neo4j Query. 
+
+    Expected parameters:
+    query = query string,
+    value = attributes to be inserted
+    '''
+
+    if '\\' in query:
+        # escape every `\` in the string as `\\`
+        query = query.replace("\\", "\\\\")
+
+    print("Before sending: ", datetime.datetime.now().strftime("%H:%M:%S"))
+    with driver.session() as session:
+        result = session.run(query)
+        summary = result.consume()  # makes sure the query was actually executed by the database
+    print("After sending: ", datetime.datetime.now().strftime("%H:%M:%S"))
+    # with driver.session() as session:
+    #     result = session.run(query, value)  # maybe result = session.run(query, **value)
+    return summary
+
+driver = connect_to_db(db_uri, neo4j_auth)
+
+# client.username_pw_set(username="user_name", password="password")  # uncomment if you use password auth
+client.on_connect = on_connect
+client.on_message = on_message
+client.failed_connect = False
+
+client.connect(broker_hostname, broker_port, keepalive=3600*4)
+client.loop_start()
+
+# this try-finally block ensures that whenever we terminate the program early by hitting ctrl+c, it still exits gracefully
+try:
+    i = 0
+    while i < abort_time_limit:  # and client.failed_connect == False:
+        time.sleep(1)
+        i += 1
+        if client.failed_connect == True:
+            print('Connection failed, exiting...')
+
+finally:
+    client.disconnect()
+    client.loop_stop()
+    driver.close()
+
diff --git a/code/infrastructure/streaming/clients/sub/memgraph/kind_oriented/sub_mem_whole_batch_kind_oriented_two_label.py b/code/infrastructure/streaming/clients/sub/memgraph/kind_oriented/sub_mem_whole_batch_kind_oriented_two_label.py
new file mode 100644
index 0000000..e0da121
--- /dev/null
+++ b/code/infrastructure/streaming/clients/sub/memgraph/kind_oriented/sub_mem_whole_batch_kind_oriented_two_label.py
@@ -0,0 +1,397 @@
+import datetime
+import paho.mqtt.client as mqtt
+import time
+import json
+from neo4j import GraphDatabase
+import os
+
+# Pattern
+# CREATE all nodes (using params)
+# MATCH all lookups (not known yet)
+# CREATE all edges
+
+broker_hostname = str(os.getenv('mos_host', default="localhost"))
+broker_port = int(os.getenv('mos_port', default=1883))
+client = mqtt.Client(mqtt.CallbackAPIVersion.VERSION1, "memgraph")
+db_uri = str(os.getenv('mem_host', default="bolt://localhost:8687"))  # local test port 8687, not local 7687
+neo4j_auth = ("", "")
+abort_time_limit = int(os.getenv('abort_time_limit', default=99999))
+create_indices = True  # os.getenv('create_indices', 'False').lower() in ['true', '1', 't', 'y', 'yes']
+on_disk = False
+second_label = 'Node'
+analytic = False
+
+def flatten_obj(key, val, target):
+    complexType = type(val)
+    if val is None:
+        return None
+    elif complexType is dict:
+        for k, v in val.items():
+            if "com.bbn.tc.schema.avro.cdm18.UUID" not in k:  # UUID keys become edges, not attributes
+                new_key = f"{key}_{k}" if key else k
+                flatten_obj(new_key, v, target)
+    elif complexType is list:
+        for i in range(len(val)):
+            v = val[i]
+            new_key = f"{key}_{i}" if key else str(i)
+            flatten_obj(new_key, v, target)
+    elif complexType is object:
+        for k, v in val.__dict__.items():
+            new_key = f"{key}_{k}" if key else k
+            flatten_obj(new_key, v, target)
+    else:
+        if "properties_map_arg_mem_flags" in key:
+            # string to list, then one separate attribute-value pair per list element
+            values_list = eval(val)
+            cleaned_values = [value.strip("'") for value in values_list]
+
+            index = 0
+            for value in cleaned_values:
+                index += 1
+                target[f"{key}_{index}"] = value
+        else:
+            target[key] = val
+
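+# Example (illustration only): flatten_obj turns a nested CDM record such as
+#   {"baseObject": {"hostId": "h1", "permission": {"int": 511}}}
+# into the flat attribute dict
+#   {"baseObject_hostId": "h1", "baseObject_permission_int": 511};
+# keys containing "com.bbn.tc.schema.avro.cdm18.UUID" are skipped because they
+# become edges rather than attributes. Note that eval() on the
+# properties_map_arg_mem_flags value executes arbitrary input; if that field is
+# always a list literal, ast.literal_eval would be a safer drop-in.
+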
+# Data for the whole batch, kept in one object so it doesn't get copied when used in another function.
+# Holds a dict of all known identifiers, used to look up whether a node is already known,
+# the batch query parts, and a counter to create distinct identifiers.
+
+# Also holds the list of nodes to match and the list of relations to add,
+# and manages the build of these lists.
+class batchDataHolder:
+    def __init__(self):
+        self.knownNodes = dict()
+        self.create_Nodes = []
+        self.identifierNumber = 0  # counter to create unique identifiers in the query
+        self.lookup_nodes = []
+        self.insert_relations = []
+
+    def add_entry_Node(self, key, value):
+        self.knownNodes[key] = value
+
+    def get_lookup_nodes(self):
+        return ', '.join(self.lookup_nodes)
+
+    def get_insert_relations(self):
+        return ', '.join(self.insert_relations)
+
+    def is_empty_lookup_nodes(self):
+        if self.lookup_nodes:
+            return False
+        else:
+            return True
+
+    def is_empty_insert_relations(self):
+        if self.insert_relations:
+            return False
+        else:
+            return True
+
+    def get_knownNodes(self):
+        return self.knownNodes
+
+    def get_Node_value(self, key):
+        return self.knownNodes[key]
+
+    def check_key(self, key):
+        if key in self.knownNodes:
+            return True
+        else:
+            return False
+
+    def append_with(self, ident):
+        # leftover from the subgraph-oriented variants: with_list_identifier is never initialised here
+        self.with_list_identifier.append(ident)
+
+    def get_Identifier_Increment_add_Node(self, type, key):  # create a new identifier
+        identifier = type + str(self.identifierNumber)
+        self.identifierNumber += 1
+        self.add_entry_Node(key, identifier)
+        return identifier
+
+    def append_create_Nodes(self, add):
+        self.create_Nodes.append(add)
+
+    def get_create_Nodes(self):
+        return ', '.join(self.create_Nodes)
+
+    def nodes_relations_to_cypher(self, val, newIdentifier, rel, type_letter, type):  # type is either ':Host', ':Subject', ... or ''
+        try:
+            if self.check_key(val):  # node is already known, reuse the same identifier
+                ident = self.get_Node_value(val)
+                self.insert_relations.append(f'''({newIdentifier}) -[:{rel}]-> ({ident})''')
+            else:
+                ident = self.get_Identifier_Increment_add_Node(type_letter, val)  # get identifier and add to knownNodes
+                s = f'({ident}{type} {{uuid: "{val}"}})'
+                self.lookup_nodes.append(s)
+                self.insert_relations.append(f'''({newIdentifier}) -[:{rel}]-> ({ident})''')
+        except:
+            print(type_letter)
+
+
+nodes_count = 0
+
+def parse_json_to_cypher(data_list):
+    global nodes_count
+
+    if nodes_count % 10000 == 0:
+        print("Nodes: ", nodes_count, " Time: ", datetime.datetime.now().strftime("%H:%M:%S"))
+
+    # knownNodes = dict()  # to check if a node is known and no lookup is needed
+    batchData = batchDataHolder()
+    all_values = {}
+
+    if nodes_count > 2400000:
+        print('2400000 Nodes')
+    for input_string in data_list:  # run through the subgraphs of the batch
+
+        nodes_count = nodes_count + 1
+
+        input = json.loads(input_string)
+        jsonType = list(input['datum'].keys())[0]
+        # short type string
+        nodeType = jsonType.rsplit(".", 1)[1]
+        # data of the object
+        value = input["datum"][jsonType]
+
+        value_flat = {}  # TODO: value_flat holds the attributes of one node, but the attributes of all new nodes have to be inserted
+        flatten_obj("", value, value_flat)
+        # newIdentifier is the central node of the current subgraph; each line of the batch is its own new subgraph
+        newIdentifier = batchData.get_Identifier_Increment_add_Node('_', value_flat['uuid'])
+        # one shared counter for all node kinds makes sure distinct identifiers are used
+        all_values[newIdentifier] = value_flat
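+        # Illustration: for the first batch line the central node gets the query
+        # variable _0, so all_values becomes {"_0": {...flat attributes...}} and is
+        # later bound as one $-parameter per created node via session.run(query, value).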
+        relations = dict(
+            runsOn=""
+            ,isGeneratedBy=""
+            ,affects=""
+            ,affects2=""
+            ,residesOn=""
+            ,isPartOf=""           # not in data set
+            ,hasOwningPrincipal=""
+            ,hasTag=""             # not in data set
+            ,hasParent=""
+            ,hasAccountOn=""
+            ,hasLocalPrincipal=""
+        )
+
+        # create relationships
+        try:
+            if nodeType == 'Subject':
+                if value['parentSubject'] != None:
+                    relations.update({'hasParent': value['parentSubject']["com.bbn.tc.schema.avro.cdm18.UUID"]})
+                if value['hostId'] != None:
+                    relations.update({'runsOn': value['hostId']})
+                if value['localPrincipal'] != None:
+                    relations.update({'hasLocalPrincipal': value['localPrincipal']})
+                # the relationship subject -[affects]-> event is missing... probably implicit through isGeneratedBy
+
+            elif nodeType == 'FileObject':
+                if value['baseObject'] != None:
+                    relations.update({"residesOn": value['baseObject']['hostId']})
+                # relations.update({"isPartOf":})
+                if value['localPrincipal']:
+                    relations.update({"hasOwningPrincipal": value['localPrincipal']})
+                # relations.update({"hasTag":})
+
+            # mapping of CDM fields to relationships of nodes
+            elif nodeType == 'Event':
+                # if value['hostId'] != None:
+                #     relations.update({'runsOn': value['hostId']})
+                if value['subject'] != None:
+                    relations.update({'isGeneratedBy': value['subject']['com.bbn.tc.schema.avro.cdm18.UUID']})
+                if value['predicateObject'] != None:
+                    relations.update({'affects': value['predicateObject']['com.bbn.tc.schema.avro.cdm18.UUID']})
+                if value['predicateObject2'] != None:
+                    relations.update({'affects2': value['predicateObject2']['com.bbn.tc.schema.avro.cdm18.UUID']})
+
+            elif nodeType == 'Principal':
+                if value['hostId'] != None:
+                    relations.update({'hasAccountOn': value['hostId']})
+
+            elif nodeType == 'UnnamedPipeObject':
+                if value['sourceUUID'] != None:
+                    relations.update({'affects': value['sourceUUID']['com.bbn.tc.schema.avro.cdm18.UUID']})
+                if value['sinkUUID'] != None:
+                    relations.update({'affects2': value["sinkUUID"]["com.bbn.tc.schema.avro.cdm18.UUID"]})
+                if value['baseObject'] != None:
+                    relations.update({'residesOn': value['baseObject']['hostId']})
+
+            elif nodeType == 'NetFlowObject':
+                if value['baseObject'] != None:
+                    relations.update({'residesOn': value['baseObject']['hostId']})
+
+            elif nodeType == 'SrcSinkObject':
+                if value['baseObject'] != None:
+                    relations.update({'residesOn': value['baseObject']['hostId']})
+
+            # look up existing nodes for relations
+            for rel in relations.keys():
+                val = relations[rel]
+                if val != '':
+
+                    if rel == "residesOn":
+                        batchData.nodes_relations_to_cypher(val, newIdentifier, rel, 'he', ':Host')
+
+                    elif rel == "runsOn":
+                        batchData.nodes_relations_to_cypher(val, newIdentifier, rel, 'hu', ':Host')
+
+                    elif rel == "isGeneratedBy":
+                        batchData.nodes_relations_to_cypher(val, newIdentifier, rel, 's', f':Subject:{second_label}')
+
+                    elif rel == "hasOwningPrincipal":
+                        batchData.nodes_relations_to_cypher(val, newIdentifier, rel, 'p', ':Principal')
+
+                    elif rel == "affects":
+                        batchData.nodes_relations_to_cypher(val, newIdentifier, rel, 'a', f':{second_label}')
+
+                    elif rel == 'affects2':
+                        batchData.nodes_relations_to_cypher(val, newIdentifier, rel, 'a', f':{second_label}')
+                    # ...
other relations for Object not in data + elif rel =="hasParent": + batchData.nodes_relations_to_cypher(val, newIdentifier, rel, 'not', f':{second_label}') + + elif rel =='hasAccountOn': + batchData.nodes_relations_to_cypher(val, newIdentifier, rel, 'not', f':{second_label}') + + elif rel == 'hasLocalPrincipal': + batchData.nodes_relations_to_cypher(val, newIdentifier, rel, 'not', f':{second_label}') + + batchData.append_create_Nodes(f"""({newIdentifier}:{nodeType}:{second_label} ${newIdentifier})""") + except: + print('Exception') + print('input: ', input) + print('relations: ', relations) + break + + + if batchData.is_empty_lookup_nodes(): + query = f""" CREATE {batchData.get_create_Nodes()} CREATE {batchData.get_insert_relations()}""" + else: + query = f""" MATCH {batchData.get_lookup_nodes()} CREATE {batchData.get_create_Nodes()} CREATE {batchData.get_insert_relations()}""" + # print() + # print('query: ', query) + # print() + # print('attributes: ', all_values) + # print(nodes_count) + return query, all_values + + +def create_cypher_query_from_cdm(json): + ''' + Create Cypher Queries from publisher message + ''' + query, value = parse_json_to_cypher(json) + return query, value + +def on_message(client, userdata, message): + ''' + The callback function for message listener + ''' + data = json.loads(message.payload.decode("utf-8")) + print("Received message from: ",message.topic) + q, value = create_cypher_query_from_cdm(data) + execute_query(q, value) + +def on_connect(client, userdata, flags, return_code): + ''' + Connecting and subscribing to the Mosquitto topic + ''' + if return_code == 0: + print("connected") + client.subscribe("neo4j",qos=1) + else: + print("could not connect, return code:", return_code) + client.failed_connect = True + +def connect_to_db(uri,auth): + ''' + Establish db connection to neo4j + ''' + driver = GraphDatabase.driver(uri, auth=auth) + with driver.session() as session: + print("Cleanup existing data...") + session.run("MATCH (n) detach delete n") + session.run("RETURN 1 as result") + print("Successfully connected to DB...") + + if(on_disk): + session.run("STORAGE MODE ON_DISK_TRANSACTIONAL") + if (analytic): + session.run("STORAGE MODE IN_MEMORY_ANALYTICAL;") + # create indices here .... + if (create_indices): + s = f"CREATE INDEX ON :{second_label}(uuid);" + session.run(s) + session.run("CREATE INDEX ON :Subject(uuid);") + session.run("CREATE INDEX ON :Event(uuid);") + session.run("CREATE INDEX ON :Host(uuid);") + session.run("CREATE INDEX ON :FileObject(uuid);") + session.run("CREATE INDEX ON :NetFlowObject(uuid);") + session.run("CREATE INDEX ON :SrcSinkObject(uuid);") + session.run("CREATE INDEX ON :UnnamedPipeObject(uuid);") + session.run("CREATE INDEX ON :Principal(uuid);") + + return driver + + +def execute_query(query:str, value): + ''' + Execute any Neo4j Query. 
+ + Expected Query Parameter: + query = Query String, + attributes = atttributes to be inserted + ''' + #print(query) + print("Before sending: ", datetime.datetime.now().strftime("%H:%M:%S")) + with driver.session() as session: + result = session.run(query, value) + summary = result.consume()# makes sure that the query was run by the Databases + print("After sending: ", datetime.datetime.now().strftime("%H:%M:%S")) + # with driver.session() as session: + # result = session.run(query, value) # maybe result = session.run(query, **value) + return summary + +driver = connect_to_db(db_uri,neo4j_auth) + +# client.username_pw_set(username="user_name", password="password") # uncomment if you use password auth +client.on_connect = on_connect +client.on_message = on_message +client.failed_connect = False + +client.connect(broker_hostname, broker_port,keepalive=3600*4) +client.loop_start() + +# this try-finally block ensures that whenever we terminate the program earlier by hitting ctrl+c, it still gracefully exits +try: + i = 0 + while i < abort_time_limit: #and client.failed_connect == False: + time.sleep(1) + i += 1 + if client.failed_connect == True: + print('Connection failed, exiting...') + +finally: + client.disconnect() + client.loop_stop() + driver.close() + diff --git a/code/infrastructure/streaming/clients/sub/memgraph/subgraph_oriented/sub_mem_whole_batch_subgraph_oriented.py b/code/infrastructure/streaming/clients/sub/memgraph/subgraph_oriented/sub_mem_whole_batch_subgraph_oriented.py new file mode 100644 index 0000000..7c4c6d9 --- /dev/null +++ b/code/infrastructure/streaming/clients/sub/memgraph/subgraph_oriented/sub_mem_whole_batch_subgraph_oriented.py @@ -0,0 +1,446 @@ +import datetime +import paho.mqtt.client as mqtt +import time +import json +from neo4j import GraphDatabase +import os + +#Muster +# MATCH lookupnodes (not known yet) +# CREATE current Node (not using PARAM) +# CREATE edges +# WITH +# repeat next node + +broker_hostname=str(os.getenv('mos_host',default="localhost")) +broker_port = int(os.getenv('mos_port',default=1883)) +client = mqtt.Client(mqtt.CallbackAPIVersion.VERSION1,"memgraph") +db_uri =str(os.getenv('mem_host',default="bolt://localhost:8687")) #local test Port 8687 not local 7687 +neo4j_auth=("","") +abort_time_limit = int(os.getenv('abort_time_limit', default=99999)) +create_indices = True #os.getenv('create_indices', 'False').lower() in ['true', '1', 't', 'y', 'yes'] +analytic = False + +def flatten_obj(key, val, target): + complexType = type(val) + if val is None: + return None + elif complexType is dict: + for k, v in val.items(): + if "com.bbn.tc.schema.avro.cdm18.UUID" not in k: # node for Edge not needed as Attribute + new_key = f"{key}_{k}" if key else k + flatten_obj(new_key, v, target) + elif complexType is list: + for i in range(len(val)): + v = val[i] + new_key = f"{key}_{i}" if key else str(i) + flatten_obj(new_key, v, target) + elif complexType is object: + for k, v in val.__dict__.items(): + new_key = f"{key}_{k}" if key else k + flatten_obj(new_key, v, target) + else: + if "properties_map_arg_mem_flags" in key: + # String to list and make separate Attribute Values pairs for the Objects of list + values_list = eval(val) + cleaned_values = [value.strip("'") for value in values_list] + + index = 0 + for value in cleaned_values: + index += 1 + target[f"{key}_{index}"] = value + else: + target[key] = val + + +# Holds a list with the Nodes to match and a list with relations to add +# manages the build of these lists +class subgraphNodeRelation: + + 
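+    # Rough shape of the per-line Cypher this class helps assemble (illustrative):
+    #   MATCH (s1:Subject {uuid: "..."})
+    #   CREATE (_0:Event {...attributes...})
+    #   CREATE (_0) -[:isGeneratedBy]-> (s1)
+    #   WITH _0, s1 ... (the next line of the batch continues here)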
+    def __init__(self):
+        self.lookup_nodes = []
+        self.insert_relations = []
+
+    def nodes_relations_to_cypher(self, val, newIdentifier, rel, batchData, type_letter, type):  # type is either ':Host', ':Subject', ... or ''
+        try:
+            if batchData.check_key(val):  # node is already known, reuse the same identifier
+                ident = batchData.get_Node_value(val)
+                self.insert_relations.append(f'''({newIdentifier}) -[:{rel}]-> ({ident})''')
+            else:
+                ident = batchData.get_Identifier_Increment_add_Node(type_letter, val)  # get identifier and add to knownNodes
+                s = f'({ident}{type} {{uuid: "{val}"}})'
+                self.lookup_nodes.append(s)
+                self.insert_relations.append(f'''({newIdentifier}) -[:{rel}]-> ({ident})''')
+        except:
+            print(type_letter)
+
+    def get_lookup_nodes(self):
+        return ', '.join(self.lookup_nodes)
+
+    def get_insert_relations(self):
+        return ', '.join(self.insert_relations)
+
+    def is_empty_lookup_nodes(self):
+        if self.lookup_nodes:
+            return False
+        else:
+            return True
+
+    def is_empty_insert_relations(self):
+        if self.insert_relations:
+            return False
+        else:
+            return True
+
+
+# Data for the whole batch, kept in one object so it doesn't get copied when used in another function.
+# Holds a dict of all known identifiers, used to look up whether a node is already known,
+# the batch query, and a counter to create distinct identifiers.
+class batchDataHolder:
+    def __init__(self):
+        self.knownNodes = dict()
+        self.with_list_identifier = []
+        self.batch_query = []
+        self.identifierNumber = 0  # counter to create unique identifiers in the query
+
+    def add_entry_Node(self, key, value):
+        self.knownNodes[key] = value
+        self.with_list_identifier.append(value)  # add the value to WITH
+
+    def get_knownNodes(self):
+        return self.knownNodes
+
+    def get_Node_value(self, key):
+        return self.knownNodes[key]
+
+    def check_key(self, key):
+        if key in self.knownNodes:
+            return True
+        else:
+            return False
+
+    def append_with(self, ident):
+        self.with_list_identifier.append(ident)
+
+    def get_With_List(self):
+        return ', '.join(self.with_list_identifier)
+
+    def get_Identifier_Increment_add_Node(self, type, key):  # create a new identifier
+        identifier = type + str(self.identifierNumber)
+        self.identifierNumber += 1
+        self.add_entry_Node(key, identifier)
+        return identifier
+
+    def append_query(self, add):
+        self.batch_query.append(add)
+
+    def get_query(self):
+        return ''.join(self.batch_query)
+
+    def add_query_with(self):  # each subgraph takes over the nodes from the subgraph before it
+        if self.batch_query:  # only if batch_query is not empty (an empty list is falsy)
+            self.batch_query.append(' WITH ' + ', '.join(self.with_list_identifier) + ' ')  # join the WITH identifiers into a ', '-separated string
+
+
+nodes_count = 0
+
+def parse_json_to_cypher(data_list):
+    global nodes_count
+
+    # knownNodes = dict()  # to check if a node is known and no lookup is needed
+    batchData = batchDataHolder()
+    if nodes_count % 5000 == 0:
+        print(nodes_count)
+        print(datetime.datetime.now().strftime("%H:%M:%S"))
+
+    for input_string in data_list:  # run through the subgraphs of the batch
+        # print(nodes_count)
+        nodes_count = nodes_count + 1
+
+        batchData.add_query_with()  # add the nodes from the previous subgraph, if not the first
+
+        input = json.loads(input_string)
+        jsonType = list(input['datum'].keys())[0]
+        # short type string
+        nodeType = jsonType.rsplit(".", 1)[1]
+        # data of the object
+        value = input["datum"][jsonType]
+
+        value_flat = {}  # TODO: value_flat holds the attributes of one node, but the attributes of all new nodes have to be inserted
+        flatten_obj("", value, value_flat)
+
+        relations = dict(
+            runsOn=""
+            ,isGeneratedBy=""
+            ,affects=""
+            ,affects2=""
+            ,residesOn=""
+            ,isPartOf=""           # not in data set
+            ,hasOwningPrincipal=""
+            ,hasTag=""             # not in data set
+            ,hasParent=""
+            ,hasAccountOn=""
+            ,hasLocalPrincipal=""
+        )
+
+        # create relationships
+        try:
+            if nodeType == 'Subject':
+                if value['parentSubject'] != None:
+                    relations.update({'hasParent': value['parentSubject']["com.bbn.tc.schema.avro.cdm18.UUID"]})
+                if value['hostId'] != None:
+                    relations.update({'runsOn': value['hostId']})
+                if value['localPrincipal'] != None:
+                    relations.update({'hasLocalPrincipal': value['localPrincipal']})
+                # the relationship subject -[affects]-> event is missing... probably implicit through isGeneratedBy
+
+            elif nodeType == 'FileObject':
+                if value['baseObject'] != None:
+                    relations.update({"residesOn": value['baseObject']['hostId']})
+                # relations.update({"isPartOf":})
+                if value['localPrincipal']:
+                    relations.update({"hasOwningPrincipal": value['localPrincipal']})
+                # relations.update({"hasTag":})
+
+            # mapping of CDM fields to relationships of nodes
+            elif nodeType == 'Event':
+                # if value['hostId'] != None:
+                #     relations.update({'runsOn': value['hostId']})
+                if value['subject'] != None:
+                    relations.update({'isGeneratedBy': value['subject']['com.bbn.tc.schema.avro.cdm18.UUID']})
+                if value['predicateObject'] != None:
+                    relations.update({'affects': value['predicateObject']['com.bbn.tc.schema.avro.cdm18.UUID']})
+                if value['predicateObject2'] != None:
+                    relations.update({'affects2': value['predicateObject2']['com.bbn.tc.schema.avro.cdm18.UUID']})
+
+            elif nodeType == 'Principal':
+                if value['hostId'] != None:
+                    relations.update({'hasAccountOn': value['hostId']})
+
+            elif nodeType == 'UnnamedPipeObject':
+                if value['sourceUUID'] != None:
+                    relations.update({'affects': value['sourceUUID']['com.bbn.tc.schema.avro.cdm18.UUID']})
+                if value['sinkUUID'] != None:
+                    relations.update({'affects2': value["sinkUUID"]["com.bbn.tc.schema.avro.cdm18.UUID"]})
+                if value['baseObject'] != None:
+                    relations.update({'residesOn': value['baseObject']['hostId']})
+
+            elif nodeType == 'NetFlowObject':
+                if value['baseObject'] != None:
+                    relations.update({'residesOn': value['baseObject']['hostId']})
+
+            elif nodeType == 'SrcSinkObject':
+                if value['baseObject'] != None:
+                    relations.update({'residesOn': value['baseObject']['hostId']})
+
+            # newIdentifier is the central node of the current subgraph; each line of the batch is its own new subgraph
+            newIdentifier = batchData.get_Identifier_Increment_add_Node('_', value_flat['uuid'])  # TODO add the central node uuid
+            # one shared counter for all node kinds makes sure distinct identifiers are used,
+            # and the known nodes are handed to the next action so a new lookup isn't necessary
+            # next(iter(value_flat.values()), None) -> the first value of the attribute list should be the uuid
+
+            subgraph = subgraphNodeRelation()
+
+            # look up existing nodes for relations
+            for rel in relations.keys():
+                val = relations[rel]
+                if val != '':
+
+                    if rel == "residesOn":
+                        subgraph.nodes_relations_to_cypher(val, newIdentifier, rel, batchData, 'he', ':Host')
+
+                    elif rel == "runsOn":
+                        subgraph.nodes_relations_to_cypher(val, newIdentifier, rel, batchData, 'hu', ':Host')
+
+                    elif rel == "isGeneratedBy":
+                        subgraph.nodes_relations_to_cypher(val, newIdentifier, rel, batchData, 's', ':Subject')
+
+                    elif rel == "hasOwningPrincipal":
+                        subgraph.nodes_relations_to_cypher(val, newIdentifier, rel, batchData, 'p', ':Principal')
+                    elif rel == "affects":
+                        subgraph.nodes_relations_to_cypher(val, newIdentifier, rel, batchData, 'a', '')
+
+                    elif rel == 'affects2':
+                        subgraph.nodes_relations_to_cypher(val, newIdentifier, rel, batchData, 'a', '')
+                    # ... other relations for this object kind do not occur in the data set
+                    elif rel == "hasParent":
+                        subgraph.nodes_relations_to_cypher(val, newIdentifier, rel, batchData, 'not', '')
+
+                    elif rel == 'hasAccountOn':
+                        subgraph.nodes_relations_to_cypher(val, newIdentifier, rel, batchData, 'not', '')
+
+                    elif rel == 'hasLocalPrincipal':
+                        subgraph.nodes_relations_to_cypher(val, newIdentifier, rel, batchData, 'not', '')
+
+            try:
+                attribute_string = []
+
+                for k, v in value_flat.items():  # quote only string values; the keys are always strings here
+                    if isinstance(v, str):
+                        attribute_string.append(f"{k}: '{v}'")
+                    else:
+                        attribute_string.append(f"{k}: {v}")
+
+                attribute_string2 = ', '.join(attribute_string)
+
+                if subgraph.is_empty_lookup_nodes():
+                    if subgraph.is_empty_insert_relations():  # no relations or nodes to add (should not happen)
+                        batchData.append_query(f"""CREATE ({newIdentifier}:{nodeType} {{{attribute_string2}}})""")
+                    else:  # no new nodes: happens when all nodes are already matched
+                        batchData.append_query(f"""CREATE ({newIdentifier}:{nodeType} {{{attribute_string2}}}) CREATE {subgraph.get_insert_relations()}""")
+                else:
+                    batchData.append_query(f"""MATCH {subgraph.get_lookup_nodes()} CREATE ({newIdentifier}:{nodeType} {{{attribute_string2}}}) CREATE {subgraph.get_insert_relations()}""")
+                # print(batchData.get_query())
+            except:
+                print('Exception')
+                print('input: ', input)
+                print()
+                print('relations: ', relations)
+                print()
+                print('Query: ', batchData.get_query())
+                print()
+                print('Relations: ', subgraph.get_insert_relations())
+                print()
+                print('Match Nodes: ', subgraph.get_lookup_nodes())
+                print('Attribute_String: ', attribute_string2)
+                # q_rel = "".join(insert_strings)
+
+                # query = f"""
+                # CREATE ({identifierNumber}:{nodeType} $attributes)
+                # {q_rel}
+                # RETURN {identifierNumber}
+                # """
+
+        except:
+            print('Exception')
+            print('input: ', input)
+            print('relations: ', relations)
+            break
+    query = batchData.get_query()
+    # print()
+    # print(query)
+    # print()
+
+    return query
+
+
+def create_cypher_query_from_cdm(json):
+    '''
+    Create Cypher queries from a publisher message
+    '''
+    query = parse_json_to_cypher(json)
+    return query
+
+def on_message(client, userdata, message):
+    '''
+    The callback function for the message listener
+    '''
+    data = json.loads(message.payload.decode("utf-8"))
+    print("Received message from: ", message.topic)
+    q = create_cypher_query_from_cdm(data)
+    execute_query(q)
+
+def on_connect(client, userdata, flags, return_code):
+    '''
+    Connect and subscribe to the Mosquitto topic
+    '''
+    if return_code == 0:
+        print("connected")
+        client.subscribe("neo4j", qos=1)
+    else:
+        print("could not connect, return code:", return_code)
+        client.failed_connect = True
+
+def connect_to_db(uri, auth):
+    '''
+    Establish the db connection (Bolt, neo4j driver)
+    '''
+    driver = GraphDatabase.driver(uri, auth=auth)
+    with driver.session() as session:
+        print("Cleanup existing data...")
+        session.run("MATCH (n) DETACH DELETE n")
+        session.run("RETURN 1 as result")
+        print("Successfully connected to DB...")
+
+        # create indices here ....
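+        # "CREATE INDEX ON :Label(property)" below is Memgraph's label-property index
+        # syntax; against a plain Neo4j 5 server the equivalent statement would be
+        # "CREATE INDEX FOR (n:Label) ON (n.property)" (an assumption to verify if these
+        # scripts are ever pointed at Neo4j instead of Memgraph).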
+        if (create_indices):
+            if (analytic):
+                session.run("STORAGE MODE IN_MEMORY_ANALYTICAL;")
+            session.run("CREATE INDEX ON :Subject(uuid);")
+            session.run("CREATE INDEX ON :Event(uuid);")
+            session.run("CREATE INDEX ON :Host(uuid);")
+            session.run("CREATE INDEX ON :FileObject(uuid);")
+            session.run("CREATE INDEX ON :NetFlowObject(uuid);")
+            session.run("CREATE INDEX ON :SrcSinkObject(uuid);")
+            session.run("CREATE INDEX ON :UnnamedPipeObject(uuid);")
+            session.run("CREATE INDEX ON :Principal(uuid);")
+
+    return driver
+
+
+def execute_query(query:str):
+    '''
+    Execute a Cypher query against the database.
+
+    Expected parameters:
+    query = query string
+    '''
+    # print(query)
+
+    global nodes_count
+
+    if '\\' in query:
+        # escape every `\` in the string as `\\`
+        query = query.replace("\\", "\\\\")
+
+    print("Before sending: ", datetime.datetime.now().strftime("%H:%M:%S"))
+    with driver.session() as session:
+        result = session.run(query)
+    print("After sending: ", datetime.datetime.now().strftime("%H:%M:%S"))
+    return result.data()
+
+driver = connect_to_db(db_uri, neo4j_auth)
+
+# client.username_pw_set(username="user_name", password="password")  # uncomment if you use password auth
+client.on_connect = on_connect
+client.on_message = on_message
+client.failed_connect = False
+
+client.connect(broker_hostname, broker_port, keepalive=3600*4)
+client.loop_start()
+
+# this try-finally block ensures that whenever we terminate the program early by hitting ctrl+c, it still exits gracefully
+try:
+    i = 0
+    while i < abort_time_limit:  # and client.failed_connect == False:
+        time.sleep(1)
+        i += 1
+        if client.failed_connect == True:
+            print('Connection failed, exiting...')
+
+finally:
+    client.disconnect()
+    client.loop_stop()
+    driver.close()
+
diff --git a/code/infrastructure/streaming/clients/sub/memgraph/subgraph_oriented/sub_mem_whole_batch_subgraph_oriented_using_Params.py b/code/infrastructure/streaming/clients/sub/memgraph/subgraph_oriented/sub_mem_whole_batch_subgraph_oriented_using_Params.py
new file mode 100644
index 0000000..1df83d1
--- /dev/null
+++ b/code/infrastructure/streaming/clients/sub/memgraph/subgraph_oriented/sub_mem_whole_batch_subgraph_oriented_using_Params.py
@@ -0,0 +1,430 @@
+import datetime
+import paho.mqtt.client as mqtt
+import time
+import json
+from neo4j import GraphDatabase
+import os
+
+
+# Pattern
+# MATCH lookup nodes (not known yet)
+# CREATE current node (using params)
+# CREATE edges
+# WITH
+# repeat for the next node
+
+broker_hostname = str(os.getenv('mos_host', default="localhost"))
+broker_port = int(os.getenv('mos_port', default=1883))
+client = mqtt.Client(mqtt.CallbackAPIVersion.VERSION1, "memgraph")
+db_uri = str(os.getenv('mem_host', default="bolt://localhost:8687"))  # local test port 8687, not local 7687
+neo4j_auth = ("", "")
+abort_time_limit = int(os.getenv('abort_time_limit', default=99999))
+create_indices = True  # os.getenv('create_indices', 'False').lower() in ['true', '1', 't', 'y', 'yes']
+analytic = False
+
+def flatten_obj(key, val, target):
+    complexType = type(val)
+    if val is None:
+        return None
+    elif complexType is dict:
+        for k, v in val.items():
+            if "com.bbn.tc.schema.avro.cdm18.UUID" not in k:  # UUID keys become edges, not attributes
+                new_key = f"{key}_{k}" if key else k
+                flatten_obj(new_key, v, target)
+    elif complexType is list:
+        for i in range(len(val)):
+            v = val[i]
+            new_key = f"{key}_{i}" if key else str(i)
+            flatten_obj(new_key, v, target)
+    elif complexType is object:
+        for k, v in
val.__dict__.items(): + new_key = f"{key}_{k}" if key else k + flatten_obj(new_key, v, target) + else: + if "properties_map_arg_mem_flags" in key: + # String to list and make separate Attribute Values pairs for the Objects of list + values_list = eval(val) + cleaned_values = [value.strip("'") for value in values_list] + + index = 0 + for value in cleaned_values: + index += 1 + target[f"{key}_{index}"] = value + else: + target[key] = val + + +# Holds a list with the Nodes to match and a list with relations to add +# manages the build of these lists +class subgraphNodeRelation: + + def __init__(self): + self.lookup_nodes = [] + self.insert_relations = [] + + def nodes_relations_to_cypher( self, val, newIdentifier, rel, batchData, type_letter, type): #type either ':Host' 'Subject' .... or '' + try: + if batchData.check_key(val): # Node is already known use the same identifier + ident = batchData.get_Node_value(val) + self.insert_relations.append(f'''({newIdentifier}) -[:{rel}]-> ({ident})''') + else : + ident = batchData.get_Identifier_Increment_add_Node(type_letter, val) # get identifier and add to knownNode + s = f'({ident}{type} {{uuid: "{val}"}})' + self.lookup_nodes.append(s) + self.insert_relations.append(f'''({newIdentifier}) -[:{rel}]-> ({ident})''') + except: + print(type_letter) + + def get_lookup_nodes(self): + return ', '.join(self.lookup_nodes) + + def get_insert_relations(self): + return ', '.join(self.insert_relations) + + def is_empty_lookup_nodes(self): + if self.lookup_nodes: + return False + else: + return True + + def is_empty_insert_relations(self): + if self.insert_relations: + return False + else: + return True + + +# Data for the whole Batch, so it doesnt get copied when used in another function +# Holds a List with all known identifier, a directory to look up if the Node is already known +# Holds the Batch query +# Holds a number to create diffrent idetifier +class batchDataHolder: + def __init__(self): + self.knownNodes = dict() + self.with_list_identifier = [] + self.batch_query = [] + self.identifierNumber = 0 #number to create unique identifier in the query + + def add_entry_Node(self, key, value): + self.knownNodes[key] = value + self.with_list_identifier.append(value) #add the value to with + + + def get_knownNodes(self): + return self.knownNodes + + def get_Node_value(self, key): + return self.knownNodes[key] + + def check_key(self, key): + if key in self.knownNodes: + return True + else: + return False + + def append_with(self, ident): + self.with_list_identifier.append(ident) + + def get_With_List(self): + return ', '.join(self.with_list_identifier) + + def get_Identifier_Increment_add_Node(self, type, key):#create a new identifier + identifier = type + str(self.identifierNumber) + self.identifierNumber += 1 + self.add_entry_Node(key, identifier) + return identifier + + def append_query(self, add): + self.batch_query.append(add) + + def get_query(self): + return ''.join(self.batch_query) + + def add_query_with(self): # Each Subgraph takes over the nodes from the Subgraph before + if self.batch_query: # only if batach_query not empty # empty list is false not empty true + self.batch_query.append(' WITH ' + ', '.join(self.with_list_identifier) + ' ') # append the with identifier transform to string split by ', ' + + +nodes_count = 0 + +def parse_json_to_cypher(data_list) : + global nodes_count + + + #knownNodes = dict() #to check if Node is known and no Lookup is needed + batchData = batchDataHolder() + all_values = {} + + + for input_string in data_list: # run 
+        # through the subgraphs of the batch
+
+        if nodes_count % 5000 == 0:
+            print("Nodes: ", nodes_count, " Time: ", datetime.datetime.now().strftime("%H:%M:%S"))
+
+        nodes_count = nodes_count + 1
+
+        batchData.add_query_with()  # add the nodes from the previous subgraph, if not the first
+
+        input = json.loads(input_string)
+        jsonType = list(input['datum'].keys())[0]
+        # short type string
+        nodeType = jsonType.rsplit(".", 1)[1]
+        # data of the object
+        value = input["datum"][jsonType]
+
+        value_flat = {}  # TODO: value_flat holds the attributes of one node, but the attributes of all new nodes have to be inserted
+        flatten_obj("", value, value_flat)
+        # newIdentifier is the central node of the current subgraph; each line of the batch is its own new subgraph
+        newIdentifier = batchData.get_Identifier_Increment_add_Node('_', value_flat['uuid'])
+        # one shared counter for all node kinds makes sure distinct identifiers are used
+        all_values[newIdentifier] = value_flat
+
+        relations = dict(
+            runsOn=""
+            ,isGeneratedBy=""
+            ,affects=""
+            ,affects2=""
+            ,residesOn=""
+            ,isPartOf=""           # not in data set
+            ,hasOwningPrincipal=""
+            ,hasTag=""             # not in data set
+            ,hasParent=""
+            ,hasAccountOn=""
+            ,hasLocalPrincipal=""
+        )
+
+        # create relationships
+        try:
+            if nodeType == 'Subject':
+                if value['parentSubject'] != None:
+                    relations.update({'hasParent': value['parentSubject']["com.bbn.tc.schema.avro.cdm18.UUID"]})
+                if value['hostId'] != None:
+                    relations.update({'runsOn': value['hostId']})
+                if value['localPrincipal'] != None:
+                    relations.update({'hasLocalPrincipal': value['localPrincipal']})
+                # the relationship subject -[affects]-> event is missing... probably implicit through isGeneratedBy
+
+            elif nodeType == 'FileObject':
+                if value['baseObject'] != None:
+                    relations.update({"residesOn": value['baseObject']['hostId']})
+                # relations.update({"isPartOf":})
+                if value['localPrincipal']:
+                    relations.update({"hasOwningPrincipal": value['localPrincipal']})
+                # relations.update({"hasTag":})
+
+            # mapping of CDM fields to relationships of nodes
+            elif nodeType == 'Event':
+                # if value['hostId'] != None:
+                #     relations.update({'runsOn': value['hostId']})
+                if value['subject'] != None:
+                    relations.update({'isGeneratedBy': value['subject']['com.bbn.tc.schema.avro.cdm18.UUID']})
+                if value['predicateObject'] != None:
+                    relations.update({'affects': value['predicateObject']['com.bbn.tc.schema.avro.cdm18.UUID']})
+                if value['predicateObject2'] != None:
+                    relations.update({'affects2': value['predicateObject2']['com.bbn.tc.schema.avro.cdm18.UUID']})
+
+            elif nodeType == 'Principal':
+                if value['hostId'] != None:
+                    relations.update({'hasAccountOn': value['hostId']})
+
+            elif nodeType == 'UnnamedPipeObject':
+                if value['sourceUUID'] != None:
+                    relations.update({'affects': value['sourceUUID']['com.bbn.tc.schema.avro.cdm18.UUID']})
+                if value['sinkUUID'] != None:
+                    relations.update({'affects2': value["sinkUUID"]["com.bbn.tc.schema.avro.cdm18.UUID"]})
+                if value['baseObject'] != None:
+                    relations.update({'residesOn': value['baseObject']['hostId']})
+
+            elif nodeType == 'NetFlowObject':
+                if value['baseObject'] != None:
+                    relations.update({'residesOn': value['baseObject']['hostId']})
+
+            elif nodeType == 'SrcSinkObject':
+                if value['baseObject'] != None:
+                    relations.update({'residesOn': value['baseObject']['hostId']})
+
+            subgraph = subgraphNodeRelation()
+
+            # look up existing nodes for relations
+            for rel in relations.keys():
+                val = relations[rel]
+                if val != '':
+
+                    if rel == "residesOn":
subgraph.nodes_relations_to_cypher(val, newIdentifier, rel, batchData, 'he', ':Host') + + elif rel == "runsOn": + subgraph.nodes_relations_to_cypher(val, newIdentifier, rel, batchData, 'hu', ':Host') + + elif rel =="isGeneratedBy": + subgraph.nodes_relations_to_cypher(val, newIdentifier, rel, batchData, 's', ':Subject') + + elif rel =="hasOwningPrincipal": + subgraph.nodes_relations_to_cypher(val, newIdentifier, rel, batchData, 'p', ':Principal') + + elif rel =="affects": + subgraph.nodes_relations_to_cypher(val, newIdentifier, rel, batchData, 'a', '') + + elif rel == 'affects2': + subgraph.nodes_relations_to_cypher(val, newIdentifier, rel, batchData, 'a', '') + # ... other relations for Object not in data + elif rel =="hasParent": + subgraph.nodes_relations_to_cypher(val, newIdentifier, rel, batchData, 'not', '') + + elif rel =='hasAccountOn': + subgraph.nodes_relations_to_cypher(val, newIdentifier, rel, batchData, 'not', '') + + elif rel == 'hasLocalPrincipal': + subgraph.nodes_relations_to_cypher(val, newIdentifier, rel, batchData, 'not', '') + + + + try: + + #attribute_string = ', '.join([f"{k[1:]}: '{v}'" if "properties_map_arg_mem_flags" not in k else f"{k[1:]}: {v}" for k, v in value_flat.items()]) + + if subgraph.is_empty_lookup_nodes(): + if subgraph.is_empty_insert_relations(): # no relations or Nodes to add (should not happen) + batchData.append_query(f"""CREATE ({newIdentifier}:{nodeType} ${newIdentifier})""") + else: # no new Nodes happen when all Nodes already matched + batchData.append_query(f"""CREATE ({newIdentifier}:{nodeType} ${newIdentifier}) CREATE {subgraph.get_insert_relations()}""") + else: + batchData.append_query(f"""MATCH {subgraph.get_lookup_nodes()} CREATE ({newIdentifier}:{nodeType} ${newIdentifier}) CREATE {subgraph.get_insert_relations()}""") + + except: + print('Exception') + print('input: ', input) + print() + print('relations: ', relations) + print() + print('Query: ', batchData.get_query) + print() + print('Relations: '+ subgraph.get_insert_relations) + print() + print('Match Nodes: ', subgraph.get_lookup_nodes) + #q_rel = "".join(insert_strings) + + + #query = f""" + #CREATE ({identifierNumber}:{nodeType} $attributes) + #{q_rel} + #RETURN {identifierNumber} + #""" + + except: + print('Exception') + print('input: ', input) + print('relations: ', relations) + break + query = batchData.get_query() + # print() #start 16:09 + # print('query: ', query) + # print() + # print('attributes: ', all_values) + #print(nodes_count) + return query, all_values + + +def create_cypher_query_from_cdm(json): + ''' + Create Cypher Queries from publisher message + ''' + query, value = parse_json_to_cypher(json) + return query, value + +def on_message(client, userdata, message): + ''' + The callback function for message listener + ''' + data = json.loads(message.payload.decode("utf-8")) + print("Received message from: ",message.topic) + q, value = create_cypher_query_from_cdm(data) + execute_query(q, value) + +def on_connect(client, userdata, flags, return_code): + ''' + Connecting and subscribing to the Mosquitto topic + ''' + if return_code == 0: + print("connected") + client.subscribe("neo4j",qos=1) + else: + print("could not connect, return code:", return_code) + client.failed_connect = True + +def connect_to_db(uri,auth): + ''' + Establish db connection to neo4j + ''' + driver = GraphDatabase.driver(uri, auth=auth) + with driver.session() as session: + print("Cleanup existing data...") + session.run("MATCH (n) detach delete n") + session.run("RETURN 1 as result") + 
print("Successfully connected to DB...") + + # create indices here .... + if (analytic): + session.run("STORAGE MODE IN_MEMORY_ANALYTICAL;") + if (create_indices): + session.run("CREATE INDEX ON :Subject(uuid);") + session.run("CREATE INDEX ON :Event(uuid);") + session.run("CREATE INDEX ON :Host(uuid);") + session.run("CREATE INDEX ON :FileObject(uuid);") + session.run("CREATE INDEX ON :NetFlowObject(uuid);") + session.run("CREATE INDEX ON :SrcSinkObject(uuid);") + session.run("CREATE INDEX ON :UnnamedPipeObject(uuid);") + session.run("CREATE INDEX ON :Principal(uuid);") + + return driver + + +def execute_query(query:str, value): + ''' + Execute any Neo4j Query. + + Expected Query Parameter: + query = Query String, + attributes = atttributes to be inserted + ''' + #print(query) + print("Before sending: ", datetime.datetime.now().strftime("%H:%M:%S")) + with driver.session() as session: + result = session.run(query, value) # maybe result = session.run(query, **value) + print("After sending: ", datetime.datetime.now().strftime("%H:%M:%S")) + return result.data() + +driver = connect_to_db(db_uri,neo4j_auth) + +# client.username_pw_set(username="user_name", password="password") # uncomment if you use password auth +client.on_connect = on_connect +client.on_message = on_message +client.failed_connect = False + +client.connect(broker_hostname, broker_port,keepalive=3600*4) +client.loop_start() + +# this try-finally block ensures that whenever we terminate the program earlier by hitting ctrl+c, it still gracefully exits +try: + i = 0 + while i < abort_time_limit: #and client.failed_connect == False: + time.sleep(1) + i += 1 + if client.failed_connect == True: + print('Connection failed, exiting...') + +finally: + client.disconnect() + client.loop_stop() + driver.close() + diff --git a/code/infrastructure/streaming/clients/sub/memgraph/subgraph_oriented/sub_mem_whole_batch_subgraph_oriented_using_Params_two_label.py b/code/infrastructure/streaming/clients/sub/memgraph/subgraph_oriented/sub_mem_whole_batch_subgraph_oriented_using_Params_two_label.py new file mode 100644 index 0000000..1bc1255 --- /dev/null +++ b/code/infrastructure/streaming/clients/sub/memgraph/subgraph_oriented/sub_mem_whole_batch_subgraph_oriented_using_Params_two_label.py @@ -0,0 +1,428 @@ +import datetime +import paho.mqtt.client as mqtt +import time +import json +from neo4j import GraphDatabase +import os + + +#Muster +# MATCH lookupnodes (not known yet) +# CREATE current Node (using PARAM) +# CREATE edges +# WITH +# repeat next node + +broker_hostname=str(os.getenv('mos_host',default="localhost")) +broker_port = int(os.getenv('mos_port',default=1883)) +client = mqtt.Client(mqtt.CallbackAPIVersion.VERSION1,"memgraph") +db_uri =str(os.getenv('mem_host',default="bolt://localhost:8687")) #local test Port 8687 not local 7687 +neo4j_auth=("","") +abort_time_limit = int(os.getenv('abort_time_limit', default=99999)) +create_indices = False #os.getenv('create_indices', 'False').lower() in ['true', '1', 't', 'y', 'yes'] +second_label = 'Node' + +def flatten_obj(key, val, target): + complexType = type(val) + if val is None: + return None + elif complexType is dict: + for k, v in val.items(): + if "com.bbn.tc.schema.avro.cdm18.UUID" not in k: # node for Edge not needed as Attribute + new_key = f"{key}_{k}" if key else k + flatten_obj(new_key, v, target) + elif complexType is list: + for i in range(len(val)): + v = val[i] + new_key = f"{key}_{i}" if key else str(i) + flatten_obj(new_key, v, target) + elif complexType is object: + for k, 
v in val.__dict__.items(): + new_key = f"{key}_{k}" if key else k + flatten_obj(new_key, v, target) + else: + if "properties_map_arg_mem_flags" in key: + # String to list and make separate Attribute Values pairs for the Objects of list + values_list = eval(val) + cleaned_values = [value.strip("'") for value in values_list] + + index = 0 + for value in cleaned_values: + index += 1 + target[f"{key}_{index}"] = value + else: + target[key] = val + + +# Holds a list with the Nodes to match and a list with relations to add +# manages the build of these lists +class subgraphNodeRelation: + + def __init__(self): + self.lookup_nodes = [] + self.insert_relations = [] + + def nodes_relations_to_cypher( self, val, newIdentifier, rel, batchData, type_letter, type): #type either ':Host' 'Subject' .... or '' + try: + if batchData.check_key(val): # Node is already known use the same identifier + ident = batchData.get_Node_value(val) + self.insert_relations.append(f'''({newIdentifier}) -[:{rel}]-> ({ident})''') + else : + ident = batchData.get_Identifier_Increment_add_Node(type_letter, val) # get identifier and add to knownNode + s = f'({ident}{type} {{uuid: "{val}"}})' + self.lookup_nodes.append(s) + self.insert_relations.append(f'''({newIdentifier}) -[:{rel}]-> ({ident})''') + except: + print(type_letter) + + def get_lookup_nodes(self): + return ', '.join(self.lookup_nodes) + + def get_insert_relations(self): + return ', '.join(self.insert_relations) + + def is_empty_lookup_nodes(self): + if self.lookup_nodes: + return False + else: + return True + + def is_empty_insert_relations(self): + if self.insert_relations: + return False + else: + return True + + +# Data for the whole Batch, so it doesnt get copied when used in another function +# Holds a List with all known identifier, a directory to look up if the Node is already known +# Holds the Batch query +# Holds a number to create diffrent idetifier +class batchDataHolder: + def __init__(self): + self.knownNodes = dict() + self.with_list_identifier = [] + self.batch_query = [] + self.identifierNumber = 0 #number to create unique identifier in the query + + def add_entry_Node(self, key, value): + self.knownNodes[key] = value + self.with_list_identifier.append(value) #add the value to with + + + def get_knownNodes(self): + return self.knownNodes + + def get_Node_value(self, key): + return self.knownNodes[key] + + def check_key(self, key): + if key in self.knownNodes: + return True + else: + return False + + def append_with(self, ident): + self.with_list_identifier.append(ident) + + def get_With_List(self): + return ', '.join(self.with_list_identifier) + + def get_Identifier_Increment_add_Node(self, type, key):#create a new identifier + identifier = type + str(self.identifierNumber) + self.identifierNumber += 1 + self.add_entry_Node(key, identifier) + return identifier + + def append_query(self, add): + self.batch_query.append(add) + + def get_query(self): + return ''.join(self.batch_query) + + def add_query_with(self): # Each Subgraph takes over the nodes from the Subgraph before + if self.batch_query: # only if batach_query not empty # empty list is false not empty true + self.batch_query.append(' WITH ' + ', '.join(self.with_list_identifier) + ' ') # append the with identifier transform to string split by ', ' + + +nodes_count = 0 + +def parse_json_to_cypher(data_list) : + global nodes_count + if nodes_count % 10000 == 0: + print("Nodes: ", nodes_count, " Time: ", datetime.datetime.now().strftime("%H:%M:%S")) + + #knownNodes = dict() #to check if Node is 
+    # known and no lookup is needed
+    batchData = batchDataHolder()
+    all_values = {}
+
+    for input_string in data_list:  # run through the subgraphs of the batch
+
+        nodes_count = nodes_count + 1
+
+        batchData.add_query_with()  # add the nodes from the previous subgraph, if not the first
+
+        input = json.loads(input_string)
+        jsonType = list(input['datum'].keys())[0]
+        # short type string
+        nodeType = jsonType.rsplit(".", 1)[1]
+        # data of the object
+        value = input["datum"][jsonType]
+
+        value_flat = {}  # TODO: value_flat holds the attributes of one node, but the attributes of all new nodes have to be inserted
+        flatten_obj("", value, value_flat)
+        # newIdentifier is the central node of the current subgraph; each line of the batch is its own new subgraph
+        newIdentifier = batchData.get_Identifier_Increment_add_Node('_', value_flat['uuid'])
+        # one shared counter for all node kinds makes sure distinct identifiers are used
+        all_values[newIdentifier] = value_flat
+
+        relations = dict(
+            runsOn=""
+            ,isGeneratedBy=""
+            ,affects=""
+            ,affects2=""
+            ,residesOn=""
+            ,isPartOf=""           # not in data set
+            ,hasOwningPrincipal=""
+            ,hasTag=""             # not in data set
+            ,hasParent=""
+            ,hasAccountOn=""
+            ,hasLocalPrincipal=""
+        )
+
+        # create relationships
+        try:
+            if nodeType == 'Subject':
+                if value['parentSubject'] != None:
+                    relations.update({'hasParent': value['parentSubject']["com.bbn.tc.schema.avro.cdm18.UUID"]})
+                if value['hostId'] != None:
+                    relations.update({'runsOn': value['hostId']})
+                if value['localPrincipal'] != None:
+                    relations.update({'hasLocalPrincipal': value['localPrincipal']})
+                # the relationship subject -[affects]-> event is missing... probably implicit through isGeneratedBy
+
+            elif nodeType == 'FileObject':
+                if value['baseObject'] != None:
+                    relations.update({"residesOn": value['baseObject']['hostId']})
+                # relations.update({"isPartOf":})
+                if value['localPrincipal']:
+                    relations.update({"hasOwningPrincipal": value['localPrincipal']})
+                # relations.update({"hasTag":})
+
+            # mapping of CDM fields to relationships of nodes
+            elif nodeType == 'Event':
+                # if value['hostId'] != None:
+                #     relations.update({'runsOn': value['hostId']})
+                if value['subject'] != None:
+                    relations.update({'isGeneratedBy': value['subject']['com.bbn.tc.schema.avro.cdm18.UUID']})
+                if value['predicateObject'] != None:
+                    relations.update({'affects': value['predicateObject']['com.bbn.tc.schema.avro.cdm18.UUID']})
+                if value['predicateObject2'] != None:
+                    relations.update({'affects2': value['predicateObject2']['com.bbn.tc.schema.avro.cdm18.UUID']})
+
+            elif nodeType == 'Principal':
+                if value['hostId'] != None:
+                    relations.update({'hasAccountOn': value['hostId']})
+
+            elif nodeType == 'UnnamedPipeObject':
+                if value['sourceUUID'] != None:
+                    relations.update({'affects': value['sourceUUID']['com.bbn.tc.schema.avro.cdm18.UUID']})
+                if value['sinkUUID'] != None:
+                    relations.update({'affects2': value["sinkUUID"]["com.bbn.tc.schema.avro.cdm18.UUID"]})
+                if value['baseObject'] != None:
+                    relations.update({'residesOn': value['baseObject']['hostId']})
+
+            elif nodeType == 'NetFlowObject':
+                if value['baseObject'] != None:
+                    relations.update({'residesOn': value['baseObject']['hostId']})
+
+            elif nodeType == 'SrcSinkObject':
+                if value['baseObject'] != None:
+                    relations.update({'residesOn': value['baseObject']['hostId']})
+
+            subgraph = subgraphNodeRelation()
+
+            # look up existing nodes for relations
+            for rel in relations.keys():
+                val = relations[rel]
+                if val != '':
+
+                    if rel == "residesOn":
subgraph.nodes_relations_to_cypher(val, newIdentifier, rel, batchData, 'he', ':Host') + + elif rel == "runsOn": + subgraph.nodes_relations_to_cypher(val, newIdentifier, rel, batchData, 'hu', ':Host') + + elif rel =="isGeneratedBy": + subgraph.nodes_relations_to_cypher(val, newIdentifier, rel, batchData, 's', f':Subject:{second_label}') + + elif rel =="hasOwningPrincipal": + subgraph.nodes_relations_to_cypher(val, newIdentifier, rel, batchData, 'p', ':Principal') + + elif rel =="affects": + subgraph.nodes_relations_to_cypher(val, newIdentifier, rel, batchData, 'a', f':{second_label}') + + elif rel == 'affects2': + subgraph.nodes_relations_to_cypher(val, newIdentifier, rel, batchData, 'a', f':{second_label}') + # ... other relations for Object not in data + elif rel =="hasParent": + subgraph.nodes_relations_to_cypher(val, newIdentifier, rel, batchData, 'not', f':{second_label}') + + elif rel =='hasAccountOn': + subgraph.nodes_relations_to_cypher(val, newIdentifier, rel, batchData, 'not', f':{second_label}') + + elif rel == 'hasLocalPrincipal': + subgraph.nodes_relations_to_cypher(val, newIdentifier, rel, batchData, 'not', f':{second_label}') + + + + try: + + #attribute_string = ', '.join([f"{k[1:]}: '{v}'" if "properties_map_arg_mem_flags" not in k else f"{k[1:]}: {v}" for k, v in value_flat.items()]) + + if subgraph.is_empty_lookup_nodes(): + if subgraph.is_empty_insert_relations(): # no relations or Nodes to add (should not happen) + batchData.append_query(f"""CREATE ({newIdentifier}:{nodeType}:{second_label} ${newIdentifier})""") + else: # no new Nodes happen when all Nodes already matched + batchData.append_query(f"""CREATE ({newIdentifier}:{nodeType}:{second_label} ${newIdentifier}) CREATE {subgraph.get_insert_relations()}""") + else: + batchData.append_query(f"""MATCH {subgraph.get_lookup_nodes()} CREATE ({newIdentifier}:{nodeType}:{second_label} ${newIdentifier}) CREATE {subgraph.get_insert_relations()}""") + + except: + print('Exception') + print('input: ', input) + print() + print('relations: ', relations) + print() + print('Query: ', batchData.get_query) + print() + print('Relations: '+ subgraph.get_insert_relations) + print() + print('Match Nodes: ', subgraph.get_lookup_nodes) + #q_rel = "".join(insert_strings) + + + #query = f""" + #CREATE ({identifierNumber}:{nodeType} $attributes) + #{q_rel} + #RETURN {identifierNumber} + #""" + + except: + print('Exception') + print('input: ', input) + print('relations: ', relations) + break + query = batchData.get_query() + # print() #start 16:09 + # print('query: ', query) + # print() + # print('attributes: ', all_values) + #print(nodes_count) + return query, all_values + + +def create_cypher_query_from_cdm(json): + ''' + Create Cypher Queries from publisher message + ''' + query, value = parse_json_to_cypher(json) + return query, value + +def on_message(client, userdata, message): + ''' + The callback function for message listener + ''' + data = json.loads(message.payload.decode("utf-8")) + print("Received message from: ",message.topic) + q, value = create_cypher_query_from_cdm(data) + execute_query(q, value) + +def on_connect(client, userdata, flags, return_code): + ''' + Connecting and subscribing to the Mosquitto topic + ''' + if return_code == 0: + print("connected") + client.subscribe("neo4j",qos=1) + else: + print("could not connect, return code:", return_code) + client.failed_connect = True + +def connect_to_db(uri,auth): + ''' + Establish db connection to neo4j + ''' + driver = GraphDatabase.driver(uri, auth=auth) + with 
driver.session() as session: + print("Cleanup existing data...") + session.run("MATCH (n) detach delete n") + session.run("RETURN 1 as result") + print("Successfully connected to DB...") + + # create indices here .... + if (create_indices): + s = f"CREATE INDEX ON :{second_label}(uuid);" + session.run(s) + session.run("CREATE INDEX ON :Host(uuid);") + session.run("CREATE INDEX ON :FileObject(uuid);") + session.run("CREATE INDEX ON :NetFlowObject(uuid);") + session.run("CREATE INDEX ON :SrcSinkObject(uuid);") + session.run("CREATE INDEX ON :UnnamedPipeObject(uuid);") + session.run("CREATE INDEX ON :Principal(uuid);") + + return driver + + +def execute_query(query:str, value): + ''' + Execute any Neo4j Query. + + Expected Query Parameter: + query = Query String, + attributes = atttributes to be inserted + ''' + #print(query) + with driver.session() as session: + print("Before sending: ", datetime.datetime.now().strftime("%H:%M:%S")) + result = session.run(query, value) # maybe result = session.run(query, **value) + + print("After sending: ", datetime.datetime.now().strftime("%H:%M:%S")) + return result.data() + +driver = connect_to_db(db_uri,neo4j_auth) + +# client.username_pw_set(username="user_name", password="password") # uncomment if you use password auth +client.on_connect = on_connect +client.on_message = on_message +client.failed_connect = False + +client.connect(broker_hostname, broker_port,keepalive=3600*4) +client.loop_start() + +# this try-finally block ensures that whenever we terminate the program earlier by hitting ctrl+c, it still gracefully exits +try: + i = 0 + while i < abort_time_limit: #and client.failed_connect == False: + time.sleep(1) + i += 1 + if client.failed_connect == True: + print('Connection failed, exiting...') + +finally: + client.disconnect() + client.loop_stop() + driver.close() + -- GitLab
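For a quick smoke test of any subscriber in this patch, a minimal publisher can push one
batch to the "neo4j" topic they all subscribe to. The sketch below is illustrative and not
part of the patch: the broker address, topic and QoS mirror the subscriber defaults, while
the sample Principal record (uuid "p-0001", hostId "h-0001", username "root") is invented.
Because every variant only MATCHes the referenced host, a node with uuid "h-0001" should
already exist in the database; otherwise the MATCH yields zero rows and the following
CREATE clauses run zero times, inserting nothing.

    import json
    import paho.mqtt.publish as publish

    # one batch = a JSON array of JSON-encoded CDM lines, matching on_message's decoding
    line = json.dumps({"datum": {"com.bbn.tc.schema.avro.cdm18.Principal": {
        "uuid": "p-0001", "hostId": "h-0001", "username": "root"}}})

    publish.single("neo4j", payload=json.dumps([line]), qos=1,
                   hostname="localhost", port=1883)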