Commit 3adf6c6f authored by Sven-Ove Hänsel's avatar Sven-Ove Hänsel

add possible solution for faster inserts

parent e9a4c7b9
%% Cell type:markdown id: tags:
# Possible optimization of the Neo4j subscriber...
%% Cell type:code id: tags:
``` python
import paho.mqtt.client as mqtt
import time
import json
from neo4j import GraphDatabase
import os
from threading import Thread
from queue import Queue

# Existing setup for MQTT and Neo4j connection
broker_hostname = str(os.getenv('mos_host', default="localhost"))
broker_port = int(os.getenv('mos_port', default=1883))
client = mqtt.Client("Client4")
db_uri = str(os.getenv('mem_host', default="bolt://localhost:7687"))
neo4j_auth = ("", "")
abort_time_limit = int(os.getenv('abort_time_limit', default=99999))

# Initialize a queue for incoming MQTT messages
message_queue = Queue()

def flatten_obj(key, val, target):
    # Your existing flatten_obj function
    pass

def parse_json_to_cypher(input):
    # Your existing parse_json_to_cypher function
    pass

def create_cypher_query_from_cdm(json):
    # Your existing create_cypher_query_from_cdm function
    pass

def on_message(client, userdata, message):
    data = json.loads(message.payload.decode("utf-8"))
    # Instead of processing immediately, put the message into the queue
    message_queue.put(data)

def on_connect(client, userdata, flags, return_code):
    # Your existing on_connect function
    pass

def connect_to_db(uri, auth):
    # Establish db connection to neo4j
    driver = GraphDatabase.driver(uri, auth=auth)
    # Consider moving session cleanup and connection verification outside of this function
    return driver

def execute_batch_queries(batch):
    # New function to handle batch processing of messages
    with driver.session() as session:
        for data in batch:
            q, attr = create_cypher_query_from_cdm(data)
            session.run(q, attributes=attr)

def process_message_batch():
    while True:
        batch = []
        while not message_queue.empty():
            batch.append(message_queue.get())
        if batch:
            execute_batch_queries(batch)
            for _ in batch:
                message_queue.task_done()
        else:
            time.sleep(0.1)  # avoid busy-waiting while the queue is empty

# Start processing thread for handling MQTT messages in batches
processing_thread = Thread(target=process_message_batch, daemon=True)
processing_thread.start()

driver = connect_to_db(db_uri, neo4j_auth)
client.on_connect = on_connect
client.on_message = on_message
client.failed_connect = False
client.connect(broker_hostname, broker_port)
client.loop_start()

# MQTT client loop and cleanup logic with try-finally
try:
    i = 0
    while i < abort_time_limit and not client.failed_connect:
        time.sleep(1)
        i += 1
finally:
    client.disconnect()
    client.loop_stop()
    driver.close()
```
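%% Cell type:markdown id: tags:
The batch handler above still issues one `session.run` per queued message, so every message pays a full round trip. A common way to make the batch pay off is to send all rows in one `UNWIND` statement inside a single transaction. The sketch below is a minimal illustration, not the existing `create_cypher_query_from_cdm` logic: the `CdmNode` label and the flat `rows` dicts are hypothetical, and `execute_write` assumes Neo4j Python driver 5.x (older drivers use `write_transaction`).
%% Cell type:code id: tags:
``` python
# Minimal sketch: insert a whole batch with one UNWIND query per transaction.
# The CdmNode label and the shape of `rows` are illustrative assumptions.
UNWIND_QUERY = """
UNWIND $rows AS row
CREATE (n:CdmNode)
SET n = row
"""

def execute_batch_queries_unwind(driver, rows):
    # One transaction and one query for the entire batch,
    # instead of one session.run() per message
    with driver.session() as session:
        session.execute_write(lambda tx: tx.run(UNWIND_QUERY, rows=rows).consume())

# Usage (hypothetical flat property dicts produced by the flattening step):
# execute_batch_queries_unwind(driver, [{"uuid": "08DB617B-...", "type": "EVENT_FCNTL"}])
```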
%% Cell type:markdown id: tags:
CDM Data
%% Cell type:code id: tags:
``` python
```
%% Cell type:code id: tags:
``` python
import paho.mqtt.client as mqtt
import time
import json
import os
import psycopg2

pg_host = str(os.getenv('pg_host', default='localhost'))
pg_port = int(os.getenv('pg_port', default='5432'))
pg_user = str(os.getenv('pg_user', default='postgres'))
pg_pw = str(os.getenv('pg_pw', default='postgres'))
pg_db = str(os.getenv('pg_db', default='postgres'))
broker_hostname = str(os.getenv('mos_host', default="localhost"))
broker_port = int(os.getenv('mos_port', default=1883))
client = mqtt.Client("Client3")
abort_time_limit = int(os.getenv('abort_time_limit', default=99999))

def flatten_obj(key, val, target):
    complexType = type(val)
    if val is None:
        return None
    elif complexType is dict:
        for k, v in val.items():
            flatten_obj(key + "_" + k, v, target)
    elif complexType is list:
        for i in range(len(val)):
            v = val[i]
            flatten_obj(key + "_" + str(i), v, target)
    elif complexType is object:
        for k, v in val.__dict__.items():
            flatten_obj(key + "_" + k, v, target)
    else:
        target[key] = val

def add_edge(key: str, keys, rel: str, dest: list, edge_type: list, data):
    if key in keys:
        value = data[key]
        dest.append(prepare_string_value_for_sql_query(value))
        edge_type.append(rel)

def prepare_string_value_for_sql_query(value: str):
    return f"'{value}'"

def create_edge_insert_query(edge_values: list):
    values = []
    i = 0
    for edge in edge_values[1]:
        s = []
        s.append("nextval('edge_number_seq')")
        s.append(edge_values[0])
        s.append(edge_values[1][i])
        s.append(prepare_string_value_for_sql_query(edge_values[2][i]))
        i += 1
        values.append(f"({','.join(s)})")
    q = f"INSERT INTO edge_list (edge_no, source, dest, edge_type) VALUES {','.join(values)};"
    return q

def parse_json_to_sql_query(json, node_type):
    print("\nparsing message: ")
    ignored_keys = ['CDMVersion', 'source']
    table_name = str.lower(node_type)
    columns = []
    values = []
    # create query for inserting nodes in specific table
    for key, value in json.items():
        # remove header schema in json
        short_key = key.replace("_com.bbn.tc.schema.avro.cdm18", "")
        # build substring to get column name: short keys only lose the
        # leading underscore, longer keys lose the '_datum.<NodeType>_' prefix
        if len(short_key) < 12:
            short_key = short_key[1:]
        else:
            short_key = short_key[2 + 6 + len(node_type):]
        # replace . with _ to match column header
        short_key = short_key.replace('.', '_')
        # escape single quotes (') in value
        if "'" in str(value):
            value = str(value).replace("'", "''")
        if short_key not in ignored_keys:
            columns.append(short_key)
            values.append(prepare_string_value_for_sql_query(value))
    # create queries for inserting nodes in node table
    key_header = f'_datum_com.bbn.tc.schema.avro.cdm18.{node_type}'
    key_postfix = '_com.bbn.tc.schema.avro.cdm18.UUID'
    uuid = json[f'_datum_com.bbn.tc.schema.avro.cdm18.{node_type}_uuid']
    node_values = [prepare_string_value_for_sql_query(uuid), prepare_string_value_for_sql_query(node_type)]
    # create queries for inserting edges in edge table
    source = prepare_string_value_for_sql_query(uuid)
    dest = []
    edge_type = []
    keys = json.keys()
    if node_type == 'Event':
        rel = 'runsOn'
        key = f"{key_header}_hostId"
        add_edge(key, keys, rel, dest, edge_type, json)
        rel = 'isGeneratedBy'
        key = f"{key_header}_subject{key_postfix}"
        add_edge(key, keys, rel, dest, edge_type, json)
        rel = 'predicateObject'
        key = f"{key_header}_predicateObject{key_postfix}"
        add_edge(key, keys, rel, dest, edge_type, json)
        rel = 'predicateObject2'
        key = f"{key_header}_predicateObject2{key_postfix}"
        add_edge(key, keys, rel, dest, edge_type, json)
    if node_type == 'Subject':
        rel = 'parentSubject'
        key = f"{key_header}_parentSubject{key_postfix}"
        add_edge(key, keys, rel, dest, edge_type, json)
        rel = 'hasLocalPrincipal'
        key = f"{key_header}_localPrincipal"
        add_edge(key, keys, rel, dest, edge_type, json)
        rel = 'runsOn'
        key = f'{key_header}_hostId'
        add_edge(key, keys, rel, dest, edge_type, json)
    if node_type == 'FileObject':
        rel = 'residesOn'
        key = f"{key_header}_baseObject_hostId"
        add_edge(key, keys, rel, dest, edge_type, json)
        rel = 'hasOwningPrincipal'
        key = f"{key_header}_localPrincipal"
        add_edge(key, keys, rel, dest, edge_type, json)
    # if node_type == 'Host':
    #     nothing to be done... no edges outgoing from host
    if node_type == 'NetFlowObject' or node_type == 'SrcSinkObject':
        rel = 'residesOn'
        key = f"{key_header}_baseObject_hostId"
        add_edge(key, keys, rel, dest, edge_type, json)
    if node_type == 'Principal':
        rel = 'hasAccountOn'
        key = f"{key_header}_hostId"
        add_edge(key, keys, rel, dest, edge_type, json)
    if node_type == 'UnnamedPipeObject':
        rel = 'resides_on'
        key = f"{key_header}_baseObject_hostId"
        add_edge(key, keys, rel, dest, edge_type, json)
        rel = 'affects1'
        key = f"{key_header}_sourceUUID"
        add_edge(key, keys, rel, dest, edge_type, json)
        rel = 'affects2'
        key = f"{key_header}_sinkUUID"
        add_edge(key, keys, rel, dest, edge_type, json)
    edge_values = [source, dest, edge_type]
    queries = []
    # node into each table
    q1 = f"INSERT INTO {table_name} (line, {','.join(columns)}) VALUES (nextval('line_number_seq'),{','.join(values)})"
    queries.append(q1)
    # node into node list
    q2 = f"INSERT INTO node_list (node_no, uuid, type) VALUES (nextval('node_number_seq'),{','.join(node_values)})"
    queries.append(q2)
    # edge into edge list
    print("edge_values: ", edge_values)
    if len(edge_values[1]) != 0:
        q3 = create_edge_insert_query(edge_values)
        queries.append(q3)
    else:
        print("no edges")
    return queries

def handle_message(m):
    print('\nnew message: ', m)
    json_type = list(m['datum'].keys())[0]
    # short type string
    node_type = json_type.rsplit(".", 1)[1]
    flat_json = {}
    flatten_obj("", m, flat_json)
    queries = parse_json_to_sql_query(flat_json, node_type)
    for q in queries:
        execute_db_query(q)

def on_message(client, userdata, message):
    '''
    The callback function for the message listener
    '''
    data = json.loads(message.payload.decode("utf-8"))
    handle_message(data)

def on_connect(client, userdata, flags, return_code):
    '''
    Connect and subscribe to the Mosquitto topic
    '''
    if return_code == 0:
        print("connected")
        client.subscribe("neo4j")
    else:
        print("could not connect, return code:", return_code)
        client.failed_connect = True

def connect_database():
    print('Create DB Connection')
    return psycopg2.connect(host=pg_host, port=pg_port, user=pg_user, password=pg_pw, database=pg_db)

def execute_db_query(q: str):
    cursor = connection.cursor()
    cursor.execute(q)
    connection.commit()

def create_pg_schema():
    print('Create PG CDM Schema: ')
    file_path = 'C:\\Studium_MIN\\05_Masterarbeit\\thesis\\ma_code\\code\\infrastructure\\streaming\\clients\\sub\\postgres\\import_node_edge.txt'
    try:
        with open(file_path, 'r') as file:
            long_string = file.read()
    except FileNotFoundError:
        print(f"The file {file_path} was not found.")
        return
    except Exception as e:
        print(f"An error occurred: {str(e)}")
        return
    execute_db_query(long_string)

connection = connect_database()
create_pg_schema()
# client.username_pw_set(username="user_name", password="password")  # uncomment if you use password auth
client.on_connect = on_connect
client.on_message = on_message
client.failed_connect = False
client.connect(broker_hostname, broker_port)
client.loop_start()

# this try-finally block ensures that when we terminate the program early by hitting ctrl+c, it still exits gracefully
try:
    i = 0
    while i < abort_time_limit and not client.failed_connect:
        time.sleep(1)
        i += 1
    if client.failed_connect:
        print('Connection failed, exiting...')
finally:
    client.disconnect()
    client.loop_stop()
    connection.close()
```
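%% Cell type:markdown id: tags:
This subscriber commits one hand-assembled INSERT per message, and quoting values with `prepare_string_value_for_sql_query` is fragile. As a hedged sketch of a faster alternative, psycopg2's `execute_values` can insert a whole batch of parameterized rows in a single statement. The `node_list` columns match the schema used above, but the `insert_node_rows` helper and its example rows are illustrative, not part of the existing code.
%% Cell type:code id: tags:
``` python
import psycopg2
from psycopg2.extras import execute_values

def insert_node_rows(connection, rows):
    """Insert many (uuid, type) tuples into node_list in one round trip.

    execute_values expands the VALUES list into a single statement and
    parameterizes the values, so no manual quoting or escaping is needed.
    """
    with connection.cursor() as cursor:
        execute_values(
            cursor,
            "INSERT INTO node_list (node_no, uuid, type) VALUES %s",
            rows,
            template="(nextval('node_number_seq'), %s, %s)",
        )
    connection.commit()

# Usage (hypothetical rows collected from a batch of messages):
# insert_node_rows(connection, [("42DD2C9E-...", "FileObject"), ("72FB0406-...", "Subject")])
```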
%% Cell type:markdown id: tags:
# Postgres Subscriber first step # Postgres Subscriber first step
%% Cell type:code id: tags:
``` python
import paho.mqtt.client as mqtt
import time
import json
import os
import psycopg2

pg_host = str(os.getenv('pg_host', default='localhost'))
pg_port = int(os.getenv('pg_port', default='5432'))
pg_user = str(os.getenv('pg_user', default='postgres'))
pg_pw = str(os.getenv('pg_pw', default='postgres'))
pg_db = str(os.getenv('pg_db', default='postgres'))
broker_hostname = str(os.getenv('mos_host', default="localhost"))
broker_port = int(os.getenv('mos_port', default=1883))
client = mqtt.Client("Client3")
abort_time_limit = int(os.getenv('abort_time_limit', default=99999))

def handle_message(m):
    print(m)

def on_message(client, userdata, message):
    '''
    The callback function for the message listener
    '''
    data = json.loads(message.payload.decode("utf-8"))
    print("Received message")
    handle_message(data)

def on_connect(client, userdata, flags, return_code):
    '''
    Connect and subscribe to the Mosquitto topic
    '''
    if return_code == 0:
        print("connected")
        client.subscribe("neo4j")
    else:
        print("could not connect, return code:", return_code)
        client.failed_connect = True

def connect_database():
    print('Create DB Connection')
    return psycopg2.connect(host=pg_host, port=pg_port, user=pg_user, password=pg_pw, database=pg_db)

def execute_db_query(q: str):
    cursor = connection.cursor()
    cursor.execute(q)
    connection.commit()

def create_pg_schema():
    print('Create PG CDM Schema: ')
    file_path = 'C:\\Studium_MIN\\05_Masterarbeit\\thesis\\ma_code\\code\\infrastructure\\streaming\\clients\\sub\\postgres\\import_sql.txt'
    try:
        with open(file_path, 'r') as file:
            long_string = file.read()
        print(long_string)
    except FileNotFoundError:
        print(f"The file {file_path} was not found.")
        return
    except Exception as e:
        print(f"An error occurred: {str(e)}")
        return
    execute_db_query(long_string)

connection = connect_database()
create_pg_schema()
# client.username_pw_set(username="user_name", password="password")  # uncomment if you use password auth
client.on_connect = on_connect
client.on_message = on_message
client.failed_connect = False
client.connect(broker_hostname, broker_port)
client.loop_start()

# this try-finally block ensures that when we terminate the program early by hitting ctrl+c, it still exits gracefully
try:
    i = 0
    while i < abort_time_limit and not client.failed_connect:
        time.sleep(1)
        i += 1
    if client.failed_connect:
        print('Connection failed, exiting...')
finally:
    client.disconnect()
    client.loop_stop()
    connection.close()
```
%% Cell type:code id: tags:
``` python
import psycopg2

# Connect to the PostgreSQL database
conn = psycopg2.connect(
    host='localhost',
    port='5432',
    user='postgres',
    password='postgres',
    database='postgres'
)
# Create a cursor object to execute SQL queries
cursor = conn.cursor()
# Define your data
data = ('John Doe', 25, 'john.doe@example.com')
# Execute an INSERT query
# cursor.execute('INSERT INTO your_table_name (name, age, email) VALUES (%s, %s, %s)', data)
cursor.execute('SELECT * FROM test')
rows = cursor.fetchall()
# Display the results
for row in rows:
    print(row)
# Commit the transaction and close the connection
conn.commit()
conn.close()
```
%% Cell type:code id: tags:
``` python
### Reading a single file in windows of 1000 lines takes minutes
import json
import os

def read_moving_window_lines(file_path, lines_per_window, line_count):
    with open(file_path, 'r') as file:
        lines_read = 0
        while True:
            try:
                if lines_read >= line_count:
                    break
                # Read the next lines_per_window lines
                window_data = [next(file) for _ in range(lines_per_window)]
                lines_read += lines_per_window
                # Process the current window_data
                process_window_lines(window_data)
            except StopIteration:
                # No more data left
                break

# Process a window
def process_window_lines(window_data):
    print("Processing window:")
    values = []
    for line in window_data:
        json_obj = json.loads(line.strip())
        json_string = json.dumps(json_obj)
        values.append(json_string)
    return values

lines_per_window = 1000
path = 'C:\\Studium_MIN\\05_Masterarbeit\\thesis\\ma_code\\code\\infrastructure\\streaming\\clients\\pub\\data\\'
files = ['ta1-cadets-e3-official_1.json', 'ta1-cadets-e3-official_2.json', 'ta1-cadets-e3-official_3.json']
line_count = [4999999, 4999999, 3911712]  # line count corresponding to each file
i = 0
for file in files:
    stop_index = line_count[i]
    file_path = path + file
    print("Reading file: ", file)
    read_moving_window_lines(file_path, lines_per_window, stop_index)
    i += 1
```
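%% Cell type:markdown id: tags:
A hedged alternative for the chunked reading itself: `itertools.islice` pulls each window in one call, handles the final short window without relying on a `StopIteration` escape, and makes the line budget explicit. The `iter_windows` generator below is illustrative; the minutes-long runtime noted above likely also depends on the per-line JSON parse/dump in `process_window_lines`, which this sketch leaves unchanged.
%% Cell type:code id: tags:
``` python
from itertools import islice

def iter_windows(file_path, lines_per_window, max_lines=None):
    """Yield successive lists of up to lines_per_window raw lines."""
    with open(file_path, 'r') as file:
        lines_read = 0
        while max_lines is None or lines_read < max_lines:
            window = list(islice(file, lines_per_window))
            if not window:
                break  # end of file reached
            lines_read += len(window)
            yield window

# Usage against one of the CADETS files (paths as in the cell above):
# for window in iter_windows(path + files[0], 1000, line_count[0]):
#     process_window_lines(window)
```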
%% Cell type:code id: tags:
``` python
# '''
# Create CDM data ready for publishing
# '''
# # data = {
# #   "datum":{
# #     "com.bbn.tc.schema.avro.cdm18.Event":{ # Label
# #       "uuid":"08DB617B-6100-51F3-9742-902710EDCA7D", # Property
# #       # "sequence":{"long":4600591}, # Property
# #       "sequence": 4600591,
# #       "type":"EVENT_FCNTL", # Property
# #       "threadId":{"int":100117}, # Property
# #       "hostId":"83C8ED1F-5045-DBCD-B39F-918F0DF4F851", # Relationship: runsOn
# #       "subject":{"com.bbn.tc.schema.avro.cdm18.UUID":"72FB0406-3678-11E8-BF66-D9AA8AFF4A69"}, # Relationship: isGeneratedBy
# #       "predicateObject":"null", # Relationship: affects1
# #       "predicateObjectPath":"null", # Property
# #       "predicateObject2":"null", # Relationship: affects2
# #       "predicateObject2Path":"null", # Property
# #       "timestampNanos":1522828473820631110, # Property
# #       "name":{"string":"aue_fcntl"}, # Property
# #       "parameters":{
# #         "array":[{
# #           "size":-1,
# #           "type":"VALUE_TYPE_CONTROL",
# #           "valueDataType":"VALUE_DATA_TYPE_INT",
# #           "isNull":False,
# #           "name":{"string":"cmd"},
# #           "runtimeDataType":"null",
# #           "valueBytes":{"bytes":"04"},
# #           "provenance":"null",
# #           "tag":"null",
# #           "components":"null"
# #         }]
# #       }, # Property
# #       "location":"null", # Property
# #       "size":"null", # Property
# #       "programPoint":"null", # Property
# #       "properties":{
# #         "map":{
# #           "host":"83c8ed1f-5045-dbcd-b39f-918f0df4f851",
# #           "return_value":"0",
# #           "fd":"4",
# #           "exec":"python2.7",
# #           "ppid":"1"
# #         }
# #       } # Property
# #     }
# #   },
# #   "CDMVersion":"18", # Property
# #   "source":"SOURCE_FREEBSD_DTRACE_CADETS" # Property
# # }
# data = [{
#   "datum":{
#     "com.bbn.tc.schema.avro.cdm18.Host":{
#       "uuid":"83C8ED1F-5045-DBCD-B39F-918F0DF4F851",
#       "hostName":"ta1-cadets",
#       "hostIdentifiers":[],
#       "osDetails":"FreeBSD 12.0-CURRENT FreeBSD 12.0-CURRENT #1 1863588dca9(HEAD)-dirty: Wed Feb 28 17:23:37 UTC 2018 root@ta1-cadets:/usr/obj/data/update/build-meta/freebsd/amd64.amd64/sys/CADETS amd64",
#       "hostType":"HOST_DESKTOP",
#       "interfaces":[{
#         "name":"vtnet0",
#         "macAddress":"52:54:00:f0:0d:23",
#         "ipAddresses":["fe80::5054:ff:fef0:d23%vtnet0","10.0.6.23"]
#       },
#       {"name":"vtnet1",
#        "macAddress":"52:54:00:f0:08:23",
#        "ipAddresses":["fe80::5054:ff:fef0:823%vtnet1","128.55.12.73"]
#       }]
#     }
#   },
#   "CDMVersion":"18",
#   "source":"SOURCE_FREEBSD_DTRACE_CADETS"
# },
# {
#   "datum":{
#     "com.bbn.tc.schema.avro.cdm18.FileObject":{
#       "uuid":"42DD2C9E-36C2-11E8-BF66-D9AA8AFF4A69",
#       "baseObject":{
#         "hostId":"83C8ED1F-5045-DBCD-B39F-918F0DF4F851",
#         "permission":"null",
#         "epoch":"null",
#         "properties":{"map":{}}
#       },
#       "type":"FILE_OBJECT_FILE",
#       "fileDescriptor":"null",
#       "localPrincipal":"null",
#       "size":"null",
#       "peInfo":"null",
#       "hashes":"null"
#     }
#   },
#   "CDMVersion":"18",
#   "source":"SOURCE_FREEBSD_DTRACE_CADETS"
# }
# ]
```
%% Cell type:code id: tags:
``` python
import json

# data = '{"datum":{"com.bbn.tc.schema.avro.cdm18.Event":{"uuid":"5CC868CD-FF30-5E2B-BB74-6C5B474A62B2","sequence":{"long":1},"type":"EVENT_CLOSE","threadId":{"int":100117},"hostId":"83C8ED1F-5045-DBCD-B39F-918F0DF4F851","subject":{"com.bbn.tc.schema.avro.cdm18.UUID":"72FB0406-3678-11E8-BF66-D9AA8AFF4A69"},"predicateObject":{"com.bbn.tc.schema.avro.cdm18.UUID":"42DD2DBA-36C2-11E8-BF66-D9AA8AFF4A69"},"predicateObjectPath":null,"predicateObject2":null,"predicateObject2Path":null,"timestampNanos":1522706861813350340,"name":{"string":"aue_close"},"parameters":{"array":[]},"location":null,"size":null,"programPoint":null,"properties":{"map":{"host":"83c8ed1f-5045-dbcd-b39f-918f0df4f851","return_value":"0","fd":"28","exec":"python2.7","ppid":"1"}}}},"CDMVersion":"18","source":"SOURCE_FREEBSD_DTRACE_CADETS"}'
# input = json.loads(data)
# jsonType = list(input['datum'].keys())[0]
# # print(jsonType)
# # short type string
# nodeType = jsonType.rsplit(".", 1)[1]
# # print(nodeType)
# # data of object
# value = input["datum"][jsonType]
# relations = dict(
#     runsOn=""
#     , isGeneratedBy=""
#     , affects=list()
#     , residesOn=""
#     , isPartOf=""
#     , hasOwningPricipal=""
#     , hasTag=""
#     , hasParent=""
#     , hasOwningPrincipal=""
#     , hasAccountOn=""
# )
# # create relationships for host id
# if nodeType == 'Event':
#     relations.update({'runsOn': value['hostId']})
#     relations.update({'isGeneratedBy': value['subject']['com.bbn.tc.schema.avro.cdm18.UUID']})
#     if value['predicateObject'] != "Null":
#         relations['affects'].append(value['predicateObject'])
#     if value['predicateObject2'] != "Null":
#         relations['affects'].append(value['predicateObject2'])
#     value.pop('hostId')
#     value.pop('subject')
#     value.pop('predicateObject')
#     value.pop('predicateObjectPath')
#     value.pop('predicateObject2')
#     value.pop('predicateObject2Path')
# print("json for insertion: ", value)
# # print("relations: ", relations)
# # print(nodeType)

def flatten_dict(d, parent_key='', sep='_'):
    items = []
    for k, v in d.items():
        new_key = f"{parent_key}{sep}{k}" if parent_key else k
        if isinstance(v, dict):
            items.extend(flatten_dict(v, new_key, sep=sep).items())
        elif isinstance(v, list):
            # assumes list elements are dicts (true for the CDM samples used here)
            for i, val in enumerate(v):
                items.extend(flatten_dict(val, f"{new_key}{sep}{i}", sep=sep).items())
        else:
            items.append((new_key, v))
    return dict(items)

# Your JSON-like object
attributes = {
    'uuid': '08DB617B-6100-51F3-9742-902710EDCA7D',
    'sequence': 4600591,
    'type': 'EVENT_FCNTL',
    'threadId': {'int': 100117},
    'timestampNanos': 1522828473820631110,
    'name': {'string': 'aue_fcntl'},
    'parameters': {'array': [{'size': -1, 'type': 'VALUE_TYPE_CONTROL', 'valueDataType': 'VALUE_DATA_TYPE_INT', 'isNull': False, 'name': {'string': 'cmd'}, 'runtimeDataType': None, 'valueBytes': {'bytes': '04'}, 'provenance': None, 'tag': None, 'components': None}]},
    'location': None,
    'size': None,
    'programPoint': None,
    'properties': {'map': {'host': '83c8ed1f-5045-dbcd-b39f-918f0df4f851', 'return_value': '0', 'fd': '4', 'exec': 'python2.7', 'ppid': '1'}}
}
# Flatten the JSON-like object
flattened_attributes = flatten_dict(attributes)
# Print the flattened dictionary
print('flat: ', flattened_attributes)
# trim values
# print(value)
# print(isGeneratedBy)
```