diff --git a/zeek-files-labeler.py b/zeek-files-labeler.py index f4db0ba..c5990f1 100755 --- a/zeek-files-labeler.py +++ b/zeek-files-labeler.py @@ -32,7 +32,7 @@ from os.path import isfile, join import subprocess -version = "0.1" +VERSION = "0.1" def output_netflow_line_to_file(outputfile, originalline, filetype='', genericlabel='', detailedlabel=''): @@ -72,7 +72,7 @@ def output_netflow_line_to_file(outputfile, originalline, filetype='', genericla print(type(inst)) # the exception instance print(inst.args) # arguments stored in .args print(inst) # __str__ allows args to printed directly - exit(-1) + sys.exit(-1) def define_columns(headerline, filetype): @@ -253,8 +253,8 @@ def define_type(data): """ try: # If line json, it can be Zeek or suricata - # If line CSV, it can be Argus - # If line TSV, it can be Argus or zeek + # If line CSV, it can be Argus + # If line TSV, it can be Argus or zeek input_type = 'unknown' @@ -430,12 +430,12 @@ def cache_labeled_file(): print(type(inst)) # the exception instance print(inst.args) # arguments stored in .args print(inst) # __str__ allows args to printed directly - exit(-1) + sys.exit(-1) # Get the first header line to find the type headerline = input_labeled_file.readline() - # If there are no headers, get out. Most start with '#' but Argus starts with 'StartTime' and nfdump with 'Date' + # If there are no headers, get out. Most start with '#' but Argus starts with 'StartTime' and nfdump with 'Date' if '#' not in headerline[0]: print('The labeled file has not headers. Please add them.') sys.exit(-1) @@ -446,7 +446,7 @@ def cache_labeled_file(): if args.verbose > 3: print(f'[+] Type of labeled file to use: {filetype}') - # Define the columns + # Define the columns if filetype == 'zeek-json': input_labeled_file_column_idx = define_columns(headerline, filetype='json') amount_lines_processed = 0 @@ -502,8 +502,10 @@ def cache_labeled_file(): def process_zeekfolder(): """ - This function takes the flowFile and parse it. Then it ask for a label and finally it calls a function to store the netflow in a file - Method: + This function takes the labeled conn.log file and parses it. + Then it asks for a label and finally it calls a function to store the netflow in a file. + + Method: 1. Read the labeled file once and store the uid and labels in a dictionary 2. Read each of the files in the zeek folder, read their uid, and assign the label given to that uid in the labeled file """ @@ -515,7 +517,7 @@ def process_zeekfolder(): print('\n[+] Processing the zeek folder {0} for files to label'.format(args.zeekfolder)) - # ----- Second, open each file in the folder, and label them. + # ----- Second, open each file in the folder, and label them. # Get the list of files in this folder zeekfiles = [f for f in listdir(args.zeekfolder) if isfile(join(args.zeekfolder, f))] @@ -524,8 +526,11 @@ def process_zeekfolder(): for zeekfile_name in zeekfiles: - # Ignore labeled files, summary file and conn.log file - if '.labeled' in zeekfile_name or 'services' in zeekfile_name or 'summary' in zeekfile_name or 'conn.log' in zeekfile_name: + # Ignore the following files + ignore_keywords = ['.labeled', 'services', 'summary', 'conn.log', 'capture_loss.log', + 'loaded_scripts.log', 'packet_filter.log', 'stats.log', 'reporter.log'] + + if any(keyword in zeekfile_name for keyword in ignore_keywords): continue # Ignore empty files @@ -542,12 +547,12 @@ def process_zeekfolder(): print(type(inst)) # the exception instance print(inst.args) # arguments stored in .args print(inst) # __str__ allows args to printed directly - exit(-1) + sys.exit(-1) # Get the first header line to find the type headerline = zeekfile.readline() - # If there are no headers, get out. Most start with '#' but Argus starts with 'StartTime' and nfdump with 'Date' + # If there are no headers, get out. Most start with '#' but Argus starts with 'StartTime' and nfdump with 'Date' if '#' not in headerline[0]: print('The file has not headers. Please add them.') sys.exit(-1) @@ -583,7 +588,7 @@ def process_zeekfolder(): zeek_file_file_separator = '\t' # ---- For the majority of zeek log files, using the UID from conn.log to find the related flow is ok - # ---- But it is not for x509.log and files.log. + # ---- But it is not for x509.log and files.log. if zeekfile_name == 'x509.log': line_to_label = zeekfile.readline().strip() @@ -636,7 +641,7 @@ def process_zeekfolder(): else: # ---- Read the lines from the rest of log files to label - + # Read each line of the labeled file and get the zeek uid line_to_label = zeekfile.readline().strip() @@ -654,7 +659,7 @@ def process_zeekfolder(): uid = line_values[column_idx['conn_uids']] lines_labeled += 1 - + try: # Get the labels generic_label_to_assign = labels_dict[uid][0] @@ -700,11 +705,11 @@ def process_zeekfolder(): print(type(inst)) # the exception instance print(inst.args) # arguments stored in .args print(inst) # __str__ allows args to printed directly - exit(-1) + sys.exit(-1) if __name__ == '__main__': - print('Zeek Files labeler from labeled conn.log.labeled file. Version {}'.format(version)) + print('Zeek Files labeler from labeled conn.log.labeled file. Version {}'.format(VERSION)) print('https://stratosphereips.org') # Parse the parameters @@ -724,5 +729,3 @@ def process_zeekfolder(): # CTRL-C pretty handling. print("Keyboard Interruption!. Exiting.") sys.exit(1) - -