Skip to content

Commit

Permalink
Merge pull request #21 from stratosphereips/feat_enhance-zeek-files-labeler
Browse files Browse the repository at this point in the history

Improve zeek-files-labeler.py PEP8 and ignore files
  • Loading branch information
verovaleros authored May 21, 2023
2 parents 8f0e202 + e37c7b9 commit ebfd3cd
Showing 1 changed file with 24 additions and 21 deletions.
45 changes: 24 additions & 21 deletions zeek-files-labeler.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@
from os.path import isfile, join
import subprocess

version = "0.1"
VERSION = "0.1"


def output_netflow_line_to_file(outputfile, originalline, filetype='', genericlabel='', detailedlabel=''):
Expand Down Expand Up @@ -72,7 +72,7 @@ def output_netflow_line_to_file(outputfile, originalline, filetype='', genericla
print(type(inst)) # the exception instance
print(inst.args) # arguments stored in .args
print(inst) # __str__ allows args to printed directly
exit(-1)
sys.exit(-1)


def define_columns(headerline, filetype):
Expand Down Expand Up @@ -253,8 +253,8 @@ def define_type(data):
"""
try:
# If line json, it can be Zeek or suricata
# If line CSV, it can be Argus
# If line TSV, it can be Argus or zeek
# If line CSV, it can be Argus
# If line TSV, it can be Argus or zeek

input_type = 'unknown'

Expand Down Expand Up @@ -430,12 +430,12 @@ def cache_labeled_file():
print(type(inst)) # the exception instance
print(inst.args) # arguments stored in .args
print(inst) # __str__ allows args to printed directly
exit(-1)
sys.exit(-1)

# Get the first header line to find the type
headerline = input_labeled_file.readline()

# If there are no headers, get out. Most start with '#' but Argus starts with 'StartTime' and nfdump with 'Date'
# If there are no headers, get out. Most start with '#' but Argus starts with 'StartTime' and nfdump with 'Date'
if '#' not in headerline[0]:
print('The labeled file has not headers. Please add them.')
sys.exit(-1)
Expand All @@ -446,7 +446,7 @@ def cache_labeled_file():
if args.verbose > 3:
print(f'[+] Type of labeled file to use: {filetype}')

# Define the columns
# Define the columns
if filetype == 'zeek-json':
input_labeled_file_column_idx = define_columns(headerline, filetype='json')
amount_lines_processed = 0
Expand Down Expand Up @@ -502,8 +502,10 @@ def cache_labeled_file():

def process_zeekfolder():
"""
This function takes the flowFile and parse it. Then it ask for a label and finally it calls a function to store the netflow in a file
Method:
This function takes the labeled conn.log file and parses it.
Then it asks for a label and finally it calls a function to store the netflow in a file.
Method:
1. Read the labeled file once and store the uid and labels in a dictionary
2. Read each of the files in the zeek folder, read their uid, and assign the label given to that uid in the labeled file
"""
Expand All @@ -515,7 +517,7 @@ def process_zeekfolder():
print('\n[+] Processing the zeek folder {0} for files to label'.format(args.zeekfolder))


# ----- Second, open each file in the folder, and label them.
# ----- Second, open each file in the folder, and label them.
# Get the list of files in this folder
zeekfiles = [f for f in listdir(args.zeekfolder) if isfile(join(args.zeekfolder, f))]

Expand All @@ -524,8 +526,11 @@ def process_zeekfolder():

for zeekfile_name in zeekfiles:

# Ignore labeled files, summary file and conn.log file
if '.labeled' in zeekfile_name or 'services' in zeekfile_name or 'summary' in zeekfile_name or 'conn.log' in zeekfile_name:
# Ignore the following files
ignore_keywords = ['.labeled', 'services', 'summary', 'conn.log', 'capture_loss.log',
'loaded_scripts.log', 'packet_filter.log', 'stats.log', 'reporter.log']

if any(keyword in zeekfile_name for keyword in ignore_keywords):
continue

# Ignore empty files
Expand All @@ -542,12 +547,12 @@ def process_zeekfolder():
print(type(inst)) # the exception instance
print(inst.args) # arguments stored in .args
print(inst) # __str__ allows args to printed directly
exit(-1)
sys.exit(-1)

# Get the first header line to find the type
headerline = zeekfile.readline()

# If there are no headers, get out. Most start with '#' but Argus starts with 'StartTime' and nfdump with 'Date'
# If there are no headers, get out. Most start with '#' but Argus starts with 'StartTime' and nfdump with 'Date'
if '#' not in headerline[0]:
print('The file has not headers. Please add them.')
sys.exit(-1)
Expand Down Expand Up @@ -583,7 +588,7 @@ def process_zeekfolder():
zeek_file_file_separator = '\t'

# ---- For the majority of zeek log files, using the UID from conn.log to find the related flow is ok
# ---- But it is not for x509.log and files.log.
# ---- But it is not for x509.log and files.log.

if zeekfile_name == 'x509.log':
line_to_label = zeekfile.readline().strip()
Expand Down Expand Up @@ -636,7 +641,7 @@ def process_zeekfolder():

else:
# ---- Read the lines from the rest of log files to label

# Read each line of the labeled file and get the zeek uid
line_to_label = zeekfile.readline().strip()

Expand All @@ -654,7 +659,7 @@ def process_zeekfolder():
uid = line_values[column_idx['conn_uids']]

lines_labeled += 1

try:
# Get the labels
generic_label_to_assign = labels_dict[uid][0]
Expand Down Expand Up @@ -700,11 +705,11 @@ def process_zeekfolder():
print(type(inst)) # the exception instance
print(inst.args) # arguments stored in .args
print(inst) # __str__ allows args to printed directly
exit(-1)
sys.exit(-1)


if __name__ == '__main__':
print('Zeek Files labeler from labeled conn.log.labeled file. Version {}'.format(version))
print('Zeek Files labeler from labeled conn.log.labeled file. Version {}'.format(VERSION))
print('https://stratosphereips.org')

# Parse the parameters
Expand All @@ -724,5 +729,3 @@ def process_zeekfolder():
# CTRL-C pretty handling.
print("Keyboard Interruption!. Exiting.")
sys.exit(1)


0 comments on commit ebfd3cd

Please sign in to comment.