Merge pull request #21 from stratosphereips/feat_enhance-zeek-files-labeler

Improve zeek-files-labeler.py PEP8 and ignore files
stratosphereips · May 21, 2023 · ebfd3cd · ebfd3cd
2 parents 8f0e202 + e37c7b9
commit ebfd3cd
Showing 1 changed file with 24 additions and 21 deletions.
diff --git a/zeek-files-labeler.py b/zeek-files-labeler.py
@@ -32,7 +32,7 @@
 from os.path import isfile, join
 import subprocess
 
-version = "0.1"
+VERSION = "0.1"
 
 
 def output_netflow_line_to_file(outputfile, originalline, filetype='', genericlabel='', detailedlabel=''):
@@ -72,7 +72,7 @@ def output_netflow_line_to_file(outputfile, originalline, filetype='', genericla
         print(type(inst))     # the exception instance
         print(inst.args)      # arguments stored in .args
         print(inst)           # __str__ allows args to printed directly
-        exit(-1)
+        sys.exit(-1)
 
 
 def define_columns(headerline, filetype):
@@ -253,8 +253,8 @@ def define_type(data):
     """
     try:
         # If line json, it can be Zeek or suricata
-        # If line CSV, it can be Argus 
-        # If line TSV, it can be Argus  or zeek
+        # If line CSV, it can be Argus
+        # If line TSV, it can be Argus or zeek
 
         input_type = 'unknown'
 
@@ -430,12 +430,12 @@ def cache_labeled_file():
             print(type(inst))     # the exception instance
             print(inst.args)      # arguments stored in .args
             print(inst)           # __str__ allows args to printed directly
-            exit(-1)
+            sys.exit(-1)
 
         # Get the first header line to find the type
         headerline = input_labeled_file.readline()
 
-        # If there are no headers, get out. Most start with '#' but Argus starts with 'StartTime' and nfdump with 'Date' 
+        # If there are no headers, get out. Most start with '#' but Argus starts with 'StartTime' and nfdump with 'Date'
         if '#' not in headerline[0]:
             print('The labeled file has not headers. Please add them.')
             sys.exit(-1)
@@ -446,7 +446,7 @@ def cache_labeled_file():
         if args.verbose > 3:
             print(f'[+] Type of labeled file to use: {filetype}')
 
-        # Define the columns 
+        # Define the columns
         if filetype == 'zeek-json':
             input_labeled_file_column_idx = define_columns(headerline, filetype='json')
             amount_lines_processed = 0
@@ -502,8 +502,10 @@ def cache_labeled_file():
 
 def process_zeekfolder():
     """
-    This function takes the flowFile and parse it. Then it ask for a label and finally it calls a function to store the netflow in a file
-    Method: 
+    This function takes the labeled conn.log file and parses it.
+    Then it asks for a label and finally it calls a function to store the netflow in a file.
+
+    Method:
     1. Read the labeled file once and store the uid and labels in a dictionary
     2. Read each of the files in the zeek folder, read their uid, and assign the label given to that uid in the labeled file
     """
@@ -515,7 +517,7 @@ def process_zeekfolder():
             print('\n[+] Processing the zeek folder {0} for files to label'.format(args.zeekfolder))
 
 
-        # ----- Second, open each file in the folder, and label them. 
+        # ----- Second, open each file in the folder, and label them.
         # Get the list of files in this folder
         zeekfiles = [f for f in listdir(args.zeekfolder) if isfile(join(args.zeekfolder, f))]
 
@@ -524,8 +526,11 @@ def process_zeekfolder():
 
         for zeekfile_name in zeekfiles:
 
-            # Ignore labeled files, summary file and conn.log file
-            if '.labeled' in zeekfile_name or 'services' in zeekfile_name or 'summary' in zeekfile_name or 'conn.log' in zeekfile_name:
+            # Ignore the following files
+            ignore_keywords = ['.labeled', 'services', 'summary', 'conn.log', 'capture_loss.log',
+            'loaded_scripts.log', 'packet_filter.log', 'stats.log', 'reporter.log']
+
+            if any(keyword in zeekfile_name for keyword in ignore_keywords):
                 continue
 
             # Ignore empty files
@@ -542,12 +547,12 @@ def process_zeekfolder():
                 print(type(inst))     # the exception instance
                 print(inst.args)      # arguments stored in .args
                 print(inst)           # __str__ allows args to printed directly
-                exit(-1)
+                sys.exit(-1)
 
             # Get the first header line to find the type
             headerline = zeekfile.readline()
 
-            # If there are no headers, get out. Most start with '#' but Argus starts with 'StartTime' and nfdump with 'Date' 
+            # If there are no headers, get out. Most start with '#' but Argus starts with 'StartTime' and nfdump with 'Date'
             if '#' not in headerline[0]:
                 print('The file has not headers. Please add them.')
                 sys.exit(-1)
@@ -583,7 +588,7 @@ def process_zeekfolder():
                 zeek_file_file_separator = '\t'
 
             # ---- For the majority of zeek log files, using the UID from conn.log to find the related flow is ok
-            # ---- But it is not for x509.log and files.log. 
+            # ---- But it is not for x509.log and files.log.
 
             if zeekfile_name == 'x509.log':
                 line_to_label = zeekfile.readline().strip()
@@ -636,7 +641,7 @@ def process_zeekfolder():
 
             else:
                 # ---- Read the lines from the rest of log files to label
-                
+
                 # Read each line of the labeled file and get the zeek uid
                 line_to_label = zeekfile.readline().strip()
 
@@ -654,7 +659,7 @@ def process_zeekfolder():
                             uid = line_values[column_idx['conn_uids']]
 
                         lines_labeled += 1
-                        
+
                         try:
                             # Get the labels
                             generic_label_to_assign = labels_dict[uid][0]
@@ -700,11 +705,11 @@ def process_zeekfolder():
         print(type(inst))     # the exception instance
         print(inst.args)      # arguments stored in .args
         print(inst)           # __str__ allows args to printed directly
-        exit(-1)
+        sys.exit(-1)
 
 
 if __name__ == '__main__':
-    print('Zeek Files labeler from labeled conn.log.labeled file. Version {}'.format(version))
+    print('Zeek Files labeler from labeled conn.log.labeled file. Version {}'.format(VERSION))
     print('https://stratosphereips.org')
 
     # Parse the parameters
@@ -724,5 +729,3 @@ def process_zeekfolder():
         # CTRL-C pretty handling.
         print("Keyboard Interruption!. Exiting.")
         sys.exit(1)
-
-