From ecd9749b43b94d9e7f154b853684fe2d2bd9c885 Mon Sep 17 00:00:00 2001
From: Hhyemin
Date: Thu, 12 Sep 2024 15:48:46 +0000
Subject: [PATCH] Check the authors and dates of PUI suppression relevant
 commits.

---
 .../experiments/CheckAuthorDatePUI.py         |  82 ++++++++++++
 .../CheckLineDistanceOfPUISuppreesions.py     | 124 ++++++++++++------
 2 files changed, 167 insertions(+), 39 deletions(-)
 create mode 100644 src/suppression_study/experiments/CheckAuthorDatePUI.py

diff --git a/src/suppression_study/experiments/CheckAuthorDatePUI.py b/src/suppression_study/experiments/CheckAuthorDatePUI.py
new file mode 100644
index 0000000..0bf90c2
--- /dev/null
+++ b/src/suppression_study/experiments/CheckAuthorDatePUI.py
@@ -0,0 +1,82 @@
+import csv
+import json
+import subprocess
+from datetime import datetime
+from os.path import join
+from datetime import datetime
+
+
+def get_commit_info(commit_hash, project_dir):
+    """Get authors and date information for a given commit hash in a specific repo."""
+    # Command to get the commit author and co-authors
+    author_command = f"git -C {project_dir} log --format='%aN' {commit_hash} -n 1"
+    coauthor_command = f"git -C {project_dir} log --format='%b' {commit_hash} -n 1"
+    # Command to get the commit date
+    date_command = f"git -C {project_dir} log --format='%ai' {commit_hash} -n 1"
+
+    author = subprocess.check_output(author_command, shell=True).decode('utf-8').strip()
+    coauthor_output = subprocess.check_output(coauthor_command, shell=True).decode('utf-8').strip()
+    commit_date = subprocess.check_output(date_command, shell=True).decode('utf-8').strip()
+
+    # Find any Co-authored-by: lines in the commit body
+    coauthors = []
+    for line in coauthor_output.split('\n'):
+        if line.startswith('Co-authored-by:'):
+            coauthor_name = line.split(':', 1)[1].strip().split('<')[0].strip()
+            coauthors.append(coauthor_name)
+
+    # Combine author and co-authors into a single list
+    authors = [author] + coauthors
+
+    return authors, commit_date
+
+def calculate_date_difference(date1, date2):
+    """Calculate the difference in days between two dates."""
+    delta = date2 - date1
+    return delta.days
+
+def main(file_path):
+    with open(file_path, 'r') as file:
+        data = json.load(file)
+
+    results = []
+    for check_item in data:
+        url = check_item["Check"][0]
+        repo_name = url.split("/")[-3]
+        repo_path = join("data", "repos", repo_name)
+
+        check_dict = check_item["Check"][1]
+        previous_commit = check_dict["previous_commit"]
+        commit = check_dict["commit"]
+
+        authors1, date1 = get_commit_info(previous_commit, repo_path)
+        authors2, date2 = get_commit_info(commit, repo_path)
+
+        # Check authors and calculate date difference
+        author = None
+        delta_days = None
+        if authors1 and authors2:
+            if authors1 == authors2:
+                author = "same"
+            elif set(authors1) & set(authors2):
+                author = "different but inclusive"
+            else:
+                author = "different"
+
+        if date1 and date2:
+            date1 = date1.split()[0] # Extract only the date part (ignoring time)
+            date2 = date2.split()[0]
+            date_format = "%Y-%m-%d"
+            delta_days = (datetime.strptime(date2, date_format) - datetime.strptime(date1, date_format)).days
+
+        results.append([previous_commit, ', '.join(authors1), date1, commit, ', '.join(authors2), date2, author, delta_days])
+
+    with open(join("data", "results", "inspection_author_time3.csv"), 'w', newline='') as file:
+        writer = csv.writer(file)
+        writer.writerows(results)
+
+
+if __name__ == "__main__":
+    file_path = join("data", "results", "inspection_accidental_commits.json")
+    main(file_path)
+
diff --git a/src/suppression_study/experiments/CheckLineDistanceOfPUISuppreesions.py b/src/suppression_study/experiments/CheckLineDistanceOfPUISuppreesions.py
index 13d049b..9a7002e 100644
--- a/src/suppression_study/experiments/CheckLineDistanceOfPUISuppreesions.py
+++ b/src/suppression_study/experiments/CheckLineDistanceOfPUISuppreesions.py
@@ -5,63 +5,106 @@ from sklearn.cluster import KMeans
 
 
-def plot_distance_from_warning_to_suppression(float_list, output_file, n_clusters):
-    data = np.array(float_list).reshape(-1, 1)
+def plot_distance_from_warning_to_suppression(distance_list, output_file, n_clusters):
+    data = np.array(distance_list).reshape(-1, 1)
 
     # Perform k-means clustering
     kmeans = KMeans(n_clusters=n_clusters, random_state=0).fit(data)
     labels = kmeans.labels_
    cluster_centers = kmeans.cluster_centers_.flatten()
-
-    # Create a list of lists to hold grouped data
     grouped_data = [[] for _ in range(n_clusters)]
     for i, label in enumerate(labels):
-        grouped_data[label].append(float_list[i])
+        grouped_data[label].append(distance_list[i])
 
+    # Count the number of elements in each cluster
     counts = [len(group) for group in grouped_data]
 
-    # Sort groups by the cluster center
-    sorted_indices = np.argsort(cluster_centers)
-    sorted_counts = np.array(counts)[sorted_indices]
-    sorted_centers = np.array(cluster_centers)[sorted_indices]
-
-    # Define group ranges for x-axis labels, ensuring that the ranges are contiguous
-    sorted_centers = np.sort(sorted_centers)
+    # Round cluster centers to the nearest multiple of 100
+    def round_to_nearest_100(x):
+        return np.round(x / 100) * 100
+
+    rounded_centers = np.array([round_to_nearest_100(center) for center in cluster_centers])
+
+    # Sort centers and counts
+    sorted_indices = np.argsort(rounded_centers)
+    sorted_centers = rounded_centers[sorted_indices]
+
+    # Define consistent group ranges
     group_ranges = []
+    lower_bound = np.floor(min(distance_list) / 100) * 100 # Start at the nearest lower multiple of 100
+
     for i in range(n_clusters):
-        if i == 0:
-            lower_bound = 0
-        else:
-            lower_bound = sorted_centers[i - 1] + 1
-
+        upper_bound = sorted_centers[i]
         if i == n_clusters - 1:
-            upper_bound = max(float_list)
-        else:
-            upper_bound = sorted_centers[i] + 1
-
-        # print(f"L: {lower_bound}\tU: {upper_bound}")
-        # group_ranges.append(f'{lower_bound:.1f} - {upper_bound:.1f}')
-        if i == n_clusters - 1:
-            group_ranges.append(f'[{lower_bound:.1f},\n {upper_bound:.1f}]')
-        else:
-            group_ranges.append(f'[{lower_bound:.1f},\n {upper_bound:.1f})')
-
+            upper_bound = max(distance_list) + 1 # np.floor(max(distance_list) / 100 + 1) * 100
+
+        group_ranges.append((lower_bound, upper_bound))
+        lower_bound = upper_bound
+
+    # Count the number of elements in each range
+    counts = [sum(lower_bound <= x < upper_bound for x in distance_list) for lower_bound, upper_bound in group_ranges]
+    formatted_ranges = [f'[{int(lower_bound)},\n {int(upper_bound)})' if i < len(group_ranges) - 1
+                        else f'[{int(lower_bound)},\n {int(upper_bound) - 1}]'
+                        for i, (lower_bound, upper_bound) in enumerate(group_ranges)]
 
     plt.figure(figsize=(12, 5))
-    plt.rcParams.update({'font.size': 16})
-    bars = plt.bar(group_ranges, sorted_counts) #, color='blue')
+    plt.rcParams.update({'font.size': 14})
+    bars = plt.bar(formatted_ranges, counts)
 
     for bar in bars:
-        plt.text(bar.get_x() + bar.get_width() / 2, 0.96 * bar.get_height(),
+        plt.text(bar.get_x() + bar.get_width() / 2, bar.get_height() + 0.2,
                  f'{int(bar.get_height())}', ha='center', va='bottom', color='black')
-    # plt.xticks(group_ranges, rotation=30)
+
     plt.xlabel('Distance from warnings to suppression (number of lines)')
     plt.ylabel('Number of Warnings')
-    # Remove decimal places from x-axis ticks
-    plt.gca().xaxis.set_major_locator(plt.MaxNLocator(integer=True))
+
+    # Determine the y-axis limits and ticks dynamically
+    max_count = max(counts)
+    y_tick_interval = 10 ** np.floor(np.log10(max_count)) # Get an appropriate interval for ticks
+    plt.ylim(0, np.ceil(max_count / y_tick_interval) * y_tick_interval)
+    plt.yticks(np.arange(0, plt.ylim()[1] + y_tick_interval, y_tick_interval))
+
     # plt.xticks(rotation=30)
     plt.tight_layout()
     plt.savefig(output_file)
 
-def main(file_path, output_file, n_clusters):
+def main(file_path):
+    with open(file_path, 'r') as file:
+        data = json.load(file)
+
+    # specify groups
+    group_1 = range(0, 11)
+    group_2 = range(11, 101)
+    group_3 = range(101, 1001)
+    # group_4 = range(1001, inf)
+    count_1 = 0
+    count_2 = 0
+    count_3 = 0
+    count_4 = 0
+
+    distance_list = []
+    for check_item in data:
+        check_dict = check_item["Check"][1]
+        current_suppression_line = check_dict["suppression"]["line"]
+        current_warnings = check_dict["warnings"]
+        for w in current_warnings:
+            distance = abs(w["line"] - current_suppression_line)
+            distance_list.append(distance)
+
+            if distance in group_1:
+                count_1 += 1
+            elif distance in group_2:
+                count_2 += 1
+            elif distance in group_3:
+                count_3 += 1
+            else:
+                count_4 += 1
+
+    print(f"minimum distance: {min(distance_list)}, maximum: {max(distance_list)}")
+    print(f"Distance {group_1}: {count_1}")
+    print(f"Distance {group_2}: {count_2}")
+    print(f"Distance {group_3}: {count_3}")
+    print(f"Distance > 1000 lines: {count_4}")
+
+def main_cluster(file_path, output_file, n_clusters):
     with open(file_path, 'r') as file:
         data = json.load(file)
 
@@ -79,8 +122,11 @@ def main(file_path, output_file, n_clusters):
 
 
 if __name__ == "__main__":
     file_path = join("data", "results", "inspection_accidental_commits.json")
-    output_file_path = join("data", "results", "distance_from_warnings_to_suppression.pdf")
-    n_clusters = 8 # with n_clusters=8, it gives meaningful clusters
-    main(file_path, output_file_path, n_clusters)
-
+    # option #1: use clustering as a guide to derive the groups; the groups may differ in size.
+    # output_file_path = join("data", "results", "distance_from_warnings_to_suppression.pdf")
+    # n_clusters = 8 # with n_clusters=8, it gives meaningful clusters
+    # main_cluster(file_path, output_file_path, n_clusters)
+
+    # option #2: fixed distance groups (see main)
+    main(file_path)
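
For reference, the following is a minimal, self-contained sketch (not part of the patch) of the author and date comparison that CheckAuthorDatePUI.py applies to each commit pair. The helper names classify_authors and days_between, the author lists, and the dates are hypothetical and only illustrate the classification; in the patch the author lists come from get_commit_info and the dates from `git log --format='%ai'`.

    from datetime import datetime

    def classify_authors(authors1, authors2):
        # Same three categories as in CheckAuthorDatePUI.main
        if authors1 == authors2:
            return "same"
        if set(authors1) & set(authors2):
            return "different but inclusive"
        return "different"

    def days_between(date1, date2):
        # Dates look like "2024-09-12 15:48:46 +0000"; only the date part is compared,
        # as in the patch.
        fmt = "%Y-%m-%d"
        return (datetime.strptime(date2.split()[0], fmt)
                - datetime.strptime(date1.split()[0], fmt)).days

    if __name__ == "__main__":
        authors_prev = ["Alice"]         # hypothetical author list of previous_commit
        authors_curr = ["Alice", "Bob"]  # hypothetical author list of commit (with a co-author)
        print(classify_authors(authors_prev, authors_curr))  # -> different but inclusive
        print(days_between("2024-09-01 10:00:00 +0000", "2024-09-12 15:48:46 +0000"))  # -> 11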