From ecd9749b43b94d9e7f154b853684fe2d2bd9c885 Mon Sep 17 00:00:00 2001
From: Hhyemin
Date: Thu, 12 Sep 2024 15:48:46 +0000
Subject: [PATCH] Check the authors and dates of PUI suppression relevant
 commits.

---
 .../experiments/CheckAuthorDatePUI.py         |  82 ++++++++++++
 .../CheckLineDistanceOfPUISuppreesions.py     | 124 ++++++++++++------
 2 files changed, 167 insertions(+), 39 deletions(-)
 create mode 100644 src/suppression_study/experiments/CheckAuthorDatePUI.py

diff --git a/src/suppression_study/experiments/CheckAuthorDatePUI.py b/src/suppression_study/experiments/CheckAuthorDatePUI.py
new file mode 100644
index 0000000..0bf90c2
--- /dev/null
+++ b/src/suppression_study/experiments/CheckAuthorDatePUI.py
@@ -0,0 +1,82 @@
+import csv
+import json
+import subprocess
+from datetime import datetime
+from os.path import join
+from datetime import datetime
+
+
+def get_commit_info(commit_hash, project_dir):
+    """Get authors and date information for a given commit hash in a specific repo."""
+    # Command to get the commit author and co-authors
+    author_command = f"git -C {project_dir} log --format='%aN' {commit_hash} -n 1"
+    coauthor_command = f"git -C {project_dir} log --format='%b' {commit_hash} -n 1"
+    # Command to get the commit date
+    date_command = f"git -C {project_dir} log --format='%ai' {commit_hash} -n 1"
+
+    author = subprocess.check_output(author_command, shell=True).decode('utf-8').strip()
+    coauthor_output = subprocess.check_output(coauthor_command, shell=True).decode('utf-8').strip()
+    commit_date = subprocess.check_output(date_command, shell=True).decode('utf-8').strip()
+
+    # Find any Co-authored-by: lines in the commit body
+    coauthors = []
+    for line in coauthor_output.split('\n'):
+        if line.startswith('Co-authored-by:'):
+            coauthor_name = line.split(':', 1)[1].strip().split('<')[0].strip()
+            coauthors.append(coauthor_name)
+
+    # Combine author and co-authors into a single list
+    authors = [author] + coauthors
+
+    return authors, commit_date
+
+def calculate_date_difference(date1, date2):
+    """Calculate the difference in days between two dates."""
+    delta = date2 - date1
+    return delta.days
+
+def main(file_path):
+    with open(file_path, 'r') as file:
+        data = json.load(file)
+
+    results = []
+    for check_item in data:
+        url = check_item["Check"][0]
+        repo_name = url.split("/")[-3]
+        repo_path = join("data", "repos", repo_name)
+
+        check_dict = check_item["Check"][1]
+        previous_commit = check_dict["previous_commit"]
+        commit = check_dict["commit"]
+
+        authors1, date1 = get_commit_info(previous_commit, repo_path)
+        authors2, date2 = get_commit_info(commit, repo_path)
+
+        # Check authors and calculate date difference
+        author = None
+        delta_days = None
+        if authors1 and authors2:
+            if authors1 == authors2:
+                author = "same"
+            elif set(authors1) & set(authors2):
+                author = "different but inclusive"
+            else:
+                author = "different"
+
+        if date1 and date2:
+            date1 = date1.split()[0] # Extract only the date part (ignoring time)
+            date2 = date2.split()[0]
+            date_format = "%Y-%m-%d"
+            delta_days = (datetime.strptime(date2, date_format) - datetime.strptime(date1, date_format)).days
+
+        results.append([previous_commit, ', '.join(authors1), date1, commit, ', '.join(authors2), date2, author, delta_days])
+
+    with open(join("data", "results", "inspection_author_time3.csv"), 'w', newline='') as file:
+        writer = csv.writer(file)
+        writer.writerows(results)
+
+
+if __name__ == "__main__":
+    file_path = join("data", "results", "inspection_accidental_commits.json")
+    main(file_path)
+
diff --git a/src/suppression_study/experiments/CheckLineDistanceOfPUISuppreesions.py b/src/suppression_study/experiments/CheckLineDistanceOfPUISuppreesions.py
index 13d049b..9a7002e 100644
--- a/src/suppression_study/experiments/CheckLineDistanceOfPUISuppreesions.py
+++ b/src/suppression_study/experiments/CheckLineDistanceOfPUISuppreesions.py
@@ -5,63 +5,106 @@ from sklearn.cluster import KMeans
 
 
-def plot_distance_from_warning_to_suppression(float_list, output_file, n_clusters):
-    data = np.array(float_list).reshape(-1, 1)
+def plot_distance_from_warning_to_suppression(distance_list, output_file, n_clusters):
+    data = np.array(distance_list).reshape(-1, 1)
 
     # Perform k-means clustering
     kmeans = KMeans(n_clusters=n_clusters, random_state=0).fit(data)
     labels = kmeans.labels_
    cluster_centers = kmeans.cluster_centers_.flatten()
-
-    # Create a list of lists to hold grouped data
     grouped_data = [[] for _ in range(n_clusters)]
     for i, label in enumerate(labels):
-        grouped_data[label].append(float_list[i])
+        grouped_data[label].append(distance_list[i])
 
+    # Count the number of elements in each cluster
     counts = [len(group) for group in grouped_data]
 
-    # Sort groups by the cluster center
-    sorted_indices = np.argsort(cluster_centers)
-    sorted_counts = np.array(counts)[sorted_indices]
-    sorted_centers = np.array(cluster_centers)[sorted_indices]
-
-    # Define group ranges for x-axis labels, ensuring that the ranges are contiguous
-    sorted_centers = np.sort(sorted_centers)
+    # Round cluster centers to the nearest multiple of 100
+    def round_to_nearest_100(x):
+        return np.round(x / 100) * 100
+
+    rounded_centers = np.array([round_to_nearest_100(center) for center in cluster_centers])
+
+    # Sort centers and counts
+    sorted_indices = np.argsort(rounded_centers)
+    sorted_centers = rounded_centers[sorted_indices]
+
+    # Define consistent group ranges
     group_ranges = []
+    lower_bound = np.floor(min(distance_list) / 100) * 100 # Start at the nearest lower multiple of 100
+
     for i in range(n_clusters):
-        if i == 0:
-            lower_bound = 0
-        else:
-            lower_bound = sorted_centers[i - 1] + 1
-
+        upper_bound = sorted_centers[i]
         if i == n_clusters - 1:
-            upper_bound = max(float_list)
-        else:
-            upper_bound = sorted_centers[i] + 1
-
-        # print(f"L: {lower_bound}\tU: {upper_bound}")
-        # group_ranges.append(f'{lower_bound:.1f} - {upper_bound:.1f}')
-        if i == n_clusters - 1:
-            group_ranges.append(f'[{lower_bound:.1f},\n {upper_bound:.1f}]')
-        else:
-            group_ranges.append(f'[{lower_bound:.1f},\n {upper_bound:.1f})')
-
+            upper_bound = max(distance_list) + 1 # np.floor(max(distance_list) / 100 + 1) * 100
+
+        group_ranges.append((lower_bound, upper_bound))
+        lower_bound = upper_bound
+
+    # Count the number of elements in each range
+    counts = [sum(lower_bound <= x < upper_bound for x in distance_list) for lower_bound, upper_bound in group_ranges]
+    formatted_ranges = [f'[{int(lower_bound)},\n {int(upper_bound)})' if i < len(group_ranges) - 1
+                        else f'[{int(lower_bound)},\n {int(upper_bound) - 1}]'
+                        for i, (lower_bound, upper_bound) in enumerate(group_ranges)]
 
     plt.figure(figsize=(12, 5))
-    plt.rcParams.update({'font.size': 16})
-    bars = plt.bar(group_ranges, sorted_counts) #, color='blue')
+    plt.rcParams.update({'font.size': 14})
+    bars = plt.bar(formatted_ranges, counts)
 
     for bar in bars:
-        plt.text(bar.get_x() + bar.get_width() / 2, 0.96 * bar.get_height(),
+        plt.text(bar.get_x() + bar.get_width() / 2, bar.get_height() + 0.2,
                  f'{int(bar.get_height())}', ha='center', va='bottom', color='black')
-    # plt.xticks(group_ranges, rotation=30)
+
     plt.xlabel('Distance from warnings to suppression (number of lines)')
     plt.ylabel('Number of Warnings')
-    # Remove decimal places from x-axis ticks
-    plt.gca().xaxis.set_major_locator(plt.MaxNLocator(integer=True))
+
+    # Determine the y-axis limits and ticks dynamically
+    max_count = max(counts)
+    y_tick_interval = 10 ** np.floor(np.log10(max_count)) # Get an appropriate interval for ticks
+    plt.ylim(0, np.ceil(max_count / y_tick_interval) * y_tick_interval)
+    plt.yticks(np.arange(0, plt.ylim()[1] + y_tick_interval, y_tick_interval))
+
     # plt.xticks(rotation=30)
     plt.tight_layout()
     plt.savefig(output_file)
 
-def main(file_path, output_file, n_clusters):
+def main(file_path):
+    with open(file_path, 'r') as file:
+        data = json.load(file)
+
+    # specify groups
+    group_1 = range(0, 11)
+    group_2 = range(11, 101)
+    group_3 = range(101, 1001)
+    # group_4 = range(1001, inf)
+    count_1 = 0
+    count_2 = 0
+    count_3 = 0
+    count_4 = 0
+
+    distance_list = []
+    for check_item in data:
+        check_dict = check_item["Check"][1]
+        current_suppression_line = check_dict["suppression"]["line"]
+        current_warnings = check_dict["warnings"]
+        for w in current_warnings:
+            distance = abs(w["line"] - current_suppression_line)
+            distance_list.append(distance)
+
+            if distance in group_1:
+                count_1 += 1
+            elif distance in group_2:
+                count_2 += 1
+            elif distance in group_3:
+                count_3 += 1
+            else:
+                count_4 += 1
+
+    print(f"minimum distance: {min(distance_list)}, maximum: {max(distance_list)}")
+    print(f"Distance {group_1}: {count_1}")
+    print(f"Distance {group_2}: {count_2}")
+    print(f"Distance {group_3}: {count_3}")
+    print(f"Distance > 1000 lines: {count_4}")
+
+def main_cluster(file_path, output_file, n_clusters):
     with open(file_path, 'r') as file:
         data = json.load(file)
 
@@ -79,8 +122,11 @@ def main(file_path, output_file, n_clusters):
 
 
 if __name__ == "__main__":
     file_path = join("data", "results", "inspection_accidental_commits.json")
-    output_file_path = join("data", "results", "distance_from_warnings_to_suppression.pdf")
-    n_clusters = 8 # with n_clusters=8, it gives meaningful clusters
-    main(file_path, output_file_path, n_clusters)
-
+    # option #1: use clustering as a guide to derive the groups; the groups may differ in size.
+    # output_file_path = join("data", "results", "distance_from_warnings_to_suppression.pdf")
+    # n_clusters = 8 # with n_clusters=8, it gives meaningful clusters
+    # main_cluster(file_path, output_file_path, n_clusters)
+
+    # option #2: fixed distance groups (see main)
+    main(file_path)
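
For reference, the following is a minimal, self-contained sketch (not part of the patch) of the author and date comparison that CheckAuthorDatePUI.py applies to each commit pair. The helper names classify_authors and days_between, the author lists, and the dates are hypothetical and only illustrate the classification; in the patch the author lists come from get_commit_info and the dates from `git log --format='%ai'`.

    from datetime import datetime

    def classify_authors(authors1, authors2):
        # Same three categories as in CheckAuthorDatePUI.main
        if authors1 == authors2:
            return "same"
        if set(authors1) & set(authors2):
            return "different but inclusive"
        return "different"

    def days_between(date1, date2):
        # Dates look like "2024-09-12 15:48:46 +0000"; only the date part is compared,
        # as in the patch.
        fmt = "%Y-%m-%d"
        return (datetime.strptime(date2.split()[0], fmt)
                - datetime.strptime(date1.split()[0], fmt)).days

    if __name__ == "__main__":
        authors_prev = ["Alice"]         # hypothetical author list of previous_commit
        authors_curr = ["Alice", "Bob"]  # hypothetical author list of commit (with a co-author)
        print(classify_authors(authors_prev, authors_curr))  # -> different but inclusive
        print(days_between("2024-09-01 10:00:00 +0000", "2024-09-12 15:48:46 +0000"))  # -> 11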