Commit

Check the authors and dates of PUI suppression relevant commits.

Hhyemin committed Sep 12, 2024
1 parent 2b47e7a commit ecd9749
Showing 2 changed files with 167 additions and 39 deletions.
82 changes: 82 additions & 0 deletions src/suppression_study/experiments/CheckAuthorDatePUI.py
@@ -0,0 +1,82 @@
import csv
import json
import subprocess
from datetime import datetime
from os.path import join


def get_commit_info(commit_hash, project_dir):
"""Get authors and date information for a given commit hash in a specific repo."""
# Command to get the commit author and co-authors
author_command = f"git -C {project_dir} log --format='%aN' {commit_hash} -n 1"
coauthor_command = f"git -C {project_dir} log --format='%b' {commit_hash} -n 1"
# Command to get the commit date
date_command = f"git -C {project_dir} log --format='%ai' {commit_hash} -n 1"

author = subprocess.check_output(author_command, shell=True).decode('utf-8').strip()
coauthor_output = subprocess.check_output(coauthor_command, shell=True).decode('utf-8').strip()
commit_date = subprocess.check_output(date_command, shell=True).decode('utf-8').strip()

# Find any Co-authored-by: lines in the commit body
coauthors = []
for line in coauthor_output.split('\n'):
if line.startswith('Co-authored-by:'):
coauthor_name = line.split(':', 1)[1].strip().split('<')[0].strip()
coauthors.append(coauthor_name)

# Combine author and co-authors into a single list
authors = [author] + coauthors

return authors, commit_date
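
# Usage sketch (hypothetical commit hash and repo path; the output format follows
# git's %aN and %ai placeholders):
#   authors, date = get_commit_info("abc1234", join("data", "repos", "example-repo"))
#   authors -> ["Alice Example", "Bob Example"]   date -> "2024-09-12 10:23:45 +0200"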

def calculate_date_difference(date1, date2):
"""Calculate the difference in days between two dates."""
delta = date2 - date1
return delta.days

def main(file_path):
with open(file_path, 'r') as file:
data = json.load(file)

results = []
for check_item in data:
url = check_item["Check"][0]
repo_name = url.split("/")[-3]
repo_path = join("data", "repos", repo_name)

check_dict = check_item["Check"][1]
previous_commit = check_dict["previous_commit"]
commit = check_dict["commit"]

authors1, date1 = get_commit_info(previous_commit, repo_path)
authors2, date2 = get_commit_info(commit, repo_path)

# Check authors and calculate date difference
author = None
delta_days = None
if authors1 and authors2:
if authors1 == authors2:
author = "same"
elif set(authors1) & set(authors2):
author = "different but inclusive"
else:
author = "different"

if date1 and date2:
date1 = date1.split()[0] # Extract only the date part (ignoring time)
date2 = date2.split()[0]
date_format = "%Y-%m-%d"
delta_days = (datetime.strptime(date2, date_format) - datetime.strptime(date1, date_format)).days

results.append([previous_commit, ', '.join(authors1), date1, commit, ', '.join(authors2), date2, author, delta_days])

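    # Columns: previous commit, its authors, its date, current commit, its authors,
    # its date, author relationship, and day delta; no header row is written.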
with open(join("data", "results", "inspection_author_time3.csv"), 'w', newline='') as file:
writer = csv.writer(file)
writer.writerows(results)


if __name__ == "__main__":
file_path = join("data", "results", "inspection_accidental_commits.json")
main(file_path)
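
For context, a minimal sketch of the input this script assumes, inferred from the parsing code above (the URL and hashes are hypothetical placeholders):

    # One entry of inspection_accidental_commits.json:
    example_entry = {
        "Check": [
            "https://github.com/example-org/example-repo/commit/abc1234",  # split("/")[-3] -> "example-repo"
            {"previous_commit": "1111111", "commit": "2222222"},
        ]
    }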

@@ -5,63 +5,106 @@
 from sklearn.cluster import KMeans
 
 
-def plot_distance_from_warning_to_suppression(float_list, output_file, n_clusters):
-    data = np.array(float_list).reshape(-1, 1)
+def plot_distance_from_warning_to_suppression(distance_list, output_file, n_clusters):
+    data = np.array(distance_list).reshape(-1, 1)
 
     # Perform k-means clustering
     kmeans = KMeans(n_clusters=n_clusters, random_state=0).fit(data)
     labels = kmeans.labels_
     cluster_centers = kmeans.cluster_centers_.flatten()
 
     # Create a list of lists to hold grouped data
     grouped_data = [[] for _ in range(n_clusters)]
 
     for i, label in enumerate(labels):
-        grouped_data[label].append(float_list[i])
+        grouped_data[label].append(distance_list[i])
 
-    # Count the number of elements in each cluster
-    counts = [len(group) for group in grouped_data]
-
-    # Sort groups by the cluster center
-    sorted_indices = np.argsort(cluster_centers)
-    sorted_counts = np.array(counts)[sorted_indices]
-    sorted_centers = np.array(cluster_centers)[sorted_indices]
-
-    # Define group ranges for x-axis labels, ensuring that the ranges are contiguous
-    sorted_centers = np.sort(sorted_centers)
+    # Round cluster centers to the nearest multiple of 100
+    def round_to_nearest_100(x):
+        return np.round(x / 100) * 100
+
+    rounded_centers = np.array([round_to_nearest_100(center) for center in cluster_centers])
+
+    # Sort centers and counts
+    sorted_indices = np.argsort(rounded_centers)
+    sorted_centers = rounded_centers[sorted_indices]
+
+    # Define consistent group ranges
     group_ranges = []
+    lower_bound = np.floor(min(distance_list) / 100) * 100  # Start at the nearest lower multiple of 100
+
     for i in range(n_clusters):
-        if i == 0:
-            lower_bound = 0
-        else:
-            lower_bound = sorted_centers[i - 1] + 1
-
-        upper_bound = sorted_centers[i]
-        if i == n_clusters - 1:
-            upper_bound = max(float_list)
-
-        # print(f"L: {lower_bound}\tU: {upper_bound}")
-        # group_ranges.append(f'{lower_bound:.1f} - {upper_bound:.1f}')
-        if i == n_clusters - 1:
-            group_ranges.append(f'[{lower_bound:.1f},\n {upper_bound:.1f}]')
-        else:
-            group_ranges.append(f'[{lower_bound:.1f},\n {upper_bound:.1f})')
+        if i == n_clusters - 1:
+            upper_bound = max(distance_list) + 1  # np.floor(max(distance_list) / 100 + 1) * 100
+        else:
+            upper_bound = sorted_centers[i] + 1
 
+        group_ranges.append((lower_bound, upper_bound))
+        lower_bound = upper_bound
+
+    # Count the number of elements in each range
+    counts = [sum(lower_bound <= x < upper_bound for x in distance_list) for lower_bound, upper_bound in group_ranges]
+    formatted_ranges = [f'[{int(lower_bound)},\n {int(upper_bound)})' if i < len(group_ranges) - 1
+                        else f'[{int(lower_bound)},\n {int(upper_bound) - 1}]'
+                        for i, (lower_bound, upper_bound) in enumerate(group_ranges)]
     plt.figure(figsize=(12, 5))
-    plt.rcParams.update({'font.size': 16})
-    bars = plt.bar(group_ranges, sorted_counts) #, color='blue')
+    plt.rcParams.update({'font.size': 14})
+    bars = plt.bar(formatted_ranges, counts)
     for bar in bars:
-        plt.text(bar.get_x() + bar.get_width() / 2, 0.96 * bar.get_height(),
+        plt.text(bar.get_x() + bar.get_width() / 2, bar.get_height() + 0.2,
                  f'{int(bar.get_height())}', ha='center', va='bottom', color='black')
-    # plt.xticks(group_ranges, rotation=30)
 
     plt.xlabel('Distance from warnings to suppression (number of lines)')
     plt.ylabel('Number of Warnings')
+    # Remove decimal places from x-axis ticks
+    plt.gca().xaxis.set_major_locator(plt.MaxNLocator(integer=True))
+
+    # Determine the y-axis limits and ticks dynamically
+    max_count = max(counts)
+    y_tick_interval = 10 ** np.floor(np.log10(max_count))  # Get an appropriate interval for ticks
+    plt.ylim(0, np.ceil(max_count / y_tick_interval) * y_tick_interval)
+    plt.yticks(np.arange(0, plt.ylim()[1] + y_tick_interval, y_tick_interval))
    # plt.xticks(rotation=30)
     plt.tight_layout()
     plt.savefig(output_file)
 
-def main(file_path, output_file, n_clusters):
+def main(file_path):
     with open(file_path, 'r') as file:
         data = json.load(file)
 
+    # Specify contiguous distance groups; distances above 1000 fall into group 4
+    group_1 = range(0, 11)
+    group_2 = range(11, 101)
+    group_3 = range(101, 1001)
+    # group_4: all distances above 1000
+    count_1 = 0
+    count_2 = 0
+    count_3 = 0
+    count_4 = 0
+
     distance_list = []
     for check_item in data:
         check_dict = check_item["Check"][1]
         current_suppression_line = check_dict["suppression"]["line"]
         current_warnings = check_dict["warnings"]
         for w in current_warnings:
             distance = abs(w["line"] - current_suppression_line)
             distance_list.append(distance)
 
+            if distance in group_1:
+                count_1 += 1
+            elif distance in group_2:
+                count_2 += 1
+            elif distance in group_3:
+                count_3 += 1
+            else:
+                count_4 += 1
+
+    print(f"minimum distance: {min(distance_list)}, maximum: {max(distance_list)}")
+    print(f"Distance {group_1}: {count_1}")
+    print(f"Distance {group_2}: {count_2}")
+    print(f"Distance {group_3}: {count_3}")
+    print(f"Distance > 1000 lines: {count_4}")
+
+def main_cluster(file_path, output_file, n_clusters):
+    with open(file_path, 'r') as file:
+        data = json.load(file)
+
@@ -79,8 +122,11 @@ def main(file_path, output_file, n_clusters):
 
 if __name__ == "__main__":
     file_path = join("data", "results", "inspection_accidental_commits.json")
-    output_file_path = join("data", "results", "distance_from_warnings_to_suppression.pdf")
-    n_clusters = 8 # with n_clusters=8, it gives meaningful clusters
-    main(file_path, output_file_path, n_clusters)
-
-
+    # option #1: use clustering as a guide to get the groups; the groups may differ in size
+    # output_file_path = join("data", "results", "distance_from_warnings_to_suppression.pdf")
+    # n_clusters = 8  # with n_clusters=8, it gives meaningful clusters
+    # main_cluster(file_path, output_file_path, n_clusters)
+
+    # option #2: fixed, contiguous distance groups
+    main(file_path)
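
As a sanity check on the new bucketing logic, here is a minimal standalone sketch that mirrors the range-building loop from the diff above, with hypothetical cluster centers and distances in place of the k-means step:

    # Hypothetical rounded cluster centers and distances.
    sorted_centers = [0.0, 300.0, 1000.0]
    distances = [3, 57, 240, 880, 1500]

    group_ranges = []
    lower_bound = (min(distances) // 100) * 100  # nearest lower multiple of 100 -> 0
    for i in range(len(sorted_centers)):
        if i == len(sorted_centers) - 1:
            upper_bound = max(distances) + 1  # last bucket ends just past the maximum
        else:
            upper_bound = sorted_centers[i] + 1
        group_ranges.append((lower_bound, upper_bound))
        lower_bound = upper_bound

    counts = [sum(lo <= x < up for x in distances) for lo, up in group_ranges]
    print(group_ranges)  # [(0, 1.0), (1.0, 301.0), (301.0, 1501)]
    print(counts)        # [0, 3, 2]

Note the degenerate first bucket: when the smallest rounded center coincides with the starting lower bound, the first range collapses to a width-one interval, a quirk the plotting code inherits.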
