Skip to content

Commit

Permalink
analyze script is included to repo
Browse files Browse the repository at this point in the history
  • Loading branch information
mrtrkmn committed Jul 19, 2023
1 parent 848c463 commit 40f1719
Showing 1 changed file with 87 additions and 0 deletions.
87 changes: 87 additions & 0 deletions utils/analyze.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
import pandas as pd
import matplotlib.pyplot as plt
from pandas.core.internals.blocks import F
# example run on Google colab
# https://colab.research.google.com/drive/17Zn2y9QnjWchwNdBpJ4brjj7IPzH5I_5?usp=sharing

KPI_COLUMNS = [
"Operating revenue (Turnover)\nm USD",
"Sales\nm USD",
"Gross profit\nm USD",
"P/L before tax\nm USD",
"P/L for period [=Net income]\nm USD",
"Cash flow\nm USD",
"Total assets\nm USD",
# "Number of employees\n",
]
available_years = [str(year) for year in range(1994, 2024)]


def create_data_dictionary(df):
company_missing_kpi_mapping = {}
# # Iterate over each row
for index, row in df.iterrows():
agreement_date_year = str(row["agreementDate"]).split("-")[0]
if agreement_date_year != "nan":
agreement_date_year = str(int(agreement_date_year) - 1)
company_name = row["Company name Latin alphabet_x"]
if company_name not in d:
company_missing_kpi_mapping[company_name] = {}
else:
if "missing_kpi" not in company_missing_kpi_mapping[company_name]:
company_missing_kpi_mapping[company_name]["missing_kpi"] = 0
else:
for column in KPI_COLUMNS:
if int(agreement_date_year) < 1994:
company_missing_kpi_mapping[company_name][f"{column} {agreement_date_year}"] = False
else:
print(row[f"{column} {agreement_date_year}"])
if (
row[f"{column} {agreement_date_year}"] == "n.a."
or row[f"{column} {agreement_date_year}"] is None
):
company_missing_kpi_mapping[company_name]["missing_kpi"] += 1
company_missing_kpi_mapping[company_name][f"{column} {agreement_date_year}"] = False
return company_missing_kpi_mapping


def create_missing_kpi_var_map(data: dict, is_licensee: bool = True):
# Extracting companies with missing_kpi values
companies = [company for company in data.keys() if data[company].get("missing_kpi", 0) > 0]
missing_kpis = [data[company]["missing_kpi"] for company in companies]

# Sorting companies based on missing_kpi values (high to low)
sorted_companies, sorted_missing_kpis = zip(*sorted(zip(companies, missing_kpis), key=lambda x: x[1], reverse=True))

# Prepare the total number of missing KPI variables for each company
missing_kpi_vars = []
for company in sorted_companies:
missing_vars = [var for var in data[company] if data[company][var] is False]
missing_kpi_vars.append(", ".join(missing_vars))

# Plotting the bar chart
plt.figure(figsize=(12, 6))
bars = plt.bar(range(len(sorted_companies)), sorted_missing_kpis)
plt.xlabel('Company')
plt.ylabel('Number of Missing KPIs')
if is_licensee:
plt.title('Companies with Missing KPIs on merged data (Licensee)')
else:
plt.title('Companies with Missing KPIs on merged data (Licensor)')
plt.xticks(range(len(sorted_companies)), sorted_companies, rotation=90)

# Adding values inside the bars
for bar, value in zip(bars, sorted_missing_kpis):
plt.text(bar.get_x() + bar.get_width() / 2, bar.get_height() - 1, str(value), ha='center', va='center', fontsize=10, color='white')

# Adding total number of missing KPI variables as text in the right upper corner
total_missing_kpi_vars = sum(len(missing_vars.split(", ")) for missing_vars in missing_kpi_vars)
table_total_text = f"Total Missing KPI Variables: {total_missing_kpi_vars}"
plt.text(0.65, 0.90, table_total_text, transform=plt.gca().transAxes, fontsize=12, fontweight='bold')

plt.tight_layout()
plt.show()




0 comments on commit 40f1719

Please sign in to comment.