-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsend_to_telegram.py
393 lines (302 loc) · 17 KB
/
send_to_telegram.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
import pandas as pd
import os
import matplotlib.pyplot as plt
from datetime import datetime
import matplotlib.dates as mdates
import matplotlib.patches as patches
import seaborn as sns
from dotenv import load_dotenv
import asyncio
from telegram import Bot
import nest_asyncio
import glob
# Let python-dotenv populate the process environment (e.g. from a local .env file).
load_dotenv()

# =========== Telegram Bot configuration ==============
# Both settings are read from the environment; each is None when its variable is unset.
bot_token = os.getenv('BOT_TOKEN')
chat_id = os.getenv('CHAT_ID')

# One module-wide Telegram client, shared by every send helper in this file.
bot = Bot(token=bot_token)
# =============== Check that data was fetched ==============
# The script only continues when 'data_fetched.txt' exists and contains the
# text 'Data fetched' (presumably written by the fetch step -- verify against
# the pipeline). Otherwise it stops with exit status 0, as the earlier
# `exit()` calls did.
try:
    # EAFP: open directly instead of os.path.exists() + open(), so the file
    # cannot disappear between the check and the read.
    with open('data_fetched.txt', 'r') as flag_file:
        flag_text = flag_file.read().strip()
except FileNotFoundError:
    print("data_fetched.txt file not exists. Exiting script.")
    # `raise SystemExit` is what exit() does, without relying on the
    # interactive helper injected by the `site` module.
    raise SystemExit from None

if flag_text != 'Data fetched':
    print("Data not fetched. Exiting script.")
    raise SystemExit
#to load data from a CSV file
def load_data(file_path):
    """Read the CSV file at ``file_path`` and return it as a pandas DataFrame."""
    return pd.read_csv(file_path)
#to send all PNG files in a folder
async def send_all_images_in_folder(folder_path):
    """Send every ``*.png`` file found directly inside ``folder_path``.

    Files are sent one at a time through ``send_image`` in sorted path order.
    """
    pattern = os.path.join(folder_path, '*.png')
    # glob.glob() returns matches in arbitrary order; sorting makes the
    # number-prefixed plots ('1-...', '2-...') go out in a stable sequence.
    for png_file in sorted(glob.glob(pattern)):
        await send_image(png_file)
#def send_image(image_path):
# asyncio.get_event_loop().run_until_complete(send_image_async(image_path))
async def send_image_with_caption(image_path, caption):
    """Send the image file at ``image_path`` to the configured chat with ``caption``."""
    # Opened in binary mode; the open file object itself is passed as the photo.
    with open(image_path, 'rb') as photo_stream:
        await bot.send_photo(chat_id=chat_id, photo=photo_stream, caption=caption)
async def send_image_async(image_path):
    """Send the image file at ``image_path`` to the configured chat without a caption."""
    # Opened in binary mode; the open file object itself is passed as the photo.
    with open(image_path, 'rb') as photo_stream:
        await bot.send_photo(chat_id=chat_id, photo=photo_stream)
async def send_image(image_path) -> None:
    """Send a single image; thin alias that delegates to ``send_image_async``."""
    await send_image_async(image_path)
#====================Plot Visualisation=====================
async def send_latest_donation_info(data):
    """Send the daily text update built from the 'Malaysia' rows of ``data``.

    ``data`` needs 'date', 'state' and 'daily' columns. If building the
    message fails, the error text is sent to the chat instead (best effort).
    Errors raised by the Telegram send itself are not swallowed.
    """
    try:
        message = _build_latest_donation_message(data)
    except Exception as e:
        # Only data processing is guarded here, so a Telegram failure is no
        # longer misreported as a data error.
        message = f"An error occurred while processing the data: {e}"
    await bot.send_message(chat_id=chat_id, text=message)


def _build_latest_donation_message(data):
    """Return the update text for the latest date in the 'Malaysia' rows of ``data``.

    Works on a copy, so the caller's DataFrame is left untouched.
    """
    malaysia_data = data[data['state'] == 'Malaysia'].copy()
    malaysia_data['date'] = pd.to_datetime(malaysia_data['date'])

    max_date = malaysia_data['date'].max()
    on_latest_date = malaysia_data['date'] == max_date
    total_donations_latest_date = malaysia_data.loc[on_latest_date, 'daily'].sum()

    # Use the year of the newest data point rather than the wall clock:
    # right after New Year the data still ends in the previous year, and
    # datetime.now().year would report a total of 0 for the new year.
    report_year = max_date.year
    in_report_year = malaysia_data['date'].dt.year == report_year
    total_donations_report_year = malaysia_data.loc[in_report_year, 'daily'].sum()

    formatted_max_date = max_date.strftime('%Y-%m-%d')
    return (
        f"TODAY'S UPDATE! 🩸🩸🩸\n"
        f"\n"
        f"Blood donations count today: +{total_donations_latest_date}\n"
        f"\n"
        f"Total blood donations {report_year}: {total_donations_report_year}\n"
        f"(data as of: {formatted_max_date})"
    )
#To count new donors by year and create a bar chart
def count_new_donors_by_year(data, start_year, end_year):
    """Plot and return yearly 'total' sums for the 'Malaysia' rows of ``data``.

    ``data`` needs 'state', 'date' and 'total' columns. Years outside
    ``start_year``..``end_year`` (inclusive) are ignored. The bar chart is
    written to 'output/1-New_Donors_Plot.png'.

    Returns:
        A Series indexed by year holding the summed 'total' values.
    """
    # .copy() keeps the converted/added columns off the caller's DataFrame.
    malaysia_data = data[data['state'] == 'Malaysia'].copy()
    malaysia_data['date'] = pd.to_datetime(malaysia_data['date'])

    latest_date = malaysia_data['date'].max()
    print(f"data as of: {latest_date.strftime('%Y-%m-%d')}")

    # Plain column assignment (not .loc[:, col] =) so the column really takes
    # the numeric dtype returned by to_numeric.
    malaysia_data['total'] = pd.to_numeric(malaysia_data['total'], errors='coerce')
    malaysia_data['year'] = malaysia_data['date'].dt.year

    # Group by year and sum the totals for each year in range.
    in_range = malaysia_data['year'].between(start_year, end_year)
    new_donors_by_year = malaysia_data[in_range].groupby('year')['total'].sum()

    fig = plt.figure(figsize=(9, 5))
    bars = plt.bar(new_donors_by_year.index, new_donors_by_year, color='blue', width=0.6)
    plt.xlabel('Year')
    plt.ylabel('Total')
    plt.title(f'New Donors ({start_year} to {end_year})')
    plt.xticks(new_donors_by_year.index, rotation=0)

    # Write each bar's value just above it.
    for bar in bars:
        height = bar.get_height()
        plt.text(bar.get_x() + bar.get_width() / 2.0, height, f'{int(height)}', ha='center', va='bottom')

    plt.tight_layout()
    output_folder = 'output'
    os.makedirs(output_folder, exist_ok=True)
    plt.savefig(os.path.join(output_folder, '1-New_Donors_Plot.png'))
    # pyplot keeps every figure alive until it is closed; release it once saved.
    plt.close(fig)
    return new_donors_by_year
#to plot monthly blood donation trends, and create a line chart
def plot_blood_donation_trends(data, start_year, end_year):
    """Plot monthly 'daily' sums for the 'Malaysia' rows of ``data`` as a line chart.

    ``data`` needs 'date', 'state' and 'daily' columns. Only rows whose year
    lies in ``start_year``..``end_year`` (inclusive) are used. The chart is
    written to 'output/2-Monthly_Donations_Trend.png'. The caller's DataFrame
    is not modified.
    """
    # Work on a copy of the national rows so the date conversion stays local.
    filtered_data = data[data['state'] == 'Malaysia'].copy()
    filtered_data['date'] = pd.to_datetime(filtered_data['date'])
    filtered_data = filtered_data[filtered_data['date'].dt.year.between(start_year, end_year)]

    # Group by calendar month and sum the daily donations.
    monthly_total_donations = (
        filtered_data.groupby([filtered_data['date'].dt.to_period('M')])['daily'].sum().reset_index()
    )
    monthly_total_donations['date'] = monthly_total_donations['date'].dt.to_timestamp()
    monthly_total_donations = monthly_total_donations.rename(columns={'daily': 'total_donations'})

    fig = plt.figure(figsize=(15, 6))
    sns.lineplot(x='date', y='total_donations', data=monthly_total_donations, color='maroon', marker='o')
    plt.title(f'Trend of Total Monthly Blood Donations in Malaysia ({start_year} - {end_year})')
    plt.xlabel('Month and Year')
    plt.ylabel('Total Donations')

    # One tick per month, labelled 'Mon YYYY'.
    plt.gca().xaxis.set_major_formatter(mdates.DateFormatter('%b %Y'))
    plt.gca().xaxis.set_major_locator(mdates.MonthLocator(interval=1))
    plt.xticks(rotation=90)
    plt.grid(True, which='both', linestyle='--', linewidth=0.5)

    # Annotate only the most recent point; labelling every month is too crowded.
    if not monthly_total_donations.empty:
        last_row = monthly_total_donations.iloc[-1]
        plt.text(last_row['date'], last_row['total_donations'], f"{last_row['total_donations']}", color='black', ha='center', va='bottom')

    plt.tight_layout()
    output_folder = 'output'
    os.makedirs(output_folder, exist_ok=True)
    plt.savefig(os.path.join(output_folder, '2-Monthly_Donations_Trend.png'))
    # pyplot keeps every figure alive until it is closed; release it once saved.
    plt.close(fig)
def plot_blood_donation_trends_by_state(data, start_year, end_year):
    """Plot per-state donation totals as a stacked horizontal bar chart.

    ``data`` needs 'date', 'state' and 'daily' columns. Rows whose state is
    'Malaysia' are excluded, as are years outside ``start_year``..``end_year``
    (inclusive). Each bar is stacked by year and annotated with its total and
    share of the overall total. The chart is written to
    'output/5-Donations_by_State.png'. The caller's DataFrame is not modified.
    """
    state_data = data[data['state'] != 'Malaysia'].copy()
    state_data['year'] = pd.to_datetime(state_data['date']).dt.year
    state_data = state_data[state_data['year'].between(start_year, end_year)]

    yearly_donations = state_data.groupby(['state', 'year'])['daily'].sum().reset_index()
    # Pivot so each year becomes a column (one stacked segment per year).
    pivoted_data = yearly_donations.pivot(index='state', columns='year', values='daily').fillna(0)

    # Order states by their total so the largest bar ends up at the top.
    total_donations_by_state = pivoted_data.sum(axis=1).sort_values(ascending=True)
    sorted_pivoted_data = pivoted_data.loc[total_donations_by_state.index]
    overall_total = total_donations_by_state.sum()

    # DataFrame.plot() opens its own figure unless given an Axes, which used
    # to discard the requested figsize and leave an empty figure behind.
    fig, ax = plt.subplots(figsize=(10, 10))
    sorted_pivoted_data.plot(kind='barh', stacked=True, ax=ax)
    plt.title(f'Comparison of Total Blood Donations by State ({start_year}-{end_year})')
    plt.xlabel('Total Donations')
    plt.ylabel('State')
    ax.grid(axis='x', linestyle='--', alpha=0.7)

    # Annotate each bar with its count and percentage of the overall total.
    for idx, total_donations in enumerate(total_donations_by_state):
        percentage = (total_donations / overall_total) * 100
        plt.annotate(f'{total_donations:,.0f} ({percentage:.1f}%)', (total_donations + 500, idx), fontsize=10, va='center')

    # Hide x-axis tick labels; the annotations already carry the numbers.
    plt.xticks([])
    plt.legend(title='Year')
    output_folder = 'output'
    os.makedirs(output_folder, exist_ok=True)
    plt.savefig(os.path.join(output_folder, '5-Donations_by_State.png'))
    plt.close(fig)
def plot_returning_new_donor_counts(data):
    """Plot unique 'New' vs 'Returning' donors per year as a stacked bar chart.

    ``data`` needs 'donor_id' and 'visit_date' columns. A donor counts as
    'New' in the year of their earliest visit in ``data`` and as 'Returning'
    in any later year. The chart is written to
    'output/4-Count_new_returning_donor.png'. The caller's DataFrame is not
    modified.
    """
    # Only the two needed columns are copied, so nothing is written to `data`.
    visits = data[['donor_id', 'visit_date']].copy()
    visits['donation_year'] = pd.to_datetime(visits['visit_date']).dt.year

    first_donation_year = visits.groupby('donor_id')['donation_year'].min().reset_index()
    first_donation_year = first_donation_year.rename(columns={'donation_year': 'first_donation_year'})
    data_donor_status = pd.merge(visits, first_donation_year, on='donor_id')

    # Every visit in a donor's first year is 'New'; all other visits are 'Returning'.
    data_donor_status['donor_status'] = 'Returning'
    is_first_year = data_donor_status['donation_year'] == data_donor_status['first_donation_year']
    data_donor_status.loc[is_first_year, 'donor_status'] = 'New'

    # Unique donors per (year, status). reindex() guarantees both columns
    # exist even when one status never occurs in the data.
    donor_counts_per_year = (
        data_donor_status.groupby(['donation_year', 'donor_status'])['donor_id']
        .nunique()
        .unstack(fill_value=0)
        .reindex(columns=['New', 'Returning'], fill_value=0)
    )

    fig = plt.figure(figsize=(10, 6))
    year_labels = donor_counts_per_year.index.astype(str)
    # Stacked bars: 'Returning' sits on top of 'New'.
    bars_new = plt.bar(year_labels, donor_counts_per_year['New'], color='lightgreen', label='New')
    bars_returning = plt.bar(year_labels, donor_counts_per_year['Returning'], bottom=donor_counts_per_year['New'], color='skyblue', label='Returning')
    plt.grid(axis='y', linestyle='--', alpha=0.7)

    # Label each non-empty segment with its count, centred in the segment.
    for bars in [bars_new, bars_returning]:
        for bar in bars:
            yval = bar.get_height()
            if yval > 0:
                plt.text(bar.get_x() + bar.get_width() / 2, bar.get_y() + yval / 2, int(yval), ha='center', va='center')

    plt.title('Count of New-Donors & Returning-Donors Per Year')
    plt.xlabel('Year')
    plt.ylabel('Count of Donors')
    plt.xticks(rotation=0)
    plt.legend()
    plt.tight_layout()
    output_folder = 'output'
    os.makedirs(output_folder, exist_ok=True)
    plt.savefig(os.path.join(output_folder, '4-Count_new_returning_donor.png'))
    # pyplot keeps every figure alive until it is closed; release it once saved.
    plt.close(fig)
def plot_donor_counts_by_age_and_year(data, start_year, end_year):
    """Plot unique donors per age group and donation year as grouped bars.

    ``data`` needs 'donor_id', 'visit_date' and 'birth_date' columns. Age is
    computed as visit year minus 'birth_date' (so 'birth_date' is presumably a
    birth year -- TODO confirm against the dataset). Ages outside 17..54 fall
    into no bin and are left out. Only years in ``start_year``..``end_year``
    (inclusive) are plotted. The chart is written to
    'output/6-Donor_Count_Age_Year.png'. The caller's DataFrame is not modified.
    """
    # Half-open bins [17, 25), [25, 30), ... [50, 55); labels are derived from
    # the edges so the two can never drift apart.
    bins = [17, 25, 30, 35, 40, 45, 50, 55]
    labels = [f'{low}-{high - 1}' for low, high in zip(bins, bins[1:])]

    donors = data[['donor_id', 'visit_date', 'birth_date']].copy()
    donors['donation_year'] = pd.to_datetime(donors['visit_date']).dt.year
    # Keep only the requested years before doing any further work.
    donors = donors[donors['donation_year'].between(start_year, end_year)].copy()
    donors['age_at_visit'] = donors['donation_year'] - donors['birth_date']
    donors['age_group'] = pd.cut(donors['age_at_visit'], bins=bins, labels=labels, right=False)

    # Count unique donor_id values. observed=True is passed explicitly because
    # the observed=False default triggers a pandas FutureWarning.
    age_group_year_counts = (
        donors.groupby(['age_group', 'donation_year'], observed=True)['donor_id'].nunique().reset_index()
    )
    age_group_year_counts = age_group_year_counts[age_group_year_counts['age_group'].isin(labels)]

    fig = plt.figure(figsize=(10, 6))
    # hue splits each age group into one bar per donation year.
    sns.barplot(x='age_group', y='donor_id', hue='donation_year', data=age_group_year_counts)
    plt.title(f'Count of Donors by Age Group and Year ({start_year}-{end_year})')
    plt.xlabel('Age Group')
    plt.ylabel('Count of Donors')
    plt.legend(title='Donation Year', bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.tight_layout()
    output_folder = 'output'
    os.makedirs(output_folder, exist_ok=True)
    plt.savefig(os.path.join(output_folder, '6-Donor_Count_Age_Year.png'))
    # pyplot keeps every figure alive until it is closed; release it once saved.
    plt.close(fig)
def plot_donor_retention_heatmap(data, donated_min_x_times):
    """Plot a cohort heatmap of donors who keep donating in later years.

    ``data`` needs 'donor_id' and 'visit_date' columns. For each year Y, the
    cohort is every donor with at least ``donated_min_x_times`` visits in Y.
    Cell (Y, N) is the rounded percentage of that cohort with at least one
    visit in year Y + N. Zero cells are shown blank. The chart is written to
    'output/7-Retention_Rate_Heatmap.png'. The caller's DataFrame is not
    modified.
    """
    # Only the two needed columns are copied, so nothing is written to `data`.
    visits = data[['donor_id', 'visit_date']].copy()
    visits['visit_year'] = pd.to_datetime(visits['visit_date']).dt.year
    min_year = int(visits['visit_year'].min())
    max_year = int(visits['visit_year'].max())
    cohort_years = range(min_year, max_year + 1)

    # One row per (donor, year) with that donor's number of visits in the year.
    donor_yearly_counts = visits.groupby(['donor_id', 'visit_year']).size().reset_index(name='donation_count')
    repeating_donors = donor_yearly_counts[donor_yearly_counts['donation_count'] >= donated_min_x_times]

    # Integer frame pre-filled with 0: the heatmap's fmt="d" needs real ints,
    # and this avoids relying on fillna() downcasting an object-dtype frame.
    percentage_data = pd.DataFrame(0, index=cohort_years, columns=range(len(cohort_years)), dtype=int)

    for first_year in cohort_years:
        first_year_donors = repeating_donors.loc[repeating_donors['visit_year'] == first_year, 'donor_id']
        cohort_size = len(first_year_donors)
        if cohort_size == 0:
            continue  # empty cohort: the whole row stays 0

        # The cohort membership filter is computed once per cohort rather
        # than once per (cohort, N) cell.
        cohort_rows = donor_yearly_counts[donor_yearly_counts['donor_id'].isin(first_year_donors)]
        active_by_year = cohort_rows.groupby('visit_year')['donor_id'].nunique()

        for N in range(max_year - first_year + 1):
            still_donating_count = int(active_by_year.get(first_year + N, 0))
            percentage_data.at[first_year, N] = round((still_donating_count / cohort_size) * 100)

    fig = plt.figure(figsize=(8, 8))
    ax = sns.heatmap(percentage_data, annot=True, cmap="Blues", fmt="d", cbar=False)
    plt.title(f'% of Donors Still Donating After N Years (at least {donated_min_x_times}x times)')
    plt.xlabel('N Years since the donors first donation')
    plt.ylabel('Year (cohort)')
    ax.xaxis.tick_top()
    ax.xaxis.set_label_position('top')
    plt.yticks(rotation=0)

    # Sample interpretation text for one fixed example cell.
    example_year = 2023
    example_n_years = 1
    # Guard both axes: the example cohort year may be missing from the data,
    # which previously raised KeyError on the .loc lookup.
    if example_year in percentage_data.index and example_n_years in percentage_data.columns:
        example_percentage = percentage_data.loc[example_year, example_n_years]
        sample_interpretation = (f"Sample Interpretation:\n"
                                 f"In cohort year {example_year}, {example_percentage}% of those who donated\n"
                                 f"at least {donated_min_x_times}x in {example_year} have\n"
                                 f"made donation after {example_n_years} year({example_year + example_n_years}).")
    else:
        sample_interpretation = "Sample interpretation not available for the selected N years."
    ax.text(5, 10, sample_interpretation, fontsize=10, color='black', ha='left', va='center')

    # Blank out the '0' annotations so empty cells do not clutter the plot.
    for text in ax.texts:
        if text.get_text() == '0':
            text.set_text('')

    output_folder = 'output'
    os.makedirs(output_folder, exist_ok=True)
    plt.savefig(os.path.join(output_folder, '7-Retention_Rate_Heatmap.png'))
    # This function runs several times per script run; close the figure so
    # the figures do not pile up in pyplot's registry.
    plt.close(fig)
#====================================MAIN===========================================
async def main():
    """Build every plot and send the full report to the Telegram chat.

    Reads all CSV files in './data-darah-public' (the 'newdonors_state' and
    'donations_state' files are required) and the parquet data at
    './data-granular/ds-data-granular'. Plots are written to 'output/' by the
    plotting helpers and sent with captions.

    Raises:
        KeyError: if a required CSV dataset is missing from the folder.
    """
    folder_path = './data-darah-public'

    # Load all CSV files in the folder, keyed by file name without extension.
    datasets = {
        os.path.splitext(file_name)[0]: load_data(os.path.join(folder_path, file_name))
        for file_name in os.listdir(folder_path)
        if file_name.endswith('.csv')
    }
    newdonors_state = datasets['newdonors_state']
    donations_state = datasets['donations_state']

    # Year range (inclusive) covered by the plots.
    start_year = 2019
    end_year = 2024

    # NOTE(review): several caption emoji below appear mis-decoded in this copy
    # of the file and cannot be recovered unambiguously from here; they are
    # kept verbatim -- restore them from the original source.

    # ==== Part 1 - Trends ====
    await send_latest_donation_info(donations_state)
    count_new_donors_by_year(newdonors_state, start_year, end_year)
    plot_blood_donation_trends(donations_state, start_year, end_year)
    plot_blood_donation_trends_by_state(donations_state, start_year, end_year)
    await send_image_with_caption('output/1-New_Donors_Plot.png', "How many new donors this year?🥳")
    await send_image_with_caption('output/2-Monthly_Donations_Trend.png', "Monthly Donation Trend!")
    await send_image_with_caption('output/5-Donations_by_State.png', "Which state in Malaysia contributes most donation?")

    # Plot 6 is built from the granular data, so it must be generated before
    # it is sent. It used to be sent first, which fails on a fresh run or
    # sends a stale image from an earlier run.
    retention_data_path = './data-granular/ds-data-granular'
    retention_data = pd.read_parquet(retention_data_path)
    plot_donor_counts_by_age_and_year(retention_data, start_year, end_year)
    await send_image_with_caption('output/6-Donor_Count_Age_Year.png', "Which age group contributes most donation per Year? π")

    # ==== Part 2 - Retention rate ====
    plot_returning_new_donor_counts(retention_data)
    await send_image_with_caption('output/4-Count_new_returning_donor.png', "Donor Retention: Does the previous donor come back? or we gain more Newbies each year?πΆ")

    await bot.send_message(chat_id=chat_id, text="---DONOR RETENTION DATA---\n"
                           "The following heatmap plot will show % of donors who donated\n"
                           "at least x times within the first year of their donation, and continues to donate in the following years~")

    # The heatmap is regenerated into the same file for each threshold and
    # sent immediately, before the next pass overwrites it.
    heatmap_runs = (
        (1, "Donated at least 1x time π₯"),
        (3, "Donated at least 3x times π₯π₯"),
        (6, "Donated at least 6x times π₯π₯π₯π₯"),
    )
    for min_times, caption in heatmap_runs:
        plot_donor_retention_heatmap(retention_data, donated_min_x_times=min_times)
        await send_image_with_caption('output/7-Retention_Rate_Heatmap.png', caption)

    # === shortcut to send all images in folder ===
    # await send_all_images_in_folder('output')
if __name__ == "__main__":
    asyncio.run(main())

    # Consume the flag after a successful run so the same data is not reported
    # again. If main() raises, this line is never reached and the flag stays.
    # EAFP: remove directly instead of exists() + remove(), so a file that
    # vanishes in between cannot raise here.
    try:
        os.remove('data_fetched.txt')
    except FileNotFoundError:
        pass