-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcreate_datasets.py
115 lines (81 loc) · 3.64 KB
/
create_datasets.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import requests
# State code: selects the per-state timeseries file in the API URL and is also
# the top-level key used to index the decoded payload.
code = 'MH'

# Download the state's timeseries once, at import time.
# The timeout keeps an unreachable host from blocking the import forever, and
# raise_for_status() reports an HTTP error directly instead of letting an error
# page surface later as a confusing JSON decode failure.
response = requests.get(
    'https://api.covid19india.org/v4/min/timeseries-{}.min.json'.format(code),
    timeout=30,
)
response.raise_for_status()
data = response.json()
def transform_func(x, ATTR_type, i):
    """Pull one scalar out of a table cell.

    Args:
        x: The cell value; either a missing marker (None/NaN) or a mapping
            whose ``ATTR_type`` entry is an indexable sequence.
        ATTR_type: Key to look up in ``x``.
        i: Position to read inside ``x[ATTR_type]``.

    Returns:
        ``0`` when the cell is missing, otherwise ``x[ATTR_type][i]``.
    """
    return 0 if pd.isnull(x) else x[ATTR_type][i]
def create_TxN_df(dict_data, ATTR_type, ATTR_val):
    """Create a single-scalar dataframe for the selected data type.

    Args:
        dict_data: Nested mapping ``{district: {date: {series: [values]}}}``.
            Each ``values`` list is expected to be ordered as confirmed,
            deceased, recovered, vaccinated1, vaccinated2 (the order in which
            ``get_data_by_district`` builds it).
        ATTR_type: Series key to read from every daily record
            (for example ``'delta'``, ``'delta7'`` or ``'total'``).
        ATTR_val: Name of the attribute to extract from that series.

    Returns:
        A DataFrame with one column per district and one row per date, where
        every cell is the selected scalar. Dates missing for a district are
        filled with ``0``.

    Raises:
        ValueError: If ``ATTR_val`` is not one of the known attribute names.
    """
    # Position of each attribute inside a per-day values list. The original
    # if/elif chain only covered the first three names and left the index
    # unbound for anything else.
    attr_positions = {
        'confirmed': 0,
        'deceased': 1,
        'recovered': 2,
        'vaccinated1': 3,
        'vaccinated2': 4,
    }
    if ATTR_val not in attr_positions:
        raise ValueError(
            "Unknown ATTR_val {!r}; expected one of {}".format(
                ATTR_val, sorted(attr_positions)
            )
        )
    i = attr_positions[ATTR_val]

    # Build the frame from the argument. The previous version read a global
    # named ALL_DISTRICT_DATA, which silently ignored ``dict_data`` and raised
    # NameError when no such global existed.
    df = pd.DataFrame.from_dict(dict_data)

    def pick_scalar(cell):
        # Cells are NaN where a district has no record for that date.
        return 0 if pd.isnull(cell) else cell[ATTR_type][i]

    for col in df.columns:
        df[col] = df[col].apply(pick_scalar)
    return df
def get_data_by_district(district_name, SKIP_START=0, SKIP_END=0, TOTAL_LEN=444):
    """Collect the per-day feature vectors for one district.

    Reads the module-level ``data`` payload under
    ``data[code]['districts'][district_name]['dates']`` and walks its dates in
    stored order, counting from 1.

    Args:
        district_name: District key as it appears in the payload.
        SKIP_START: Dates at positions ``<= SKIP_START`` are dropped.
        SKIP_END: Together with ``TOTAL_LEN`` sets the upper cut-off.
        TOTAL_LEN: Dates at positions ``>= TOTAL_LEN - SKIP_END`` are dropped.

    Returns:
        A tuple ``(info, first_date, last_date)`` where ``info`` maps each kept
        date to ``{'delta': [...], 'delta7': [...], 'total': [...]}``; every
        list holds confirmed, deceased, recovered, vaccinated1, vaccinated2 in
        that order, with ``0`` for anything absent from the day's record.
        ``first_date`` and ``last_date`` are the min and max of the kept date
        keys.

    Raises:
        ValueError: If no date survives the window (min/max of an empty dict).
    """
    attribute_order = ('confirmed', 'deceased', 'recovered', 'vaccinated1', 'vaccinated2')
    series_keys = ('delta', 'delta7', 'total')

    records_by_date = data[code]['districts'][district_name]['dates']
    cutoff = TOTAL_LEN - SKIP_END

    kept = {}
    for position, (day, record) in enumerate(records_by_date.items(), start=1):
        # Keep only positions strictly inside (SKIP_START, cutoff).
        if position <= SKIP_START or position >= cutoff:
            continue
        kept[day] = {
            series: [record.get(series, {}).get(attr, 0) for attr in attribute_order]
            for series in series_keys
        }

    first_date = min(kept)
    last_date = max(kept)
    return kept, first_date, last_date
def get_all_data(ATTR_type, ATTR_val):
    """Export the district time series and the adjacency matrix as CSV files.

    For every district in the built-in list (except the skipped ones) the
    per-day data is fetched with ``get_data_by_district``, reduced to one
    scalar per day with ``create_TxN_df`` and written, without header or index,
    to ``data/train/road_traffic/covid/vel.csv``.

    The adjacency matrix is then built from ``adjacency_matrix.csv`` (read
    relative to the current working directory; columns ``city1``, ``city2``
    and ``distance`` are used). Rows touching a skipped district are removed,
    distances are divided by the largest remaining distance, and the result is
    pivoted into a city1-by-city2 table written, without header or index, to
    ``data/train/road_traffic/covid/adj_mat.csv``.

    Args:
        ATTR_type: Series key forwarded to ``create_TxN_df``.
        ATTR_val: Attribute name forwarded to ``create_TxN_df``.

    Returns:
        None. The results are the two CSV files; progress is printed to stdout.
    """
    # Get a dataframe of cases for all cities.
    init_skip_val = 15
    end_skip_val = 1
    DISTRICT_NAMES = ['ahmednagar', 'akola', 'amravati', 'aurangabad', 'beed',
                      'bhandara', 'buldhana', 'chandrapur', 'dhule', 'gadchiroli',
                      'gondia', 'hingoli', 'jalgaon', 'jalna', 'kolhapur', 'latur',
                      'mumbai', 'nagpur', 'nanded', 'nandurbar', 'nashik', 'osmanabad',
                      'palghar', 'parbhani', 'pune', 'ratnagiri', 'sangli', 'satara',
                      'sindhudurg', 'solapur', 'thane', 'wardha', 'washim', 'yavatmal']
    SKIP_DISTRICTS = ['Gadchiroli', 'Wardha']

    ALL_DISTRICT_DATA = dict()
    for NAME in DISTRICT_NAMES:
        # The payload is looked up by title-cased district name.
        NAME = NAME.title().strip()
        if NAME in SKIP_DISTRICTS:
            print("Skipping: ", NAME)
            continue
        ALL_DISTRICT_DATA[NAME], min_val, max_val = get_data_by_district(
            NAME, SKIP_START=init_skip_val, SKIP_END=end_skip_val)
        print(NAME, ": ", len(ALL_DISTRICT_DATA[NAME].keys()), " | ", min_val, max_val)
        # NOTE(review): assumed to be a per-district separator printed inside
        # the loop; confirm against the original layout.
        print('---'*4)
    print("--------------------------------")
    print("All Districts: ", len(ALL_DISTRICT_DATA.keys()))

    # Make sure the output directory exists so the CSV writes below do not
    # fail on a fresh checkout.
    os.makedirs('data/train/road_traffic/covid', exist_ok=True)

    vector_df = create_TxN_df(ALL_DISTRICT_DATA, ATTR_type, ATTR_val)
    pd.DataFrame(vector_df.values).to_csv(
        'data/train/road_traffic/covid/vel.csv', header=False, index=False)  # Save file

    # Create the adjacency matrix.
    df_adj = pd.read_csv('adjacency_matrix.csv')
    # City names are compared in lower case against the skip list.
    skipped_lower = [x.lower() for x in SKIP_DISTRICTS]
    keep_rows = ~df_adj['city1'].isin(skipped_lower) & ~df_adj['city2'].isin(skipped_lower)
    # Copy the filtered rows so the column assignment below writes to an
    # independent frame rather than to a slice of the frame that was read.
    df_adj = df_adj[keep_rows].copy()
    # Normalise by the largest remaining distance; Series.max() ignores NaN,
    # unlike the builtin max() whose result depends on where a NaN sits.
    df_adj['distance'] = df_adj['distance'] / df_adj['distance'].max()
    # NOTE(review): the pivot orders rows/columns by city label while vel.csv
    # columns follow DISTRICT_NAMES order; confirm the two orderings agree.
    adj_finl_data = df_adj.pivot_table(values='distance', index=['city1'], columns='city2')
    print(os.getcwd())
    pd.DataFrame(adj_finl_data.values).to_csv(
        'data/train/road_traffic/covid/adj_mat.csv', header=False, index=False)  # Save file