-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathC2Analyse.py
173 lines (150 loc) · 5.89 KB
/
C2Analyse.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
#%%
import json
import pandas as pd
import numpy as np
import traceback
from datetime import datetime
class df():
    """Hold the athlete, workout and extended-workout tables and their merge.

    Attributes:
        athletes: DataFrame indexed by ``profile_id`` when loaded from JSON,
            or None before loading / when the file was missing.
        extended: DataFrame indexed by ``workout_id`` when loaded from JSON,
            or None.
        workouts: DataFrame indexed by ``workout_id`` when loaded from JSON,
            or None.
        list: ``[athletes, extended, workouts]`` as of the last ``set_list()``.
        merge: result of the last ``merge_frames()`` call, or None.
    """

    def __init__(self):
        self.athletes = None
        self.extended = None
        self.workouts = None
        # Defined up front so print_lengths() can run before merge_frames().
        self.merge = None
        self.set_list()

    def load_JSONs(self, path_folder):
        """Load the three C2*.json files found in *path_folder*."""
        self.athletes = self.df_from_file(f"{path_folder}/C2Athletes.json", "profile_id")
        self.extended = self.df_from_file(f"{path_folder}/C2Extended.json", "workout_id")
        self.workouts = self.df_from_file(f"{path_folder}/C2Workouts.json", "workout_id")
        self.set_list()

    def load_csvs(self, path_folder):
        """Load the three tables from the CSV files in ``analysis/``.

        NOTE(review): *path_folder* is accepted but not used; the files are
        always read from the relative ``analysis/`` directory. Kept as-is so
        existing callers are unaffected -- confirm whether it should be used.
        """
        self.athletes = pd.read_csv("analysis/athletes.csv", sep=",", index_col=0)
        self.extended = pd.read_csv("analysis/extended.csv", sep=",", index_col=0)
        self.workouts = pd.read_csv("analysis/workouts.csv", sep=",", index_col=0)
        self.set_list()

    def set_list(self):
        """Refresh ``self.list`` from the three table attributes."""
        self.list = [self.athletes, self.extended, self.workouts]

    def df_from_file(self, path, index_name="id"):
        """Read a JSON object of records into a DataFrame.

        The top-level JSON keys become the index (named *index_name*) and the
        keys of each nested object become the columns.

        Args:
            path: path of the JSON file to read.
            index_name: name given to the resulting index.

        Returns:
            The DataFrame, or None when *path* does not exist (a message is
            printed in that case).
        """
        try:
            # The context manager closes the handle even if parsing fails.
            with open(path, "r", encoding="utf-8") as handle:
                records = json.load(handle)
        except FileNotFoundError:
            print(f"Could not load JSON file: {path}")
            return None
        frame = pd.DataFrame.from_dict(records).T
        frame.index.set_names(index_name, inplace=True)
        # Normalise every flavour of "missing" to NaN.
        frame.replace({"": np.nan, "None": np.nan, None: np.nan}, inplace=True)
        return frame

    def write_csv(self, dfs, paths):
        """Write each frame in *dfs* to the matching entry of *paths*.

        Frames that are None (a table whose JSON file could not be loaded)
        are reported and skipped instead of raising AttributeError.
        """
        for frame, path in zip(dfs, paths):
            if frame is None:
                print(f"No data to write to csv file: {path}")
                continue
            frame.to_csv(path)

    def merge_frames(self, how="inner"):
        """Join workouts with athletes and extended data into ``self.merge``.

        Workouts are joined to athletes on ``profile_id`` against the athletes
        index, and the result is joined on ``workout_id`` against the extended
        index. Only ``left_on`` + ``right_index`` are passed: when
        ``right_index=True`` the right-hand keys come from the index, so an
        additional ``right_on`` is redundant.

        Args:
            how: pandas merge strategy used for both joins.
        """
        with_athletes = pd.merge(
            left=self.workouts,
            right=self.athletes,
            left_on='profile_id',
            right_index=True,
            how=how,
        )
        self.merge = pd.merge(
            left=with_athletes,
            right=self.extended,
            left_on='workout_id',
            right_index=True,
            how=how,
        )

    def print_lengths(self):
        """Print the row count of every table and of the merged frame.

        Tables that have not been loaded / merged yet are reported as
        "not available" rather than raising.
        """
        report = (
            ("workouts", self.workouts),
            ("athletes", self.athletes),
            ("extended workout data", self.extended),
            ("merged data", self.merge),
        )
        for label, frame in report:
            count = "not available" if frame is None else len(frame)
            print(f"Number of {label}: {count}")
class Clean():
    """Front end that loads the data tables through a ``df`` container.

    Attributes:
        df: the ``df`` instance holding the loaded tables.
        verbose: progress messages are printed only when this equals 1.
        ft_to_cm, in_to_cm: unit conversion factors (not referenced by the
            methods of this class).
    """

    def __init__(self, verbose = 0):
        self.verbose = verbose
        self.df = df()
        self.in_to_cm = 2.54
        self.ft_to_cm = 30.48

    def _announce(self, message):
        # Progress output is opt-in: only verbosity level 1 prints.
        if self.verbose == 1:
            print(message)

    def load_JSON(self, path_folder="./output/"):
        """Load the JSON files from *path_folder* and dump them as CSV.

        The three tables are written to fixed paths under ``analysis/``.
        """
        csv_targets = [
            "analysis/athletes.csv",
            "analysis/extended.csv",
            "analysis/workouts.csv",
        ]
        self._announce("Loading JSON.")
        self.df.load_JSONs(path_folder)
        self.df.write_csv(self.df.list, csv_targets)
        self._announce("Loaded.")

    def load_csv(self, path_folder):
        """Load the tables from CSV by delegating to ``df.load_csvs``."""
        self.df.load_csvs(path_folder)
def convert_to_datetime(date_str):
    """Parse a date string into a ``datetime``.

    Supported layouts, tried in this order:
    ``'%B %d, %Y'``, ``'%B %d, %Y %H:%M:%S'`` and ``'%d-%m-%Y %H:%M:%S'``.
    Surrounding whitespace is ignored.

    Args:
        date_str: a string to parse, or a value that is already a datetime.

    Returns:
        The input itself when it is already a ``datetime``; the parsed
        ``datetime`` for the first matching layout; otherwise None (no layout
        matched, or the value is neither a string nor a datetime).
    """
    if isinstance(date_str, datetime):
        return date_str
    if not isinstance(date_str, str):
        return None

    text = date_str.strip()
    for layout in ('%B %d, %Y', '%B %d, %Y %H:%M:%S', '%d-%m-%Y %H:%M:%S'):
        try:
            return datetime.strptime(text, layout)
        except ValueError:
            # This layout does not fit; try the next one.
            continue
    return None
def convert_heights(df_heights):
    """Convert heights written as ``"<feet> ft <inches> in"`` to centimetres.

    The unit markers are stripped, the remaining text is split on whitespace
    into a feet column and an inches column, and the two are combined into
    centimetres rounded to one decimal place.

    Args:
        df_heights: Series of height strings.

    Returns:
        A Series named ``"height"`` holding centimetres. When the input does
        not support the string operations (AttributeError, e.g. a numeric
        Series that was converted earlier) a message is printed and the input
        is returned unchanged, so assigning the result back to a column does
        not wipe data that is already converted.
    """
    ft_to_cm = 30.48
    in_to_cm = 2.54
    datatypes = {0: float, 1: float}
    try:
        parts = df_heights.replace({' ft ': ' ', ' in': ' '}, regex=True)
        parts = parts.str.split(expand=True)
        parts = parts.astype(datatypes)
        parts["height"] = round(parts[0] * ft_to_cm + parts[1] * in_to_cm, 1)
    except AttributeError:
        print("Looks like height is already converted, skipping")
        return df_heights
    return parts["height"]
def clean_heights(height):
    """Sanitise a single height value given in centimetres.

    Values strictly between 4300 and 6096 are treated as a unit mix-up
    (a centimetre figure that was entered as feet) and are divided by 30.48
    and rounded to a whole number. Any other value outside the 60-272 range
    is considered impossible and becomes NaN; everything else is returned
    unchanged.
    """
    cm_per_ft = 30.48
    # Plausible bounds for a human height, in cm.
    shortest_plausible, tallest_plausible = 60, 272
    # Window in which the value is assumed to be cm mistakenly scaled as feet.
    mixup_floor, mixup_ceiling = 4300, 6096

    if mixup_floor < height < mixup_ceiling:
        return round(height * 1/cm_per_ft, 0)
    if height < shortest_plausible or height > tallest_plausible:
        return np.nan
    return height
def convert_weights(df_weight):
    """Convert weights written as ``"<number> lb"`` to kilograms.

    The `` lb`` marker is removed, the remainder is cast to float and
    multiplied by 1/2.2046, rounded to one decimal place.

    NOTE(review): a Series that is already numeric is scaled as well, so
    calling this twice on the same data converts twice -- confirm callers
    only pass pound values.

    Args:
        df_weight: Series of weight strings.

    Returns:
        A Series of kilograms, or None (after printing a message) when the
        input does not support the required operations (AttributeError).
    """
    lbs_kg = 1/2.2046
    try:
        pounds = df_weight.replace({' lb': ''}, regex=True)
        pounds = pounds.astype(float)
        kilograms = round(pounds * lbs_kg, 1)
    except AttributeError:
        print("Something went wrong")
        return None
    return kilograms
def duration_string_to_duration_seconds(duration_string):
    """Convert a pace / time string to a number of seconds.

    The string is split on ``":"`` and read from the right: the last field
    is seconds (may be fractional), the one before it whole minutes and the
    one before that whole hours. A string without any colon is read as plain
    seconds instead of raising IndexError.

    Args:
        duration_string: e.g. ``"1:45.3"`` or ``"1:02:03"``; numbers are
            assumed to be seconds already.

    Returns:
        Seconds as a float for string input; the input itself when it is an
        int or float; None for blank strings and for any other type.

    Raises:
        ValueError: when a field of the string is not numeric.
    """
    if isinstance(duration_string, (int, float)):
        return duration_string
    if not isinstance(duration_string, str):
        return None

    text = duration_string.strip()
    if not text:
        return None

    fields = text.split(":")
    duration_seconds = float(fields[-1])
    # Walk leftwards from the seconds field: minutes first, then hours.
    for field, unit_seconds in zip(reversed(fields[:-1]), (60, 3600)):
        duration_seconds += int(field) * unit_seconds
    return duration_seconds