-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathclean_data.py
66 lines (46 loc) · 1.77 KB
/
clean_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
import sys
import pandas as pd
def test_legitimacy(grouped):
    """
    Print every day whose maximum temperature is above 100 °F.

    Each matching row is written to stdout as ``index min max mean`` so the
    readings can be checked by hand. For the dataset this script was written
    against, three days are reported:

        Time                 min   max    mean
        2011-05-04 00:00:00  53.6  101.8  68.46319444444441
        2017-09-01 00:00:00  66.6  107.5  84.26629629629628
        2017-09-02 00:00:00  73.4  104.4  86.59037037037035

    News stories published on those dates cited unusually high temperatures,
    so these readings are considered legitimate.
    """
    # Select the suspicious days first, then report only those rows.
    suspicious_days = grouped[grouped["max"] > 100]
    for day, stats in suspicious_days.iterrows():
        print(day, stats["min"], stats["max"], stats["mean"])
def fill_nans(grouped):
    """
    Forward-fill missing values in place and return the same frame.

    Every NaN is replaced by the most recent non-NaN value above it in the
    same column. NaNs at the very start of a column have no earlier value to
    copy from and are therefore left as NaN.

    Parameters:
        grouped: pandas DataFrame to fill; it is modified in place.

    Returns:
        The same DataFrame object that was passed in.
    """
    # `fillna(method="ffill")` is deprecated since pandas 2.1 and rejected by
    # newer releases; `ffill` is the direct, long-standing equivalent.
    grouped.ffill(inplace=True)
    return grouped
def make_negatives_nans(grouped):
    """
    Replace readings below a per-column lower bound with NaN.

    In the "min" and "max" columns every value below 0 becomes NaN; in the
    "mean" column every value below 12.46 becomes NaN. Values equal to the
    bound, and existing NaNs, are left as they are. The frame is modified in
    place and also returned.

    NOTE(review): the source does not explain where the 12.46 cutoff for the
    mean comes from - confirm with the data owner before changing it.
    """
    # Column name -> smallest value that is still accepted.
    lower_bounds = {"min": 0, "max": 0, "mean": 12.46}
    for column, bound in lower_bounds.items():
        readings = grouped[column]
        # `mask` swaps in NaN wherever the condition holds.
        grouped[column] = readings.mask(readings < bound)
    return grouped
def group_data(df):
    """
    Summarise the "TemperatureF" column per calendar day.

    Rows are bucketed by day using the frame's index (a daily `pd.Grouper`),
    and for each bucket the minimum, maximum and mean of "TemperatureF" are
    computed.

    Returns a DataFrame with the columns "min", "max" and "mean".
    """
    daily_bins = pd.Grouper(freq="D")
    temperature_by_day = df.groupby(daily_bins)["TemperatureF"]
    summary = temperature_by_day.agg(["min", "max", "mean"])
    return summary
def process_data(df):
    """
    Run the full cleaning pipeline on the raw readings.

    The frame is first aggregated with `group_data`; the result is then
    passed through `make_negatives_nans` and `fill_nans`, in that order, and
    the final frame is returned.
    """
    daily = group_data(df)
    # Order matters: values are turned into NaN first, then NaNs are filled.
    for cleaning_step in (make_negatives_nans, fill_nans):
        daily = cleaning_step(daily)
    return daily
def main():
    """
    Clean the raw temperature CSV and write the processed result to disk.

    Reads "temp_data_raw.csv" using the "Time" column as a parsed datetime
    index, drops every column whose name contains "Unnamed", runs
    `process_data`, reports the days flagged by `test_legitimacy`, and writes
    the processed frame to "temp_data_cleaned.csv".

    If the raw file does not exist, a hint is printed and the process exits
    with status 1. Any other error (for example a malformed CSV) propagates
    so the real cause stays visible.
    """
    try:
        df = pd.read_csv("temp_data_raw.csv", index_col="Time", parse_dates=True)
    except FileNotFoundError:
        # Only a missing file means the scraper has not been run yet; a bare
        # `except:` would also hide parse errors and Ctrl-C behind this hint.
        print("Please run `scrape_data.py` to generate the raw data first.")
        # Non-zero status so shells and pipelines can detect the failure.
        sys.exit(1)

    # Drop columns whose header contains "Unnamed".
    unnamed_columns = df.columns[df.columns.str.contains("Unnamed")]
    df.drop(unnamed_columns, axis=1, inplace=True)

    grouped = process_data(df)
    test_legitimacy(grouped)
    grouped.to_csv("temp_data_cleaned.csv")
# Run the cleaning pipeline only when executed as a script, not on import.
if __name__ == "__main__":
    main()