-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathDiversity_RepresentationInTech.py
54 lines (41 loc) · 1.51 KB
/
Diversity_RepresentationInTech.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
import pandas as pd
from collections import defaultdict
import numpy as np
def total_count(df, col1, col2, look_for):
    '''
    Sum the values of one column over the rows whose entry contains each search term.

    INPUT:
    df - the pandas dataframe you want to search
    col1 - the name of the column whose values are searched
    col2 - the name of the column whose values are summed (each is cast with int())
    look_for - a list of strings to search for in each row of df[col1]

    OUTPUT:
    new_df - a dataframe with columns [col1, col2] holding one row per look_for
             entry that matched at least one row, together with its summed col2
             value, sorted by col2 in descending order
    '''
    totals = defaultdict(int)
    # Pair the two columns positionally so the function also works when df does
    # not carry a default 0..n-1 index. Cells that cannot be searched with `in`
    # (e.g. NaN/None for missing answers) are dropped up front.
    rows = [
        (cell, amount)
        for cell, amount in zip(df[col1], df[col2])
        if hasattr(cell, '__contains__')
    ]
    for term in look_for:
        for cell, amount in rows:
            if term in cell:
                totals[term] += int(amount)
    new_df = pd.DataFrame(list(totals.items()), columns=[col1, col2])
    # Sort on the caller-supplied column name; a hard-coded 'count' only works
    # when col2 happens to be called 'count'.
    return new_df.sort_values(col2, ascending=False)
def clean_race(race_item):
    '''
    Reduce a raw race entry to the first known category it contains.

    INPUT
    race_item - line item of the dataframe column (converted to str before matching)
    OUTPUT
    val - the first entry of the known categories, in the order listed below,
          that occurs as a substring of race_item; None when none occurs
    '''
    # Order matters: when several categories appear in one answer, the earliest
    # one in this sequence is the one reported.
    known_categories = (
        'White or of European descent',
        'South Asian',
        'Hispanic or Latino/Latina',
        'East Asian',
        'Middle Eastern',
        'Native American, Pacific Islander, or Indigenous Australian',
        'I prefer not to say',
        'Black or of African descent',
        "I don't know",
    )
    text = str(race_item)
    return next(
        (category for category in known_categories if category in text),
        None,
    )