diff --git a/api/requirements-dev.txt b/api/requirements-dev.txt index 9c2a97a9..83ec409e 100644 --- a/api/requirements-dev.txt +++ b/api/requirements-dev.txt @@ -45,6 +45,7 @@ packaging==24.0 # pyproject-api # pytest # tox +pandas==1.5.3 platformdirs==4.2.0 # via # tox diff --git a/api/requirements-minimal.txt b/api/requirements-minimal.txt index d1a02eba..3a90b927 100644 --- a/api/requirements-minimal.txt +++ b/api/requirements-minimal.txt @@ -5,6 +5,7 @@ flask-api-utils Flask-SQLAlchemy httplib2 networkx +pandas psycopg2-binary pydantic pytz diff --git a/api/requirements.txt b/api/requirements.txt index 48ca6209..5e4ff867 100644 --- a/api/requirements.txt +++ b/api/requirements.txt @@ -53,6 +53,7 @@ markupsafe==2.1.5 # werkzeug networkx==3.2.1 # via -r requirements-minimal.txt +pandas==1.5.3 psycopg2-binary==2.9.9 # via -r requirements-minimal.txt pydantic==2.6.4 diff --git a/api/tests/matching/match_test.py b/api/tests/matching/match_test.py index 7b4448a5..1a6d114c 100644 --- a/api/tests/matching/match_test.py +++ b/api/tests/matching/match_test.py @@ -5,6 +5,8 @@ from yelp_beans.logic.subscription import get_specs_from_subscription from yelp_beans.logic.subscription import store_specs_from_subscription from yelp_beans.matching.match import generate_meetings +from yelp_beans.matching.match_utils import get_meeting_weights +from yelp_beans.matching.pair_match import get_disallowed_meetings from yelp_beans.models import Meeting from yelp_beans.models import MeetingParticipant from yelp_beans.models import MeetingRequest @@ -25,9 +27,29 @@ def test_generate_meetings_same_department(session, subscription): preference = subscription.datetime[0] user_pref = UserSubscriptionPreferences(preference=preference, subscription=subscription) session.add(user_pref) - user1 = User(email="a@yelp.com", meta_data={"department": "dept"}, subscription_preferences=[user_pref]) + user1 = User( + id=1, + email="a@yelp.com", + meta_data={"department": "dept"}, + 
subscription_preferences=[user_pref], + manager_id="0", + languages="en, fr", + days_since_start=100, + employee_id="101", + location="UK, London", + ) session.add(user1) - user2 = User(email="b@yelp.com", meta_data={"department": "dept"}, subscription_preferences=[user_pref]) + user2 = User( + id=2, + email="b@yelp.com", + meta_data={"department": "dept"}, + subscription_preferences=[user_pref], + manager_id="101", + languages="en, fr", + days_since_start=100, + employee_id="102", + location="CA, London", + ) session.add(user2) user_list = [user1, user2] session.commit() @@ -47,13 +69,53 @@ def test_generate_meetings_with_history(session, subscription): user_pref = UserSubscriptionPreferences(preference=preference, subscription=subscription) session.add(user_pref) - user1 = User(email="a@yelp.com", meta_data={"department": "dept"}, subscription_preferences=[user_pref]) + user1 = User( + id=1, + email="a@yelp.com", + meta_data={"department": "dept"}, + subscription_preferences=[user_pref], + manager_id="0", + languages="en, fr", + days_since_start=100, + employee_id="101", + location="UK, London", + ) session.add(user1) - user2 = User(email="b@yelp.com", meta_data={"department": "dept2"}, subscription_preferences=[user_pref]) + user2 = User( + id=2, + email="b@yelp.com", + meta_data={"department": "dept2"}, + subscription_preferences=[user_pref], + manager_id="101", + languages="en, fr", + days_since_start=100, + employee_id="102", + location="CA, London", + ) session.add(user2) - user3 = User(email="c@yelp.com", meta_data={"department": "dept"}, subscription_preferences=[user_pref]) + user3 = User( + id=3, + email="c@yelp.com", + meta_data={"department": "dept"}, + subscription_preferences=[user_pref], + manager_id="101", + languages="", + days_since_start=100, + employee_id="103", + location="UK, London", + ) session.add(user3) - user4 = User(email="d@yelp.com", meta_data={"department": "dept2"}, subscription_preferences=[user_pref]) + user4 = User( + id=4, + 
email="d@yelp.com", + meta_data={"department": "dept2"}, + subscription_preferences=[user_pref], + manager_id="101", + languages="en", + days_since_start=100, + employee_id="104", + location="US, SF", + ) session.add(user4) user_list = [user1, user2, user3, user4] @@ -102,7 +164,17 @@ def test_no_re_matches(session): users = [] num_users = 20 for i in range(0, num_users): - user = User(email=f"{i}@yelp.com", meta_data={"department": f"dept{i}"}, subscription_preferences=[user_pref]) + user = User( + id=i, + email=f"{i}@yelp.com", + meta_data={"department": f"dept{i//2}"}, + subscription_preferences=[user_pref], + manager_id="101", + languages="en", + days_since_start=100, + employee_id=f"{100+i}", + location="", + ) session.add(user) mr = MeetingRequest(user=user, meeting_spec=meeting_spec) session.add(mr) @@ -236,3 +308,485 @@ def test_previous_meeting_penalty(session): assert len(unmatched) == 2 for matched_group in matches: assert not (users[0] in matched_group and users[1] in matched_group) + + +def test_pairwise_distance(session, subscription): + rule = Rule(name="department", value="") + session.add(rule) + subscription.dept_rules = [rule] + preference = subscription.datetime[0] + user_pref = UserSubscriptionPreferences(preference=preference, subscription=subscription) + session.add(user_pref) + + user0 = User( + id=126, + email="126@yelp.com", + meta_data={"department": "Engineering - Core Experience"}, + subscription_preferences=[user_pref], + manager_id="1073", + languages="", + days_since_start=317, + employee_id="126", + location="California, USA", + ) + session.add(user0) + + user1 = User( + id=223, + email="223@yelp.com", + meta_data={"department": "Engineering - Core Experience"}, + subscription_preferences=[user_pref], + manager_id="782", + languages="", + days_since_start=115, + employee_id="223", + location="Berkshire, United Kingdom", + ) + session.add(user1) + + user2 = User( + id=707, + email="707@yelp.com", + meta_data={"department": 
"Engineering - Core Experience"}, + subscription_preferences=[user_pref], + manager_id="782", + languages="English, Farsi", + days_since_start=509, + employee_id="707", + location="California, USA", + ) + session.add(user2) + + user3 = User( + id=782, + email="782@yelp.com", + meta_data={"department": "Engineering - Core Experience"}, + subscription_preferences=[user_pref], + manager_id="2989", + languages="", + days_since_start=356, + employee_id="782", + location="California, USA", + ) + session.add(user3) + + user4 = User( + id=890, + email="890@yelp.com", + meta_data={"department": "Engineering - Core Experience"}, + subscription_preferences=[user_pref], + manager_id="1073", + languages="", + days_since_start=54, + employee_id="890", + location="California, USA", + ) + session.add(user4) + + user5 = User( + id=1073, + email="1073@yelp.com", + meta_data={"department": "Engineering - Core Experience"}, + subscription_preferences=[user_pref], + manager_id="2989", + languages="Turkish", + days_since_start=595, + employee_id="1073", + location="California, USA", + ) + session.add(user5) + + user6 = User( + id=1117, + email="1117@yelp.com", + meta_data={"department": "Engineering - Core Experience"}, + subscription_preferences=[user_pref], + manager_id="782", + languages="", + days_since_start=338, + employee_id="1117", + location="Texas, USA", + ) + session.add(user6) + + user7 = User( + id=1460, + email="1460@yelp.com", + meta_data={"department": "Engineering - Core Experience"}, + subscription_preferences=[user_pref], + manager_id="5384", + languages="", + days_since_start=1265, + employee_id="1460", + location="California, USA", + ) + session.add(user7) + + user8 = User( + id=1463, + email="1463@yelp.com", + meta_data={"department": "Engineering - Growth"}, + subscription_preferences=[user_pref], + manager_id="5543", + languages="", + days_since_start=410, + employee_id="1463", + location="California, USA", + ) + session.add(user8) + + user9 = User( + id=1715, + 
email="1715@yelp.com", + meta_data={"department": "Engineering - Core Experience"}, + subscription_preferences=[user_pref], + manager_id="5384", + languages="", + days_since_start=269, + employee_id="1715", + location="New York, USA", + ) + session.add(user9) + + user10 = User( + id=2131, + email="2131@yelp.com", + meta_data={"department": "Engineering - Core Experience"}, + subscription_preferences=[user_pref], + manager_id="782", + languages="", + days_since_start=880, + employee_id="2131", + location="Georgia, USA", + ) + session.add(user10) + + user11 = User( + id=2169, + email="2169@yelp.com", + meta_data={"department": "Engineering - Core Experience"}, + subscription_preferences=[user_pref], + manager_id="782", + languages="", + days_since_start=309, + employee_id="2169", + location="California, USA", + ) + session.add(user11) + + user12 = User( + id=2241, + email="2241@yelp.com", + meta_data={"department": "Engineering - Engineering Effectiveness"}, + subscription_preferences=[user_pref], + manager_id="5543", + languages="", + days_since_start=98, + employee_id="2241", + location="British Columbia, Canada", + ) + session.add(user12) + + user13 = User( + id=2525, + email="2525@yelp.com", + meta_data={"department": "Engineering - Core Experience"}, + subscription_preferences=[user_pref], + manager_id="5384", + languages="", + days_since_start=492, + employee_id="2525", + location="New York, USA", + ) + session.add(user13) + + user14 = User( + id=2589, + email="2589@yelp.com", + meta_data={"department": "Engineering - Core Experience"}, + subscription_preferences=[user_pref], + manager_id="5543", + languages="", + days_since_start=511, + employee_id="2589", + location="Florida, USA", + ) + session.add(user14) + + user15 = User( + id=2989, + email="2989@yelp.com", + meta_data={"department": "Engineering - Core Experience"}, + subscription_preferences=[user_pref], + manager_id="5543", + languages="", + days_since_start=1202, + employee_id="2989", + 
location="California, USA", + ) + session.add(user15) + + user16 = User( + id=3002, + email="3002@yelp.com", + meta_data={"department": "Engineering - Core Experience"}, + subscription_preferences=[user_pref], + manager_id="5384", + languages="", + days_since_start=537, + employee_id="3002", + location="California, USA", + ) + session.add(user16) + + user17 = User( + id=3447, + email="3447@yelp.com", + meta_data={"department": "Engineering - Content Platform"}, + subscription_preferences=[user_pref], + manager_id="5543", + languages="", + days_since_start=692, + employee_id="3447", + location="Pennsylvania, USA", + ) + session.add(user17) + + user18 = User( + id=3457, + email="3457@yelp.com", + meta_data={"department": "Engineering - Core Experience"}, + subscription_preferences=[user_pref], + manager_id="1073", + languages="", + days_since_start=542, + employee_id="3457", + location="Berlin, Germany", + ) + session.add(user18) + + user19 = User( + id=3601, + email="3601@yelp.com", + meta_data={"department": "Engineering - Core Experience"}, + subscription_preferences=[user_pref], + manager_id="5384", + languages="", + days_since_start=141, + employee_id="3601", + location="Ontario, Canada", + ) + session.add(user19) + + user20 = User( + id=3683, + email="3683@yelp.com", + meta_data={"department": "Engineering - Content Platform"}, + subscription_preferences=[user_pref], + manager_id="5543", + languages="", + days_since_start=428, + employee_id="3683", + location="California, USA", + ) + session.add(user20) + + user21 = User( + id=3815, + email="3815@yelp.com", + meta_data={"department": "Engineering - Engineering Effectiveness"}, + subscription_preferences=[user_pref], + manager_id="5543", + languages="", + days_since_start=1816, + employee_id="3815", + location="California, USA", + ) + session.add(user21) + + user22 = User( + id=3957, + email="3957@yelp.com", + meta_data={"department": "Engineering - Services Experience"}, + subscription_preferences=[user_pref], 
+ manager_id="5543", + languages="", + days_since_start=86, + employee_id="3957", + location="British Columbia, Canada", + ) + session.add(user22) + + user23 = User( + id=4078, + email="4078@yelp.com", + meta_data={"department": "Engineering - Core Experience"}, + subscription_preferences=[user_pref], + manager_id="782", + languages="", + days_since_start=266, + employee_id="4078", + location="New York, USA", + ) + session.add(user23) + + user24 = User( + id=4102, + email="4102@yelp.com", + meta_data={"department": "Engineering - Core Experience"}, + subscription_preferences=[user_pref], + manager_id="5384", + languages="", + days_since_start=541, + employee_id="4102", + location="British Columbia, Canada", + ) + session.add(user24) + + user25 = User( + id=4292, + email="4292@yelp.com", + meta_data={"department": "Engineering - Core Experience"}, + subscription_preferences=[user_pref], + manager_id="782", + languages="English, Gujarati, Hindi", + days_since_start=373, + employee_id="4292", + location="Washington, USA", + ) + session.add(user25) + + user26 = User( + id=4650, + email="4650@yelp.com", + meta_data={"department": "Engineering"}, + subscription_preferences=[user_pref], + manager_id="2432", + languages="", + days_since_start=446, + employee_id="4650", + location="East Sussex, United Kingdom", + ) + session.add(user26) + + user27 = User( + id=5240, + email="5240@yelp.com", + meta_data={"department": "Engineering - Core Experience"}, + subscription_preferences=[user_pref], + manager_id="2989", + languages="", + days_since_start=519, + employee_id="5240", + location="California, USA", + ) + session.add(user27) + + user28 = User( + id=5384, + email="5384@yelp.com", + meta_data={"department": "Engineering - Core Experience"}, + subscription_preferences=[user_pref], + manager_id="2989", + languages="", + days_since_start=721, + employee_id="5384", + location="Quebec, Canada", + ) + session.add(user28) + + user29 = User( + id=5529, + email="5529@yelp.com", + 
meta_data={"department": "Engineering - Services Leads"}, + subscription_preferences=[user_pref], + manager_id="5543", + languages="", + days_since_start=240, + employee_id="5529", + location="California, USA", + ) + session.add(user29) + + user30 = User( + id=5543, + email="5543@yelp.com", + meta_data={"department": "Engineering"}, + subscription_preferences=[user_pref], + manager_id="4650", + languages="", + days_since_start=610, + employee_id="5543", + location="California, USA", + ) + session.add(user30) + + user31 = User( + id=5637, + email="5637@yelp.com", + meta_data={"department": "Engineering - Core Experience"}, + subscription_preferences=[user_pref], + manager_id="1073", + languages="", + days_since_start=226, + employee_id="5637", + location="California, USA", + ) + session.add(user31) + + user_list = [ + user0, + user1, + user2, + user3, + user4, + user5, + user6, + user7, + user8, + user9, + user10, + user11, + user12, + user13, + user14, + user15, + user16, + user17, + user18, + user19, + user20, + user21, + user22, + user23, + user24, + user25, + user26, + user27, + user28, + user29, + user30, + user31, + ] + user_ids = [user.id for user in user_list] + session.commit() + + # considering disallowed meetings and rules + meeting_history = set( + [ + (user1.id, user2.id), + (user3.id, user4.id), + (user2.id, user3.id), + ] + ) + + _, specs = get_specs_from_subscription(subscription) + possible_meetings = {tuple(sorted(meeting)) for meeting in itertools.combinations(user_ids, 2)} + disallowed_meetings = get_disallowed_meetings(user_list, meeting_history, specs[0]) + allowed_meetings = possible_meetings - {tuple(sorted(a)) for a in disallowed_meetings} + paired_distance = get_meeting_weights(allowed_meetings) + + assert (126, 223) not in paired_distance.keys() # historically paired not in paired_distance bc historical + assert (2169, 5384) not in paired_distance.keys() # same department members should not be paired + assert round(paired_distance[(3457, 
3815)], 3) == 2.102 + assert round(paired_distance[(4102, 4650)], 3) == 1.452 diff --git a/api/tests/matching/match_utils_test.py b/api/tests/matching/match_utils_test.py index 10c8427f..a79f6695 100644 --- a/api/tests/matching/match_utils_test.py +++ b/api/tests/matching/match_utils_test.py @@ -28,8 +28,26 @@ def test_generate_save_meetings(session, subscription): pref_1 = SubscriptionDateTime(datetime=datetime.now() - timedelta(weeks=MEETING_COOLDOWN_WEEKS - 1)) subscription = MeetingSubscription(title="all engineering weekly", datetime=[pref_1]) user_pref = UserSubscriptionPreferences(preference=pref_1, subscription=subscription) - user1 = User(email="a@yelp.com", meta_data={"department": "dept"}, subscription_preferences=[user_pref]) - user2 = User(email="b@yelp.com", meta_data={"department": "dept2"}, subscription_preferences=[user_pref]) + user1 = User( + email="a@yelp.com", + meta_data={"department": "dept"}, + subscription_preferences=[user_pref], + manager_id="0", + languages="en, fr", + days_since_start=100, + employee_id="101", + location="UK, London", + ) + user2 = User( + email="b@yelp.com", + meta_data={"department": "dept2"}, + subscription_preferences=[user_pref], + manager_id="101", + languages="en, fr", + days_since_start=100, + employee_id="102", + location="CA, London", + ) meeting_spec = MeetingSpec(meeting_subscription=subscription, datetime=pref_1.datetime) mr1 = MeetingRequest(user=user1, meeting_spec=meeting_spec) mr2 = MeetingRequest(user=user2, meeting_spec=meeting_spec) diff --git a/api/yelp_beans/logic/employee.py b/api/yelp_beans/logic/employee.py new file mode 100644 index 00000000..149656ab --- /dev/null +++ b/api/yelp_beans/logic/employee.py @@ -0,0 +1,5 @@ +from yelp_beans.models import Employee + + +def get_employee(work_email): + return Employee.query.filter(Employee.work_email == work_email).first() diff --git a/api/yelp_beans/matching/match_utils.py b/api/yelp_beans/matching/match_utils.py index a3b7104f..58e82fce 100644 --- 
a/api/yelp_beans/matching/match_utils.py +++ b/api/yelp_beans/matching/match_utils.py @@ -3,6 +3,8 @@ from datetime import datetime from datetime import timedelta +import networkx as nx +import pandas as pd from database import db from yelp_beans.logic.config import get_config @@ -86,3 +88,97 @@ def get_previous_meetings(subscription, cooldown=None): disallowed_meetings = {tuple([meeting.id for meeting in meeting]) for meeting in disallowed_meetings} return disallowed_meetings + + +def jaccard(list1, list2): + intersection = len(list(set(list1).intersection(list2))) + if intersection == 0: + return 1 + else: + union = (len(list1) + len(list2)) - intersection + return float(intersection) / union + + +def get_pairwise_distance( + user_pair, + org_graph, + employee_df, + max_tenure=1000, +): + """ + TODO@ichenkao: define input and output + get the distance between two users. + The returned distance score is a linear combination of the multiple user attributes' distance (normalized). + The importance of each attribute is considered equal. + User attribute considered: + 1. team/function: distance in the org chart + 2. location - country, city + 3. tenure at Yelp + 4. language + + note: we considered using education and work experience, but think it likely correlates with the first attribute + """ + user_a, user_b = user_pair + user_a_attributes = dict(employee_df.loc[user_a]) + user_b_attributes = dict(employee_df.loc[user_b]) + + distance = 0 + dist_1 = nx.shortest_path_length(org_graph, user_a, user_b) + dist_1 = dist_1 / 10 # approx. 
min-max scaled + distance += dist_1 + + # location + try: + user_a_city, user_a_country = user_a_attributes["location"].split(", ") + except ValueError: + user_a_city, user_a_country = "unknown", user_a_attributes["location"] + try: + user_b_city, user_b_country = user_b_attributes["location"].split(", ") + except ValueError: + user_b_city, user_b_country = "unknown", user_b_attributes["location"] + country_dist = 0 if user_a_country == user_b_country else 1 + city_dist = 0 if user_a_city == user_b_city else 1 + dist_2 = country_dist + city_dist + dist_2 = dist_2 / 2 # min-max scaled + distance += dist_2 + + # tenure + dist_3 = abs(int(user_a_attributes["days_since_start"]) - int(user_b_attributes["days_since_start"])) + dist_3 = dist_3 / max_tenure + distance += dist_3 + + # language + lang_similarity = jaccard(user_a_attributes["languages"], user_b_attributes["languages"]) + dist_4 = 1 - lang_similarity + distance += dist_4 + + return distance + + +def get_meeting_weights(allowed_meetings): + """ + generate a distance score for each user pair. 
+ """ + meeting_to_weight = {} + + # need to convert this to JSON to match the previous logic + db_query_result = db.session.query(User).all() + json_dump = [obj.serialize() for obj in db_query_result] + employees = pd.DataFrame(json_dump) + + employees["languages"] = employees["languages"].apply(lambda x: x.split(", ")) + employees = employees[["id", "manager_id", "days_since_start", "location", "languages", "email", "employee_id"]] + employees = employees.merge( + employees[["employee_id", "id"]], how="left", left_on="manager_id", right_on="employee_id", suffixes=("", "_manager") + ) + employees = employees.set_index("id", drop=False) + max_tenure = max(employees["days_since_start"].astype(int)) + + # yelp employee network graph created through reporting line + G = nx.Graph() + G.add_edges_from(list(zip(employees["id"], employees["id_manager"]))) + for user_pair in allowed_meetings: + users_distance_score = get_pairwise_distance(user_pair, org_graph=G, employee_df=employees.copy(), max_tenure=max_tenure) + meeting_to_weight[user_pair] = users_distance_score + + return meeting_to_weight diff --git a/api/yelp_beans/matching/pair_match.py b/api/yelp_beans/matching/pair_match.py index d18d96cc..36bc69c0 100644 --- a/api/yelp_beans/matching/pair_match.py +++ b/api/yelp_beans/matching/pair_match.py @@ -4,6 +4,7 @@ import networkx as nx from yelp_beans.logic.user import user_preference +from yelp_beans.matching.match_utils import get_meeting_weights from yelp_beans.matching.match_utils import get_previous_meetings @@ -78,16 +79,15 @@ def construct_graph(user_ids, disallowed_meetings): Yay graphs! Networkx will do all the work for us. """ - # special weights that be put on the matching potential of each meeting, - # depending on heuristics for what makes a good/bad potential meeting. - meeting_to_weight = {} - # This creates the graph and the maximal matching set is returned. # It does not return anyone who didn't get matched. 
meetings = [] - possible_meetings = {meeting for meeting in itertools.combinations(user_ids, 2)} - allowed_meetings = possible_meetings - disallowed_meetings + possible_meetings = {tuple(sorted(meeting)) for meeting in itertools.combinations(user_ids, 2)} + allowed_meetings = possible_meetings - {tuple(sorted(a)) for a in disallowed_meetings} + # special weights that be put on the matching potential of each meeting, + # depending on heuristics for what makes a good/bad potential meeting. + meeting_to_weight = get_meeting_weights(allowed_meetings) for meeting in allowed_meetings: weight = meeting_to_weight.get(meeting, 1.0) meetings.append((*meeting, {"weight": weight})) diff --git a/api/yelp_beans/models.py b/api/yelp_beans/models.py index 4d83ffbc..f9515d91 100644 --- a/api/yelp_beans/models.py +++ b/api/yelp_beans/models.py @@ -23,9 +23,32 @@ class User(db.Model): terminated = db.Column(db.Boolean, nullable=False, default=False) subscription_preferences = db.relationship("UserSubscriptionPreferences") + # Additional fields for match algo + languages = db.Column(db.Text) + days_since_start = db.Column(db.Integer) + employee_id = db.Column(db.String()) + location = db.Column(db.String()) + manager_id = db.Column(db.String()) + def get_username(self): return self.email.split("@")[0] + def serialize(self): + return { + "id": self.id, + "email": self.email, + "first_name": self.first_name, + "last_name": self.last_name, + "photo_url": self.photo_url, + "meta_data": self.meta_data, + "terminated": self.terminated, + "languages": self.languages, + "days_since_start": self.days_since_start, + "employee_id": self.employee_id, + "location": self.location, + "manager_id": self.manager_id, + } + class MeetingSubscription(db.Model): """The base template for a meeting type, it is comprised of