-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathmain.py
166 lines (134 loc) · 5.31 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
#%%
##importing libraries
from flask import Flask, render_template, request
from neo4j import GraphDatabase
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
# Flask application object; the @app.route decorators further down register
# their view functions on it, and the __main__ guard starts it.
app = Flask(__name__)
# Define the Neo4j connection class
class Neo4jConnection:
    """Small wrapper around a Neo4j driver for running Cypher queries.

    The driver is not created by the constructor; call :meth:`connect` first.
    :meth:`query` opens a session per call and always closes it again.
    """

    def __init__(self, uri, user, pwd):
        """Store the connection settings without creating a driver.

        Args:
            uri: Address handed to ``GraphDatabase.driver``.
            user: User name for the auth tuple.
            pwd: Password for the auth tuple.
        """
        self._uri = uri
        self._user = user
        self._password = pwd
        self._driver = None

    def connect(self):
        """Create the driver from the stored settings.

        Returns:
            This instance, so construction and connection can be chained.
        """
        self._driver = GraphDatabase.driver(self._uri, auth=(self._user, self._password))
        return self

    def close(self):
        """Close the driver if one has been created; otherwise do nothing."""
        if self._driver is not None:
            self._driver.close()

    def query(self, query, parameters=None, db=None):
        """Run a Cypher statement and return all of its records as a list.

        Args:
            query: Cypher text.
            parameters: Optional mapping of query parameters, passed to
                ``session.run`` unchanged.
            db: Optional database name; when falsy the session is opened
                without a ``database`` argument.

        Returns:
            ``list`` of whatever ``session.run`` yields.

        Raises:
            AssertionError: If :meth:`connect` has not been called yet.
        """
        if self._driver is None:
            # Raised explicitly instead of via ``assert`` so the check is not
            # stripped under ``python -O``; the exception type stays the same.
            raise AssertionError("Driver not initialized!")
        session = self._driver.session(database=db) if db else self._driver.session()
        try:
            # Materialise the records before the session is closed.
            return list(session.run(query, parameters))
        finally:
            # Close the session even when the query raises, so it is not leaked.
            session.close()
#%%
# Load university data from CSV (expected in the current working directory).
import os
cwd = os.getcwd()
# Build the path with os.path.join so the separator is correct on every
# platform, not only where '\\' is the path separator.
uni = pd.read_csv(os.path.join(cwd, 'university_data.csv'))
# Drop unnecessary columns
uni.drop(['primaryPhoto', 'primaryPhotoThumb', 'sortName',
          'urlName', 'aliasNames', 'nonResponderText',
          'nonResponder', 'rankingSortRank', 'overallRank',
          'rankingRankStatus', 'xwalkId', 'primaryKey',
          'rankingNoteText', 'rankingNoteCharacter', 'rankingMaxPossibleScore',
          'rankingIsTied', 'ranking', 'schoolType', 'rankingType',
          'rankingDisplayName', 'region', 'isPublic'], axis=1, inplace=True)
# Handling missing values: these columns are filled with their column mean.
columns_to_fill = ['act-avg', 'sat-avg', 'acceptance-rate', 'hs-gpa-avg', 'businessRepScore', 'engineeringRepScore', 'enrollment', 'rankingDisplayScore']
for column in columns_to_fill:
    mean_value = uni[column].mean()
    # Assign the filled column back instead of calling fillna(inplace=True) on
    # the selected column: the chained in-place form is deprecated in pandas
    # and does not modify `uni` when Copy-on-Write is enabled.
    uni[column] = uni[column].fillna(mean_value)
# The two aid-related columns are filled with their column median instead.
uni[['percent-receiving-aid', 'cost-after-aid']] = uni[['percent-receiving-aid', 'cost-after-aid']].fillna(uni[['percent-receiving-aid', 'cost-after-aid']].median())
#%%
# Create a Neo4j connection.
# Each setting can be overridden through an environment variable (NEO4J_URI,
# NEO4J_USERNAME, NEO4J_PASSWORD); the literals below are only the fallback
# defaults, so credentials no longer have to live in the source file.
# Relies on the `os` module imported earlier in this file.
uri = os.environ.get("NEO4J_URI", "bolt://localhost:7687")
username = os.environ.get("NEO4J_USERNAME", "neo4j")
password = os.environ.get("NEO4J_PASSWORD", "root@12345")
# connect() returns the wrapper itself, so neo4j_conn is the connected instance
# shared by the module-level queries and the request handlers.
neo4j_conn = Neo4jConnection(uri, username, password).connect()
#%%
# Flask routes
@app.route('/')
def index():
    """Render the ``index.html`` template for the root URL."""
    page = render_template('index.html')
    return page
@app.route('/recommend', methods=['POST'])
def recommend():
    """Handle the form POST and render recommendations for one college.

    Reads ``college_name`` from the submitted form, asks
    ``get_recommendations`` for the matching rows and re-renders
    ``index.html`` with the rows converted to a list of per-row dicts.
    """
    college_name = request.form['college_name']
    # Get recommendations.
    # The module-level connection is shared by every request, so it is left
    # open here: closing it after a request would leave all later requests
    # working against a closed driver.
    recommendations = get_recommendations(neo4j_conn, college_name)
    recommendations_data = recommendations.to_dict(orient='records')
    return render_template('index.html', college_name=college_name, recommendations=recommendations_data)
#%%
## Create relationships between universities based on similarity.
# The Cypher below pairs every two University nodes exactly once
# (id(u1) < id(u2)), calls gds.similarity.euclidean on the same seven
# properties of each node, and MERGEs a SIMILARITY_EDGE from u1 to u2.
# The computed value is written to the relationship's euclideanDistance
# property only when the edge is newly created (ON CREATE SET).
# NOTE(review): the property is called euclideanDistance but is filled from
# gds.similarity.euclidean - confirm against the GDS documentation whether
# that function returns a similarity score or a distance.
# The query is executed as a module-level statement; its result is discarded.
similarity_query = '''
MATCH (u1:University), (u2:University)
WHERE id(u1) < id(u2)
WITH u1, u2,
gds.similarity.euclidean(
[u1.actAvg, u1.satAvg, u1.acceptanceRate, u1.hsGpaAvg, u1.rankingDisplayRank, u1.businessRepScore, u1.engineeringRepScore],
[u2.actAvg, u2.satAvg, u2.acceptanceRate, u2.hsGpaAvg, u2.rankingDisplayRank, u2.businessRepScore, u2.engineeringRepScore]
) AS euclideanDistance
MERGE (u1)-[similarity:SIMILARITY_EDGE]->(u2)
ON CREATE SET similarity.euclideanDistance = euclideanDistance;
'''
neo4j_conn.query(similarity_query)
#%%
# Create the graph using gds.graph.project
# The query first asks gds.graph.exists whether 'myGraph' is already present
# and only calls gds.graph.project when it is not (WHERE NOT graphExists);
# when the graph already exists the WHERE clause filters the row out and the
# statement returns nothing.
# The projection names the University label, the SIMILARITY_EDGE relationship
# type and seven node properties.
# Executed as a module-level statement; the returned rows are discarded.
graph_creation_query = """
// Check if the graph exists
CALL gds.graph.exists('myGraph')
YIELD exists AS graphExists
// If the graph does not exist, create it
WITH 'myGraph' AS graphToCreate, graphExists
WHERE NOT graphExists
CALL gds.graph.project(graphToCreate, 'University', 'SIMILARITY_EDGE', {
nodeProperties: ['actAvg', 'satAvg', 'acceptanceRate', 'hsGpaAvg', 'rankingDisplayRank', 'businessRepScore', 'engineeringRepScore']
}) YIELD graphName, nodeCount, relationshipCount
RETURN graphName, nodeCount, relationshipCount;
"""
neo4j_conn.query(graph_creation_query)
#%%
# Perform Min-Max scaling
# Calls gds.alpha.scaleProperties.mutate on 'myGraph' with the "MinMax" scaler
# over seven node properties, targeting the mutate property "scaledProperties".
# NOTE(review): the CALL follows `MATCH (n) WHERE n.scaledProperties IS NOT
# NULL`, so it runs once per matching node and not at all when no node has
# that property. If the intent was "scale only when not yet scaled", the
# condition looks inverted - confirm before relying on the scaled values.
# NOTE(review): get_recommendations passes the raw property names to
# gds.knn.stream rather than "scaledProperties" - verify whether the scaled
# values are meant to be used there.
# Executed as a module-level statement; the returned rows are discarded.
scale_features_query = '''
MATCH (n)
WHERE n.scaledProperties IS NOT NULL
CALL gds.alpha.scaleProperties.mutate('myGraph', {
nodeProperties: ['actAvg', 'satAvg', 'acceptanceRate', 'hsGpaAvg', 'rankingDisplayRank', 'businessRepScore', 'engineeringRepScore'],
scaler: "MinMax",
mutateProperty: "scaledProperties"
})
YIELD nodePropertiesWritten
RETURN nodePropertiesWritten;
'''
neo4j_conn.query(scale_features_query)
#%%
# Cypher for the GDS KNN lookup used by get_recommendations.
# Kept as a plain (non-f) string: nothing is interpolated by Python, and the
# only variable part, $college_name, is a query parameter passed separately.
_KNN_RECOMMENDATION_QUERY = """
CALL gds.knn.stream('myGraph', {
topK: 5,
nodeProperties: ['actAvg', 'satAvg', 'acceptanceRate', 'hsGpaAvg', 'rankingDisplayRank', 'businessRepScore', 'engineeringRepScore'],
randomSeed: 1337,
concurrency: 1,
sampleRate: 1.0,
deltaThreshold: 0.0
})
YIELD node1, node2, similarity
WITH gds.util.asNode(node1) AS university1, gds.util.asNode(node2) AS university2, similarity
// Use the parameter in the WHERE clause
WHERE university1.name = $college_name
RETURN university1.name AS University1, university2.name AS University2, similarity
ORDER BY similarity DESCENDING, University1, University2;
"""


# Function to get recommendations using GDS KNN algorithm
def get_recommendations(connection, college_name):
    """Return the KNN rows for the named university as a DataFrame.

    Args:
        connection: Object exposing ``query(query, parameters)``; it is called
            with the KNN Cypher text and the parameter mapping.
        college_name: Value bound to ``$college_name``; the query keeps only
            rows whose first node's ``name`` equals it.

    Returns:
        ``pandas.DataFrame`` built directly from the rows returned by
        ``connection.query``.
    """
    # NOTE(review): the rows are handed to pd.DataFrame exactly as the
    # connection returns them - confirm the resulting column labels are what
    # the index.html template expects.
    rows = connection.query(_KNN_RECOMMENDATION_QUERY, {'college_name': college_name})
    return pd.DataFrame(rows)
#%%
if __name__ == '__main__':
    # Start the Flask server on 127.0.0.1:8080 with debug mode enabled, only
    # when this file is executed as a script.
    app.run(host='127.0.0.1', port=8080, debug=True)
#%%