# kNN.py
import torch
import json
import pandas as pd
import requests
from transformers import DistilBertTokenizer, DistilBertModel
# 1. Prepare the headset product question and answer (PQA) data
'''
Each JSON document in the raw PQA data set has a question with many potential answers, in addition to other information about the product in question.
The code below creates a pandas data frame (df) where each row is a single question paired with its first answer; the other product information is dropped.
For example, a JSON document from the raw PQA data set looks like this:
{
    "question_id": "Tx39GCUOS5AYAFK",
    "question_text": "does this work with cisco ip phone 7942",
    "asin": "B000LSZ2D6",
    "bullet_point1": "Noise-Canceling microphone filters out background sound",
    "bullet_point2": "HW251N P/N 75100-06",
    "bullet_point3": "Uses Plantronics QD Quick Disconnect Connector. Must be used with Plantronics Amp or with proper phone or USB adapter cable",
    "bullet_point4": "Connectivity Technology: Wired, Earpiece Design: Over-the-head, Earpiece Type: Monaural, Host Interface: Proprietary, Microphone Design: Boom, Microphone Technology: Noise Canceling, Product Model: HW251N, Product Series: SupraPlus, Standard Warranty: 2 Year",
    "bullet_point5": "Easy Lightweight Wear -Leaving One Ear Uncovered For Person-to-Person Conversations",
    "product_description": "",
    "brand_name": "Plantronics",
    "item_name": "Plantronics HW251N SupraPlus Wideband Headset (64338-31)",
    "question_type": "yes-no",
    "answer_aggregated": "neutral",
    "answers": [
        {"answer_text": "Use the Plantronics compatibility guide to see what is compatible with your phone. http://www.plantronics.com/us/compatibility-guide/"},
        {"answer_text": "I think that you will need a extra cord, but, To avoid offering you any false information, We highly recommend contacting the manufacturer of this product for more specific information. if you are not sure about it, you can go first to : http://www.plantronics.com/us/support/ or call Plantronics TOLL FREE SUPPORT: 1-855-765-7878 24-HOUR SUPPORT SUNDAY 2PM-FRIDAY 5PM (PT) they will answer all the questions you need to know about it."},
        {"answer_text": "I'm really not positive. It works with our phones that include model numbers 7941, 7945 and 7961."}
    ]
}
After processing this document, the corresponding row of the df data frame has a question column and an answer column:
Question: does this work with cisco ip phone 7942
Answer: Use the Plantronics compatibility guide to see what is compatible with your phone. http://www.plantronics.com/us/compatibility-guide/
'''
print("Preparing data set")
number_of_rows_from_dataset = 1000
df = pd.DataFrame(columns=('question', 'answer'))
with open('amazon_pqa_headsets.json') as f:
    i = 0
    for line in f:
        data = json.loads(line)
        # keep the question text and the first answer only
        df.loc[i] = [data['question_text'], data['answers'][0]['answer_text']]
        i += 1
        # optional: stop after number_of_rows_from_dataset questions
        if i == number_of_rows_from_dataset:
            break
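# A faster variant of the loop above (a sketch using the same file and fields, not called
# here): assigning df.loc[i] row by row is slow for large files, so collect the rows in a
# plain list and build the DataFrame once at the end.
def load_pqa_dataframe(path='amazon_pqa_headsets.json', limit=number_of_rows_from_dataset):
    rows = []
    with open(path) as pqa_file:
        for raw_line in pqa_file:
            record = json.loads(raw_line)
            rows.append({'question': record['question_text'],
                         'answer': record['answers'][0]['answer_text']})
            if len(rows) == limit:
                break
    return pd.DataFrame(rows)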
# 2. Convert the question text in the PQA data set into vector(s)
'''
Step 1. Tokenize the question text
    Input: df["question"].tolist()
    Output: inputs_tokens
    tokenizer() arguments:
        padding - Ensures that all sequences in a batch have the same length. When padding is set to True, the tokenizer pads each sequence up to the length of the longest sequence in the batch.
        return_tensors - Return the output as PyTorch torch.Tensor objects ("pt").
Step 2. Convert the tokenized questions into vectors using DistilBERT
    Input: inputs_tokens
    Output: outputs
    outputs[0] is a 3-dimensional tensor. Working with 1000 rows of data, its shape could be [1000, 64, 768] (rows, tokens per padded sequence, hidden size).
Step 3. Use mean pooling to condense the per-token embeddings into one fixed-size vector per question
    Input: outputs
    Output: question_text_embeddings
    question_text_embeddings is a 2-dimensional tensor. Working with 1000 rows of data, its shape would be [1000, 768].
'''
# Tokenize the questions in the PQA data set
print("Tokenizing the text")
tokenizer = DistilBertTokenizer.from_pretrained("sentence-transformers/distilbert-base-nli-stsb-mean-tokens")
inputs_tokens = tokenizer(df["question"].tolist(), padding=True, return_tensors="pt")
# Convert the tokenized questions into vectors
print("Converting tokenized text into vectors")
model = DistilBertModel.from_pretrained("sentence-transformers/distilbert-base-nli-stsb-mean-tokens")
# disable gradient computation to speed up the vector creation
with torch.no_grad():
    outputs = model(**inputs_tokens)
#print('outputs: ' + str(outputs[0].size()))
# Mean pooling
print("Applying mean pooling to vector representation of the text")
token_embeddings = outputs[0] # first element of model_output contains all token embeddings
input_mask_expanded = inputs_tokens['attention_mask'].unsqueeze(-1).expand(token_embeddings.size()).float()
sum_embeddings = torch.sum(token_embeddings * input_mask_expanded, 1)
sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
question_text_embeddings = sum_embeddings / sum_mask
#print('question_text_embeddings: ' + str(question_text_embeddings.size()))
#print(df["question"])
#print(question_text_embeddings)
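# An equivalent shortcut (a sketch, assuming the sentence-transformers package is installed
# and not used elsewhere in this script): the same checkpoint can be loaded with
# SentenceTransformer, whose encode() method performs the tokenize / encode / mean-pool
# steps above internally and returns one vector per sentence.
def encode_with_sentence_transformers(sentences):
    from sentence_transformers import SentenceTransformer
    st_model = SentenceTransformer("sentence-transformers/distilbert-base-nli-stsb-mean-tokens")
    return st_model.encode(sentences)  # numpy array of shape (len(sentences), 768)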
# 3. Create an OpenSearch index
'''
Create an OpenSearch index named nlp_pqa with three fields:
1. question_vector
2. question
3. answer
The data type of the question_vector field is knn_vector, with a dimension of 768 to match the embedding size produced above.
'''
print("Creating the OpenSearch index")
# Configure re-usable variables for the OpenSearch domain URL, user name and password
opensearch_url = 'https://<opensearch_domain_url>' # DO NOT INCLUDE TRAILING SLASH
opensearch_user_name = '<user_name>'
opensearch_password = '<password>'
create_knn_index_request_body = {
    "settings": {
        "index.knn": True,
        "index.knn.space_type": "cosinesimil",
        "analysis": {
            "analyzer": {
                "default": {
                    "type": "standard",
                    "stopwords": "_english_"
                }
            }
        }
    },
    "mappings": {
        "properties": {
            "question_vector": {
                "type": "knn_vector",
                "dimension": 768,
                "store": True
            },
            "question": {
                "type": "text",
                "store": True
            },
            "answer": {
                "type": "text",
                "store": True
            }
        }
    }
}
create_index_r = requests.put(opensearch_url + '/nlp_pqa',
                              auth=(opensearch_user_name, opensearch_password),
                              headers={'Content-type': 'application/json'},
                              data=json.dumps(create_knn_index_request_body))
#print(create_index_r.text)
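# A minimal sanity-check sketch (optional, not called above): re-read the index definition
# from OpenSearch to confirm the knn mapping was created. It uses only the endpoint and
# credentials already defined in this script.
def verify_nlp_pqa_index():
    check_r = requests.get(opensearch_url + '/nlp_pqa',
                           auth=(opensearch_user_name, opensearch_password))
    properties = check_r.json()['nlp_pqa']['mappings']['properties']
    print('nlp_pqa fields: ' + ', '.join(properties.keys()))
#verify_nlp_pqa_index()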
# 4. Load data into the index
'''
Load data into the OpenSearch index that was just created.
'''
print("Loading data to the OpenSearch index")
i = 0
for c in df["question"].tolist():
    question_text_i = c
    question_vector_i = question_text_embeddings[i].tolist()
    answer_i = df["answer"][i]
    #print('Question Text: ' + question_text_i)
    #print('Question Vector: ' + str(question_vector_i[0]) + ' ...')
    #print('Answer: ' + answer_i)
    upload_document_request_body = {
        "question_vector": question_vector_i,
        "question": question_text_i,
        "answer": answer_i
    }
    upload_document_r = requests.post(opensearch_url + '/nlp_pqa/_doc',
                                      auth=(opensearch_user_name, opensearch_password),
                                      headers={'Content-type': 'application/json'},
                                      data=json.dumps(upload_document_request_body))
    #print(upload_document_r.text)
    i += 1
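# Optional alternative (a sketch, not called above): OpenSearch's _bulk API indexes many
# documents per request, which is typically much faster than one POST per document. It
# reuses the same df and question_text_embeddings objects and the nlp_pqa index name;
# the batch_size value is an arbitrary choice.
def bulk_index_questions(batch_size=100):
    for start in range(0, len(df), batch_size):
        bulk_lines = []
        for j in range(start, min(start + batch_size, len(df))):
            # each document is preceded by an action line naming the target index
            bulk_lines.append(json.dumps({"index": {"_index": "nlp_pqa"}}))
            bulk_lines.append(json.dumps({
                "question_vector": question_text_embeddings[j].tolist(),
                "question": df["question"][j],
                "answer": df["answer"][j]
            }))
        bulk_body = "\n".join(bulk_lines) + "\n"  # the bulk body must end with a newline
        requests.post(opensearch_url + '/_bulk',
                      auth=(opensearch_user_name, opensearch_password),
                      headers={'Content-type': 'application/x-ndjson'},
                      data=bulk_body)
# Newly indexed documents become searchable only after the index refresh interval (about one
# second by default); a POST to /nlp_pqa/_refresh forces a refresh if the search below runs
# immediately after indexing.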
# 5. Convert user input/search into a vector
'''
Convert the user input/search text into a vector
Output: search_vector
'''
print("Converting search into a vector")
# ? Refactor this block of code - it repeats the tokenize / encode / mean-pool steps from section 2
query_raw_sentences = ['does this work with xbox?']
tokenizer = DistilBertTokenizer.from_pretrained("sentence-transformers/distilbert-base-nli-stsb-mean-tokens")
model = DistilBertModel.from_pretrained("sentence-transformers/distilbert-base-nli-stsb-mean-tokens")
inputs_tokens = tokenizer(query_raw_sentences, padding=True, return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs_tokens)
print("Applying mean pooling to vector representation of the search")
token_embeddings = outputs[0] # first element of model_output contains all token embeddings
input_mask_expanded = inputs_tokens['attention_mask'].unsqueeze(-1).expand(token_embeddings.size()).float()
sum_embeddings = torch.sum(token_embeddings * input_mask_expanded, 1)
sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
sentence_embeddings = sum_embeddings / sum_mask
search_vector = sentence_embeddings[0].tolist()
# ?
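# One way to address the refactor note above (a sketch): wrap tokenization, encoding and
# mean pooling in a single helper so the question-embedding step in section 2 and the
# search-embedding step here share one code path. It reuses the tokenizer and model
# objects already loaded in this script.
def embed_sentences(sentences, tokenizer, model):
    tokens = tokenizer(sentences, padding=True, return_tensors="pt")
    with torch.no_grad():
        model_output = model(**tokens)
    token_embeddings = model_output[0]
    mask = tokens['attention_mask'].unsqueeze(-1).expand(token_embeddings.size()).float()
    summed = torch.sum(token_embeddings * mask, 1)
    counts = torch.clamp(mask.sum(1), min=1e-9)
    return summed / counts
# Equivalent to the block above:
#search_vector = embed_sentences(query_raw_sentences, tokenizer, model)[0].tolist()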
# 6. Search OpenSearch using the vector representation of the user input/search
'''
Make an API call to run the search using the search_vector created in the last step
'''
print("Search OpenSearch using the vector representation of the search")
query = {
    "size": 30,
    "query": {
        "knn": {
            "question_vector": {
                "vector": search_vector,
                "k": 30
            }
        }
    }
}
query_r = requests.get(opensearch_url + '/nlp_pqa/_search',
                       auth=(opensearch_user_name, opensearch_password),
                       headers={'Content-type': 'application/json'},
                       data=json.dumps(query))
#print(query_r.text)
# Print search results
json_res = query_r.json()
number_of_results_to_print = 3
results_printed = 0
print('Search results:')
for hit in json_res["hits"]["hits"]:
    if number_of_results_to_print > results_printed:
        print(' ')
        print('Score: ' + str(hit["_score"]))
        print('Answer: ' + hit["_source"]["answer"])
        print('Question: ' + hit["_source"]["question"])
        print(' ')
        results_printed = results_printed + 1
    else:
        break
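# A reusable wrapper (a sketch, assuming the embed_sentences helper defined earlier): embed a
# new question and run the same kNN query against nlp_pqa, returning the top_n scored answers.
def semantic_search(question_text, k=30, top_n=3):
    vector = embed_sentences([question_text], tokenizer, model)[0].tolist()
    knn_query = {"size": k, "query": {"knn": {"question_vector": {"vector": vector, "k": k}}}}
    r = requests.get(opensearch_url + '/nlp_pqa/_search',
                     auth=(opensearch_user_name, opensearch_password),
                     headers={'Content-type': 'application/json'},
                     data=json.dumps(knn_query))
    hits = r.json()["hits"]["hits"][:top_n]
    return [(hit["_score"], hit["_source"]["question"], hit["_source"]["answer"]) for hit in hits]
#print(semantic_search('does this work with xbox?'))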