-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtask1a.py
104 lines (90 loc) · 4.02 KB
/
task1a.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
import pandas as pd
import textdistance as td
from fuzzywuzzy import fuzz
import numpy as np
# function used to check whether the two prices
# provided are 'close' enough.
def check_close_price(price1, price2, tolerance=0.1):
    """Return 1 when two prices are relatively close to each other, else 0.

    Two prices count as close when their absolute difference is strictly
    smaller than ``tolerance`` times the lower of the two prices.

    Args:
        price1: First price (any real number).
        price2: Second price (any real number).
        tolerance: Allowed difference as a fraction of the lower price.
            Defaults to 0.1, the value that used to be hard-coded.

    Returns:
        1 if the prices are close, 0 otherwise. Because the comparison is
        strict, two prices of 0 are not considered close, and a NaN price
        always yields 0 (every comparison with NaN is false).
    """
    abs_diff = abs(price1 - price2)
    # Measure the gap against the lower price: it gives the stricter bound.
    lower_price = price1 if price1 < price2 else price2
    return 1 if abs_diff < lower_price * tolerance else 0
# the final score function used to calculate the
# final similarity score of the product from google
# and amazon
def final_scoring(simility_scoring, manufacturer_scoring, close_price, *,
                  manufacturer_threshold=75, manufacturer_bonus=10,
                  price_bonus=5):
    """Combine the individual signals into one similarity score.

    The name-similarity score is the base; fixed bonuses are added when the
    manufacturers agree strongly enough and when the prices are close.

    Args:
        simility_scoring: Base similarity score of the two product names.
        manufacturer_scoring: Similarity score of the two manufacturers.
        close_price: Truthy when the two prices are considered close.
        manufacturer_threshold: ``manufacturer_scoring`` must be strictly
            greater than this value to earn the manufacturer bonus.
        manufacturer_bonus: Points added for a matching manufacturer.
        price_bonus: Points added for close prices.

    Returns:
        ``simility_scoring`` plus whichever bonuses apply.
    """
    total_scoring = simility_scoring
    if manufacturer_scoring > manufacturer_threshold:
        total_scoring += manufacturer_bonus
    if close_price:
        total_scoring += price_bonus
    return total_scoring
# Load the two product catalogues, google_small.csv and amazon_small.csv.
# Both files are decoded as ISO-8859-1.
google = pd.read_csv('google_small.csv', encoding='ISO-8859-1')
amazon = pd.read_csv('amazon_small.csv', encoding='ISO-8859-1')
# Coerce both price columns to float so the later price comparison is
# purely numeric.
google['price'] = google['price'].astype(float)
amazon['price'] = amazon['price'].astype(float)
# Missing Google manufacturers become '' so they can be tested with a plain
# string comparison. fillna is the direct API for this; the previous
# replace(np.nan, '', regex=True) passed a non-string pattern with
# regex=True, which the pandas documentation does not allow.
google['manufacturer'] = google['manufacturer'].fillna('')
# Use a nested loop to find, for every Google product, the best-scoring
# Amazon product.
#
# Rows are walked positionally (zip over the columns) instead of being looked
# up again by name/title inside the loop: the lookup cost a full scan per
# pair and returned the FIRST row's data whenever a name or title occurred
# more than once.
matches = []
# Extract the Amazon columns once; a missing manufacturer is treated as ''.
amazon_rows = list(zip(
    amazon['title'],
    amazon['manufacturer'].fillna(''),
    amazon['price'],
    amazon['idAmazon'],
))
for google_data, google_manufacturer, google_price, google_id in zip(
        google['name'], google['manufacturer'],
        google['price'], google['idGoogleBase']):
    best_scoring = None
    best_amazon_id = None
    for amazon_data, amazon_manufacturer, amazon_price, amazon_id in amazon_rows:
        # Similarity between the Google product name and the Amazon title.
        simility_scoring = fuzz.token_set_ratio(google_data, amazon_data)
        # Manufacturer similarity. When Google has no manufacturer, compare
        # the Amazon manufacturer against the Google product name instead;
        # when Amazon has none, the score stays 0.
        manufacturer_scoring = 0
        if amazon_manufacturer != '' and google_manufacturer != '':
            manufacturer_scoring = fuzz.token_set_ratio(
                google_manufacturer, amazon_manufacturer)
        elif amazon_manufacturer != '':
            manufacturer_scoring = fuzz.token_set_ratio(
                google_data, amazon_manufacturer)
        # Check whether the two prices are close.
        close_price = check_close_price(amazon_price, google_price)
        # Combine name similarity, manufacturer similarity and price
        # closeness into the final score of this pair.
        total_scoring = final_scoring(
            simility_scoring, manufacturer_scoring, close_price)
        # Only pairs scoring above 70 are candidates. '>=' keeps the later
        # Amazon row on ties, matching the earlier score-keyed dict, which
        # was overwritten by later entries with the same score.
        if total_scoring > 70 and (
                best_scoring is None or total_scoring >= best_scoring):
            best_scoring = total_scoring
            best_amazon_id = amazon_id
    # Record the best pair; Google products with no candidate above the
    # threshold contribute nothing.
    if best_scoring is not None:
        match = [google_id, best_amazon_id]
        if match not in matches:
            matches.append(match)
# Write every matched (Google id, Amazon id) pair to 'task1a.csv',
# one pair per row and without the DataFrame index column.
column_names = ['idGoogleBase', 'idAmazon']
task1 = pd.DataFrame(data=matches, columns=column_names)
task1.to_csv('task1a.csv', index=False)