-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtask1b.py
166 lines (147 loc) · 5.87 KB
/
task1b.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
import pandas as pd
import textdistance as td
import numpy as np
# function used to only extract number part
# of a given input and output its integer
def get_int(price):
output_price = ''
price = str(price)
for letter in price:
if letter.isnumeric():
output_price+=str(letter)
else:
break
if output_price == '':
output_price = '0'
return int(output_price)
# function used to create a block_key
# based on the price provide
def get_block_price(price):
block_price = ''
if price<=200 and price>=0:
block_price = 'state1 '
block_price += str(price//10)
elif price<=1000 and price>300:
block_price = 'state2 '
block_price += str(price//100)
elif price>1000 and price<=100000:
block_price = 'state3 '
block_price += str(price//1000)
elif price>10000 and price<=1000000:
block_price = 'state4 '
block_price += str(price//10000)
else:
block_price = 'state5 '
block_price += str(price//100000)
return block_price
# function used to determine if number in a string
def hasNumbers(inputString):
return any(char.isdigit() for char in inputString)
# function uesd to find the brand name from the input string
# the first word withour any number is assumed as
# the brand namehere
def get_brand(manufacturer):
res = manufacturer.split()
output_brand = ''
if len(res) == 0:
output_brand = res[0]
else:
for word in res:
if not hasNumbers(word):
output_brand = word
break
return output_brand
# read two csv file which include the products
# need to be blocked
google = pd.read_csv('google.csv',encoding= 'ISO-8859-1')
amazon = pd.read_csv('amazon.csv',encoding= 'ISO-8859-1')
# make a copy and only extract the useful data
amazon_new = amazon.copy()
google_new = google.copy()
amazon_new = amazon_new[['title','idAmazon','price','manufacturer']]
google_new = google_new[['name','id','price','manufacturer']]
# replace the nan in price column
# by empty string
google_new['price'] = google_new['price'].replace(np.nan, '', regex=True)
amazon_new['price'] = amazon_new['price'].replace(np.nan,'', regex = True)
# create a new column in both data set which records the interge part
# of the product price(would be used for blocking later)
price_list = []
for price in google_new.loc[:,'price']:
price_list.append(get_int(price))
google_new.loc[:,'Intprice'] = price_list
price_list1 = []
for price in amazon_new.loc[:,'price']:
price_list1.append(get_int(price))
amazon_new.loc[:,'Intprice'] = price_list1
# replace the nan column to '' in the
# manufacturer column in google_new
google_new['manufacturer'] = google_new['manufacturer'].replace(np.nan, '', regex=True)
# figure out two block keys for each product in
# google, one is based on product's price
# another is based on its 'manufacturer'
all_block_google = []
for google_id in google_new['id']:
match_price = []
match_brand = []
google_price = google_new.loc[google_new['id'] == google_id, 'Intprice'].iloc[0]
block_price = get_block_price(google_price)
# store the product id and its corresponding
# block key(based on price) into a list
match_price.append(block_price)
match_price.append(google_id)
all_block_google.append(match_price)
# find the block key based on product's manufacturer
# and store in to a list
if google_new.loc[google_new['id'] == google_id, 'manufacturer'].iloc[0] != '':
google_manufacturer = google_new.loc[google_new['id'] == google_id, 'manufacturer'].iloc[0]
brand = get_brand(google_manufacturer)
match_brand.append(brand)
match_brand.append(google_id)
all_block_google.append(match_brand)
else:
google_name = google_new.loc[google_new['id'] == google_id, 'name'].iloc[0]
brand = get_brand(google_name)
match_brand.append(brand)
match_brand.append(google_id)
all_block_google.append(match_brand)
# figure out two block keys for each product in
# amazon, one is based on product's price
# another is based on its 'manufacturer'
all_block_amazon = []
for amazon_id in amazon_new['idAmazon']:
match_price = []
match_brand = []
amazon_price = amazon_new.loc[amazon_new['idAmazon'] == amazon_id, 'Intprice'].iloc[0]
block_price = get_block_price(amazon_price)
# store the product id and its corresponding
# block key into a list
match_price.append(block_price)
match_price.append(amazon_id)
all_block_amazon.append(match_price)
# find the block key based on product's manufacturer
# and store in to a list
if amazon_new.loc[amazon_new['idAmazon'] == amazon_id, 'manufacturer'].iloc[0] != '':
amazon_manufacturer = amazon_new.loc[amazon_new['idAmazon'] == amazon_id, 'manufacturer'].iloc[0]
brand = get_brand(amazon_manufacturer)
match_brand.append(brand)
match_brand.append(amazon_id)
all_block_amazon.append(match_brand)
else:
amazon_name = amazon_new.loc[amazon_new['id']== amazon_id,'title'].iloc[0]
brand = get_brand(google_name)
match_brand.append(brand)
match_brand.append(amazon_id)
all_block_amazon.append(match_brand)
# store all the google product ids and its
# block key as dataframe and convert to csv file
# called 'google_blocks.csv'
column_names_google = ['block_key','product_id']
task1b_google = pd.DataFrame(all_block_google, columns = column_names_google,)
task1b_google.to_csv('google_blocks.csv',index = False)
# store all the amazon product ids and its
# block key as dataframe and convert to csv file
# called 'amazon_blocks.csv'
column_names_amazon = ['block_key','product_id']
task1b_amazon = pd.DataFrame(all_block_amazon, columns = column_names_amazon,)
task1b_amazon.to_csv('amazon_blocks.csv',index = False)