-
Notifications
You must be signed in to change notification settings - Fork 13
/
Copy pathscrape_images_from_alibaba.py
81 lines (62 loc) · 2 KB
/
scrape_images_from_alibaba.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
import argparse
import os
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import *
# Command-line options: the directory reserved for downloaded images and the
# number of search-result pages to scan.
parser = argparse.ArgumentParser()
parser.add_argument('--saveto', help='Target directory to save the Images (default: images/)', action='store', dest='dirName')
parser.add_argument('--pages', help='No. of pages (default: 40)', action='store', dest='last_pagination')
args = parser.parse_args()

# dirName is used as a path prefix, so it always ends with '/'.
dirName = str(args.dirName) + '/' if args.dirName else 'images/'

# last_pagination is the *exclusive* upper bound of range(1, last_pagination),
# so it must be one more than the number of pages requested. The default used
# to be a bare 40, which silently scanned only 39 pages.
if args.last_pagination:
    last_pagination = int(args.last_pagination) + 1
else:
    last_pagination = 40 + 1

# makedirs also creates missing parent directories and, with exist_ok, avoids
# the check-then-create race of os.path.exists() followed by os.mkdir().
os.makedirs(dirName, exist_ok=True)
# Browser-like headers sent with every results-page request.
headers_std = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.80 Safari/537.36',
    'Content-Type': 'text/html',
}

# Ask for the search URL and the category label stored next to every image
# URL. The search URL is not fetched here: each paginated URL built below is
# requested (with headers) by the scraping loop, so an extra header-less
# request whose response was thrown away has been dropped.
link = input('Search Query URL: ').strip()
item_category = input('Enter the image category')

# One URL per results page. Use '?' when the search URL has no query string
# yet, otherwise '&', so the page parameter is always well-formed.
page_sep = '&' if '?' in link else '?'
urls = [link + page_sep + 'page=' + str(x) for x in range(1, last_pagination)]
# Unique image URLs collected across all result pages.
# NOTE: images are not downloaded; only their URLs are recorded.
img_urls = set()
for pag, page_url in enumerate(urls, start=1):
    print('Pagination : ' + str(pag))
    try:
        # requests never times out by default; without a timeout a stalled
        # connection would hang the whole run.
        req = requests.get(page_url, headers=headers_std, timeout=30).text
    except requests.RequestException as err:
        # Skip the failed page instead of losing everything collected so far.
        print('Skipping page ' + str(pag) + ': ' + str(err))
        continue
    soup = BeautifulSoup(req, 'lxml')
    # find_all already restricts the result to <img> tags that carry a src
    # attribute, so no further check for 'src' is needed.
    for img in soup.find_all('img', {'src': True}):
        src = img['src'].strip()
        if not src or src.startswith('data:'):
            # Inline data URIs and empty values are not fetchable image links.
            continue
        if src.startswith('//'):
            # Protocol-relative URL: pin it to https.
            url = 'https:' + src
        else:
            # Absolute URLs pass through unchanged; relative ones are resolved
            # against the page they were found on.
            url = urljoin(page_url, src)
        img_urls.add(url)
# Append the collected image URLs, labelled with the chosen category, to
# data.csv (created on first run).
# Sorting makes the row order reproducible; set iteration order is not.
urls = sorted(img_urls)
categories = [item_category] * len(urls)

try:
    df1 = pd.read_csv('data.csv')
except (FileNotFoundError, pd.errors.EmptyDataError):
    # No previous results (missing or completely empty file): start fresh.
    # Any other error (e.g. a malformed file) is left to surface rather than
    # silently overwriting existing data.
    df1 = pd.DataFrame()

df2 = pd.DataFrame({'url': urls, 'category': categories})

# concat both the dataframes
df = pd.concat([df1, df2], ignore_index=True)

# Row counts: previously stored, newly scraped, combined.
print(len(df1))
print(len(df2))
print(len(df))

df.to_csv('data.csv', index=False)