-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathfonction_scraping_film.py
187 lines (155 loc) · 6.31 KB
/
fonction_scraping_film.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
# -*- coding: utf-8 -*-
"""
Created on Sun Jun 14 15:49:34 2020
@author: Jules
"""
from warnings import warn
import bs4
import requests
import seaborn as sns; sns.set(style="ticks", color_codes=True)
import fonction_scraping_accueil as scrap
#def extraction_movie_data_from_link(link):
def extraction_movie_data_from_link(link, mv_attributs):
'''
Get some information from a movie link
:param1 String link: http url that point to the movie
:param2 tupe mv_attributs : tupe of all the data frome movies
:return tupe mv_attributs: tupe of all the data frome movies
:rtype: tupe of list
'''
page_link = link
response = requests.get(page_link)
html = bs4.BeautifulSoup(response.text, 'html.parser')
nb_genre = 0
#get the movie genres
div = html.find('div', class_="subtext")
for a in div.find_all('a'):
title = a.get('title')
#there is a balise title which we do not want
if title is None:
mv_attributs[7+nb_genre].append(a.text)
nb_genre += 1
if nb_genre == 1:
mv_attributs[8].append(None)
mv_attributs[9].append(None)
nb_genre = 3
if nb_genre == 2:
mv_attributs[9].append(None)
nb_genre = 3
#get the stars acting in the movie
stars = []
nb_act = 0
for credit in html.find_all('div', class_='credit_summary_item'):
#test_stars = False
inline = credit.h4.text
if inline == "Stars:":
for a in credit.find_all('a'):
href = a.get('href')
if href != "fullcredits/":
mv_attributs[10+nb_act].append(a.text)
nb_act += 1
elif inline == "Star:":
for a in credit.find_all('a'):
href = a.get('href')
if href != "fullcredits/":
mv_attributs[10+nb_act].append(a.text)
nb_act += 1
if nb_act == 0:
mv_attributs[10].append(None)
mv_attributs[11].append(None)
mv_attributs[12].append(None)
nb_genre = 3
if nb_act == 1:
mv_attributs[11].append(None)
mv_attributs[12].append(None)
nb_genre = 3
if nb_act == 2:
mv_attributs[12].append(None)
nb_act = 3
award = html.find('div', id='titleAwardsRanks', class_='article highlighted')
test_rank = False
test_nb_oscar = False
test_win = False
test_nom = False
if award is not None:
# movie rank
if award.find('strong') is not None:
strong = award.find('strong').text
rank = scrap.clean_chars(strong)
mv_attributs[13].append(rank)
test_rank = True
# oscars, wins and nominations
if award.find_all('span', class_="awards-blurb") is not None:
test_nb_oscar = False #On en a besoin seulemnt dans la prmemière iteration de la boucle (après il n'y a plus de possibilité de trouver d'Oscar)
for span in award.find_all('span', class_="awards-blurb"):
# if there is/are oscar/s
if span.find('b') is not None: #Uniquement pour prendre l'Oscar
nb_oscar = span.find('b').text
nb_oscar = scrap.clean_chars(nb_oscar)
mv_attributs[14].append(nb_oscar)
test_nb_oscar = True
else:
length = len(span.text)
if length > 40:
win = span.text[:length - 24] #On évite de prendre le nomination qu'on prend après
win = scrap.clean_chars(win)
mv_attributs[15].append(win)
test_win = True
nom = span.text[34:] #On évite de prendre le win qu'on a déja
nom = scrap.clean_chars(nom)
mv_attributs[16].append(nom)
test_nom = True
elif length <= 40 and length > 30:
win = span.text[:length - 24] #On évite de prendre le nomination qu'on prend après
win = scrap.clean_chars(win)
mv_attributs[15].append(win)
test_win = True
nom = span.text[14:] #On évite de prendre le win qu'on a déja
nom = scrap.clean_chars(nom)
mv_attributs[16].append(nom)
test_nom = True
else:
win = None
nom = span.text
nom = scrap.clean_chars(nom)
mv_attributs[16].append(nom)
test_nom = True
if not test_rank:
mv_attributs[13].append(0)
if not test_nb_oscar:
mv_attributs[14].append(0)
if not test_win:
mv_attributs[15].append(0)
if not test_nom:
mv_attributs[16].append(0)
test_runtime=False
test_budget=False
test_gross=False
for div in html.find_all('div', class_="txt-block"):
if div.find('h4', class_='inline') is not None:
inline = div.find('h4', class_='inline').text
# find the runtime in minutes
if inline == "Runtime:":
runtime = div.find('time').text
runtime = scrap.clean_chars(runtime)
mv_attributs[17].append(runtime)
test_runtime = True
# find the movie budget
if inline == "Budget:":
budget = div.text
budget = scrap.clean_chars(budget)
mv_attributs[18].append(budget)
test_budget = True
# find the movie worldwide gross
if inline == "Cumulative Worldwide Gross:":
gross = div.text
gross = scrap.clean_chars(gross)
mv_attributs[19].append(gross)
test_gross = True
if not test_runtime:
mv_attributs[17].append(None)
if not test_budget:
mv_attributs[18].append(None)
if not test_gross:
mv_attributs[19].append(None)
return mv_attributs