-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtest.py
95 lines (79 loc) · 2.94 KB
/
test.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
import re

import matplotlib.pyplot as plt
import numpy as np
import requests
from bs4 import BeautifulSoup
from IMDb import IMDb
"""
This file is used to test out functionality and newly added features.
"""
#url = 'https://www.imdb.com/search/title/?title_type=feature&num_votes=25000,&genres=animation&sort=boxoffice_gross_us,desc'
# Module-level accumulators that main() appends to on every scraped page:
# titles holds the title text, ratings the ratings as floats, and amount the
# gross figures as ints.
titles, ratings, amount = [], [], []
def genre(aGenre):
    """Scrape every result page of an IMDb feature-film search for one genre.

    Builds the search URL for ``aGenre`` (``title_type=feature``,
    ``num_votes=25000,``, sorted by ``boxoffice_gross_us`` descending),
    scrapes the first page with ``main``, asks ``numberOfTitles`` for the
    total number of matches and then scrapes each following page.

    Args:
        aGenre: Genre name inserted verbatim into the ``genres=`` query
            parameter, e.g. ``'western'``.

    Returns:
        None. The scraped data is accumulated by ``main``.
    """
    # The paging arithmetic below assumes the site lists 50 results per page.
    page_size = 50

    base_url = 'https://www.imdb.com/search/title/?title_type=feature&num_votes=25000,&genres='
    # Change this based on how you want to sort the website. For now it is
    # sorted by box office, since some movies don't have box office data.
    sort_suffix = '&sort=boxoffice_gross_us,desc'
    first_page = base_url + aGenre + sort_suffix

    main(first_page)
    numTitles = numberOfTitles(first_page)

    # Pages after the first start at results 51, 101, ...  Stopping at the
    # last start that is <= the total avoids requesting a page past the end
    # of the results (the previous while/if combination always fetched one
    # extra page).
    for start in range(page_size + 1, numTitles + 1, page_size):
        main(first_page + '&start=' + str(start) + '&ref_=adv_nxt')
def main(url):
    """Scrape one search-results page and append its movies to the module lists.

    Downloads ``url``, parses it with BeautifulSoup and, for every listed
    movie that has a gross figure, appends the title text to ``titles``, the
    rating (``data-value`` as float) to ``ratings`` and the gross
    (``data-value`` with commas removed, as int) to ``amount``.

    Movies without a gross figure are skipped entirely so that the three
    lists stay index-aligned.

    Args:
        url: Address of the search-results page to scrape.

    Returns:
        None. The module-level lists are extended in place.

    Raises:
        requests.exceptions.RequestException: If the download fails or
            exceeds the timeout.
    """
    # Without a timeout a stalled connection would block the script forever.
    r = requests.get(url, timeout=30)
    soup = BeautifulSoup(r.text, 'html.parser')

    # getting all of our necessary data
    movieTitles = soup.select('h3.lister-item-header a')
    movieRating = soup.select('div.inline-block.ratings-imdb-rating')
    voteBlocks = soup.select('p.sort-num_votes-visible')

    # NOTE(review): assumes each selector matches exactly once per listed
    # movie, in page order, so that zip() lines the three up - confirm
    # against the live markup.
    for title, rating, block in zip(movieTitles, movieRating, voteBlocks):
        # Votes and gross both live in span[name=nv]; the gross is the second
        # one. Pairing them per movie (instead of across the whole page)
        # keeps a movie that has votes but no gross from shifting every
        # following pair.
        figures = block.select('span[name = nv]')
        if len(figures) < 2:
            continue
        titles.append(title.text)
        ratings.append(float(rating['data-value']))
        amount.append(int(figures[1]['data-value'].replace(',', '')))
#print(numTitles)
# records = []
# for (a, b, c) in zip(titles, ratings, amount):
# records.append((a, b, c))
#
# df = pd.DataFrame(records, columns=('Title', 'Rating', 'Net Gross'))
# df.head()
# df.tail()
# df.to_csv('theData.csv', index=False, encoding='utf-8')
def pairwise(iterable):
    """Group consecutive items of ``iterable`` into non-overlapping 2-tuples.

    ``pairwise('abcd')`` produces ``('a', 'b'), ('c', 'd')``. A trailing item
    without a partner is dropped. Unlike ``itertools.pairwise`` the pairs do
    not overlap.

    Returns:
        A lazy ``zip`` iterator over the pairs.
    """
    shared = iter(iterable)
    # Handing zip the very same iterator twice makes it consume two items
    # for every tuple it builds.
    return zip(*[shared] * 2)
def numberOfTitles(url):
    """Return the total number of titles reported by a search-results page.

    Downloads ``url`` and reads the first ``<div class="desc">`` element,
    whose text is presumably of the form ``"1-50 of 1,234 titles."`` (or just
    ``"23 titles."`` when everything fits on one page - verify against the
    live markup).

    Args:
        url: Address of the search-results page.

    Returns:
        The total as an int, or 0 when the page carries no title count.

    Raises:
        requests.exceptions.RequestException: If the download fails or
            exceeds the timeout.
    """
    # Without a timeout a stalled connection would block the script forever.
    r = requests.get(url, timeout=30)
    soup = BeautifulSoup(r.text, 'html.parser')

    summary = soup.find("div", class_="desc")
    if summary is None:
        return 0

    # Take the number directly in front of the word "title(s)". Anchoring on
    # the word, rather than slicing off a fixed six characters, keeps the
    # digits of a leading "1-50" range out of the result and still works when
    # that range is missing or has a different width.
    match = re.search(r'(\d[\d,]*)\s+titles?', summary.text)
    if match is None:
        return 0
    return int(match.group(1).replace(',', ''))
#numTitles = numbTitles[0]
if __name__ == '__main__':
    # Scrape the western genre, then chart rating against gross.
    genre('western')

    # Build 1-D arrays: wrapping the lists in another list produced (1, n)
    # arrays, which plot() treats as n separate one-point data sets and
    # therefore draws nothing visible.
    x = np.array(ratings)
    y = np.array(amount)

    # The movies are unordered samples, so draw points rather than a line.
    plt.scatter(x, y)
    plt.xlabel('IMDb Rating')
    plt.ylabel('Gross Amount')
    # Without show() the figure is never displayed when run as a script.
    plt.show()
# print(titles)
# print(ratings)
# print(amount)
# print(len(titles))