adjust to new filmweb app
ppatrzyk committed Dec 7, 2023
1 parent f7cd601 commit d127a67
Showing 4 changed files with 30 additions and 55 deletions.
41 changes: 17 additions & 24 deletions filmweb/getter.py
@@ -3,29 +3,35 @@

HEADERS = {
# https://www.whatismybrowser.com/guides/the-latest-user-agent/firefox
"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 13.4; rv:109.0) Gecko/20100101 Firefox/113.0",
"User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:120.0) Gecko/20100101 Firefox/120.0",
"x-locale": "pl_PL",
"Host": "www.filmweb.pl",
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
"Accept": "*/*",
"Accept-Language": "en-US,en;q=0.5",
"Accept-Encoding": "gzip, deflate, br",
"Origin": "https://www.filmweb.pl",
"DNT": "1",
"Connection": "keep-alive",
"Upgrade-Insecure-Requests": "1",
"Sec-Fetch-Dest": 'empty',
"Sec-Fetch-Mode": "cors",
"Sec-Fetch-Site": "same-origin",
"Sec-GPC": "1",
"TE": "trailers",
}

def get_films_page(args):
"""
request films page
"""
# this workaround is necessary because multiprocessing imap takes one arg only
(cookie, user, n) = args
url = f"https://www.filmweb.pl/user/{user}/films"
params = {"page": n}
response = requests.get(url, params=params, headers={"Cookie": cookie, **HEADERS})
response.raise_for_status()
return response.text
(cookie, user, friend_query, n) = args
if friend_query:
url = f"https://www.filmweb.pl/api/v1/logged/friend/{user}/vote/title/film?page={n}"
else:
url = f"https://www.filmweb.pl/api/v1/logged/vote/title/film?page={n}"
data = _get_json(url, cookie, "get_films_page")
return json.dumps(data)
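
Since multiprocessing's imap variants pass exactly one argument to the worker, the caller packs everything into a single tuple, as the comment above notes. A minimal sketch of calling the new function directly; the cookie string and username here are illustrative placeholders, not values from the commit:

import json
from filmweb import getter

# (cookie, user, friend_query, page) -- placeholder values
args = ("JSESSIONID=abc123", "some_user", False, 1)
page_json = getter.get_films_page(args)
votes = json.loads(page_json)  # presumably a list of vote dicts from the filmweb API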

def auth_check(cookie):
"""
@@ -38,33 +44,20 @@ def auth_check(cookie):

def get_votes_count(user):
"""
Get total count of votes
Get total count of votes (https://www.filmweb.pl/api/v1/user/{user}/votes/film/count)
Args:
user: user to get ratings for
"""
url = f"https://www.filmweb.pl/api/v1/user/{user}/votes/film/count"
return _get_json(url, "", "get_votes_count")

def get_user_rating(args):
"""
Gets user rating
"""
(cookie, movie_id, user, friend_query) = args
if friend_query:
url = f"https://www.filmweb.pl/api/v1/logged/friend/{user}/vote/film/{movie_id}/details"
else:
url = f"https://www.filmweb.pl/api/v1/logged/vote/film/{movie_id}/details"
data = _get_json(url, cookie, "get_user_rating")
data["movie_id"] = movie_id
return json.dumps(data)

def get_global_info(movie_id):
"""
Get info about a movie (title etc)
"""
url = f"https://www.filmweb.pl/api/v1/title/{movie_id}/info"
data = _get_json(url, "", "get_global_info")
data["movie_id"] = movie_id
data["entity"] = movie_id
return json.dumps(data)

def get_global_rating(movie_id):
@@ -73,7 +66,7 @@ def get_global_rating(movie_id):
"""
url = f"https://www.filmweb.pl/api/v1/film/{movie_id}/rating"
data = _get_json(url, "", "get_global_rating")
data["movie_id"] = movie_id
data["entity"] = movie_id
data["global_rate"] = data.pop("rate")
return json.dumps(data)
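
The `_get_json` helper is called throughout this file but its definition lies outside the changed hunks. A minimal sketch of what it presumably does, inferred from the call sites and the removed requests.get call above (the repository's actual body may differ):

import logging
import requests

def _get_json(url, cookie, caller):
    # hypothetical reconstruction: GET with the module's shared HEADERS plus the
    # session cookie; `caller` presumably just labels failures for logging
    response = requests.get(url, headers={"Cookie": cookie, **HEADERS})
    if response.status_code != 200:
        logging.error(f"{caller} failed with HTTP {response.status_code}")
        response.raise_for_status()
    return response.json()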

24 changes: 10 additions & 14 deletions filmweb/main.py
@@ -21,7 +21,7 @@
from . import parser

PARALLEL_PROC = multiprocessing.cpu_count()
MOVIES_PER_PAGE = 25
MOVIES_PER_PAGE = 100
FORMATS = {"csv", "json", "letterboxd"}

def main():
@@ -43,23 +43,19 @@ def main():
pages = ceil(votes_total/MOVIES_PER_PAGE)
logged_in_user = getter.auth_check(cookie)
friend_query = (user != logged_in_user)
logging.info("Fetching list of movies [1/6]...")
get_films_page_args = ((cookie, user, page) for page in range(1, pages+1))
raw_responses = tuple(tqdm.tqdm(pool.imap_unordered(getter.get_films_page, get_films_page_args), total=pages))
logging.info("Parsing list of movies [2/6]...")
ids = tuple(tqdm.tqdm(pool.imap_unordered(parser.extract_movie_ids, raw_responses), total=pages))
ids = tuple(set(itertools.chain.from_iterable((json.loads(el) for el in ids))))
total_movies = len(ids)
logging.info("Fetching list of movies [1/4]...")
get_films_page_args = ((cookie, user, friend_query, page) for page in range(1, pages+1))
user_ratings_raw = tuple(tqdm.tqdm(pool.imap_unordered(getter.get_films_page, get_films_page_args), total=pages))
user_ratings = tuple(itertools.chain.from_iterable((json.loads(el) for el in user_ratings_raw)))
total_movies = len(user_ratings)
logging.info(f"User {user} has {total_movies} movies...")
assert total_movies, "No movies available"
logging.info("Fetching user ratings [3/6]...")
get_user_rating_args = ((cookie, movie_id, user, friend_query) for movie_id in ids)
user_ratings = tuple(tqdm.tqdm(pool.imap_unordered(getter.get_user_rating, get_user_rating_args), total=total_movies))
logging.info("Fetching info about movies [4/6]...")
ids = tuple(el.get("entity") for el in user_ratings)
logging.info("Fetching info about movies [2/4]...")
global_info = tuple(tqdm.tqdm(pool.imap_unordered(getter.get_global_info, ids), total=total_movies))
logging.info("Fetching global rating for movies [5/6]...")
logging.info("Fetching global rating for movies [3/4]...")
global_rating = tuple(tqdm.tqdm(pool.imap_unordered(getter.get_global_rating, ids), total=total_movies))
logging.info("Writing data [6/6]...")
logging.info("Writing data [4/4]...")
movies = parser.merge_data(ids, user_ratings, global_info, global_rating)
parser.write_data(movies, user, formats)
except Exception as e:
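The pipeline shrinks from six stages to four because the paginated votes endpoint now returns the user's ratings directly, removing the per-movie rating fetch and the HTML id-scraping step. A rough request-count comparison for a user with 500 rated films, using the constants from this diff (25 movies per page before, 100 after):

from math import ceil

votes_total = 500  # example value, not from the commit
# old flow: list pages (25/page) + per-movie user rating, info, global rating
old_requests = ceil(votes_total / 25) + 3 * votes_total   # 20 + 1500 = 1520
# new flow: list pages (100/page, ratings included) + per-movie info, global rating
new_requests = ceil(votes_total / 100) + 2 * votes_total  # 5 + 1000 = 1005
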
18 changes: 3 additions & 15 deletions filmweb/parser.py
Expand Up @@ -2,7 +2,6 @@
import logging
import json
from datetime import datetime
from bs4 import BeautifulSoup
from urllib.parse import quote_plus

KEY_MAPPING = {
@@ -14,23 +13,12 @@
"originalTitle": "original_title",
"title": "pl_title",
"year": "year",
"movie_id": "movie_id",
"entity": "movie_id",
"url": "url",
"date": "date",
}
# TODO? country/genre info not visible in the API, would need to parse HTML

def extract_movie_ids(content):
"""
Extract movie ids from films page
Args:
content: raw html
"""
soup = BeautifulSoup(content, "html.parser")
id_containers = soup.find_all("div", attrs={"data-film-id": True})
ids = set(el["data-film-id"] for el in id_containers)
# necessary for multiprocessing pickle to work
return json.dumps(list(ids))
# TODO? countVote1 and similar keys ignored for now, histogram info?

def merge_data(ids, user_ratings, global_info, global_rating):
"""
@@ -45,7 +33,7 @@ def _movie_id_key(data):
Parse and reformat data into dict with movie_id as key
"""
data = (json.loads(el) for el in data)
return {entry["movie_id"]: entry for entry in data}
return {entry["entity"]: entry for entry in data}
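
merge_data's body is collapsed in this diff; given `_movie_id_key` and the call in main.py, it presumably joins the three sources per movie id and then normalizes keys via `_fix_keys` (sketched below). A hypothetical reconstruction, not the repository's exact code:

def merge_data(ids, user_ratings, global_info, global_rating):
    # hypothetical reconstruction: index each source by movie id, then merge
    # the three records for every id into a single flat dict
    info = _movie_id_key(global_info)
    rating = _movie_id_key(global_rating)
    # in the new main.py, user_ratings are already parsed dicts keyed by "entity"
    ratings = {entry["entity"]: entry for entry in user_ratings}
    return [_fix_keys({**ratings[i], **info[i], **rating[i]}) for i in ids]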

def _fix_keys(entry):
"""
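`_fix_keys`'s body is also collapsed here; given KEY_MAPPING above (note the movie_id to entity source-key change in this commit), it presumably renames the raw API keys to the output schema. A sketch under that assumption:

def _fix_keys(entry):
    # hypothetical reconstruction: keep only the keys listed in KEY_MAPPING,
    # renaming each to its output name (e.g. "entity" becomes "movie_id")
    return {new_key: entry[old_key] for old_key, new_key in KEY_MAPPING.items() if old_key in entry}
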
2 changes: 0 additions & 2 deletions requirements.txt
@@ -1,9 +1,7 @@
beautifulsoup4==4.12.2
certifi==2023.11.17
charset-normalizer==3.3.2
docopt==0.6.2
idna==3.6
requests==2.31.0
soupsieve==2.5
tqdm==4.66.1
urllib3==2.1.0
