diff --git a/filmweb/getter.py b/filmweb/getter.py
index cfa8314..23ed764 100644
--- a/filmweb/getter.py
+++ b/filmweb/getter.py
@@ -3,16 +3,21 @@
 HEADERS = {
     # https://www.whatismybrowser.com/guides/the-latest-user-agent/firefox
-    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 13.4; rv:109.0) Gecko/20100101 Firefox/113.0",
+    "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:120.0) Gecko/20100101 Firefox/120.0",
     "x-locale": "pl_PL",
     "Host": "www.filmweb.pl",
-    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
+    "Accept": "*/*",
     "Accept-Language": "en-US,en;q=0.5",
     "Accept-Encoding": "gzip, deflate, br",
     "Origin": "https://www.filmweb.pl",
     "DNT": "1",
     "Connection": "keep-alive",
     "Upgrade-Insecure-Requests": "1",
+    "Sec-Fetch-Dest": "empty",
+    "Sec-Fetch-Mode": "cors",
+    "Sec-Fetch-Site": "same-origin",
+    "Sec-GPC": "1",
+    "TE": "trailers",
 }
 
 def get_films_page(args):
@@ -20,12 +25,13 @@ def get_films_page(args):
     """
     request films page
     """
     # this workaround is necessary because multiprocessing imap takes one arg only
-    (cookie, user, n) = args
-    url = f"https://www.filmweb.pl/user/{user}/films"
-    params = {"page": n}
-    response = requests.get(url, params=params, headers={"Cookie": cookie, **HEADERS})
-    response.raise_for_status()
-    return response.text
+    (cookie, user, friend_query, n) = args
+    if friend_query:
+        url = f"https://www.filmweb.pl/api/v1/logged/friend/{user}/vote/title/film?page={n}"
+    else:
+        url = f"https://www.filmweb.pl/api/v1/logged/vote/title/film?page={n}"
+    data = _get_json(url, cookie, "get_films_page")
+    return json.dumps(data)
 
 def auth_check(cookie):
     """
@@ -38,33 +44,20 @@
 def get_votes_count(user):
     """
     Get total count of votes
     Args:
         user: user to get ratings for
     """
     url = f"https://www.filmweb.pl/api/v1/user/{user}/votes/film/count"
     return _get_json(url, "", "get_votes_count")
 
-def get_user_rating(args):
-    """
-    Gets user rating
-    """
-    (cookie, movie_id, user, friend_query) = args
-    if friend_query:
-        url = f"https://www.filmweb.pl/api/v1/logged/friend/{user}/vote/film/{movie_id}/details"
-    else:
-        url = f"https://www.filmweb.pl/api/v1/logged/vote/film/{movie_id}/details"
-    data = _get_json(url, cookie, "get_user_rating")
-    data["movie_id"] = movie_id
-    return json.dumps(data)
-
 def get_global_info(movie_id):
     """
     Get info about a movie (title etc)
     """
     url = f"https://www.filmweb.pl/api/v1/title/{movie_id}/info"
     data = _get_json(url, "", "get_global_info")
-    data["movie_id"] = movie_id
+    data["entity"] = movie_id
     return json.dumps(data)
 
 def get_global_rating(movie_id):
@@ -73,7 +66,7 @@ def get_global_rating(movie_id):
     """
     url = f"https://www.filmweb.pl/api/v1/film/{movie_id}/rating"
     data = _get_json(url, "", "get_global_rating")
-    data["movie_id"] = movie_id
+    data["entity"] = movie_id
    data["global_rate"] = data.pop("rate")
     return json.dumps(data)
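Note: get_films_page now returns one JSON-encoded page of vote objects instead of raw HTML. A minimal sketch of the behaviour the change relies on, assuming the endpoint returns a JSON array with the film id under "entity" (the fetch_votes_page helper, the trimmed headers, and the sample payload are illustrative, not part of the patch; _get_json is approximated here with plain requests calls):

import json
import requests

HEADERS = {"x-locale": "pl_PL"}  # stand-in for the full HEADERS dict above

def fetch_votes_page(cookie, page, friend=None):
    # Same URL selection as the new get_films_page: the friend variant reads
    # another user's votes, the plain variant reads your own.
    if friend:
        url = f"https://www.filmweb.pl/api/v1/logged/friend/{friend}/vote/title/film?page={page}"
    else:
        url = f"https://www.filmweb.pl/api/v1/logged/vote/title/film?page={page}"
    response = requests.get(url, headers={"Cookie": cookie, **HEADERS})
    response.raise_for_status()
    # Assumed payload shape: [{"entity": 628, "rate": 8, ...}, ...]
    return json.dumps(response.json())  # re-serialized so pool results stay picklable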
diff --git a/filmweb/main.py b/filmweb/main.py
index 92423a6..f542fc5 100644
--- a/filmweb/main.py
+++ b/filmweb/main.py
@@ -21,7 +21,7 @@
 from . import parser
 
 PARALLEL_PROC = multiprocessing.cpu_count()
-MOVIES_PER_PAGE = 25
+MOVIES_PER_PAGE = 100
 FORMATS = {"csv", "json", "letterboxd"}
 
 def main():
@@ -43,23 +43,19 @@
         pages = ceil(votes_total/MOVIES_PER_PAGE)
         logged_in_user = getter.auth_check(cookie)
         friend_query = (user != logged_in_user)
-        logging.info("Fetching list of movies [1/6]...")
-        get_films_page_args = ((cookie, user, page) for page in range(1, pages+1))
-        raw_responses = tuple(tqdm.tqdm(pool.imap_unordered(getter.get_films_page, get_films_page_args), total=pages))
-        logging.info("Parsing list of movies [2/6]...")
-        ids = tuple(tqdm.tqdm(pool.imap_unordered(parser.extract_movie_ids, raw_responses), total=pages))
-        ids = tuple(set(itertools.chain.from_iterable((json.loads(el) for el in ids))))
-        total_movies = len(ids)
+        logging.info("Fetching list of movies [1/4]...")
+        get_films_page_args = ((cookie, user, friend_query, page) for page in range(1, pages+1))
+        user_ratings_raw = tuple(tqdm.tqdm(pool.imap_unordered(getter.get_films_page, get_films_page_args), total=pages))
+        user_ratings = tuple(itertools.chain.from_iterable((json.loads(el) for el in user_ratings_raw)))
+        total_movies = len(user_ratings)
         logging.info(f"User {user} has {total_movies} movies...")
         assert total_movies, "No movies available"
-        logging.info("Fetching user ratings [3/6]...")
-        get_user_rating_args = ((cookie, movie_id, user, friend_query) for movie_id in ids)
-        user_ratings = tuple(tqdm.tqdm(pool.imap_unordered(getter.get_user_rating, get_user_rating_args), total=total_movies))
-        logging.info("Fetching info about movies [4/6]...")
+        ids = tuple(el.get("entity") for el in user_ratings)
+        logging.info("Fetching info about movies [2/4]...")
         global_info = tuple(tqdm.tqdm(pool.imap_unordered(getter.get_global_info, ids), total=total_movies))
-        logging.info("Fetching global rating for movies [5/6]...")
+        logging.info("Fetching global rating for movies [3/4]...")
         global_rating = tuple(tqdm.tqdm(pool.imap_unordered(getter.get_global_rating, ids), total=total_movies))
-        logging.info("Writing data [6/6]...")
+        logging.info("Writing data [4/4]...")
         movies = parser.merge_data(ids, user_ratings, global_info, global_rating)
         parser.write_data(movies, user, formats)
     except Exception as e:
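Note: raising MOVIES_PER_PAGE from 25 to 100 and dropping the per-movie detail requests cuts the pipeline from six steps to four. A toy, offline run of the new aggregation step, with hard-coded strings standing in for the pool workers' JSON pages (the film ids and counts are made up):

import itertools
import json
from math import ceil

MOVIES_PER_PAGE = 100
votes_total = 237
pages = ceil(votes_total / MOVIES_PER_PAGE)  # 3 page requests; at 25 per page it was 10

# Each pool worker returns one JSON-encoded page, as get_films_page now does
user_ratings_raw = ('[{"entity": 628, "rate": 8}]', '[{"entity": 1930, "rate": 9}]')
user_ratings = tuple(itertools.chain.from_iterable(json.loads(el) for el in user_ratings_raw))
ids = tuple(el.get("entity") for el in user_ratings)
print(ids)  # (628, 1930)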
diff --git a/filmweb/parser.py b/filmweb/parser.py
index 1f77ecc..e90af7e 100644
--- a/filmweb/parser.py
+++ b/filmweb/parser.py
@@ -2,7 +2,6 @@
 import logging
 import json
 from datetime import datetime
-from bs4 import BeautifulSoup
 from urllib.parse import quote_plus
 
 KEY_MAPPING = {
@@ -14,23 +13,12 @@
     "originalTitle": "original_title",
     "title": "pl_title",
     "year": "year",
-    "movie_id": "movie_id",
+    "entity": "movie_id",
     "url": "url",
     "date": "date",
 }
 
 # TODO? country/genre info not visible in api, would need to parse htmls
-
-def extract_movie_ids(content):
-    """
-    Extract movie ids from films page
-    Args:
-        content: raw html
-    """
-    soup = BeautifulSoup(content, "html.parser")
-    id_containers = soup.find_all("div", attrs={"data-film-id": True})
-    ids = set(el["data-film-id"] for el in id_containers)
-    # necessary for multiprocessing pickle to work
-    return json.dumps(list(ids))
+# TODO? countVote1 and so on keys ignored for now, histogram info?
 
 def merge_data(ids, user_ratings, global_info, global_rating):
     """
@@ -45,7 +33,7 @@ def _movie_id_key(data):
     Parse and reformat data into dict with movie_id as key
     """
     data = (json.loads(el) for el in data)
-    return {entry["movie_id"]: entry for entry in data}
+    return {entry["entity"]: entry for entry in data}
 
 def _fix_keys(entry):
     """
diff --git a/requirements.txt b/requirements.txt
index 9410fdf..715a522 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,9 +1,7 @@
-beautifulsoup4==4.12.2
 certifi==2023.11.17
 charset-normalizer==3.3.2
 docopt==0.6.2
 idna==3.6
 requests==2.31.0
-soupsieve==2.5
 tqdm==4.66.1
 urllib3==2.1.0
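Note: with "entity" now the common id field across user ratings, title info, and global ratings, merge_data can key all three data sets on it, and KEY_MAPPING renames it back to movie_id for the output. A condensed illustration with made-up records (index_by_entity mirrors the private _movie_id_key; the final dict comprehension only sketches what merge_data does across its inputs):

import json

def index_by_entity(data):
    # Same reshaping as parser._movie_id_key: JSON strings -> dict keyed by film id
    entries = (json.loads(el) for el in data)
    return {entry["entity"]: entry for entry in entries}

ratings = index_by_entity(['{"entity": 628, "rate": 8}'])
info = index_by_entity(['{"entity": 628, "title": "Matrix", "year": 1999}'])
merged = {mid: {**ratings[mid], **info[mid]} for mid in ratings}
print(merged)  # {628: {'entity': 628, 'rate': 8, 'title': 'Matrix', 'year': 1999}}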