From 6cba1ee909c2dd575a5e1200952d43ff693c3549 Mon Sep 17 00:00:00 2001 From: Julien Pontoire Date: Thu, 5 Dec 2024 16:45:08 +0100 Subject: [PATCH 01/47] Draft reddit --- minet/cli/commands.py | 2 + minet/cli/reddit/__init__.py | 49 +++++++++++++++++ minet/cli/reddit/posts.py | 44 +++++++++++++++ minet/reddit/scraper.py | 103 +++++++++++++++++++++++++++++++++++ minet/reddit/types.py | 32 +++++++++++ 5 files changed, 230 insertions(+) create mode 100644 minet/cli/reddit/__init__.py create mode 100644 minet/cli/reddit/posts.py create mode 100644 minet/reddit/scraper.py create mode 100644 minet/reddit/types.py diff --git a/minet/cli/commands.py b/minet/cli/commands.py index 4202252a19..6200e990bb 100644 --- a/minet/cli/commands.py +++ b/minet/cli/commands.py @@ -14,6 +14,7 @@ from minet.cli.hyphe import HYPHE_COMMAND from minet.cli.instagram import INSTAGRAM_COMMAND from minet.cli.mediacloud import MEDIACLOUD_COMMAND +from minet.cli.reddit import REDDIT_COMMAND from minet.cli.telegram import TELEGRAM_COMMAND from minet.cli.tiktok import TIKTOK_COMMAND from minet.cli.twitter import TWITTER_COMMAND @@ -42,6 +43,7 @@ HYPHE_COMMAND, INSTAGRAM_COMMAND, MEDIACLOUD_COMMAND, + REDDIT_COMMAND, TELEGRAM_COMMAND, TIKTOK_COMMAND, TWITTER_COMMAND, diff --git a/minet/cli/reddit/__init__.py b/minet/cli/reddit/__init__.py new file mode 100644 index 0000000000..3301a6ee5e --- /dev/null +++ b/minet/cli/reddit/__init__.py @@ -0,0 +1,49 @@ +# ============================================================================= +# Minet Reddit CLI Action +# ============================================================================= +# +# Logic of the `rd` action. +# +from casanova import RowCountResumer + +from minet.cli.argparse import command, ConfigAction + +REDDIT_POSTS_SUBCOMMAND = command( + "posts", + "minet.cli.reddit.posts", + title="Minet Reddit Posts Command", + description=""" + Retrieve reddit posts from a subreddit link. + """, + epilog=""" + Example: + + . Searching posts from the subreddit r/france: + $ minet reddit posts https://www.reddit.com/r/france > r_france_posts.csv + """, + variadic_input= { + "dummy_column": "post", + "item_label": "post url, post shortcode or post id", + "item_label_plural": "post urls, post shortcodes or post ids", + }, + arguments=[ + { + "flags": ["-n", "--number"], + "help": "Number of posts to retrieve.", + "type": int, + } + ], +) + +REDDIT_COMMAND = command( + "reddit", + "minet.cli.reddit", + "Minet Reddit Command", + aliases=["rd"], + description=""" + Collect data from Reddit. + """, + subcommands=[ + REDDIT_POSTS_SUBCOMMAND, + ], +) \ No newline at end of file diff --git a/minet/cli/reddit/posts.py b/minet/cli/reddit/posts.py new file mode 100644 index 0000000000..bad56c15cb --- /dev/null +++ b/minet/cli/reddit/posts.py @@ -0,0 +1,44 @@ +# ============================================================================= +# Minet Reddit Posts CLI Action +# ============================================================================= +# +# Logic of the `rd posts` action. 
+# +from minet.cli.utils import with_enricher_and_loading_bar +from minet.reddit.scraper import RedditScraper +from minet.reddit.types import RedditPost + + + +@with_enricher_and_loading_bar( + headers={"post_url"}, + title="Scraping posts", + unit="groups", + nested=True, + sub_unit="posts", +) +def action(cli_args, enricher, loading_bar): + scraper = RedditScraper() + + for i, row, url in enricher.enumerate_cells( + cli_args.column, with_rows=True, start=1 + ): + with loading_bar.step(url): + try: + if cli_args.number: + posts = scraper.get_posts_urls(url, cli_args.number) + else: + posts = scraper.get_posts_urls(url) + except : + loading_bar.print( + "problème" + ) + continue + + list_posts = [] + for post in posts: + list_posts.append({post}) + + for post in list_posts: + loading_bar.nested_advance() + enricher.writerow(row, post) diff --git a/minet/reddit/scraper.py b/minet/reddit/scraper.py new file mode 100644 index 0000000000..624c421b6e --- /dev/null +++ b/minet/reddit/scraper.py @@ -0,0 +1,103 @@ +from minet.web import request, create_pool_manager +from math import ceil +from ural import get_domain_name, urlpathsplit, is_url +from time import sleep +from minet.reddit.types import RedditPost +import json +from ebbe import getpath +from collections import deque +from urllib.parse import urljoin +import csv +import re +import sys +import os + +def get_old_url(url): + domain = get_domain_name(url) + path = urlpathsplit(url) + return f"https://old.{domain}/" + "/".join(path) + "/" + + +def get_new_url(url): + domain = get_domain_name(url) + path = urlpathsplit(url) + return f"https://www.{domain}/" + "/".join(path) + "/" + +def reddit_request(url, pool_manager): + sleep(1) + response = request(url, pool_manager=pool_manager) + remaining_requests = float(response.headers["x-ratelimit-remaining"]) + if remaining_requests == 1: + time_remaining = int(response.headers["x-ratelimit-reset"]) + print(f"Time before next request : {time_remaining}s") + sleep(time_remaining) + return reddit_request(url) + if response.status == 429: + return reddit_request(url) + return response + + +class RedditScraper(object): + def __init__(self): + self.pool_manager = create_pool_manager() + + def get_posts_urls(self, url, nb_post = 25): + dir_name = urlpathsplit(url)[1] + try: + os.mkdir(dir_name) + except FileExistsError: + pass + except PermissionError: + print(f"Permission denied: Unable to create '{dir_name}'.") + except Exception as e: + print(f"An error occurred: {e}") + list_posts = set() + nb_pages = ceil(int(nb_post) / 25) + old_url = get_old_url(url) + n_crawled = 0 + for _ in range(nb_pages): + if n_crawled == int(nb_post): + break + response = reddit_request(old_url, self.pool_manager) + soup = response.soup() + list_buttons = soup.select("ul[class='flat-list buttons']") + for link in list_buttons: + if n_crawled == int(nb_post): + break + if len(link.scrape("span[class='promoted-span']")) == 0: + list_posts.update(link.scrape("a[class^='bylink comments']", "href")) + n_crawled += 1 + old_url = soup.scrape("span[class='next-button'] a", "href")[0] + return list(list_posts) + + + def get_posts(self, url, nb_post): + posts = [] + list_posts_url = self.get_posts_urls(self, url, nb_post) + for url in list_posts_url: + response = reddit_request(url, self.pool_manager) + if response.url == 429: + print(response.headers) + print(response.end_url) + soup = response.soup() + title = soup.force_select_one("a[class^='title']").get_text() + upvote = soup.force_select_one("div[class='score'] 
span").get_text() + author = soup.scrape_one("a[class^='author']", "href") + published_date = soup.scrape_one("div[class='date'] time", "datetime") + link = soup.scrape_one("a[class^='title']", "href") + if urlpathsplit(link) == urlpathsplit(url): + link = None + author_text = soup.scrape_one( + "div[id='siteTable'] div[class^='usertext-body'] div p" + ) + post = RedditPost( + title=title, + url=url, + author=author, + author_text=author_text, + upvote=upvote, + published_date=published_date, + link=link, + ) + posts.append(post) + return posts \ No newline at end of file diff --git a/minet/reddit/types.py b/minet/reddit/types.py new file mode 100644 index 0000000000..b918f72eab --- /dev/null +++ b/minet/reddit/types.py @@ -0,0 +1,32 @@ +from typing import List, Optional, Dict, Tuple, Iterable +from datetime import datetime + +from dataclasses import dataclass +from casanova import TabularRecord +from ebbe import getpath + + +@dataclass +class RedditPost(TabularRecord): + title: str + url: str + author: str + author_text: str + upvote: str + published_date: str + link: Optional[str] + + +@dataclass +class RedditComment(TabularRecord): + # url: str + # author: str + id: str + parent: str + # points: Optional[str] + # published_date: str + comment: str + + + + From 831537e106c2dfd2c63e7d6886a7aeb7ec2f2e6a Mon Sep 17 00:00:00 2001 From: Julien Pontoire Date: Thu, 5 Dec 2024 18:07:46 +0100 Subject: [PATCH 02/47] Fix reddit posts --- minet/cli/reddit/posts.py | 8 ++-- minet/reddit/scraper.py | 83 ++++++++++++--------------------------- minet/reddit/types.py | 2 +- 3 files changed, 31 insertions(+), 62 deletions(-) diff --git a/minet/cli/reddit/posts.py b/minet/cli/reddit/posts.py index bad56c15cb..6344cda616 100644 --- a/minet/cli/reddit/posts.py +++ b/minet/cli/reddit/posts.py @@ -11,7 +11,7 @@ @with_enricher_and_loading_bar( - headers={"post_url"}, + headers=RedditPost, title="Scraping posts", unit="groups", nested=True, @@ -26,9 +26,9 @@ def action(cli_args, enricher, loading_bar): with loading_bar.step(url): try: if cli_args.number: - posts = scraper.get_posts_urls(url, cli_args.number) + posts = scraper.get_posts(url, cli_args.number) else: - posts = scraper.get_posts_urls(url) + posts = scraper.get_posts(url) except : loading_bar.print( "problème" @@ -37,7 +37,7 @@ def action(cli_args, enricher, loading_bar): list_posts = [] for post in posts: - list_posts.append({post}) + list_posts.append(post) for post in list_posts: loading_bar.nested_advance() diff --git a/minet/reddit/scraper.py b/minet/reddit/scraper.py index 624c421b6e..df2bd326e0 100644 --- a/minet/reddit/scraper.py +++ b/minet/reddit/scraper.py @@ -1,16 +1,8 @@ from minet.web import request, create_pool_manager from math import ceil -from ural import get_domain_name, urlpathsplit, is_url +from ural import get_domain_name, urlpathsplit from time import sleep from minet.reddit.types import RedditPost -import json -from ebbe import getpath -from collections import deque -from urllib.parse import urljoin -import csv -import re -import sys -import os def get_old_url(url): domain = get_domain_name(url) @@ -41,17 +33,8 @@ class RedditScraper(object): def __init__(self): self.pool_manager = create_pool_manager() - def get_posts_urls(self, url, nb_post = 25): - dir_name = urlpathsplit(url)[1] - try: - os.mkdir(dir_name) - except FileExistsError: - pass - except PermissionError: - print(f"Permission denied: Unable to create '{dir_name}'.") - except Exception as e: - print(f"An error occurred: {e}") - list_posts = set() + def 
get_posts(self, url, nb_post = 25): + list_posts = [] nb_pages = ceil(int(nb_post) / 25) old_url = get_old_url(url) n_crawled = 0 @@ -60,44 +43,30 @@ def get_posts_urls(self, url, nb_post = 25): break response = reddit_request(old_url, self.pool_manager) soup = response.soup() - list_buttons = soup.select("ul[class='flat-list buttons']") - for link in list_buttons: + posts = soup.select("div[id^='thing_t3_']") + for post in posts: if n_crawled == int(nb_post): break - if len(link.scrape("span[class='promoted-span']")) == 0: - list_posts.update(link.scrape("a[class^='bylink comments']", "href")) - n_crawled += 1 - old_url = soup.scrape("span[class='next-button'] a", "href")[0] - return list(list_posts) + list_buttons = post.select_one("ul[class='flat-list buttons']") + if len(list_buttons.scrape("span[class='promoted-span']")) == 0: + title = post.force_select_one("a[class*='title']").get_text() + post_url = list_buttons.scrape_one("a[class^='bylink comments']", "href") + author = post.select_one("a[class*='author']").get_text() + upvote = post.select_one("div[class='score unvoted']").get_text() + published_date = post.scrape_one("time", "datetime") + link = post.scrape_one("a[class*='title']", "href") + data = RedditPost( + title=title, + url=post_url, + author=author, + author_text=None, + upvote=upvote, + published_date=published_date, + link=link + ) - def get_posts(self, url, nb_post): - posts = [] - list_posts_url = self.get_posts_urls(self, url, nb_post) - for url in list_posts_url: - response = reddit_request(url, self.pool_manager) - if response.url == 429: - print(response.headers) - print(response.end_url) - soup = response.soup() - title = soup.force_select_one("a[class^='title']").get_text() - upvote = soup.force_select_one("div[class='score'] span").get_text() - author = soup.scrape_one("a[class^='author']", "href") - published_date = soup.scrape_one("div[class='date'] time", "datetime") - link = soup.scrape_one("a[class^='title']", "href") - if urlpathsplit(link) == urlpathsplit(url): - link = None - author_text = soup.scrape_one( - "div[id='siteTable'] div[class^='usertext-body'] div p" - ) - post = RedditPost( - title=title, - url=url, - author=author, - author_text=author_text, - upvote=upvote, - published_date=published_date, - link=link, - ) - posts.append(post) - return posts \ No newline at end of file + list_posts.append(data) + n_crawled += 1 + old_url = soup.scrape("span[class='next-button'] a", "href")[0] + return list(list_posts) \ No newline at end of file diff --git a/minet/reddit/types.py b/minet/reddit/types.py index b918f72eab..f80064d0b5 100644 --- a/minet/reddit/types.py +++ b/minet/reddit/types.py @@ -11,7 +11,7 @@ class RedditPost(TabularRecord): title: str url: str author: str - author_text: str + author_text: Optional[str] upvote: str published_date: str link: Optional[str] From 8fb9cf0825c84a5acfe9136d8668967306327c6c Mon Sep 17 00:00:00 2001 From: Julien Pontoire Date: Fri, 6 Dec 2024 11:45:03 +0100 Subject: [PATCH 03/47] Updating reddit posts --- minet/cli/reddit/__init__.py | 8 +++---- minet/reddit/scraper.py | 41 ++++++++++++++++++++++++++++++------ minet/reddit/types.py | 1 + 3 files changed, 39 insertions(+), 11 deletions(-) diff --git a/minet/cli/reddit/__init__.py b/minet/cli/reddit/__init__.py index 3301a6ee5e..f7bc7e0d89 100644 --- a/minet/cli/reddit/__init__.py +++ b/minet/cli/reddit/__init__.py @@ -22,16 +22,16 @@ $ minet reddit posts https://www.reddit.com/r/france > r_france_posts.csv """, variadic_input= { - "dummy_column": "post", - 
"item_label": "post url, post shortcode or post id", - "item_label_plural": "post urls, post shortcodes or post ids", + "dummy_column": "subreddit", + "item_label": "subreddit url, subreddit shortcode or subreddit id", + "item_label_plural": "subreddit urls, subreddit shortcodes or subreddits ids", }, arguments=[ { "flags": ["-n", "--number"], "help": "Number of posts to retrieve.", "type": int, - } + }, ], ) diff --git a/minet/reddit/scraper.py b/minet/reddit/scraper.py index df2bd326e0..6c87eb5dda 100644 --- a/minet/reddit/scraper.py +++ b/minet/reddit/scraper.py @@ -1,8 +1,10 @@ from minet.web import request, create_pool_manager from math import ceil -from ural import get_domain_name, urlpathsplit +from ural import get_domain_name, urlpathsplit, is_url from time import sleep from minet.reddit.types import RedditPost +import re + def get_old_url(url): domain = get_domain_name(url) @@ -15,9 +17,23 @@ def get_new_url(url): path = urlpathsplit(url) return f"https://www.{domain}/" + "/".join(path) + "/" + +def get_url_from_subreddit(name: str): + if is_url(name): + return name + name = name.lstrip("/") + if name.startswith("r/"): + return "https://old.reddit.com/" + name + return "https://old.reddit.com/r/" + name + + def reddit_request(url, pool_manager): sleep(1) response = request(url, pool_manager=pool_manager) + soup = response.soup() + if response.status == 404 or soup.scrape("p[id='noresults']"): + print("invalid url!") + return remaining_requests = float(response.headers["x-ratelimit-remaining"]) if remaining_requests == 1: time_remaining = int(response.headers["x-ratelimit-reset"]) @@ -33,10 +49,10 @@ class RedditScraper(object): def __init__(self): self.pool_manager = create_pool_manager() - def get_posts(self, url, nb_post = 25): + def get_posts(self, url, nb_post=25): list_posts = [] nb_pages = ceil(int(nb_post) / 25) - old_url = get_old_url(url) + old_url = get_old_url(get_url_from_subreddit(url)) n_crawled = 0 for _ in range(nb_pages): if n_crawled == int(nb_post): @@ -50,8 +66,18 @@ def get_posts(self, url, nb_post = 25): list_buttons = post.select_one("ul[class='flat-list buttons']") if len(list_buttons.scrape("span[class='promoted-span']")) == 0: title = post.force_select_one("a[class*='title']").get_text() - post_url = list_buttons.scrape_one("a[class^='bylink comments']", "href") - author = post.select_one("a[class*='author']").get_text() + post_url = list_buttons.scrape_one( + "a[class^='bylink comments']", "href" + ) + n_comments = list_buttons.select_one( + "a[class^='bylink comments']").get_text() + match = re.match(r"(\d+)\s+comments", n_comments) + if match: + n_comments = int(match.group(1)) + else: + n_comments = 0 + try_author = post.select_one("a[class*='author']") + author = try_author.get_text() if try_author else "Deleted" upvote = post.select_one("div[class='score unvoted']").get_text() published_date = post.scrape_one("time", "datetime") link = post.scrape_one("a[class*='title']", "href") @@ -62,11 +88,12 @@ def get_posts(self, url, nb_post = 25): author=author, author_text=None, upvote=upvote, + number_comments=n_comments, published_date=published_date, - link=link + link=link, ) list_posts.append(data) n_crawled += 1 old_url = soup.scrape("span[class='next-button'] a", "href")[0] - return list(list_posts) \ No newline at end of file + return list(list_posts) diff --git a/minet/reddit/types.py b/minet/reddit/types.py index f80064d0b5..a7af811463 100644 --- a/minet/reddit/types.py +++ b/minet/reddit/types.py @@ -13,6 +13,7 @@ class RedditPost(TabularRecord): 
author: str author_text: Optional[str] upvote: str + number_comments: int published_date: str link: Optional[str] From a88ac134a97f2e7fb07fd79f0dc934086052120a Mon Sep 17 00:00:00 2001 From: Julien Pontoire Date: Fri, 6 Dec 2024 16:23:17 +0100 Subject: [PATCH 04/47] Adding -t, --text to reddit posts --- minet/cli/reddit/__init__.py | 5 +++++ minet/cli/reddit/posts.py | 14 ++++++++++---- minet/reddit/scraper.py | 16 +++++++++++++--- 3 files changed, 28 insertions(+), 7 deletions(-) diff --git a/minet/cli/reddit/__init__.py b/minet/cli/reddit/__init__.py index f7bc7e0d89..e10db2cd40 100644 --- a/minet/cli/reddit/__init__.py +++ b/minet/cli/reddit/__init__.py @@ -32,6 +32,11 @@ "help": "Number of posts to retrieve.", "type": int, }, + { + "flags": ["-t", "--text"], + "help": "Retrieve the text of the post. Note that it will require one request per post.", + "action": "store_true", + } ], ) diff --git a/minet/cli/reddit/posts.py b/minet/cli/reddit/posts.py index 6344cda616..4d54689ca8 100644 --- a/minet/cli/reddit/posts.py +++ b/minet/cli/reddit/posts.py @@ -15,7 +15,7 @@ title="Scraping posts", unit="groups", nested=True, - sub_unit="posts", + sub_unit="subreddits", ) def action(cli_args, enricher, loading_bar): scraper = RedditScraper() @@ -26,12 +26,18 @@ def action(cli_args, enricher, loading_bar): with loading_bar.step(url): try: if cli_args.number: - posts = scraper.get_posts(url, cli_args.number) + if cli_args.text: + posts = scraper.get_posts(url, True, cli_args.number) + else: + posts = scraper.get_posts(url, False, cli_args.number) else: - posts = scraper.get_posts(url) + if cli_args.text: + posts = scraper.get_posts(url, True) + else: + posts = scraper.get_posts(url, False) except : loading_bar.print( - "problème" + "the script could not complete normally on line %i" % (i) ) continue diff --git a/minet/reddit/scraper.py b/minet/reddit/scraper.py index 6c87eb5dda..bd89234bcf 100644 --- a/minet/reddit/scraper.py +++ b/minet/reddit/scraper.py @@ -31,7 +31,7 @@ def reddit_request(url, pool_manager): sleep(1) response = request(url, pool_manager=pool_manager) soup = response.soup() - if response.status == 404 or soup.scrape("p[id='noresults']"): + if response.status == 404 or (soup.scrape("p[id='noresults']") and not soup.scrape("div[class='commentarea']")): print("invalid url!") return remaining_requests = float(response.headers["x-ratelimit-remaining"]) @@ -49,7 +49,7 @@ class RedditScraper(object): def __init__(self): self.pool_manager = create_pool_manager() - def get_posts(self, url, nb_post=25): + def get_posts(self, url: str, add_text: bool, nb_post=25): list_posts = [] nb_pages = ceil(int(nb_post) / 25) old_url = get_old_url(get_url_from_subreddit(url)) @@ -81,12 +81,22 @@ def get_posts(self, url, nb_post=25): upvote = post.select_one("div[class='score unvoted']").get_text() published_date = post.scrape_one("time", "datetime") link = post.scrape_one("a[class*='title']", "href") + if add_text: + text_response = reddit_request(post_url, self.pool_manager) + text_soup = text_response.soup() + try_content = text_soup.select_one("div[id='siteTable'] div[class^='usertext']") + if try_content: + content = try_content.get_text() + else: + content = "" + else: + content = "" data = RedditPost( title=title, url=post_url, author=author, - author_text=None, + author_text=content, upvote=upvote, number_comments=n_comments, published_date=published_date, From 2735a0a68d5a700a20fd4332dd1d947e7e7610f8 Mon Sep 17 00:00:00 2001 From: Julien Pontoire Date: Fri, 6 Dec 2024 16:58:58 +0100 Subject: 
[PATCH 05/47] Fix tests --- minet/cli/reddit/__init__.py | 10 +++++----- minet/cli/reddit/posts.py | 7 +++---- minet/reddit/exceptions.py | 17 +++++++++++++++++ minet/reddit/scraper.py | 15 ++++++++++----- minet/reddit/types.py | 4 ---- 5 files changed, 35 insertions(+), 18 deletions(-) create mode 100644 minet/reddit/exceptions.py diff --git a/minet/cli/reddit/__init__.py b/minet/cli/reddit/__init__.py index e10db2cd40..d26cb97c25 100644 --- a/minet/cli/reddit/__init__.py +++ b/minet/cli/reddit/__init__.py @@ -4,9 +4,8 @@ # # Logic of the `rd` action. # -from casanova import RowCountResumer -from minet.cli.argparse import command, ConfigAction +from minet.cli.argparse import command REDDIT_POSTS_SUBCOMMAND = command( "posts", @@ -21,7 +20,7 @@ . Searching posts from the subreddit r/france: $ minet reddit posts https://www.reddit.com/r/france > r_france_posts.csv """, - variadic_input= { + variadic_input={ "dummy_column": "subreddit", "item_label": "subreddit url, subreddit shortcode or subreddit id", "item_label_plural": "subreddit urls, subreddit shortcodes or subreddits ids", @@ -36,10 +35,11 @@ "flags": ["-t", "--text"], "help": "Retrieve the text of the post. Note that it will require one request per post.", "action": "store_true", - } + }, ], ) + REDDIT_COMMAND = command( "reddit", "minet.cli.reddit", @@ -51,4 +51,4 @@ subcommands=[ REDDIT_POSTS_SUBCOMMAND, ], -) \ No newline at end of file +) diff --git a/minet/cli/reddit/posts.py b/minet/cli/reddit/posts.py index 4d54689ca8..b39bd8d9c9 100644 --- a/minet/cli/reddit/posts.py +++ b/minet/cli/reddit/posts.py @@ -9,7 +9,6 @@ from minet.reddit.types import RedditPost - @with_enricher_and_loading_bar( headers=RedditPost, title="Scraping posts", @@ -35,16 +34,16 @@ def action(cli_args, enricher, loading_bar): posts = scraper.get_posts(url, True) else: posts = scraper.get_posts(url, False) - except : + except: loading_bar.print( "the script could not complete normally on line %i" % (i) ) continue - + list_posts = [] for post in posts: list_posts.append(post) - + for post in list_posts: loading_bar.nested_advance() enricher.writerow(row, post) diff --git a/minet/reddit/exceptions.py b/minet/reddit/exceptions.py new file mode 100644 index 0000000000..5b4ffc3aeb --- /dev/null +++ b/minet/reddit/exceptions.py @@ -0,0 +1,17 @@ +# ============================================================================= +# Minet Reddit Exceptions +# ============================================================================= +# +from minet.exceptions import MinetError + + +class RedditError(MinetError): + pass + + +class RedditInvalidTargetError(RedditError): + pass + + +class RedditNotPostError(RedditError): + pass diff --git a/minet/reddit/scraper.py b/minet/reddit/scraper.py index bd89234bcf..6dc7274360 100644 --- a/minet/reddit/scraper.py +++ b/minet/reddit/scraper.py @@ -3,6 +3,7 @@ from ural import get_domain_name, urlpathsplit, is_url from time import sleep from minet.reddit.types import RedditPost +from minet.reddit.exceptions import RedditInvalidTargetError import re @@ -31,9 +32,10 @@ def reddit_request(url, pool_manager): sleep(1) response = request(url, pool_manager=pool_manager) soup = response.soup() - if response.status == 404 or (soup.scrape("p[id='noresults']") and not soup.scrape("div[class='commentarea']")): - print("invalid url!") - return + if response.status == 404 or ( + soup.scrape("p[id='noresults']") and not soup.scrape("div[class='commentarea']") + ): + raise RedditInvalidTargetError remaining_requests = 
float(response.headers["x-ratelimit-remaining"]) if remaining_requests == 1: time_remaining = int(response.headers["x-ratelimit-reset"]) @@ -70,7 +72,8 @@ def get_posts(self, url: str, add_text: bool, nb_post=25): "a[class^='bylink comments']", "href" ) n_comments = list_buttons.select_one( - "a[class^='bylink comments']").get_text() + "a[class^='bylink comments']" + ).get_text() match = re.match(r"(\d+)\s+comments", n_comments) if match: n_comments = int(match.group(1)) @@ -84,7 +87,9 @@ def get_posts(self, url: str, add_text: bool, nb_post=25): if add_text: text_response = reddit_request(post_url, self.pool_manager) text_soup = text_response.soup() - try_content = text_soup.select_one("div[id='siteTable'] div[class^='usertext']") + try_content = text_soup.select_one( + "div[id='siteTable'] div[class^='usertext']" + ) if try_content: content = try_content.get_text() else: diff --git a/minet/reddit/types.py b/minet/reddit/types.py index a7af811463..0be774dd5e 100644 --- a/minet/reddit/types.py +++ b/minet/reddit/types.py @@ -27,7 +27,3 @@ class RedditComment(TabularRecord): # points: Optional[str] # published_date: str comment: str - - - - From 9434b192f63afee217b243b1746b81ded646aba8 Mon Sep 17 00:00:00 2001 From: Julien Pontoire Date: Fri, 6 Dec 2024 17:01:00 +0100 Subject: [PATCH 06/47] fix tests --- minet/reddit/types.py | 15 +-------------- 1 file changed, 1 insertion(+), 14 deletions(-) diff --git a/minet/reddit/types.py b/minet/reddit/types.py index 0be774dd5e..6b52cbfef2 100644 --- a/minet/reddit/types.py +++ b/minet/reddit/types.py @@ -1,9 +1,7 @@ -from typing import List, Optional, Dict, Tuple, Iterable -from datetime import datetime +from typing import Optional from dataclasses import dataclass from casanova import TabularRecord -from ebbe import getpath @dataclass @@ -16,14 +14,3 @@ class RedditPost(TabularRecord): number_comments: int published_date: str link: Optional[str] - - -@dataclass -class RedditComment(TabularRecord): - # url: str - # author: str - id: str - parent: str - # points: Optional[str] - # published_date: str - comment: str From 1c93157ffb7908eb49b910749b4b16b1997ef75d Mon Sep 17 00:00:00 2001 From: Julien Pontoire Date: Fri, 6 Dec 2024 17:03:40 +0100 Subject: [PATCH 07/47] fix tests --- minet/cli/reddit/posts.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/minet/cli/reddit/posts.py b/minet/cli/reddit/posts.py index b39bd8d9c9..0530ac05c7 100644 --- a/minet/cli/reddit/posts.py +++ b/minet/cli/reddit/posts.py @@ -7,6 +7,7 @@ from minet.cli.utils import with_enricher_and_loading_bar from minet.reddit.scraper import RedditScraper from minet.reddit.types import RedditPost +from minet.reddit.exceptions import RedditInvalidTargetError @with_enricher_and_loading_bar( @@ -34,7 +35,7 @@ def action(cli_args, enricher, loading_bar): posts = scraper.get_posts(url, True) else: posts = scraper.get_posts(url, False) - except: + except RedditInvalidTargetError: loading_bar.print( "the script could not complete normally on line %i" % (i) ) From ef116eb852f66cc46b2297efacb7cb5832777fef Mon Sep 17 00:00:00 2001 From: Julien Pontoire Date: Mon, 9 Dec 2024 16:50:02 +0100 Subject: [PATCH 08/47] First version of reddit comments --- minet/cli/reddit/__init__.py | 29 ++++++++++ minet/cli/reddit/comments.py | 41 ++++++++++++++ minet/cli/reddit/posts.py | 2 +- minet/reddit/scraper.py | 106 ++++++++++++++++++++++++++++++++++- minet/reddit/types.py | 11 ++++ 5 files changed, 186 insertions(+), 3 deletions(-) create mode 100644 minet/cli/reddit/comments.py diff 
--git a/minet/cli/reddit/__init__.py b/minet/cli/reddit/__init__.py index d26cb97c25..17b2d1f018 100644 --- a/minet/cli/reddit/__init__.py +++ b/minet/cli/reddit/__init__.py @@ -39,6 +39,34 @@ ], ) +REDDIT_COMMENTS_SUBCOMMAND = command( + "comments", + "minet.cli.reddit.comments", + title="Minet Reddit Comments Command", + description=""" + Retrieve comments from a reddit post link. + Note that it will only retrieve the comments displayed on the page. If you want all the comments you need to use -A, --all but it will require a request per comment, and you can only make 100 requests per 10 minutes. + """, + epilog=""" + Example: + + . Searching comments from a reddit post: + $ minet reddit comments https://www.reddit.com/r/france/comments/... > r_france_comments.csv + """, + variadic_input={ + "dummy_column": "post", + "item_label": "post url, post shortcode or post id", + "item_label_plural": "posts urls, posts shortcodes or posts ids", + }, + arguments=[ + { + "flags": ["-A", "--all"], + "help": "Retrieve all comments.", + "action": "store_true", + }, + ], +) + REDDIT_COMMAND = command( "reddit", @@ -50,5 +78,6 @@ """, subcommands=[ REDDIT_POSTS_SUBCOMMAND, + REDDIT_COMMENTS_SUBCOMMAND, ], ) diff --git a/minet/cli/reddit/comments.py b/minet/cli/reddit/comments.py new file mode 100644 index 0000000000..d41c7b9699 --- /dev/null +++ b/minet/cli/reddit/comments.py @@ -0,0 +1,41 @@ +# ============================================================================= +# Minet Reddit Comments CLI Action +# ============================================================================= +# +# Logic of the `rd comments` action. +# +from minet.cli.utils import with_enricher_and_loading_bar +from minet.reddit.scraper import RedditScraper +from minet.reddit.types import RedditComment +from minet.reddit.exceptions import RedditInvalidTargetError + + +@with_enricher_and_loading_bar( + headers=RedditComment, + title="Scraping comments", + unit="groups", + nested=True, + sub_unit="comments", +) +def action(cli_args, enricher, loading_bar): + scraper = RedditScraper() + + for i, row, url in enricher.enumerate_cells( + cli_args.column, with_rows=True, start=1 + ): + with loading_bar.step(url): + try: + if cli_args.all: + comments = scraper.get_comments(url, True) + else: + comments = scraper.get_comments(url, False) + + except RedditInvalidTargetError: + loading_bar.print( + "the script could not complete normally on line %i" % (i) + ) + continue + + for comment in comments: + loading_bar.nested_advance() + enricher.writerow(row, comment) diff --git a/minet/cli/reddit/posts.py b/minet/cli/reddit/posts.py index 0530ac05c7..777f88813d 100644 --- a/minet/cli/reddit/posts.py +++ b/minet/cli/reddit/posts.py @@ -15,7 +15,7 @@ title="Scraping posts", unit="groups", nested=True, - sub_unit="subreddits", + sub_unit="posts", ) def action(cli_args, enricher, loading_bar): scraper = RedditScraper() diff --git a/minet/reddit/scraper.py b/minet/reddit/scraper.py index 6dc7274360..5d0806b33d 100644 --- a/minet/reddit/scraper.py +++ b/minet/reddit/scraper.py @@ -2,9 +2,14 @@ from math import ceil from ural import get_domain_name, urlpathsplit, is_url from time import sleep -from minet.reddit.types import RedditPost +from minet.reddit.types import RedditPost, RedditComment from minet.reddit.exceptions import RedditInvalidTargetError import re +from urllib.parse import urljoin + + +def resolve_relative_url(path): + return urljoin("https://old.reddit.com", path) def get_old_url(url): @@ -47,10 +52,47 @@ def reddit_request(url, 
pool_manager): return response +def extract_t1_ids(text): + pattern = r"t1_(\w+)" + return [match.group(1) for match in re.finditer(pattern, text)] + + +def get_current_id(com): + current_id = com.get("id") + if current_id: + current_id = current_id.split("_")[-1] + else: + current_id = com.get("data-permalink").split("/")[-2] + return current_id + + class RedditScraper(object): def __init__(self): self.pool_manager = create_pool_manager() + def get_childs_l500(self, url, list_comments, parent_id): + response = reddit_request(url, self.pool_manager) + soup = response.soup() + comments = soup.select("div[class='commentarea']>div>div[class*='comment']") + for com in comments: + child = com.find("div", class_="child") + if child.text != "": + child = child.find("div") + child_com = child.find_all( + "div", + class_=lambda x: x + and ( + "comment" in x + or "deleted comment" in x + or "morerecursion" in x + or "morechildren" in x + ), + recursive=False, + ) + for ele in child_com: + list_comments.append((parent_id, ele)) + return list_comments + def get_posts(self, url: str, add_text: bool, nb_post=25): list_posts = [] nb_pages = ceil(int(nb_post) / 25) @@ -82,6 +124,8 @@ def get_posts(self, url: str, add_text: bool, nb_post=25): try_author = post.select_one("a[class*='author']") author = try_author.get_text() if try_author else "Deleted" upvote = post.select_one("div[class='score unvoted']").get_text() + if upvote == '•': + upvote = "" published_date = post.scrape_one("time", "datetime") link = post.scrape_one("a[class*='title']", "href") if add_text: @@ -105,10 +149,68 @@ def get_posts(self, url: str, add_text: bool, nb_post=25): upvote=upvote, number_comments=n_comments, published_date=published_date, - link=link, + link=resolve_relative_url(link), ) list_posts.append(data) n_crawled += 1 old_url = soup.scrape("span[class='next-button'] a", "href")[0] return list(list_posts) + + + def get_comments(self, url: str, all): + list_return = [] + m_comments = [] + old_url = get_old_url(url) + url_limit = old_url + "?limit=500" + response = reddit_request(url_limit, self.pool_manager) + soup = response.soup() + first_comments = soup.select("div[class='commentarea']>div>div[class*='comment']") + for ele in first_comments: + m_comments.append((None, ele)) + while m_comments: + parent, com = m_comments.pop() + current_id = get_current_id(com) + if "morerecursion" in com.get("class") and all: + url_rec = f"https://old.reddit.com{com.scrape_one('a', 'href')}" + m_comments = self.get_childs_l500(url_rec, m_comments, parent) + elif "morechildren" in com.get("class") and all: + a = com.select_one("a") + onclick = a["onclick"] + id_list = extract_t1_ids(onclick) + for id in id_list: + comment_url = f"{old_url}{id}" + m_comments = self.get_childs_l500(comment_url, m_comments, current_id) + else: + child = com.find("div", class_="child") + if child.text != "": + child = child.find("div") + if all: + child_com = child.find_all( + "div", + class_=lambda x: x + and ( + "comment" in x + or "deleted comment" in x + or "morerecursion" in x + or "morechildren" in x + ), + recursive=False, + ) + else: + child_com = child.find_all( + "div", + class_=lambda x: x + and ("comment" in x or "deleted comment" in x), + recursive=False, + ) + for ele in child_com: + m_comments.append((current_id, ele)) + data = RedditComment( + id=current_id, + parent=parent, + comment=com.scrape_one("div[class='md']:not(div.child a)"), + ) + if data.id != "": + list_return.append(data) + return list_return diff --git a/minet/reddit/types.py 
b/minet/reddit/types.py index 6b52cbfef2..3a63113066 100644 --- a/minet/reddit/types.py +++ b/minet/reddit/types.py @@ -14,3 +14,14 @@ class RedditPost(TabularRecord): number_comments: int published_date: str link: Optional[str] + + +@dataclass +class RedditComment(TabularRecord): + # url: str + # author: str + id: str + parent: str + # points: Optional[str] + # published_date: str + comment: str From 3ab4b427917c2ee7e42c170607ba8e26fadab0e2 Mon Sep 17 00:00:00 2001 From: Julien Pontoire Date: Mon, 9 Dec 2024 18:05:59 +0100 Subject: [PATCH 09/47] Update reddit comments --- minet/reddit/scraper.py | 11 +++++++++++ minet/reddit/types.py | 10 +++++----- 2 files changed, 16 insertions(+), 5 deletions(-) diff --git a/minet/reddit/scraper.py b/minet/reddit/scraper.py index 5d0806b33d..b3e2a78397 100644 --- a/minet/reddit/scraper.py +++ b/minet/reddit/scraper.py @@ -171,6 +171,13 @@ def get_comments(self, url: str, all): while m_comments: parent, com = m_comments.pop() current_id = get_current_id(com) + comment_url = com.scrape_one("a[class='bylink']", 'href') + try_author = com.scrape_one("a[class^='author']", 'href') + author = try_author.get_text() if try_author else "Deleted" + com_points = com.scrape_one("span[class='score unvoted']") + match = re.search(r"-?\d+\s+point(?:s)?", com_points) + com_points = int(re.search(r"-?\d+", match.group()).group()) + published_date = com.scrape_one("time", "datetime") if "morerecursion" in com.get("class") and all: url_rec = f"https://old.reddit.com{com.scrape_one('a', 'href')}" m_comments = self.get_childs_l500(url_rec, m_comments, parent) @@ -207,8 +214,12 @@ def get_comments(self, url: str, all): for ele in child_com: m_comments.append((current_id, ele)) data = RedditComment( + comment_url=comment_url, + author=author, id=current_id, parent=parent, + points=com_points, + published_date=published_date, comment=com.scrape_one("div[class='md']:not(div.child a)"), ) if data.id != "": diff --git a/minet/reddit/types.py b/minet/reddit/types.py index 3a63113066..3d2716f194 100644 --- a/minet/reddit/types.py +++ b/minet/reddit/types.py @@ -10,7 +10,7 @@ class RedditPost(TabularRecord): url: str author: str author_text: Optional[str] - upvote: str + upvote: int number_comments: int published_date: str link: Optional[str] @@ -18,10 +18,10 @@ class RedditPost(TabularRecord): @dataclass class RedditComment(TabularRecord): - # url: str - # author: str + comment_url: str + author: str id: str parent: str - # points: Optional[str] - # published_date: str + points: int + published_date: str comment: str From bc901cb1a0b65278dc1d6578f53f0c664f202ccb Mon Sep 17 00:00:00 2001 From: Julien Pontoire Date: Fri, 20 Dec 2024 13:52:36 +0100 Subject: [PATCH 10/47] Optimization with yield --- minet/cli/reddit/posts.py | 4 ---- minet/reddit/scraper.py | 11 +++-------- 2 files changed, 3 insertions(+), 12 deletions(-) diff --git a/minet/cli/reddit/posts.py b/minet/cli/reddit/posts.py index 777f88813d..68dd9d68cb 100644 --- a/minet/cli/reddit/posts.py +++ b/minet/cli/reddit/posts.py @@ -41,10 +41,6 @@ def action(cli_args, enricher, loading_bar): ) continue - list_posts = [] for post in posts: - list_posts.append(post) - - for post in list_posts: loading_bar.nested_advance() enricher.writerow(row, post) diff --git a/minet/reddit/scraper.py b/minet/reddit/scraper.py index b3e2a78397..04100ef091 100644 --- a/minet/reddit/scraper.py +++ b/minet/reddit/scraper.py @@ -94,7 +94,6 @@ def get_childs_l500(self, url, list_comments, parent_id): return list_comments def get_posts(self, url: 
str, add_text: bool, nb_post=25): - list_posts = [] nb_pages = ceil(int(nb_post) / 25) old_url = get_old_url(get_url_from_subreddit(url)) n_crawled = 0 @@ -151,15 +150,12 @@ def get_posts(self, url: str, add_text: bool, nb_post=25): published_date=published_date, link=resolve_relative_url(link), ) - - list_posts.append(data) + yield data n_crawled += 1 old_url = soup.scrape("span[class='next-button'] a", "href")[0] - return list(list_posts) def get_comments(self, url: str, all): - list_return = [] m_comments = [] old_url = get_old_url(url) url_limit = old_url + "?limit=500" @@ -173,7 +169,7 @@ def get_comments(self, url: str, all): current_id = get_current_id(com) comment_url = com.scrape_one("a[class='bylink']", 'href') try_author = com.scrape_one("a[class^='author']", 'href') - author = try_author.get_text() if try_author else "Deleted" + author = try_author if try_author else "Deleted" com_points = com.scrape_one("span[class='score unvoted']") match = re.search(r"-?\d+\s+point(?:s)?", com_points) com_points = int(re.search(r"-?\d+", match.group()).group()) @@ -223,5 +219,4 @@ def get_comments(self, url: str, all): comment=com.scrape_one("div[class='md']:not(div.child a)"), ) if data.id != "": - list_return.append(data) - return list_return + yield data From e40d672f7e08e622ac257ca905071db28f1aa218 Mon Sep 17 00:00:00 2001 From: Julien Pontoire Date: Fri, 20 Dec 2024 14:40:15 +0100 Subject: [PATCH 11/47] Adding user_posts function --- minet/cli/reddit/__init__.py | 28 ++++++++++++++++++ minet/cli/reddit/user_posts.py | 46 +++++++++++++++++++++++++++++ minet/reddit/scraper.py | 54 ++++++++++++++++++++++++++++++++-- minet/reddit/types.py | 13 +++++++- 4 files changed, 138 insertions(+), 3 deletions(-) create mode 100644 minet/cli/reddit/user_posts.py diff --git a/minet/cli/reddit/__init__.py b/minet/cli/reddit/__init__.py index 17b2d1f018..86acf6e59b 100644 --- a/minet/cli/reddit/__init__.py +++ b/minet/cli/reddit/__init__.py @@ -67,6 +67,33 @@ ], ) +REDDIT_USER_POSTS_SUBCOMMAND = command( + "user_posts", + "minet.cli.reddit.user_posts", + title="Minet Reddit User Posts Command", + description=""" + Retrieve reddit posts from a user link. + """, + epilog=""" + Example: + + . Searching posts from the user page of u/random_user: + $ minet reddit posts https://www.reddit.com/user/random_user/submitted/ > random_user_posts.csv + """, + variadic_input={ + "dummy_column": "user", + "item_label": "user url, user shortcode or user id", + "item_label_plural": "user urls, user shortcodes or user ids", + }, + arguments=[ + { + "flags": ["-n", "--number"], + "help": "Number of posts to retrieve.", + "type": int, + }, + ], +) + REDDIT_COMMAND = command( "reddit", @@ -79,5 +106,6 @@ subcommands=[ REDDIT_POSTS_SUBCOMMAND, REDDIT_COMMENTS_SUBCOMMAND, + REDDIT_USER_POSTS_SUBCOMMAND ], ) diff --git a/minet/cli/reddit/user_posts.py b/minet/cli/reddit/user_posts.py new file mode 100644 index 0000000000..ab37660165 --- /dev/null +++ b/minet/cli/reddit/user_posts.py @@ -0,0 +1,46 @@ +# ============================================================================= +# Minet Reddit Posts CLI Action +# ============================================================================= +# +# Logic of the `rd user_posts` action. 
+# +from minet.cli.utils import with_enricher_and_loading_bar +from minet.reddit.scraper import RedditScraper +from minet.reddit.types import RedditUserPost +from minet.reddit.exceptions import RedditInvalidTargetError + + +@with_enricher_and_loading_bar( + headers=RedditUserPost, + title="Scraping user posts", + unit="groups", + nested=True, + sub_unit="user", +) +def action(cli_args, enricher, loading_bar): + scraper = RedditScraper() + + for i, row, url in enricher.enumerate_cells( + cli_args.column, with_rows=True, start=1 + ): + with loading_bar.step(url): + try: + if cli_args.number: + if cli_args.text: + posts = scraper.get_user_posts(url, cli_args.number) + else: + posts = scraper.get_user_posts(url, cli_args.number) + else: + if cli_args.text: + posts = scraper.get_user_posts(url) + else: + posts = scraper.get_user_posts(url) + except RedditInvalidTargetError: + loading_bar.print( + "the script could not complete normally on line %i" % (i) + ) + continue + + for post in posts: + loading_bar.nested_advance() + enricher.writerow(row, post) diff --git a/minet/reddit/scraper.py b/minet/reddit/scraper.py index 04100ef091..f65d39f152 100644 --- a/minet/reddit/scraper.py +++ b/minet/reddit/scraper.py @@ -2,7 +2,7 @@ from math import ceil from ural import get_domain_name, urlpathsplit, is_url from time import sleep -from minet.reddit.types import RedditPost, RedditComment +from minet.reddit.types import RedditPost, RedditComment, RedditUserPost from minet.reddit.exceptions import RedditInvalidTargetError import re from urllib.parse import urljoin @@ -145,7 +145,7 @@ def get_posts(self, url: str, add_text: bool, nb_post=25): url=post_url, author=author, author_text=content, - upvote=upvote, + points=upvote, number_comments=n_comments, published_date=published_date, link=resolve_relative_url(link), @@ -220,3 +220,53 @@ def get_comments(self, url: str, all): ) if data.id != "": yield data + + def get_user_posts(self, url: str, nb = 25): + nb_pages = ceil(int(nb) / 25) + n_crawled = 0 + old_url = get_old_url(url) + for _ in range(nb_pages): + if n_crawled == int(nb): + break + response = reddit_request(old_url, self.pool_manager) + soup = response.soup() + posts = soup.select("div[id^='thing_t3_']") + for post in posts: + sub = post.scrape_one("a[class*='subreddit']", "href") + title = post.scrape_one("a[class^='title']") + points = post.scrape_one("div[class='score unvoted']") + post_url = post.scrape_one("a[class^='bylink comment']", "href") + nb_comments = post.scrape_one("a[class^='bylink comment']") + match = re.match(r"(\d+)\s+comments", nb_comments) + if match: + nb_comments = int(match.group(1)) + else: + nb_comments = 0 + link = post.scrape_one("a[class^='title']", "href") + published_date = post.scrape("time", "datetime") + + data = RedditUserPost( + title=title, + url=post_url, + points=points, + number_comments=nb_comments, + published_date=published_date, + link=link, + subreddit=sub + ) + + yield data + n_crawled += 1 + old_url = soup.scrape("span[class='next-button'] a", "href")[0] + + + + + + + + + + + def get_user_comments(self, url: str, nb = 25): + old_url = get_old_url(url) \ No newline at end of file diff --git a/minet/reddit/types.py b/minet/reddit/types.py index 3d2716f194..e27e5b736e 100644 --- a/minet/reddit/types.py +++ b/minet/reddit/types.py @@ -10,7 +10,7 @@ class RedditPost(TabularRecord): url: str author: str author_text: Optional[str] - upvote: int + points: int number_comments: int published_date: str link: Optional[str] @@ -25,3 +25,14 @@ class 
RedditComment(TabularRecord): points: int published_date: str comment: str + + +@dataclass +class RedditUserPost(TabularRecord): + title: str + url: str + points: int + number_comments: int + published_date: str + link: str + subreddit: str \ No newline at end of file From f53c45130c47c3912e37de9a24c55d8494c0f1f8 Mon Sep 17 00:00:00 2001 From: Julien Pontoire Date: Fri, 20 Dec 2024 14:55:54 +0100 Subject: [PATCH 12/47] Fix user_posts --- minet/cli/reddit/user_posts.py | 10 ++-------- minet/reddit/scraper.py | 4 +++- 2 files changed, 5 insertions(+), 9 deletions(-) diff --git a/minet/cli/reddit/user_posts.py b/minet/cli/reddit/user_posts.py index ab37660165..d57d9fd94d 100644 --- a/minet/cli/reddit/user_posts.py +++ b/minet/cli/reddit/user_posts.py @@ -26,15 +26,9 @@ def action(cli_args, enricher, loading_bar): with loading_bar.step(url): try: if cli_args.number: - if cli_args.text: - posts = scraper.get_user_posts(url, cli_args.number) - else: - posts = scraper.get_user_posts(url, cli_args.number) + posts = scraper.get_user_posts(url, cli_args.number) else: - if cli_args.text: - posts = scraper.get_user_posts(url) - else: - posts = scraper.get_user_posts(url) + posts = scraper.get_user_posts(url) except RedditInvalidTargetError: loading_bar.print( "the script could not complete normally on line %i" % (i) diff --git a/minet/reddit/scraper.py b/minet/reddit/scraper.py index f65d39f152..35a1bf5497 100644 --- a/minet/reddit/scraper.py +++ b/minet/reddit/scraper.py @@ -242,7 +242,9 @@ def get_user_posts(self, url: str, nb = 25): nb_comments = int(match.group(1)) else: nb_comments = 0 - link = post.scrape_one("a[class^='title']", "href") + link = resolve_relative_url(post.scrape_one("a[class^='title']", "href")) + if link == post_url: + link = "" published_date = post.scrape("time", "datetime") data = RedditUserPost( From b932a8dc84eb7a4478b0d6bf70d8ea9d17936872 Mon Sep 17 00:00:00 2001 From: Julien Pontoire Date: Fri, 20 Dec 2024 15:04:47 +0100 Subject: [PATCH 13/47] Fixing errors with user_posts --- minet/cli/reddit/user_posts.py | 2 +- minet/reddit/scraper.py | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/minet/cli/reddit/user_posts.py b/minet/cli/reddit/user_posts.py index d57d9fd94d..a095d60c74 100644 --- a/minet/cli/reddit/user_posts.py +++ b/minet/cli/reddit/user_posts.py @@ -15,7 +15,7 @@ title="Scraping user posts", unit="groups", nested=True, - sub_unit="user", + sub_unit="posts", ) def action(cli_args, enricher, loading_bar): scraper = RedditScraper() diff --git a/minet/reddit/scraper.py b/minet/reddit/scraper.py index 35a1bf5497..a35b551367 100644 --- a/minet/reddit/scraper.py +++ b/minet/reddit/scraper.py @@ -232,6 +232,8 @@ def get_user_posts(self, url: str, nb = 25): soup = response.soup() posts = soup.select("div[id^='thing_t3_']") for post in posts: + if n_crawled == int(nb): + break sub = post.scrape_one("a[class*='subreddit']", "href") title = post.scrape_one("a[class^='title']") points = post.scrape_one("div[class='score unvoted']") From 26be5f30e6c8a86a309ffe69702f1389e12bb616 Mon Sep 17 00:00:00 2001 From: Julien Pontoire Date: Fri, 20 Dec 2024 15:18:41 +0100 Subject: [PATCH 14/47] Fixing format --- minet/cli/reddit/__init__.py | 7 ++++- minet/cli/reddit/user_posts.py | 10 +++++-- minet/reddit/scraper.py | 51 +++++++++++++++++++++------------- minet/reddit/types.py | 3 +- 4 files changed, 47 insertions(+), 24 deletions(-) diff --git a/minet/cli/reddit/__init__.py b/minet/cli/reddit/__init__.py index 86acf6e59b..0c0e58383b 100644 --- 
a/minet/cli/reddit/__init__.py +++ b/minet/cli/reddit/__init__.py @@ -91,6 +91,11 @@ "help": "Number of posts to retrieve.", "type": int, }, + { + "flags": ["-t", "--text"], + "help": "Retrieve the text of the post. Note that it will require one request per post.", + "action": "store_true", + }, ], ) @@ -106,6 +111,6 @@ subcommands=[ REDDIT_POSTS_SUBCOMMAND, REDDIT_COMMENTS_SUBCOMMAND, - REDDIT_USER_POSTS_SUBCOMMAND + REDDIT_USER_POSTS_SUBCOMMAND, ], ) diff --git a/minet/cli/reddit/user_posts.py b/minet/cli/reddit/user_posts.py index a095d60c74..6ca7008b3a 100644 --- a/minet/cli/reddit/user_posts.py +++ b/minet/cli/reddit/user_posts.py @@ -26,9 +26,15 @@ def action(cli_args, enricher, loading_bar): with loading_bar.step(url): try: if cli_args.number: - posts = scraper.get_user_posts(url, cli_args.number) + if cli_args.text: + posts = scraper.get_user_posts(url, True, cli_args.number) + else: + posts = scraper.get_user_posts(url, False, cli_args.number) else: - posts = scraper.get_user_posts(url) + if cli_args.text: + posts = scraper.get_user_posts(url, True) + else: + posts = scraper.get_user_posts(url, False) except RedditInvalidTargetError: loading_bar.print( "the script could not complete normally on line %i" % (i) diff --git a/minet/reddit/scraper.py b/minet/reddit/scraper.py index a35b551367..33e80c7bf6 100644 --- a/minet/reddit/scraper.py +++ b/minet/reddit/scraper.py @@ -123,7 +123,7 @@ def get_posts(self, url: str, add_text: bool, nb_post=25): try_author = post.select_one("a[class*='author']") author = try_author.get_text() if try_author else "Deleted" upvote = post.select_one("div[class='score unvoted']").get_text() - if upvote == '•': + if upvote == "•": upvote = "" published_date = post.scrape_one("time", "datetime") link = post.scrape_one("a[class*='title']", "href") @@ -153,7 +153,6 @@ def get_posts(self, url: str, add_text: bool, nb_post=25): yield data n_crawled += 1 old_url = soup.scrape("span[class='next-button'] a", "href")[0] - def get_comments(self, url: str, all): m_comments = [] @@ -161,14 +160,16 @@ def get_comments(self, url: str, all): url_limit = old_url + "?limit=500" response = reddit_request(url_limit, self.pool_manager) soup = response.soup() - first_comments = soup.select("div[class='commentarea']>div>div[class*='comment']") + first_comments = soup.select( + "div[class='commentarea']>div>div[class*='comment']" + ) for ele in first_comments: m_comments.append((None, ele)) while m_comments: parent, com = m_comments.pop() current_id = get_current_id(com) - comment_url = com.scrape_one("a[class='bylink']", 'href') - try_author = com.scrape_one("a[class^='author']", 'href') + comment_url = com.scrape_one("a[class='bylink']", "href") + try_author = com.scrape_one("a[class^='author']", "href") author = try_author if try_author else "Deleted" com_points = com.scrape_one("span[class='score unvoted']") match = re.search(r"-?\d+\s+point(?:s)?", com_points) @@ -183,7 +184,9 @@ def get_comments(self, url: str, all): id_list = extract_t1_ids(onclick) for id in id_list: comment_url = f"{old_url}{id}" - m_comments = self.get_childs_l500(comment_url, m_comments, current_id) + m_comments = self.get_childs_l500( + comment_url, m_comments, current_id + ) else: child = com.find("div", class_="child") if child.text != "": @@ -221,7 +224,7 @@ def get_comments(self, url: str, all): if data.id != "": yield data - def get_user_posts(self, url: str, nb = 25): + def get_user_posts(self, url: str, add_text: bool, nb=25): nb_pages = ceil(int(nb) / 25) n_crawled = 0 old_url = 
get_old_url(url) @@ -237,6 +240,8 @@ def get_user_posts(self, url: str, nb = 25): sub = post.scrape_one("a[class*='subreddit']", "href") title = post.scrape_one("a[class^='title']") points = post.scrape_one("div[class='score unvoted']") + if points == "•": + points = "" post_url = post.scrape_one("a[class^='bylink comment']", "href") nb_comments = post.scrape_one("a[class^='bylink comment']") match = re.match(r"(\d+)\s+comments", nb_comments) @@ -244,33 +249,39 @@ def get_user_posts(self, url: str, nb = 25): nb_comments = int(match.group(1)) else: nb_comments = 0 - link = resolve_relative_url(post.scrape_one("a[class^='title']", "href")) + link = resolve_relative_url( + post.scrape_one("a[class^='title']", "href") + ) if link == post_url: link = "" published_date = post.scrape("time", "datetime") + if add_text: + text_response = reddit_request(post_url, self.pool_manager) + text_soup = text_response.soup() + try_content = text_soup.select_one( + "div[id='siteTable'] div[class^='usertext']" + ) + if try_content: + content = try_content.get_text() + else: + content = "" + else: + content = "" data = RedditUserPost( title=title, url=post_url, + author_text=content, points=points, number_comments=nb_comments, published_date=published_date, link=link, - subreddit=sub + subreddit=sub, ) yield data n_crawled += 1 old_url = soup.scrape("span[class='next-button'] a", "href")[0] - - - - - - - - - - def get_user_comments(self, url: str, nb = 25): - old_url = get_old_url(url) \ No newline at end of file + def get_user_comments(self, url: str, nb=25): + old_url = get_old_url(url) diff --git a/minet/reddit/types.py b/minet/reddit/types.py index e27e5b736e..4ae9910bf1 100644 --- a/minet/reddit/types.py +++ b/minet/reddit/types.py @@ -31,8 +31,9 @@ class RedditComment(TabularRecord): class RedditUserPost(TabularRecord): title: str url: str + author_text: str points: int number_comments: int published_date: str link: str - subreddit: str \ No newline at end of file + subreddit: str From e3a96afc439477dc8f2d718c86f07152b3cfd470 Mon Sep 17 00:00:00 2001 From: Julien Pontoire Date: Fri, 20 Dec 2024 15:51:55 +0100 Subject: [PATCH 15/47] Refacto --- minet/reddit/scraper.py | 157 ++++++++++++++++++---------------------- minet/reddit/types.py | 4 +- 2 files changed, 73 insertions(+), 88 deletions(-) diff --git a/minet/reddit/scraper.py b/minet/reddit/scraper.py index 33e80c7bf6..1a77120ee9 100644 --- a/minet/reddit/scraper.py +++ b/minet/reddit/scraper.py @@ -93,6 +93,50 @@ def get_childs_l500(self, url, list_comments, parent_id): list_comments.append((parent_id, ele)) return list_comments + def get_post_standard_info(self, post, add_text): + list_buttons = post.select_one("ul[class='flat-list buttons']") + if len(list_buttons.scrape("span[class='promoted-span']")) == 0: + title = post.force_select_one("a[class*='title']").get_text() + post_url = list_buttons.scrape_one("a[class^='bylink comments']", "href") + n_comments = list_buttons.select_one( + "a[class^='bylink comments']" + ).get_text() + match = re.match(r"(\d+)\s+comments", n_comments) + if match: + n_comments = int(match.group(1)) + else: + n_comments = 0 + upvote = post.select_one("div[class='score unvoted']").get_text() + if upvote == "•": + upvote = "" + published_date = post.scrape_one("time", "datetime") + link = resolve_relative_url(post.scrape_one("a[class*='title']", "href")) + if link == post_url: + link = "" + if add_text: + text_response = reddit_request(post_url, self.pool_manager) + text_soup = text_response.soup() + try_content = 
text_soup.select_one( + "div[id='siteTable'] div[class^='usertext']" + ) + if try_content: + content = try_content.get_text() + else: + content = "" + else: + content = "" + + data = { + "title": title, + "url": post_url, + "author_text": content, + "points": upvote, + "number_comments": n_comments, + "published_date": published_date, + "link": link, + } + return data + def get_posts(self, url: str, add_text: bool, nb_post=25): nb_pages = ceil(int(nb_post) / 25) old_url = get_old_url(get_url_from_subreddit(url)) @@ -106,52 +150,22 @@ def get_posts(self, url: str, add_text: bool, nb_post=25): for post in posts: if n_crawled == int(nb_post): break - list_buttons = post.select_one("ul[class='flat-list buttons']") - if len(list_buttons.scrape("span[class='promoted-span']")) == 0: - title = post.force_select_one("a[class*='title']").get_text() - post_url = list_buttons.scrape_one( - "a[class^='bylink comments']", "href" - ) - n_comments = list_buttons.select_one( - "a[class^='bylink comments']" - ).get_text() - match = re.match(r"(\d+)\s+comments", n_comments) - if match: - n_comments = int(match.group(1)) - else: - n_comments = 0 + data = self.get_post_standard_info(post, add_text) + if data: try_author = post.select_one("a[class*='author']") author = try_author.get_text() if try_author else "Deleted" - upvote = post.select_one("div[class='score unvoted']").get_text() - if upvote == "•": - upvote = "" - published_date = post.scrape_one("time", "datetime") - link = post.scrape_one("a[class*='title']", "href") - if add_text: - text_response = reddit_request(post_url, self.pool_manager) - text_soup = text_response.soup() - try_content = text_soup.select_one( - "div[id='siteTable'] div[class^='usertext']" - ) - if try_content: - content = try_content.get_text() - else: - content = "" - else: - content = "" - - data = RedditPost( - title=title, - url=post_url, + post = RedditPost( + title=data["title"], + url=data["url"], author=author, - author_text=content, - points=upvote, - number_comments=n_comments, - published_date=published_date, - link=resolve_relative_url(link), + author_text=data["author_text"], + points=data["points"], + number_comments=data["number_comments"], + published_date=data["published_date"], + external_link=data["link"], ) - yield data - n_crawled += 1 + yield post + n_crawled += 1 old_url = soup.scrape("span[class='next-button'] a", "href")[0] def get_comments(self, url: str, all): @@ -237,51 +251,22 @@ def get_user_posts(self, url: str, add_text: bool, nb=25): for post in posts: if n_crawled == int(nb): break - sub = post.scrape_one("a[class*='subreddit']", "href") - title = post.scrape_one("a[class^='title']") - points = post.scrape_one("div[class='score unvoted']") - if points == "•": - points = "" - post_url = post.scrape_one("a[class^='bylink comment']", "href") - nb_comments = post.scrape_one("a[class^='bylink comment']") - match = re.match(r"(\d+)\s+comments", nb_comments) - if match: - nb_comments = int(match.group(1)) - else: - nb_comments = 0 - link = resolve_relative_url( - post.scrape_one("a[class^='title']", "href") - ) - if link == post_url: - link = "" - published_date = post.scrape("time", "datetime") - if add_text: - text_response = reddit_request(post_url, self.pool_manager) - text_soup = text_response.soup() - try_content = text_soup.select_one( - "div[id='siteTable'] div[class^='usertext']" + data = self.get_post_standard_info(post, add_text) + if data: + sub = post.scrape_one("a[class*='subreddit']", "href") + post = RedditUserPost( + title=data["title"], 
+ url=data["url"], + author_text=data["author_text"], + points=data["points"], + number_comments=data["number_comments"], + published_date=data["published_date"], + external_link=data["link"], + subreddit=sub, ) - if try_content: - content = try_content.get_text() - else: - content = "" - else: - content = "" - - data = RedditUserPost( - title=title, - url=post_url, - author_text=content, - points=points, - number_comments=nb_comments, - published_date=published_date, - link=link, - subreddit=sub, - ) - - yield data + yield post n_crawled += 1 old_url = soup.scrape("span[class='next-button'] a", "href")[0] - def get_user_comments(self, url: str, nb=25): - old_url = get_old_url(url) + # def get_user_comments(self, url: str, nb=25): + # old_url = get_old_url(url) diff --git a/minet/reddit/types.py b/minet/reddit/types.py index 4ae9910bf1..3fcadc469f 100644 --- a/minet/reddit/types.py +++ b/minet/reddit/types.py @@ -13,7 +13,7 @@ class RedditPost(TabularRecord): points: int number_comments: int published_date: str - link: Optional[str] + external_link: Optional[str] @dataclass @@ -35,5 +35,5 @@ class RedditUserPost(TabularRecord): points: int number_comments: int published_date: str - link: str + external_link: str subreddit: str From 2fb4cd2e6f7f1e907b350e4a29efedfbee1c8b35 Mon Sep 17 00:00:00 2001 From: Julien Pontoire Date: Fri, 20 Dec 2024 16:17:59 +0100 Subject: [PATCH 16/47] better refacto --- .gitignore | 1 + minet/cli/reddit/posts.py | 10 +- minet/cli/reddit/user_posts.py | 10 +- minet/reddit/scraper.py | 181 +++++++++++++++++---------------- 4 files changed, 106 insertions(+), 96 deletions(-) diff --git a/.gitignore b/.gitignore index b2de0b22ba..ddd3a616c1 100644 --- a/.gitignore +++ b/.gitignore @@ -25,6 +25,7 @@ ftest/*.csv *.sqlar *-wal *-shm +*.csv /crawl /downloaded diff --git a/minet/cli/reddit/posts.py b/minet/cli/reddit/posts.py index 68dd9d68cb..d73ef812f1 100644 --- a/minet/cli/reddit/posts.py +++ b/minet/cli/reddit/posts.py @@ -20,6 +20,8 @@ def action(cli_args, enricher, loading_bar): scraper = RedditScraper() + type_page = 'subreddit' + for i, row, url in enricher.enumerate_cells( cli_args.column, with_rows=True, start=1 ): @@ -27,14 +29,14 @@ def action(cli_args, enricher, loading_bar): try: if cli_args.number: if cli_args.text: - posts = scraper.get_posts(url, True, cli_args.number) + posts = scraper.get_general_post(url, type_page, True, cli_args.number) else: - posts = scraper.get_posts(url, False, cli_args.number) + posts = scraper.get_general_post(url, type_page, False, cli_args.number) else: if cli_args.text: - posts = scraper.get_posts(url, True) + posts = scraper.get_general_post(url, type_page, True) else: - posts = scraper.get_posts(url, False) + posts = scraper.get_general_post(url, type_page, False) except RedditInvalidTargetError: loading_bar.print( "the script could not complete normally on line %i" % (i) diff --git a/minet/cli/reddit/user_posts.py b/minet/cli/reddit/user_posts.py index 6ca7008b3a..d55abda1b3 100644 --- a/minet/cli/reddit/user_posts.py +++ b/minet/cli/reddit/user_posts.py @@ -20,6 +20,8 @@ def action(cli_args, enricher, loading_bar): scraper = RedditScraper() + type_page = 'user' + for i, row, url in enricher.enumerate_cells( cli_args.column, with_rows=True, start=1 ): @@ -27,14 +29,14 @@ def action(cli_args, enricher, loading_bar): try: if cli_args.number: if cli_args.text: - posts = scraper.get_user_posts(url, True, cli_args.number) + posts = scraper.get_general_post(url, type_page, True, cli_args.number) else: - posts = 
scraper.get_user_posts(url, False, cli_args.number) + posts = scraper.get_general_post(url, type_page, False, cli_args.number) else: if cli_args.text: - posts = scraper.get_user_posts(url, True) + posts = scraper.get_general_post(url, type_page, True) else: - posts = scraper.get_user_posts(url, False) + posts = scraper.get_general_post(url, type_page, False) except RedditInvalidTargetError: loading_bar.print( "the script could not complete normally on line %i" % (i) diff --git a/minet/reddit/scraper.py b/minet/reddit/scraper.py index 1a77120ee9..8d54530ae6 100644 --- a/minet/reddit/scraper.py +++ b/minet/reddit/scraper.py @@ -66,6 +66,41 @@ def get_current_id(com): return current_id +def data_posts( + post, title, url, author_text, points, number_comments, published_date, link +): + try_author = post.select_one("a[class*='author']") + author = try_author.get_text() if try_author else "Deleted" + data = RedditPost( + title=title, + url=url, + author=author, + author_text=author_text, + points=points, + number_comments=number_comments, + published_date=published_date, + external_link=link, + ) + return data + + +def data_user_posts( + post, title, url, author_text, points, number_comments, published_date, link +): + sub = post.scrape_one("a[class*='subreddit']", "href") + data = RedditUserPost( + title=title, + url=url, + author_text=author_text, + points=points, + number_comments=number_comments, + published_date=published_date, + external_link=link, + subreddit=sub, + ) + return data + + class RedditScraper(object): def __init__(self): self.pool_manager = create_pool_manager() @@ -93,81 +128,6 @@ def get_childs_l500(self, url, list_comments, parent_id): list_comments.append((parent_id, ele)) return list_comments - def get_post_standard_info(self, post, add_text): - list_buttons = post.select_one("ul[class='flat-list buttons']") - if len(list_buttons.scrape("span[class='promoted-span']")) == 0: - title = post.force_select_one("a[class*='title']").get_text() - post_url = list_buttons.scrape_one("a[class^='bylink comments']", "href") - n_comments = list_buttons.select_one( - "a[class^='bylink comments']" - ).get_text() - match = re.match(r"(\d+)\s+comments", n_comments) - if match: - n_comments = int(match.group(1)) - else: - n_comments = 0 - upvote = post.select_one("div[class='score unvoted']").get_text() - if upvote == "•": - upvote = "" - published_date = post.scrape_one("time", "datetime") - link = resolve_relative_url(post.scrape_one("a[class*='title']", "href")) - if link == post_url: - link = "" - if add_text: - text_response = reddit_request(post_url, self.pool_manager) - text_soup = text_response.soup() - try_content = text_soup.select_one( - "div[id='siteTable'] div[class^='usertext']" - ) - if try_content: - content = try_content.get_text() - else: - content = "" - else: - content = "" - - data = { - "title": title, - "url": post_url, - "author_text": content, - "points": upvote, - "number_comments": n_comments, - "published_date": published_date, - "link": link, - } - return data - - def get_posts(self, url: str, add_text: bool, nb_post=25): - nb_pages = ceil(int(nb_post) / 25) - old_url = get_old_url(get_url_from_subreddit(url)) - n_crawled = 0 - for _ in range(nb_pages): - if n_crawled == int(nb_post): - break - response = reddit_request(old_url, self.pool_manager) - soup = response.soup() - posts = soup.select("div[id^='thing_t3_']") - for post in posts: - if n_crawled == int(nb_post): - break - data = self.get_post_standard_info(post, add_text) - if data: - try_author = 
post.select_one("a[class*='author']") - author = try_author.get_text() if try_author else "Deleted" - post = RedditPost( - title=data["title"], - url=data["url"], - author=author, - author_text=data["author_text"], - points=data["points"], - number_comments=data["number_comments"], - published_date=data["published_date"], - external_link=data["link"], - ) - yield post - n_crawled += 1 - old_url = soup.scrape("span[class='next-button'] a", "href")[0] - def get_comments(self, url: str, all): m_comments = [] old_url = get_old_url(url) @@ -238,7 +198,7 @@ def get_comments(self, url: str, all): if data.id != "": yield data - def get_user_posts(self, url: str, add_text: bool, nb=25): + def get_general_post(self, url: str, type: str, add_text: bool, nb=25): nb_pages = ceil(int(nb) / 25) n_crawled = 0 old_url = get_old_url(url) @@ -251,19 +211,64 @@ def get_user_posts(self, url: str, add_text: bool, nb=25): for post in posts: if n_crawled == int(nb): break - data = self.get_post_standard_info(post, add_text) - if data: - sub = post.scrape_one("a[class*='subreddit']", "href") - post = RedditUserPost( - title=data["title"], - url=data["url"], - author_text=data["author_text"], - points=data["points"], - number_comments=data["number_comments"], - published_date=data["published_date"], - external_link=data["link"], - subreddit=sub, + list_buttons = post.select_one("ul[class='flat-list buttons']") + if len(list_buttons.scrape("span[class='promoted-span']")) == 0: + title = post.force_select_one("a[class*='title']").get_text() + post_url = list_buttons.scrape_one( + "a[class^='bylink comments']", "href" ) + n_comments = list_buttons.select_one( + "a[class^='bylink comments']" + ).get_text() + match = re.match(r"(\d+)\s+comments", n_comments) + if match: + n_comments = int(match.group(1)) + else: + n_comments = 0 + upvote = post.select_one("div[class='score unvoted']").get_text() + if upvote == "•": + upvote = "" + published_date = post.scrape_one("time", "datetime") + link = resolve_relative_url( + post.scrape_one("a[class*='title']", "href") + ) + if link == post_url: + link = "" + if add_text: + text_response = reddit_request(post_url, self.pool_manager) + text_soup = text_response.soup() + try_content = text_soup.select_one( + "div[id='siteTable'] div[class^='usertext']" + ) + if try_content: + content = try_content.get_text() + else: + content = "" + else: + content = "" + if type == "subreddit": + post = data_posts( + post, + title, + post_url, + content, + upvote, + n_comments, + published_date, + link, + ) + else: + post = data_user_posts( + post, + title, + post_url, + content, + upvote, + n_comments, + published_date, + link, + ) + yield post n_crawled += 1 old_url = soup.scrape("span[class='next-button'] a", "href")[0] From bc9ff739a590bbed6a49fafa644dc35bcd3f1bb8 Mon Sep 17 00:00:00 2001 From: Julien Pontoire Date: Fri, 20 Dec 2024 17:04:41 +0100 Subject: [PATCH 17/47] Adding reddit user_comments --- minet/cli/reddit/__init__.py | 27 ++++++++++++++++++++ minet/cli/reddit/user_comments.py | 41 +++++++++++++++++++++++++++++++ minet/reddit/scraper.py | 35 +++++++++++++++++++++++--- minet/reddit/types.py | 11 +++++++++ 4 files changed, 111 insertions(+), 3 deletions(-) create mode 100644 minet/cli/reddit/user_comments.py diff --git a/minet/cli/reddit/__init__.py b/minet/cli/reddit/__init__.py index 0c0e58383b..cea5e8b8fb 100644 --- a/minet/cli/reddit/__init__.py +++ b/minet/cli/reddit/__init__.py @@ -99,6 +99,32 @@ ], ) +REDDIT_USER_COMMENTS_SUBCOMMAND = command( + "user_comments", + 
"minet.cli.reddit.user_comments", + title="Minet Reddit User Comments Command", + description=""" + Retrieve reddit comments from a user link. + """, + epilog=""" + Example: + + . Searching posts from the user page of u/random_user: + $ minet reddit posts https://www.reddit.com/user/random_user/comments/ > random_user_comments.csv + """, + variadic_input={ + "dummy_column": "user", + "item_label": "user url, user shortcode or user id", + "item_label_plural": "user urls, user shortcodes or user ids", + }, + arguments=[ + { + "flags": ["-n", "--number"], + "help": "Number of posts to retrieve.", + "type": int, + }, + ], +) REDDIT_COMMAND = command( "reddit", @@ -112,5 +138,6 @@ REDDIT_POSTS_SUBCOMMAND, REDDIT_COMMENTS_SUBCOMMAND, REDDIT_USER_POSTS_SUBCOMMAND, + REDDIT_USER_COMMENTS_SUBCOMMAND, ], ) diff --git a/minet/cli/reddit/user_comments.py b/minet/cli/reddit/user_comments.py new file mode 100644 index 0000000000..c2e48ccefd --- /dev/null +++ b/minet/cli/reddit/user_comments.py @@ -0,0 +1,41 @@ +# ============================================================================= +# Minet Reddit Comments CLI Action +# ============================================================================= +# +# Logic of the `rd user_comments` action. +# +from minet.cli.utils import with_enricher_and_loading_bar +from minet.reddit.scraper import RedditScraper +from minet.reddit.types import RedditUserComment +from minet.reddit.exceptions import RedditInvalidTargetError + + +@with_enricher_and_loading_bar( + headers=RedditUserComment, + title="Scraping user comments", + unit="groups", + nested=True, + sub_unit="comments", +) +def action(cli_args, enricher, loading_bar): + scraper = RedditScraper() + + for i, row, url in enricher.enumerate_cells( + cli_args.column, with_rows=True, start=1 + ): + with loading_bar.step(url): + try: + if cli_args.number: + posts = scraper.get_user_comments(url, cli_args.number) + else: + posts = scraper.get_user_comments(url) + + except RedditInvalidTargetError: + loading_bar.print( + "the script could not complete normally on line %i" % (i) + ) + continue + + for post in posts: + loading_bar.nested_advance() + enricher.writerow(row, post) diff --git a/minet/reddit/scraper.py b/minet/reddit/scraper.py index 8d54530ae6..28b8f0a4b3 100644 --- a/minet/reddit/scraper.py +++ b/minet/reddit/scraper.py @@ -2,7 +2,7 @@ from math import ceil from ural import get_domain_name, urlpathsplit, is_url from time import sleep -from minet.reddit.types import RedditPost, RedditComment, RedditUserPost +from minet.reddit.types import RedditPost, RedditComment, RedditUserPost, RedditUserComment from minet.reddit.exceptions import RedditInvalidTargetError import re from urllib.parse import urljoin @@ -273,5 +273,34 @@ def get_general_post(self, url: str, type: str, add_text: bool, nb=25): n_crawled += 1 old_url = soup.scrape("span[class='next-button'] a", "href")[0] - # def get_user_comments(self, url: str, nb=25): - # old_url = get_old_url(url) + def get_user_comments(self, url: str, nb=25): + nb_pages = ceil(int(nb) / 25) + n_crawled = 0 + old_url = get_old_url(url) + for _ in range(nb_pages): + if n_crawled == int(nb): + break + response = reddit_request(old_url, self.pool_manager) + soup = response.soup() + comments = soup.select("[data-type='comment']") + for comment in comments: + if n_crawled == int(nb): + break + post_title = resolve_relative_url(comment.scrape_one("a[class='title']", "href")) + post_author = comment.scrape_one("p[class='parent']>a[class^='author']", "href") + post_subreddit 
= comment.scrape_one("a[class^='subreddit']", "href") + points = comment.scrape_one("span[class='score unvoted']") + published_date = comment.scrape_one("time", "datetime") + text = comment.scrape_one("div[class='content'] div[class='md']") + comment_url = comment.scrape_one("a[class='bylink']", "href") + data = RedditUserComment( + post_title=post_title, + post_author=post_author, + post_subreddit=post_subreddit, + points=points, + published_date=published_date, + text=text, + comment_url=comment_url + ) + yield data + n_crawled += 1 diff --git a/minet/reddit/types.py b/minet/reddit/types.py index 3fcadc469f..4db1db8c6a 100644 --- a/minet/reddit/types.py +++ b/minet/reddit/types.py @@ -37,3 +37,14 @@ class RedditUserPost(TabularRecord): published_date: str external_link: str subreddit: str + + +@dataclass +class RedditUserComment(TabularRecord): + post_title: str + post_author: str + post_subreddit: str + points: int + published_date: str + text: str + comment_url: str From 47c1ae58bebc3ab1a97af784f3c72d2bab5ad157 Mon Sep 17 00:00:00 2001 From: Julien Pontoire Date: Tue, 7 Jan 2025 16:06:06 +0100 Subject: [PATCH 18/47] adding scraped values for points and comments --- minet/reddit/scraper.py | 29 +++++++++++++++++++---------- minet/reddit/types.py | 8 ++++++-- 2 files changed, 25 insertions(+), 12 deletions(-) diff --git a/minet/reddit/scraper.py b/minet/reddit/scraper.py index 28b8f0a4b3..f81b812e0a 100644 --- a/minet/reddit/scraper.py +++ b/minet/reddit/scraper.py @@ -67,16 +67,18 @@ def get_current_id(com): def data_posts( - post, title, url, author_text, points, number_comments, published_date, link + post, title, url, author_text, real_points, points, scraped_number_comments, number_comments, published_date, link ): try_author = post.select_one("a[class*='author']") author = try_author.get_text() if try_author else "Deleted" data = RedditPost( title=title, - url=url, + url=get_new_url(url), author=author, author_text=author_text, - points=points, + scraped_points=points, + approximated_points=real_points, + scraped_number_comments=scraped_number_comments, number_comments=number_comments, published_date=published_date, external_link=link, @@ -85,14 +87,16 @@ def data_posts( def data_user_posts( - post, title, url, author_text, points, number_comments, published_date, link + post, title, url, author_text, real_points, points, scraped_number_comments, number_comments, published_date, link ): sub = post.scrape_one("a[class*='subreddit']", "href") data = RedditUserPost( title=title, - url=url, + url=get_new_url(url), author_text=author_text, - points=points, + scraped_points=points, + approximated_points=real_points, + scraped_number_comments=scraped_number_comments, number_comments=number_comments, published_date=published_date, external_link=link, @@ -217,17 +221,18 @@ def get_general_post(self, url: str, type: str, add_text: bool, nb=25): post_url = list_buttons.scrape_one( "a[class^='bylink comments']", "href" ) - n_comments = list_buttons.select_one( + n_comments_scraped = list_buttons.select_one( "a[class^='bylink comments']" ).get_text() - match = re.match(r"(\d+)\s+comments", n_comments) + match = re.match(r"(\d+)\s+comment(s)?", n_comments_scraped) if match: n_comments = int(match.group(1)) else: n_comments = 0 upvote = post.select_one("div[class='score unvoted']").get_text() - if upvote == "•": - upvote = "" + real_points = "" if upvote == "•" else upvote + if real_points[-1] == "k": + real_points = int(float(real_points[:-1]) * 1000) published_date = post.scrape_one("time", 
"datetime") link = resolve_relative_url( post.scrape_one("a[class*='title']", "href") @@ -252,7 +257,9 @@ def get_general_post(self, url: str, type: str, add_text: bool, nb=25): title, post_url, content, + real_points, upvote, + n_comments_scraped, n_comments, published_date, link, @@ -263,7 +270,9 @@ def get_general_post(self, url: str, type: str, add_text: bool, nb=25): title, post_url, content, + real_points, upvote, + n_comments_scraped, n_comments, published_date, link, diff --git a/minet/reddit/types.py b/minet/reddit/types.py index 4db1db8c6a..427c6346c5 100644 --- a/minet/reddit/types.py +++ b/minet/reddit/types.py @@ -10,7 +10,9 @@ class RedditPost(TabularRecord): url: str author: str author_text: Optional[str] - points: int + scraped_points: str + approximated_points: int + scraped_number_comments: str number_comments: int published_date: str external_link: Optional[str] @@ -32,7 +34,9 @@ class RedditUserPost(TabularRecord): title: str url: str author_text: str - points: int + scraped_points: str + approximated_points: int + scraped_number_comments: str number_comments: int published_date: str external_link: str From 063211214bc86ab9f5534f265f30655c18400027 Mon Sep 17 00:00:00 2001 From: Julien Pontoire Date: Wed, 8 Jan 2025 10:34:20 +0100 Subject: [PATCH 19/47] Handle broken and banned pages --- minet/reddit/scraper.py | 289 ++++++++++++++++++++++++++-------------- minet/reddit/types.py | 4 + 2 files changed, 196 insertions(+), 97 deletions(-) diff --git a/minet/reddit/scraper.py b/minet/reddit/scraper.py index f81b812e0a..280910b38a 100644 --- a/minet/reddit/scraper.py +++ b/minet/reddit/scraper.py @@ -2,12 +2,23 @@ from math import ceil from ural import get_domain_name, urlpathsplit, is_url from time import sleep -from minet.reddit.types import RedditPost, RedditComment, RedditUserPost, RedditUserComment +from minet.reddit.types import ( + RedditPost, + RedditComment, + RedditUserPost, + RedditUserComment, +) from minet.reddit.exceptions import RedditInvalidTargetError import re from urllib.parse import urljoin +def broken_reddit(soup, response): + if response.status == 500 and soup.scrape("title") == "reddit broke!": + return 0 + return 1 + + def resolve_relative_url(path): return urljoin("https://old.reddit.com", path) @@ -37,6 +48,10 @@ def reddit_request(url, pool_manager): sleep(1) response = request(url, pool_manager=pool_manager) soup = response.soup() + if response.status == 500 and soup.scrape_one("img", "alt") == "you broke reddit": + return response, soup, "broken page" + if response.status == 404 and soup.scrape_one("img", "alt") == "banned": + return response, soup, "banned" if response.status == 404 or ( soup.scrape("p[id='noresults']") and not soup.scrape("div[class='commentarea']") ): @@ -49,7 +64,7 @@ def reddit_request(url, pool_manager): return reddit_request(url) if response.status == 429: return reddit_request(url) - return response + return response, soup, None def extract_t1_ids(text): @@ -67,7 +82,17 @@ def get_current_id(com): def data_posts( - post, title, url, author_text, real_points, points, scraped_number_comments, number_comments, published_date, link + post, + title, + url, + author_text, + real_points, + points, + scraped_number_comments, + number_comments, + published_date, + link, + error, ): try_author = post.select_one("a[class*='author']") author = try_author.get_text() if try_author else "Deleted" @@ -82,12 +107,23 @@ def data_posts( number_comments=number_comments, published_date=published_date, external_link=link, + error=error, ) 
return data def data_user_posts( - post, title, url, author_text, real_points, points, scraped_number_comments, number_comments, published_date, link + post, + title, + url, + author_text, + real_points, + points, + scraped_number_comments, + number_comments, + published_date, + link, + error, ): sub = post.scrape_one("a[class*='subreddit']", "href") data = RedditUserPost( @@ -101,6 +137,7 @@ def data_user_posts( published_date=published_date, external_link=link, subreddit=sub, + error=error, ) return data @@ -110,8 +147,7 @@ def __init__(self): self.pool_manager = create_pool_manager() def get_childs_l500(self, url, list_comments, parent_id): - response = reddit_request(url, self.pool_manager) - soup = response.soup() + _, soup, _ = reddit_request(url, self.pool_manager) comments = soup.select("div[class='commentarea']>div>div[class*='comment']") for com in comments: child = com.find("div", class_="child") @@ -136,71 +172,83 @@ def get_comments(self, url: str, all): m_comments = [] old_url = get_old_url(url) url_limit = old_url + "?limit=500" - response = reddit_request(url_limit, self.pool_manager) - soup = response.soup() - first_comments = soup.select( - "div[class='commentarea']>div>div[class*='comment']" - ) - for ele in first_comments: - m_comments.append((None, ele)) - while m_comments: - parent, com = m_comments.pop() - current_id = get_current_id(com) - comment_url = com.scrape_one("a[class='bylink']", "href") - try_author = com.scrape_one("a[class^='author']", "href") - author = try_author if try_author else "Deleted" - com_points = com.scrape_one("span[class='score unvoted']") - match = re.search(r"-?\d+\s+point(?:s)?", com_points) - com_points = int(re.search(r"-?\d+", match.group()).group()) - published_date = com.scrape_one("time", "datetime") - if "morerecursion" in com.get("class") and all: - url_rec = f"https://old.reddit.com{com.scrape_one('a', 'href')}" - m_comments = self.get_childs_l500(url_rec, m_comments, parent) - elif "morechildren" in com.get("class") and all: - a = com.select_one("a") - onclick = a["onclick"] - id_list = extract_t1_ids(onclick) - for id in id_list: - comment_url = f"{old_url}{id}" - m_comments = self.get_childs_l500( - comment_url, m_comments, current_id - ) - else: - child = com.find("div", class_="child") - if child.text != "": - child = child.find("div") - if all: - child_com = child.find_all( - "div", - class_=lambda x: x - and ( - "comment" in x - or "deleted comment" in x - or "morerecursion" in x - or "morechildren" in x - ), - recursive=False, - ) - else: - child_com = child.find_all( - "div", - class_=lambda x: x - and ("comment" in x or "deleted comment" in x), - recursive=False, + _, soup, error = reddit_request(url_limit, self.pool_manager) + if error: + yield RedditComment( + comment_url="", + author="", + id="", + parent="", + points="", + published_date="", + comment="", + error=error, + ) + else: + first_comments = soup.select( + "div[class='commentarea']>div>div[class*='comment']" + ) + for ele in first_comments: + m_comments.append((None, ele)) + while m_comments: + parent, com = m_comments.pop() + current_id = get_current_id(com) + comment_url = com.scrape_one("a[class='bylink']", "href") + try_author = com.scrape_one("a[class^='author']", "href") + author = try_author if try_author else "Deleted" + com_points = com.scrape_one("span[class='score unvoted']") + match = re.search(r"-?\d+\s+point(?:s)?", com_points) + com_points = int(re.search(r"-?\d+", match.group()).group()) + published_date = com.scrape_one("time", "datetime") 
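+                # "morerecursion" and "morechildren" are placeholder nodes that
+                # old reddit shows in place of deeply nested or collapsed
+                # replies; when `all` is set, follow them via get_childs_l500
+                # so the hidden part of the thread is scraped too.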
+ if "morerecursion" in com.get("class") and all: + url_rec = f"https://old.reddit.com{com.scrape_one('a', 'href')}" + m_comments = self.get_childs_l500(url_rec, m_comments, parent) + elif "morechildren" in com.get("class") and all: + a = com.select_one("a") + onclick = a["onclick"] + id_list = extract_t1_ids(onclick) + for id in id_list: + comment_url = f"{old_url}{id}" + m_comments = self.get_childs_l500( + comment_url, m_comments, current_id ) - for ele in child_com: - m_comments.append((current_id, ele)) - data = RedditComment( - comment_url=comment_url, - author=author, - id=current_id, - parent=parent, - points=com_points, - published_date=published_date, - comment=com.scrape_one("div[class='md']:not(div.child a)"), - ) - if data.id != "": - yield data + else: + child = com.find("div", class_="child") + if child.text != "": + child = child.find("div") + if all: + child_com = child.find_all( + "div", + class_=lambda x: x + and ( + "comment" in x + or "deleted comment" in x + or "morerecursion" in x + or "morechildren" in x + ), + recursive=False, + ) + else: + child_com = child.find_all( + "div", + class_=lambda x: x + and ("comment" in x or "deleted comment" in x), + recursive=False, + ) + for ele in child_com: + m_comments.append((current_id, ele)) + data = RedditComment( + comment_url=get_new_url(comment_url), + author=author, + id=current_id, + parent=parent, + points=com_points, + published_date=published_date, + comment=com.scrape_one("div[class='md']:not(div.child a)"), + error=error, + ) + if data.id != "": + yield data def get_general_post(self, url: str, type: str, add_text: bool, nb=25): nb_pages = ceil(int(nb) / 25) @@ -209,8 +257,7 @@ def get_general_post(self, url: str, type: str, add_text: bool, nb=25): for _ in range(nb_pages): if n_crawled == int(nb): break - response = reddit_request(old_url, self.pool_manager) - soup = response.soup() + _, soup, error = reddit_request(old_url, self.pool_manager) posts = soup.select("div[id^='thing_t3_']") for post in posts: if n_crawled == int(nb): @@ -240,8 +287,38 @@ def get_general_post(self, url: str, type: str, add_text: bool, nb=25): if link == post_url: link = "" if add_text: - text_response = reddit_request(post_url, self.pool_manager) - text_soup = text_response.soup() + _, text_soup, text_error = reddit_request( + post_url, self.pool_manager + ) + if text_error: + if type == "subreddit": + yield data_posts( + post, + title, + post_url, + "", + real_points, + upvote, + n_comments_scraped, + n_comments, + published_date, + link, + text_error, + ) + else: + yield data_user_posts( + post, + title, + post_url, + "", + real_points, + upvote, + n_comments_scraped, + n_comments, + published_date, + link, + text_error, + ) try_content = text_soup.select_one( "div[id='siteTable'] div[class^='usertext']" ) @@ -263,6 +340,7 @@ def get_general_post(self, url: str, type: str, add_text: bool, nb=25): n_comments, published_date, link, + error, ) else: post = data_user_posts( @@ -276,6 +354,7 @@ def get_general_post(self, url: str, type: str, add_text: bool, nb=25): n_comments, published_date, link, + error, ) yield post @@ -289,27 +368,43 @@ def get_user_comments(self, url: str, nb=25): for _ in range(nb_pages): if n_crawled == int(nb): break - response = reddit_request(old_url, self.pool_manager) - soup = response.soup() - comments = soup.select("[data-type='comment']") - for comment in comments: - if n_crawled == int(nb): - break - post_title = resolve_relative_url(comment.scrape_one("a[class='title']", "href")) - post_author = 
comment.scrape_one("p[class='parent']>a[class^='author']", "href") - post_subreddit = comment.scrape_one("a[class^='subreddit']", "href") - points = comment.scrape_one("span[class='score unvoted']") - published_date = comment.scrape_one("time", "datetime") - text = comment.scrape_one("div[class='content'] div[class='md']") - comment_url = comment.scrape_one("a[class='bylink']", "href") - data = RedditUserComment( - post_title=post_title, - post_author=post_author, - post_subreddit=post_subreddit, - points=points, - published_date=published_date, - text=text, - comment_url=comment_url + _, soup, error = reddit_request(old_url, self.pool_manager) + if error: + yield RedditUserComment( + post_title="", + post_author="", + post_subreddit="", + points="", + published_date="", + text="", + comment_url="", + error=error, ) - yield data - n_crawled += 1 + else: + comments = soup.select("[data-type='comment']") + for comment in comments: + if n_crawled == int(nb): + break + post_title = resolve_relative_url( + comment.scrape_one("a[class='title']", "href") + ) + post_author = comment.scrape_one( + "p[class='parent']>a[class^='author']", "href" + ) + post_subreddit = comment.scrape_one("a[class^='subreddit']", "href") + points = comment.scrape_one("span[class='score unvoted']") + published_date = comment.scrape_one("time", "datetime") + text = comment.scrape_one("div[class='content'] div[class='md']") + comment_url = comment.scrape_one("a[class='bylink']", "href") + data = RedditUserComment( + post_title=post_title, + post_author=post_author, + post_subreddit=post_subreddit, + points=points, + published_date=published_date, + text=text, + comment_url=comment_url, + error=error, + ) + yield data + n_crawled += 1 diff --git a/minet/reddit/types.py b/minet/reddit/types.py index 427c6346c5..a572003f72 100644 --- a/minet/reddit/types.py +++ b/minet/reddit/types.py @@ -16,6 +16,7 @@ class RedditPost(TabularRecord): number_comments: int published_date: str external_link: Optional[str] + error: str @dataclass @@ -27,6 +28,7 @@ class RedditComment(TabularRecord): points: int published_date: str comment: str + error: str @dataclass @@ -41,6 +43,7 @@ class RedditUserPost(TabularRecord): published_date: str external_link: str subreddit: str + error: str @dataclass @@ -52,3 +55,4 @@ class RedditUserComment(TabularRecord): published_date: str text: str comment_url: str + error: str From d770363553dfe9e9992e5ecd26d63c45d69dadfa Mon Sep 17 00:00:00 2001 From: Julien Pontoire Date: Wed, 8 Jan 2025 11:51:33 +0100 Subject: [PATCH 20/47] Better handling for scores --- minet/reddit/scraper.py | 52 +++++++++++++++++++---------------------- minet/reddit/types.py | 8 +++---- 2 files changed, 27 insertions(+), 33 deletions(-) diff --git a/minet/reddit/scraper.py b/minet/reddit/scraper.py index 280910b38a..42acc98d22 100644 --- a/minet/reddit/scraper.py +++ b/minet/reddit/scraper.py @@ -13,12 +13,6 @@ from urllib.parse import urljoin -def broken_reddit(soup, response): - if response.status == 500 and soup.scrape("title") == "reddit broke!": - return 0 - return 1 - - def resolve_relative_url(path): return urljoin("https://old.reddit.com", path) @@ -81,12 +75,22 @@ def get_current_id(com): return current_id +def get_points(ele): + scrapped_points = ele.select_one("[class='score unvoted']") + score_hidden = ele.select_one("[class='score-hidden']") + if not scrapped_points and not score_hidden: + return "deleted" + scrapped_points = ele.scrape_one("[class='score unvoted']", "title") + if not scrapped_points: + return "score 
hidden" + return scrapped_points + + def data_posts( post, title, url, author_text, - real_points, points, scraped_number_comments, number_comments, @@ -101,8 +105,7 @@ def data_posts( url=get_new_url(url), author=author, author_text=author_text, - scraped_points=points, - approximated_points=real_points, + points=points, scraped_number_comments=scraped_number_comments, number_comments=number_comments, published_date=published_date, @@ -117,7 +120,6 @@ def data_user_posts( title, url, author_text, - real_points, points, scraped_number_comments, number_comments, @@ -130,8 +132,7 @@ def data_user_posts( title=title, url=get_new_url(url), author_text=author_text, - scraped_points=points, - approximated_points=real_points, + points=points, scraped_number_comments=scraped_number_comments, number_comments=number_comments, published_date=published_date, @@ -194,11 +195,9 @@ def get_comments(self, url: str, all): parent, com = m_comments.pop() current_id = get_current_id(com) comment_url = com.scrape_one("a[class='bylink']", "href") - try_author = com.scrape_one("a[class^='author']", "href") + try_author = com.scrape_one("a[class^='author']") author = try_author if try_author else "Deleted" - com_points = com.scrape_one("span[class='score unvoted']") - match = re.search(r"-?\d+\s+point(?:s)?", com_points) - com_points = int(re.search(r"-?\d+", match.group()).group()) + points = get_points(com) published_date = com.scrape_one("time", "datetime") if "morerecursion" in com.get("class") and all: url_rec = f"https://old.reddit.com{com.scrape_one('a', 'href')}" @@ -238,11 +237,11 @@ def get_comments(self, url: str, all): for ele in child_com: m_comments.append((current_id, ele)) data = RedditComment( - comment_url=get_new_url(comment_url), + comment_url=get_new_url(comment_url) if comment_url else None, author=author, id=current_id, parent=parent, - points=com_points, + points=points, published_date=published_date, comment=com.scrape_one("div[class='md']:not(div.child a)"), error=error, @@ -276,10 +275,11 @@ def get_general_post(self, url: str, type: str, add_text: bool, nb=25): n_comments = int(match.group(1)) else: n_comments = 0 - upvote = post.select_one("div[class='score unvoted']").get_text() - real_points = "" if upvote == "•" else upvote - if real_points[-1] == "k": - real_points = int(float(real_points[:-1]) * 1000) + upvote = get_points(post) + # upvote = post.select_one("div[class='score unvoted']").get_text() + # real_points = "" if upvote == "•" else upvote + # if real_points[-1] == "k": + # real_points = int(float(real_points[:-1]) * 1000) published_date = post.scrape_one("time", "datetime") link = resolve_relative_url( post.scrape_one("a[class*='title']", "href") @@ -297,7 +297,6 @@ def get_general_post(self, url: str, type: str, add_text: bool, nb=25): title, post_url, "", - real_points, upvote, n_comments_scraped, n_comments, @@ -311,7 +310,6 @@ def get_general_post(self, url: str, type: str, add_text: bool, nb=25): title, post_url, "", - real_points, upvote, n_comments_scraped, n_comments, @@ -334,7 +332,6 @@ def get_general_post(self, url: str, type: str, add_text: bool, nb=25): title, post_url, content, - real_points, upvote, n_comments_scraped, n_comments, @@ -348,7 +345,6 @@ def get_general_post(self, url: str, type: str, add_text: bool, nb=25): title, post_url, content, - real_points, upvote, n_comments_scraped, n_comments, @@ -389,10 +385,10 @@ def get_user_comments(self, url: str, nb=25): comment.scrape_one("a[class='title']", "href") ) post_author = comment.scrape_one( - 
"p[class='parent']>a[class^='author']", "href" + "p[class='parent']>a[class^='author']" ) post_subreddit = comment.scrape_one("a[class^='subreddit']", "href") - points = comment.scrape_one("span[class='score unvoted']") + points = get_points(comment) published_date = comment.scrape_one("time", "datetime") text = comment.scrape_one("div[class='content'] div[class='md']") comment_url = comment.scrape_one("a[class='bylink']", "href") diff --git a/minet/reddit/types.py b/minet/reddit/types.py index a572003f72..2e9165b350 100644 --- a/minet/reddit/types.py +++ b/minet/reddit/types.py @@ -10,8 +10,7 @@ class RedditPost(TabularRecord): url: str author: str author_text: Optional[str] - scraped_points: str - approximated_points: int + points: str scraped_number_comments: str number_comments: int published_date: str @@ -25,7 +24,7 @@ class RedditComment(TabularRecord): author: str id: str parent: str - points: int + points: str published_date: str comment: str error: str @@ -36,8 +35,7 @@ class RedditUserPost(TabularRecord): title: str url: str author_text: str - scraped_points: str - approximated_points: int + points: str scraped_number_comments: str number_comments: int published_date: str From 240f1f2086238febebf10884b8de89797ea89b6c Mon Sep 17 00:00:00 2001 From: Julien Pontoire Date: Wed, 8 Jan 2025 13:39:57 +0100 Subject: [PATCH 21/47] Draft of edited_date --- minet/reddit/scraper.py | 27 ++++++++++++++++++++------- minet/reddit/types.py | 4 ++++ 2 files changed, 24 insertions(+), 7 deletions(-) diff --git a/minet/reddit/scraper.py b/minet/reddit/scraper.py index 42acc98d22..9a243516fd 100644 --- a/minet/reddit/scraper.py +++ b/minet/reddit/scraper.py @@ -86,6 +86,12 @@ def get_points(ele): return scrapped_points +def get_dates(ele): + published_date = ele.scrape_one("time[class='']", "datetime") + edited_date = ele.scrape_one("time[class='edited-timestamp']", "datetime") + return published_date, edited_date + + def data_posts( post, title, @@ -95,6 +101,7 @@ def data_posts( scraped_number_comments, number_comments, published_date, + edited_date, link, error, ): @@ -109,6 +116,7 @@ def data_posts( scraped_number_comments=scraped_number_comments, number_comments=number_comments, published_date=published_date, + edited_date=edited_date, external_link=link, error=error, ) @@ -124,6 +132,7 @@ def data_user_posts( scraped_number_comments, number_comments, published_date, + edited_date, link, error, ): @@ -136,6 +145,7 @@ def data_user_posts( scraped_number_comments=scraped_number_comments, number_comments=number_comments, published_date=published_date, + edited_date=edited_date, external_link=link, subreddit=sub, error=error, @@ -198,7 +208,7 @@ def get_comments(self, url: str, all): try_author = com.scrape_one("a[class^='author']") author = try_author if try_author else "Deleted" points = get_points(com) - published_date = com.scrape_one("time", "datetime") + published_date, edited_date = get_dates(com) if "morerecursion" in com.get("class") and all: url_rec = f"https://old.reddit.com{com.scrape_one('a', 'href')}" m_comments = self.get_childs_l500(url_rec, m_comments, parent) @@ -243,6 +253,7 @@ def get_comments(self, url: str, all): parent=parent, points=points, published_date=published_date, + edited_date=edited_date, comment=com.scrape_one("div[class='md']:not(div.child a)"), error=error, ) @@ -276,11 +287,7 @@ def get_general_post(self, url: str, type: str, add_text: bool, nb=25): else: n_comments = 0 upvote = get_points(post) - # upvote = post.select_one("div[class='score 
unvoted']").get_text() - # real_points = "" if upvote == "•" else upvote - # if real_points[-1] == "k": - # real_points = int(float(real_points[:-1]) * 1000) - published_date = post.scrape_one("time", "datetime") + published_date, edited_date = get_dates(post) link = resolve_relative_url( post.scrape_one("a[class*='title']", "href") ) @@ -301,6 +308,7 @@ def get_general_post(self, url: str, type: str, add_text: bool, nb=25): n_comments_scraped, n_comments, published_date, + edited_date, link, text_error, ) @@ -314,6 +322,7 @@ def get_general_post(self, url: str, type: str, add_text: bool, nb=25): n_comments_scraped, n_comments, published_date, + edited_date, link, text_error, ) @@ -336,6 +345,7 @@ def get_general_post(self, url: str, type: str, add_text: bool, nb=25): n_comments_scraped, n_comments, published_date, + edited_date, link, error, ) @@ -349,6 +359,7 @@ def get_general_post(self, url: str, type: str, add_text: bool, nb=25): n_comments_scraped, n_comments, published_date, + edited_date, link, error, ) @@ -372,6 +383,7 @@ def get_user_comments(self, url: str, nb=25): post_subreddit="", points="", published_date="", + edited_date="", text="", comment_url="", error=error, @@ -389,7 +401,7 @@ def get_user_comments(self, url: str, nb=25): ) post_subreddit = comment.scrape_one("a[class^='subreddit']", "href") points = get_points(comment) - published_date = comment.scrape_one("time", "datetime") + published_date, edited_date = get_dates(comment) text = comment.scrape_one("div[class='content'] div[class='md']") comment_url = comment.scrape_one("a[class='bylink']", "href") data = RedditUserComment( @@ -398,6 +410,7 @@ def get_user_comments(self, url: str, nb=25): post_subreddit=post_subreddit, points=points, published_date=published_date, + edited_date=edited_date, text=text, comment_url=comment_url, error=error, diff --git a/minet/reddit/types.py b/minet/reddit/types.py index 2e9165b350..9fa2bff317 100644 --- a/minet/reddit/types.py +++ b/minet/reddit/types.py @@ -14,6 +14,7 @@ class RedditPost(TabularRecord): scraped_number_comments: str number_comments: int published_date: str + edited_date: str external_link: Optional[str] error: str @@ -26,6 +27,7 @@ class RedditComment(TabularRecord): parent: str points: str published_date: str + edited_date: str comment: str error: str @@ -39,6 +41,7 @@ class RedditUserPost(TabularRecord): scraped_number_comments: str number_comments: int published_date: str + edited_date: str external_link: str subreddit: str error: str @@ -51,6 +54,7 @@ class RedditUserComment(TabularRecord): post_subreddit: str points: int published_date: str + edited_date: str text: str comment_url: str error: str From abff314824a7582c4106d02c9a636d8a99d30a9e Mon Sep 17 00:00:00 2001 From: Julien Pontoire Date: Wed, 8 Jan 2025 13:44:56 +0100 Subject: [PATCH 22/47] Fixing error when no pagination and edited_date --- minet/reddit/scraper.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/minet/reddit/scraper.py b/minet/reddit/scraper.py index 9a243516fd..be83c515d9 100644 --- a/minet/reddit/scraper.py +++ b/minet/reddit/scraper.py @@ -87,7 +87,7 @@ def get_points(ele): def get_dates(ele): - published_date = ele.scrape_one("time[class='']", "datetime") + published_date = ele.scrape_one("time", "datetime") edited_date = ele.scrape_one("time[class='edited-timestamp']", "datetime") return published_date, edited_date @@ -265,7 +265,7 @@ def get_general_post(self, url: str, type: str, add_text: bool, nb=25): n_crawled = 0 old_url = get_old_url(url) for _ in 
range(nb_pages): - if n_crawled == int(nb): + if n_crawled == int(nb) or not old_url: break _, soup, error = reddit_request(old_url, self.pool_manager) posts = soup.select("div[id^='thing_t3_']") @@ -366,7 +366,9 @@ def get_general_post(self, url: str, type: str, add_text: bool, nb=25): yield post n_crawled += 1 - old_url = soup.scrape("span[class='next-button'] a", "href")[0] + old_url = soup.scrape("span[class='next-button'] a") + if old_url: + old_url = old_url[0].get("href") def get_user_comments(self, url: str, nb=25): nb_pages = ceil(int(nb) / 25) From b045fb79bee72a275fca34a9042f611707a3cba8 Mon Sep 17 00:00:00 2001 From: Julien Pontoire Date: Wed, 8 Jan 2025 16:07:27 +0100 Subject: [PATCH 23/47] Fixing data in user_comments --- minet/reddit/scraper.py | 23 ++++++++++++++--------- minet/reddit/types.py | 2 ++ 2 files changed, 16 insertions(+), 9 deletions(-) diff --git a/minet/reddit/scraper.py b/minet/reddit/scraper.py index be83c515d9..b895ca6b8c 100644 --- a/minet/reddit/scraper.py +++ b/minet/reddit/scraper.py @@ -107,6 +107,8 @@ def data_posts( ): try_author = post.select_one("a[class*='author']") author = try_author.get_text() if try_author else "Deleted" + if get_domain_name(link) == "reddit.com": + link = "" data = RedditPost( title=title, url=get_new_url(url), @@ -147,7 +149,7 @@ def data_user_posts( published_date=published_date, edited_date=edited_date, external_link=link, - subreddit=sub, + subreddit=get_new_url(sub), error=error, ) return data @@ -366,9 +368,7 @@ def get_general_post(self, url: str, type: str, add_text: bool, nb=25): yield post n_crawled += 1 - old_url = soup.scrape("span[class='next-button'] a") - if old_url: - old_url = old_url[0].get("href") + old_url = soup.scrape_one("span[class='next-button'] a", "href") def get_user_comments(self, url: str, nb=25): nb_pages = ceil(int(nb) / 25) @@ -395,9 +395,8 @@ def get_user_comments(self, url: str, nb=25): for comment in comments: if n_crawled == int(nb): break - post_title = resolve_relative_url( - comment.scrape_one("a[class='title']", "href") - ) + post_title = comment.scrape_one("a[class='title']") + post_url = comment.scrape_one("a[class='bylink may-blank']", "href") post_author = comment.scrape_one( "p[class='parent']>a[class^='author']" ) @@ -405,17 +404,23 @@ def get_user_comments(self, url: str, nb=25): points = get_points(comment) published_date, edited_date = get_dates(comment) text = comment.scrape_one("div[class='content'] div[class='md']") + link = comment.scrape_one( + "div[class='content'] div[class='md'] a", "href" + ) comment_url = comment.scrape_one("a[class='bylink']", "href") data = RedditUserComment( post_title=post_title, + post_url=get_new_url(post_url), post_author=post_author, - post_subreddit=post_subreddit, + post_subreddit=get_new_url(post_subreddit), points=points, published_date=published_date, edited_date=edited_date, text=text, - comment_url=comment_url, + link=link, + comment_url=get_new_url(comment_url), error=error, ) yield data n_crawled += 1 + old_url = soup.scrape_one("span[class='next-button'] a", "href") diff --git a/minet/reddit/types.py b/minet/reddit/types.py index 9fa2bff317..d3e16aaafd 100644 --- a/minet/reddit/types.py +++ b/minet/reddit/types.py @@ -50,11 +50,13 @@ class RedditUserPost(TabularRecord): @dataclass class RedditUserComment(TabularRecord): post_title: str + post_url: str post_author: str post_subreddit: str points: int published_date: str edited_date: str text: str + link: str comment_url: str error: str From 65ac1bf24249318a3fd2f48c0cabb5d04109c830 
Mon Sep 17 00:00:00 2001 From: Julien Pontoire Date: Wed, 8 Jan 2025 16:44:50 +0100 Subject: [PATCH 24/47] refacto and use of posts with the name of the subreddit --- minet/cli/reddit/__init__.py | 2 ++ minet/reddit/scraper.py | 27 +++++++++++++++++---------- 2 files changed, 19 insertions(+), 10 deletions(-) diff --git a/minet/cli/reddit/__init__.py b/minet/cli/reddit/__init__.py index cea5e8b8fb..55bd6e2050 100644 --- a/minet/cli/reddit/__init__.py +++ b/minet/cli/reddit/__init__.py @@ -19,6 +19,8 @@ . Searching posts from the subreddit r/france: $ minet reddit posts https://www.reddit.com/r/france > r_france_posts.csv + $ minet reddit posts france > r_france_posts.csv + $ minet reddit posts r/france > r_france_posts.csv """, variadic_input={ "dummy_column": "subreddit", diff --git a/minet/reddit/scraper.py b/minet/reddit/scraper.py index b895ca6b8c..645ce66b21 100644 --- a/minet/reddit/scraper.py +++ b/minet/reddit/scraper.py @@ -1,16 +1,17 @@ -from minet.web import request, create_pool_manager from math import ceil -from ural import get_domain_name, urlpathsplit, is_url +import re from time import sleep +from ural import get_domain_name, urlpathsplit, is_url +from urllib.parse import urljoin + +from minet.reddit.exceptions import RedditInvalidTargetError from minet.reddit.types import ( RedditPost, RedditComment, RedditUserPost, RedditUserComment, ) -from minet.reddit.exceptions import RedditInvalidTargetError -import re -from urllib.parse import urljoin +from minet.web import request, create_pool_manager def resolve_relative_url(path): @@ -20,13 +21,19 @@ def resolve_relative_url(path): def get_old_url(url): domain = get_domain_name(url) path = urlpathsplit(url) - return f"https://old.{domain}/" + "/".join(path) + "/" + old_url = f"https://old.{domain}" + for ele in path: + old_url = urljoin(old_url, f"{ele}/") + return old_url def get_new_url(url): domain = get_domain_name(url) path = urlpathsplit(url) - return f"https://www.{domain}/" + "/".join(path) + "/" + new_url = f"https://old.{domain}" + for ele in path: + new_url = urljoin(new_url, f"{ele}/") + return new_url def get_url_from_subreddit(name: str): @@ -34,8 +41,8 @@ def get_url_from_subreddit(name: str): return name name = name.lstrip("/") if name.startswith("r/"): - return "https://old.reddit.com/" + name - return "https://old.reddit.com/r/" + name + return urljoin("https://old.reddit.com/", name) + return urljoin("https://old.reddit.com/r/", name) def reddit_request(url, pool_manager): @@ -265,7 +272,7 @@ def get_comments(self, url: str, all): def get_general_post(self, url: str, type: str, add_text: bool, nb=25): nb_pages = ceil(int(nb) / 25) n_crawled = 0 - old_url = get_old_url(url) + old_url = get_old_url(get_url_from_subreddit(url)) for _ in range(nb_pages): if n_crawled == int(nb) or not old_url: break From 30932a25f680a8cc1a5745fb9408ce2eb488a657 Mon Sep 17 00:00:00 2001 From: Julien Pontoire Date: Thu, 9 Jan 2025 11:50:36 +0100 Subject: [PATCH 25/47] Fixing typo --- minet/cli/reddit/__init__.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/minet/cli/reddit/__init__.py b/minet/cli/reddit/__init__.py index 55bd6e2050..c6e6a295fb 100644 --- a/minet/cli/reddit/__init__.py +++ b/minet/cli/reddit/__init__.py @@ -12,7 +12,7 @@ "minet.cli.reddit.posts", title="Minet Reddit Posts Command", description=""" - Retrieve reddit posts from a subreddit link. + Retrieve reddit posts from a subreddit link or name. """, epilog=""" Example: @@ -80,7 +80,7 @@ Example: . 
Searching posts from the user page of u/random_user: - $ minet reddit posts https://www.reddit.com/user/random_user/submitted/ > random_user_posts.csv + $ minet reddit user_posts https://www.reddit.com/user/random_user/submitted/ > random_user_posts.csv """, variadic_input={ "dummy_column": "user", @@ -111,8 +111,8 @@ epilog=""" Example: - . Searching posts from the user page of u/random_user: - $ minet reddit posts https://www.reddit.com/user/random_user/comments/ > random_user_comments.csv + . Searching comments from the user page of u/random_user: + $ minet reddit user_comments https://www.reddit.com/user/random_user/comments/ > random_user_comments.csv """, variadic_input={ "dummy_column": "user", @@ -122,7 +122,7 @@ arguments=[ { "flags": ["-n", "--number"], - "help": "Number of posts to retrieve.", + "help": "Number of comments to retrieve.", "type": int, }, ], From 2e2abfcc9b94fa18292d6d73c88e0677bb43da4d Mon Sep 17 00:00:00 2001 From: Julien Pontoire Date: Thu, 9 Jan 2025 11:52:17 +0100 Subject: [PATCH 26/47] Fixing typo --- minet/cli/reddit/comments.py | 2 +- minet/cli/reddit/posts.py | 12 ++++++++---- minet/cli/reddit/user_comments.py | 2 +- minet/cli/reddit/user_posts.py | 12 ++++++++---- 4 files changed, 18 insertions(+), 10 deletions(-) diff --git a/minet/cli/reddit/comments.py b/minet/cli/reddit/comments.py index d41c7b9699..c5232b1a63 100644 --- a/minet/cli/reddit/comments.py +++ b/minet/cli/reddit/comments.py @@ -13,7 +13,7 @@ @with_enricher_and_loading_bar( headers=RedditComment, title="Scraping comments", - unit="groups", + unit="pages", nested=True, sub_unit="comments", ) diff --git a/minet/cli/reddit/posts.py b/minet/cli/reddit/posts.py index d73ef812f1..a3e8738d9c 100644 --- a/minet/cli/reddit/posts.py +++ b/minet/cli/reddit/posts.py @@ -13,14 +13,14 @@ @with_enricher_and_loading_bar( headers=RedditPost, title="Scraping posts", - unit="groups", + unit="pages", nested=True, sub_unit="posts", ) def action(cli_args, enricher, loading_bar): scraper = RedditScraper() - type_page = 'subreddit' + type_page = "subreddit" for i, row, url in enricher.enumerate_cells( cli_args.column, with_rows=True, start=1 @@ -29,9 +29,13 @@ def action(cli_args, enricher, loading_bar): try: if cli_args.number: if cli_args.text: - posts = scraper.get_general_post(url, type_page, True, cli_args.number) + posts = scraper.get_general_post( + url, type_page, True, cli_args.number + ) else: - posts = scraper.get_general_post(url, type_page, False, cli_args.number) + posts = scraper.get_general_post( + url, type_page, False, cli_args.number + ) else: if cli_args.text: posts = scraper.get_general_post(url, type_page, True) diff --git a/minet/cli/reddit/user_comments.py b/minet/cli/reddit/user_comments.py index c2e48ccefd..b25022303c 100644 --- a/minet/cli/reddit/user_comments.py +++ b/minet/cli/reddit/user_comments.py @@ -13,7 +13,7 @@ @with_enricher_and_loading_bar( headers=RedditUserComment, title="Scraping user comments", - unit="groups", + unit="pages", nested=True, sub_unit="comments", ) diff --git a/minet/cli/reddit/user_posts.py b/minet/cli/reddit/user_posts.py index d55abda1b3..1d1b176137 100644 --- a/minet/cli/reddit/user_posts.py +++ b/minet/cli/reddit/user_posts.py @@ -13,14 +13,14 @@ @with_enricher_and_loading_bar( headers=RedditUserPost, title="Scraping user posts", - unit="groups", + unit="pages", nested=True, sub_unit="posts", ) def action(cli_args, enricher, loading_bar): scraper = RedditScraper() - type_page = 'user' + type_page = "user" for i, row, url in enricher.enumerate_cells( 
cli_args.column, with_rows=True, start=1 @@ -29,9 +29,13 @@ def action(cli_args, enricher, loading_bar): try: if cli_args.number: if cli_args.text: - posts = scraper.get_general_post(url, type_page, True, cli_args.number) + posts = scraper.get_general_post( + url, type_page, True, cli_args.number + ) else: - posts = scraper.get_general_post(url, type_page, False, cli_args.number) + posts = scraper.get_general_post( + url, type_page, False, cli_args.number + ) else: if cli_args.text: posts = scraper.get_general_post(url, type_page, True) From 39700b0b38d476c99665ca08d7d1b65be7ba733d Mon Sep 17 00:00:00 2001 From: Julien Pontoire Date: Thu, 9 Jan 2025 11:57:40 +0100 Subject: [PATCH 27/47] Fixing error in get_new_url --- minet/reddit/scraper.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/minet/reddit/scraper.py b/minet/reddit/scraper.py index 645ce66b21..03bdcd22f2 100644 --- a/minet/reddit/scraper.py +++ b/minet/reddit/scraper.py @@ -30,7 +30,7 @@ def get_old_url(url): def get_new_url(url): domain = get_domain_name(url) path = urlpathsplit(url) - new_url = f"https://old.{domain}" + new_url = f"https://www.{domain}" for ele in path: new_url = urljoin(new_url, f"{ele}/") return new_url From 4d74228582dd82ee61efb8e4227819a3e5482606 Mon Sep 17 00:00:00 2001 From: Julien Pontoire Date: Thu, 9 Jan 2025 14:01:36 +0100 Subject: [PATCH 28/47] changes doc and kebab-case --- minet/cli/reddit/__init__.py | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/minet/cli/reddit/__init__.py b/minet/cli/reddit/__init__.py index c6e6a295fb..2d6c504feb 100644 --- a/minet/cli/reddit/__init__.py +++ b/minet/cli/reddit/__init__.py @@ -24,8 +24,8 @@ """, variadic_input={ "dummy_column": "subreddit", - "item_label": "subreddit url, subreddit shortcode or subreddit id", - "item_label_plural": "subreddit urls, subreddit shortcodes or subreddits ids", + "item_label": "subreddit url, shortcode or id", + "item_label_plural": "subreddit urls, shortcodes or ids", }, arguments=[ { @@ -57,8 +57,8 @@ """, variadic_input={ "dummy_column": "post", - "item_label": "post url, post shortcode or post id", - "item_label_plural": "posts urls, posts shortcodes or posts ids", + "item_label": "post url, shortcode or id", + "item_label_plural": "posts urls, shortcodes or ids", }, arguments=[ { @@ -70,7 +70,7 @@ ) REDDIT_USER_POSTS_SUBCOMMAND = command( - "user_posts", + "user-posts", "minet.cli.reddit.user_posts", title="Minet Reddit User Posts Command", description=""" @@ -80,12 +80,12 @@ Example: . Searching posts from the user page of u/random_user: - $ minet reddit user_posts https://www.reddit.com/user/random_user/submitted/ > random_user_posts.csv + $ minet reddit user-posts https://www.reddit.com/user/random_user/submitted/ > random_user_posts.csv """, variadic_input={ "dummy_column": "user", - "item_label": "user url, user shortcode or user id", - "item_label_plural": "user urls, user shortcodes or user ids", + "item_label": "user url, shortcode or id", + "item_label_plural": "user urls, shortcodes or ids", }, arguments=[ { @@ -102,7 +102,7 @@ ) REDDIT_USER_COMMENTS_SUBCOMMAND = command( - "user_comments", + "user-comments", "minet.cli.reddit.user_comments", title="Minet Reddit User Comments Command", description=""" @@ -112,12 +112,12 @@ Example: . 
Searching comments from the user page of u/random_user: - $ minet reddit user_comments https://www.reddit.com/user/random_user/comments/ > random_user_comments.csv + $ minet reddit user-comments https://www.reddit.com/user/random_user/comments/ > random_user_comments.csv """, variadic_input={ "dummy_column": "user", - "item_label": "user url, user shortcode or user id", - "item_label_plural": "user urls, user shortcodes or user ids", + "item_label": "user url, shortcode or id", + "item_label_plural": "user urls, shortcodes or ids", }, arguments=[ { From 6e32569333ac837c33c03d4b26353a0b92baed9a Mon Sep 17 00:00:00 2001 From: Julien Pontoire Date: Thu, 9 Jan 2025 14:16:40 +0100 Subject: [PATCH 29/47] removing print and sleep --- minet/reddit/scraper.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/minet/reddit/scraper.py b/minet/reddit/scraper.py index 03bdcd22f2..1f1f437b48 100644 --- a/minet/reddit/scraper.py +++ b/minet/reddit/scraper.py @@ -46,7 +46,6 @@ def get_url_from_subreddit(name: str): def reddit_request(url, pool_manager): - sleep(1) response = request(url, pool_manager=pool_manager) soup = response.soup() if response.status == 500 and soup.scrape_one("img", "alt") == "you broke reddit": @@ -60,7 +59,6 @@ def reddit_request(url, pool_manager): remaining_requests = float(response.headers["x-ratelimit-remaining"]) if remaining_requests == 1: time_remaining = int(response.headers["x-ratelimit-reset"]) - print(f"Time before next request : {time_remaining}s") sleep(time_remaining) return reddit_request(url) if response.status == 429: From a49918b7e45538ecf611b48e563a3ac9d881b464 Mon Sep 17 00:00:00 2001 From: Julien Pontoire Date: Thu, 9 Jan 2025 14:21:01 +0100 Subject: [PATCH 30/47] Avoid stack overflow error --- minet/reddit/scraper.py | 41 +++++++++++++++++++++++------------------ 1 file changed, 23 insertions(+), 18 deletions(-) diff --git a/minet/reddit/scraper.py b/minet/reddit/scraper.py index 1f1f437b48..27c6c158e4 100644 --- a/minet/reddit/scraper.py +++ b/minet/reddit/scraper.py @@ -46,24 +46,29 @@ def get_url_from_subreddit(name: str): def reddit_request(url, pool_manager): - response = request(url, pool_manager=pool_manager) - soup = response.soup() - if response.status == 500 and soup.scrape_one("img", "alt") == "you broke reddit": - return response, soup, "broken page" - if response.status == 404 and soup.scrape_one("img", "alt") == "banned": - return response, soup, "banned" - if response.status == 404 or ( - soup.scrape("p[id='noresults']") and not soup.scrape("div[class='commentarea']") - ): - raise RedditInvalidTargetError - remaining_requests = float(response.headers["x-ratelimit-remaining"]) - if remaining_requests == 1: - time_remaining = int(response.headers["x-ratelimit-reset"]) - sleep(time_remaining) - return reddit_request(url) - if response.status == 429: - return reddit_request(url) - return response, soup, None + while True: + response = request(url, pool_manager=pool_manager) + soup = response.soup() + if ( + response.status == 500 + and soup.scrape_one("img", "alt") == "you broke reddit" + ): + return response, soup, "broken page" + if response.status == 404 and soup.scrape_one("img", "alt") == "banned": + return response, soup, "banned" + if response.status == 404 or ( + soup.scrape("p[id='noresults']") + and not soup.scrape("div[class='commentarea']") + ): + raise RedditInvalidTargetError + remaining_requests = float(response.headers["x-ratelimit-remaining"]) + if remaining_requests == 1: + time_remaining = 
int(response.headers["x-ratelimit-reset"]) + sleep(time_remaining) + continue + if response.status == 429: + continue + return response, soup, None def extract_t1_ids(text): From 318955864f3ad84abe08fd0a084d24d38107c153 Mon Sep 17 00:00:00 2001 From: Julien Pontoire Date: Thu, 9 Jan 2025 15:25:21 +0100 Subject: [PATCH 31/47] refacto --- minet/cli/reddit/comments.py | 5 +---- minet/cli/reddit/posts.py | 17 +++++------------ minet/cli/reddit/user_posts.py | 17 +++++------------ 3 files changed, 11 insertions(+), 28 deletions(-) diff --git a/minet/cli/reddit/comments.py b/minet/cli/reddit/comments.py index c5232b1a63..853175e8e4 100644 --- a/minet/cli/reddit/comments.py +++ b/minet/cli/reddit/comments.py @@ -25,10 +25,7 @@ def action(cli_args, enricher, loading_bar): ): with loading_bar.step(url): try: - if cli_args.all: - comments = scraper.get_comments(url, True) - else: - comments = scraper.get_comments(url, False) + comments = scraper.get_comments(url, cli_args.all) except RedditInvalidTargetError: loading_bar.print( diff --git a/minet/cli/reddit/posts.py b/minet/cli/reddit/posts.py index a3e8738d9c..91754946ce 100644 --- a/minet/cli/reddit/posts.py +++ b/minet/cli/reddit/posts.py @@ -28,19 +28,12 @@ def action(cli_args, enricher, loading_bar): with loading_bar.step(url): try: if cli_args.number: - if cli_args.text: - posts = scraper.get_general_post( - url, type_page, True, cli_args.number - ) - else: - posts = scraper.get_general_post( - url, type_page, False, cli_args.number - ) + posts = scraper.get_general_post( + url, type_page, cli_args.text, cli_args.number + ) else: - if cli_args.text: - posts = scraper.get_general_post(url, type_page, True) - else: - posts = scraper.get_general_post(url, type_page, False) + posts = scraper.get_general_post(url, type_page, cli_args.text) + except RedditInvalidTargetError: loading_bar.print( "the script could not complete normally on line %i" % (i) diff --git a/minet/cli/reddit/user_posts.py b/minet/cli/reddit/user_posts.py index 1d1b176137..408d217319 100644 --- a/minet/cli/reddit/user_posts.py +++ b/minet/cli/reddit/user_posts.py @@ -28,19 +28,12 @@ def action(cli_args, enricher, loading_bar): with loading_bar.step(url): try: if cli_args.number: - if cli_args.text: - posts = scraper.get_general_post( - url, type_page, True, cli_args.number - ) - else: - posts = scraper.get_general_post( - url, type_page, False, cli_args.number - ) + posts = scraper.get_general_post( + url, type_page, cli_args.text, cli_args.number + ) else: - if cli_args.text: - posts = scraper.get_general_post(url, type_page, True) - else: - posts = scraper.get_general_post(url, type_page, False) + posts = scraper.get_general_post(url, type_page, cli_args.text) + except RedditInvalidTargetError: loading_bar.print( "the script could not complete normally on line %i" % (i) From fa3bc2847073678bbfa657890da13f3aedcb7d55 Mon Sep 17 00:00:00 2001 From: Julien Pontoire Date: Thu, 9 Jan 2025 17:49:21 +0100 Subject: [PATCH 32/47] changing -n, --number to -l, --limit and fixing errors with comments --- minet/cli/reddit/__init__.py | 12 +++++----- minet/cli/reddit/posts.py | 9 +++---- minet/cli/reddit/user_comments.py | 5 +--- minet/cli/reddit/user_posts.py | 9 +++---- minet/reddit/scraper.py | 39 +++++++++++++++++-------------- 5 files changed, 34 insertions(+), 40 deletions(-) diff --git a/minet/cli/reddit/__init__.py b/minet/cli/reddit/__init__.py index 2d6c504feb..6ab6acf884 100644 --- a/minet/cli/reddit/__init__.py +++ b/minet/cli/reddit/__init__.py @@ -29,8 +29,8 @@ }, arguments=[ { 
- "flags": ["-n", "--number"], - "help": "Number of posts to retrieve.", + "flags": ["-l", "--limit"], + "help": "Maximum number of posts to retrieve.", "type": int, }, { @@ -89,8 +89,8 @@ }, arguments=[ { - "flags": ["-n", "--number"], - "help": "Number of posts to retrieve.", + "flags": ["-l", "--limit"], + "help": "Maximum number of posts to retrieve.", "type": int, }, { @@ -121,8 +121,8 @@ }, arguments=[ { - "flags": ["-n", "--number"], - "help": "Number of comments to retrieve.", + "flags": ["-l", "--limit"], + "help": "Maximum number of comments to retrieve.", "type": int, }, ], diff --git a/minet/cli/reddit/posts.py b/minet/cli/reddit/posts.py index 91754946ce..2111972b98 100644 --- a/minet/cli/reddit/posts.py +++ b/minet/cli/reddit/posts.py @@ -27,12 +27,9 @@ def action(cli_args, enricher, loading_bar): ): with loading_bar.step(url): try: - if cli_args.number: - posts = scraper.get_general_post( - url, type_page, cli_args.text, cli_args.number - ) - else: - posts = scraper.get_general_post(url, type_page, cli_args.text) + posts = scraper.get_general_post( + url, type_page, cli_args.text, cli_args.limit + ) except RedditInvalidTargetError: loading_bar.print( diff --git a/minet/cli/reddit/user_comments.py b/minet/cli/reddit/user_comments.py index b25022303c..e7a3e5a02b 100644 --- a/minet/cli/reddit/user_comments.py +++ b/minet/cli/reddit/user_comments.py @@ -25,10 +25,7 @@ def action(cli_args, enricher, loading_bar): ): with loading_bar.step(url): try: - if cli_args.number: - posts = scraper.get_user_comments(url, cli_args.number) - else: - posts = scraper.get_user_comments(url) + posts = scraper.get_user_comments(url, cli_args.limit) except RedditInvalidTargetError: loading_bar.print( diff --git a/minet/cli/reddit/user_posts.py b/minet/cli/reddit/user_posts.py index 408d217319..ea950b09c5 100644 --- a/minet/cli/reddit/user_posts.py +++ b/minet/cli/reddit/user_posts.py @@ -27,12 +27,9 @@ def action(cli_args, enricher, loading_bar): ): with loading_bar.step(url): try: - if cli_args.number: - posts = scraper.get_general_post( - url, type_page, cli_args.text, cli_args.number - ) - else: - posts = scraper.get_general_post(url, type_page, cli_args.text) + posts = scraper.get_general_post( + url, type_page, cli_args.text, cli_args.limit + ) except RedditInvalidTargetError: loading_bar.print( diff --git a/minet/reddit/scraper.py b/minet/reddit/scraper.py index 27c6c158e4..d488002b8c 100644 --- a/minet/reddit/scraper.py +++ b/minet/reddit/scraper.py @@ -56,9 +56,10 @@ def reddit_request(url, pool_manager): return response, soup, "broken page" if response.status == 404 and soup.scrape_one("img", "alt") == "banned": return response, soup, "banned" - if response.status == 404 or ( - soup.scrape("p[id='noresults']") - and not soup.scrape("div[class='commentarea']") + if ( + soup.scrape_one("span.pagename.selected") == "page not found" + or "search?q=" in url + and soup.scrape_one("p.error") ): raise RedditInvalidTargetError remaining_requests = float(response.headers["x-ratelimit-remaining"]) @@ -116,7 +117,7 @@ def data_posts( error, ): try_author = post.select_one("a[class*='author']") - author = try_author.get_text() if try_author else "Deleted" + author = try_author.get_text() if try_author else "[Deleted]" if get_domain_name(link) == "reddit.com": link = "" data = RedditPost( @@ -216,10 +217,14 @@ def get_comments(self, url: str, all): while m_comments: parent, com = m_comments.pop() current_id = get_current_id(com) - comment_url = com.scrape_one("a[class='bylink']", "href") - try_author = 
com.scrape_one("a[class^='author']") - author = try_author if try_author else "Deleted" - points = get_points(com) + if com.get("class") == " thing noncollapsed deleted comment ": + comment_url = None + author = "[Deleted]" + points = None + else: + comment_url = com.scrape_one("a[class='bylink']", "href") + author = com.scrape_one("a[class^='author']") + points = get_points(com) published_date, edited_date = get_dates(com) if "morerecursion" in com.get("class") and all: url_rec = f"https://old.reddit.com{com.scrape_one('a', 'href')}" @@ -272,17 +277,16 @@ def get_comments(self, url: str, all): if data.id != "": yield data - def get_general_post(self, url: str, type: str, add_text: bool, nb=25): - nb_pages = ceil(int(nb) / 25) + def get_general_post(self, url: str, type: str, add_text: bool, limit: int): n_crawled = 0 old_url = get_old_url(get_url_from_subreddit(url)) - for _ in range(nb_pages): - if n_crawled == int(nb) or not old_url: + while old_url and (limit is None or n_crawled < limit): + if limit is not None and n_crawled == limit: break _, soup, error = reddit_request(old_url, self.pool_manager) posts = soup.select("div[id^='thing_t3_']") for post in posts: - if n_crawled == int(nb): + if limit is not None and n_crawled == limit: break list_buttons = post.select_one("ul[class='flat-list buttons']") if len(list_buttons.scrape("span[class='promoted-span']")) == 0: @@ -380,12 +384,11 @@ def get_general_post(self, url: str, type: str, add_text: bool, nb=25): n_crawled += 1 old_url = soup.scrape_one("span[class='next-button'] a", "href") - def get_user_comments(self, url: str, nb=25): - nb_pages = ceil(int(nb) / 25) + def get_user_comments(self, url: str, limit: int): n_crawled = 0 old_url = get_old_url(url) - for _ in range(nb_pages): - if n_crawled == int(nb): + while old_url and (limit is None or n_crawled < limit): + if limit is not None and n_crawled == limit: break _, soup, error = reddit_request(old_url, self.pool_manager) if error: @@ -403,7 +406,7 @@ def get_user_comments(self, url: str, nb=25): else: comments = soup.select("[data-type='comment']") for comment in comments: - if n_crawled == int(nb): + if limit is not None and n_crawled == limit: break post_title = comment.scrape_one("a[class='title']") post_url = comment.scrape_one("a[class='bylink may-blank']", "href") From 2f51fb60f7c99953d3d134626ed8203b50b8ee6d Mon Sep 17 00:00:00 2001 From: Julien Pontoire Date: Thu, 9 Jan 2025 17:54:00 +0100 Subject: [PATCH 33/47] Fixing gh-tests error --- minet/reddit/scraper.py | 1 - 1 file changed, 1 deletion(-) diff --git a/minet/reddit/scraper.py b/minet/reddit/scraper.py index d488002b8c..b49e023b9c 100644 --- a/minet/reddit/scraper.py +++ b/minet/reddit/scraper.py @@ -1,4 +1,3 @@ -from math import ceil import re from time import sleep from ural import get_domain_name, urlpathsplit, is_url From 5a42b15b6158b77e7af8d2a48eea8bf70b0c5384 Mon Sep 17 00:00:00 2001 From: Julien Pontoire Date: Fri, 10 Jan 2025 15:13:45 +0100 Subject: [PATCH 34/47] Fixing comments and handling detection --- minet/reddit/scraper.py | 93 +++++++++++++++++++++++++++++------------ 1 file changed, 67 insertions(+), 26 deletions(-) diff --git a/minet/reddit/scraper.py b/minet/reddit/scraper.py index b49e023b9c..05fdfb6df5 100644 --- a/minet/reddit/scraper.py +++ b/minet/reddit/scraper.py @@ -1,3 +1,4 @@ +from random import choice import re from time import sleep from ural import get_domain_name, urlpathsplit, is_url @@ -12,6 +13,28 @@ ) from minet.web import request, create_pool_manager +USER_AGENTS = [ + 
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:124.0) Gecko/20100101 Firefox/124.0", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36 Edg/123.0.2420.81", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36 OPR/109.0.0.0", + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36", + "Mozilla/5.0 (Macintosh; Intel Mac OS X 14.4; rv:124.0) Gecko/20100101 Firefox/124.0", + "Mozilla/5.0 (Macintosh; Intel Mac OS X 14_4_1) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.4.1 Safari/605.1.15", + "Mozilla/5.0 (Macintosh; Intel Mac OS X 14_4_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36 OPR/109.0.0.0", + "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36", + "Mozilla/5.0 (X11; Linux i686; rv:124.0) Gecko/20100101 Firefox/124.0", +] + + +def add_slash(url: str): + path = url.split("/") + if path[-1] == "?limit=500": + return url + elif url[-1] != "/": + return url + "/" + return url + def resolve_relative_url(path): return urljoin("https://old.reddit.com", path) @@ -46,8 +69,16 @@ def get_url_from_subreddit(name: str): def reddit_request(url, pool_manager): while True: - response = request(url, pool_manager=pool_manager) + response = request( + add_slash(url), + pool_manager=pool_manager, + headers={"User-Agent": choice(USER_AGENTS)}, + ) soup = response.soup() + remaining_requests = float(response.headers["x-ratelimit-remaining"]) + if response.status == 429 and remaining_requests > 1: + sleep(1) + continue if ( response.status == 500 and soup.scrape_one("img", "alt") == "you broke reddit" @@ -61,13 +92,10 @@ def reddit_request(url, pool_manager): and soup.scrape_one("p.error") ): raise RedditInvalidTargetError - remaining_requests = float(response.headers["x-ratelimit-remaining"]) - if remaining_requests == 1: + if remaining_requests == 1 or response.status == 429: time_remaining = int(response.headers["x-ratelimit-reset"]) sleep(time_remaining) continue - if response.status == 429: - continue return response, soup, None @@ -172,23 +200,27 @@ def __init__(self): def get_childs_l500(self, url, list_comments, parent_id): _, soup, _ = reddit_request(url, self.pool_manager) comments = soup.select("div[class='commentarea']>div>div[class*='comment']") - for com in comments: - child = com.find("div", class_="child") - if child.text != "": - child = child.find("div") - child_com = child.find_all( - "div", - class_=lambda x: x - and ( - "comment" in x - or "deleted comment" in x - or "morerecursion" in x - or "morechildren" in x - ), - recursive=False, - ) - for ele in child_com: - list_comments.append((parent_id, ele)) + if parent_id == None: + for com in comments: + list_comments.append((None, com)) + else: + for com in comments: + child = com.find("div", class_="child") + if child.text != "": + child = child.find("div") + child_com = child.find_all( + "div", + class_=lambda x: x + and ( + "comment" in x + or "deleted comment" in x + or "morerecursion" in x + or "morechildren" in x + ), + recursive=False, + ) + for ele in child_com: + list_comments.append((parent_id, ele)) return list_comments def get_comments(self, url: str, all): @@ -211,13 +243,22 @@ def get_comments(self, url: str, all): first_comments = soup.select( 
"div[class='commentarea']>div>div[class*='comment']" ) + if all: + more = soup.select("div.commentarea>div>div[class*='morechildren']") + for ele in more: + a = ele.select_one("a") + onclick = a["onclick"] + id_list = extract_t1_ids(onclick) + for id in id_list: + comment_url = f"{old_url}{id}/" + m_comments = self.get_childs_l500(comment_url, m_comments, None) for ele in first_comments: m_comments.append((None, ele)) while m_comments: parent, com = m_comments.pop() current_id = get_current_id(com) - if com.get("class") == " thing noncollapsed deleted comment ": - comment_url = None + if "deleted comment" in com.get("class"): + comment_url = com.get("data-permalink") author = "[Deleted]" points = None else: @@ -233,7 +274,7 @@ def get_comments(self, url: str, all): onclick = a["onclick"] id_list = extract_t1_ids(onclick) for id in id_list: - comment_url = f"{old_url}{id}" + comment_url = f"{old_url}{id}/" m_comments = self.get_childs_l500( comment_url, m_comments, current_id ) @@ -263,7 +304,7 @@ def get_comments(self, url: str, all): for ele in child_com: m_comments.append((current_id, ele)) data = RedditComment( - comment_url=get_new_url(comment_url) if comment_url else None, + comment_url=get_new_url(resolve_relative_url(comment_url)), author=author, id=current_id, parent=parent, From 7b9bb8c9b0f798b3a7b17f3edd42e4cb2b6e9fae Mon Sep 17 00:00:00 2001 From: Julien Pontoire Date: Fri, 10 Jan 2025 16:06:52 +0100 Subject: [PATCH 35/47] adding use of spoof-ua --- minet/reddit/scraper.py | 19 +++---------------- 1 file changed, 3 insertions(+), 16 deletions(-) diff --git a/minet/reddit/scraper.py b/minet/reddit/scraper.py index 05fdfb6df5..59ca1842c5 100644 --- a/minet/reddit/scraper.py +++ b/minet/reddit/scraper.py @@ -13,19 +13,6 @@ ) from minet.web import request, create_pool_manager -USER_AGENTS = [ - "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36", - "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:124.0) Gecko/20100101 Firefox/124.0", - "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36 Edg/123.0.2420.81", - "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36 OPR/109.0.0.0", - "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36", - "Mozilla/5.0 (Macintosh; Intel Mac OS X 14.4; rv:124.0) Gecko/20100101 Firefox/124.0", - "Mozilla/5.0 (Macintosh; Intel Mac OS X 14_4_1) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.4.1 Safari/605.1.15", - "Mozilla/5.0 (Macintosh; Intel Mac OS X 14_4_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36 OPR/109.0.0.0", - "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36", - "Mozilla/5.0 (X11; Linux i686; rv:124.0) Gecko/20100101 Firefox/124.0", -] - def add_slash(url: str): path = url.split("/") @@ -72,7 +59,7 @@ def reddit_request(url, pool_manager): response = request( add_slash(url), pool_manager=pool_manager, - headers={"User-Agent": choice(USER_AGENTS)}, + spoof_ua=True, ) soup = response.soup() remaining_requests = float(response.headers["x-ratelimit-remaining"]) @@ -200,7 +187,7 @@ def __init__(self): def get_childs_l500(self, url, list_comments, parent_id): _, soup, _ = reddit_request(url, self.pool_manager) comments = soup.select("div[class='commentarea']>div>div[class*='comment']") - if parent_id == None: + if parent_id is 
None: for com in comments: list_comments.append((None, com)) else: @@ -257,7 +244,7 @@ def get_comments(self, url: str, all): while m_comments: parent, com = m_comments.pop() current_id = get_current_id(com) - if "deleted comment" in com.get("class"): + if "deleted" in com.get("class") and "comment" in com.get("class"): comment_url = com.get("data-permalink") author = "[Deleted]" points = None From bf8aee986c2caadaf4997643beb0fdaa84eecee5 Mon Sep 17 00:00:00 2001 From: Julien Pontoire Date: Fri, 10 Jan 2025 16:10:33 +0100 Subject: [PATCH 36/47] Fixing tests --- minet/reddit/scraper.py | 1 - 1 file changed, 1 deletion(-) diff --git a/minet/reddit/scraper.py b/minet/reddit/scraper.py index 59ca1842c5..e3932e8b02 100644 --- a/minet/reddit/scraper.py +++ b/minet/reddit/scraper.py @@ -1,4 +1,3 @@ -from random import choice import re from time import sleep from ural import get_domain_name, urlpathsplit, is_url From c28763c5a8a2e10e2cc2f38e0ca6ecaa9aff5848 Mon Sep 17 00:00:00 2001 From: Julien Pontoire Date: Fri, 10 Jan 2025 17:21:18 +0100 Subject: [PATCH 37/47] Fixing error with deleted accounts --- minet/reddit/scraper.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/minet/reddit/scraper.py b/minet/reddit/scraper.py index e3932e8b02..801e72fa2d 100644 --- a/minet/reddit/scraper.py +++ b/minet/reddit/scraper.py @@ -249,7 +249,10 @@ def get_comments(self, url: str, all): points = None else: comment_url = com.scrape_one("a[class='bylink']", "href") - author = com.scrape_one("a[class^='author']") + try_author = com.select_one("div.entry.unvoted") + author = try_author.scrape_one("a[class^='author']") + if not author: + author = "[Deleted]" points = get_points(com) published_date, edited_date = get_dates(com) if "morerecursion" in com.get("class") and all: From 622fc2437e27e1631216bb9518ca6046b1a41289 Mon Sep 17 00:00:00 2001 From: Julien Pontoire Date: Fri, 10 Jan 2025 17:30:44 +0100 Subject: [PATCH 38/47] Compiling the regex outside the function --- minet/reddit/scraper.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/minet/reddit/scraper.py b/minet/reddit/scraper.py index 801e72fa2d..595d1da82a 100644 --- a/minet/reddit/scraper.py +++ b/minet/reddit/scraper.py @@ -12,6 +12,8 @@ ) from minet.web import request, create_pool_manager +ID_RE = re.compile(r"t1_(\w+)") + def add_slash(url: str): path = url.split("/") @@ -86,8 +88,7 @@ def reddit_request(url, pool_manager): def extract_t1_ids(text): - pattern = r"t1_(\w+)" - return [match.group(1) for match in re.finditer(pattern, text)] + return [match.group(1) for match in re.finditer(ID_RE, text)] def get_current_id(com): From 2fdfb61bdd3e0b5af181803cf31adf3b32bb2372 Mon Sep 17 00:00:00 2001 From: Julien Pontoire Date: Fri, 10 Jan 2025 17:45:38 +0100 Subject: [PATCH 39/47] refacto --- minet/reddit/scraper.py | 38 ++++++++++++++++---------------------- 1 file changed, 16 insertions(+), 22 deletions(-) diff --git a/minet/reddit/scraper.py b/minet/reddit/scraper.py index 595d1da82a..34442609e1 100644 --- a/minet/reddit/scraper.py +++ b/minet/reddit/scraper.py @@ -113,7 +113,7 @@ def get_points(ele): def get_dates(ele): published_date = ele.scrape_one("time", "datetime") - edited_date = ele.scrape_one("time[class='edited-timestamp']", "datetime") + edited_date = ele.scrape_one("time.edited-timestamp", "datetime") return published_date, edited_date @@ -186,7 +186,7 @@ def __init__(self): def get_childs_l500(self, url, list_comments, parent_id): _, soup, _ = reddit_request(url, self.pool_manager) - comments = 
soup.select("div[class='commentarea']>div>div[class*='comment']") + comments = soup.select("div.commentarea>div>div[class*='comment']") if parent_id is None: for com in comments: list_comments.append((None, com)) @@ -227,9 +227,7 @@ def get_comments(self, url: str, all): error=error, ) else: - first_comments = soup.select( - "div[class='commentarea']>div>div[class*='comment']" - ) + first_comments = soup.select("div.commentarea>div>div[class*='comment']") if all: more = soup.select("div.commentarea>div>div[class*='morechildren']") for ele in more: @@ -249,7 +247,7 @@ def get_comments(self, url: str, all): author = "[Deleted]" points = None else: - comment_url = com.scrape_one("a[class='bylink']", "href") + comment_url = com.scrape_one("a.bylink", "href") try_author = com.select_one("div.entry.unvoted") author = try_author.scrape_one("a[class^='author']") if not author: @@ -301,7 +299,7 @@ def get_comments(self, url: str, all): points=points, published_date=published_date, edited_date=edited_date, - comment=com.scrape_one("div[class='md']:not(div.child a)"), + comment=com.scrape_one("div.md:not(div.child a)"), error=error, ) if data.id != "": @@ -318,8 +316,8 @@ def get_general_post(self, url: str, type: str, add_text: bool, limit: int): for post in posts: if limit is not None and n_crawled == limit: break - list_buttons = post.select_one("ul[class='flat-list buttons']") - if len(list_buttons.scrape("span[class='promoted-span']")) == 0: + list_buttons = post.select_one("ul.flat-list.buttons") + if len(list_buttons.scrape("span.promoted-span")) == 0: title = post.force_select_one("a[class*='title']").get_text() post_url = list_buttons.scrape_one( "a[class^='bylink comments']", "href" @@ -373,7 +371,7 @@ def get_general_post(self, url: str, type: str, add_text: bool, limit: int): text_error, ) try_content = text_soup.select_one( - "div[id='siteTable'] div[class^='usertext']" + "div#siteTable div[class^='usertext']" ) if try_content: content = try_content.get_text() @@ -412,7 +410,7 @@ def get_general_post(self, url: str, type: str, add_text: bool, limit: int): yield post n_crawled += 1 - old_url = soup.scrape_one("span[class='next-button'] a", "href") + old_url = soup.scrape_one("span.next-button a", "href") def get_user_comments(self, url: str, limit: int): n_crawled = 0 @@ -438,19 +436,15 @@ def get_user_comments(self, url: str, limit: int): for comment in comments: if limit is not None and n_crawled == limit: break - post_title = comment.scrape_one("a[class='title']") - post_url = comment.scrape_one("a[class='bylink may-blank']", "href") - post_author = comment.scrape_one( - "p[class='parent']>a[class^='author']" - ) + post_title = comment.scrape_one("a.title") + post_url = comment.scrape_one("a.bylink.may-blank", "href") + post_author = comment.scrape_one("p.parent>a[class^='author']") post_subreddit = comment.scrape_one("a[class^='subreddit']", "href") points = get_points(comment) published_date, edited_date = get_dates(comment) - text = comment.scrape_one("div[class='content'] div[class='md']") - link = comment.scrape_one( - "div[class='content'] div[class='md'] a", "href" - ) - comment_url = comment.scrape_one("a[class='bylink']", "href") + text = comment.scrape_one("div.content div.md") + link = comment.scrape_one("div.content div.md a", "href") + comment_url = comment.scrape_one("a.bylink", "href") data = RedditUserComment( post_title=post_title, post_url=get_new_url(post_url), @@ -466,4 +460,4 @@ def get_user_comments(self, url: str, limit: int): ) yield data n_crawled += 1 - 
old_url = soup.scrape_one("span[class='next-button'] a", "href") + old_url = soup.scrape_one("span.next-button a", "href") From 1e311fdd5d24bb0ca5b08a9701053c0855dd50aa Mon Sep 17 00:00:00 2001 From: Julien Pontoire Date: Mon, 13 Jan 2025 17:05:00 +0100 Subject: [PATCH 40/47] Fixing error with number of posts retrieved --- minet/reddit/scraper.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/minet/reddit/scraper.py b/minet/reddit/scraper.py index 34442609e1..577f8a296e 100644 --- a/minet/reddit/scraper.py +++ b/minet/reddit/scraper.py @@ -409,7 +409,7 @@ def get_general_post(self, url: str, type: str, add_text: bool, limit: int): ) yield post - n_crawled += 1 + n_crawled += 1 old_url = soup.scrape_one("span.next-button a", "href") def get_user_comments(self, url: str, limit: int): From b414d8a8b94997344122a0167b98260ce2f98af1 Mon Sep 17 00:00:00 2001 From: Julien Pontoire Date: Tue, 14 Jan 2025 12:07:54 +0100 Subject: [PATCH 41/47] Fixing bug with old posts --- minet/reddit/scraper.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/minet/reddit/scraper.py b/minet/reddit/scraper.py index 577f8a296e..3ba6a8654f 100644 --- a/minet/reddit/scraper.py +++ b/minet/reddit/scraper.py @@ -87,8 +87,11 @@ def reddit_request(url, pool_manager): return response, soup, None -def extract_t1_ids(text): - return [match.group(1) for match in re.finditer(ID_RE, text)] +def extract_t1_ids(text: str): + ids = [match.group(1) for match in re.finditer(ID_RE, text)] + if ids: + return ids + return text.split("'")[-4].split(",") def get_current_id(com): From 392f20bc382f26d072f1574634a74fa89e8b44d6 Mon Sep 17 00:00:00 2001 From: Julien Pontoire Date: Tue, 14 Jan 2025 14:00:58 +0100 Subject: [PATCH 42/47] fixing error with "?..." in url --- minet/reddit/scraper.py | 21 +++++---------------- 1 file changed, 5 insertions(+), 16 deletions(-) diff --git a/minet/reddit/scraper.py b/minet/reddit/scraper.py index 3ba6a8654f..6a436844b1 100644 --- a/minet/reddit/scraper.py +++ b/minet/reddit/scraper.py @@ -17,7 +17,7 @@ def add_slash(url: str): path = url.split("/") - if path[-1] == "?limit=500": + if path[-1][0] == "?": return url elif url[-1] != "/": return url + "/" @@ -29,21 +29,11 @@ def resolve_relative_url(path): def get_old_url(url): - domain = get_domain_name(url) - path = urlpathsplit(url) - old_url = f"https://old.{domain}" - for ele in path: - old_url = urljoin(old_url, f"{ele}/") - return old_url + return url.replace("www.reddit", "old.reddit") def get_new_url(url): - domain = get_domain_name(url) - path = urlpathsplit(url) - new_url = f"https://www.{domain}" - for ele in path: - new_url = urljoin(new_url, f"{ele}/") - return new_url + return url.replace("old.reddit", "www.reddit") def get_url_from_subreddit(name: str): @@ -133,14 +123,13 @@ def data_posts( link, error, ): - try_author = post.select_one("a[class*='author']") - author = try_author.get_text() if try_author else "[Deleted]" + author = post.scrape_one("a[class*='author']") if get_domain_name(link) == "reddit.com": link = "" data = RedditPost( title=title, url=get_new_url(url), - author=author, + author=author if author else "[Deleted]", author_text=author_text, points=points, scraped_number_comments=scraped_number_comments, From 9d4218ffd5fbbf380ad69e9e20bfc7ffcce6a83e Mon Sep 17 00:00:00 2001 From: Julien Pontoire Date: Tue, 14 Jan 2025 14:04:19 +0100 Subject: [PATCH 43/47] Fixing test error --- minet/reddit/scraper.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/minet/reddit/scraper.py b/minet/reddit/scraper.py index 6a436844b1..4a3d4fae52 100644 --- a/minet/reddit/scraper.py +++ b/minet/reddit/scraper.py @@ -1,6 +1,6 @@ import re from time import sleep -from ural import get_domain_name, urlpathsplit, is_url +from ural import get_domain_name, is_url from urllib.parse import urljoin from minet.reddit.exceptions import RedditInvalidTargetError From 8468cb848fe1eedcd0f3d4bb335f92be97197569 Mon Sep 17 00:00:00 2001 From: Julien Pontoire Date: Tue, 14 Jan 2025 14:17:25 +0100 Subject: [PATCH 44/47] refacto --- minet/reddit/scraper.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/minet/reddit/scraper.py b/minet/reddit/scraper.py index 4a3d4fae52..865a11a498 100644 --- a/minet/reddit/scraper.py +++ b/minet/reddit/scraper.py @@ -15,6 +15,7 @@ ID_RE = re.compile(r"t1_(\w+)") +# when missing a '/' at the end of an url, reddit will make a redirection and it will reduce by 2 the number of requests remaining def add_slash(url: str): path = url.split("/") if path[-1][0] == "?": @@ -94,11 +95,11 @@ def get_current_id(com): def get_points(ele): - scrapped_points = ele.select_one("[class='score unvoted']") - score_hidden = ele.select_one("[class='score-hidden']") + scrapped_points = ele.select_one(".score.unvoted") + score_hidden = ele.select_one(".score-hidden") if not scrapped_points and not score_hidden: return "deleted" - scrapped_points = ele.scrape_one("[class='score unvoted']", "title") + scrapped_points = ele.scrape_one(".score.unvoted", "title") if not scrapped_points: return "score hidden" return scrapped_points @@ -123,9 +124,9 @@ def data_posts( link, error, ): - author = post.scrape_one("a[class*='author']") - if get_domain_name(link) == "reddit.com": - link = "" + author = post.scrape_one("a.author") + if "reddit.com/" in link: + link = None data = RedditPost( title=title, url=get_new_url(url), From 8dc345c970a0f1600229addbf61ae067368fecc9 Mon Sep 17 00:00:00 2001 From: Julien Pontoire Date: Wed, 15 Jan 2025 10:36:18 +0100 Subject: [PATCH 45/47] refacto --- minet/reddit/scraper.py | 71 +++++++++++++++++++---------------------- 1 file changed, 32 insertions(+), 39 deletions(-) diff --git a/minet/reddit/scraper.py b/minet/reddit/scraper.py index 865a11a498..665fca65a6 100644 --- a/minet/reddit/scraper.py +++ b/minet/reddit/scraper.py @@ -1,6 +1,6 @@ import re from time import sleep -from ural import get_domain_name, is_url +from ural import is_url from urllib.parse import urljoin from minet.reddit.exceptions import RedditInvalidTargetError @@ -156,7 +156,7 @@ def data_user_posts( link, error, ): - sub = post.scrape_one("a[class*='subreddit']", "href") + sub = post.scrape_one("a.subreddit", "href") data = RedditUserPost( title=title, url=get_new_url(url), @@ -179,28 +179,28 @@ def __init__(self): def get_childs_l500(self, url, list_comments, parent_id): _, soup, _ = reddit_request(url, self.pool_manager) - comments = soup.select("div.commentarea>div>div[class*='comment']") + comments = soup.select("div.commentarea>div>div.comment") if parent_id is None: for com in comments: list_comments.append((None, com)) - else: - for com in comments: - child = com.find("div", class_="child") - if child.text != "": - child = child.find("div") - child_com = child.find_all( - "div", - class_=lambda x: x - and ( - "comment" in x - or "deleted comment" in x - or "morerecursion" in x - or "morechildren" in x - ), - recursive=False, - ) - for ele in child_com: - list_comments.append((parent_id, ele)) + return list_comments + for com in 
comments: + child = com.find("div", class_="child") + if child.text != "": + child = child.find("div") + child_com = child.find_all( + "div", + class_=lambda x: x + and ( + "comment" in x + or "deleted comment" in x + or "morerecursion" in x + or "morechildren" in x + ), + recursive=False, + ) + for ele in child_com: + list_comments.append((parent_id, ele)) return list_comments def get_comments(self, url: str, all): @@ -220,9 +220,9 @@ def get_comments(self, url: str, all): error=error, ) else: - first_comments = soup.select("div.commentarea>div>div[class*='comment']") + first_comments = soup.select("div.commentarea>div>div.comment") if all: - more = soup.select("div.commentarea>div>div[class*='morechildren']") + more = soup.select("div.commentarea>div>div.morechildren") for ele in more: a = ele.select_one("a") onclick = a["onclick"] @@ -241,8 +241,7 @@ def get_comments(self, url: str, all): points = None else: comment_url = com.scrape_one("a.bylink", "href") - try_author = com.select_one("div.entry.unvoted") - author = try_author.scrape_one("a[class^='author']") + author = com.scrape_one("div.entry.unvoted a.author") if not author: author = "[Deleted]" points = get_points(com) @@ -286,7 +285,7 @@ def get_comments(self, url: str, all): m_comments.append((current_id, ele)) data = RedditComment( comment_url=get_new_url(resolve_relative_url(comment_url)), - author=author, + author=author if author else "[Deleted]", id=current_id, parent=parent, points=points, @@ -311,12 +310,10 @@ def get_general_post(self, url: str, type: str, add_text: bool, limit: int): break list_buttons = post.select_one("ul.flat-list.buttons") if len(list_buttons.scrape("span.promoted-span")) == 0: - title = post.force_select_one("a[class*='title']").get_text() - post_url = list_buttons.scrape_one( - "a[class^='bylink comments']", "href" - ) + title = post.force_select_one("a.title").get_text() + post_url = list_buttons.scrape_one("a.bylink.comments", "href") n_comments_scraped = list_buttons.select_one( - "a[class^='bylink comments']" + "a.bylink.comments" ).get_text() match = re.match(r"(\d+)\s+comment(s)?", n_comments_scraped) if match: @@ -325,9 +322,7 @@ def get_general_post(self, url: str, type: str, add_text: bool, limit: int): n_comments = 0 upvote = get_points(post) published_date, edited_date = get_dates(post) - link = resolve_relative_url( - post.scrape_one("a[class*='title']", "href") - ) + link = resolve_relative_url(post.scrape_one("a.title", "href")) if link == post_url: link = "" if add_text: @@ -363,9 +358,7 @@ def get_general_post(self, url: str, type: str, add_text: bool, limit: int): link, text_error, ) - try_content = text_soup.select_one( - "div#siteTable div[class^='usertext']" - ) + try_content = text_soup.select_one("div#siteTable div.usertext") if try_content: content = try_content.get_text() else: @@ -431,8 +424,8 @@ def get_user_comments(self, url: str, limit: int): break post_title = comment.scrape_one("a.title") post_url = comment.scrape_one("a.bylink.may-blank", "href") - post_author = comment.scrape_one("p.parent>a[class^='author']") - post_subreddit = comment.scrape_one("a[class^='subreddit']", "href") + post_author = comment.scrape_one("p.parent>a.author") + post_subreddit = comment.scrape_one("a.subreddit", "href") points = get_points(comment) published_date, edited_date = get_dates(comment) text = comment.scrape_one("div.content div.md") From de2918755a9561d1ee9fd63c7d6e3e4ed2aed148 Mon Sep 17 00:00:00 2001 From: Julien Pontoire Date: Wed, 15 Jan 2025 11:01:42 +0100 Subject: [PATCH 
46/47] fix bug with add_slash --- minet/reddit/scraper.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/minet/reddit/scraper.py b/minet/reddit/scraper.py index 665fca65a6..542f93b28a 100644 --- a/minet/reddit/scraper.py +++ b/minet/reddit/scraper.py @@ -18,9 +18,7 @@ # when missing a '/' at the end of an url, reddit will make a redirection and it will reduce by 2 the number of requests remaining def add_slash(url: str): path = url.split("/") - if path[-1][0] == "?": - return url - elif url[-1] != "/": + if path[-1] != "" and not path[-1].startswith("?"): return url + "/" return url From 25e2e60a419e61ff97865acba8c99757a0bca627 Mon Sep 17 00:00:00 2001 From: Julien Pontoire Date: Wed, 15 Jan 2025 13:57:40 +0100 Subject: [PATCH 47/47] refacto --- .gitignore | 1 - minet/reddit/scraper.py | 93 +++++++++++++---------------------------- 2 files changed, 30 insertions(+), 64 deletions(-) diff --git a/.gitignore b/.gitignore index ddd3a616c1..b2de0b22ba 100644 --- a/.gitignore +++ b/.gitignore @@ -25,7 +25,6 @@ ftest/*.csv *.sqlar *-wal *-shm -*.csv /crawl /downloaded diff --git a/minet/reddit/scraper.py b/minet/reddit/scraper.py index 542f93b28a..f49daa8c9f 100644 --- a/minet/reddit/scraper.py +++ b/minet/reddit/scraper.py @@ -296,6 +296,7 @@ def get_comments(self, url: str, all): yield data def get_general_post(self, url: str, type: str, add_text: bool, limit: int): + fn = data_posts if type == "subreddit" else data_user_posts n_crawled = 0 old_url = get_old_url(get_url_from_subreddit(url)) while old_url and (limit is None or n_crawled < limit): @@ -328,71 +329,37 @@ def get_general_post(self, url: str, type: str, add_text: bool, limit: int): post_url, self.pool_manager ) if text_error: - if type == "subreddit": - yield data_posts( - post, - title, - post_url, - "", - upvote, - n_comments_scraped, - n_comments, - published_date, - edited_date, - link, - text_error, - ) - else: - yield data_user_posts( - post, - title, - post_url, - "", - upvote, - n_comments_scraped, - n_comments, - published_date, - edited_date, - link, - text_error, - ) - try_content = text_soup.select_one("div#siteTable div.usertext") - if try_content: - content = try_content.get_text() - else: - content = "" - else: - content = "" - if type == "subreddit": - post = data_posts( - post, - title, - post_url, - content, - upvote, - n_comments_scraped, - n_comments, - published_date, - edited_date, - link, - error, + yield fn( + post, + title, + post_url, + None, + upvote, + n_comments_scraped, + n_comments, + published_date, + edited_date, + link, + text_error, + ) + content = text_soup.scrape_one( + "div#siteTable div.usertext-body" ) else: - post = data_user_posts( - post, - title, - post_url, - content, - upvote, - n_comments_scraped, - n_comments, - published_date, - edited_date, - link, - error, - ) - - yield post + content = "" + yield fn( + post, + title, + post_url, + content, + upvote, + n_comments_scraped, + n_comments, + published_date, + edited_date, + link, + error, + ) n_crawled += 1 old_url = soup.scrape_one("span.next-button a", "href")
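
For reference, a minimal sketch of driving the scraper from Python rather than through the CLI, assuming the module layout reached at the end of this series (minet.reddit.scraper.RedditScraper); the subreddit URL and the limit of 50 below are illustrative, not taken from the patches:

    from minet.reddit.scraper import RedditScraper

    scraper = RedditScraper()

    # get_general_post() is a generator: it walks old.reddit.com listing pages,
    # skips promoted entries and stops once `limit` posts have been yielded.
    # Passing "subreddit" makes it emit RedditPost records (via data_posts);
    # passing "user" would emit RedditUserPost records (via data_user_posts).
    for post in scraper.get_general_post(
        "https://www.reddit.com/r/france", "subreddit", add_text=False, limit=50
    ):
        print(post.title, post.url)

This mirrors what the `minet reddit posts -l 50` subcommand does once the series is applied, with the limit coming from -l/--limit and add_text from the command's text option.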