From 6cba1ee909c2dd575a5e1200952d43ff693c3549 Mon Sep 17 00:00:00 2001 From: Julien Pontoire Date: Thu, 5 Dec 2024 16:45:08 +0100 Subject: [PATCH 01/47] Draft reddit --- minet/cli/commands.py | 2 + minet/cli/reddit/__init__.py | 49 +++++++++++++++++ minet/cli/reddit/posts.py | 44 +++++++++++++++ minet/reddit/scraper.py | 103 +++++++++++++++++++++++++++++++++++ minet/reddit/types.py | 32 +++++++++++ 5 files changed, 230 insertions(+) create mode 100644 minet/cli/reddit/__init__.py create mode 100644 minet/cli/reddit/posts.py create mode 100644 minet/reddit/scraper.py create mode 100644 minet/reddit/types.py diff --git a/minet/cli/commands.py b/minet/cli/commands.py index 4202252a19..6200e990bb 100644 --- a/minet/cli/commands.py +++ b/minet/cli/commands.py @@ -14,6 +14,7 @@ from minet.cli.hyphe import HYPHE_COMMAND from minet.cli.instagram import INSTAGRAM_COMMAND from minet.cli.mediacloud import MEDIACLOUD_COMMAND +from minet.cli.reddit import REDDIT_COMMAND from minet.cli.telegram import TELEGRAM_COMMAND from minet.cli.tiktok import TIKTOK_COMMAND from minet.cli.twitter import TWITTER_COMMAND @@ -42,6 +43,7 @@ HYPHE_COMMAND, INSTAGRAM_COMMAND, MEDIACLOUD_COMMAND, + REDDIT_COMMAND, TELEGRAM_COMMAND, TIKTOK_COMMAND, TWITTER_COMMAND, diff --git a/minet/cli/reddit/__init__.py b/minet/cli/reddit/__init__.py new file mode 100644 index 0000000000..3301a6ee5e --- /dev/null +++ b/minet/cli/reddit/__init__.py @@ -0,0 +1,49 @@ +# ============================================================================= +# Minet Reddit CLI Action +# ============================================================================= +# +# Logic of the `rd` action. +# +from casanova import RowCountResumer + +from minet.cli.argparse import command, ConfigAction + +REDDIT_POSTS_SUBCOMMAND = command( + "posts", + "minet.cli.reddit.posts", + title="Minet Reddit Posts Command", + description=""" + Retrieve reddit posts from a subreddit link. + """, + epilog=""" + Example: + + . Searching posts from the subreddit r/france: + $ minet reddit posts https://www.reddit.com/r/france > r_france_posts.csv + """, + variadic_input= { + "dummy_column": "post", + "item_label": "post url, post shortcode or post id", + "item_label_plural": "post urls, post shortcodes or post ids", + }, + arguments=[ + { + "flags": ["-n", "--number"], + "help": "Number of posts to retrieve.", + "type": int, + } + ], +) + +REDDIT_COMMAND = command( + "reddit", + "minet.cli.reddit", + "Minet Reddit Command", + aliases=["rd"], + description=""" + Collect data from Reddit. + """, + subcommands=[ + REDDIT_POSTS_SUBCOMMAND, + ], +) \ No newline at end of file diff --git a/minet/cli/reddit/posts.py b/minet/cli/reddit/posts.py new file mode 100644 index 0000000000..bad56c15cb --- /dev/null +++ b/minet/cli/reddit/posts.py @@ -0,0 +1,44 @@ +# ============================================================================= +# Minet Reddit Posts CLI Action +# ============================================================================= +# +# Logic of the `rd posts` action. 
+# +from minet.cli.utils import with_enricher_and_loading_bar +from minet.reddit.scraper import RedditScraper +from minet.reddit.types import RedditPost + + + +@with_enricher_and_loading_bar( + headers={"post_url"}, + title="Scraping posts", + unit="groups", + nested=True, + sub_unit="posts", +) +def action(cli_args, enricher, loading_bar): + scraper = RedditScraper() + + for i, row, url in enricher.enumerate_cells( + cli_args.column, with_rows=True, start=1 + ): + with loading_bar.step(url): + try: + if cli_args.number: + posts = scraper.get_posts_urls(url, cli_args.number) + else: + posts = scraper.get_posts_urls(url) + except : + loading_bar.print( + "problème" + ) + continue + + list_posts = [] + for post in posts: + list_posts.append({post}) + + for post in list_posts: + loading_bar.nested_advance() + enricher.writerow(row, post) diff --git a/minet/reddit/scraper.py b/minet/reddit/scraper.py new file mode 100644 index 0000000000..624c421b6e --- /dev/null +++ b/minet/reddit/scraper.py @@ -0,0 +1,103 @@ +from minet.web import request, create_pool_manager +from math import ceil +from ural import get_domain_name, urlpathsplit, is_url +from time import sleep +from minet.reddit.types import RedditPost +import json +from ebbe import getpath +from collections import deque +from urllib.parse import urljoin +import csv +import re +import sys +import os + +def get_old_url(url): + domain = get_domain_name(url) + path = urlpathsplit(url) + return f"https://old.{domain}/" + "/".join(path) + "/" + + +def get_new_url(url): + domain = get_domain_name(url) + path = urlpathsplit(url) + return f"https://www.{domain}/" + "/".join(path) + "/" + +def reddit_request(url, pool_manager): + sleep(1) + response = request(url, pool_manager=pool_manager) + remaining_requests = float(response.headers["x-ratelimit-remaining"]) + if remaining_requests == 1: + time_remaining = int(response.headers["x-ratelimit-reset"]) + print(f"Time before next request : {time_remaining}s") + sleep(time_remaining) + return reddit_request(url) + if response.status == 429: + return reddit_request(url) + return response + + +class RedditScraper(object): + def __init__(self): + self.pool_manager = create_pool_manager() + + def get_posts_urls(self, url, nb_post = 25): + dir_name = urlpathsplit(url)[1] + try: + os.mkdir(dir_name) + except FileExistsError: + pass + except PermissionError: + print(f"Permission denied: Unable to create '{dir_name}'.") + except Exception as e: + print(f"An error occurred: {e}") + list_posts = set() + nb_pages = ceil(int(nb_post) / 25) + old_url = get_old_url(url) + n_crawled = 0 + for _ in range(nb_pages): + if n_crawled == int(nb_post): + break + response = reddit_request(old_url, self.pool_manager) + soup = response.soup() + list_buttons = soup.select("ul[class='flat-list buttons']") + for link in list_buttons: + if n_crawled == int(nb_post): + break + if len(link.scrape("span[class='promoted-span']")) == 0: + list_posts.update(link.scrape("a[class^='bylink comments']", "href")) + n_crawled += 1 + old_url = soup.scrape("span[class='next-button'] a", "href")[0] + return list(list_posts) + + + def get_posts(self, url, nb_post): + posts = [] + list_posts_url = self.get_posts_urls(self, url, nb_post) + for url in list_posts_url: + response = reddit_request(url, self.pool_manager) + if response.url == 429: + print(response.headers) + print(response.end_url) + soup = response.soup() + title = soup.force_select_one("a[class^='title']").get_text() + upvote = soup.force_select_one("div[class='score'] 
span").get_text() + author = soup.scrape_one("a[class^='author']", "href") + published_date = soup.scrape_one("div[class='date'] time", "datetime") + link = soup.scrape_one("a[class^='title']", "href") + if urlpathsplit(link) == urlpathsplit(url): + link = None + author_text = soup.scrape_one( + "div[id='siteTable'] div[class^='usertext-body'] div p" + ) + post = RedditPost( + title=title, + url=url, + author=author, + author_text=author_text, + upvote=upvote, + published_date=published_date, + link=link, + ) + posts.append(post) + return posts \ No newline at end of file diff --git a/minet/reddit/types.py b/minet/reddit/types.py new file mode 100644 index 0000000000..b918f72eab --- /dev/null +++ b/minet/reddit/types.py @@ -0,0 +1,32 @@ +from typing import List, Optional, Dict, Tuple, Iterable +from datetime import datetime + +from dataclasses import dataclass +from casanova import TabularRecord +from ebbe import getpath + + +@dataclass +class RedditPost(TabularRecord): + title: str + url: str + author: str + author_text: str + upvote: str + published_date: str + link: Optional[str] + + +@dataclass +class RedditComment(TabularRecord): + # url: str + # author: str + id: str + parent: str + # points: Optional[str] + # published_date: str + comment: str + + + + From 831537e106c2dfd2c63e7d6886a7aeb7ec2f2e6a Mon Sep 17 00:00:00 2001 From: Julien Pontoire Date: Thu, 5 Dec 2024 18:07:46 +0100 Subject: [PATCH 02/47] Fix reddit posts --- minet/cli/reddit/posts.py | 8 ++-- minet/reddit/scraper.py | 83 ++++++++++++--------------------------- minet/reddit/types.py | 2 +- 3 files changed, 31 insertions(+), 62 deletions(-) diff --git a/minet/cli/reddit/posts.py b/minet/cli/reddit/posts.py index bad56c15cb..6344cda616 100644 --- a/minet/cli/reddit/posts.py +++ b/minet/cli/reddit/posts.py @@ -11,7 +11,7 @@ @with_enricher_and_loading_bar( - headers={"post_url"}, + headers=RedditPost, title="Scraping posts", unit="groups", nested=True, @@ -26,9 +26,9 @@ def action(cli_args, enricher, loading_bar): with loading_bar.step(url): try: if cli_args.number: - posts = scraper.get_posts_urls(url, cli_args.number) + posts = scraper.get_posts(url, cli_args.number) else: - posts = scraper.get_posts_urls(url) + posts = scraper.get_posts(url) except : loading_bar.print( "problème" @@ -37,7 +37,7 @@ def action(cli_args, enricher, loading_bar): list_posts = [] for post in posts: - list_posts.append({post}) + list_posts.append(post) for post in list_posts: loading_bar.nested_advance() diff --git a/minet/reddit/scraper.py b/minet/reddit/scraper.py index 624c421b6e..df2bd326e0 100644 --- a/minet/reddit/scraper.py +++ b/minet/reddit/scraper.py @@ -1,16 +1,8 @@ from minet.web import request, create_pool_manager from math import ceil -from ural import get_domain_name, urlpathsplit, is_url +from ural import get_domain_name, urlpathsplit from time import sleep from minet.reddit.types import RedditPost -import json -from ebbe import getpath -from collections import deque -from urllib.parse import urljoin -import csv -import re -import sys -import os def get_old_url(url): domain = get_domain_name(url) @@ -41,17 +33,8 @@ class RedditScraper(object): def __init__(self): self.pool_manager = create_pool_manager() - def get_posts_urls(self, url, nb_post = 25): - dir_name = urlpathsplit(url)[1] - try: - os.mkdir(dir_name) - except FileExistsError: - pass - except PermissionError: - print(f"Permission denied: Unable to create '{dir_name}'.") - except Exception as e: - print(f"An error occurred: {e}") - list_posts = set() + def 
get_posts(self, url, nb_post = 25): + list_posts = [] nb_pages = ceil(int(nb_post) / 25) old_url = get_old_url(url) n_crawled = 0 @@ -60,44 +43,30 @@ def get_posts_urls(self, url, nb_post = 25): break response = reddit_request(old_url, self.pool_manager) soup = response.soup() - list_buttons = soup.select("ul[class='flat-list buttons']") - for link in list_buttons: + posts = soup.select("div[id^='thing_t3_']") + for post in posts: if n_crawled == int(nb_post): break - if len(link.scrape("span[class='promoted-span']")) == 0: - list_posts.update(link.scrape("a[class^='bylink comments']", "href")) - n_crawled += 1 - old_url = soup.scrape("span[class='next-button'] a", "href")[0] - return list(list_posts) + list_buttons = post.select_one("ul[class='flat-list buttons']") + if len(list_buttons.scrape("span[class='promoted-span']")) == 0: + title = post.force_select_one("a[class*='title']").get_text() + post_url = list_buttons.scrape_one("a[class^='bylink comments']", "href") + author = post.select_one("a[class*='author']").get_text() + upvote = post.select_one("div[class='score unvoted']").get_text() + published_date = post.scrape_one("time", "datetime") + link = post.scrape_one("a[class*='title']", "href") + data = RedditPost( + title=title, + url=post_url, + author=author, + author_text=None, + upvote=upvote, + published_date=published_date, + link=link + ) - def get_posts(self, url, nb_post): - posts = [] - list_posts_url = self.get_posts_urls(self, url, nb_post) - for url in list_posts_url: - response = reddit_request(url, self.pool_manager) - if response.url == 429: - print(response.headers) - print(response.end_url) - soup = response.soup() - title = soup.force_select_one("a[class^='title']").get_text() - upvote = soup.force_select_one("div[class='score'] span").get_text() - author = soup.scrape_one("a[class^='author']", "href") - published_date = soup.scrape_one("div[class='date'] time", "datetime") - link = soup.scrape_one("a[class^='title']", "href") - if urlpathsplit(link) == urlpathsplit(url): - link = None - author_text = soup.scrape_one( - "div[id='siteTable'] div[class^='usertext-body'] div p" - ) - post = RedditPost( - title=title, - url=url, - author=author, - author_text=author_text, - upvote=upvote, - published_date=published_date, - link=link, - ) - posts.append(post) - return posts \ No newline at end of file + list_posts.append(data) + n_crawled += 1 + old_url = soup.scrape("span[class='next-button'] a", "href")[0] + return list(list_posts) \ No newline at end of file diff --git a/minet/reddit/types.py b/minet/reddit/types.py index b918f72eab..f80064d0b5 100644 --- a/minet/reddit/types.py +++ b/minet/reddit/types.py @@ -11,7 +11,7 @@ class RedditPost(TabularRecord): title: str url: str author: str - author_text: str + author_text: Optional[str] upvote: str published_date: str link: Optional[str] From 8fb9cf0825c84a5acfe9136d8668967306327c6c Mon Sep 17 00:00:00 2001 From: Julien Pontoire Date: Fri, 6 Dec 2024 11:45:03 +0100 Subject: [PATCH 03/47] Updating reddit posts --- minet/cli/reddit/__init__.py | 8 +++---- minet/reddit/scraper.py | 41 ++++++++++++++++++++++++++++++------ minet/reddit/types.py | 1 + 3 files changed, 39 insertions(+), 11 deletions(-) diff --git a/minet/cli/reddit/__init__.py b/minet/cli/reddit/__init__.py index 3301a6ee5e..f7bc7e0d89 100644 --- a/minet/cli/reddit/__init__.py +++ b/minet/cli/reddit/__init__.py @@ -22,16 +22,16 @@ $ minet reddit posts https://www.reddit.com/r/france > r_france_posts.csv """, variadic_input= { - "dummy_column": "post", - 
"item_label": "post url, post shortcode or post id", - "item_label_plural": "post urls, post shortcodes or post ids", + "dummy_column": "subreddit", + "item_label": "subreddit url, subreddit shortcode or subreddit id", + "item_label_plural": "subreddit urls, subreddit shortcodes or subreddits ids", }, arguments=[ { "flags": ["-n", "--number"], "help": "Number of posts to retrieve.", "type": int, - } + }, ], ) diff --git a/minet/reddit/scraper.py b/minet/reddit/scraper.py index df2bd326e0..6c87eb5dda 100644 --- a/minet/reddit/scraper.py +++ b/minet/reddit/scraper.py @@ -1,8 +1,10 @@ from minet.web import request, create_pool_manager from math import ceil -from ural import get_domain_name, urlpathsplit +from ural import get_domain_name, urlpathsplit, is_url from time import sleep from minet.reddit.types import RedditPost +import re + def get_old_url(url): domain = get_domain_name(url) @@ -15,9 +17,23 @@ def get_new_url(url): path = urlpathsplit(url) return f"https://www.{domain}/" + "/".join(path) + "/" + +def get_url_from_subreddit(name: str): + if is_url(name): + return name + name = name.lstrip("/") + if name.startswith("r/"): + return "https://old.reddit.com/" + name + return "https://old.reddit.com/r/" + name + + def reddit_request(url, pool_manager): sleep(1) response = request(url, pool_manager=pool_manager) + soup = response.soup() + if response.status == 404 or soup.scrape("p[id='noresults']"): + print("invalid url!") + return remaining_requests = float(response.headers["x-ratelimit-remaining"]) if remaining_requests == 1: time_remaining = int(response.headers["x-ratelimit-reset"]) @@ -33,10 +49,10 @@ class RedditScraper(object): def __init__(self): self.pool_manager = create_pool_manager() - def get_posts(self, url, nb_post = 25): + def get_posts(self, url, nb_post=25): list_posts = [] nb_pages = ceil(int(nb_post) / 25) - old_url = get_old_url(url) + old_url = get_old_url(get_url_from_subreddit(url)) n_crawled = 0 for _ in range(nb_pages): if n_crawled == int(nb_post): @@ -50,8 +66,18 @@ def get_posts(self, url, nb_post = 25): list_buttons = post.select_one("ul[class='flat-list buttons']") if len(list_buttons.scrape("span[class='promoted-span']")) == 0: title = post.force_select_one("a[class*='title']").get_text() - post_url = list_buttons.scrape_one("a[class^='bylink comments']", "href") - author = post.select_one("a[class*='author']").get_text() + post_url = list_buttons.scrape_one( + "a[class^='bylink comments']", "href" + ) + n_comments = list_buttons.select_one( + "a[class^='bylink comments']").get_text() + match = re.match(r"(\d+)\s+comments", n_comments) + if match: + n_comments = int(match.group(1)) + else: + n_comments = 0 + try_author = post.select_one("a[class*='author']") + author = try_author.get_text() if try_author else "Deleted" upvote = post.select_one("div[class='score unvoted']").get_text() published_date = post.scrape_one("time", "datetime") link = post.scrape_one("a[class*='title']", "href") @@ -62,11 +88,12 @@ def get_posts(self, url, nb_post = 25): author=author, author_text=None, upvote=upvote, + number_comments=n_comments, published_date=published_date, - link=link + link=link, ) list_posts.append(data) n_crawled += 1 old_url = soup.scrape("span[class='next-button'] a", "href")[0] - return list(list_posts) \ No newline at end of file + return list(list_posts) diff --git a/minet/reddit/types.py b/minet/reddit/types.py index f80064d0b5..a7af811463 100644 --- a/minet/reddit/types.py +++ b/minet/reddit/types.py @@ -13,6 +13,7 @@ class RedditPost(TabularRecord): 
author: str author_text: Optional[str] upvote: str + number_comments: int published_date: str link: Optional[str] From a88ac134a97f2e7fb07fd79f0dc934086052120a Mon Sep 17 00:00:00 2001 From: Julien Pontoire Date: Fri, 6 Dec 2024 16:23:17 +0100 Subject: [PATCH 04/47] Adding -t, --text to reddit posts --- minet/cli/reddit/__init__.py | 5 +++++ minet/cli/reddit/posts.py | 14 ++++++++++---- minet/reddit/scraper.py | 16 +++++++++++++--- 3 files changed, 28 insertions(+), 7 deletions(-) diff --git a/minet/cli/reddit/__init__.py b/minet/cli/reddit/__init__.py index f7bc7e0d89..e10db2cd40 100644 --- a/minet/cli/reddit/__init__.py +++ b/minet/cli/reddit/__init__.py @@ -32,6 +32,11 @@ "help": "Number of posts to retrieve.", "type": int, }, + { + "flags": ["-t", "--text"], + "help": "Retrieve the text of the post. Note that it will require one request per post.", + "action": "store_true", + } ], ) diff --git a/minet/cli/reddit/posts.py b/minet/cli/reddit/posts.py index 6344cda616..4d54689ca8 100644 --- a/minet/cli/reddit/posts.py +++ b/minet/cli/reddit/posts.py @@ -15,7 +15,7 @@ title="Scraping posts", unit="groups", nested=True, - sub_unit="posts", + sub_unit="subreddits", ) def action(cli_args, enricher, loading_bar): scraper = RedditScraper() @@ -26,12 +26,18 @@ def action(cli_args, enricher, loading_bar): with loading_bar.step(url): try: if cli_args.number: - posts = scraper.get_posts(url, cli_args.number) + if cli_args.text: + posts = scraper.get_posts(url, True, cli_args.number) + else: + posts = scraper.get_posts(url, False, cli_args.number) else: - posts = scraper.get_posts(url) + if cli_args.text: + posts = scraper.get_posts(url, True) + else: + posts = scraper.get_posts(url, False) except : loading_bar.print( - "problème" + "the script could not complete normally on line %i" % (i) ) continue diff --git a/minet/reddit/scraper.py b/minet/reddit/scraper.py index 6c87eb5dda..bd89234bcf 100644 --- a/minet/reddit/scraper.py +++ b/minet/reddit/scraper.py @@ -31,7 +31,7 @@ def reddit_request(url, pool_manager): sleep(1) response = request(url, pool_manager=pool_manager) soup = response.soup() - if response.status == 404 or soup.scrape("p[id='noresults']"): + if response.status == 404 or (soup.scrape("p[id='noresults']") and not soup.scrape("div[class='commentarea']")): print("invalid url!") return remaining_requests = float(response.headers["x-ratelimit-remaining"]) @@ -49,7 +49,7 @@ class RedditScraper(object): def __init__(self): self.pool_manager = create_pool_manager() - def get_posts(self, url, nb_post=25): + def get_posts(self, url: str, add_text: bool, nb_post=25): list_posts = [] nb_pages = ceil(int(nb_post) / 25) old_url = get_old_url(get_url_from_subreddit(url)) @@ -81,12 +81,22 @@ def get_posts(self, url, nb_post=25): upvote = post.select_one("div[class='score unvoted']").get_text() published_date = post.scrape_one("time", "datetime") link = post.scrape_one("a[class*='title']", "href") + if add_text: + text_response = reddit_request(post_url, self.pool_manager) + text_soup = text_response.soup() + try_content = text_soup.select_one("div[id='siteTable'] div[class^='usertext']") + if try_content: + content = try_content.get_text() + else: + content = "" + else: + content = "" data = RedditPost( title=title, url=post_url, author=author, - author_text=None, + author_text=content, upvote=upvote, number_comments=n_comments, published_date=published_date, From 2735a0a68d5a700a20fd4332dd1d947e7e7610f8 Mon Sep 17 00:00:00 2001 From: Julien Pontoire Date: Fri, 6 Dec 2024 16:58:58 +0100 Subject: 
[PATCH 05/47] Fix tests --- minet/cli/reddit/__init__.py | 10 +++++----- minet/cli/reddit/posts.py | 7 +++---- minet/reddit/exceptions.py | 17 +++++++++++++++++ minet/reddit/scraper.py | 15 ++++++++++----- minet/reddit/types.py | 4 ---- 5 files changed, 35 insertions(+), 18 deletions(-) create mode 100644 minet/reddit/exceptions.py diff --git a/minet/cli/reddit/__init__.py b/minet/cli/reddit/__init__.py index e10db2cd40..d26cb97c25 100644 --- a/minet/cli/reddit/__init__.py +++ b/minet/cli/reddit/__init__.py @@ -4,9 +4,8 @@ # # Logic of the `rd` action. # -from casanova import RowCountResumer -from minet.cli.argparse import command, ConfigAction +from minet.cli.argparse import command REDDIT_POSTS_SUBCOMMAND = command( "posts", @@ -21,7 +20,7 @@ . Searching posts from the subreddit r/france: $ minet reddit posts https://www.reddit.com/r/france > r_france_posts.csv """, - variadic_input= { + variadic_input={ "dummy_column": "subreddit", "item_label": "subreddit url, subreddit shortcode or subreddit id", "item_label_plural": "subreddit urls, subreddit shortcodes or subreddits ids", @@ -36,10 +35,11 @@ "flags": ["-t", "--text"], "help": "Retrieve the text of the post. Note that it will require one request per post.", "action": "store_true", - } + }, ], ) + REDDIT_COMMAND = command( "reddit", "minet.cli.reddit", @@ -51,4 +51,4 @@ subcommands=[ REDDIT_POSTS_SUBCOMMAND, ], -) \ No newline at end of file +) diff --git a/minet/cli/reddit/posts.py b/minet/cli/reddit/posts.py index 4d54689ca8..b39bd8d9c9 100644 --- a/minet/cli/reddit/posts.py +++ b/minet/cli/reddit/posts.py @@ -9,7 +9,6 @@ from minet.reddit.types import RedditPost - @with_enricher_and_loading_bar( headers=RedditPost, title="Scraping posts", @@ -35,16 +34,16 @@ def action(cli_args, enricher, loading_bar): posts = scraper.get_posts(url, True) else: posts = scraper.get_posts(url, False) - except : + except: loading_bar.print( "the script could not complete normally on line %i" % (i) ) continue - + list_posts = [] for post in posts: list_posts.append(post) - + for post in list_posts: loading_bar.nested_advance() enricher.writerow(row, post) diff --git a/minet/reddit/exceptions.py b/minet/reddit/exceptions.py new file mode 100644 index 0000000000..5b4ffc3aeb --- /dev/null +++ b/minet/reddit/exceptions.py @@ -0,0 +1,17 @@ +# ============================================================================= +# Minet Reddit Exceptions +# ============================================================================= +# +from minet.exceptions import MinetError + + +class RedditError(MinetError): + pass + + +class RedditInvalidTargetError(RedditError): + pass + + +class RedditNotPostError(RedditError): + pass diff --git a/minet/reddit/scraper.py b/minet/reddit/scraper.py index bd89234bcf..6dc7274360 100644 --- a/minet/reddit/scraper.py +++ b/minet/reddit/scraper.py @@ -3,6 +3,7 @@ from ural import get_domain_name, urlpathsplit, is_url from time import sleep from minet.reddit.types import RedditPost +from minet.reddit.exceptions import RedditInvalidTargetError import re @@ -31,9 +32,10 @@ def reddit_request(url, pool_manager): sleep(1) response = request(url, pool_manager=pool_manager) soup = response.soup() - if response.status == 404 or (soup.scrape("p[id='noresults']") and not soup.scrape("div[class='commentarea']")): - print("invalid url!") - return + if response.status == 404 or ( + soup.scrape("p[id='noresults']") and not soup.scrape("div[class='commentarea']") + ): + raise RedditInvalidTargetError remaining_requests = 
float(response.headers["x-ratelimit-remaining"]) if remaining_requests == 1: time_remaining = int(response.headers["x-ratelimit-reset"]) @@ -70,7 +72,8 @@ def get_posts(self, url: str, add_text: bool, nb_post=25): "a[class^='bylink comments']", "href" ) n_comments = list_buttons.select_one( - "a[class^='bylink comments']").get_text() + "a[class^='bylink comments']" + ).get_text() match = re.match(r"(\d+)\s+comments", n_comments) if match: n_comments = int(match.group(1)) @@ -84,7 +87,9 @@ def get_posts(self, url: str, add_text: bool, nb_post=25): if add_text: text_response = reddit_request(post_url, self.pool_manager) text_soup = text_response.soup() - try_content = text_soup.select_one("div[id='siteTable'] div[class^='usertext']") + try_content = text_soup.select_one( + "div[id='siteTable'] div[class^='usertext']" + ) if try_content: content = try_content.get_text() else: diff --git a/minet/reddit/types.py b/minet/reddit/types.py index a7af811463..0be774dd5e 100644 --- a/minet/reddit/types.py +++ b/minet/reddit/types.py @@ -27,7 +27,3 @@ class RedditComment(TabularRecord): # points: Optional[str] # published_date: str comment: str - - - - From 9434b192f63afee217b243b1746b81ded646aba8 Mon Sep 17 00:00:00 2001 From: Julien Pontoire Date: Fri, 6 Dec 2024 17:01:00 +0100 Subject: [PATCH 06/47] fix tests --- minet/reddit/types.py | 15 +-------------- 1 file changed, 1 insertion(+), 14 deletions(-) diff --git a/minet/reddit/types.py b/minet/reddit/types.py index 0be774dd5e..6b52cbfef2 100644 --- a/minet/reddit/types.py +++ b/minet/reddit/types.py @@ -1,9 +1,7 @@ -from typing import List, Optional, Dict, Tuple, Iterable -from datetime import datetime +from typing import Optional from dataclasses import dataclass from casanova import TabularRecord -from ebbe import getpath @dataclass @@ -16,14 +14,3 @@ class RedditPost(TabularRecord): number_comments: int published_date: str link: Optional[str] - - -@dataclass -class RedditComment(TabularRecord): - # url: str - # author: str - id: str - parent: str - # points: Optional[str] - # published_date: str - comment: str From 1c93157ffb7908eb49b910749b4b16b1997ef75d Mon Sep 17 00:00:00 2001 From: Julien Pontoire Date: Fri, 6 Dec 2024 17:03:40 +0100 Subject: [PATCH 07/47] fix tests --- minet/cli/reddit/posts.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/minet/cli/reddit/posts.py b/minet/cli/reddit/posts.py index b39bd8d9c9..0530ac05c7 100644 --- a/minet/cli/reddit/posts.py +++ b/minet/cli/reddit/posts.py @@ -7,6 +7,7 @@ from minet.cli.utils import with_enricher_and_loading_bar from minet.reddit.scraper import RedditScraper from minet.reddit.types import RedditPost +from minet.reddit.exceptions import RedditInvalidTargetError @with_enricher_and_loading_bar( @@ -34,7 +35,7 @@ def action(cli_args, enricher, loading_bar): posts = scraper.get_posts(url, True) else: posts = scraper.get_posts(url, False) - except: + except RedditInvalidTargetError: loading_bar.print( "the script could not complete normally on line %i" % (i) ) From ef116eb852f66cc46b2297efacb7cb5832777fef Mon Sep 17 00:00:00 2001 From: Julien Pontoire Date: Mon, 9 Dec 2024 16:50:02 +0100 Subject: [PATCH 08/47] First version of reddit comments --- minet/cli/reddit/__init__.py | 29 ++++++++++ minet/cli/reddit/comments.py | 41 ++++++++++++++ minet/cli/reddit/posts.py | 2 +- minet/reddit/scraper.py | 106 ++++++++++++++++++++++++++++++++++- minet/reddit/types.py | 11 ++++ 5 files changed, 186 insertions(+), 3 deletions(-) create mode 100644 minet/cli/reddit/comments.py diff 
--git a/minet/cli/reddit/__init__.py b/minet/cli/reddit/__init__.py index d26cb97c25..17b2d1f018 100644 --- a/minet/cli/reddit/__init__.py +++ b/minet/cli/reddit/__init__.py @@ -39,6 +39,34 @@ ], ) +REDDIT_COMMENTS_SUBCOMMAND = command( + "comments", + "minet.cli.reddit.comments", + title="Minet Reddit Comments Command", + description=""" + Retrieve comments from a reddit post link. + Note that it will only retrieve the comments displayed on the page. If you want all the comments you need to use -A, --all but it will require a request per comment, and you can only make 100 requests per 10 minutes. + """, + epilog=""" + Example: + + . Searching comments from a reddit post: + $ minet reddit comments https://www.reddit.com/r/france/comments/... > r_france_comments.csv + """, + variadic_input={ + "dummy_column": "post", + "item_label": "post url, post shortcode or post id", + "item_label_plural": "posts urls, posts shortcodes or posts ids", + }, + arguments=[ + { + "flags": ["-A", "--all"], + "help": "Retrieve all comments.", + "action": "store_true", + }, + ], +) + REDDIT_COMMAND = command( "reddit", @@ -50,5 +78,6 @@ """, subcommands=[ REDDIT_POSTS_SUBCOMMAND, + REDDIT_COMMENTS_SUBCOMMAND, ], ) diff --git a/minet/cli/reddit/comments.py b/minet/cli/reddit/comments.py new file mode 100644 index 0000000000..d41c7b9699 --- /dev/null +++ b/minet/cli/reddit/comments.py @@ -0,0 +1,41 @@ +# ============================================================================= +# Minet Reddit Comments CLI Action +# ============================================================================= +# +# Logic of the `rd comments` action. +# +from minet.cli.utils import with_enricher_and_loading_bar +from minet.reddit.scraper import RedditScraper +from minet.reddit.types import RedditComment +from minet.reddit.exceptions import RedditInvalidTargetError + + +@with_enricher_and_loading_bar( + headers=RedditComment, + title="Scraping comments", + unit="groups", + nested=True, + sub_unit="comments", +) +def action(cli_args, enricher, loading_bar): + scraper = RedditScraper() + + for i, row, url in enricher.enumerate_cells( + cli_args.column, with_rows=True, start=1 + ): + with loading_bar.step(url): + try: + if cli_args.all: + comments = scraper.get_comments(url, True) + else: + comments = scraper.get_comments(url, False) + + except RedditInvalidTargetError: + loading_bar.print( + "the script could not complete normally on line %i" % (i) + ) + continue + + for comment in comments: + loading_bar.nested_advance() + enricher.writerow(row, comment) diff --git a/minet/cli/reddit/posts.py b/minet/cli/reddit/posts.py index 0530ac05c7..777f88813d 100644 --- a/minet/cli/reddit/posts.py +++ b/minet/cli/reddit/posts.py @@ -15,7 +15,7 @@ title="Scraping posts", unit="groups", nested=True, - sub_unit="subreddits", + sub_unit="posts", ) def action(cli_args, enricher, loading_bar): scraper = RedditScraper() diff --git a/minet/reddit/scraper.py b/minet/reddit/scraper.py index 6dc7274360..5d0806b33d 100644 --- a/minet/reddit/scraper.py +++ b/minet/reddit/scraper.py @@ -2,9 +2,14 @@ from math import ceil from ural import get_domain_name, urlpathsplit, is_url from time import sleep -from minet.reddit.types import RedditPost +from minet.reddit.types import RedditPost, RedditComment from minet.reddit.exceptions import RedditInvalidTargetError import re +from urllib.parse import urljoin + + +def resolve_relative_url(path): + return urljoin("https://old.reddit.com", path) def get_old_url(url): @@ -47,10 +52,47 @@ def reddit_request(url, 
pool_manager): return response +def extract_t1_ids(text): + pattern = r"t1_(\w+)" + return [match.group(1) for match in re.finditer(pattern, text)] + + +def get_current_id(com): + current_id = com.get("id") + if current_id: + current_id = current_id.split("_")[-1] + else: + current_id = com.get("data-permalink").split("/")[-2] + return current_id + + class RedditScraper(object): def __init__(self): self.pool_manager = create_pool_manager() + def get_childs_l500(self, url, list_comments, parent_id): + response = reddit_request(url, self.pool_manager) + soup = response.soup() + comments = soup.select("div[class='commentarea']>div>div[class*='comment']") + for com in comments: + child = com.find("div", class_="child") + if child.text != "": + child = child.find("div") + child_com = child.find_all( + "div", + class_=lambda x: x + and ( + "comment" in x + or "deleted comment" in x + or "morerecursion" in x + or "morechildren" in x + ), + recursive=False, + ) + for ele in child_com: + list_comments.append((parent_id, ele)) + return list_comments + def get_posts(self, url: str, add_text: bool, nb_post=25): list_posts = [] nb_pages = ceil(int(nb_post) / 25) @@ -82,6 +124,8 @@ def get_posts(self, url: str, add_text: bool, nb_post=25): try_author = post.select_one("a[class*='author']") author = try_author.get_text() if try_author else "Deleted" upvote = post.select_one("div[class='score unvoted']").get_text() + if upvote == '•': + upvote = "" published_date = post.scrape_one("time", "datetime") link = post.scrape_one("a[class*='title']", "href") if add_text: @@ -105,10 +149,68 @@ def get_posts(self, url: str, add_text: bool, nb_post=25): upvote=upvote, number_comments=n_comments, published_date=published_date, - link=link, + link=resolve_relative_url(link), ) list_posts.append(data) n_crawled += 1 old_url = soup.scrape("span[class='next-button'] a", "href")[0] return list(list_posts) + + + def get_comments(self, url: str, all): + list_return = [] + m_comments = [] + old_url = get_old_url(url) + url_limit = old_url + "?limit=500" + response = reddit_request(url_limit, self.pool_manager) + soup = response.soup() + first_comments = soup.select("div[class='commentarea']>div>div[class*='comment']") + for ele in first_comments: + m_comments.append((None, ele)) + while m_comments: + parent, com = m_comments.pop() + current_id = get_current_id(com) + if "morerecursion" in com.get("class") and all: + url_rec = f"https://old.reddit.com{com.scrape_one('a', 'href')}" + m_comments = self.get_childs_l500(url_rec, m_comments, parent) + elif "morechildren" in com.get("class") and all: + a = com.select_one("a") + onclick = a["onclick"] + id_list = extract_t1_ids(onclick) + for id in id_list: + comment_url = f"{old_url}{id}" + m_comments = self.get_childs_l500(comment_url, m_comments, current_id) + else: + child = com.find("div", class_="child") + if child.text != "": + child = child.find("div") + if all: + child_com = child.find_all( + "div", + class_=lambda x: x + and ( + "comment" in x + or "deleted comment" in x + or "morerecursion" in x + or "morechildren" in x + ), + recursive=False, + ) + else: + child_com = child.find_all( + "div", + class_=lambda x: x + and ("comment" in x or "deleted comment" in x), + recursive=False, + ) + for ele in child_com: + m_comments.append((current_id, ele)) + data = RedditComment( + id=current_id, + parent=parent, + comment=com.scrape_one("div[class='md']:not(div.child a)"), + ) + if data.id != "": + list_return.append(data) + return list_return diff --git a/minet/reddit/types.py 
b/minet/reddit/types.py index 6b52cbfef2..3a63113066 100644 --- a/minet/reddit/types.py +++ b/minet/reddit/types.py @@ -14,3 +14,14 @@ class RedditPost(TabularRecord): number_comments: int published_date: str link: Optional[str] + + +@dataclass +class RedditComment(TabularRecord): + # url: str + # author: str + id: str + parent: str + # points: Optional[str] + # published_date: str + comment: str From 3ab4b427917c2ee7e42c170607ba8e26fadab0e2 Mon Sep 17 00:00:00 2001 From: Julien Pontoire Date: Mon, 9 Dec 2024 18:05:59 +0100 Subject: [PATCH 09/47] Update reddit comments --- minet/reddit/scraper.py | 11 +++++++++++ minet/reddit/types.py | 10 +++++----- 2 files changed, 16 insertions(+), 5 deletions(-) diff --git a/minet/reddit/scraper.py b/minet/reddit/scraper.py index 5d0806b33d..b3e2a78397 100644 --- a/minet/reddit/scraper.py +++ b/minet/reddit/scraper.py @@ -171,6 +171,13 @@ def get_comments(self, url: str, all): while m_comments: parent, com = m_comments.pop() current_id = get_current_id(com) + comment_url = com.scrape_one("a[class='bylink']", 'href') + try_author = com.scrape_one("a[class^='author']", 'href') + author = try_author.get_text() if try_author else "Deleted" + com_points = com.scrape_one("span[class='score unvoted']") + match = re.search(r"-?\d+\s+point(?:s)?", com_points) + com_points = int(re.search(r"-?\d+", match.group()).group()) + published_date = com.scrape_one("time", "datetime") if "morerecursion" in com.get("class") and all: url_rec = f"https://old.reddit.com{com.scrape_one('a', 'href')}" m_comments = self.get_childs_l500(url_rec, m_comments, parent) @@ -207,8 +214,12 @@ def get_comments(self, url: str, all): for ele in child_com: m_comments.append((current_id, ele)) data = RedditComment( + comment_url=comment_url, + author=author, id=current_id, parent=parent, + points=com_points, + published_date=published_date, comment=com.scrape_one("div[class='md']:not(div.child a)"), ) if data.id != "": diff --git a/minet/reddit/types.py b/minet/reddit/types.py index 3a63113066..3d2716f194 100644 --- a/minet/reddit/types.py +++ b/minet/reddit/types.py @@ -10,7 +10,7 @@ class RedditPost(TabularRecord): url: str author: str author_text: Optional[str] - upvote: str + upvote: int number_comments: int published_date: str link: Optional[str] @@ -18,10 +18,10 @@ class RedditPost(TabularRecord): @dataclass class RedditComment(TabularRecord): - # url: str - # author: str + comment_url: str + author: str id: str parent: str - # points: Optional[str] - # published_date: str + points: int + published_date: str comment: str From bc901cb1a0b65278dc1d6578f53f0c664f202ccb Mon Sep 17 00:00:00 2001 From: Julien Pontoire Date: Fri, 20 Dec 2024 13:52:36 +0100 Subject: [PATCH 10/47] Optimization with yield --- minet/cli/reddit/posts.py | 4 ---- minet/reddit/scraper.py | 11 +++-------- 2 files changed, 3 insertions(+), 12 deletions(-) diff --git a/minet/cli/reddit/posts.py b/minet/cli/reddit/posts.py index 777f88813d..68dd9d68cb 100644 --- a/minet/cli/reddit/posts.py +++ b/minet/cli/reddit/posts.py @@ -41,10 +41,6 @@ def action(cli_args, enricher, loading_bar): ) continue - list_posts = [] for post in posts: - list_posts.append(post) - - for post in list_posts: loading_bar.nested_advance() enricher.writerow(row, post) diff --git a/minet/reddit/scraper.py b/minet/reddit/scraper.py index b3e2a78397..04100ef091 100644 --- a/minet/reddit/scraper.py +++ b/minet/reddit/scraper.py @@ -94,7 +94,6 @@ def get_childs_l500(self, url, list_comments, parent_id): return list_comments def get_posts(self, url: 
str, add_text: bool, nb_post=25): - list_posts = [] nb_pages = ceil(int(nb_post) / 25) old_url = get_old_url(get_url_from_subreddit(url)) n_crawled = 0 @@ -151,15 +150,12 @@ def get_posts(self, url: str, add_text: bool, nb_post=25): published_date=published_date, link=resolve_relative_url(link), ) - - list_posts.append(data) + yield data n_crawled += 1 old_url = soup.scrape("span[class='next-button'] a", "href")[0] - return list(list_posts) def get_comments(self, url: str, all): - list_return = [] m_comments = [] old_url = get_old_url(url) url_limit = old_url + "?limit=500" @@ -173,7 +169,7 @@ def get_comments(self, url: str, all): current_id = get_current_id(com) comment_url = com.scrape_one("a[class='bylink']", 'href') try_author = com.scrape_one("a[class^='author']", 'href') - author = try_author.get_text() if try_author else "Deleted" + author = try_author if try_author else "Deleted" com_points = com.scrape_one("span[class='score unvoted']") match = re.search(r"-?\d+\s+point(?:s)?", com_points) com_points = int(re.search(r"-?\d+", match.group()).group()) @@ -223,5 +219,4 @@ def get_comments(self, url: str, all): comment=com.scrape_one("div[class='md']:not(div.child a)"), ) if data.id != "": - list_return.append(data) - return list_return + yield data From e40d672f7e08e622ac257ca905071db28f1aa218 Mon Sep 17 00:00:00 2001 From: Julien Pontoire Date: Fri, 20 Dec 2024 14:40:15 +0100 Subject: [PATCH 11/47] Adding user_posts function --- minet/cli/reddit/__init__.py | 28 ++++++++++++++++++ minet/cli/reddit/user_posts.py | 46 +++++++++++++++++++++++++++++ minet/reddit/scraper.py | 54 ++++++++++++++++++++++++++++++++-- minet/reddit/types.py | 13 +++++++- 4 files changed, 138 insertions(+), 3 deletions(-) create mode 100644 minet/cli/reddit/user_posts.py diff --git a/minet/cli/reddit/__init__.py b/minet/cli/reddit/__init__.py index 17b2d1f018..86acf6e59b 100644 --- a/minet/cli/reddit/__init__.py +++ b/minet/cli/reddit/__init__.py @@ -67,6 +67,33 @@ ], ) +REDDIT_USER_POSTS_SUBCOMMAND = command( + "user_posts", + "minet.cli.reddit.user_posts", + title="Minet Reddit User Posts Command", + description=""" + Retrieve reddit posts from a user link. + """, + epilog=""" + Example: + + . Searching posts from the user page of u/random_user: + $ minet reddit posts https://www.reddit.com/user/random_user/submitted/ > random_user_posts.csv + """, + variadic_input={ + "dummy_column": "user", + "item_label": "user url, user shortcode or user id", + "item_label_plural": "user urls, user shortcodes or user ids", + }, + arguments=[ + { + "flags": ["-n", "--number"], + "help": "Number of posts to retrieve.", + "type": int, + }, + ], +) + REDDIT_COMMAND = command( "reddit", @@ -79,5 +106,6 @@ subcommands=[ REDDIT_POSTS_SUBCOMMAND, REDDIT_COMMENTS_SUBCOMMAND, + REDDIT_USER_POSTS_SUBCOMMAND ], ) diff --git a/minet/cli/reddit/user_posts.py b/minet/cli/reddit/user_posts.py new file mode 100644 index 0000000000..ab37660165 --- /dev/null +++ b/minet/cli/reddit/user_posts.py @@ -0,0 +1,46 @@ +# ============================================================================= +# Minet Reddit Posts CLI Action +# ============================================================================= +# +# Logic of the `rd user_posts` action. 
+# +from minet.cli.utils import with_enricher_and_loading_bar +from minet.reddit.scraper import RedditScraper +from minet.reddit.types import RedditUserPost +from minet.reddit.exceptions import RedditInvalidTargetError + + +@with_enricher_and_loading_bar( + headers=RedditUserPost, + title="Scraping user posts", + unit="groups", + nested=True, + sub_unit="user", +) +def action(cli_args, enricher, loading_bar): + scraper = RedditScraper() + + for i, row, url in enricher.enumerate_cells( + cli_args.column, with_rows=True, start=1 + ): + with loading_bar.step(url): + try: + if cli_args.number: + if cli_args.text: + posts = scraper.get_user_posts(url, cli_args.number) + else: + posts = scraper.get_user_posts(url, cli_args.number) + else: + if cli_args.text: + posts = scraper.get_user_posts(url) + else: + posts = scraper.get_user_posts(url) + except RedditInvalidTargetError: + loading_bar.print( + "the script could not complete normally on line %i" % (i) + ) + continue + + for post in posts: + loading_bar.nested_advance() + enricher.writerow(row, post) diff --git a/minet/reddit/scraper.py b/minet/reddit/scraper.py index 04100ef091..f65d39f152 100644 --- a/minet/reddit/scraper.py +++ b/minet/reddit/scraper.py @@ -2,7 +2,7 @@ from math import ceil from ural import get_domain_name, urlpathsplit, is_url from time import sleep -from minet.reddit.types import RedditPost, RedditComment +from minet.reddit.types import RedditPost, RedditComment, RedditUserPost from minet.reddit.exceptions import RedditInvalidTargetError import re from urllib.parse import urljoin @@ -145,7 +145,7 @@ def get_posts(self, url: str, add_text: bool, nb_post=25): url=post_url, author=author, author_text=content, - upvote=upvote, + points=upvote, number_comments=n_comments, published_date=published_date, link=resolve_relative_url(link), @@ -220,3 +220,53 @@ def get_comments(self, url: str, all): ) if data.id != "": yield data + + def get_user_posts(self, url: str, nb = 25): + nb_pages = ceil(int(nb) / 25) + n_crawled = 0 + old_url = get_old_url(url) + for _ in range(nb_pages): + if n_crawled == int(nb): + break + response = reddit_request(old_url, self.pool_manager) + soup = response.soup() + posts = soup.select("div[id^='thing_t3_']") + for post in posts: + sub = post.scrape_one("a[class*='subreddit']", "href") + title = post.scrape_one("a[class^='title']") + points = post.scrape_one("div[class='score unvoted']") + post_url = post.scrape_one("a[class^='bylink comment']", "href") + nb_comments = post.scrape_one("a[class^='bylink comment']") + match = re.match(r"(\d+)\s+comments", nb_comments) + if match: + nb_comments = int(match.group(1)) + else: + nb_comments = 0 + link = post.scrape_one("a[class^='title']", "href") + published_date = post.scrape("time", "datetime") + + data = RedditUserPost( + title=title, + url=post_url, + points=points, + number_comments=nb_comments, + published_date=published_date, + link=link, + subreddit=sub + ) + + yield data + n_crawled += 1 + old_url = soup.scrape("span[class='next-button'] a", "href")[0] + + + + + + + + + + + def get_user_comments(self, url: str, nb = 25): + old_url = get_old_url(url) \ No newline at end of file diff --git a/minet/reddit/types.py b/minet/reddit/types.py index 3d2716f194..e27e5b736e 100644 --- a/minet/reddit/types.py +++ b/minet/reddit/types.py @@ -10,7 +10,7 @@ class RedditPost(TabularRecord): url: str author: str author_text: Optional[str] - upvote: int + points: int number_comments: int published_date: str link: Optional[str] @@ -25,3 +25,14 @@ class 
RedditComment(TabularRecord): points: int published_date: str comment: str + + +@dataclass +class RedditUserPost(TabularRecord): + title: str + url: str + points: int + number_comments: int + published_date: str + link: str + subreddit: str \ No newline at end of file From f53c45130c47c3912e37de9a24c55d8494c0f1f8 Mon Sep 17 00:00:00 2001 From: Julien Pontoire Date: Fri, 20 Dec 2024 14:55:54 +0100 Subject: [PATCH 12/47] Fix user_posts --- minet/cli/reddit/user_posts.py | 10 ++-------- minet/reddit/scraper.py | 4 +++- 2 files changed, 5 insertions(+), 9 deletions(-) diff --git a/minet/cli/reddit/user_posts.py b/minet/cli/reddit/user_posts.py index ab37660165..d57d9fd94d 100644 --- a/minet/cli/reddit/user_posts.py +++ b/minet/cli/reddit/user_posts.py @@ -26,15 +26,9 @@ def action(cli_args, enricher, loading_bar): with loading_bar.step(url): try: if cli_args.number: - if cli_args.text: - posts = scraper.get_user_posts(url, cli_args.number) - else: - posts = scraper.get_user_posts(url, cli_args.number) + posts = scraper.get_user_posts(url, cli_args.number) else: - if cli_args.text: - posts = scraper.get_user_posts(url) - else: - posts = scraper.get_user_posts(url) + posts = scraper.get_user_posts(url) except RedditInvalidTargetError: loading_bar.print( "the script could not complete normally on line %i" % (i) diff --git a/minet/reddit/scraper.py b/minet/reddit/scraper.py index f65d39f152..35a1bf5497 100644 --- a/minet/reddit/scraper.py +++ b/minet/reddit/scraper.py @@ -242,7 +242,9 @@ def get_user_posts(self, url: str, nb = 25): nb_comments = int(match.group(1)) else: nb_comments = 0 - link = post.scrape_one("a[class^='title']", "href") + link = resolve_relative_url(post.scrape_one("a[class^='title']", "href")) + if link == post_url: + link = "" published_date = post.scrape("time", "datetime") data = RedditUserPost( From b932a8dc84eb7a4478b0d6bf70d8ea9d17936872 Mon Sep 17 00:00:00 2001 From: Julien Pontoire Date: Fri, 20 Dec 2024 15:04:47 +0100 Subject: [PATCH 13/47] Fixing errors with user_posts --- minet/cli/reddit/user_posts.py | 2 +- minet/reddit/scraper.py | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/minet/cli/reddit/user_posts.py b/minet/cli/reddit/user_posts.py index d57d9fd94d..a095d60c74 100644 --- a/minet/cli/reddit/user_posts.py +++ b/minet/cli/reddit/user_posts.py @@ -15,7 +15,7 @@ title="Scraping user posts", unit="groups", nested=True, - sub_unit="user", + sub_unit="posts", ) def action(cli_args, enricher, loading_bar): scraper = RedditScraper() diff --git a/minet/reddit/scraper.py b/minet/reddit/scraper.py index 35a1bf5497..a35b551367 100644 --- a/minet/reddit/scraper.py +++ b/minet/reddit/scraper.py @@ -232,6 +232,8 @@ def get_user_posts(self, url: str, nb = 25): soup = response.soup() posts = soup.select("div[id^='thing_t3_']") for post in posts: + if n_crawled == int(nb): + break sub = post.scrape_one("a[class*='subreddit']", "href") title = post.scrape_one("a[class^='title']") points = post.scrape_one("div[class='score unvoted']") From 26be5f30e6c8a86a309ffe69702f1389e12bb616 Mon Sep 17 00:00:00 2001 From: Julien Pontoire Date: Fri, 20 Dec 2024 15:18:41 +0100 Subject: [PATCH 14/47] Fixing format --- minet/cli/reddit/__init__.py | 7 ++++- minet/cli/reddit/user_posts.py | 10 +++++-- minet/reddit/scraper.py | 51 +++++++++++++++++++++------------- minet/reddit/types.py | 3 +- 4 files changed, 47 insertions(+), 24 deletions(-) diff --git a/minet/cli/reddit/__init__.py b/minet/cli/reddit/__init__.py index 86acf6e59b..0c0e58383b 100644 --- 
a/minet/cli/reddit/__init__.py +++ b/minet/cli/reddit/__init__.py @@ -91,6 +91,11 @@ "help": "Number of posts to retrieve.", "type": int, }, + { + "flags": ["-t", "--text"], + "help": "Retrieve the text of the post. Note that it will require one request per post.", + "action": "store_true", + }, ], ) @@ -106,6 +111,6 @@ subcommands=[ REDDIT_POSTS_SUBCOMMAND, REDDIT_COMMENTS_SUBCOMMAND, - REDDIT_USER_POSTS_SUBCOMMAND + REDDIT_USER_POSTS_SUBCOMMAND, ], ) diff --git a/minet/cli/reddit/user_posts.py b/minet/cli/reddit/user_posts.py index a095d60c74..6ca7008b3a 100644 --- a/minet/cli/reddit/user_posts.py +++ b/minet/cli/reddit/user_posts.py @@ -26,9 +26,15 @@ def action(cli_args, enricher, loading_bar): with loading_bar.step(url): try: if cli_args.number: - posts = scraper.get_user_posts(url, cli_args.number) + if cli_args.text: + posts = scraper.get_user_posts(url, True, cli_args.number) + else: + posts = scraper.get_user_posts(url, False, cli_args.number) else: - posts = scraper.get_user_posts(url) + if cli_args.text: + posts = scraper.get_user_posts(url, True) + else: + posts = scraper.get_user_posts(url, False) except RedditInvalidTargetError: loading_bar.print( "the script could not complete normally on line %i" % (i) diff --git a/minet/reddit/scraper.py b/minet/reddit/scraper.py index a35b551367..33e80c7bf6 100644 --- a/minet/reddit/scraper.py +++ b/minet/reddit/scraper.py @@ -123,7 +123,7 @@ def get_posts(self, url: str, add_text: bool, nb_post=25): try_author = post.select_one("a[class*='author']") author = try_author.get_text() if try_author else "Deleted" upvote = post.select_one("div[class='score unvoted']").get_text() - if upvote == '•': + if upvote == "•": upvote = "" published_date = post.scrape_one("time", "datetime") link = post.scrape_one("a[class*='title']", "href") @@ -153,7 +153,6 @@ def get_posts(self, url: str, add_text: bool, nb_post=25): yield data n_crawled += 1 old_url = soup.scrape("span[class='next-button'] a", "href")[0] - def get_comments(self, url: str, all): m_comments = [] @@ -161,14 +160,16 @@ def get_comments(self, url: str, all): url_limit = old_url + "?limit=500" response = reddit_request(url_limit, self.pool_manager) soup = response.soup() - first_comments = soup.select("div[class='commentarea']>div>div[class*='comment']") + first_comments = soup.select( + "div[class='commentarea']>div>div[class*='comment']" + ) for ele in first_comments: m_comments.append((None, ele)) while m_comments: parent, com = m_comments.pop() current_id = get_current_id(com) - comment_url = com.scrape_one("a[class='bylink']", 'href') - try_author = com.scrape_one("a[class^='author']", 'href') + comment_url = com.scrape_one("a[class='bylink']", "href") + try_author = com.scrape_one("a[class^='author']", "href") author = try_author if try_author else "Deleted" com_points = com.scrape_one("span[class='score unvoted']") match = re.search(r"-?\d+\s+point(?:s)?", com_points) @@ -183,7 +184,9 @@ def get_comments(self, url: str, all): id_list = extract_t1_ids(onclick) for id in id_list: comment_url = f"{old_url}{id}" - m_comments = self.get_childs_l500(comment_url, m_comments, current_id) + m_comments = self.get_childs_l500( + comment_url, m_comments, current_id + ) else: child = com.find("div", class_="child") if child.text != "": @@ -221,7 +224,7 @@ def get_comments(self, url: str, all): if data.id != "": yield data - def get_user_posts(self, url: str, nb = 25): + def get_user_posts(self, url: str, add_text: bool, nb=25): nb_pages = ceil(int(nb) / 25) n_crawled = 0 old_url = 
get_old_url(url) @@ -237,6 +240,8 @@ def get_user_posts(self, url: str, nb = 25): sub = post.scrape_one("a[class*='subreddit']", "href") title = post.scrape_one("a[class^='title']") points = post.scrape_one("div[class='score unvoted']") + if points == "•": + points = "" post_url = post.scrape_one("a[class^='bylink comment']", "href") nb_comments = post.scrape_one("a[class^='bylink comment']") match = re.match(r"(\d+)\s+comments", nb_comments) @@ -244,33 +249,39 @@ def get_user_posts(self, url: str, nb = 25): nb_comments = int(match.group(1)) else: nb_comments = 0 - link = resolve_relative_url(post.scrape_one("a[class^='title']", "href")) + link = resolve_relative_url( + post.scrape_one("a[class^='title']", "href") + ) if link == post_url: link = "" published_date = post.scrape("time", "datetime") + if add_text: + text_response = reddit_request(post_url, self.pool_manager) + text_soup = text_response.soup() + try_content = text_soup.select_one( + "div[id='siteTable'] div[class^='usertext']" + ) + if try_content: + content = try_content.get_text() + else: + content = "" + else: + content = "" data = RedditUserPost( title=title, url=post_url, + author_text=content, points=points, number_comments=nb_comments, published_date=published_date, link=link, - subreddit=sub + subreddit=sub, ) yield data n_crawled += 1 old_url = soup.scrape("span[class='next-button'] a", "href")[0] - - - - - - - - - - def get_user_comments(self, url: str, nb = 25): - old_url = get_old_url(url) \ No newline at end of file + def get_user_comments(self, url: str, nb=25): + old_url = get_old_url(url) diff --git a/minet/reddit/types.py b/minet/reddit/types.py index e27e5b736e..4ae9910bf1 100644 --- a/minet/reddit/types.py +++ b/minet/reddit/types.py @@ -31,8 +31,9 @@ class RedditComment(TabularRecord): class RedditUserPost(TabularRecord): title: str url: str + author_text: str points: int number_comments: int published_date: str link: str - subreddit: str \ No newline at end of file + subreddit: str From e3a96afc439477dc8f2d718c86f07152b3cfd470 Mon Sep 17 00:00:00 2001 From: Julien Pontoire Date: Fri, 20 Dec 2024 15:51:55 +0100 Subject: [PATCH 15/47] Refacto --- minet/reddit/scraper.py | 157 ++++++++++++++++++---------------------- minet/reddit/types.py | 4 +- 2 files changed, 73 insertions(+), 88 deletions(-) diff --git a/minet/reddit/scraper.py b/minet/reddit/scraper.py index 33e80c7bf6..1a77120ee9 100644 --- a/minet/reddit/scraper.py +++ b/minet/reddit/scraper.py @@ -93,6 +93,50 @@ def get_childs_l500(self, url, list_comments, parent_id): list_comments.append((parent_id, ele)) return list_comments + def get_post_standard_info(self, post, add_text): + list_buttons = post.select_one("ul[class='flat-list buttons']") + if len(list_buttons.scrape("span[class='promoted-span']")) == 0: + title = post.force_select_one("a[class*='title']").get_text() + post_url = list_buttons.scrape_one("a[class^='bylink comments']", "href") + n_comments = list_buttons.select_one( + "a[class^='bylink comments']" + ).get_text() + match = re.match(r"(\d+)\s+comments", n_comments) + if match: + n_comments = int(match.group(1)) + else: + n_comments = 0 + upvote = post.select_one("div[class='score unvoted']").get_text() + if upvote == "•": + upvote = "" + published_date = post.scrape_one("time", "datetime") + link = resolve_relative_url(post.scrape_one("a[class*='title']", "href")) + if link == post_url: + link = "" + if add_text: + text_response = reddit_request(post_url, self.pool_manager) + text_soup = text_response.soup() + try_content = 
text_soup.select_one( + "div[id='siteTable'] div[class^='usertext']" + ) + if try_content: + content = try_content.get_text() + else: + content = "" + else: + content = "" + + data = { + "title": title, + "url": post_url, + "author_text": content, + "points": upvote, + "number_comments": n_comments, + "published_date": published_date, + "link": link, + } + return data + def get_posts(self, url: str, add_text: bool, nb_post=25): nb_pages = ceil(int(nb_post) / 25) old_url = get_old_url(get_url_from_subreddit(url)) @@ -106,52 +150,22 @@ def get_posts(self, url: str, add_text: bool, nb_post=25): for post in posts: if n_crawled == int(nb_post): break - list_buttons = post.select_one("ul[class='flat-list buttons']") - if len(list_buttons.scrape("span[class='promoted-span']")) == 0: - title = post.force_select_one("a[class*='title']").get_text() - post_url = list_buttons.scrape_one( - "a[class^='bylink comments']", "href" - ) - n_comments = list_buttons.select_one( - "a[class^='bylink comments']" - ).get_text() - match = re.match(r"(\d+)\s+comments", n_comments) - if match: - n_comments = int(match.group(1)) - else: - n_comments = 0 + data = self.get_post_standard_info(post, add_text) + if data: try_author = post.select_one("a[class*='author']") author = try_author.get_text() if try_author else "Deleted" - upvote = post.select_one("div[class='score unvoted']").get_text() - if upvote == "•": - upvote = "" - published_date = post.scrape_one("time", "datetime") - link = post.scrape_one("a[class*='title']", "href") - if add_text: - text_response = reddit_request(post_url, self.pool_manager) - text_soup = text_response.soup() - try_content = text_soup.select_one( - "div[id='siteTable'] div[class^='usertext']" - ) - if try_content: - content = try_content.get_text() - else: - content = "" - else: - content = "" - - data = RedditPost( - title=title, - url=post_url, + post = RedditPost( + title=data["title"], + url=data["url"], author=author, - author_text=content, - points=upvote, - number_comments=n_comments, - published_date=published_date, - link=resolve_relative_url(link), + author_text=data["author_text"], + points=data["points"], + number_comments=data["number_comments"], + published_date=data["published_date"], + external_link=data["link"], ) - yield data - n_crawled += 1 + yield post + n_crawled += 1 old_url = soup.scrape("span[class='next-button'] a", "href")[0] def get_comments(self, url: str, all): @@ -237,51 +251,22 @@ def get_user_posts(self, url: str, add_text: bool, nb=25): for post in posts: if n_crawled == int(nb): break - sub = post.scrape_one("a[class*='subreddit']", "href") - title = post.scrape_one("a[class^='title']") - points = post.scrape_one("div[class='score unvoted']") - if points == "•": - points = "" - post_url = post.scrape_one("a[class^='bylink comment']", "href") - nb_comments = post.scrape_one("a[class^='bylink comment']") - match = re.match(r"(\d+)\s+comments", nb_comments) - if match: - nb_comments = int(match.group(1)) - else: - nb_comments = 0 - link = resolve_relative_url( - post.scrape_one("a[class^='title']", "href") - ) - if link == post_url: - link = "" - published_date = post.scrape("time", "datetime") - if add_text: - text_response = reddit_request(post_url, self.pool_manager) - text_soup = text_response.soup() - try_content = text_soup.select_one( - "div[id='siteTable'] div[class^='usertext']" + data = self.get_post_standard_info(post, add_text) + if data: + sub = post.scrape_one("a[class*='subreddit']", "href") + post = RedditUserPost( + title=data["title"], 
+ url=data["url"], + author_text=data["author_text"], + points=data["points"], + number_comments=data["number_comments"], + published_date=data["published_date"], + external_link=data["link"], + subreddit=sub, ) - if try_content: - content = try_content.get_text() - else: - content = "" - else: - content = "" - - data = RedditUserPost( - title=title, - url=post_url, - author_text=content, - points=points, - number_comments=nb_comments, - published_date=published_date, - link=link, - subreddit=sub, - ) - - yield data + yield post n_crawled += 1 old_url = soup.scrape("span[class='next-button'] a", "href")[0] - def get_user_comments(self, url: str, nb=25): - old_url = get_old_url(url) + # def get_user_comments(self, url: str, nb=25): + # old_url = get_old_url(url) diff --git a/minet/reddit/types.py b/minet/reddit/types.py index 4ae9910bf1..3fcadc469f 100644 --- a/minet/reddit/types.py +++ b/minet/reddit/types.py @@ -13,7 +13,7 @@ class RedditPost(TabularRecord): points: int number_comments: int published_date: str - link: Optional[str] + external_link: Optional[str] @dataclass @@ -35,5 +35,5 @@ class RedditUserPost(TabularRecord): points: int number_comments: int published_date: str - link: str + external_link: str subreddit: str From 2fb4cd2e6f7f1e907b350e4a29efedfbee1c8b35 Mon Sep 17 00:00:00 2001 From: Julien Pontoire Date: Fri, 20 Dec 2024 16:17:59 +0100 Subject: [PATCH 16/47] better refacto --- .gitignore | 1 + minet/cli/reddit/posts.py | 10 +- minet/cli/reddit/user_posts.py | 10 +- minet/reddit/scraper.py | 181 +++++++++++++++++---------------- 4 files changed, 106 insertions(+), 96 deletions(-) diff --git a/.gitignore b/.gitignore index b2de0b22ba..ddd3a616c1 100644 --- a/.gitignore +++ b/.gitignore @@ -25,6 +25,7 @@ ftest/*.csv *.sqlar *-wal *-shm +*.csv /crawl /downloaded diff --git a/minet/cli/reddit/posts.py b/minet/cli/reddit/posts.py index 68dd9d68cb..d73ef812f1 100644 --- a/minet/cli/reddit/posts.py +++ b/minet/cli/reddit/posts.py @@ -20,6 +20,8 @@ def action(cli_args, enricher, loading_bar): scraper = RedditScraper() + type_page = 'subreddit' + for i, row, url in enricher.enumerate_cells( cli_args.column, with_rows=True, start=1 ): @@ -27,14 +29,14 @@ def action(cli_args, enricher, loading_bar): try: if cli_args.number: if cli_args.text: - posts = scraper.get_posts(url, True, cli_args.number) + posts = scraper.get_general_post(url, type_page, True, cli_args.number) else: - posts = scraper.get_posts(url, False, cli_args.number) + posts = scraper.get_general_post(url, type_page, False, cli_args.number) else: if cli_args.text: - posts = scraper.get_posts(url, True) + posts = scraper.get_general_post(url, type_page, True) else: - posts = scraper.get_posts(url, False) + posts = scraper.get_general_post(url, type_page, False) except RedditInvalidTargetError: loading_bar.print( "the script could not complete normally on line %i" % (i) diff --git a/minet/cli/reddit/user_posts.py b/minet/cli/reddit/user_posts.py index 6ca7008b3a..d55abda1b3 100644 --- a/minet/cli/reddit/user_posts.py +++ b/minet/cli/reddit/user_posts.py @@ -20,6 +20,8 @@ def action(cli_args, enricher, loading_bar): scraper = RedditScraper() + type_page = 'user' + for i, row, url in enricher.enumerate_cells( cli_args.column, with_rows=True, start=1 ): @@ -27,14 +29,14 @@ def action(cli_args, enricher, loading_bar): try: if cli_args.number: if cli_args.text: - posts = scraper.get_user_posts(url, True, cli_args.number) + posts = scraper.get_general_post(url, type_page, True, cli_args.number) else: - posts = 
scraper.get_user_posts(url, False, cli_args.number) + posts = scraper.get_general_post(url, type_page, False, cli_args.number) else: if cli_args.text: - posts = scraper.get_user_posts(url, True) + posts = scraper.get_general_post(url, type_page, True) else: - posts = scraper.get_user_posts(url, False) + posts = scraper.get_general_post(url, type_page, False) except RedditInvalidTargetError: loading_bar.print( "the script could not complete normally on line %i" % (i) diff --git a/minet/reddit/scraper.py b/minet/reddit/scraper.py index 1a77120ee9..8d54530ae6 100644 --- a/minet/reddit/scraper.py +++ b/minet/reddit/scraper.py @@ -66,6 +66,41 @@ def get_current_id(com): return current_id +def data_posts( + post, title, url, author_text, points, number_comments, published_date, link +): + try_author = post.select_one("a[class*='author']") + author = try_author.get_text() if try_author else "Deleted" + data = RedditPost( + title=title, + url=url, + author=author, + author_text=author_text, + points=points, + number_comments=number_comments, + published_date=published_date, + external_link=link, + ) + return data + + +def data_user_posts( + post, title, url, author_text, points, number_comments, published_date, link +): + sub = post.scrape_one("a[class*='subreddit']", "href") + data = RedditUserPost( + title=title, + url=url, + author_text=author_text, + points=points, + number_comments=number_comments, + published_date=published_date, + external_link=link, + subreddit=sub, + ) + return data + + class RedditScraper(object): def __init__(self): self.pool_manager = create_pool_manager() @@ -93,81 +128,6 @@ def get_childs_l500(self, url, list_comments, parent_id): list_comments.append((parent_id, ele)) return list_comments - def get_post_standard_info(self, post, add_text): - list_buttons = post.select_one("ul[class='flat-list buttons']") - if len(list_buttons.scrape("span[class='promoted-span']")) == 0: - title = post.force_select_one("a[class*='title']").get_text() - post_url = list_buttons.scrape_one("a[class^='bylink comments']", "href") - n_comments = list_buttons.select_one( - "a[class^='bylink comments']" - ).get_text() - match = re.match(r"(\d+)\s+comments", n_comments) - if match: - n_comments = int(match.group(1)) - else: - n_comments = 0 - upvote = post.select_one("div[class='score unvoted']").get_text() - if upvote == "•": - upvote = "" - published_date = post.scrape_one("time", "datetime") - link = resolve_relative_url(post.scrape_one("a[class*='title']", "href")) - if link == post_url: - link = "" - if add_text: - text_response = reddit_request(post_url, self.pool_manager) - text_soup = text_response.soup() - try_content = text_soup.select_one( - "div[id='siteTable'] div[class^='usertext']" - ) - if try_content: - content = try_content.get_text() - else: - content = "" - else: - content = "" - - data = { - "title": title, - "url": post_url, - "author_text": content, - "points": upvote, - "number_comments": n_comments, - "published_date": published_date, - "link": link, - } - return data - - def get_posts(self, url: str, add_text: bool, nb_post=25): - nb_pages = ceil(int(nb_post) / 25) - old_url = get_old_url(get_url_from_subreddit(url)) - n_crawled = 0 - for _ in range(nb_pages): - if n_crawled == int(nb_post): - break - response = reddit_request(old_url, self.pool_manager) - soup = response.soup() - posts = soup.select("div[id^='thing_t3_']") - for post in posts: - if n_crawled == int(nb_post): - break - data = self.get_post_standard_info(post, add_text) - if data: - try_author = 
post.select_one("a[class*='author']") - author = try_author.get_text() if try_author else "Deleted" - post = RedditPost( - title=data["title"], - url=data["url"], - author=author, - author_text=data["author_text"], - points=data["points"], - number_comments=data["number_comments"], - published_date=data["published_date"], - external_link=data["link"], - ) - yield post - n_crawled += 1 - old_url = soup.scrape("span[class='next-button'] a", "href")[0] - def get_comments(self, url: str, all): m_comments = [] old_url = get_old_url(url) @@ -238,7 +198,7 @@ def get_comments(self, url: str, all): if data.id != "": yield data - def get_user_posts(self, url: str, add_text: bool, nb=25): + def get_general_post(self, url: str, type: str, add_text: bool, nb=25): nb_pages = ceil(int(nb) / 25) n_crawled = 0 old_url = get_old_url(url) @@ -251,19 +211,64 @@ def get_user_posts(self, url: str, add_text: bool, nb=25): for post in posts: if n_crawled == int(nb): break - data = self.get_post_standard_info(post, add_text) - if data: - sub = post.scrape_one("a[class*='subreddit']", "href") - post = RedditUserPost( - title=data["title"], - url=data["url"], - author_text=data["author_text"], - points=data["points"], - number_comments=data["number_comments"], - published_date=data["published_date"], - external_link=data["link"], - subreddit=sub, + list_buttons = post.select_one("ul[class='flat-list buttons']") + if len(list_buttons.scrape("span[class='promoted-span']")) == 0: + title = post.force_select_one("a[class*='title']").get_text() + post_url = list_buttons.scrape_one( + "a[class^='bylink comments']", "href" ) + n_comments = list_buttons.select_one( + "a[class^='bylink comments']" + ).get_text() + match = re.match(r"(\d+)\s+comments", n_comments) + if match: + n_comments = int(match.group(1)) + else: + n_comments = 0 + upvote = post.select_one("div[class='score unvoted']").get_text() + if upvote == "•": + upvote = "" + published_date = post.scrape_one("time", "datetime") + link = resolve_relative_url( + post.scrape_one("a[class*='title']", "href") + ) + if link == post_url: + link = "" + if add_text: + text_response = reddit_request(post_url, self.pool_manager) + text_soup = text_response.soup() + try_content = text_soup.select_one( + "div[id='siteTable'] div[class^='usertext']" + ) + if try_content: + content = try_content.get_text() + else: + content = "" + else: + content = "" + if type == "subreddit": + post = data_posts( + post, + title, + post_url, + content, + upvote, + n_comments, + published_date, + link, + ) + else: + post = data_user_posts( + post, + title, + post_url, + content, + upvote, + n_comments, + published_date, + link, + ) + yield post n_crawled += 1 old_url = soup.scrape("span[class='next-button'] a", "href")[0] From bc9ff739a590bbed6a49fafa644dc35bcd3f1bb8 Mon Sep 17 00:00:00 2001 From: Julien Pontoire Date: Fri, 20 Dec 2024 17:04:41 +0100 Subject: [PATCH 17/47] Adding reddit user_comments --- minet/cli/reddit/__init__.py | 27 ++++++++++++++++++++ minet/cli/reddit/user_comments.py | 41 +++++++++++++++++++++++++++++++ minet/reddit/scraper.py | 35 +++++++++++++++++++++++--- minet/reddit/types.py | 11 +++++++++ 4 files changed, 111 insertions(+), 3 deletions(-) create mode 100644 minet/cli/reddit/user_comments.py diff --git a/minet/cli/reddit/__init__.py b/minet/cli/reddit/__init__.py index 0c0e58383b..cea5e8b8fb 100644 --- a/minet/cli/reddit/__init__.py +++ b/minet/cli/reddit/__init__.py @@ -99,6 +99,32 @@ ], ) +REDDIT_USER_COMMENTS_SUBCOMMAND = command( + "user_comments", + 
"minet.cli.reddit.user_comments", + title="Minet Reddit User Comments Command", + description=""" + Retrieve reddit comments from a user link. + """, + epilog=""" + Example: + + . Searching posts from the user page of u/random_user: + $ minet reddit posts https://www.reddit.com/user/random_user/comments/ > random_user_comments.csv + """, + variadic_input={ + "dummy_column": "user", + "item_label": "user url, user shortcode or user id", + "item_label_plural": "user urls, user shortcodes or user ids", + }, + arguments=[ + { + "flags": ["-n", "--number"], + "help": "Number of posts to retrieve.", + "type": int, + }, + ], +) REDDIT_COMMAND = command( "reddit", @@ -112,5 +138,6 @@ REDDIT_POSTS_SUBCOMMAND, REDDIT_COMMENTS_SUBCOMMAND, REDDIT_USER_POSTS_SUBCOMMAND, + REDDIT_USER_COMMENTS_SUBCOMMAND, ], ) diff --git a/minet/cli/reddit/user_comments.py b/minet/cli/reddit/user_comments.py new file mode 100644 index 0000000000..c2e48ccefd --- /dev/null +++ b/minet/cli/reddit/user_comments.py @@ -0,0 +1,41 @@ +# ============================================================================= +# Minet Reddit Comments CLI Action +# ============================================================================= +# +# Logic of the `rd user_comments` action. +# +from minet.cli.utils import with_enricher_and_loading_bar +from minet.reddit.scraper import RedditScraper +from minet.reddit.types import RedditUserComment +from minet.reddit.exceptions import RedditInvalidTargetError + + +@with_enricher_and_loading_bar( + headers=RedditUserComment, + title="Scraping user comments", + unit="groups", + nested=True, + sub_unit="comments", +) +def action(cli_args, enricher, loading_bar): + scraper = RedditScraper() + + for i, row, url in enricher.enumerate_cells( + cli_args.column, with_rows=True, start=1 + ): + with loading_bar.step(url): + try: + if cli_args.number: + posts = scraper.get_user_comments(url, cli_args.number) + else: + posts = scraper.get_user_comments(url) + + except RedditInvalidTargetError: + loading_bar.print( + "the script could not complete normally on line %i" % (i) + ) + continue + + for post in posts: + loading_bar.nested_advance() + enricher.writerow(row, post) diff --git a/minet/reddit/scraper.py b/minet/reddit/scraper.py index 8d54530ae6..28b8f0a4b3 100644 --- a/minet/reddit/scraper.py +++ b/minet/reddit/scraper.py @@ -2,7 +2,7 @@ from math import ceil from ural import get_domain_name, urlpathsplit, is_url from time import sleep -from minet.reddit.types import RedditPost, RedditComment, RedditUserPost +from minet.reddit.types import RedditPost, RedditComment, RedditUserPost, RedditUserComment from minet.reddit.exceptions import RedditInvalidTargetError import re from urllib.parse import urljoin @@ -273,5 +273,34 @@ def get_general_post(self, url: str, type: str, add_text: bool, nb=25): n_crawled += 1 old_url = soup.scrape("span[class='next-button'] a", "href")[0] - # def get_user_comments(self, url: str, nb=25): - # old_url = get_old_url(url) + def get_user_comments(self, url: str, nb=25): + nb_pages = ceil(int(nb) / 25) + n_crawled = 0 + old_url = get_old_url(url) + for _ in range(nb_pages): + if n_crawled == int(nb): + break + response = reddit_request(old_url, self.pool_manager) + soup = response.soup() + comments = soup.select("[data-type='comment']") + for comment in comments: + if n_crawled == int(nb): + break + post_title = resolve_relative_url(comment.scrape_one("a[class='title']", "href")) + post_author = comment.scrape_one("p[class='parent']>a[class^='author']", "href") + post_subreddit 
= comment.scrape_one("a[class^='subreddit']", "href") + points = comment.scrape_one("span[class='score unvoted']") + published_date = comment.scrape_one("time", "datetime") + text = comment.scrape_one("div[class='content'] div[class='md']") + comment_url = comment.scrape_one("a[class='bylink']", "href") + data = RedditUserComment( + post_title=post_title, + post_author=post_author, + post_subreddit=post_subreddit, + points=points, + published_date=published_date, + text=text, + comment_url=comment_url + ) + yield data + n_crawled += 1 diff --git a/minet/reddit/types.py b/minet/reddit/types.py index 3fcadc469f..4db1db8c6a 100644 --- a/minet/reddit/types.py +++ b/minet/reddit/types.py @@ -37,3 +37,14 @@ class RedditUserPost(TabularRecord): published_date: str external_link: str subreddit: str + + +@dataclass +class RedditUserComment(TabularRecord): + post_title: str + post_author: str + post_subreddit: str + points: int + published_date: str + text: str + comment_url: str From 47c1ae58bebc3ab1a97af784f3c72d2bab5ad157 Mon Sep 17 00:00:00 2001 From: Julien Pontoire Date: Tue, 7 Jan 2025 16:06:06 +0100 Subject: [PATCH 18/47] adding scraped values for points and comments --- minet/reddit/scraper.py | 29 +++++++++++++++++++---------- minet/reddit/types.py | 8 ++++++-- 2 files changed, 25 insertions(+), 12 deletions(-) diff --git a/minet/reddit/scraper.py b/minet/reddit/scraper.py index 28b8f0a4b3..f81b812e0a 100644 --- a/minet/reddit/scraper.py +++ b/minet/reddit/scraper.py @@ -67,16 +67,18 @@ def get_current_id(com): def data_posts( - post, title, url, author_text, points, number_comments, published_date, link + post, title, url, author_text, real_points, points, scraped_number_comments, number_comments, published_date, link ): try_author = post.select_one("a[class*='author']") author = try_author.get_text() if try_author else "Deleted" data = RedditPost( title=title, - url=url, + url=get_new_url(url), author=author, author_text=author_text, - points=points, + scraped_points=points, + approximated_points=real_points, + scraped_number_comments=scraped_number_comments, number_comments=number_comments, published_date=published_date, external_link=link, @@ -85,14 +87,16 @@ def data_posts( def data_user_posts( - post, title, url, author_text, points, number_comments, published_date, link + post, title, url, author_text, real_points, points, scraped_number_comments, number_comments, published_date, link ): sub = post.scrape_one("a[class*='subreddit']", "href") data = RedditUserPost( title=title, - url=url, + url=get_new_url(url), author_text=author_text, - points=points, + scraped_points=points, + approximated_points=real_points, + scraped_number_comments=scraped_number_comments, number_comments=number_comments, published_date=published_date, external_link=link, @@ -217,17 +221,18 @@ def get_general_post(self, url: str, type: str, add_text: bool, nb=25): post_url = list_buttons.scrape_one( "a[class^='bylink comments']", "href" ) - n_comments = list_buttons.select_one( + n_comments_scraped = list_buttons.select_one( "a[class^='bylink comments']" ).get_text() - match = re.match(r"(\d+)\s+comments", n_comments) + match = re.match(r"(\d+)\s+comment(s)?", n_comments_scraped) if match: n_comments = int(match.group(1)) else: n_comments = 0 upvote = post.select_one("div[class='score unvoted']").get_text() - if upvote == "•": - upvote = "" + real_points = "" if upvote == "•" else upvote + if real_points[-1] == "k": + real_points = int(float(real_points[:-1]) * 1000) published_date = post.scrape_one("time", 
"datetime") link = resolve_relative_url( post.scrape_one("a[class*='title']", "href") @@ -252,7 +257,9 @@ def get_general_post(self, url: str, type: str, add_text: bool, nb=25): title, post_url, content, + real_points, upvote, + n_comments_scraped, n_comments, published_date, link, @@ -263,7 +270,9 @@ def get_general_post(self, url: str, type: str, add_text: bool, nb=25): title, post_url, content, + real_points, upvote, + n_comments_scraped, n_comments, published_date, link, diff --git a/minet/reddit/types.py b/minet/reddit/types.py index 4db1db8c6a..427c6346c5 100644 --- a/minet/reddit/types.py +++ b/minet/reddit/types.py @@ -10,7 +10,9 @@ class RedditPost(TabularRecord): url: str author: str author_text: Optional[str] - points: int + scraped_points: str + approximated_points: int + scraped_number_comments: str number_comments: int published_date: str external_link: Optional[str] @@ -32,7 +34,9 @@ class RedditUserPost(TabularRecord): title: str url: str author_text: str - points: int + scraped_points: str + approximated_points: int + scraped_number_comments: str number_comments: int published_date: str external_link: str From 063211214bc86ab9f5534f265f30655c18400027 Mon Sep 17 00:00:00 2001 From: Julien Pontoire Date: Wed, 8 Jan 2025 10:34:20 +0100 Subject: [PATCH 19/47] Handle broken and banned pages --- minet/reddit/scraper.py | 289 ++++++++++++++++++++++++++-------------- minet/reddit/types.py | 4 + 2 files changed, 196 insertions(+), 97 deletions(-) diff --git a/minet/reddit/scraper.py b/minet/reddit/scraper.py index f81b812e0a..280910b38a 100644 --- a/minet/reddit/scraper.py +++ b/minet/reddit/scraper.py @@ -2,12 +2,23 @@ from math import ceil from ural import get_domain_name, urlpathsplit, is_url from time import sleep -from minet.reddit.types import RedditPost, RedditComment, RedditUserPost, RedditUserComment +from minet.reddit.types import ( + RedditPost, + RedditComment, + RedditUserPost, + RedditUserComment, +) from minet.reddit.exceptions import RedditInvalidTargetError import re from urllib.parse import urljoin +def broken_reddit(soup, response): + if response.status == 500 and soup.scrape("title") == "reddit broke!": + return 0 + return 1 + + def resolve_relative_url(path): return urljoin("https://old.reddit.com", path) @@ -37,6 +48,10 @@ def reddit_request(url, pool_manager): sleep(1) response = request(url, pool_manager=pool_manager) soup = response.soup() + if response.status == 500 and soup.scrape_one("img", "alt") == "you broke reddit": + return response, soup, "broken page" + if response.status == 404 and soup.scrape_one("img", "alt") == "banned": + return response, soup, "banned" if response.status == 404 or ( soup.scrape("p[id='noresults']") and not soup.scrape("div[class='commentarea']") ): @@ -49,7 +64,7 @@ def reddit_request(url, pool_manager): return reddit_request(url) if response.status == 429: return reddit_request(url) - return response + return response, soup, None def extract_t1_ids(text): @@ -67,7 +82,17 @@ def get_current_id(com): def data_posts( - post, title, url, author_text, real_points, points, scraped_number_comments, number_comments, published_date, link + post, + title, + url, + author_text, + real_points, + points, + scraped_number_comments, + number_comments, + published_date, + link, + error, ): try_author = post.select_one("a[class*='author']") author = try_author.get_text() if try_author else "Deleted" @@ -82,12 +107,23 @@ def data_posts( number_comments=number_comments, published_date=published_date, external_link=link, + error=error, ) 
return data def data_user_posts( - post, title, url, author_text, real_points, points, scraped_number_comments, number_comments, published_date, link + post, + title, + url, + author_text, + real_points, + points, + scraped_number_comments, + number_comments, + published_date, + link, + error, ): sub = post.scrape_one("a[class*='subreddit']", "href") data = RedditUserPost( @@ -101,6 +137,7 @@ def data_user_posts( published_date=published_date, external_link=link, subreddit=sub, + error=error, ) return data @@ -110,8 +147,7 @@ def __init__(self): self.pool_manager = create_pool_manager() def get_childs_l500(self, url, list_comments, parent_id): - response = reddit_request(url, self.pool_manager) - soup = response.soup() + _, soup, _ = reddit_request(url, self.pool_manager) comments = soup.select("div[class='commentarea']>div>div[class*='comment']") for com in comments: child = com.find("div", class_="child") @@ -136,71 +172,83 @@ def get_comments(self, url: str, all): m_comments = [] old_url = get_old_url(url) url_limit = old_url + "?limit=500" - response = reddit_request(url_limit, self.pool_manager) - soup = response.soup() - first_comments = soup.select( - "div[class='commentarea']>div>div[class*='comment']" - ) - for ele in first_comments: - m_comments.append((None, ele)) - while m_comments: - parent, com = m_comments.pop() - current_id = get_current_id(com) - comment_url = com.scrape_one("a[class='bylink']", "href") - try_author = com.scrape_one("a[class^='author']", "href") - author = try_author if try_author else "Deleted" - com_points = com.scrape_one("span[class='score unvoted']") - match = re.search(r"-?\d+\s+point(?:s)?", com_points) - com_points = int(re.search(r"-?\d+", match.group()).group()) - published_date = com.scrape_one("time", "datetime") - if "morerecursion" in com.get("class") and all: - url_rec = f"https://old.reddit.com{com.scrape_one('a', 'href')}" - m_comments = self.get_childs_l500(url_rec, m_comments, parent) - elif "morechildren" in com.get("class") and all: - a = com.select_one("a") - onclick = a["onclick"] - id_list = extract_t1_ids(onclick) - for id in id_list: - comment_url = f"{old_url}{id}" - m_comments = self.get_childs_l500( - comment_url, m_comments, current_id - ) - else: - child = com.find("div", class_="child") - if child.text != "": - child = child.find("div") - if all: - child_com = child.find_all( - "div", - class_=lambda x: x - and ( - "comment" in x - or "deleted comment" in x - or "morerecursion" in x - or "morechildren" in x - ), - recursive=False, - ) - else: - child_com = child.find_all( - "div", - class_=lambda x: x - and ("comment" in x or "deleted comment" in x), - recursive=False, + _, soup, error = reddit_request(url_limit, self.pool_manager) + if error: + yield RedditComment( + comment_url="", + author="", + id="", + parent="", + points="", + published_date="", + comment="", + error=error, + ) + else: + first_comments = soup.select( + "div[class='commentarea']>div>div[class*='comment']" + ) + for ele in first_comments: + m_comments.append((None, ele)) + while m_comments: + parent, com = m_comments.pop() + current_id = get_current_id(com) + comment_url = com.scrape_one("a[class='bylink']", "href") + try_author = com.scrape_one("a[class^='author']", "href") + author = try_author if try_author else "Deleted" + com_points = com.scrape_one("span[class='score unvoted']") + match = re.search(r"-?\d+\s+point(?:s)?", com_points) + com_points = int(re.search(r"-?\d+", match.group()).group()) + published_date = com.scrape_one("time", "datetime") 
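+                # "morerecursion" and "morechildren" are placeholder nodes that
+                # old reddit shows in place of deeply nested or collapsed
+                # replies; when `all` is set, follow them via get_childs_l500
+                # so the hidden part of the thread is scraped too.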
+ if "morerecursion" in com.get("class") and all: + url_rec = f"https://old.reddit.com{com.scrape_one('a', 'href')}" + m_comments = self.get_childs_l500(url_rec, m_comments, parent) + elif "morechildren" in com.get("class") and all: + a = com.select_one("a") + onclick = a["onclick"] + id_list = extract_t1_ids(onclick) + for id in id_list: + comment_url = f"{old_url}{id}" + m_comments = self.get_childs_l500( + comment_url, m_comments, current_id ) - for ele in child_com: - m_comments.append((current_id, ele)) - data = RedditComment( - comment_url=comment_url, - author=author, - id=current_id, - parent=parent, - points=com_points, - published_date=published_date, - comment=com.scrape_one("div[class='md']:not(div.child a)"), - ) - if data.id != "": - yield data + else: + child = com.find("div", class_="child") + if child.text != "": + child = child.find("div") + if all: + child_com = child.find_all( + "div", + class_=lambda x: x + and ( + "comment" in x + or "deleted comment" in x + or "morerecursion" in x + or "morechildren" in x + ), + recursive=False, + ) + else: + child_com = child.find_all( + "div", + class_=lambda x: x + and ("comment" in x or "deleted comment" in x), + recursive=False, + ) + for ele in child_com: + m_comments.append((current_id, ele)) + data = RedditComment( + comment_url=get_new_url(comment_url), + author=author, + id=current_id, + parent=parent, + points=com_points, + published_date=published_date, + comment=com.scrape_one("div[class='md']:not(div.child a)"), + error=error, + ) + if data.id != "": + yield data def get_general_post(self, url: str, type: str, add_text: bool, nb=25): nb_pages = ceil(int(nb) / 25) @@ -209,8 +257,7 @@ def get_general_post(self, url: str, type: str, add_text: bool, nb=25): for _ in range(nb_pages): if n_crawled == int(nb): break - response = reddit_request(old_url, self.pool_manager) - soup = response.soup() + _, soup, error = reddit_request(old_url, self.pool_manager) posts = soup.select("div[id^='thing_t3_']") for post in posts: if n_crawled == int(nb): @@ -240,8 +287,38 @@ def get_general_post(self, url: str, type: str, add_text: bool, nb=25): if link == post_url: link = "" if add_text: - text_response = reddit_request(post_url, self.pool_manager) - text_soup = text_response.soup() + _, text_soup, text_error = reddit_request( + post_url, self.pool_manager + ) + if text_error: + if type == "subreddit": + yield data_posts( + post, + title, + post_url, + "", + real_points, + upvote, + n_comments_scraped, + n_comments, + published_date, + link, + text_error, + ) + else: + yield data_user_posts( + post, + title, + post_url, + "", + real_points, + upvote, + n_comments_scraped, + n_comments, + published_date, + link, + text_error, + ) try_content = text_soup.select_one( "div[id='siteTable'] div[class^='usertext']" ) @@ -263,6 +340,7 @@ def get_general_post(self, url: str, type: str, add_text: bool, nb=25): n_comments, published_date, link, + error, ) else: post = data_user_posts( @@ -276,6 +354,7 @@ def get_general_post(self, url: str, type: str, add_text: bool, nb=25): n_comments, published_date, link, + error, ) yield post @@ -289,27 +368,43 @@ def get_user_comments(self, url: str, nb=25): for _ in range(nb_pages): if n_crawled == int(nb): break - response = reddit_request(old_url, self.pool_manager) - soup = response.soup() - comments = soup.select("[data-type='comment']") - for comment in comments: - if n_crawled == int(nb): - break - post_title = resolve_relative_url(comment.scrape_one("a[class='title']", "href")) - post_author = 
comment.scrape_one("p[class='parent']>a[class^='author']", "href") - post_subreddit = comment.scrape_one("a[class^='subreddit']", "href") - points = comment.scrape_one("span[class='score unvoted']") - published_date = comment.scrape_one("time", "datetime") - text = comment.scrape_one("div[class='content'] div[class='md']") - comment_url = comment.scrape_one("a[class='bylink']", "href") - data = RedditUserComment( - post_title=post_title, - post_author=post_author, - post_subreddit=post_subreddit, - points=points, - published_date=published_date, - text=text, - comment_url=comment_url + _, soup, error = reddit_request(old_url, self.pool_manager) + if error: + yield RedditUserComment( + post_title="", + post_author="", + post_subreddit="", + points="", + published_date="", + text="", + comment_url="", + error=error, ) - yield data - n_crawled += 1 + else: + comments = soup.select("[data-type='comment']") + for comment in comments: + if n_crawled == int(nb): + break + post_title = resolve_relative_url( + comment.scrape_one("a[class='title']", "href") + ) + post_author = comment.scrape_one( + "p[class='parent']>a[class^='author']", "href" + ) + post_subreddit = comment.scrape_one("a[class^='subreddit']", "href") + points = comment.scrape_one("span[class='score unvoted']") + published_date = comment.scrape_one("time", "datetime") + text = comment.scrape_one("div[class='content'] div[class='md']") + comment_url = comment.scrape_one("a[class='bylink']", "href") + data = RedditUserComment( + post_title=post_title, + post_author=post_author, + post_subreddit=post_subreddit, + points=points, + published_date=published_date, + text=text, + comment_url=comment_url, + error=error, + ) + yield data + n_crawled += 1 diff --git a/minet/reddit/types.py b/minet/reddit/types.py index 427c6346c5..a572003f72 100644 --- a/minet/reddit/types.py +++ b/minet/reddit/types.py @@ -16,6 +16,7 @@ class RedditPost(TabularRecord): number_comments: int published_date: str external_link: Optional[str] + error: str @dataclass @@ -27,6 +28,7 @@ class RedditComment(TabularRecord): points: int published_date: str comment: str + error: str @dataclass @@ -41,6 +43,7 @@ class RedditUserPost(TabularRecord): published_date: str external_link: str subreddit: str + error: str @dataclass @@ -52,3 +55,4 @@ class RedditUserComment(TabularRecord): published_date: str text: str comment_url: str + error: str From d770363553dfe9e9992e5ecd26d63c45d69dadfa Mon Sep 17 00:00:00 2001 From: Julien Pontoire Date: Wed, 8 Jan 2025 11:51:33 +0100 Subject: [PATCH 20/47] Better handling for scores --- minet/reddit/scraper.py | 52 +++++++++++++++++++---------------------- minet/reddit/types.py | 8 +++---- 2 files changed, 27 insertions(+), 33 deletions(-) diff --git a/minet/reddit/scraper.py b/minet/reddit/scraper.py index 280910b38a..42acc98d22 100644 --- a/minet/reddit/scraper.py +++ b/minet/reddit/scraper.py @@ -13,12 +13,6 @@ from urllib.parse import urljoin -def broken_reddit(soup, response): - if response.status == 500 and soup.scrape("title") == "reddit broke!": - return 0 - return 1 - - def resolve_relative_url(path): return urljoin("https://old.reddit.com", path) @@ -81,12 +75,22 @@ def get_current_id(com): return current_id +def get_points(ele): + scrapped_points = ele.select_one("[class='score unvoted']") + score_hidden = ele.select_one("[class='score-hidden']") + if not scrapped_points and not score_hidden: + return "deleted" + scrapped_points = ele.scrape_one("[class='score unvoted']", "title") + if not scrapped_points: + return "score 
hidden" + return scrapped_points + + def data_posts( post, title, url, author_text, - real_points, points, scraped_number_comments, number_comments, @@ -101,8 +105,7 @@ def data_posts( url=get_new_url(url), author=author, author_text=author_text, - scraped_points=points, - approximated_points=real_points, + points=points, scraped_number_comments=scraped_number_comments, number_comments=number_comments, published_date=published_date, @@ -117,7 +120,6 @@ def data_user_posts( title, url, author_text, - real_points, points, scraped_number_comments, number_comments, @@ -130,8 +132,7 @@ def data_user_posts( title=title, url=get_new_url(url), author_text=author_text, - scraped_points=points, - approximated_points=real_points, + points=points, scraped_number_comments=scraped_number_comments, number_comments=number_comments, published_date=published_date, @@ -194,11 +195,9 @@ def get_comments(self, url: str, all): parent, com = m_comments.pop() current_id = get_current_id(com) comment_url = com.scrape_one("a[class='bylink']", "href") - try_author = com.scrape_one("a[class^='author']", "href") + try_author = com.scrape_one("a[class^='author']") author = try_author if try_author else "Deleted" - com_points = com.scrape_one("span[class='score unvoted']") - match = re.search(r"-?\d+\s+point(?:s)?", com_points) - com_points = int(re.search(r"-?\d+", match.group()).group()) + points = get_points(com) published_date = com.scrape_one("time", "datetime") if "morerecursion" in com.get("class") and all: url_rec = f"https://old.reddit.com{com.scrape_one('a', 'href')}" @@ -238,11 +237,11 @@ def get_comments(self, url: str, all): for ele in child_com: m_comments.append((current_id, ele)) data = RedditComment( - comment_url=get_new_url(comment_url), + comment_url=get_new_url(comment_url) if comment_url else None, author=author, id=current_id, parent=parent, - points=com_points, + points=points, published_date=published_date, comment=com.scrape_one("div[class='md']:not(div.child a)"), error=error, @@ -276,10 +275,11 @@ def get_general_post(self, url: str, type: str, add_text: bool, nb=25): n_comments = int(match.group(1)) else: n_comments = 0 - upvote = post.select_one("div[class='score unvoted']").get_text() - real_points = "" if upvote == "•" else upvote - if real_points[-1] == "k": - real_points = int(float(real_points[:-1]) * 1000) + upvote = get_points(post) + # upvote = post.select_one("div[class='score unvoted']").get_text() + # real_points = "" if upvote == "•" else upvote + # if real_points[-1] == "k": + # real_points = int(float(real_points[:-1]) * 1000) published_date = post.scrape_one("time", "datetime") link = resolve_relative_url( post.scrape_one("a[class*='title']", "href") @@ -297,7 +297,6 @@ def get_general_post(self, url: str, type: str, add_text: bool, nb=25): title, post_url, "", - real_points, upvote, n_comments_scraped, n_comments, @@ -311,7 +310,6 @@ def get_general_post(self, url: str, type: str, add_text: bool, nb=25): title, post_url, "", - real_points, upvote, n_comments_scraped, n_comments, @@ -334,7 +332,6 @@ def get_general_post(self, url: str, type: str, add_text: bool, nb=25): title, post_url, content, - real_points, upvote, n_comments_scraped, n_comments, @@ -348,7 +345,6 @@ def get_general_post(self, url: str, type: str, add_text: bool, nb=25): title, post_url, content, - real_points, upvote, n_comments_scraped, n_comments, @@ -389,10 +385,10 @@ def get_user_comments(self, url: str, nb=25): comment.scrape_one("a[class='title']", "href") ) post_author = comment.scrape_one( - 
"p[class='parent']>a[class^='author']", "href" + "p[class='parent']>a[class^='author']" ) post_subreddit = comment.scrape_one("a[class^='subreddit']", "href") - points = comment.scrape_one("span[class='score unvoted']") + points = get_points(comment) published_date = comment.scrape_one("time", "datetime") text = comment.scrape_one("div[class='content'] div[class='md']") comment_url = comment.scrape_one("a[class='bylink']", "href") diff --git a/minet/reddit/types.py b/minet/reddit/types.py index a572003f72..2e9165b350 100644 --- a/minet/reddit/types.py +++ b/minet/reddit/types.py @@ -10,8 +10,7 @@ class RedditPost(TabularRecord): url: str author: str author_text: Optional[str] - scraped_points: str - approximated_points: int + points: str scraped_number_comments: str number_comments: int published_date: str @@ -25,7 +24,7 @@ class RedditComment(TabularRecord): author: str id: str parent: str - points: int + points: str published_date: str comment: str error: str @@ -36,8 +35,7 @@ class RedditUserPost(TabularRecord): title: str url: str author_text: str - scraped_points: str - approximated_points: int + points: str scraped_number_comments: str number_comments: int published_date: str From 240f1f2086238febebf10884b8de89797ea89b6c Mon Sep 17 00:00:00 2001 From: Julien Pontoire Date: Wed, 8 Jan 2025 13:39:57 +0100 Subject: [PATCH 21/47] Draft of edited_date --- minet/reddit/scraper.py | 27 ++++++++++++++++++++------- minet/reddit/types.py | 4 ++++ 2 files changed, 24 insertions(+), 7 deletions(-) diff --git a/minet/reddit/scraper.py b/minet/reddit/scraper.py index 42acc98d22..9a243516fd 100644 --- a/minet/reddit/scraper.py +++ b/minet/reddit/scraper.py @@ -86,6 +86,12 @@ def get_points(ele): return scrapped_points +def get_dates(ele): + published_date = ele.scrape_one("time[class='']", "datetime") + edited_date = ele.scrape_one("time[class='edited-timestamp']", "datetime") + return published_date, edited_date + + def data_posts( post, title, @@ -95,6 +101,7 @@ def data_posts( scraped_number_comments, number_comments, published_date, + edited_date, link, error, ): @@ -109,6 +116,7 @@ def data_posts( scraped_number_comments=scraped_number_comments, number_comments=number_comments, published_date=published_date, + edited_date=edited_date, external_link=link, error=error, ) @@ -124,6 +132,7 @@ def data_user_posts( scraped_number_comments, number_comments, published_date, + edited_date, link, error, ): @@ -136,6 +145,7 @@ def data_user_posts( scraped_number_comments=scraped_number_comments, number_comments=number_comments, published_date=published_date, + edited_date=edited_date, external_link=link, subreddit=sub, error=error, @@ -198,7 +208,7 @@ def get_comments(self, url: str, all): try_author = com.scrape_one("a[class^='author']") author = try_author if try_author else "Deleted" points = get_points(com) - published_date = com.scrape_one("time", "datetime") + published_date, edited_date = get_dates(com) if "morerecursion" in com.get("class") and all: url_rec = f"https://old.reddit.com{com.scrape_one('a', 'href')}" m_comments = self.get_childs_l500(url_rec, m_comments, parent) @@ -243,6 +253,7 @@ def get_comments(self, url: str, all): parent=parent, points=points, published_date=published_date, + edited_date=edited_date, comment=com.scrape_one("div[class='md']:not(div.child a)"), error=error, ) @@ -276,11 +287,7 @@ def get_general_post(self, url: str, type: str, add_text: bool, nb=25): else: n_comments = 0 upvote = get_points(post) - # upvote = post.select_one("div[class='score 
unvoted']").get_text() - # real_points = "" if upvote == "•" else upvote - # if real_points[-1] == "k": - # real_points = int(float(real_points[:-1]) * 1000) - published_date = post.scrape_one("time", "datetime") + published_date, edited_date = get_dates(post) link = resolve_relative_url( post.scrape_one("a[class*='title']", "href") ) @@ -301,6 +308,7 @@ def get_general_post(self, url: str, type: str, add_text: bool, nb=25): n_comments_scraped, n_comments, published_date, + edited_date, link, text_error, ) @@ -314,6 +322,7 @@ def get_general_post(self, url: str, type: str, add_text: bool, nb=25): n_comments_scraped, n_comments, published_date, + edited_date, link, text_error, ) @@ -336,6 +345,7 @@ def get_general_post(self, url: str, type: str, add_text: bool, nb=25): n_comments_scraped, n_comments, published_date, + edited_date, link, error, ) @@ -349,6 +359,7 @@ def get_general_post(self, url: str, type: str, add_text: bool, nb=25): n_comments_scraped, n_comments, published_date, + edited_date, link, error, ) @@ -372,6 +383,7 @@ def get_user_comments(self, url: str, nb=25): post_subreddit="", points="", published_date="", + edited_date="", text="", comment_url="", error=error, @@ -389,7 +401,7 @@ def get_user_comments(self, url: str, nb=25): ) post_subreddit = comment.scrape_one("a[class^='subreddit']", "href") points = get_points(comment) - published_date = comment.scrape_one("time", "datetime") + published_date, edited_date = get_dates(comment) text = comment.scrape_one("div[class='content'] div[class='md']") comment_url = comment.scrape_one("a[class='bylink']", "href") data = RedditUserComment( @@ -398,6 +410,7 @@ def get_user_comments(self, url: str, nb=25): post_subreddit=post_subreddit, points=points, published_date=published_date, + edited_date=edited_date, text=text, comment_url=comment_url, error=error, diff --git a/minet/reddit/types.py b/minet/reddit/types.py index 2e9165b350..9fa2bff317 100644 --- a/minet/reddit/types.py +++ b/minet/reddit/types.py @@ -14,6 +14,7 @@ class RedditPost(TabularRecord): scraped_number_comments: str number_comments: int published_date: str + edited_date: str external_link: Optional[str] error: str @@ -26,6 +27,7 @@ class RedditComment(TabularRecord): parent: str points: str published_date: str + edited_date: str comment: str error: str @@ -39,6 +41,7 @@ class RedditUserPost(TabularRecord): scraped_number_comments: str number_comments: int published_date: str + edited_date: str external_link: str subreddit: str error: str @@ -51,6 +54,7 @@ class RedditUserComment(TabularRecord): post_subreddit: str points: int published_date: str + edited_date: str text: str comment_url: str error: str From abff314824a7582c4106d02c9a636d8a99d30a9e Mon Sep 17 00:00:00 2001 From: Julien Pontoire Date: Wed, 8 Jan 2025 13:44:56 +0100 Subject: [PATCH 22/47] Fixing error when no pagination and edited_date --- minet/reddit/scraper.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/minet/reddit/scraper.py b/minet/reddit/scraper.py index 9a243516fd..be83c515d9 100644 --- a/minet/reddit/scraper.py +++ b/minet/reddit/scraper.py @@ -87,7 +87,7 @@ def get_points(ele): def get_dates(ele): - published_date = ele.scrape_one("time[class='']", "datetime") + published_date = ele.scrape_one("time", "datetime") edited_date = ele.scrape_one("time[class='edited-timestamp']", "datetime") return published_date, edited_date @@ -265,7 +265,7 @@ def get_general_post(self, url: str, type: str, add_text: bool, nb=25): n_crawled = 0 old_url = get_old_url(url) for _ in 
range(nb_pages): - if n_crawled == int(nb): + if n_crawled == int(nb) or not old_url: break _, soup, error = reddit_request(old_url, self.pool_manager) posts = soup.select("div[id^='thing_t3_']") @@ -366,7 +366,9 @@ def get_general_post(self, url: str, type: str, add_text: bool, nb=25): yield post n_crawled += 1 - old_url = soup.scrape("span[class='next-button'] a", "href")[0] + old_url = soup.scrape("span[class='next-button'] a") + if old_url: + old_url = old_url[0].get("href") def get_user_comments(self, url: str, nb=25): nb_pages = ceil(int(nb) / 25) From b045fb79bee72a275fca34a9042f611707a3cba8 Mon Sep 17 00:00:00 2001 From: Julien Pontoire Date: Wed, 8 Jan 2025 16:07:27 +0100 Subject: [PATCH 23/47] Fixing data in user_comments --- minet/reddit/scraper.py | 23 ++++++++++++++--------- minet/reddit/types.py | 2 ++ 2 files changed, 16 insertions(+), 9 deletions(-) diff --git a/minet/reddit/scraper.py b/minet/reddit/scraper.py index be83c515d9..b895ca6b8c 100644 --- a/minet/reddit/scraper.py +++ b/minet/reddit/scraper.py @@ -107,6 +107,8 @@ def data_posts( ): try_author = post.select_one("a[class*='author']") author = try_author.get_text() if try_author else "Deleted" + if get_domain_name(link) == "reddit.com": + link = "" data = RedditPost( title=title, url=get_new_url(url), @@ -147,7 +149,7 @@ def data_user_posts( published_date=published_date, edited_date=edited_date, external_link=link, - subreddit=sub, + subreddit=get_new_url(sub), error=error, ) return data @@ -366,9 +368,7 @@ def get_general_post(self, url: str, type: str, add_text: bool, nb=25): yield post n_crawled += 1 - old_url = soup.scrape("span[class='next-button'] a") - if old_url: - old_url = old_url[0].get("href") + old_url = soup.scrape_one("span[class='next-button'] a", "href") def get_user_comments(self, url: str, nb=25): nb_pages = ceil(int(nb) / 25) @@ -395,9 +395,8 @@ def get_user_comments(self, url: str, nb=25): for comment in comments: if n_crawled == int(nb): break - post_title = resolve_relative_url( - comment.scrape_one("a[class='title']", "href") - ) + post_title = comment.scrape_one("a[class='title']") + post_url = comment.scrape_one("a[class='bylink may-blank']", "href") post_author = comment.scrape_one( "p[class='parent']>a[class^='author']" ) @@ -405,17 +404,23 @@ def get_user_comments(self, url: str, nb=25): points = get_points(comment) published_date, edited_date = get_dates(comment) text = comment.scrape_one("div[class='content'] div[class='md']") + link = comment.scrape_one( + "div[class='content'] div[class='md'] a", "href" + ) comment_url = comment.scrape_one("a[class='bylink']", "href") data = RedditUserComment( post_title=post_title, + post_url=get_new_url(post_url), post_author=post_author, - post_subreddit=post_subreddit, + post_subreddit=get_new_url(post_subreddit), points=points, published_date=published_date, edited_date=edited_date, text=text, - comment_url=comment_url, + link=link, + comment_url=get_new_url(comment_url), error=error, ) yield data n_crawled += 1 + old_url = soup.scrape_one("span[class='next-button'] a", "href") diff --git a/minet/reddit/types.py b/minet/reddit/types.py index 9fa2bff317..d3e16aaafd 100644 --- a/minet/reddit/types.py +++ b/minet/reddit/types.py @@ -50,11 +50,13 @@ class RedditUserPost(TabularRecord): @dataclass class RedditUserComment(TabularRecord): post_title: str + post_url: str post_author: str post_subreddit: str points: int published_date: str edited_date: str text: str + link: str comment_url: str error: str From 65ac1bf24249318a3fd2f48c0cabb5d04109c830 
Mon Sep 17 00:00:00 2001 From: Julien Pontoire Date: Wed, 8 Jan 2025 16:44:50 +0100 Subject: [PATCH 24/47] refacto and use of posts with the name of the subreddit --- minet/cli/reddit/__init__.py | 2 ++ minet/reddit/scraper.py | 27 +++++++++++++++++---------- 2 files changed, 19 insertions(+), 10 deletions(-) diff --git a/minet/cli/reddit/__init__.py b/minet/cli/reddit/__init__.py index cea5e8b8fb..55bd6e2050 100644 --- a/minet/cli/reddit/__init__.py +++ b/minet/cli/reddit/__init__.py @@ -19,6 +19,8 @@ . Searching posts from the subreddit r/france: $ minet reddit posts https://www.reddit.com/r/france > r_france_posts.csv + $ minet reddit posts france > r_france_posts.csv + $ minet reddit posts r/france > r_france_posts.csv """, variadic_input={ "dummy_column": "subreddit", diff --git a/minet/reddit/scraper.py b/minet/reddit/scraper.py index b895ca6b8c..645ce66b21 100644 --- a/minet/reddit/scraper.py +++ b/minet/reddit/scraper.py @@ -1,16 +1,17 @@ -from minet.web import request, create_pool_manager from math import ceil -from ural import get_domain_name, urlpathsplit, is_url +import re from time import sleep +from ural import get_domain_name, urlpathsplit, is_url +from urllib.parse import urljoin + +from minet.reddit.exceptions import RedditInvalidTargetError from minet.reddit.types import ( RedditPost, RedditComment, RedditUserPost, RedditUserComment, ) -from minet.reddit.exceptions import RedditInvalidTargetError -import re -from urllib.parse import urljoin +from minet.web import request, create_pool_manager def resolve_relative_url(path): @@ -20,13 +21,19 @@ def resolve_relative_url(path): def get_old_url(url): domain = get_domain_name(url) path = urlpathsplit(url) - return f"https://old.{domain}/" + "/".join(path) + "/" + old_url = f"https://old.{domain}" + for ele in path: + old_url = urljoin(old_url, f"{ele}/") + return old_url def get_new_url(url): domain = get_domain_name(url) path = urlpathsplit(url) - return f"https://www.{domain}/" + "/".join(path) + "/" + new_url = f"https://old.{domain}" + for ele in path: + new_url = urljoin(new_url, f"{ele}/") + return new_url def get_url_from_subreddit(name: str): @@ -34,8 +41,8 @@ def get_url_from_subreddit(name: str): return name name = name.lstrip("/") if name.startswith("r/"): - return "https://old.reddit.com/" + name - return "https://old.reddit.com/r/" + name + return urljoin("https://old.reddit.com/", name) + return urljoin("https://old.reddit.com/r/", name) def reddit_request(url, pool_manager): @@ -265,7 +272,7 @@ def get_comments(self, url: str, all): def get_general_post(self, url: str, type: str, add_text: bool, nb=25): nb_pages = ceil(int(nb) / 25) n_crawled = 0 - old_url = get_old_url(url) + old_url = get_old_url(get_url_from_subreddit(url)) for _ in range(nb_pages): if n_crawled == int(nb) or not old_url: break From 30932a25f680a8cc1a5745fb9408ce2eb488a657 Mon Sep 17 00:00:00 2001 From: Julien Pontoire Date: Thu, 9 Jan 2025 11:50:36 +0100 Subject: [PATCH 25/47] Fixing typo --- minet/cli/reddit/__init__.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/minet/cli/reddit/__init__.py b/minet/cli/reddit/__init__.py index 55bd6e2050..c6e6a295fb 100644 --- a/minet/cli/reddit/__init__.py +++ b/minet/cli/reddit/__init__.py @@ -12,7 +12,7 @@ "minet.cli.reddit.posts", title="Minet Reddit Posts Command", description=""" - Retrieve reddit posts from a subreddit link. + Retrieve reddit posts from a subreddit link or name. """, epilog=""" Example: @@ -80,7 +80,7 @@ Example: . 
Searching posts from the user page of u/random_user: - $ minet reddit posts https://www.reddit.com/user/random_user/submitted/ > random_user_posts.csv + $ minet reddit user_posts https://www.reddit.com/user/random_user/submitted/ > random_user_posts.csv """, variadic_input={ "dummy_column": "user", @@ -111,8 +111,8 @@ epilog=""" Example: - . Searching posts from the user page of u/random_user: - $ minet reddit posts https://www.reddit.com/user/random_user/comments/ > random_user_comments.csv + . Searching comments from the user page of u/random_user: + $ minet reddit user_comments https://www.reddit.com/user/random_user/comments/ > random_user_comments.csv """, variadic_input={ "dummy_column": "user", @@ -122,7 +122,7 @@ arguments=[ { "flags": ["-n", "--number"], - "help": "Number of posts to retrieve.", + "help": "Number of comments to retrieve.", "type": int, }, ], From 2e2abfcc9b94fa18292d6d73c88e0677bb43da4d Mon Sep 17 00:00:00 2001 From: Julien Pontoire Date: Thu, 9 Jan 2025 11:52:17 +0100 Subject: [PATCH 26/47] Fixing typo --- minet/cli/reddit/comments.py | 2 +- minet/cli/reddit/posts.py | 12 ++++++++---- minet/cli/reddit/user_comments.py | 2 +- minet/cli/reddit/user_posts.py | 12 ++++++++---- 4 files changed, 18 insertions(+), 10 deletions(-) diff --git a/minet/cli/reddit/comments.py b/minet/cli/reddit/comments.py index d41c7b9699..c5232b1a63 100644 --- a/minet/cli/reddit/comments.py +++ b/minet/cli/reddit/comments.py @@ -13,7 +13,7 @@ @with_enricher_and_loading_bar( headers=RedditComment, title="Scraping comments", - unit="groups", + unit="pages", nested=True, sub_unit="comments", ) diff --git a/minet/cli/reddit/posts.py b/minet/cli/reddit/posts.py index d73ef812f1..a3e8738d9c 100644 --- a/minet/cli/reddit/posts.py +++ b/minet/cli/reddit/posts.py @@ -13,14 +13,14 @@ @with_enricher_and_loading_bar( headers=RedditPost, title="Scraping posts", - unit="groups", + unit="pages", nested=True, sub_unit="posts", ) def action(cli_args, enricher, loading_bar): scraper = RedditScraper() - type_page = 'subreddit' + type_page = "subreddit" for i, row, url in enricher.enumerate_cells( cli_args.column, with_rows=True, start=1 @@ -29,9 +29,13 @@ def action(cli_args, enricher, loading_bar): try: if cli_args.number: if cli_args.text: - posts = scraper.get_general_post(url, type_page, True, cli_args.number) + posts = scraper.get_general_post( + url, type_page, True, cli_args.number + ) else: - posts = scraper.get_general_post(url, type_page, False, cli_args.number) + posts = scraper.get_general_post( + url, type_page, False, cli_args.number + ) else: if cli_args.text: posts = scraper.get_general_post(url, type_page, True) diff --git a/minet/cli/reddit/user_comments.py b/minet/cli/reddit/user_comments.py index c2e48ccefd..b25022303c 100644 --- a/minet/cli/reddit/user_comments.py +++ b/minet/cli/reddit/user_comments.py @@ -13,7 +13,7 @@ @with_enricher_and_loading_bar( headers=RedditUserComment, title="Scraping user comments", - unit="groups", + unit="pages", nested=True, sub_unit="comments", ) diff --git a/minet/cli/reddit/user_posts.py b/minet/cli/reddit/user_posts.py index d55abda1b3..1d1b176137 100644 --- a/minet/cli/reddit/user_posts.py +++ b/minet/cli/reddit/user_posts.py @@ -13,14 +13,14 @@ @with_enricher_and_loading_bar( headers=RedditUserPost, title="Scraping user posts", - unit="groups", + unit="pages", nested=True, sub_unit="posts", ) def action(cli_args, enricher, loading_bar): scraper = RedditScraper() - type_page = 'user' + type_page = "user" for i, row, url in enricher.enumerate_cells( 
cli_args.column, with_rows=True, start=1 @@ -29,9 +29,13 @@ def action(cli_args, enricher, loading_bar): try: if cli_args.number: if cli_args.text: - posts = scraper.get_general_post(url, type_page, True, cli_args.number) + posts = scraper.get_general_post( + url, type_page, True, cli_args.number + ) else: - posts = scraper.get_general_post(url, type_page, False, cli_args.number) + posts = scraper.get_general_post( + url, type_page, False, cli_args.number + ) else: if cli_args.text: posts = scraper.get_general_post(url, type_page, True) From 39700b0b38d476c99665ca08d7d1b65be7ba733d Mon Sep 17 00:00:00 2001 From: Julien Pontoire Date: Thu, 9 Jan 2025 11:57:40 +0100 Subject: [PATCH 27/47] Fixing error in get_new_url --- minet/reddit/scraper.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/minet/reddit/scraper.py b/minet/reddit/scraper.py index 645ce66b21..03bdcd22f2 100644 --- a/minet/reddit/scraper.py +++ b/minet/reddit/scraper.py @@ -30,7 +30,7 @@ def get_old_url(url): def get_new_url(url): domain = get_domain_name(url) path = urlpathsplit(url) - new_url = f"https://old.{domain}" + new_url = f"https://www.{domain}" for ele in path: new_url = urljoin(new_url, f"{ele}/") return new_url From 4d74228582dd82ee61efb8e4227819a3e5482606 Mon Sep 17 00:00:00 2001 From: Julien Pontoire Date: Thu, 9 Jan 2025 14:01:36 +0100 Subject: [PATCH 28/47] changes doc and kebab-case --- minet/cli/reddit/__init__.py | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/minet/cli/reddit/__init__.py b/minet/cli/reddit/__init__.py index c6e6a295fb..2d6c504feb 100644 --- a/minet/cli/reddit/__init__.py +++ b/minet/cli/reddit/__init__.py @@ -24,8 +24,8 @@ """, variadic_input={ "dummy_column": "subreddit", - "item_label": "subreddit url, subreddit shortcode or subreddit id", - "item_label_plural": "subreddit urls, subreddit shortcodes or subreddits ids", + "item_label": "subreddit url, shortcode or id", + "item_label_plural": "subreddit urls, shortcodes or ids", }, arguments=[ { @@ -57,8 +57,8 @@ """, variadic_input={ "dummy_column": "post", - "item_label": "post url, post shortcode or post id", - "item_label_plural": "posts urls, posts shortcodes or posts ids", + "item_label": "post url, shortcode or id", + "item_label_plural": "posts urls, shortcodes or ids", }, arguments=[ { @@ -70,7 +70,7 @@ ) REDDIT_USER_POSTS_SUBCOMMAND = command( - "user_posts", + "user-posts", "minet.cli.reddit.user_posts", title="Minet Reddit User Posts Command", description=""" @@ -80,12 +80,12 @@ Example: . Searching posts from the user page of u/random_user: - $ minet reddit user_posts https://www.reddit.com/user/random_user/submitted/ > random_user_posts.csv + $ minet reddit user-posts https://www.reddit.com/user/random_user/submitted/ > random_user_posts.csv """, variadic_input={ "dummy_column": "user", - "item_label": "user url, user shortcode or user id", - "item_label_plural": "user urls, user shortcodes or user ids", + "item_label": "user url, shortcode or id", + "item_label_plural": "user urls, shortcodes or ids", }, arguments=[ { @@ -102,7 +102,7 @@ ) REDDIT_USER_COMMENTS_SUBCOMMAND = command( - "user_comments", + "user-comments", "minet.cli.reddit.user_comments", title="Minet Reddit User Comments Command", description=""" @@ -112,12 +112,12 @@ Example: . 
Searching comments from the user page of u/random_user: - $ minet reddit user_comments https://www.reddit.com/user/random_user/comments/ > random_user_comments.csv + $ minet reddit user-comments https://www.reddit.com/user/random_user/comments/ > random_user_comments.csv """, variadic_input={ "dummy_column": "user", - "item_label": "user url, user shortcode or user id", - "item_label_plural": "user urls, user shortcodes or user ids", + "item_label": "user url, shortcode or id", + "item_label_plural": "user urls, shortcodes or ids", }, arguments=[ { From 6e32569333ac837c33c03d4b26353a0b92baed9a Mon Sep 17 00:00:00 2001 From: Julien Pontoire Date: Thu, 9 Jan 2025 14:16:40 +0100 Subject: [PATCH 29/47] removing print and sleep --- minet/reddit/scraper.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/minet/reddit/scraper.py b/minet/reddit/scraper.py index 03bdcd22f2..1f1f437b48 100644 --- a/minet/reddit/scraper.py +++ b/minet/reddit/scraper.py @@ -46,7 +46,6 @@ def get_url_from_subreddit(name: str): def reddit_request(url, pool_manager): - sleep(1) response = request(url, pool_manager=pool_manager) soup = response.soup() if response.status == 500 and soup.scrape_one("img", "alt") == "you broke reddit": @@ -60,7 +59,6 @@ def reddit_request(url, pool_manager): remaining_requests = float(response.headers["x-ratelimit-remaining"]) if remaining_requests == 1: time_remaining = int(response.headers["x-ratelimit-reset"]) - print(f"Time before next request : {time_remaining}s") sleep(time_remaining) return reddit_request(url) if response.status == 429: From a49918b7e45538ecf611b48e563a3ac9d881b464 Mon Sep 17 00:00:00 2001 From: Julien Pontoire Date: Thu, 9 Jan 2025 14:21:01 +0100 Subject: [PATCH 30/47] Avoid stack overflow error --- minet/reddit/scraper.py | 41 +++++++++++++++++++++++------------------ 1 file changed, 23 insertions(+), 18 deletions(-) diff --git a/minet/reddit/scraper.py b/minet/reddit/scraper.py index 1f1f437b48..27c6c158e4 100644 --- a/minet/reddit/scraper.py +++ b/minet/reddit/scraper.py @@ -46,24 +46,29 @@ def get_url_from_subreddit(name: str): def reddit_request(url, pool_manager): - response = request(url, pool_manager=pool_manager) - soup = response.soup() - if response.status == 500 and soup.scrape_one("img", "alt") == "you broke reddit": - return response, soup, "broken page" - if response.status == 404 and soup.scrape_one("img", "alt") == "banned": - return response, soup, "banned" - if response.status == 404 or ( - soup.scrape("p[id='noresults']") and not soup.scrape("div[class='commentarea']") - ): - raise RedditInvalidTargetError - remaining_requests = float(response.headers["x-ratelimit-remaining"]) - if remaining_requests == 1: - time_remaining = int(response.headers["x-ratelimit-reset"]) - sleep(time_remaining) - return reddit_request(url) - if response.status == 429: - return reddit_request(url) - return response, soup, None + while True: + response = request(url, pool_manager=pool_manager) + soup = response.soup() + if ( + response.status == 500 + and soup.scrape_one("img", "alt") == "you broke reddit" + ): + return response, soup, "broken page" + if response.status == 404 and soup.scrape_one("img", "alt") == "banned": + return response, soup, "banned" + if response.status == 404 or ( + soup.scrape("p[id='noresults']") + and not soup.scrape("div[class='commentarea']") + ): + raise RedditInvalidTargetError + remaining_requests = float(response.headers["x-ratelimit-remaining"]) + if remaining_requests == 1: + time_remaining = 
int(response.headers["x-ratelimit-reset"]) + sleep(time_remaining) + continue + if response.status == 429: + continue + return response, soup, None def extract_t1_ids(text): From 318955864f3ad84abe08fd0a084d24d38107c153 Mon Sep 17 00:00:00 2001 From: Julien Pontoire Date: Thu, 9 Jan 2025 15:25:21 +0100 Subject: [PATCH 31/47] refacto --- minet/cli/reddit/comments.py | 5 +---- minet/cli/reddit/posts.py | 17 +++++------------ minet/cli/reddit/user_posts.py | 17 +++++------------ 3 files changed, 11 insertions(+), 28 deletions(-) diff --git a/minet/cli/reddit/comments.py b/minet/cli/reddit/comments.py index c5232b1a63..853175e8e4 100644 --- a/minet/cli/reddit/comments.py +++ b/minet/cli/reddit/comments.py @@ -25,10 +25,7 @@ def action(cli_args, enricher, loading_bar): ): with loading_bar.step(url): try: - if cli_args.all: - comments = scraper.get_comments(url, True) - else: - comments = scraper.get_comments(url, False) + comments = scraper.get_comments(url, cli_args.all) except RedditInvalidTargetError: loading_bar.print( diff --git a/minet/cli/reddit/posts.py b/minet/cli/reddit/posts.py index a3e8738d9c..91754946ce 100644 --- a/minet/cli/reddit/posts.py +++ b/minet/cli/reddit/posts.py @@ -28,19 +28,12 @@ def action(cli_args, enricher, loading_bar): with loading_bar.step(url): try: if cli_args.number: - if cli_args.text: - posts = scraper.get_general_post( - url, type_page, True, cli_args.number - ) - else: - posts = scraper.get_general_post( - url, type_page, False, cli_args.number - ) + posts = scraper.get_general_post( + url, type_page, cli_args.text, cli_args.number + ) else: - if cli_args.text: - posts = scraper.get_general_post(url, type_page, True) - else: - posts = scraper.get_general_post(url, type_page, False) + posts = scraper.get_general_post(url, type_page, cli_args.text) + except RedditInvalidTargetError: loading_bar.print( "the script could not complete normally on line %i" % (i) diff --git a/minet/cli/reddit/user_posts.py b/minet/cli/reddit/user_posts.py index 1d1b176137..408d217319 100644 --- a/minet/cli/reddit/user_posts.py +++ b/minet/cli/reddit/user_posts.py @@ -28,19 +28,12 @@ def action(cli_args, enricher, loading_bar): with loading_bar.step(url): try: if cli_args.number: - if cli_args.text: - posts = scraper.get_general_post( - url, type_page, True, cli_args.number - ) - else: - posts = scraper.get_general_post( - url, type_page, False, cli_args.number - ) + posts = scraper.get_general_post( + url, type_page, cli_args.text, cli_args.number + ) else: - if cli_args.text: - posts = scraper.get_general_post(url, type_page, True) - else: - posts = scraper.get_general_post(url, type_page, False) + posts = scraper.get_general_post(url, type_page, cli_args.text) + except RedditInvalidTargetError: loading_bar.print( "the script could not complete normally on line %i" % (i) From fa3bc2847073678bbfa657890da13f3aedcb7d55 Mon Sep 17 00:00:00 2001 From: Julien Pontoire Date: Thu, 9 Jan 2025 17:49:21 +0100 Subject: [PATCH 32/47] changing -n, --number to -l, --limit and fixing errors with comments --- minet/cli/reddit/__init__.py | 12 +++++----- minet/cli/reddit/posts.py | 9 +++---- minet/cli/reddit/user_comments.py | 5 +--- minet/cli/reddit/user_posts.py | 9 +++---- minet/reddit/scraper.py | 39 +++++++++++++++++-------------- 5 files changed, 34 insertions(+), 40 deletions(-) diff --git a/minet/cli/reddit/__init__.py b/minet/cli/reddit/__init__.py index 2d6c504feb..6ab6acf884 100644 --- a/minet/cli/reddit/__init__.py +++ b/minet/cli/reddit/__init__.py @@ -29,8 +29,8 @@ }, arguments=[ { 
- "flags": ["-n", "--number"], - "help": "Number of posts to retrieve.", + "flags": ["-l", "--limit"], + "help": "Maximum number of posts to retrieve.", "type": int, }, { @@ -89,8 +89,8 @@ }, arguments=[ { - "flags": ["-n", "--number"], - "help": "Number of posts to retrieve.", + "flags": ["-l", "--limit"], + "help": "Maximum number of posts to retrieve.", "type": int, }, { @@ -121,8 +121,8 @@ }, arguments=[ { - "flags": ["-n", "--number"], - "help": "Number of comments to retrieve.", + "flags": ["-l", "--limit"], + "help": "Maximum number of comments to retrieve.", "type": int, }, ], diff --git a/minet/cli/reddit/posts.py b/minet/cli/reddit/posts.py index 91754946ce..2111972b98 100644 --- a/minet/cli/reddit/posts.py +++ b/minet/cli/reddit/posts.py @@ -27,12 +27,9 @@ def action(cli_args, enricher, loading_bar): ): with loading_bar.step(url): try: - if cli_args.number: - posts = scraper.get_general_post( - url, type_page, cli_args.text, cli_args.number - ) - else: - posts = scraper.get_general_post(url, type_page, cli_args.text) + posts = scraper.get_general_post( + url, type_page, cli_args.text, cli_args.limit + ) except RedditInvalidTargetError: loading_bar.print( diff --git a/minet/cli/reddit/user_comments.py b/minet/cli/reddit/user_comments.py index b25022303c..e7a3e5a02b 100644 --- a/minet/cli/reddit/user_comments.py +++ b/minet/cli/reddit/user_comments.py @@ -25,10 +25,7 @@ def action(cli_args, enricher, loading_bar): ): with loading_bar.step(url): try: - if cli_args.number: - posts = scraper.get_user_comments(url, cli_args.number) - else: - posts = scraper.get_user_comments(url) + posts = scraper.get_user_comments(url, cli_args.limit) except RedditInvalidTargetError: loading_bar.print( diff --git a/minet/cli/reddit/user_posts.py b/minet/cli/reddit/user_posts.py index 408d217319..ea950b09c5 100644 --- a/minet/cli/reddit/user_posts.py +++ b/minet/cli/reddit/user_posts.py @@ -27,12 +27,9 @@ def action(cli_args, enricher, loading_bar): ): with loading_bar.step(url): try: - if cli_args.number: - posts = scraper.get_general_post( - url, type_page, cli_args.text, cli_args.number - ) - else: - posts = scraper.get_general_post(url, type_page, cli_args.text) + posts = scraper.get_general_post( + url, type_page, cli_args.text, cli_args.limit + ) except RedditInvalidTargetError: loading_bar.print( diff --git a/minet/reddit/scraper.py b/minet/reddit/scraper.py index 27c6c158e4..d488002b8c 100644 --- a/minet/reddit/scraper.py +++ b/minet/reddit/scraper.py @@ -56,9 +56,10 @@ def reddit_request(url, pool_manager): return response, soup, "broken page" if response.status == 404 and soup.scrape_one("img", "alt") == "banned": return response, soup, "banned" - if response.status == 404 or ( - soup.scrape("p[id='noresults']") - and not soup.scrape("div[class='commentarea']") + if ( + soup.scrape_one("span.pagename.selected") == "page not found" + or "search?q=" in url + and soup.scrape_one("p.error") ): raise RedditInvalidTargetError remaining_requests = float(response.headers["x-ratelimit-remaining"]) @@ -116,7 +117,7 @@ def data_posts( error, ): try_author = post.select_one("a[class*='author']") - author = try_author.get_text() if try_author else "Deleted" + author = try_author.get_text() if try_author else "[Deleted]" if get_domain_name(link) == "reddit.com": link = "" data = RedditPost( @@ -216,10 +217,14 @@ def get_comments(self, url: str, all): while m_comments: parent, com = m_comments.pop() current_id = get_current_id(com) - comment_url = com.scrape_one("a[class='bylink']", "href") - try_author = 
com.scrape_one("a[class^='author']") - author = try_author if try_author else "Deleted" - points = get_points(com) + if com.get("class") == " thing noncollapsed deleted comment ": + comment_url = None + author = "[Deleted]" + points = None + else: + comment_url = com.scrape_one("a[class='bylink']", "href") + author = com.scrape_one("a[class^='author']") + points = get_points(com) published_date, edited_date = get_dates(com) if "morerecursion" in com.get("class") and all: url_rec = f"https://old.reddit.com{com.scrape_one('a', 'href')}" @@ -272,17 +277,16 @@ def get_comments(self, url: str, all): if data.id != "": yield data - def get_general_post(self, url: str, type: str, add_text: bool, nb=25): - nb_pages = ceil(int(nb) / 25) + def get_general_post(self, url: str, type: str, add_text: bool, limit: int): n_crawled = 0 old_url = get_old_url(get_url_from_subreddit(url)) - for _ in range(nb_pages): - if n_crawled == int(nb) or not old_url: + while old_url and (limit is None or n_crawled < limit): + if limit is not None and n_crawled == limit: break _, soup, error = reddit_request(old_url, self.pool_manager) posts = soup.select("div[id^='thing_t3_']") for post in posts: - if n_crawled == int(nb): + if limit is not None and n_crawled == limit: break list_buttons = post.select_one("ul[class='flat-list buttons']") if len(list_buttons.scrape("span[class='promoted-span']")) == 0: @@ -380,12 +384,11 @@ def get_general_post(self, url: str, type: str, add_text: bool, nb=25): n_crawled += 1 old_url = soup.scrape_one("span[class='next-button'] a", "href") - def get_user_comments(self, url: str, nb=25): - nb_pages = ceil(int(nb) / 25) + def get_user_comments(self, url: str, limit: int): n_crawled = 0 old_url = get_old_url(url) - for _ in range(nb_pages): - if n_crawled == int(nb): + while old_url and (limit is None or n_crawled < limit): + if limit is not None and n_crawled == limit: break _, soup, error = reddit_request(old_url, self.pool_manager) if error: @@ -403,7 +406,7 @@ def get_user_comments(self, url: str, nb=25): else: comments = soup.select("[data-type='comment']") for comment in comments: - if n_crawled == int(nb): + if limit is not None and n_crawled == limit: break post_title = comment.scrape_one("a[class='title']") post_url = comment.scrape_one("a[class='bylink may-blank']", "href") From 2f51fb60f7c99953d3d134626ed8203b50b8ee6d Mon Sep 17 00:00:00 2001 From: Julien Pontoire Date: Thu, 9 Jan 2025 17:54:00 +0100 Subject: [PATCH 33/47] Fixing gh-tests error --- minet/reddit/scraper.py | 1 - 1 file changed, 1 deletion(-) diff --git a/minet/reddit/scraper.py b/minet/reddit/scraper.py index d488002b8c..b49e023b9c 100644 --- a/minet/reddit/scraper.py +++ b/minet/reddit/scraper.py @@ -1,4 +1,3 @@ -from math import ceil import re from time import sleep from ural import get_domain_name, urlpathsplit, is_url From 5a42b15b6158b77e7af8d2a48eea8bf70b0c5384 Mon Sep 17 00:00:00 2001 From: Julien Pontoire Date: Fri, 10 Jan 2025 15:13:45 +0100 Subject: [PATCH 34/47] Fixing comments and handling detection --- minet/reddit/scraper.py | 93 +++++++++++++++++++++++++++++------------ 1 file changed, 67 insertions(+), 26 deletions(-) diff --git a/minet/reddit/scraper.py b/minet/reddit/scraper.py index b49e023b9c..05fdfb6df5 100644 --- a/minet/reddit/scraper.py +++ b/minet/reddit/scraper.py @@ -1,3 +1,4 @@ +from random import choice import re from time import sleep from ural import get_domain_name, urlpathsplit, is_url @@ -12,6 +13,28 @@ ) from minet.web import request, create_pool_manager +USER_AGENTS = [ + 
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:124.0) Gecko/20100101 Firefox/124.0", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36 Edg/123.0.2420.81", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36 OPR/109.0.0.0", + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36", + "Mozilla/5.0 (Macintosh; Intel Mac OS X 14.4; rv:124.0) Gecko/20100101 Firefox/124.0", + "Mozilla/5.0 (Macintosh; Intel Mac OS X 14_4_1) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.4.1 Safari/605.1.15", + "Mozilla/5.0 (Macintosh; Intel Mac OS X 14_4_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36 OPR/109.0.0.0", + "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36", + "Mozilla/5.0 (X11; Linux i686; rv:124.0) Gecko/20100101 Firefox/124.0", +] + + +def add_slash(url: str): + path = url.split("/") + if path[-1] == "?limit=500": + return url + elif url[-1] != "/": + return url + "/" + return url + def resolve_relative_url(path): return urljoin("https://old.reddit.com", path) @@ -46,8 +69,16 @@ def get_url_from_subreddit(name: str): def reddit_request(url, pool_manager): while True: - response = request(url, pool_manager=pool_manager) + response = request( + add_slash(url), + pool_manager=pool_manager, + headers={"User-Agent": choice(USER_AGENTS)}, + ) soup = response.soup() + remaining_requests = float(response.headers["x-ratelimit-remaining"]) + if response.status == 429 and remaining_requests > 1: + sleep(1) + continue if ( response.status == 500 and soup.scrape_one("img", "alt") == "you broke reddit" @@ -61,13 +92,10 @@ def reddit_request(url, pool_manager): and soup.scrape_one("p.error") ): raise RedditInvalidTargetError - remaining_requests = float(response.headers["x-ratelimit-remaining"]) - if remaining_requests == 1: + if remaining_requests == 1 or response.status == 429: time_remaining = int(response.headers["x-ratelimit-reset"]) sleep(time_remaining) continue - if response.status == 429: - continue return response, soup, None @@ -172,23 +200,27 @@ def __init__(self): def get_childs_l500(self, url, list_comments, parent_id): _, soup, _ = reddit_request(url, self.pool_manager) comments = soup.select("div[class='commentarea']>div>div[class*='comment']") - for com in comments: - child = com.find("div", class_="child") - if child.text != "": - child = child.find("div") - child_com = child.find_all( - "div", - class_=lambda x: x - and ( - "comment" in x - or "deleted comment" in x - or "morerecursion" in x - or "morechildren" in x - ), - recursive=False, - ) - for ele in child_com: - list_comments.append((parent_id, ele)) + if parent_id == None: + for com in comments: + list_comments.append((None, com)) + else: + for com in comments: + child = com.find("div", class_="child") + if child.text != "": + child = child.find("div") + child_com = child.find_all( + "div", + class_=lambda x: x + and ( + "comment" in x + or "deleted comment" in x + or "morerecursion" in x + or "morechildren" in x + ), + recursive=False, + ) + for ele in child_com: + list_comments.append((parent_id, ele)) return list_comments def get_comments(self, url: str, all): @@ -211,13 +243,22 @@ def get_comments(self, url: str, all): first_comments = soup.select( 
"div[class='commentarea']>div>div[class*='comment']" ) + if all: + more = soup.select("div.commentarea>div>div[class*='morechildren']") + for ele in more: + a = ele.select_one("a") + onclick = a["onclick"] + id_list = extract_t1_ids(onclick) + for id in id_list: + comment_url = f"{old_url}{id}/" + m_comments = self.get_childs_l500(comment_url, m_comments, None) for ele in first_comments: m_comments.append((None, ele)) while m_comments: parent, com = m_comments.pop() current_id = get_current_id(com) - if com.get("class") == " thing noncollapsed deleted comment ": - comment_url = None + if "deleted comment" in com.get("class"): + comment_url = com.get("data-permalink") author = "[Deleted]" points = None else: @@ -233,7 +274,7 @@ def get_comments(self, url: str, all): onclick = a["onclick"] id_list = extract_t1_ids(onclick) for id in id_list: - comment_url = f"{old_url}{id}" + comment_url = f"{old_url}{id}/" m_comments = self.get_childs_l500( comment_url, m_comments, current_id ) @@ -263,7 +304,7 @@ def get_comments(self, url: str, all): for ele in child_com: m_comments.append((current_id, ele)) data = RedditComment( - comment_url=get_new_url(comment_url) if comment_url else None, + comment_url=get_new_url(resolve_relative_url(comment_url)), author=author, id=current_id, parent=parent, From 7b9bb8c9b0f798b3a7b17f3edd42e4cb2b6e9fae Mon Sep 17 00:00:00 2001 From: Julien Pontoire Date: Fri, 10 Jan 2025 16:06:52 +0100 Subject: [PATCH 35/47] adding use of spoof-ua --- minet/reddit/scraper.py | 19 +++---------------- 1 file changed, 3 insertions(+), 16 deletions(-) diff --git a/minet/reddit/scraper.py b/minet/reddit/scraper.py index 05fdfb6df5..59ca1842c5 100644 --- a/minet/reddit/scraper.py +++ b/minet/reddit/scraper.py @@ -13,19 +13,6 @@ ) from minet.web import request, create_pool_manager -USER_AGENTS = [ - "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36", - "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:124.0) Gecko/20100101 Firefox/124.0", - "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36 Edg/123.0.2420.81", - "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36 OPR/109.0.0.0", - "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36", - "Mozilla/5.0 (Macintosh; Intel Mac OS X 14.4; rv:124.0) Gecko/20100101 Firefox/124.0", - "Mozilla/5.0 (Macintosh; Intel Mac OS X 14_4_1) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.4.1 Safari/605.1.15", - "Mozilla/5.0 (Macintosh; Intel Mac OS X 14_4_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36 OPR/109.0.0.0", - "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36", - "Mozilla/5.0 (X11; Linux i686; rv:124.0) Gecko/20100101 Firefox/124.0", -] - def add_slash(url: str): path = url.split("/") @@ -72,7 +59,7 @@ def reddit_request(url, pool_manager): response = request( add_slash(url), pool_manager=pool_manager, - headers={"User-Agent": choice(USER_AGENTS)}, + spoof_ua=True, ) soup = response.soup() remaining_requests = float(response.headers["x-ratelimit-remaining"]) @@ -200,7 +187,7 @@ def __init__(self): def get_childs_l500(self, url, list_comments, parent_id): _, soup, _ = reddit_request(url, self.pool_manager) comments = soup.select("div[class='commentarea']>div>div[class*='comment']") - if parent_id == None: + if parent_id is 
None: for com in comments: list_comments.append((None, com)) else: @@ -257,7 +244,7 @@ def get_comments(self, url: str, all): while m_comments: parent, com = m_comments.pop() current_id = get_current_id(com) - if "deleted comment" in com.get("class"): + if "deleted" in com.get("class") and "comment" in com.get("class"): comment_url = com.get("data-permalink") author = "[Deleted]" points = None From bf8aee986c2caadaf4997643beb0fdaa84eecee5 Mon Sep 17 00:00:00 2001 From: Julien Pontoire Date: Fri, 10 Jan 2025 16:10:33 +0100 Subject: [PATCH 36/47] Fixing tests --- minet/reddit/scraper.py | 1 - 1 file changed, 1 deletion(-) diff --git a/minet/reddit/scraper.py b/minet/reddit/scraper.py index 59ca1842c5..e3932e8b02 100644 --- a/minet/reddit/scraper.py +++ b/minet/reddit/scraper.py @@ -1,4 +1,3 @@ -from random import choice import re from time import sleep from ural import get_domain_name, urlpathsplit, is_url From c28763c5a8a2e10e2cc2f38e0ca6ecaa9aff5848 Mon Sep 17 00:00:00 2001 From: Julien Pontoire Date: Fri, 10 Jan 2025 17:21:18 +0100 Subject: [PATCH 37/47] Fixing error with deleted accounts --- minet/reddit/scraper.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/minet/reddit/scraper.py b/minet/reddit/scraper.py index e3932e8b02..801e72fa2d 100644 --- a/minet/reddit/scraper.py +++ b/minet/reddit/scraper.py @@ -249,7 +249,10 @@ def get_comments(self, url: str, all): points = None else: comment_url = com.scrape_one("a[class='bylink']", "href") - author = com.scrape_one("a[class^='author']") + try_author = com.select_one("div.entry.unvoted") + author = try_author.scrape_one("a[class^='author']") + if not author: + author = "[Deleted]" points = get_points(com) published_date, edited_date = get_dates(com) if "morerecursion" in com.get("class") and all: From 622fc2437e27e1631216bb9518ca6046b1a41289 Mon Sep 17 00:00:00 2001 From: Julien Pontoire Date: Fri, 10 Jan 2025 17:30:44 +0100 Subject: [PATCH 38/47] Compiling the regex outside the function --- minet/reddit/scraper.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/minet/reddit/scraper.py b/minet/reddit/scraper.py index 801e72fa2d..595d1da82a 100644 --- a/minet/reddit/scraper.py +++ b/minet/reddit/scraper.py @@ -12,6 +12,8 @@ ) from minet.web import request, create_pool_manager +ID_RE = re.compile(r"t1_(\w+)") + def add_slash(url: str): path = url.split("/") @@ -86,8 +88,7 @@ def reddit_request(url, pool_manager): def extract_t1_ids(text): - pattern = r"t1_(\w+)" - return [match.group(1) for match in re.finditer(pattern, text)] + return [match.group(1) for match in re.finditer(ID_RE, text)] def get_current_id(com): From 2fdfb61bdd3e0b5af181803cf31adf3b32bb2372 Mon Sep 17 00:00:00 2001 From: Julien Pontoire Date: Fri, 10 Jan 2025 17:45:38 +0100 Subject: [PATCH 39/47] refacto --- minet/reddit/scraper.py | 38 ++++++++++++++++---------------------- 1 file changed, 16 insertions(+), 22 deletions(-) diff --git a/minet/reddit/scraper.py b/minet/reddit/scraper.py index 595d1da82a..34442609e1 100644 --- a/minet/reddit/scraper.py +++ b/minet/reddit/scraper.py @@ -113,7 +113,7 @@ def get_points(ele): def get_dates(ele): published_date = ele.scrape_one("time", "datetime") - edited_date = ele.scrape_one("time[class='edited-timestamp']", "datetime") + edited_date = ele.scrape_one("time.edited-timestamp", "datetime") return published_date, edited_date @@ -186,7 +186,7 @@ def __init__(self): def get_childs_l500(self, url, list_comments, parent_id): _, soup, _ = reddit_request(url, self.pool_manager) - comments = 
soup.select("div[class='commentarea']>div>div[class*='comment']") + comments = soup.select("div.commentarea>div>div[class*='comment']") if parent_id is None: for com in comments: list_comments.append((None, com)) @@ -227,9 +227,7 @@ def get_comments(self, url: str, all): error=error, ) else: - first_comments = soup.select( - "div[class='commentarea']>div>div[class*='comment']" - ) + first_comments = soup.select("div.commentarea>div>div[class*='comment']") if all: more = soup.select("div.commentarea>div>div[class*='morechildren']") for ele in more: @@ -249,7 +247,7 @@ def get_comments(self, url: str, all): author = "[Deleted]" points = None else: - comment_url = com.scrape_one("a[class='bylink']", "href") + comment_url = com.scrape_one("a.bylink", "href") try_author = com.select_one("div.entry.unvoted") author = try_author.scrape_one("a[class^='author']") if not author: @@ -301,7 +299,7 @@ def get_comments(self, url: str, all): points=points, published_date=published_date, edited_date=edited_date, - comment=com.scrape_one("div[class='md']:not(div.child a)"), + comment=com.scrape_one("div.md:not(div.child a)"), error=error, ) if data.id != "": @@ -318,8 +316,8 @@ def get_general_post(self, url: str, type: str, add_text: bool, limit: int): for post in posts: if limit is not None and n_crawled == limit: break - list_buttons = post.select_one("ul[class='flat-list buttons']") - if len(list_buttons.scrape("span[class='promoted-span']")) == 0: + list_buttons = post.select_one("ul.flat-list.buttons") + if len(list_buttons.scrape("span.promoted-span")) == 0: title = post.force_select_one("a[class*='title']").get_text() post_url = list_buttons.scrape_one( "a[class^='bylink comments']", "href" @@ -373,7 +371,7 @@ def get_general_post(self, url: str, type: str, add_text: bool, limit: int): text_error, ) try_content = text_soup.select_one( - "div[id='siteTable'] div[class^='usertext']" + "div#siteTable div[class^='usertext']" ) if try_content: content = try_content.get_text() @@ -412,7 +410,7 @@ def get_general_post(self, url: str, type: str, add_text: bool, limit: int): yield post n_crawled += 1 - old_url = soup.scrape_one("span[class='next-button'] a", "href") + old_url = soup.scrape_one("span.next-button a", "href") def get_user_comments(self, url: str, limit: int): n_crawled = 0 @@ -438,19 +436,15 @@ def get_user_comments(self, url: str, limit: int): for comment in comments: if limit is not None and n_crawled == limit: break - post_title = comment.scrape_one("a[class='title']") - post_url = comment.scrape_one("a[class='bylink may-blank']", "href") - post_author = comment.scrape_one( - "p[class='parent']>a[class^='author']" - ) + post_title = comment.scrape_one("a.title") + post_url = comment.scrape_one("a.bylink.may-blank", "href") + post_author = comment.scrape_one("p.parent>a[class^='author']") post_subreddit = comment.scrape_one("a[class^='subreddit']", "href") points = get_points(comment) published_date, edited_date = get_dates(comment) - text = comment.scrape_one("div[class='content'] div[class='md']") - link = comment.scrape_one( - "div[class='content'] div[class='md'] a", "href" - ) - comment_url = comment.scrape_one("a[class='bylink']", "href") + text = comment.scrape_one("div.content div.md") + link = comment.scrape_one("div.content div.md a", "href") + comment_url = comment.scrape_one("a.bylink", "href") data = RedditUserComment( post_title=post_title, post_url=get_new_url(post_url), @@ -466,4 +460,4 @@ def get_user_comments(self, url: str, limit: int): ) yield data n_crawled += 1 - 
old_url = soup.scrape_one("span[class='next-button'] a", "href") + old_url = soup.scrape_one("span.next-button a", "href") From 1e311fdd5d24bb0ca5b08a9701053c0855dd50aa Mon Sep 17 00:00:00 2001 From: Julien Pontoire Date: Mon, 13 Jan 2025 17:05:00 +0100 Subject: [PATCH 40/47] Fixing error with number of posts retrieved --- minet/reddit/scraper.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/minet/reddit/scraper.py b/minet/reddit/scraper.py index 34442609e1..577f8a296e 100644 --- a/minet/reddit/scraper.py +++ b/minet/reddit/scraper.py @@ -409,7 +409,7 @@ def get_general_post(self, url: str, type: str, add_text: bool, limit: int): ) yield post - n_crawled += 1 + n_crawled += 1 old_url = soup.scrape_one("span.next-button a", "href") def get_user_comments(self, url: str, limit: int): From b414d8a8b94997344122a0167b98260ce2f98af1 Mon Sep 17 00:00:00 2001 From: Julien Pontoire Date: Tue, 14 Jan 2025 12:07:54 +0100 Subject: [PATCH 41/47] Fixing bug with old posts --- minet/reddit/scraper.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/minet/reddit/scraper.py b/minet/reddit/scraper.py index 577f8a296e..3ba6a8654f 100644 --- a/minet/reddit/scraper.py +++ b/minet/reddit/scraper.py @@ -87,8 +87,11 @@ def reddit_request(url, pool_manager): return response, soup, None -def extract_t1_ids(text): - return [match.group(1) for match in re.finditer(ID_RE, text)] +def extract_t1_ids(text: str): + ids = [match.group(1) for match in re.finditer(ID_RE, text)] + if ids: + return ids + return text.split("'")[-4].split(",") def get_current_id(com): From 392f20bc382f26d072f1574634a74fa89e8b44d6 Mon Sep 17 00:00:00 2001 From: Julien Pontoire Date: Tue, 14 Jan 2025 14:00:58 +0100 Subject: [PATCH 42/47] fixing error with "?..." in url --- minet/reddit/scraper.py | 21 +++++---------------- 1 file changed, 5 insertions(+), 16 deletions(-) diff --git a/minet/reddit/scraper.py b/minet/reddit/scraper.py index 3ba6a8654f..6a436844b1 100644 --- a/minet/reddit/scraper.py +++ b/minet/reddit/scraper.py @@ -17,7 +17,7 @@ def add_slash(url: str): path = url.split("/") - if path[-1] == "?limit=500": + if path[-1][0] == "?": return url elif url[-1] != "/": return url + "/" @@ -29,21 +29,11 @@ def resolve_relative_url(path): def get_old_url(url): - domain = get_domain_name(url) - path = urlpathsplit(url) - old_url = f"https://old.{domain}" - for ele in path: - old_url = urljoin(old_url, f"{ele}/") - return old_url + return url.replace("www.reddit", "old.reddit") def get_new_url(url): - domain = get_domain_name(url) - path = urlpathsplit(url) - new_url = f"https://www.{domain}" - for ele in path: - new_url = urljoin(new_url, f"{ele}/") - return new_url + return url.replace("old.reddit", "www.reddit") def get_url_from_subreddit(name: str): @@ -133,14 +123,13 @@ def data_posts( link, error, ): - try_author = post.select_one("a[class*='author']") - author = try_author.get_text() if try_author else "[Deleted]" + author = post.scrape_one("a[class*='author']") if get_domain_name(link) == "reddit.com": link = "" data = RedditPost( title=title, url=get_new_url(url), - author=author, + author=author if author else "[Deleted]", author_text=author_text, points=points, scraped_number_comments=scraped_number_comments, From 9d4218ffd5fbbf380ad69e9e20bfc7ffcce6a83e Mon Sep 17 00:00:00 2001 From: Julien Pontoire Date: Tue, 14 Jan 2025 14:04:19 +0100 Subject: [PATCH 43/47] Fixing test error --- minet/reddit/scraper.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/minet/reddit/scraper.py b/minet/reddit/scraper.py index 6a436844b1..4a3d4fae52 100644 --- a/minet/reddit/scraper.py +++ b/minet/reddit/scraper.py @@ -1,6 +1,6 @@ import re from time import sleep -from ural import get_domain_name, urlpathsplit, is_url +from ural import get_domain_name, is_url from urllib.parse import urljoin from minet.reddit.exceptions import RedditInvalidTargetError From 8468cb848fe1eedcd0f3d4bb335f92be97197569 Mon Sep 17 00:00:00 2001 From: Julien Pontoire Date: Tue, 14 Jan 2025 14:17:25 +0100 Subject: [PATCH 44/47] refacto --- minet/reddit/scraper.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/minet/reddit/scraper.py b/minet/reddit/scraper.py index 4a3d4fae52..865a11a498 100644 --- a/minet/reddit/scraper.py +++ b/minet/reddit/scraper.py @@ -15,6 +15,7 @@ ID_RE = re.compile(r"t1_(\w+)") +# when missing a '/' at the end of an url, reddit will make a redirection and it will reduce by 2 the number of requests remaining def add_slash(url: str): path = url.split("/") if path[-1][0] == "?": @@ -94,11 +95,11 @@ def get_current_id(com): def get_points(ele): - scrapped_points = ele.select_one("[class='score unvoted']") - score_hidden = ele.select_one("[class='score-hidden']") + scrapped_points = ele.select_one(".score.unvoted") + score_hidden = ele.select_one(".score-hidden") if not scrapped_points and not score_hidden: return "deleted" - scrapped_points = ele.scrape_one("[class='score unvoted']", "title") + scrapped_points = ele.scrape_one(".score.unvoted", "title") if not scrapped_points: return "score hidden" return scrapped_points @@ -123,9 +124,9 @@ def data_posts( link, error, ): - author = post.scrape_one("a[class*='author']") - if get_domain_name(link) == "reddit.com": - link = "" + author = post.scrape_one("a.author") + if "reddit.com/" in link: + link = None data = RedditPost( title=title, url=get_new_url(url), From 8dc345c970a0f1600229addbf61ae067368fecc9 Mon Sep 17 00:00:00 2001 From: Julien Pontoire Date: Wed, 15 Jan 2025 10:36:18 +0100 Subject: [PATCH 45/47] refacto --- minet/reddit/scraper.py | 71 +++++++++++++++++++---------------------- 1 file changed, 32 insertions(+), 39 deletions(-) diff --git a/minet/reddit/scraper.py b/minet/reddit/scraper.py index 865a11a498..665fca65a6 100644 --- a/minet/reddit/scraper.py +++ b/minet/reddit/scraper.py @@ -1,6 +1,6 @@ import re from time import sleep -from ural import get_domain_name, is_url +from ural import is_url from urllib.parse import urljoin from minet.reddit.exceptions import RedditInvalidTargetError @@ -156,7 +156,7 @@ def data_user_posts( link, error, ): - sub = post.scrape_one("a[class*='subreddit']", "href") + sub = post.scrape_one("a.subreddit", "href") data = RedditUserPost( title=title, url=get_new_url(url), @@ -179,28 +179,28 @@ def __init__(self): def get_childs_l500(self, url, list_comments, parent_id): _, soup, _ = reddit_request(url, self.pool_manager) - comments = soup.select("div.commentarea>div>div[class*='comment']") + comments = soup.select("div.commentarea>div>div.comment") if parent_id is None: for com in comments: list_comments.append((None, com)) - else: - for com in comments: - child = com.find("div", class_="child") - if child.text != "": - child = child.find("div") - child_com = child.find_all( - "div", - class_=lambda x: x - and ( - "comment" in x - or "deleted comment" in x - or "morerecursion" in x - or "morechildren" in x - ), - recursive=False, - ) - for ele in child_com: - list_comments.append((parent_id, ele)) + return list_comments + for com in 
comments: + child = com.find("div", class_="child") + if child.text != "": + child = child.find("div") + child_com = child.find_all( + "div", + class_=lambda x: x + and ( + "comment" in x + or "deleted comment" in x + or "morerecursion" in x + or "morechildren" in x + ), + recursive=False, + ) + for ele in child_com: + list_comments.append((parent_id, ele)) return list_comments def get_comments(self, url: str, all): @@ -220,9 +220,9 @@ def get_comments(self, url: str, all): error=error, ) else: - first_comments = soup.select("div.commentarea>div>div[class*='comment']") + first_comments = soup.select("div.commentarea>div>div.comment") if all: - more = soup.select("div.commentarea>div>div[class*='morechildren']") + more = soup.select("div.commentarea>div>div.morechildren") for ele in more: a = ele.select_one("a") onclick = a["onclick"] @@ -241,8 +241,7 @@ def get_comments(self, url: str, all): points = None else: comment_url = com.scrape_one("a.bylink", "href") - try_author = com.select_one("div.entry.unvoted") - author = try_author.scrape_one("a[class^='author']") + author = com.scrape_one("div.entry.unvoted a.author") if not author: author = "[Deleted]" points = get_points(com) @@ -286,7 +285,7 @@ def get_comments(self, url: str, all): m_comments.append((current_id, ele)) data = RedditComment( comment_url=get_new_url(resolve_relative_url(comment_url)), - author=author, + author=author if author else "[Deleted]", id=current_id, parent=parent, points=points, @@ -311,12 +310,10 @@ def get_general_post(self, url: str, type: str, add_text: bool, limit: int): break list_buttons = post.select_one("ul.flat-list.buttons") if len(list_buttons.scrape("span.promoted-span")) == 0: - title = post.force_select_one("a[class*='title']").get_text() - post_url = list_buttons.scrape_one( - "a[class^='bylink comments']", "href" - ) + title = post.force_select_one("a.title").get_text() + post_url = list_buttons.scrape_one("a.bylink.comments", "href") n_comments_scraped = list_buttons.select_one( - "a[class^='bylink comments']" + "a.bylink.comments" ).get_text() match = re.match(r"(\d+)\s+comment(s)?", n_comments_scraped) if match: @@ -325,9 +322,7 @@ def get_general_post(self, url: str, type: str, add_text: bool, limit: int): n_comments = 0 upvote = get_points(post) published_date, edited_date = get_dates(post) - link = resolve_relative_url( - post.scrape_one("a[class*='title']", "href") - ) + link = resolve_relative_url(post.scrape_one("a.title", "href")) if link == post_url: link = "" if add_text: @@ -363,9 +358,7 @@ def get_general_post(self, url: str, type: str, add_text: bool, limit: int): link, text_error, ) - try_content = text_soup.select_one( - "div#siteTable div[class^='usertext']" - ) + try_content = text_soup.select_one("div#siteTable div.usertext") if try_content: content = try_content.get_text() else: @@ -431,8 +424,8 @@ def get_user_comments(self, url: str, limit: int): break post_title = comment.scrape_one("a.title") post_url = comment.scrape_one("a.bylink.may-blank", "href") - post_author = comment.scrape_one("p.parent>a[class^='author']") - post_subreddit = comment.scrape_one("a[class^='subreddit']", "href") + post_author = comment.scrape_one("p.parent>a.author") + post_subreddit = comment.scrape_one("a.subreddit", "href") points = get_points(comment) published_date, edited_date = get_dates(comment) text = comment.scrape_one("div.content div.md") From de2918755a9561d1ee9fd63c7d6e3e4ed2aed148 Mon Sep 17 00:00:00 2001 From: Julien Pontoire Date: Wed, 15 Jan 2025 11:01:42 +0100 Subject: [PATCH 
46/47] fix bug with add_slash --- minet/reddit/scraper.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/minet/reddit/scraper.py b/minet/reddit/scraper.py index 665fca65a6..542f93b28a 100644 --- a/minet/reddit/scraper.py +++ b/minet/reddit/scraper.py @@ -18,9 +18,7 @@ # when missing a '/' at the end of an url, reddit will make a redirection and it will reduce by 2 the number of requests remaining def add_slash(url: str): path = url.split("/") - if path[-1][0] == "?": - return url - elif url[-1] != "/": + if path[-1] != "" and not path[-1].startswith("?"): return url + "/" return url From 25e2e60a419e61ff97865acba8c99757a0bca627 Mon Sep 17 00:00:00 2001 From: Julien Pontoire Date: Wed, 15 Jan 2025 13:57:40 +0100 Subject: [PATCH 47/47] refacto --- .gitignore | 1 - minet/reddit/scraper.py | 93 +++++++++++++---------------------------- 2 files changed, 30 insertions(+), 64 deletions(-) diff --git a/.gitignore b/.gitignore index ddd3a616c1..b2de0b22ba 100644 --- a/.gitignore +++ b/.gitignore @@ -25,7 +25,6 @@ ftest/*.csv *.sqlar *-wal *-shm -*.csv /crawl /downloaded diff --git a/minet/reddit/scraper.py b/minet/reddit/scraper.py index 542f93b28a..f49daa8c9f 100644 --- a/minet/reddit/scraper.py +++ b/minet/reddit/scraper.py @@ -296,6 +296,7 @@ def get_comments(self, url: str, all): yield data def get_general_post(self, url: str, type: str, add_text: bool, limit: int): + fn = data_posts if type == "subreddit" else data_user_posts n_crawled = 0 old_url = get_old_url(get_url_from_subreddit(url)) while old_url and (limit is None or n_crawled < limit): @@ -328,71 +329,37 @@ def get_general_post(self, url: str, type: str, add_text: bool, limit: int): post_url, self.pool_manager ) if text_error: - if type == "subreddit": - yield data_posts( - post, - title, - post_url, - "", - upvote, - n_comments_scraped, - n_comments, - published_date, - edited_date, - link, - text_error, - ) - else: - yield data_user_posts( - post, - title, - post_url, - "", - upvote, - n_comments_scraped, - n_comments, - published_date, - edited_date, - link, - text_error, - ) - try_content = text_soup.select_one("div#siteTable div.usertext") - if try_content: - content = try_content.get_text() - else: - content = "" - else: - content = "" - if type == "subreddit": - post = data_posts( - post, - title, - post_url, - content, - upvote, - n_comments_scraped, - n_comments, - published_date, - edited_date, - link, - error, + yield fn( + post, + title, + post_url, + None, + upvote, + n_comments_scraped, + n_comments, + published_date, + edited_date, + link, + text_error, + ) + content = text_soup.scrape_one( + "div#siteTable div.usertext-body" ) else: - post = data_user_posts( - post, - title, - post_url, - content, - upvote, - n_comments_scraped, - n_comments, - published_date, - edited_date, - link, - error, - ) - - yield post + content = "" + yield fn( + post, + title, + post_url, + content, + upvote, + n_comments_scraped, + n_comments, + published_date, + edited_date, + link, + error, + ) n_crawled += 1 old_url = soup.scrape_one("span.next-button a", "href")
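
For reference, a minimal sketch of driving the scraper from Python rather than through the CLI, assuming the module layout reached at the end of this series (minet.reddit.scraper.RedditScraper); the subreddit URL and the limit of 50 below are illustrative, not taken from the patches:

    from minet.reddit.scraper import RedditScraper

    scraper = RedditScraper()

    # get_general_post() is a generator: it walks old.reddit.com listing pages,
    # skips promoted entries and stops once `limit` posts have been yielded.
    # Passing "subreddit" makes it emit RedditPost records (via data_posts);
    # passing "user" would emit RedditUserPost records (via data_user_posts).
    for post in scraper.get_general_post(
        "https://www.reddit.com/r/france", "subreddit", add_text=False, limit=50
    ):
        print(post.title, post.url)

This mirrors what the `minet reddit posts -l 50` subcommand does once the series is applied, with the limit coming from -l/--limit and add_text from the command's text option.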