medialab · jpontoire · Dec 5, 2024 · Dec 5, 2024 · Dec 6, 2024 · Dec 6, 2024
diff --git a/.gitignore b/.gitignore
@@ -25,6 +25,7 @@ ftest/*.csv
 *.sqlar
 *-wal
 *-shm
+*.csv
 
 /crawl
 /downloaded

diff --git a/minet/cli/commands.py b/minet/cli/commands.py
@@ -14,6 +14,7 @@
 from minet.cli.hyphe import HYPHE_COMMAND
 from minet.cli.instagram import INSTAGRAM_COMMAND
 from minet.cli.mediacloud import MEDIACLOUD_COMMAND
+from minet.cli.reddit import REDDIT_COMMAND
 from minet.cli.telegram import TELEGRAM_COMMAND
 from minet.cli.tiktok import TIKTOK_COMMAND
 from minet.cli.twitter import TWITTER_COMMAND
@@ -42,6 +43,7 @@
     HYPHE_COMMAND,
     INSTAGRAM_COMMAND,
     MEDIACLOUD_COMMAND,
+    REDDIT_COMMAND,
     TELEGRAM_COMMAND,
     TIKTOK_COMMAND,
     TWITTER_COMMAND,

diff --git a/minet/cli/reddit/__init__.py b/minet/cli/reddit/__init__.py
@@ -0,0 +1,145 @@
+# =============================================================================
+# Minet Reddit CLI Action
+# =============================================================================
+#
+# Logic of the `rd` action.
+#
+
+from minet.cli.argparse import command
+
+# Declaration of the `reddit posts` subcommand. The second positional argument
+# names the `minet.cli.reddit.posts` module, which defines the matching
+# `action` function.
+REDDIT_POSTS_SUBCOMMAND = command(
+    "posts",
+    "minet.cli.reddit.posts",
+    title="Minet Reddit Posts Command",
+    description="""
+        Retrieve reddit posts from a subreddit link or name.
+    """,
+    epilog="""
+        Example:
+
+        . Searching posts from the subreddit r/france:
+            $ minet reddit posts https://www.reddit.com/r/france > r_france_posts.csv
+            $ minet reddit posts france > r_france_posts.csv
+            $ minet reddit posts r/france > r_france_posts.csv
+    """,
+    # Labels describing the accepted targets.
+    # NOTE(review): `dummy_column` is presumably the column name used when
+    # targets are given directly on the command line -- confirm against
+    # minet.cli.argparse.
+    # NOTE(review): "subreddits ids" in the plural label reads like a typo for
+    # "subreddit ids".
+    variadic_input={
+        "dummy_column": "subreddit",
+        "item_label": "subreddit url, subreddit shortcode or subreddit id",
+        "item_label_plural": "subreddit urls, subreddit shortcodes or subreddits ids",
+    },
+    arguments=[
+        # Parsed as an int. The `posts` action only forwards it to the scraper
+        # when it is truthy, so `-n 0` behaves as if the flag was not given.
+        {
+            "flags": ["-n", "--number"],
+            "help": "Number of posts to retrieve.",
+            "type": int,
+        },
+        # Boolean switch, off unless the flag is given.
+        {
+            "flags": ["-t", "--text"],
+            "help": "Retrieve the text of the post. Note that it will require one request per post.",
+            "action": "store_true",
+        },
+    ],
+)
+
+# Declaration of the `reddit comments` subcommand. The second positional
+# argument names the `minet.cli.reddit.comments` module, which defines the
+# matching `action` function.
+REDDIT_COMMENTS_SUBCOMMAND = command(
+    "comments",
+    "minet.cli.reddit.comments",
+    title="Minet Reddit Comments Command",
+    description="""
+        Retrieve comments from a reddit post link.
+        Note that it will only retrieve the comments displayed on the page. If you want all the comments you need to use -A, --all but it will require a request per comment, and you can only make 100 requests per 10 minutes.
+    """,
+    epilog="""
+        Example:
+
+        . Searching comments from a reddit post:
+            $ minet reddit comments https://www.reddit.com/r/france/comments/... > r_france_comments.csv
+    """,
+    # Labels describing the accepted targets.
+    # NOTE(review): `dummy_column` is presumably the column name used when
+    # targets are given directly on the command line -- confirm against
+    # minet.cli.argparse.
+    # NOTE(review): the plural label says "posts urls, posts shortcodes or
+    # posts ids" where the other subcommands use the singular noun
+    # ("subreddit urls", "user urls").
+    variadic_input={
+        "dummy_column": "post",
+        "item_label": "post url, post shortcode or post id",
+        "item_label_plural": "posts urls, posts shortcodes or posts ids",
+    },
+    arguments=[
+        # Boolean switch, off unless the flag is given. The `comments` action
+        # passes it to the scraper as the second argument of `get_comments`.
+        {
+            "flags": ["-A", "--all"],
+            "help": "Retrieve all comments.",
+            "action": "store_true",
+        },
+    ],
+)
+
+# Declaration of the `reddit user_posts` subcommand. The second positional
+# argument names the `minet.cli.reddit.user_posts` module, which defines the
+# matching `action` function.
+REDDIT_USER_POSTS_SUBCOMMAND = command(
+    "user_posts",
+    "minet.cli.reddit.user_posts",
+    title="Minet Reddit User Posts Command",
+    description="""
+        Retrieve reddit posts from a user link.
+    """,
+    epilog="""
+        Example:
+
+        . Searching posts from the user page of u/random_user:
+            $ minet reddit user_posts https://www.reddit.com/user/random_user/submitted/ > random_user_posts.csv
+    """,
+    # Labels describing the accepted targets.
+    # NOTE(review): `dummy_column` is presumably the column name used when
+    # targets are given directly on the command line -- confirm against
+    # minet.cli.argparse.
+    variadic_input={
+        "dummy_column": "user",
+        "item_label": "user url, user shortcode or user id",
+        "item_label_plural": "user urls, user shortcodes or user ids",
+    },
+    arguments=[
+        # Parsed as an int. The `user_posts` action only forwards it to the
+        # scraper when it is truthy, so `-n 0` behaves as if the flag was not
+        # given.
+        {
+            "flags": ["-n", "--number"],
+            "help": "Number of posts to retrieve.",
+            "type": int,
+        },
+        # Boolean switch, off unless the flag is given.
+        {
+            "flags": ["-t", "--text"],
+            "help": "Retrieve the text of the post. Note that it will require one request per post.",
+            "action": "store_true",
+        },
+    ],
+)
+
+# Declaration of the `reddit user_comments` subcommand. The second positional
+# argument names the `minet.cli.reddit.user_comments` module, which defines
+# the matching `action` function.
+REDDIT_USER_COMMENTS_SUBCOMMAND = command(
+    "user_comments",
+    "minet.cli.reddit.user_comments",
+    title="Minet Reddit User Comments Command",
+    description="""
+        Retrieve reddit comments from a user link.
+    """,
+    epilog="""
+        Example:
+
+        . Searching comments from the user page of u/random_user:
+            $ minet reddit user_comments https://www.reddit.com/user/random_user/comments/ > random_user_comments.csv
+    """,
+    # Labels describing the accepted targets.
+    # NOTE(review): `dummy_column` is presumably the column name used when
+    # targets are given directly on the command line -- confirm against
+    # minet.cli.argparse.
+    variadic_input={
+        "dummy_column": "user",
+        "item_label": "user url, user shortcode or user id",
+        "item_label_plural": "user urls, user shortcodes or user ids",
+    },
+    arguments=[
+        # Parsed as an int. The `user_comments` action only forwards it to the
+        # scraper when it is truthy, so `-n 0` behaves as if the flag was not
+        # given.
+        {
+            "flags": ["-n", "--number"],
+            "help": "Number of comments to retrieve.",
+            "type": int,
+        },
+    ],
+)
+
+# Top-level `reddit` command, also reachable through the `rd` alias. It groups
+# the four subcommands declared in this module and is the object imported by
+# `minet/cli/commands.py`.
+REDDIT_COMMAND = command(
+    "reddit",
+    "minet.cli.reddit",
+    # Title, passed positionally here whereas the subcommands use `title=`.
+    "Minet Reddit Command",
+    aliases=["rd"],
+    description="""
+        Collect data from Reddit.
+    """,
+    subcommands=[
+        REDDIT_POSTS_SUBCOMMAND,
+        REDDIT_COMMENTS_SUBCOMMAND,
+        REDDIT_USER_POSTS_SUBCOMMAND,
+        REDDIT_USER_COMMENTS_SUBCOMMAND,
+    ],
+)
diff --git a/minet/cli/reddit/comments.py b/minet/cli/reddit/comments.py
@@ -0,0 +1,41 @@
+# =============================================================================
+# Minet Reddit Comments CLI Action
+# =============================================================================
+#
+# Logic of the `rd comments` action.
+#
+from minet.cli.utils import with_enricher_and_loading_bar
+from minet.reddit.scraper import RedditScraper
+from minet.reddit.types import RedditComment
+from minet.reddit.exceptions import RedditInvalidTargetError
+
+
+@with_enricher_and_loading_bar(
+    headers=RedditComment,
+    title="Scraping comments",
+    unit="pages",
+    nested=True,
+    sub_unit="comments",
+)
+def action(cli_args, enricher, loading_bar):
+    """
+    Scrape the comments of every post read from the input and write one
+    output row per comment.
+
+    The `--all` switch is handed to the scraper as a boolean. Targets for
+    which the scraper call raises `RedditInvalidTargetError` are reported on
+    the loading bar and skipped.
+    """
+    scraper = RedditScraper()
+    retrieve_all = bool(cli_args.all)
+
+    # Numbering starts at 1 so the reported line matches the input line.
+    cells = enricher.enumerate_cells(cli_args.column, with_rows=True, start=1)
+
+    for line_number, row, url in cells:
+        with loading_bar.step(url):
+            # NOTE(review): if `get_comments` is a generator, the error would
+            # only surface while iterating below, outside this try -- confirm.
+            try:
+                comments = scraper.get_comments(url, retrieve_all)
+            except RedditInvalidTargetError:
+                loading_bar.print(
+                    "the script could not complete normally on line %i"
+                    % line_number
+                )
+                continue
+
+            for comment in comments:
+                loading_bar.nested_advance()
+                enricher.writerow(row, comment)
diff --git a/minet/cli/reddit/posts.py b/minet/cli/reddit/posts.py
@@ -0,0 +1,52 @@
+# =============================================================================
+# Minet Reddit Posts CLI Action
+# =============================================================================
+#
+# Logic of the `rd posts` action.
+#
+from minet.cli.utils import with_enricher_and_loading_bar
+from minet.reddit.scraper import RedditScraper
+from minet.reddit.types import RedditPost
+from minet.reddit.exceptions import RedditInvalidTargetError
+
+
+@with_enricher_and_loading_bar(
+    headers=RedditPost,
+    title="Scraping posts",
+    unit="pages",
+    nested=True,
+    sub_unit="posts",
+)
+def action(cli_args, enricher, loading_bar):
+    """
+    Scrape the posts of every subreddit read from the input and write one
+    output row per post.
+
+    The `--text` switch is handed to the scraper as a boolean. The `--number`
+    value is only passed along when it is truthy; otherwise the scraper is
+    called without it. Targets for which the scraper call raises
+    `RedditInvalidTargetError` are reported on the loading bar and skipped.
+    """
+    scraper = RedditScraper()
+
+    page_kind = "subreddit"
+    with_text = bool(cli_args.text)
+
+    # Empty when no (or a zero) limit was requested, so the scraper call below
+    # then receives three positional arguments instead of four.
+    limit_args = (cli_args.number,) if cli_args.number else ()
+
+    # Numbering starts at 1 so the reported line matches the input line.
+    cells = enricher.enumerate_cells(cli_args.column, with_rows=True, start=1)
+
+    for line_number, row, url in cells:
+        with loading_bar.step(url):
+            # NOTE(review): if `get_general_post` is a generator, the error
+            # would only surface while iterating below, outside this try --
+            # confirm.
+            try:
+                posts = scraper.get_general_post(
+                    url, page_kind, with_text, *limit_args
+                )
+            except RedditInvalidTargetError:
+                loading_bar.print(
+                    "the script could not complete normally on line %i"
+                    % line_number
+                )
+                continue
+
+            for post in posts:
+                loading_bar.nested_advance()
+                enricher.writerow(row, post)
diff --git a/minet/cli/reddit/user_comments.py b/minet/cli/reddit/user_comments.py
@@ -0,0 +1,41 @@
+# =============================================================================
+# Minet Reddit User Comments CLI Action
+# =============================================================================
+#
+# Logic of the `rd user_comments` action.
+#
+from minet.cli.utils import with_enricher_and_loading_bar
+from minet.reddit.scraper import RedditScraper
+from minet.reddit.types import RedditUserComment
+from minet.reddit.exceptions import RedditInvalidTargetError
+
+
+@with_enricher_and_loading_bar(
+    headers=RedditUserComment,
+    title="Scraping user comments",
+    unit="pages",
+    nested=True,
+    sub_unit="comments",
+)
+def action(cli_args, enricher, loading_bar):
+    """
+    Scrape the comments of every user read from the input and write one
+    output row per comment.
+
+    The `--number` value is only passed along when it is truthy; otherwise
+    the scraper is called with the url alone. Targets for which the scraper
+    call raises `RedditInvalidTargetError` are reported on the loading bar
+    and skipped.
+    """
+    scraper = RedditScraper()
+
+    # Empty when no (or a zero) limit was requested.
+    limit_args = (cli_args.number,) if cli_args.number else ()
+
+    # Numbering starts at 1 so the reported line matches the input line.
+    cells = enricher.enumerate_cells(cli_args.column, with_rows=True, start=1)
+
+    for line_number, row, url in cells:
+        with loading_bar.step(url):
+            # NOTE(review): if `get_user_comments` is a generator, the error
+            # would only surface while iterating below, outside this try --
+            # confirm.
+            try:
+                comments = scraper.get_user_comments(url, *limit_args)
+            except RedditInvalidTargetError:
+                loading_bar.print(
+                    "the script could not complete normally on line %i"
+                    % line_number
+                )
+                continue
+
+            for comment in comments:
+                loading_bar.nested_advance()
+                enricher.writerow(row, comment)
diff --git a/minet/cli/reddit/user_posts.py b/minet/cli/reddit/user_posts.py
@@ -0,0 +1,52 @@
+# =============================================================================
+# Minet Reddit User Posts CLI Action
+# =============================================================================
+#
+# Logic of the `rd user_posts` action.
+#
+from minet.cli.utils import with_enricher_and_loading_bar
+from minet.reddit.scraper import RedditScraper
+from minet.reddit.types import RedditUserPost
+from minet.reddit.exceptions import RedditInvalidTargetError
+
+
+@with_enricher_and_loading_bar(
+    headers=RedditUserPost,
+    title="Scraping user posts",
+    unit="pages",
+    nested=True,
+    sub_unit="posts",
+)
+def action(cli_args, enricher, loading_bar):
+    """
+    Scrape the posts of every user read from the input and write one output
+    row per post.
+
+    The `--text` switch is handed to the scraper as a boolean. The `--number`
+    value is only passed along when it is truthy; otherwise the scraper is
+    called without it. Targets for which the scraper call raises
+    `RedditInvalidTargetError` are reported on the loading bar and skipped.
+    """
+    scraper = RedditScraper()
+
+    page_kind = "user"
+    with_text = bool(cli_args.text)
+
+    # Empty when no (or a zero) limit was requested, so the scraper call below
+    # then receives three positional arguments instead of four.
+    limit_args = (cli_args.number,) if cli_args.number else ()
+
+    # Numbering starts at 1 so the reported line matches the input line.
+    cells = enricher.enumerate_cells(cli_args.column, with_rows=True, start=1)
+
+    for line_number, row, url in cells:
+        with loading_bar.step(url):
+            # NOTE(review): if `get_general_post` is a generator, the error
+            # would only surface while iterating below, outside this try --
+            # confirm.
+            try:
+                posts = scraper.get_general_post(
+                    url, page_kind, with_text, *limit_args
+                )
+            except RedditInvalidTargetError:
+                loading_bar.print(
+                    "the script could not complete normally on line %i"
+                    % line_number
+                )
+                continue
+
+            for post in posts:
+                loading_bar.nested_advance()
+                enricher.writerow(row, post)
diff --git a/minet/reddit/exceptions.py b/minet/reddit/exceptions.py
@@ -0,0 +1,17 @@
+# =============================================================================
+# Minet Reddit Exceptions
+# =============================================================================
+#
+from minet.exceptions import MinetError
+
+
+class RedditError(MinetError):
+    """Base class of the exceptions declared in this module."""
+
+
+class RedditInvalidTargetError(RedditError):
+    """
+    Error the Reddit CLI actions catch around their scraper calls in order to
+    report the offending input line and move on to the next target.
+
+    NOTE(review): presumably raised by the scraper when a target cannot be
+    resolved to a valid Reddit page -- confirm against minet.reddit.scraper.
+    """
+
+
+class RedditNotPostError(RedditError):
+    """
+    Reddit-specific error, distinct from `RedditInvalidTargetError`.
+
+    NOTE(review): not raised or caught in the CLI actions; presumably signals
+    that a target is not a post -- confirm against minet.reddit.scraper.
+    """
-Original file line number
+Diff line change
@@ Expand Up / @@ -25,6 +25,7 @@ ftest/*.csv @@
     *.sqlar
     *-wal
     *-shm
+    *.csv
     /crawl
     /downloaded
@@ Expand Down @@