From 26d50422000de70e5cb42e9ba0f7cbd5b2847063 Mon Sep 17 00:00:00 2001
From: lsabor <lukesabor@gmail.com>
Date: Thu, 26 Dec 2024 11:11:55 -0800
Subject: [PATCH 1/4] add admin action to run profile spam detection on
 selected users, and refactor some basic logic in spam_detection.py

---
 users/admin.py                   | 23 ++++++++++++++++++++++-
 users/services/spam_detection.py | 29 +++++++++++++++++++++--------
 users/views.py                   |  8 ++++----
 3 files changed, 47 insertions(+), 13 deletions(-)

diff --git a/users/admin.py b/users/admin.py
index e39b58130..13b29d1e6 100644
--- a/users/admin.py
+++ b/users/admin.py
@@ -6,6 +6,10 @@
 from django.db.models import Count, Exists, OuterRef, Q, F, QuerySet
 
 from users.models import User, UserCampaignRegistration
+from users.services.spam_detection import (
+    check_data_for_spam,
+    send_deactivation_email,
+)
 from questions.models import Forecast
 
 
@@ -126,7 +130,12 @@ class UserAdmin(admin.ModelAdmin):
         "bio_length",
     ]
     can_delete = False
-    actions = ["mark_selected_as_spam", "soft_delete_selected", "hard_delete_selected"]
+    actions = [
+        "mark_selected_as_spam",
+        "soft_delete_selected",
+        "hard_delete_selected",
+        "run_profile_spam_detection_on_selected",
+    ]
     search_fields = ["username", "email", "pk"]
     list_filter = [
         "is_active",
@@ -212,6 +221,18 @@ def soft_delete_selected(self, request, queryset: QuerySet[User]):
     def hard_delete_selected(self, request, queryset: QuerySet[User]):
         queryset.delete()
 
+    def run_profile_spam_detection_on_selected(self, request, queryset: QuerySet[User]):
+        for user in queryset:
+            is_spam, _ = check_data_for_spam(
+                user=user,
+                bio=user.bio,
+                website=user.website,
+            )
+
+            if is_spam:
+                user.mark_as_spam()
+                send_deactivation_email(user.email)
+
 
 @admin.register(UserCampaignRegistration)
 class UserCampaignRegistrationAdmin(admin.ModelAdmin):
diff --git a/users/services/spam_detection.py b/users/services/spam_detection.py
index 044cb0d74..a257a9e27 100644
--- a/users/services/spam_detection.py
+++ b/users/services/spam_detection.py
@@ -11,6 +11,7 @@
 from utils.openai import generate_text_async
 from misc.tasks import send_email_async
 import time
+
 logger = logging.getLogger(__name__)
 
 
@@ -21,8 +22,21 @@ def check_profile_update_for_spam(
     days_since_joined_threshold = 7
     request_data = cast(dict, valid_serializer.validated_data)
 
-    bio: str | None = request_data.get("bio")
-    website: str | None = request_data.get("website")
+    if days_since_joined > days_since_joined_threshold:
+        idenficated_as_spam = False
+        reasoning = (
+            "The user has been a member for more than "
+            f"{days_since_joined_threshold} days"
+        )
+    else:
+        idenficated_as_spam, reasoning = check_data_for_spam(user, **request_data)
+
+    return idenficated_as_spam, reasoning
+
+
+def check_data_for_spam(user: User, **args):
+    bio: str | None = args.get("bio")
+    website: str | None = args.get("website")
     if bio and website:
         bio_plus_website = f"{bio}\n\nWebsite: {website}"
     elif not bio and website:
@@ -42,9 +56,6 @@ def check_profile_update_for_spam(
     elif len(bio_plus_website) < 10:
         idenficated_as_spam = False
         reasoning = "Bio is too short to be spam"
-    elif days_since_joined > days_since_joined_threshold:
-        idenficated_as_spam = False
-        reasoning = f"The user has been a member for more than {days_since_joined_threshold} days"
     elif len(bio_plus_website) > 17500:
         idenficated_as_spam = True
         reasoning = "Bio is more than 17500 characters"
@@ -58,9 +69,11 @@ def check_profile_update_for_spam(
 
     if idenficated_as_spam:
         logger.info(
-            f"User: {user.username} ID: {user.id} was soft deleted for spam bio: {bio_plus_website[:100]}... "
+            f"User: {user.username} ID: {user.id} was soft deleted "
+            f"for spam bio: {bio_plus_website[:100]}... "
             f"The reason was: {reasoning[:100]}... "
-            f"It took {duration:.2f} seconds to check. gpt_was_used: {gpt_was_used}"
+            f"It took {duration:.2f} seconds to check. "
+            f"gpt_was_used: {gpt_was_used}"
         )
     return idenficated_as_spam, reasoning
 
@@ -104,7 +117,7 @@ async def ask_gpt_to_check_profile_for_spam(
             system_prompt=system_prompt,
             prompt=prompt,
             temperature=0,
-            timeout=7
+            timeout=7,
         )
         is_spam = "TRUE" in gpt_response
     except Exception as e:
diff --git a/users/views.py b/users/views.py
index 49a578e75..0bd0e2b2c 100644
--- a/users/views.py
+++ b/users/views.py
@@ -467,12 +467,12 @@ def change_username_api_view(request: Request):
 @api_view(["PATCH"])
 def update_profile_api_view(request: Request) -> Response:
     user: User = request.user
-    serializer = UserUpdateProfileSerializer(user, data=request.data, partial=True)
+    serializer: UserUpdateProfileSerializer = UserUpdateProfileSerializer(
+        user, data=request.data, partial=True
+    )
     serializer.is_valid(raise_exception=True)
 
-    is_spam, _ = check_profile_update_for_spam(
-        user, cast(UserUpdateProfileSerializer, serializer)
-    )
+    is_spam, _ = check_profile_update_for_spam(user, serializer)
 
     if is_spam:
         user.mark_as_spam()

From b599e306cd10e9e60d16417bd07cc0e9e76a2e8c Mon Sep 17 00:00:00 2001
From: lsabor <lukesabor@gmail.com>
Date: Thu, 26 Dec 2024 11:16:45 -0800
Subject: [PATCH 2/4] remove unused import

---
 users/views.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/users/views.py b/users/views.py
index 0bd0e2b2c..900cc72db 100644
--- a/users/views.py
+++ b/users/views.py
@@ -1,7 +1,6 @@
 from datetime import timedelta
 import numpy as np
 import logging
-from typing import cast
 
 from django.contrib.auth.password_validation import validate_password
 from django.utils import timezone

From 36847a829a6714d5964162d53b2eef63e41010df Mon Sep 17 00:00:00 2001
From: lsabor <lukesabor@gmail.com>
Date: Thu, 26 Dec 2024 12:14:08 -0800
Subject: [PATCH 3/4] add spam check for a user's first comment

---
 comments/views.py                | 24 ++++++++-
 users/admin.py                   |  4 +-
 users/services/spam_detection.py | 92 ++++++++++++++++++++++++--------
 users/views.py                   |  3 +-
 4 files changed, 98 insertions(+), 25 deletions(-)

diff --git a/comments/views.py b/comments/views.py
index 4129e6c6c..eccd91d2f 100644
--- a/comments/views.py
+++ b/comments/views.py
@@ -33,6 +33,11 @@
 from notifications.services import NotificationCommentReport, NotificationPostParams
 from posts.services.common import get_post_permission_for_user
 from projects.permissions import ObjectPermission
+from users.models import User
+from users.services.spam_detection import (
+    check_new_comment_for_spam,
+    send_deactivation_email,
+)
 
 
 class RootCommentsPagination(LimitOffsetPagination):
@@ -119,7 +124,7 @@ def comment_delete_api_view(request: Request, pk: int):
 @permission_classes([IsAuthenticated])
 @transaction.non_atomic_requests
 def comment_create_api_view(request: Request):
-    user = request.user
+    user: User = request.user
     serializer = CommentWriteSerializer(data=request.data)
     serializer.is_valid(raise_exception=True)
 
@@ -143,6 +148,23 @@ def comment_create_api_view(request: Request):
         else None
     )
 
+    # Check for spam
+    is_spam, _ = check_new_comment_for_spam(
+        user=user, comment_text=serializer.validated_data["text"]
+    )
+
+    if is_spam:
+        user.mark_as_spam()
+        send_deactivation_email(user.email)
+        return Response(
+            data={
+                "message": "This comment seems to be spam. Please contact "
+                "support@metaculus.com if you believe this was a mistake.",
+                "error_code": "SPAM_DETECTED",
+            },
+            status=status.HTTP_403_FORBIDDEN,
+        )
+
     new_comment = create_comment(
         **serializer.validated_data, included_forecast=forecast, user=user
     )
diff --git a/users/admin.py b/users/admin.py
index 13b29d1e6..de319bae0 100644
--- a/users/admin.py
+++ b/users/admin.py
@@ -7,7 +7,7 @@
 
 from users.models import User, UserCampaignRegistration
 from users.services.spam_detection import (
-    check_data_for_spam,
+    check_profile_data_for_spam,
     send_deactivation_email,
 )
 from questions.models import Forecast
@@ -223,7 +223,7 @@ def hard_delete_selected(self, request, queryset: QuerySet[User]):
 
     def run_profile_spam_detection_on_selected(self, request, queryset: QuerySet[User]):
         for user in queryset:
-            is_spam, _ = check_data_for_spam(
+            is_spam, _ = check_profile_data_for_spam(
                 user=user,
                 bio=user.bio,
                 website=user.website,
diff --git a/users/services/spam_detection.py b/users/services/spam_detection.py
index a257a9e27..6d3046545 100644
--- a/users/services/spam_detection.py
+++ b/users/services/spam_detection.py
@@ -1,3 +1,4 @@
+import re
 import asyncio
 import textwrap
 from typing import cast
@@ -15,26 +16,54 @@
 logger = logging.getLogger(__name__)
 
 
-def check_profile_update_for_spam(
-    user: User, valid_serializer: UserUpdateProfileSerializer
-) -> tuple[bool, str]:
-    days_since_joined = (timezone.now() - user.date_joined).days
-    days_since_joined_threshold = 7
-    request_data = cast(dict, valid_serializer.validated_data)
+def check_comment_data_for_spam(user: User, comment_text: str) -> tuple[bool, str]:
+    start_time = time.time()
+    identified_as_spam = False
+    reasoning = ""
+    gpt_was_used = False
 
-    if days_since_joined > days_since_joined_threshold:
-        idenficated_as_spam = False
-        reasoning = (
-            "The user has been a member for more than "
-            f"{days_since_joined_threshold} days"
+    # Identify improper mentions
+    # @ mentions with external links
+    external_mention_pattern = r"@\[.*?\]\(https?://[^\s]+?\s*.*?\)"
+    # proper internal mentions
+    internal_mention_pattern = r"@\[.*?\]\(/accounts/profile/\d+/?\)"
+    # all @ mentions in the comment
+    all_mentions = re.findall(r"@\[[^\]]+\]\([^\)]+\)", comment_text)
+    improper_mentions = []
+    for mention in all_mentions:
+        if re.match(external_mention_pattern, mention) and not re.match(
+            internal_mention_pattern, mention
+        ):
+            improper_mentions.append(mention)
+    if improper_mentions:
+        identified_as_spam = True
+        reasoning = "Comment contains improper @ mentions with external links"
+
+    end_time = time.time()
+    duration = end_time - start_time
+
+    if identified_as_spam:
+        logger.info(
+            f"User: {user.username} ID: {user.id} was soft deleted "
+            f"for spam comment: {comment_text[:100]}... "
+            f"The reason was: {reasoning[:100]}... "
+            f"It took {duration:.2f} seconds to check. "
+            f"gpt_was_used: {gpt_was_used}"
         )
+    return identified_as_spam, reasoning
+
+
+def check_new_comment_for_spam(user: User, comment_text: str) -> tuple[bool, str]:
+    if user.comment_set.count() > 0:
+        identified_as_spam = False
+        reasoning = "User has already posted a comment"
     else:
-        idenficated_as_spam, reasoning = check_data_for_spam(user, **request_data)
+        identified_as_spam, reasoning = check_comment_data_for_spam(user, comment_text)
 
-    return idenficated_as_spam, reasoning
+    return identified_as_spam, reasoning
 
 
-def check_data_for_spam(user: User, **args):
+def check_profile_data_for_spam(user: User, **args):
     bio: str | None = args.get("bio")
     website: str | None = args.get("website")
     if bio and website:
@@ -47,27 +76,27 @@ def check_data_for_spam(user: User, **args):
         bio_plus_website = ""
 
     start_time = time.time()
-    idenficated_as_spam = False
+    identified_as_spam = False
     reasoning = ""
     gpt_was_used = False
     if not bio_plus_website:
-        idenficated_as_spam = False
+        identified_as_spam = False
         reasoning = "No bio to check for spam"
     elif len(bio_plus_website) < 10:
-        idenficated_as_spam = False
+        identified_as_spam = False
         reasoning = "Bio is too short to be spam"
     elif len(bio_plus_website) > 17500:
-        idenficated_as_spam = True
+        identified_as_spam = True
         reasoning = "Bio is more than 17500 characters"
     else:
-        idenficated_as_spam, reasoning = asyncio.run(
+        identified_as_spam, reasoning = asyncio.run(
             ask_gpt_to_check_profile_for_spam(bio_plus_website, user.email)
         )
         gpt_was_used = True
     end_time = time.time()
     duration = end_time - start_time
 
-    if idenficated_as_spam:
+    if identified_as_spam:
         logger.info(
             f"User: {user.username} ID: {user.id} was soft deleted "
             f"for spam bio: {bio_plus_website[:100]}... "
@@ -75,7 +104,28 @@ def check_data_for_spam(user: User, **args):
             f"It took {duration:.2f} seconds to check. "
             f"gpt_was_used: {gpt_was_used}"
         )
-    return idenficated_as_spam, reasoning
+    return identified_as_spam, reasoning
+
+
+def check_profile_update_for_spam(
+    user: User, valid_serializer: UserUpdateProfileSerializer
+) -> tuple[bool, str]:
+    days_since_joined = (timezone.now() - user.date_joined).days
+    days_since_joined_threshold = 7
+    request_data = cast(dict, valid_serializer.validated_data)
+
+    if days_since_joined > days_since_joined_threshold:
+        identified_as_spam = False
+        reasoning = (
+            "The user has been a member for more than "
+            f"{days_since_joined_threshold} days"
+        )
+    else:
+        identified_as_spam, reasoning = check_profile_data_for_spam(
+            user, **request_data
+        )
+
+    return identified_as_spam, reasoning
 
 
 async def ask_gpt_to_check_profile_for_spam(
diff --git a/users/views.py b/users/views.py
index 900cc72db..621a387f5 100644
--- a/users/views.py
+++ b/users/views.py
@@ -478,7 +478,8 @@ def update_profile_api_view(request: Request) -> Response:
         send_deactivation_email(user.email)
         return Response(
             data={
-                "message": "This bio seems to be spam. Please contact support@metaculus.com if you believe this was a mistake.",
+                "message": "This bio seems to be spam. Please contact "
+                "support@metaculus.com if you believe this was a mistake.",
                 "error_code": "SPAM_DETECTED",
             },
             status=status.HTTP_403_FORBIDDEN,

From f640fe4013b585752b14ef677612ca62bf737289 Mon Sep 17 00:00:00 2001
From: lsabor <lukesabor@gmail.com>
Date: Tue, 31 Dec 2024 09:50:24 -0800
Subject: [PATCH 4/4] fix misnamed variable

---
 users/services/spam_detection.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/users/services/spam_detection.py b/users/services/spam_detection.py
index a53c3402c..bda51d687 100644
--- a/users/services/spam_detection.py
+++ b/users/services/spam_detection.py
@@ -89,7 +89,7 @@ def check_profile_data_for_spam(user: User, **args):
         identified_as_spam = True
         reasoning = "Bio is more than 17500 characters"
     else:
-        idenficated_as_spam, reasoning = asyncio.run(
+        identified_as_spam, reasoning = asyncio.run(
             ask_gpt_to_check_profile_for_spam(bio_plus_website)
         )
         gpt_was_used = True