add first comment check

Metaculus · Dec 26, 2024 · 36847a8
1 parent b599e30
commit 36847a8
Show file tree

Hide file tree

Showing 4 changed files with 98 additions and 25 deletions.
diff --git a/comments/views.py b/comments/views.py
@@ -33,6 +33,11 @@
 from notifications.services import NotificationCommentReport, NotificationPostParams
 from posts.services.common import get_post_permission_for_user
 from projects.permissions import ObjectPermission
+from users.models import User
+from users.services.spam_detection import (
+    check_new_comment_for_spam,
+    send_deactivation_email,
+)
 
 
 class RootCommentsPagination(LimitOffsetPagination):
@@ -119,7 +124,7 @@ def comment_delete_api_view(request: Request, pk: int):
 @permission_classes([IsAuthenticated])
 @transaction.non_atomic_requests
 def comment_create_api_view(request: Request):
-    user = request.user
+    user: User = request.user
     serializer = CommentWriteSerializer(data=request.data)
     serializer.is_valid(raise_exception=True)
 
@@ -143,6 +148,23 @@ def comment_create_api_view(request: Request):
         else None
     )
 
+    # Check for spam
+    is_spam, _ = check_new_comment_for_spam(
+        user=user, comment_text=serializer.validated_data["text"]
+    )
+
+    if is_spam:
+        user.mark_as_spam()
+        send_deactivation_email(user.email)
+        return Response(
+            data={
+                "message": "This comment seems to be spam. Please contact "
+                "support@metaculus.com if you believe this was a mistake.",
+                "error_code": "SPAM_DETECTED",
+            },
+            status=status.HTTP_403_FORBIDDEN,
+        )
+
     new_comment = create_comment(
         **serializer.validated_data, included_forecast=forecast, user=user
     )

diff --git a/users/admin.py b/users/admin.py
@@ -7,7 +7,7 @@
 
 from users.models import User, UserCampaignRegistration
 from users.services.spam_detection import (
-    check_data_for_spam,
+    check_profile_data_for_spam,
     send_deactivation_email,
 )
 from questions.models import Forecast
@@ -223,7 +223,7 @@ def hard_delete_selected(self, request, queryset: QuerySet[User]):
 
     def run_profile_spam_detection_on_selected(self, request, queryset: QuerySet[User]):
         for user in queryset:
-            is_spam, _ = check_data_for_spam(
+            is_spam, _ = check_profile_data_for_spam(
                 user=user,
                 bio=user.bio,
                 website=user.website,

diff --git a/users/services/spam_detection.py b/users/services/spam_detection.py
@@ -1,3 +1,4 @@
+import re
 import asyncio
 import textwrap
 from typing import cast
@@ -15,26 +16,54 @@
 logger = logging.getLogger(__name__)
 
 
-def check_profile_update_for_spam(
-    user: User, valid_serializer: UserUpdateProfileSerializer
-) -> tuple[bool, str]:
-    days_since_joined = (timezone.now() - user.date_joined).days
-    days_since_joined_threshold = 7
-    request_data = cast(dict, valid_serializer.validated_data)
+def check_comment_data_for_spam(user: User, comment_text: str) -> tuple[bool, str]:
+    start_time = time.time()
+    identified_as_spam = False
+    reasoning = ""
+    gpt_was_used = False
 
-    if days_since_joined > days_since_joined_threshold:
-        idenficated_as_spam = False
-        reasoning = (
-            "The user has been a member for more than "
-            f"{days_since_joined_threshold} days"
+    # Identify improper mentions
+    # @ mentions with external links
+    external_mention_pattern = r"@\[.*?\]\(https?://[^\s]+?\s*.*?\)"
+    # proper internal mentions
+    internal_mention_pattern = r"@\[.*?\]\(/accounts/profile/\d+/?\)"
+    # all @ mentions in the comment
+    all_mentions = re.findall(r"@\[[^\]]+\]\([^\)]+\)", comment_text)
+    improper_mentions = []
+    for mention in all_mentions:
+        if re.match(external_mention_pattern, mention) and not re.match(
+            internal_mention_pattern, mention
+        ):
+            improper_mentions.append(mention)
+    if improper_mentions:
+        identified_as_spam = True
+        reasoning = "Comment contains improper @ mentions with external links"
+
+    end_time = time.time()
+    duration = end_time - start_time
+
+    if identified_as_spam:
+        logger.info(
+            f"User: {user.username} ID: {user.id} was soft deleted "
+            f"for spam comment: {comment_text[:100]}... "
+            f"The reason was: {reasoning[:100]}... "
+            f"It took {duration:.2f} seconds to check. "
+            f"gpt_was_used: {gpt_was_used}"
         )
+    return identified_as_spam, reasoning
+
+
+def check_new_comment_for_spam(user: User, comment_text: str) -> tuple[bool, str]:
+    if user.comment_set.count() > 0:
+        identified_as_spam = False
+        reasoning = "User has already posted a comment"
     else:
-        idenficated_as_spam, reasoning = check_data_for_spam(user, **request_data)
+        identified_as_spam, reasoning = check_comment_data_for_spam(user, comment_text)
 
-    return idenficated_as_spam, reasoning
+    return identified_as_spam, reasoning
 
 
-def check_data_for_spam(user: User, **args):
+def check_profile_data_for_spam(user: User, **args):
     bio: str | None = args.get("bio")
     website: str | None = args.get("website")
     if bio and website:
@@ -47,35 +76,56 @@ def check_data_for_spam(user: User, **args):
         bio_plus_website = ""
 
     start_time = time.time()
-    idenficated_as_spam = False
+    identified_as_spam = False
     reasoning = ""
     gpt_was_used = False
     if not bio_plus_website:
-        idenficated_as_spam = False
+        identified_as_spam = False
         reasoning = "No bio to check for spam"
     elif len(bio_plus_website) < 10:
-        idenficated_as_spam = False
+        identified_as_spam = False
         reasoning = "Bio is too short to be spam"
     elif len(bio_plus_website) > 17500:
-        idenficated_as_spam = True
+        identified_as_spam = True
         reasoning = "Bio is more than 17500 characters"
     else:
-        idenficated_as_spam, reasoning = asyncio.run(
+        identified_as_spam, reasoning = asyncio.run(
             ask_gpt_to_check_profile_for_spam(bio_plus_website, user.email)
         )
         gpt_was_used = True
     end_time = time.time()
     duration = end_time - start_time
 
-    if idenficated_as_spam:
+    if identified_as_spam:
         logger.info(
             f"User: {user.username} ID: {user.id} was soft deleted "
             f"for spam bio: {bio_plus_website[:100]}... "
             f"The reason was: {reasoning[:100]}... "
             f"It took {duration:.2f} seconds to check. "
             f"gpt_was_used: {gpt_was_used}"
         )
-    return idenficated_as_spam, reasoning
+    return identified_as_spam, reasoning
+
+
+def check_profile_update_for_spam(
+    user: User, valid_serializer: UserUpdateProfileSerializer
+) -> tuple[bool, str]:
+    days_since_joined = (timezone.now() - user.date_joined).days
+    days_since_joined_threshold = 7
+    request_data = cast(dict, valid_serializer.validated_data)
+
+    if days_since_joined > days_since_joined_threshold:
+        identified_as_spam = False
+        reasoning = (
+            "The user has been a member for more than "
+            f"{days_since_joined_threshold} days"
+        )
+    else:
+        identified_as_spam, reasoning = check_profile_data_for_spam(
+            user, **request_data
+        )
+
+    return identified_as_spam, reasoning
 
 
 async def ask_gpt_to_check_profile_for_spam(

diff --git a/users/views.py b/users/views.py
@@ -478,7 +478,8 @@ def update_profile_api_view(request: Request) -> Response:
         send_deactivation_email(user.email)
         return Response(
             data={
-                "message": "This bio seems to be spam. Please contact support@metaculus.com if you believe this was a mistake.",
+                "message": "This bio seems to be spam. Please contact "
+                "support@metaculus.com if you believe this was a mistake.",
                 "error_code": "SPAM_DETECTED",
             },
             status=status.HTTP_403_FORBIDDEN,