From 26d50422000de70e5cb42e9ba0f7cbd5b2847063 Mon Sep 17 00:00:00 2001 From: lsabor Date: Thu, 26 Dec 2024 11:11:55 -0800 Subject: [PATCH 1/4] add spam detection run on command from admin panel, and refactor some basic logic in spam_detection.py --- users/admin.py | 23 ++++++++++++++++++++++- users/services/spam_detection.py | 29 +++++++++++++++++++++-------- users/views.py | 8 ++++---- 3 files changed, 47 insertions(+), 13 deletions(-) diff --git a/users/admin.py b/users/admin.py index e39b58130..13b29d1e6 100644 --- a/users/admin.py +++ b/users/admin.py @@ -6,6 +6,10 @@ from django.db.models import Count, Exists, OuterRef, Q, F, QuerySet from users.models import User, UserCampaignRegistration +from users.services.spam_detection import ( + check_data_for_spam, + send_deactivation_email, +) from questions.models import Forecast @@ -126,7 +130,12 @@ class UserAdmin(admin.ModelAdmin): "bio_length", ] can_delete = False - actions = ["mark_selected_as_spam", "soft_delete_selected", "hard_delete_selected"] + actions = [ + "mark_selected_as_spam", + "soft_delete_selected", + "hard_delete_selected", + "run_profile_spam_detection_on_selected", + ] search_fields = ["username", "email", "pk"] list_filter = [ "is_active", @@ -212,6 +221,18 @@ def soft_delete_selected(self, request, queryset: QuerySet[User]): def hard_delete_selected(self, request, queryset: QuerySet[User]): queryset.delete() + def run_profile_spam_detection_on_selected(self, request, queryset: QuerySet[User]): + for user in queryset: + is_spam, _ = check_data_for_spam( + user=user, + bio=user.bio, + website=user.website, + ) + + if is_spam: + user.mark_as_spam() + send_deactivation_email(user.email) + @admin.register(UserCampaignRegistration) class UserCampaignRegistrationAdmin(admin.ModelAdmin): diff --git a/users/services/spam_detection.py b/users/services/spam_detection.py index 044cb0d74..a257a9e27 100644 --- a/users/services/spam_detection.py +++ b/users/services/spam_detection.py @@ -11,6 +11,7 @@ from utils.openai import generate_text_async from misc.tasks import send_email_async import time + logger = logging.getLogger(__name__) @@ -21,8 +22,21 @@ def check_profile_update_for_spam( days_since_joined_threshold = 7 request_data = cast(dict, valid_serializer.validated_data) - bio: str | None = request_data.get("bio") - website: str | None = request_data.get("website") + if days_since_joined > days_since_joined_threshold: + idenficated_as_spam = False + reasoning = ( + "The user has been a member for more than " + f"{days_since_joined_threshold} days" + ) + else: + idenficated_as_spam, reasoning = check_data_for_spam(user, **request_data) + + return idenficated_as_spam, reasoning + + +def check_data_for_spam(user: User, **args): + bio: str | None = args.get("bio") + website: str | None = args.get("website") if bio and website: bio_plus_website = f"{bio}\n\nWebsite: {website}" elif not bio and website: @@ -42,9 +56,6 @@ def check_profile_update_for_spam( elif len(bio_plus_website) < 10: idenficated_as_spam = False reasoning = "Bio is too short to be spam" - elif days_since_joined > days_since_joined_threshold: - idenficated_as_spam = False - reasoning = f"The user has been a member for more than {days_since_joined_threshold} days" elif len(bio_plus_website) > 17500: idenficated_as_spam = True reasoning = "Bio is more than 17500 characters" @@ -58,9 +69,11 @@ def check_profile_update_for_spam( if idenficated_as_spam: logger.info( - f"User: {user.username} ID: {user.id} was soft deleted for spam bio: {bio_plus_website[:100]}... " + f"User: {user.username} ID: {user.id} was soft deleted " + f"for spam bio: {bio_plus_website[:100]}... " f"The reason was: {reasoning[:100]}... " - f"It took {duration:.2f} seconds to check. gpt_was_used: {gpt_was_used}" + f"It took {duration:.2f} seconds to check. " + f"gpt_was_used: {gpt_was_used}" ) return idenficated_as_spam, reasoning @@ -104,7 +117,7 @@ async def ask_gpt_to_check_profile_for_spam( system_prompt=system_prompt, prompt=prompt, temperature=0, - timeout=7 + timeout=7, ) is_spam = "TRUE" in gpt_response except Exception as e: diff --git a/users/views.py b/users/views.py index 49a578e75..0bd0e2b2c 100644 --- a/users/views.py +++ b/users/views.py @@ -467,12 +467,12 @@ def change_username_api_view(request: Request): @api_view(["PATCH"]) def update_profile_api_view(request: Request) -> Response: user: User = request.user - serializer = UserUpdateProfileSerializer(user, data=request.data, partial=True) + serializer: UserUpdateProfileSerializer = UserUpdateProfileSerializer( + user, data=request.data, partial=True + ) serializer.is_valid(raise_exception=True) - is_spam, _ = check_profile_update_for_spam( - user, cast(UserUpdateProfileSerializer, serializer) - ) + is_spam, _ = check_profile_update_for_spam(user, serializer) if is_spam: user.mark_as_spam() From b599e306cd10e9e60d16417bd07cc0e9e76a2e8c Mon Sep 17 00:00:00 2001 From: lsabor Date: Thu, 26 Dec 2024 11:16:45 -0800 Subject: [PATCH 2/4] remove unused import --- users/views.py | 1 - 1 file changed, 1 deletion(-) diff --git a/users/views.py b/users/views.py index 0bd0e2b2c..900cc72db 100644 --- a/users/views.py +++ b/users/views.py @@ -1,7 +1,6 @@ from datetime import timedelta import numpy as np import logging -from typing import cast from django.contrib.auth.password_validation import validate_password from django.utils import timezone From 36847a829a6714d5964162d53b2eef63e41010df Mon Sep 17 00:00:00 2001 From: lsabor Date: Thu, 26 Dec 2024 12:14:08 -0800 Subject: [PATCH 3/4] add first comment check --- comments/views.py | 24 ++++++++- users/admin.py | 4 +- users/services/spam_detection.py | 92 ++++++++++++++++++++++++-------- users/views.py | 3 +- 4 files changed, 98 insertions(+), 25 deletions(-) diff --git a/comments/views.py b/comments/views.py index 4129e6c6c..eccd91d2f 100644 --- a/comments/views.py +++ b/comments/views.py @@ -33,6 +33,11 @@ from notifications.services import NotificationCommentReport, NotificationPostParams from posts.services.common import get_post_permission_for_user from projects.permissions import ObjectPermission +from users.models import User +from users.services.spam_detection import ( + check_new_comment_for_spam, + send_deactivation_email, +) class RootCommentsPagination(LimitOffsetPagination): @@ -119,7 +124,7 @@ def comment_delete_api_view(request: Request, pk: int): @permission_classes([IsAuthenticated]) @transaction.non_atomic_requests def comment_create_api_view(request: Request): - user = request.user + user: User = request.user serializer = CommentWriteSerializer(data=request.data) serializer.is_valid(raise_exception=True) @@ -143,6 +148,23 @@ def comment_create_api_view(request: Request): else None ) + # Check for spam + is_spam, _ = check_new_comment_for_spam( + user=user, comment_text=serializer.validated_data["text"] + ) + + if is_spam: + user.mark_as_spam() + send_deactivation_email(user.email) + return Response( + data={ + "message": "This comment seems to be spam. Please contact " + "support@metaculus.com if you believe this was a mistake.", + "error_code": "SPAM_DETECTED", + }, + status=status.HTTP_403_FORBIDDEN, + ) + new_comment = create_comment( **serializer.validated_data, included_forecast=forecast, user=user ) diff --git a/users/admin.py b/users/admin.py index 13b29d1e6..de319bae0 100644 --- a/users/admin.py +++ b/users/admin.py @@ -7,7 +7,7 @@ from users.models import User, UserCampaignRegistration from users.services.spam_detection import ( - check_data_for_spam, + check_profile_data_for_spam, send_deactivation_email, ) from questions.models import Forecast @@ -223,7 +223,7 @@ def hard_delete_selected(self, request, queryset: QuerySet[User]): def run_profile_spam_detection_on_selected(self, request, queryset: QuerySet[User]): for user in queryset: - is_spam, _ = check_data_for_spam( + is_spam, _ = check_profile_data_for_spam( user=user, bio=user.bio, website=user.website, diff --git a/users/services/spam_detection.py b/users/services/spam_detection.py index a257a9e27..6d3046545 100644 --- a/users/services/spam_detection.py +++ b/users/services/spam_detection.py @@ -1,3 +1,4 @@ +import re import asyncio import textwrap from typing import cast @@ -15,26 +16,54 @@ logger = logging.getLogger(__name__) -def check_profile_update_for_spam( - user: User, valid_serializer: UserUpdateProfileSerializer -) -> tuple[bool, str]: - days_since_joined = (timezone.now() - user.date_joined).days - days_since_joined_threshold = 7 - request_data = cast(dict, valid_serializer.validated_data) +def check_comment_data_for_spam(user: User, comment_text: str) -> tuple[bool, str]: + start_time = time.time() + identified_as_spam = False + reasoning = "" + gpt_was_used = False - if days_since_joined > days_since_joined_threshold: - idenficated_as_spam = False - reasoning = ( - "The user has been a member for more than " - f"{days_since_joined_threshold} days" + # Identify improper mentions + # @ mentions with external links + external_mention_pattern = r"@\[.*?\]\(https?://[^\s]+?\s*.*?\)" + # proper internal mentions + internal_mention_pattern = r"@\[.*?\]\(/accounts/profile/\d+/?\)" + # all @ mentions in the comment + all_mentions = re.findall(r"@\[[^\]]+\]\([^\)]+\)", comment_text) + improper_mentions = [] + for mention in all_mentions: + if re.match(external_mention_pattern, mention) and not re.match( + internal_mention_pattern, mention + ): + improper_mentions.append(mention) + if improper_mentions: + identified_as_spam = True + reasoning = "Comment contains improper @ mentions with external links" + + end_time = time.time() + duration = end_time - start_time + + if identified_as_spam: + logger.info( + f"User: {user.username} ID: {user.id} was soft deleted " + f"for spam comment: {comment_text[:100]}... " + f"The reason was: {reasoning[:100]}... " + f"It took {duration:.2f} seconds to check. " + f"gpt_was_used: {gpt_was_used}" ) + return identified_as_spam, reasoning + + +def check_new_comment_for_spam(user: User, comment_text: str) -> tuple[bool, str]: + if user.comment_set.count() > 0: + identified_as_spam = False + reasoning = "User has already posted a comment" else: - idenficated_as_spam, reasoning = check_data_for_spam(user, **request_data) + identified_as_spam, reasoning = check_comment_data_for_spam(user, comment_text) - return idenficated_as_spam, reasoning + return identified_as_spam, reasoning -def check_data_for_spam(user: User, **args): +def check_profile_data_for_spam(user: User, **args): bio: str | None = args.get("bio") website: str | None = args.get("website") if bio and website: @@ -47,27 +76,27 @@ def check_data_for_spam(user: User, **args): bio_plus_website = "" start_time = time.time() - idenficated_as_spam = False + identified_as_spam = False reasoning = "" gpt_was_used = False if not bio_plus_website: - idenficated_as_spam = False + identified_as_spam = False reasoning = "No bio to check for spam" elif len(bio_plus_website) < 10: - idenficated_as_spam = False + identified_as_spam = False reasoning = "Bio is too short to be spam" elif len(bio_plus_website) > 17500: - idenficated_as_spam = True + identified_as_spam = True reasoning = "Bio is more than 17500 characters" else: - idenficated_as_spam, reasoning = asyncio.run( + identified_as_spam, reasoning = asyncio.run( ask_gpt_to_check_profile_for_spam(bio_plus_website, user.email) ) gpt_was_used = True end_time = time.time() duration = end_time - start_time - if idenficated_as_spam: + if identified_as_spam: logger.info( f"User: {user.username} ID: {user.id} was soft deleted " f"for spam bio: {bio_plus_website[:100]}... " @@ -75,7 +104,28 @@ def check_data_for_spam(user: User, **args): f"It took {duration:.2f} seconds to check. " f"gpt_was_used: {gpt_was_used}" ) - return idenficated_as_spam, reasoning + return identified_as_spam, reasoning + + +def check_profile_update_for_spam( + user: User, valid_serializer: UserUpdateProfileSerializer +) -> tuple[bool, str]: + days_since_joined = (timezone.now() - user.date_joined).days + days_since_joined_threshold = 7 + request_data = cast(dict, valid_serializer.validated_data) + + if days_since_joined > days_since_joined_threshold: + identified_as_spam = False + reasoning = ( + "The user has been a member for more than " + f"{days_since_joined_threshold} days" + ) + else: + identified_as_spam, reasoning = check_profile_data_for_spam( + user, **request_data + ) + + return identified_as_spam, reasoning async def ask_gpt_to_check_profile_for_spam( diff --git a/users/views.py b/users/views.py index 900cc72db..621a387f5 100644 --- a/users/views.py +++ b/users/views.py @@ -478,7 +478,8 @@ def update_profile_api_view(request: Request) -> Response: send_deactivation_email(user.email) return Response( data={ - "message": "This bio seems to be spam. Please contact support@metaculus.com if you believe this was a mistake.", + "message": "This bio seems to be spam. Please contact " + "support@metaculus.com if you believe this was a mistake.", "error_code": "SPAM_DETECTED", }, status=status.HTTP_403_FORBIDDEN, From f640fe4013b585752b14ef677612ca62bf737289 Mon Sep 17 00:00:00 2001 From: lsabor Date: Tue, 31 Dec 2024 09:50:24 -0800 Subject: [PATCH 4/4] fix misnamed variable --- users/services/spam_detection.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/users/services/spam_detection.py b/users/services/spam_detection.py index a53c3402c..bda51d687 100644 --- a/users/services/spam_detection.py +++ b/users/services/spam_detection.py @@ -89,7 +89,7 @@ def check_profile_data_for_spam(user: User, **args): identified_as_spam = True reasoning = "Bio is more than 17500 characters" else: - idenficated_as_spam, reasoning = asyncio.run( + identified_as_spam, reasoning = asyncio.run( ask_gpt_to_check_profile_for_spam(bio_plus_website) ) gpt_was_used = True