Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feat/1825 comment spam detection phase 1 #1826

Open
wants to merge 3 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 23 additions & 1 deletion comments/views.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,11 @@
from notifications.services import NotificationCommentReport, NotificationPostParams
from posts.services.common import get_post_permission_for_user
from projects.permissions import ObjectPermission
from users.models import User
from users.services.spam_detection import (
check_new_comment_for_spam,
send_deactivation_email,
)


class RootCommentsPagination(LimitOffsetPagination):
Expand Down Expand Up @@ -119,7 +124,7 @@ def comment_delete_api_view(request: Request, pk: int):
@permission_classes([IsAuthenticated])
@transaction.non_atomic_requests
def comment_create_api_view(request: Request):
user = request.user
user: User = request.user
serializer = CommentWriteSerializer(data=request.data)
serializer.is_valid(raise_exception=True)

Expand All @@ -143,6 +148,23 @@ def comment_create_api_view(request: Request):
else None
)

# Check for spam
is_spam, _ = check_new_comment_for_spam(
user=user, comment_text=serializer.validated_data["text"]
)

if is_spam:
user.mark_as_spam()
send_deactivation_email(user.email)
return Response(
data={
"message": "This comment seems to be spam. Please contact "
"support@metaculus.com if you believe this was a mistake.",
"error_code": "SPAM_DETECTED",
},
status=status.HTTP_403_FORBIDDEN,
)

new_comment = create_comment(
**serializer.validated_data, included_forecast=forecast, user=user
)
Expand Down
23 changes: 22 additions & 1 deletion users/admin.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,10 @@
from django.db.models import Count, Exists, OuterRef, Q, F, QuerySet

from users.models import User, UserCampaignRegistration
from users.services.spam_detection import (
check_profile_data_for_spam,
send_deactivation_email,
)
from questions.models import Forecast


Expand Down Expand Up @@ -126,7 +130,12 @@ class UserAdmin(admin.ModelAdmin):
"bio_length",
]
can_delete = False
actions = ["mark_selected_as_spam", "soft_delete_selected", "hard_delete_selected"]
actions = [
"mark_selected_as_spam",
"soft_delete_selected",
"hard_delete_selected",
"run_profile_spam_detection_on_selected",
]
search_fields = ["username", "email", "pk"]
list_filter = [
"is_active",
Expand Down Expand Up @@ -212,6 +221,18 @@ def soft_delete_selected(self, request, queryset: QuerySet[User]):
def hard_delete_selected(self, request, queryset: QuerySet[User]):
queryset.delete()

def run_profile_spam_detection_on_selected(self, request, queryset: QuerySet[User]):
    """
    Admin action: run profile spam detection on every selected user.

    Each user's bio and website are passed to `check_profile_data_for_spam`.
    Users flagged as spam are marked as spam and sent the deactivation
    email. A summary message is shown to the admin afterwards.
    """
    checked_count = 0
    flagged_count = 0

    for user in queryset:
        checked_count += 1
        is_spam, _ = check_profile_data_for_spam(
            user=user,
            bio=user.bio,
            website=user.website,
        )
        if not is_spam:
            continue

        user.mark_as_spam()
        send_deactivation_email(user.email)
        flagged_count += 1

    # Without feedback the admin cannot tell whether the action flagged
    # anyone (or ran at all), so report the outcome in the changelist.
    self.message_user(
        request,
        f"Checked {checked_count} user(s); marked {flagged_count} as spam.",
    )


@admin.register(UserCampaignRegistration)
class UserCampaignRegistrationAdmin(admin.ModelAdmin):
Expand Down
105 changes: 84 additions & 21 deletions users/services/spam_detection.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import re
import asyncio
import textwrap
from typing import cast
Expand All @@ -11,18 +12,60 @@
from utils.openai import generate_text_async
from misc.tasks import send_email_async
import time

logger = logging.getLogger(__name__)


def check_profile_update_for_spam(
user: User, valid_serializer: UserUpdateProfileSerializer
) -> tuple[bool, str]:
days_since_joined = (timezone.now() - user.date_joined).days
days_since_joined_threshold = 7
request_data = cast(dict, valid_serializer.validated_data)
# Any markdown-style @ mention: "@[label](target)". Group 1 is the target.
_COMMENT_MENTION_RE = re.compile(r"@\[[^\]]+\]\(([^\)]+)\)")
# A mention target that points off-site: an http(s) URL in any letter case,
# or a protocol-relative "//host" URL. Proper internal mentions point to
# "/accounts/profile/<id>/" and therefore never match this pattern.
_EXTERNAL_MENTION_TARGET_RE = re.compile(r"(?:https?:)?//\S", re.IGNORECASE)


def check_comment_data_for_spam(user: User, comment_text: str) -> tuple[bool, str]:
    """
    Check the text of a comment for spam markers.

    A comment is currently identified as spam when it contains at least one
    @ mention whose link target is an external URL instead of an internal
    profile link.

    This function only detects and logs; the caller decides what to do with
    a user whose comment was flagged.

    :param user: author of the comment; only used for logging
    :param comment_text: raw text of the comment
    :return: tuple of (identified_as_spam, reasoning)
    """
    start_time = time.time()
    identified_as_spam = False
    reasoning = ""
    # No GPT check is performed for comments; the flag is kept so the log
    # line has the same shape as the one emitted by the profile spam check.
    gpt_was_used = False

    # Collect @ mentions whose target is an external link. The scheme is
    # matched case-insensitively and protocol-relative URLs are included so
    # that "HTTPS://..." or "//host" targets cannot bypass the check.
    improper_mentions = [
        mention.group(0)
        for mention in _COMMENT_MENTION_RE.finditer(comment_text)
        if _EXTERNAL_MENTION_TARGET_RE.match(mention.group(1))
    ]
    if improper_mentions:
        identified_as_spam = True
        reasoning = "Comment contains improper @ mentions with external links"

    duration = time.time() - start_time

    if identified_as_spam:
        logger.info(
            f"User: {user.username} ID: {user.id} was soft deleted "
            f"for spam comment: {comment_text[:100]}... "
            f"The reason was: {reasoning[:100]}... "
            f"It took {duration:.2f} seconds to check. "
            f"gpt_was_used: {gpt_was_used}"
        )
    return identified_as_spam, reasoning

bio: str | None = request_data.get("bio")
website: str | None = request_data.get("website")

def check_new_comment_for_spam(user: User, comment_text: str) -> tuple[bool, str]:
    """
    Check a newly submitted comment for spam.

    Only the first comment of a user is inspected: users who already have
    at least one comment are not checked.

    :param user: author of the new comment
    :param comment_text: raw text of the new comment
    :return: tuple of (identified_as_spam, reasoning)
    """
    # exists() lets the database stop at the first matching row instead of
    # counting every comment the user has ever posted.
    if user.comment_set.exists():
        return False, "User has already posted a comment"

    return check_comment_data_for_spam(user, comment_text)


def check_profile_data_for_spam(user: User, **args):
bio: str | None = args.get("bio")
website: str | None = args.get("website")
if bio and website:
bio_plus_website = f"{bio}\n\nWebsite: {website}"
elif not bio and website:
Expand All @@ -33,36 +76,56 @@ def check_profile_update_for_spam(
bio_plus_website = ""

start_time = time.time()
idenficated_as_spam = False
identified_as_spam = False
reasoning = ""
gpt_was_used = False
if not bio_plus_website:
idenficated_as_spam = False
identified_as_spam = False
reasoning = "No bio to check for spam"
elif len(bio_plus_website) < 10:
idenficated_as_spam = False
identified_as_spam = False
reasoning = "Bio is too short to be spam"
elif days_since_joined > days_since_joined_threshold:
idenficated_as_spam = False
reasoning = f"The user has been a member for more than {days_since_joined_threshold} days"
elif len(bio_plus_website) > 17500:
idenficated_as_spam = True
identified_as_spam = True
reasoning = "Bio is more than 17500 characters"
else:
idenficated_as_spam, reasoning = asyncio.run(
identified_as_spam, reasoning = asyncio.run(
ask_gpt_to_check_profile_for_spam(bio_plus_website, user.email)
)
gpt_was_used = True
end_time = time.time()
duration = end_time - start_time

if idenficated_as_spam:
if identified_as_spam:
logger.info(
f"User: {user.username} ID: {user.id} was soft deleted for spam bio: {bio_plus_website[:100]}... "
f"User: {user.username} ID: {user.id} was soft deleted "
f"for spam bio: {bio_plus_website[:100]}... "
f"The reason was: {reasoning[:100]}... "
f"It took {duration:.2f} seconds to check. gpt_was_used: {gpt_was_used}"
f"It took {duration:.2f} seconds to check. "
f"gpt_was_used: {gpt_was_used}"
)
return identified_as_spam, reasoning


def check_profile_update_for_spam(
user: User, valid_serializer: UserUpdateProfileSerializer
) -> tuple[bool, str]:
days_since_joined = (timezone.now() - user.date_joined).days
days_since_joined_threshold = 7
request_data = cast(dict, valid_serializer.validated_data)

if days_since_joined > days_since_joined_threshold:
identified_as_spam = False
reasoning = (
"The user has been a member for more than "
f"{days_since_joined_threshold} days"
)
else:
identified_as_spam, reasoning = check_profile_data_for_spam(
user, **request_data
)
return idenficated_as_spam, reasoning

return identified_as_spam, reasoning


async def ask_gpt_to_check_profile_for_spam(
Expand Down Expand Up @@ -104,7 +167,7 @@ async def ask_gpt_to_check_profile_for_spam(
system_prompt=system_prompt,
prompt=prompt,
temperature=0,
timeout=7
timeout=7,
)
is_spam = "TRUE" in gpt_response
except Exception as e:
Expand Down
12 changes: 6 additions & 6 deletions users/views.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
from datetime import timedelta
import numpy as np
import logging
from typing import cast

from django.contrib.auth.password_validation import validate_password
from django.utils import timezone
Expand Down Expand Up @@ -467,19 +466,20 @@ def change_username_api_view(request: Request):
@api_view(["PATCH"])
def update_profile_api_view(request: Request) -> Response:
user: User = request.user
serializer = UserUpdateProfileSerializer(user, data=request.data, partial=True)
serializer: UserUpdateProfileSerializer = UserUpdateProfileSerializer(
user, data=request.data, partial=True
)
serializer.is_valid(raise_exception=True)

is_spam, _ = check_profile_update_for_spam(
user, cast(UserUpdateProfileSerializer, serializer)
)
is_spam, _ = check_profile_update_for_spam(user, serializer)

if is_spam:
user.mark_as_spam()
send_deactivation_email(user.email)
return Response(
data={
"message": "This bio seems to be spam. Please contact support@metaculus.com if you believe this was a mistake.",
"message": "This bio seems to be spam. Please contact "
"support@metaculus.com if you believe this was a mistake.",
"error_code": "SPAM_DETECTED",
},
status=status.HTTP_403_FORBIDDEN,
Expand Down
Loading