Skip to content

Commit

Permalink
add first comment check
Browse files Browse the repository at this point in the history
  • Loading branch information
lsabor committed Dec 26, 2024
1 parent b599e30 commit 36847a8
Show file tree
Hide file tree
Showing 4 changed files with 98 additions and 25 deletions.
24 changes: 23 additions & 1 deletion comments/views.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,11 @@
from notifications.services import NotificationCommentReport, NotificationPostParams
from posts.services.common import get_post_permission_for_user
from projects.permissions import ObjectPermission
from users.models import User
from users.services.spam_detection import (
check_new_comment_for_spam,
send_deactivation_email,
)


class RootCommentsPagination(LimitOffsetPagination):
Expand Down Expand Up @@ -119,7 +124,7 @@ def comment_delete_api_view(request: Request, pk: int):
@permission_classes([IsAuthenticated])
@transaction.non_atomic_requests
def comment_create_api_view(request: Request):
user = request.user
user: User = request.user
serializer = CommentWriteSerializer(data=request.data)
serializer.is_valid(raise_exception=True)

Expand All @@ -143,6 +148,23 @@ def comment_create_api_view(request: Request):
else None
)

# Check for spam
is_spam, _ = check_new_comment_for_spam(
user=user, comment_text=serializer.validated_data["text"]
)

if is_spam:
user.mark_as_spam()
send_deactivation_email(user.email)
return Response(
data={
"message": "This comment seems to be spam. Please contact "
"support@metaculus.com if you believe this was a mistake.",
"error_code": "SPAM_DETECTED",
},
status=status.HTTP_403_FORBIDDEN,
)

new_comment = create_comment(
**serializer.validated_data, included_forecast=forecast, user=user
)
Expand Down
4 changes: 2 additions & 2 deletions users/admin.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@

from users.models import User, UserCampaignRegistration
from users.services.spam_detection import (
check_data_for_spam,
check_profile_data_for_spam,
send_deactivation_email,
)
from questions.models import Forecast
Expand Down Expand Up @@ -223,7 +223,7 @@ def hard_delete_selected(self, request, queryset: QuerySet[User]):

def run_profile_spam_detection_on_selected(self, request, queryset: QuerySet[User]):
for user in queryset:
is_spam, _ = check_data_for_spam(
is_spam, _ = check_profile_data_for_spam(
user=user,
bio=user.bio,
website=user.website,
Expand Down
92 changes: 71 additions & 21 deletions users/services/spam_detection.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import re
import asyncio
import textwrap
from typing import cast
Expand All @@ -15,26 +16,54 @@
logger = logging.getLogger(__name__)


def check_profile_update_for_spam(
user: User, valid_serializer: UserUpdateProfileSerializer
) -> tuple[bool, str]:
days_since_joined = (timezone.now() - user.date_joined).days
days_since_joined_threshold = 7
request_data = cast(dict, valid_serializer.validated_data)
def check_comment_data_for_spam(user: User, comment_text: str) -> tuple[bool, str]:
    """Flag a comment as spam when an @ mention in it links to an external URL.

    Mentions are expected in the markdown-like form ``@[label](target)``.
    A mention is considered improper when its target starts with an
    ``http://`` / ``https://`` scheme instead of being an internal
    ``/accounts/profile/<id>/`` path.

    This function only inspects the text and writes a log line; it does not
    modify the user.

    Args:
        user: Author of the comment. Only ``username`` and ``id`` are read,
            and only for the log message.
        comment_text: Raw text of the comment being checked.

    Returns:
        ``(identified_as_spam, reasoning)``. ``reasoning`` is an empty string
        when the comment is not flagged.
    """
    start_time = time.time()
    identified_as_spam = False
    reasoning = ""
    # No GPT call happens on this path; the flag is still reported in the log.
    gpt_was_used = False

    # Identify improper mentions
    # @ mentions with external links
    external_mention_pattern = r"@\[.*?\]\(https?://[^\s]+?\s*.*?\)"
    # proper internal mentions
    internal_mention_pattern = r"@\[.*?\]\(/accounts/profile/\d+/?\)"
    # all @ mentions in the comment
    all_mentions = re.findall(r"@\[[^\]]+\]\([^\)]+\)", comment_text)
    # URL schemes are case-insensitive, so "HTTPS://..." must be caught as
    # well; without IGNORECASE an upper-cased scheme bypasses the check.
    improper_mentions = [
        mention
        for mention in all_mentions
        if re.match(external_mention_pattern, mention, re.IGNORECASE)
        and not re.match(internal_mention_pattern, mention)
    ]
    if improper_mentions:
        identified_as_spam = True
        reasoning = "Comment contains improper @ mentions with external links"

    end_time = time.time()
    duration = end_time - start_time

    if identified_as_spam:
        logger.info(
            f"User: {user.username} ID: {user.id} was soft deleted "
            f"for spam comment: {comment_text[:100]}... "
            f"The reason was: {reasoning[:100]}... "
            f"It took {duration:.2f} seconds to check. "
            f"gpt_was_used: {gpt_was_used}"
        )
    return identified_as_spam, reasoning


def check_new_comment_for_spam(user: User, comment_text: str) -> tuple[bool, str]:
    """Run the comment spam check only for a user's first comment.

    Args:
        user: Author of the comment about to be created.
        comment_text: Raw text of that comment.

    Returns:
        ``(identified_as_spam, reasoning)``. Users who already have at least
        one comment are never flagged here; otherwise the result of
        ``check_comment_data_for_spam`` is returned unchanged.
    """
    # Only existence matters, so avoid counting every comment of the user.
    if user.comment_set.exists():
        return False, "User has already posted a comment"

    return check_comment_data_for_spam(user, comment_text)


def check_data_for_spam(user: User, **args):
def check_profile_data_for_spam(user: User, **args):
bio: str | None = args.get("bio")
website: str | None = args.get("website")
if bio and website:
Expand All @@ -47,35 +76,56 @@ def check_data_for_spam(user: User, **args):
bio_plus_website = ""

start_time = time.time()
idenficated_as_spam = False
identified_as_spam = False
reasoning = ""
gpt_was_used = False
if not bio_plus_website:
idenficated_as_spam = False
identified_as_spam = False
reasoning = "No bio to check for spam"
elif len(bio_plus_website) < 10:
idenficated_as_spam = False
identified_as_spam = False
reasoning = "Bio is too short to be spam"
elif len(bio_plus_website) > 17500:
idenficated_as_spam = True
identified_as_spam = True
reasoning = "Bio is more than 17500 characters"
else:
idenficated_as_spam, reasoning = asyncio.run(
identified_as_spam, reasoning = asyncio.run(
ask_gpt_to_check_profile_for_spam(bio_plus_website, user.email)
)
gpt_was_used = True
end_time = time.time()
duration = end_time - start_time

if idenficated_as_spam:
if identified_as_spam:
logger.info(
f"User: {user.username} ID: {user.id} was soft deleted "
f"for spam bio: {bio_plus_website[:100]}... "
f"The reason was: {reasoning[:100]}... "
f"It took {duration:.2f} seconds to check. "
f"gpt_was_used: {gpt_was_used}"
)
return idenficated_as_spam, reasoning
return identified_as_spam, reasoning


def check_profile_update_for_spam(
    user: User, valid_serializer: UserUpdateProfileSerializer
) -> tuple[bool, str]:
    """Spam-check a profile update, but only for recently joined accounts.

    Args:
        user: The account whose profile is being updated; ``date_joined`` is
            compared against the current time.
        valid_serializer: Serializer whose ``validated_data`` is forwarded as
            keyword arguments to ``check_profile_data_for_spam``.

    Returns:
        ``(identified_as_spam, reasoning)``. Accounts older than the
        threshold are never flagged.
    """
    days_since_joined_threshold = 7
    account_age_days = (timezone.now() - user.date_joined).days
    # Read the validated payload up front, before branching on account age.
    profile_fields = cast(dict, valid_serializer.validated_data)

    if account_age_days > days_since_joined_threshold:
        return False, (
            "The user has been a member for more than "
            f"{days_since_joined_threshold} days"
        )

    return check_profile_data_for_spam(user, **profile_fields)


async def ask_gpt_to_check_profile_for_spam(
Expand Down
3 changes: 2 additions & 1 deletion users/views.py
Original file line number Diff line number Diff line change
Expand Up @@ -478,7 +478,8 @@ def update_profile_api_view(request: Request) -> Response:
send_deactivation_email(user.email)
return Response(
data={
"message": "This bio seems to be spam. Please contact support@metaculus.com if you believe this was a mistake.",
"message": "This bio seems to be spam. Please contact "
"support@metaculus.com if you believe this was a mistake.",
"error_code": "SPAM_DETECTED",
},
status=status.HTTP_403_FORBIDDEN,
Expand Down

0 comments on commit 36847a8

Please sign in to comment.