Merge branch 'main' into 1983-new_reco_pages
GresilleSiffle committed Dec 5, 2024
2 parents 0c37963 + 59f57e7 commit c8d84a1
Showing 79 changed files with 2,101 additions and 1,753 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/backend-ci.yml
@@ -38,7 +38,7 @@ jobs:
       - uses: actions/checkout@v3
 
       - name: Set up Python
-        uses: actions/setup-python@v2
+        uses: actions/setup-python@v5
         with:
           python-version: '3.9'
 
1 change: 1 addition & 0 deletions .github/workflows/e2e.yml
@@ -55,6 +55,7 @@ jobs:
         with:
           working-directory: tests
           browser: chrome
+          headed: true # necessary to reflect the correct behavior of media queries (e.g. in ComparisonInput)
           spec: "cypress/e2e/frontend/**/*"
 
       - name: Print dev-env logs
104 changes: 26 additions & 78 deletions backend/ml/inputs.py
@@ -2,94 +2,55 @@
 from typing import Optional
 
 import pandas as pd
-from django.db.models import Case, F, Q, QuerySet, When
-from django.db.models.expressions import RawSQL
-from solidago.pipeline import TournesolInput
+from django.db.models import F, Q
+from solidago.pipeline import PipelineInput
 
 from core.models import User
 from tournesol.models import (
     ComparisonCriteriaScore,
     ContributorRating,
     ContributorRatingCriteriaScore,
     ContributorScaling,
-    Entity,
 )
 from vouch.models import Voucher
 
 
-class MlInputFromDb(TournesolInput):
-    SCALING_CALIBRATION_MIN_ENTITIES_TO_COMPARE = 20
-
+class MlInputFromDb(PipelineInput):
     def __init__(self, poll_name: str):
         self.poll_name = poll_name
 
-    def get_scaling_calibration_users(self) -> QuerySet[User]:
-        n_alternatives = (
-            Entity.objects.filter(comparisons_entity_1__poll__name=self.poll_name)
-            .union(Entity.objects.filter(comparisons_entity_2__poll__name=self.poll_name))
-            .count()
-        )
-        users = User.objects.alias(
-            n_compared_entities=RawSQL(
-                """
-                SELECT COUNT(DISTINCT e.id)
-                FROM tournesol_entity e
-                INNER JOIN tournesol_comparison c
-                    ON (c.entity_1_id = e.id OR c.entity_2_id = e.id)
-                INNER JOIN tournesol_poll p
-                    ON (p.id = c.poll_id AND p.name = %s)
-                WHERE c.user_id = "core_user"."id"
-                """,
-                (self.poll_name,),
-            )
-        )
-        if n_alternatives <= self.SCALING_CALIBRATION_MIN_ENTITIES_TO_COMPARE:
-            # The number of alternatives is low enough to consider as calibration users
-            # all trusted users who have compared all alternatives.
-            return users.filter(
-                is_active=True,
-                trust_score__gt=self.SCALING_CALIBRATION_MIN_TRUST_SCORE,
-                n_compared_entities__gte=n_alternatives,
-            )
-
-        return users.filter(
-            is_active=True,
-            trust_score__gt=self.SCALING_CALIBRATION_MIN_TRUST_SCORE,
-            n_compared_entities__gte=self.SCALING_CALIBRATION_MIN_ENTITIES_TO_COMPARE,
-        ).order_by("-n_compared_entities")[: self.MAX_SCALING_CALIBRATION_USERS]
-
-    def get_comparisons(self, criteria=None, user_id=None) -> pd.DataFrame:
+    def get_comparisons(self, criterion=None, user_id=None) -> pd.DataFrame:
         scores_queryset = ComparisonCriteriaScore.objects.filter(
             comparison__poll__name=self.poll_name,
             comparison__user__is_active=True,
         )
-        if criteria is not None:
-            scores_queryset = scores_queryset.filter(criteria=criteria)
+        if criterion is not None:
+            scores_queryset = scores_queryset.filter(criteria=criterion)
 
         if user_id is not None:
            scores_queryset = scores_queryset.filter(comparison__user_id=user_id)
 
         values = scores_queryset.values(
             "score",
             "score_max",
-            "criteria",
             "weight",
+            criterion=F("criteria"),
             entity_a=F("comparison__entity_1_id"),
             entity_b=F("comparison__entity_2_id"),
             user_id=F("comparison__user_id"),
         )
         if len(values) > 0:
             dtf = pd.DataFrame(values)
             return dtf[
-                ["user_id", "entity_a", "entity_b", "criteria", "score", "score_max", "weight"]
+                ["user_id", "entity_a", "entity_b", "criterion", "score", "score_max", "weight"]
             ]
 
         return pd.DataFrame(
             columns=[
                 "user_id",
                 "entity_a",
                 "entity_b",
-                "criteria",
+                "criterion",
                 "score",
                 "score_max",
                 "weight",
@@ -100,33 +61,19 @@ def get_comparisons(self, criteria=None, user_id=None) -> pd.DataFrame:
     def ratings_properties(self):
-        # This makes sure that `get_scaling_calibration_users()` is evaluated separately, as the
-        # table names mentionned in its RawSQL query could conflict with the current queryset.
-        scaling_calibration_user_ids = list(self.get_scaling_calibration_users().values_list("id"))
-        values = (
-            ContributorRating.objects.filter(
-                poll__name=self.poll_name,
-            )
-            .annotate(
-                is_scaling_calibration_user=Case(
-                    When(user__in=scaling_calibration_user_ids, then=True),
-                    default=False,
-                ),
-            )
-            .values(
-                "user_id",
-                "entity_id",
-                "is_public",
-                "is_scaling_calibration_user",
-                trust_score=F("user__trust_score"),
-            )
+        values = ContributorRating.objects.filter(
+            poll__name=self.poll_name,
+        ).values(
+            "user_id",
+            "entity_id",
+            "is_public",
         )
         if len(values) == 0:
             return pd.DataFrame(
                 columns=[
                     "user_id",
                     "entity_id",
                     "is_public",
-                    "is_scaling_calibration_user",
-                    "trust_score",
                 ]
             )
         return pd.DataFrame(values)
@@ -136,7 +83,7 @@ def get_user_scalings(self, user_id=None) -> pd.DataFrame:
         Returns:
         - ratings_df: DataFrame with columns
             * `user_id`: int
-            * `criteria`: str
+            * `criterion`: str
             * `scale`: float
             * `scale_uncertainty`: float
             * `translation`: float
@@ -148,17 +95,18 @@ def get_user_scalings(self, user_id=None) -> pd.DataFrame:
             scalings = scalings.filter(user_id=user_id)
         values = scalings.values(
             "user_id",
-            "criteria",
             "scale",
             "scale_uncertainty",
             "translation",
             "translation_uncertainty",
+            criterion=F("criteria"),
+
         )
         if len(values) == 0:
             return pd.DataFrame(
                 columns=[
                     "user_id",
-                    "criteria",
+                    "criterion",
                     "scale",
                     "scale_uncertainty",
                     "translation",
@@ -168,28 +116,28 @@ def get_user_scalings(self, user_id=None) -> pd.DataFrame:
         return pd.DataFrame(values)
 
     def get_individual_scores(
-        self, criteria: Optional[str] = None, user_id: Optional[int] = None
+        self, user_id: Optional[int] = None, criterion: Optional[str] = None,
     ) -> pd.DataFrame:
         scores_queryset = ContributorRatingCriteriaScore.objects.filter(
             contributor_rating__poll__name=self.poll_name,
             contributor_rating__user__is_active=True,
         )
-        if criteria is not None:
-            scores_queryset = scores_queryset.filter(criteria=criteria)
+        if criterion is not None:
+            scores_queryset = scores_queryset.filter(criteria=criterion)
         if user_id is not None:
             scores_queryset = scores_queryset.filter(contributor_rating__user_id=user_id)
 
         values = scores_queryset.values(
             "raw_score",
-            "criteria",
-            entity=F("contributor_rating__entity_id"),
+            criterion=F("criteria"),
+            entity_id=F("contributor_rating__entity_id"),
             user_id=F("contributor_rating__user_id"),
         )
         if len(values) == 0:
-            return pd.DataFrame(columns=["user_id", "entity", "criteria", "raw_score"])
+            return pd.DataFrame(columns=["user_id", "entity_id", "criterion", "raw_score"])
 
         dtf = pd.DataFrame(values)
-        return dtf[["user_id", "entity", "criteria", "raw_score"]]
+        return dtf[["user_id", "entity_id", "criterion", "raw_score"]]
 
     def get_vouches(self):
         values = Voucher.objects.filter(
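Note: the recurring change in this file is a rename at the DataFrame boundary. The Django model field is still called `criteria`, but every frame handed to the solidago pipeline now exposes the column as `criterion`, via the `criterion=F("criteria")` alias in `.values()`. A minimal pandas sketch of the resulting `get_comparisons` schema (the sample row is hypothetical):

import pandas as pd

# Hypothetical sample row mimicking what get_comparisons() now returns:
# the DB column "criteria" surfaces as "criterion" in the DataFrame.
values = [
    {"user_id": 1, "entity_a": 101, "entity_b": 102,
     "criterion": "largely_recommended", "score": 4, "score_max": 10, "weight": 1.0},
]
dtf = pd.DataFrame(values)
dtf = dtf[["user_id", "entity_a", "entity_b", "criterion", "score", "score_max", "weight"]]

# The empty case keeps the same schema, so downstream column selection
# does not raise KeyError when no comparison matches the filters.
empty = pd.DataFrame(
    columns=["user_id", "entity_a", "entity_b", "criterion", "score", "score_max", "weight"]
)
assert list(empty.columns) == list(dtf.columns)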
2 changes: 2 additions & 0 deletions backend/ml/management/commands/ml_train.py
@@ -1,5 +1,6 @@
 import os
 from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor, as_completed
+from functools import cache
 
 from django import db
 from django.conf import settings
@@ -18,6 +19,7 @@
 from tournesol.models.poll import ALGORITHM_MEHESTAN, DEFAULT_POLL_NAME
 
 
+@cache
 def get_solidago_pipeline(run_trust_propagation: bool = True):
     if run_trust_propagation:
         trust_algo = LipschiTrust()
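Note: `functools.cache` memoizes the factory per distinct argument, so the pipeline is built once per value of `run_trust_propagation` and the same object is reused on later calls (this file fans work out to thread/process pools, per the imports above). A standalone sketch of the semantics, with the real pipeline construction stubbed out:

from functools import cache

@cache
def get_pipeline(run_trust_propagation: bool = True):
    # Stand-in for the real solidago pipeline construction: this body
    # runs only on the first call for each distinct argument value.
    return object()

assert get_pipeline(True) is get_pipeline(True)        # cached: same instance
assert get_pipeline(True) is not get_pipeline(False)   # one cache entry per argument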
7 changes: 0 additions & 7 deletions backend/ml/mehestan/parameters.py

This file was deleted.

108 changes: 0 additions & 108 deletions backend/ml/mehestan/run.py

This file was deleted.


0 comments on commit c8d84a1
