From c42e9b461071fc0fabe795116c2384f4ce1c3d8d Mon Sep 17 00:00:00 2001 From: mikrise2 Date: Thu, 29 Feb 2024 14:21:24 +0100 Subject: [PATCH 1/6] added filter --- jba/README.md | 20 +++++++ jba/src/processing/tasktracker_task_filter.py | 53 +++++++++++++++++++ pyproject.toml | 1 + 3 files changed, 74 insertions(+) create mode 100644 jba/src/processing/tasktracker_task_filter.py diff --git a/jba/README.md b/jba/README.md index 90841ac..8f951b8 100644 --- a/jba/README.md +++ b/jba/README.md @@ -100,6 +100,26 @@ docker run hyperstyle-analysis-prod: poetry run tasktracker_content_col - `course_sources_path` — Path to course sources to extract course structure. - `destination_path` — Path to directory where yaml file will be created. + +4.[tasktracker_task_filter.py](src/processing/tasktracker_task_filter.py) filtering data by the start research date. Grouping by package name. + +### Usage + +Execute one of the following commands with necessary arguments: +```bash +poetry run tasktracker_task_filter [arguments] +``` +or +```bash +docker run hyperstyle-analysis-prod: poetry run tasktracker_task_filter [arguments] +``` + +**Required arguments**: + +- `document_path` — Path to documentdata.csv. +- `destination_path` — Path of the file to save filtered data. +- `start_date` — Start of the research in the DD.MM.YYYY format. + ---- # Simple general statistics diff --git a/jba/src/processing/tasktracker_task_filter.py b/jba/src/processing/tasktracker_task_filter.py new file mode 100644 index 0000000..e0ff562 --- /dev/null +++ b/jba/src/processing/tasktracker_task_filter.py @@ -0,0 +1,53 @@ +import argparse +from pathlib import Path +import pandas as pd +from datetime import datetime +from pytz import UTC + +from core.src.utils.df_utils import read_df, write_df + +PACKAGE_PATTERN = r'package ([\w\.]+)' +FRAGMENT_COLUMN = 'fragment' +TASK_COLUMN = 'task' +DATE_COLUMN = 'date' +DATE_FORMAT = '%d.%m.%Y' + + +def configure_parser(parser: argparse.ArgumentParser) -> None: + parser.add_argument( + 'document_path', + type=lambda value: Path(value).absolute(), + help='Path to documentdata.csv.', + ) + + parser.add_argument( + 'destination_path', + type=lambda value: Path(value).absolute(), + help='Path of the file to save filtered data', + ) + + parser.add_argument( + 'start_date', + type=str, + help='Start of the research in the DD.MM.YYYY format', + ) + + +def filter_data(document_path: Path, destination_path: Path, date: str) -> None: + data = read_df(document_path) + data[DATE_COLUMN] = pd.to_datetime(data[DATE_COLUMN]) + filtered_data = data[data[DATE_COLUMN] > datetime.strptime(date, DATE_FORMAT).replace(tzinfo=UTC)].copy() + filtered_data[TASK_COLUMN] = filtered_data[FRAGMENT_COLUMN].str.extract(PACKAGE_PATTERN) + write_df(filtered_data, destination_path) + + +def main(): + parser = argparse.ArgumentParser() + configure_parser(parser) + + args = parser.parse_args() + filter_data(args.document_path, args.destination_path, args.start_date) + + +if __name__ == '__main__': + main() diff --git a/pyproject.toml b/pyproject.toml index e07f15e..8de27b2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -84,6 +84,7 @@ run_hyperstyle = 'data_labelling.src.hyperstyle.evaluate:main' # JBA scripts collect_course_structure = 'jba.src.processing.collect_course_structure:main' tasktracker_content_collector = 'jba.src.processing.tasktracker_content_collector:main' +tasktracker_task_filter = 'jba.src.processing.tasktracker_task_filter:main' process_course_data = 'jba.src.processing.data_processing:main' plot_task_attempt 
= 'jba.src.plots.task_attempt:main' plot_task_solving = 'jba.src.plots.task_solving:main' From 955ba8f32f7e5b7d08ea5b11b54cdb5539000420 Mon Sep 17 00:00:00 2001 From: mikrise2 Date: Thu, 29 Feb 2024 14:26:22 +0100 Subject: [PATCH 2/6] fixed flake8 --- jba/src/processing/tasktracker_task_filter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jba/src/processing/tasktracker_task_filter.py b/jba/src/processing/tasktracker_task_filter.py index e0ff562..9e75303 100644 --- a/jba/src/processing/tasktracker_task_filter.py +++ b/jba/src/processing/tasktracker_task_filter.py @@ -10,7 +10,7 @@ FRAGMENT_COLUMN = 'fragment' TASK_COLUMN = 'task' DATE_COLUMN = 'date' -DATE_FORMAT = '%d.%m.%Y' +DATE_FORMAT = '%d.%m.%Y' # noqa: WPS323 def configure_parser(parser: argparse.ArgumentParser) -> None: From 42880eb5dd34ab09bbfe840ec044bacd0dd851c7 Mon Sep 17 00:00:00 2001 From: mikrise2 Date: Thu, 29 Feb 2024 15:32:46 +0100 Subject: [PATCH 3/6] divide dataframe depending on was email presented in edu file or not. --- jba/README.md | 317 +++++++++++------- .../processing/tasktracker_edu_validation.py | 85 +++++ pyproject.toml | 1 + 3 files changed, 280 insertions(+), 123 deletions(-) create mode 100644 jba/src/processing/tasktracker_edu_validation.py diff --git a/jba/README.md b/jba/README.md index 8f951b8..1d90323 100644 --- a/jba/README.md +++ b/jba/README.md @@ -8,15 +8,19 @@ This module contains utilities to analyze data from JetBrains Marketplace. [This module](src/processing) contains a script to preprocess raw data to perform further analysis. -1. [collect_course_structure.py](src/processing/collect_course_structure.py) allows you to collect information about the course structure. +1. [collect_course_structure.py](src/processing/collect_course_structure.py) allows you to collect information about the + course structure. ### Usage Execute one of the following commands with necessary arguments: + ```bash poetry run collect_course_structure [arguments] ``` + or + ```bash docker run hyperstyle-analysis-prod: poetry run collect_course_structure [arguments] ``` @@ -26,7 +30,10 @@ docker run hyperstyle-analysis-prod: poetry run collect_course_structur - `course_sources_path` — Path to course sources to extract course structure. - `output_path` — Path to .csv file where to save the course structure. -After this step you will get a new file with the course structure, e.g. for the [course](tests/resources/processing/collect_course_structure/course_with_section) from the test folder with the following structure: +After this step you will get a new file with the course structure, e.g. for +the [course](tests/resources/processing/collect_course_structure/course_with_section) from the test folder with the +following structure: + ```text course-info.yaml course-remote-info.yaml @@ -56,9 +63,11 @@ section/ └── ... ``` -the [following](tests/resources/processing/collect_course_structure/expected_course_with_section.csv) file will be generated. +the [following](tests/resources/processing/collect_course_structure/expected_course_with_section.csv) file will be +generated. 2. 
[data_processing.py](src/processing/data_processing.py) allows you to process data from the previous step: + - Merge course data with task info - Add submission group - Add submission attempt @@ -66,31 +75,39 @@ the [following](tests/resources/processing/collect_course_structure/expected_cou ### Usage Execute one of the following commands with necessary arguments: + ```bash poetry run process_course_data [arguments] ``` + or + ```bash docker run hyperstyle-analysis-prod: poetry run process_course_data [arguments] ``` **Required arguments**: -- `course_data_path` — Path to .csv file with preprocessed data by [data_processing.py](src/processing/data_processing.py). -- `course_structure_path` — Path to .csv file with the course structure gathered by [prepare_course_data.py](src/processing/prepare_course_data.py). - -After this step you will get a new file with course data with `courseId_preprocessed` suffix. +- `course_data_path` — Path to .csv file with preprocessed data + by [data_processing.py](src/processing/data_processing.py). +- `course_structure_path` — Path to .csv file with the course structure gathered + by [prepare_course_data.py](src/processing/prepare_course_data.py). +After this step you will get a new file with course data with `courseId_preprocessed` suffix. -3. [tasktracker_content_collector.py](src/processing/tasktracker_content_collector.py) collects data from the course into the tasktracker task content file format +3. [tasktracker_content_collector.py](src/processing/tasktracker_content_collector.py) collects data from the course + into the tasktracker task content file format ### Usage Execute one of the following commands with necessary arguments: + ```bash poetry run tasktracker_content_collector [arguments] ``` + or + ```bash docker run hyperstyle-analysis-prod: poetry run tasktracker_content_collector [arguments] ``` @@ -100,16 +117,19 @@ docker run hyperstyle-analysis-prod: poetry run tasktracker_content_col - `course_sources_path` — Path to course sources to extract course structure. - `destination_path` — Path to directory where yaml file will be created. - -4.[tasktracker_task_filter.py](src/processing/tasktracker_task_filter.py) filtering data by the start research date. Grouping by package name. +4.[tasktracker_task_filter.py](src/processing/tasktracker_task_filter.py) filtering data by the start research date. +Grouping by package name. ### Usage Execute one of the following commands with necessary arguments: + ```bash poetry run tasktracker_task_filter [arguments] ``` + or + ```bash docker run hyperstyle-analysis-prod: poetry run tasktracker_task_filter [arguments] ``` @@ -120,6 +140,31 @@ docker run hyperstyle-analysis-prod: poetry run tasktracker_task_filter - `destination_path` — Path of the file to save filtered data. - `start_date` — Start of the research in the DD.MM.YYYY format. +5.[tasktracker_edu_validation.py](src/processing/tasktracker_edu_validation.py) Divide filtered tasktracker file to 2 +files - the first with emails that have been presented in the edu csv file, and the second file without them. + +### Usage + +Execute one of the following commands with necessary arguments: + +```bash +poetry run tasktracker_edu_validation [arguments] +``` + +or + +```bash +docker run hyperstyle-analysis-prod: poetry run tasktracker_edu_validation [arguments] +``` + +**Required arguments**: + +- `filtered_data` — Path to the data filtered by tasktracker_task_filter. +- `destination_path` — Path of the directory to save divided data. 
+- `users_file` — Tasktracker users.csv file. +- `researches_file` — Tasktracker researches.csv file. +- `edu_file` — Edu csv file. + ---- # Simple general statistics @@ -138,18 +183,21 @@ docker run hyperstyle-analysis-prod: poetry run tasktracker_task_filter ``` **Required arguments**: - - - `preprocessed_course_data_path` — Path to .csv file with preprocessed data by [data_processing.py](src/processing/data_processing.py). - - `course_structure_path` — Path to .csv file with the course structure gathered by [prepare_course_data.py](src/processing/prepare_course_data.py). - + + - `preprocessed_course_data_path` — Path to .csv file with preprocessed data + by [data_processing.py](src/processing/data_processing.py). + - `course_structure_path` — Path to .csv file with the course structure gathered + by [prepare_course_data.py](src/processing/prepare_course_data.py). + Optional arguments: - + | Argument | Description | - |-------------------------------------|---------------------------------------------| + |-------------------------------------|---------------------------------------------| | **‑‑course‑name** | Name of the course to display on the chart. | -2. [task_attempt.py](src/plots/task_attempt.py) allows you to plot line charts how many attempts students spend to solve the tasks from the course. +2. [task_attempt.py](src/plots/task_attempt.py) allows you to plot line charts how many attempts students spend to solve + the tasks from the course. Execute one of the following commands with necessary arguments: ```bash @@ -159,19 +207,22 @@ docker run hyperstyle-analysis-prod: poetry run tasktracker_task_filter ```bash docker run hyperstyle-analysis-prod: poetry run plot_task_attempt [arguments] ``` - + **Required arguments**: - - - `preprocessed_course_data_path` — Path to .csv file with preprocessed data by [data_processing.py](src/processing/data_processing.py). - - `course_structure_path` — Path to .csv file with the course structure gathered by [prepare_course_data.py](src/processing/prepare_course_data.py). - + + - `preprocessed_course_data_path` — Path to .csv file with preprocessed data + by [data_processing.py](src/processing/data_processing.py). + - `course_structure_path` — Path to .csv file with the course structure gathered + by [prepare_course_data.py](src/processing/prepare_course_data.py). + **Optional arguments**: - + | Argument | Description | - |-------------------------------------|---------------------------------------------| + |-------------------------------------|---------------------------------------------| | **‑‑course‑name** | Name of the course to display on the chart. | -3. [task_duplicates.py](src/plots/task_duplicates.py) allows you to plot line charts how many duplicate submissions students sent during tasks solving. +3. [task_duplicates.py](src/plots/task_duplicates.py) allows you to plot line charts how many duplicate submissions + students sent during tasks solving. Execute one of the following commands with necessary arguments: ```bash @@ -181,16 +232,18 @@ docker run hyperstyle-analysis-prod: poetry run tasktracker_task_filter ```bash docker run hyperstyle-analysis-prod: poetry run plot_task_duplicates [arguments] ``` - + **Required arguments**: - - - `preprocessed_course_data_path` — Path to .csv file with preprocessed data by [data_processing.py](src/processing/data_processing.py). - - `course_structure_path` — Path to .csv file with the course structure gathered by [prepare_course_data.py](src/processing/prepare_course_data.py). 
- + + - `preprocessed_course_data_path` — Path to .csv file with preprocessed data + by [data_processing.py](src/processing/data_processing.py). + - `course_structure_path` — Path to .csv file with the course structure gathered + by [prepare_course_data.py](src/processing/prepare_course_data.py). + **Optional arguments**: - + | Argument | Description | - |-------------------------------------|---------------------------------------------| + |-------------------------------------|---------------------------------------------| | **‑‑course‑name** | Name of the course to display on the chart. | Charts plotted with this module can be found in [this section](#visualization). @@ -201,11 +254,14 @@ Charts plotted with this module can be found in [this section](#visualization). [This module](src/test_logs) allows you to run tests and parse its output to perform further analysis. -**Note**: This module only works with courses that have a structure like in [Kotlin Onboarding](https://github.com/jetbrains-academy/kotlin-onboarding-introduction)! -This means that your task module name should have a specific format. For example, assume you have a task in the folder `Introduction/LastPush/CompleteTheProject`, +**Note**: This module only works with courses that have a structure like +in [Kotlin Onboarding](https://github.com/jetbrains-academy/kotlin-onboarding-introduction)! +This means that your task module name should have a specific format. For example, assume you have a task in the +folder `Introduction/LastPush/CompleteTheProject`, then your module should be named `Introduction-LastPush-CompleteTheProject` -1. [tests_runner.py](src/test_logs/tests_runner.py) allows you to run tests via Gradle and save Gradle logs to perform further parsing. +1. [tests_runner.py](src/test_logs/tests_runner.py) allows you to run tests via Gradle and save Gradle logs to perform + further parsing. Execute one of the following commands with necessary arguments: ```bash @@ -215,16 +271,16 @@ then your module should be named `Introduction-LastPush-CompleteTheProject` ```bash docker run hyperstyle-analysis-prod: poetry run gradle_tests_runner [arguments] ``` - + **Required arguments**: - - `submissions_path` — Path to .csv file with submissions. - - `course_sources_path` — Path to course sources. - - `logs_output_path` — Path to the folder to store logs. - + - `submissions_path` — Path to .csv file with submissions. + - `course_sources_path` — Path to course sources. + - `logs_output_path` — Path to the folder to store logs. + **Optional arguments**: - + | Argument | Description | - |--------------------------------------------------|-----------------------------------------------------------------------------| + |--------------------------------------------------|-----------------------------------------------------------------------------| | **‑‑timeout** | Timeout in seconds for subprocess to be executed. | | **‑‑n‑cpu** | Number of CPUs to use for parallel execution. | | **‑‑force‑ignore‑tests** | Force to ignore substitution of test files if they are visible to the user. | @@ -233,28 +289,29 @@ then your module should be named `Introduction-LastPush-CompleteTheProject` 2. [logs_parser.py](src/test_logs/logs_parser.py) allows you to parse Gradle logs into json strings: - - Gradle exceptions will match the structure of the [`ExceptionData`](src/models/edu_logs.py) dataclass. 
For example: - ```json - { - "path": "src/main/kotlin/Main.kt", - "line_number": 4, - "column_number": 9, - "message": "Conflicting declarations: val firstUserAnswer: String, val firstUserAnswer: String" - } - ``` - - Gradle tests will match the structure of the [`TestData`](src/models/edu_logs.py) dataclass. - ```json - { - "class_name": "Test", - "test": "testCountExactMatchesFunction()", - "method_name": "testCountExactMatchesFunction()", - "duration": "0s", - "result": "passed", - "error_class": null, - "message": null - } - ``` - + - Gradle exceptions will match the structure of the [`ExceptionData`](src/models/edu_logs.py) dataclass. For + example: + ```json + { + "path": "src/main/kotlin/Main.kt", + "line_number": 4, + "column_number": 9, + "message": "Conflicting declarations: val firstUserAnswer: String, val firstUserAnswer: String" + } + ``` + - Gradle tests will match the structure of the [`TestData`](src/models/edu_logs.py) dataclass. + ```json + { + "class_name": "Test", + "test": "testCountExactMatchesFunction()", + "method_name": "testCountExactMatchesFunction()", + "duration": "0s", + "result": "passed", + "error_class": null, + "message": null + } + ``` + Execute one of the following commands with necessary arguments: ```bash poetry run gradle_logs_parser [arguments] @@ -263,18 +320,18 @@ then your module should be named `Introduction-LastPush-CompleteTheProject` ```bash docker run hyperstyle-analysis-prod: poetry run gradle_logs_parser [arguments] ``` - + **Required arguments**: - - `submissions_path` — Path to .csv file with submissions. - - `gradle_logs_path` — Path to a folder with gradle logs. - + - `submissions_path` — Path to .csv file with submissions. + - `gradle_logs_path` — Path to a folder with gradle logs. + **Optional arguments**: - + | Argument | Description | - |-------------------------|------------------------------| + |-------------------------|------------------------------| | **‑‑debug** | Run the script in debug mode | -Charts plotted with this module can be found in [this section](#visualization). +Charts plotted with this module can be found in [this section](#visualization). --- @@ -283,25 +340,29 @@ Charts plotted with this module can be found in [this section](#visualization). [This module](src/inspections) allows you to gather IntelliJ inspections and also contains functions that are necessary for their visualization. -[gathering.py](src/inspections/gathering.py) allows you to gather IntelliJ inspections. +[gathering.py](src/inspections/gathering.py) allows you to gather IntelliJ inspections. Execute one of the following commands with necessary arguments: + ```bash poetry run inpsections_gathering [arguments] ``` + or + ```bash docker run hyperstyle-analysis-prod: poetry run inpsections_gathering [arguments] ``` **Required arguments**: + - `submissions_path` — Path to .csv file with submissions. - `--tool-path` — Path to the Hyperstyle entry point. -- `--language-version` — Language version of code snippets. +- `--language-version` — Language version of code snippets. - `--host` — Code quality server address. - `--port` — Code quality server port. - + **Optional arguments**: - + | Argument | Description | |------------------------------------------------|----------------------------------------------------------| | **‑‑venv** | Path to venv to run the tool. | @@ -312,80 +373,90 @@ docker run hyperstyle-analysis-prod: poetry run inpsections_gathering [ Charts plotted with this module can be found in [this section](#visualization). 
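For readers of the parsed-log format shown in the test-logs section above, a minimal sketch of loading one of those json strings back into Python may help. The `TestData` stand-in below only mirrors the field names from the json example; the real dataclass lives in [edu_logs.py](src/models/edu_logs.py) and may differ in details.

```python
import json
from dataclasses import dataclass
from typing import Optional


# Stand-in mirroring the TestData json example above; the actual dataclass
# in src/models/edu_logs.py may define extra fields or stricter types.
@dataclass
class TestData:
    class_name: str
    test: str
    method_name: str
    duration: str
    result: str
    error_class: Optional[str] = None
    message: Optional[str] = None


raw = (
    '{"class_name": "Test", "test": "testCountExactMatchesFunction()", '
    '"method_name": "testCountExactMatchesFunction()", "duration": "0s", '
    '"result": "passed", "error_class": null, "message": null}'
)

# json.loads yields a dict whose keys match the dataclass fields one-to-one.
test = TestData(**json.loads(raw))
print(test.result)  # passed
```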
--- + # Visualization -[This module](src/visualization) contains [Streamlit](https://streamlit.io/) app that visualizes all statistics gathered from students' submissions. +[This module](src/visualization) contains [Streamlit](https://streamlit.io/) app that visualizes all statistics gathered +from students' submissions. To run the app, you need to execute the following command: + ```bash poetry run streamlit run ./visualization/README.py ``` The app consists of several pages: + 1. [README](src/visualization/README.py) contains an application description and input fields for common resources: - 1. Submissions path — Path to .csv file with submissions, parsed test logs and inspections. - 2. Course structure path — Path to .csv file with course structure. + 1. Submissions path — Path to .csv file with submissions, parsed test logs and inspections. + 2. Course structure path — Path to .csv file with course structure. + +2. [General stats](src/visualization/pages/1_General_stats.py) contains charts from + the [Simple general statistics](#simple-general-statistics) module: + - Task attempts: -2. [General stats](src/visualization/pages/1_General_stats.py) contains charts from the [Simple general statistics](#simple-general-statistics) module: - - Task attempts: + - + - Task solving: - - Task solving: + - + - Task duplicates: - - Task duplicates: + - +3. [Group tests timeline](src/visualization/pages/2_Group_tests_timeline.py) contains charts from + the [Test logs analysis](#test-logs-analysis) module: + - General tests timeline: -3. [Group tests timeline](src/visualization/pages/2_Group_tests_timeline.py) contains charts from the [Test logs analysis](#test-logs-analysis) module: - - General tests timeline: - - - - - Parametrized tests timeline: + - + - Parametrized tests timeline: -4. [Aggregated timeline stats](src/visualization/pages/3_Aggregated_timeline_stats.py) contains charts from the [Test logs analysis](#test-logs-analysis) module: - - Average tests timeline: - - + - - Tests chain: - - +4. [Aggregated timeline stats](src/visualization/pages/3_Aggregated_timeline_stats.py) contains charts from + the [Test logs analysis](#test-logs-analysis) module: + - Average tests timeline: -5. [Inspections stats](src/visualization/pages/4_Inspections_stats.py) contains charts from the [Inspections analysis](#inspections-analysis) module: - - Inspections frequency: + - - - - Inspections fixing examples: - - - -6. [Submissions viewer](src/visualization/pages/5_Submissions_viewer.py) allows you to view user submissions within some group. + - Tests chain: + + + +5. [Inspections stats](src/visualization/pages/4_Inspections_stats.py) contains charts from + the [Inspections analysis](#inspections-analysis) module: + - Inspections frequency: + + + + - Inspections fixing examples: + + + +6. [Submissions viewer](src/visualization/pages/5_Submissions_viewer.py) allows you to view user submissions within some + group. -7. [Automated analysis](src/visualization/pages/6_Automated_analysis.py) allows you to automatically analyse submissions' data. +7. [Automated analysis](src/visualization/pages/6_Automated_analysis.py) allows you to automatically analyse + submissions' data. + + - [Median attempts analysis](src/visualization/pages/automated_analysis/median_attempts_analysis.py) + allows you to find suspicious tasks whose median number of attempts is not less than a threshold + and view group submissions from these tasks. 
+ + - - [Median attempts analysis](src/visualization/pages/automated_analysis/median_attempts_analysis.py) - allows you to find suspicious tasks whose median number of attempts is not less than a threshold - and view group submissions from these tasks. + - [Failed users analysis](src/visualization/pages/automated_analysis/failed_users_analysis.py) + allows you to find suspicious tasks whose relative number of failed users is not less than a threshold + and view group submissions from these tasks. - + - - [Failed users analysis](src/visualization/pages/automated_analysis/failed_users_analysis.py) - allows you to find suspicious tasks whose relative number of failed users is not less than a threshold - and view group submissions from these tasks. - - - - - [Median test attempts analysis](src/visualization/pages/automated_analysis/median_test_attempts_analysis.py) - allows you to find suspicious tests whose median number of attempts is not less than a threshold - and view group submissions that contain these tests. + - [Median test attempts analysis](src/visualization/pages/automated_analysis/median_test_attempts_analysis.py) + allows you to find suspicious tests whose median number of attempts is not less than a threshold + and view group submissions that contain these tests. - \ No newline at end of file + \ No newline at end of file diff --git a/jba/src/processing/tasktracker_edu_validation.py b/jba/src/processing/tasktracker_edu_validation.py new file mode 100644 index 0000000..447a39c --- /dev/null +++ b/jba/src/processing/tasktracker_edu_validation.py @@ -0,0 +1,85 @@ +import argparse +from pathlib import Path +from typing import Dict + +import pandas as pd + +from core.src.utils.df_utils import read_df, write_df + +USER_COLUMN = "user" +ID_COLUMN = "id" +IDX_COLUMN = "id_x" +EMAIL_COLUMN = "email" +RESEARCH_ID_COLUMN = "research_id" +DF_IN_EDU_FILENAME = 'df_in_edu.csv' +DF_NOT_IN_EDU_FILENAME = 'df_not_in_edu.csv' + + +def configure_parser(parser: argparse.ArgumentParser) -> None: + parser.add_argument( + 'filtered_data', + type=lambda value: Path(value).absolute(), + help='Path to the data filtered by tasktracker_task_filter', + ) + + parser.add_argument( + 'destination_path', + type=lambda value: Path(value).absolute(), + help='Path of the directory to save divided data', + ) + + parser.add_argument( + 'users_file', + type=lambda value: Path(value).absolute(), + help='Tasktracker users.csv file', + ) + + parser.add_argument( + 'researches_file', + type=lambda value: Path(value).absolute(), + help='Tasktracker researches.csv file', + ) + + parser.add_argument( + 'edu_file', + type=lambda value: Path(value).absolute(), + help='Edu csv file', + ) + + +def research_to_email(users_path: Path, researches_path: Path) -> Dict[str, str]: + users = read_df(users_path) + researches = read_df(researches_path) + return pd.merge(researches, users, left_on=USER_COLUMN, right_on=ID_COLUMN, how="left").set_index(IDX_COLUMN)[ + EMAIL_COLUMN].to_dict() + + +# noqa: WPS320 +def split_dataframe(filtered_df: pd.DataFrame, edu_df: pd.DataFrame, res_to_email: Dict[str, str]) -> ( + pd.DataFrame, pd.DataFrame): + filtered_df[EMAIL_COLUMN] = filtered_df[RESEARCH_ID_COLUMN].map(res_to_email) + edu_emails = edu_df[EMAIL_COLUMN].unique() + df_in_edu = filtered_df[filtered_df[EMAIL_COLUMN].isin(edu_emails)] + df_not_in_edu = filtered_df[~filtered_df[EMAIL_COLUMN].isin(edu_emails)] + return df_in_edu, df_not_in_edu + + +def validate(filtered_data: Path, edu_file: Path, destination_path: Path, res_to_email: 
Dict[str, str]) -> None:
+    filtered_df = read_df(filtered_data)
+    edu_df = read_df(edu_file)
+    df_in_edu, df_not_in_edu = split_dataframe(filtered_df, edu_df, res_to_email)
+    write_df(df_in_edu, destination_path / DF_IN_EDU_FILENAME)
+    write_df(df_not_in_edu, destination_path / DF_NOT_IN_EDU_FILENAME)
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    configure_parser(parser)
+
+    args = parser.parse_args()
+    res_to_email = research_to_email(args.users_file, args.researches_file)
+    validate(args.filtered_data, args.edu_file, args.destination_path, res_to_email)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/pyproject.toml b/pyproject.toml
index 8de27b2..a0407b7 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -85,6 +85,7 @@ run_hyperstyle = 'data_labelling.src.hyperstyle.evaluate:main'
 collect_course_structure = 'jba.src.processing.collect_course_structure:main'
 tasktracker_content_collector = 'jba.src.processing.tasktracker_content_collector:main'
 tasktracker_task_filter = 'jba.src.processing.tasktracker_task_filter:main'
+tasktracker_edu_validation = 'jba.src.processing.tasktracker_edu_validation:main'
 process_course_data = 'jba.src.processing.data_processing:main'
 plot_task_attempt = 'jba.src.plots.task_attempt:main'
 plot_task_solving = 'jba.src.plots.task_solving:main'

From 709b015e79174e304d348432d95c8531134e9c7c Mon Sep 17 00:00:00 2001
From: mikrise2
Date: Thu, 29 Feb 2024 16:15:34 +0100
Subject: [PATCH 4/6] fixed flake8

---
 spellcheck_whitelist.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/spellcheck_whitelist.txt b/spellcheck_whitelist.txt
index 29cef8c..b16ace6 100644
--- a/spellcheck_whitelist.txt
+++ b/spellcheck_whitelist.txt
@@ -1,3 +1,4 @@
+IDX
 TASKTRACKER
 WPS336
 arange

From 85ba0dc43d2991a16ce9e5c4d31ee6b15dcd5bc7 Mon Sep 17 00:00:00 2001
From: mikrise2
Date: Thu, 29 Feb 2024 16:19:48 +0100
Subject: [PATCH 5/6] fixed flake8 (1)

---
 jba/src/processing/tasktracker_edu_validation.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/jba/src/processing/tasktracker_edu_validation.py b/jba/src/processing/tasktracker_edu_validation.py
index 447a39c..d80a88a 100644
--- a/jba/src/processing/tasktracker_edu_validation.py
+++ b/jba/src/processing/tasktracker_edu_validation.py
@@ -54,8 +54,7 @@ def research_to_email(users_path: Path, researches_path: Path) -> Dict[str, str]
         EMAIL_COLUMN].to_dict()
 
 
-# noqa: WPS320
-def split_dataframe(filtered_df: pd.DataFrame, edu_df: pd.DataFrame, res_to_email: Dict[str, str]) -> (
+def split_dataframe(filtered_df: pd.DataFrame, edu_df: pd.DataFrame, res_to_email: Dict[str, str]) -> (  # noqa: WPS320
         pd.DataFrame, pd.DataFrame):
     filtered_df[EMAIL_COLUMN] = filtered_df[RESEARCH_ID_COLUMN].map(res_to_email)
     edu_emails = edu_df[EMAIL_COLUMN].unique()

From 65838c9868510b75834c3d65102a1d5b7f993919 Mon Sep 17 00:00:00 2001
From: mikrise2
Date: Fri, 1 Mar 2024 11:40:25 +0100
Subject: [PATCH 6/6] fix comments

---
 jba/README.md                                    | 14 +++++++-------
 jba/src/processing/tasktracker_edu_validation.py |  8 ++++----
 2 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/jba/README.md b/jba/README.md
index 1d90323..95bd489 100644
--- a/jba/README.md
+++ b/jba/README.md
@@ -117,7 +117,7 @@ docker run hyperstyle-analysis-prod: poetry run tasktracker_content_col
 - `course_sources_path` — Path to course sources to extract course structure.
 - `destination_path` — Path to directory where yaml file will be created.
 
-4.[tasktracker_task_filter.py](src/processing/tasktracker_task_filter.py) filtering data by the start research date.
+4. [tasktracker_task_filter.py](src/processing/tasktracker_task_filter.py) filtering data by the start research date. Grouping by package name. ### Usage @@ -140,7 +140,7 @@ docker run hyperstyle-analysis-prod: poetry run tasktracker_task_filter - `destination_path` — Path of the file to save filtered data. - `start_date` — Start of the research in the DD.MM.YYYY format. -5.[tasktracker_edu_validation.py](src/processing/tasktracker_edu_validation.py) Divide filtered tasktracker file to 2 +5. [tasktracker_edu_validation.py](src/processing/tasktracker_edu_validation.py) Divide filtered tasktracker file to 2 files - the first with emails that have been presented in the edu csv file, and the second file without them. ### Usage @@ -192,7 +192,7 @@ docker run hyperstyle-analysis-prod: poetry run tasktracker_edu_validat Optional arguments: | Argument | Description | - |-------------------------------------|---------------------------------------------| + |-------------------------------------|---------------------------------------------| | **‑‑course‑name** | Name of the course to display on the chart. | @@ -218,7 +218,7 @@ docker run hyperstyle-analysis-prod: poetry run tasktracker_edu_validat **Optional arguments**: | Argument | Description | - |-------------------------------------|---------------------------------------------| + |-------------------------------------|---------------------------------------------| | **‑‑course‑name** | Name of the course to display on the chart. | 3. [task_duplicates.py](src/plots/task_duplicates.py) allows you to plot line charts how many duplicate submissions @@ -243,7 +243,7 @@ docker run hyperstyle-analysis-prod: poetry run tasktracker_edu_validat **Optional arguments**: | Argument | Description | - |-------------------------------------|---------------------------------------------| + |-------------------------------------|---------------------------------------------| | **‑‑course‑name** | Name of the course to display on the chart. | Charts plotted with this module can be found in [this section](#visualization). @@ -280,7 +280,7 @@ then your module should be named `Introduction-LastPush-CompleteTheProject` **Optional arguments**: | Argument | Description | - |--------------------------------------------------|-----------------------------------------------------------------------------| + |--------------------------------------------------|-----------------------------------------------------------------------------| | **‑‑timeout** | Timeout in seconds for subprocess to be executed. | | **‑‑n‑cpu** | Number of CPUs to use for parallel execution. | | **‑‑force‑ignore‑tests** | Force to ignore substitution of test files if they are visible to the user. | @@ -328,7 +328,7 @@ then your module should be named `Introduction-LastPush-CompleteTheProject` **Optional arguments**: | Argument | Description | - |-------------------------|------------------------------| + |-------------------------|------------------------------| | **‑‑debug** | Run the script in debug mode | Charts plotted with this module can be found in [this section](#visualization). 
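The filter described in the README entries above boils down to two pandas operations: a date cut-off and a regex extraction of the package name. Below is a minimal, self-contained sketch of those same calls from tasktracker_task_filter.py; the toy fragments, dates, and the 01.02.2024 start date are invented for illustration, while the column names, date format, and PACKAGE_PATTERN come from the script.

```python
import pandas as pd
from datetime import datetime
from pytz import UTC

# Pattern as defined in tasktracker_task_filter.py.
PACKAGE_PATTERN = r'package ([\w\.]+)'

# Toy document data: fragments and dates are invented for this example.
data = pd.DataFrame({
    'date': pd.to_datetime(['2024-01-10T12:00:00+00:00', '2024-03-05T09:30:00+00:00']),
    'fragment': [
        'package jetbrains.course.warmup\n\nfun main() {}',
        'package jetbrains.course.almost.done\n\nfun main() {}',
    ],
})

# Step 1: keep only records submitted after the research start date (DD.MM.YYYY).
start = datetime.strptime('01.02.2024', '%d.%m.%Y').replace(tzinfo=UTC)
filtered = data[data['date'] > start].copy()

# Step 2: derive the task name from the fragment's package declaration,
# so that records can later be grouped per task.
filtered['task'] = filtered['fragment'].str.extract(PACKAGE_PATTERN)
print(filtered[['date', 'task']])  # one row left: jetbrains.course.almost.done
```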
diff --git a/jba/src/processing/tasktracker_edu_validation.py b/jba/src/processing/tasktracker_edu_validation.py index d80a88a..96482f9 100644 --- a/jba/src/processing/tasktracker_edu_validation.py +++ b/jba/src/processing/tasktracker_edu_validation.py @@ -1,6 +1,6 @@ import argparse from pathlib import Path -from typing import Dict +from typing import Dict, Tuple import pandas as pd @@ -54,8 +54,8 @@ def research_to_email(users_path: Path, researches_path: Path) -> Dict[str, str] EMAIL_COLUMN].to_dict() -def split_dataframe(filtered_df: pd.DataFrame, edu_df: pd.DataFrame, res_to_email: Dict[str, str]) -> ( # noqa: WPS320 - pd.DataFrame, pd.DataFrame): +def split_dataframe(filtered_df: pd.DataFrame, edu_df: pd.DataFrame, res_to_email: Dict[str, str]) \ + -> Tuple[pd.DataFrame, pd.DataFrame]: filtered_df[EMAIL_COLUMN] = filtered_df[RESEARCH_ID_COLUMN].map(res_to_email) edu_emails = edu_df[EMAIL_COLUMN].unique() df_in_edu = filtered_df[filtered_df[EMAIL_COLUMN].isin(edu_emails)] @@ -63,7 +63,7 @@ def split_dataframe(filtered_df: pd.DataFrame, edu_df: pd.DataFrame, res_to_emai return df_in_edu, df_not_in_edu -def validate(filtered_data: Path, edu_file: Path, destination_path: Path, res_to_email: Dict[str, str]) -> None: +def validate(filtered_data: Path, edu_file: Path, destination_path: Path, res_to_email: Dict[str, str]): filtered_df = read_df(filtered_data) edu_df = read_df(edu_file) df_in_edu, df_not_in_edu = split_dataframe(filtered_df, edu_df, res_to_email)
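As a closing illustration of the email-validation flow in its final form, the sketch below replays the research_to_email and split_dataframe logic on toy frames. All ids and emails are invented; the column names match the constants in tasktracker_edu_validation.py. It also shows why IDX_COLUMN is set to 'id_x': both input frames carry an 'id' column, so the merge suffixes the researches' own id.

```python
import pandas as pd

# Toy users.csv / researches.csv contents; ids and emails are invented.
users = pd.DataFrame({'id': [1, 2], 'email': ['a@example.com', 'b@example.com']})
researches = pd.DataFrame({'id': [10, 20], 'user': [1, 2]})

# research_to_email(): after the merge, the researches' 'id' becomes 'id_x'
# (and the users' 'id' becomes 'id_y'), hence IDX_COLUMN = 'id_x'.
res_to_email = (
    pd.merge(researches, users, left_on='user', right_on='id', how='left')
    .set_index('id_x')['email']
    .to_dict()
)
print(res_to_email)  # {10: 'a@example.com', 20: 'b@example.com'}

# Toy filtered tasktracker data and edu export.
filtered = pd.DataFrame({'research_id': [10, 20]})
edu = pd.DataFrame({'email': ['a@example.com']})

# split_dataframe(): map research ids to emails, then split on membership
# in the edu file's email column.
filtered['email'] = filtered['research_id'].map(res_to_email)
in_edu = filtered[filtered['email'].isin(edu['email'].unique())]
not_in_edu = filtered[~filtered['email'].isin(edu['email'].unique())]
print(len(in_edu), len(not_in_edu))  # 1 1
```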