From c42e9b461071fc0fabe795116c2384f4ce1c3d8d Mon Sep 17 00:00:00 2001 From: mikrise2 Date: Thu, 29 Feb 2024 14:21:24 +0100 Subject: [PATCH 1/6] added filter --- jba/README.md | 20 +++++++ jba/src/processing/tasktracker_task_filter.py | 53 +++++++++++++++++++ pyproject.toml | 1 + 3 files changed, 74 insertions(+) create mode 100644 jba/src/processing/tasktracker_task_filter.py diff --git a/jba/README.md b/jba/README.md index 90841ac..8f951b8 100644 --- a/jba/README.md +++ b/jba/README.md @@ -100,6 +100,26 @@ docker run hyperstyle-analysis-prod: poetry run tasktracker_content_col - `course_sources_path` — Path to course sources to extract course structure. - `destination_path` — Path to directory where yaml file will be created. + +4.[tasktracker_task_filter.py](src/processing/tasktracker_task_filter.py) filtering data by the start research date. Grouping by package name. + +### Usage + +Execute one of the following commands with necessary arguments: +```bash +poetry run tasktracker_task_filter [arguments] +``` +or +```bash +docker run hyperstyle-analysis-prod: poetry run tasktracker_task_filter [arguments] +``` + +**Required arguments**: + +- `document_path` — Path to documentdata.csv. +- `destination_path` — Path of the file to save filtered data. +- `start_date` — Start of the research in the DD.MM.YYYY format. + ---- # Simple general statistics diff --git a/jba/src/processing/tasktracker_task_filter.py b/jba/src/processing/tasktracker_task_filter.py new file mode 100644 index 0000000..e0ff562 --- /dev/null +++ b/jba/src/processing/tasktracker_task_filter.py @@ -0,0 +1,53 @@ +import argparse +from pathlib import Path +import pandas as pd +from datetime import datetime +from pytz import UTC + +from core.src.utils.df_utils import read_df, write_df + +PACKAGE_PATTERN = r'package ([\w\.]+)' +FRAGMENT_COLUMN = 'fragment' +TASK_COLUMN = 'task' +DATE_COLUMN = 'date' +DATE_FORMAT = '%d.%m.%Y' + + +def configure_parser(parser: argparse.ArgumentParser) -> None: + parser.add_argument( + 'document_path', + type=lambda value: Path(value).absolute(), + help='Path to documentdata.csv.', + ) + + parser.add_argument( + 'destination_path', + type=lambda value: Path(value).absolute(), + help='Path of the file to save filtered data', + ) + + parser.add_argument( + 'start_date', + type=str, + help='Start of the research in the DD.MM.YYYY format', + ) + + +def filter_data(document_path: Path, destination_path: Path, date: str) -> None: + data = read_df(document_path) + data[DATE_COLUMN] = pd.to_datetime(data[DATE_COLUMN]) + filtered_data = data[data[DATE_COLUMN] > datetime.strptime(date, DATE_FORMAT).replace(tzinfo=UTC)].copy() + filtered_data[TASK_COLUMN] = filtered_data[FRAGMENT_COLUMN].str.extract(PACKAGE_PATTERN) + write_df(filtered_data, destination_path) + + +def main(): + parser = argparse.ArgumentParser() + configure_parser(parser) + + args = parser.parse_args() + filter_data(args.document_path, args.destination_path, args.start_date) + + +if __name__ == '__main__': + main() diff --git a/pyproject.toml b/pyproject.toml index e07f15e..8de27b2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -84,6 +84,7 @@ run_hyperstyle = 'data_labelling.src.hyperstyle.evaluate:main' # JBA scripts collect_course_structure = 'jba.src.processing.collect_course_structure:main' tasktracker_content_collector = 'jba.src.processing.tasktracker_content_collector:main' +tasktracker_task_filter = 'jba.src.processing.tasktracker_task_filter:main' process_course_data = 'jba.src.processing.data_processing:main' plot_task_attempt 
= 'jba.src.plots.task_attempt:main' plot_task_solving = 'jba.src.plots.task_solving:main' From 955ba8f32f7e5b7d08ea5b11b54cdb5539000420 Mon Sep 17 00:00:00 2001 From: mikrise2 Date: Thu, 29 Feb 2024 14:26:22 +0100 Subject: [PATCH 2/6] fixed flake8 --- jba/src/processing/tasktracker_task_filter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jba/src/processing/tasktracker_task_filter.py b/jba/src/processing/tasktracker_task_filter.py index e0ff562..9e75303 100644 --- a/jba/src/processing/tasktracker_task_filter.py +++ b/jba/src/processing/tasktracker_task_filter.py @@ -10,7 +10,7 @@ FRAGMENT_COLUMN = 'fragment' TASK_COLUMN = 'task' DATE_COLUMN = 'date' -DATE_FORMAT = '%d.%m.%Y' +DATE_FORMAT = '%d.%m.%Y' # noqa: WPS323 def configure_parser(parser: argparse.ArgumentParser) -> None: From 42880eb5dd34ab09bbfe840ec044bacd0dd851c7 Mon Sep 17 00:00:00 2001 From: mikrise2 Date: Thu, 29 Feb 2024 15:32:46 +0100 Subject: [PATCH 3/6] divide dataframe depending on was email presented in edu file or not. --- jba/README.md | 317 +++++++++++------- .../processing/tasktracker_edu_validation.py | 85 +++++ pyproject.toml | 1 + 3 files changed, 280 insertions(+), 123 deletions(-) create mode 100644 jba/src/processing/tasktracker_edu_validation.py diff --git a/jba/README.md b/jba/README.md index 8f951b8..1d90323 100644 --- a/jba/README.md +++ b/jba/README.md @@ -8,15 +8,19 @@ This module contains utilities to analyze data from JetBrains Marketplace. [This module](src/processing) contains a script to preprocess raw data to perform further analysis. -1. [collect_course_structure.py](src/processing/collect_course_structure.py) allows you to collect information about the course structure. +1. [collect_course_structure.py](src/processing/collect_course_structure.py) allows you to collect information about the + course structure. ### Usage Execute one of the following commands with necessary arguments: + ```bash poetry run collect_course_structure [arguments] ``` + or + ```bash docker run hyperstyle-analysis-prod: poetry run collect_course_structure [arguments] ``` @@ -26,7 +30,10 @@ docker run hyperstyle-analysis-prod: poetry run collect_course_structur - `course_sources_path` — Path to course sources to extract course structure. - `output_path` — Path to .csv file where to save the course structure. -After this step you will get a new file with the course structure, e.g. for the [course](tests/resources/processing/collect_course_structure/course_with_section) from the test folder with the following structure: +After this step you will get a new file with the course structure, e.g. for +the [course](tests/resources/processing/collect_course_structure/course_with_section) from the test folder with the +following structure: + ```text course-info.yaml course-remote-info.yaml @@ -56,9 +63,11 @@ section/ └── ... ``` -the [following](tests/resources/processing/collect_course_structure/expected_course_with_section.csv) file will be generated. +the [following](tests/resources/processing/collect_course_structure/expected_course_with_section.csv) file will be +generated. 2. 
[data_processing.py](src/processing/data_processing.py) allows you to process data from the previous step: + - Merge course data with task info - Add submission group - Add submission attempt @@ -66,31 +75,39 @@ the [following](tests/resources/processing/collect_course_structure/expected_cou ### Usage Execute one of the following commands with necessary arguments: + ```bash poetry run process_course_data [arguments] ``` + or + ```bash docker run hyperstyle-analysis-prod: poetry run process_course_data [arguments] ``` **Required arguments**: -- `course_data_path` — Path to .csv file with preprocessed data by [data_processing.py](src/processing/data_processing.py). -- `course_structure_path` — Path to .csv file with the course structure gathered by [prepare_course_data.py](src/processing/prepare_course_data.py). - -After this step you will get a new file with course data with `courseId_preprocessed` suffix. +- `course_data_path` — Path to .csv file with preprocessed data + by [data_processing.py](src/processing/data_processing.py). +- `course_structure_path` — Path to .csv file with the course structure gathered + by [prepare_course_data.py](src/processing/prepare_course_data.py). +After this step you will get a new file with course data with `courseId_preprocessed` suffix. -3. [tasktracker_content_collector.py](src/processing/tasktracker_content_collector.py) collects data from the course into the tasktracker task content file format +3. [tasktracker_content_collector.py](src/processing/tasktracker_content_collector.py) collects data from the course + into the tasktracker task content file format ### Usage Execute one of the following commands with necessary arguments: + ```bash poetry run tasktracker_content_collector [arguments] ``` + or + ```bash docker run hyperstyle-analysis-prod: poetry run tasktracker_content_collector [arguments] ``` @@ -100,16 +117,19 @@ docker run hyperstyle-analysis-prod: poetry run tasktracker_content_col - `course_sources_path` — Path to course sources to extract course structure. - `destination_path` — Path to directory where yaml file will be created. - -4.[tasktracker_task_filter.py](src/processing/tasktracker_task_filter.py) filtering data by the start research date. Grouping by package name. +4.[tasktracker_task_filter.py](src/processing/tasktracker_task_filter.py) filtering data by the start research date. +Grouping by package name. ### Usage Execute one of the following commands with necessary arguments: + ```bash poetry run tasktracker_task_filter [arguments] ``` + or + ```bash docker run hyperstyle-analysis-prod: poetry run tasktracker_task_filter [arguments] ``` @@ -120,6 +140,31 @@ docker run hyperstyle-analysis-prod: poetry run tasktracker_task_filter - `destination_path` — Path of the file to save filtered data. - `start_date` — Start of the research in the DD.MM.YYYY format. +5.[tasktracker_edu_validation.py](src/processing/tasktracker_edu_validation.py) Divide filtered tasktracker file to 2 +files - the first with emails that have been presented in the edu csv file, and the second file without them. + +### Usage + +Execute one of the following commands with necessary arguments: + +```bash +poetry run tasktracker_edu_validation [arguments] +``` + +or + +```bash +docker run hyperstyle-analysis-prod: poetry run tasktracker_edu_validation [arguments] +``` + +**Required arguments**: + +- `filtered_data` — Path to the data filtered by tasktracker_task_filter. +- `destination_path` — Path of the directory to save divided data. 
+- `users_file` — Tasktracker users.csv file. +- `researches_file` — Tasktracker researches.csv file. +- `edu_file` — Edu csv file. + ---- # Simple general statistics @@ -138,18 +183,21 @@ docker run hyperstyle-analysis-prod: poetry run tasktracker_task_filter ``` **Required arguments**: - - - `preprocessed_course_data_path` — Path to .csv file with preprocessed data by [data_processing.py](src/processing/data_processing.py). - - `course_structure_path` — Path to .csv file with the course structure gathered by [prepare_course_data.py](src/processing/prepare_course_data.py). - + + - `preprocessed_course_data_path` — Path to .csv file with preprocessed data + by [data_processing.py](src/processing/data_processing.py). + - `course_structure_path` — Path to .csv file with the course structure gathered + by [prepare_course_data.py](src/processing/prepare_course_data.py). + Optional arguments: - + | Argument | Description | - |-------------------------------------|---------------------------------------------| + |-------------------------------------|---------------------------------------------| | **‑‑course‑name** | Name of the course to display on the chart. | -2. [task_attempt.py](src/plots/task_attempt.py) allows you to plot line charts how many attempts students spend to solve the tasks from the course. +2. [task_attempt.py](src/plots/task_attempt.py) allows you to plot line charts how many attempts students spend to solve + the tasks from the course. Execute one of the following commands with necessary arguments: ```bash @@ -159,19 +207,22 @@ docker run hyperstyle-analysis-prod: poetry run tasktracker_task_filter ```bash docker run hyperstyle-analysis-prod: poetry run plot_task_attempt [arguments] ``` - + **Required arguments**: - - - `preprocessed_course_data_path` — Path to .csv file with preprocessed data by [data_processing.py](src/processing/data_processing.py). - - `course_structure_path` — Path to .csv file with the course structure gathered by [prepare_course_data.py](src/processing/prepare_course_data.py). - + + - `preprocessed_course_data_path` — Path to .csv file with preprocessed data + by [data_processing.py](src/processing/data_processing.py). + - `course_structure_path` — Path to .csv file with the course structure gathered + by [prepare_course_data.py](src/processing/prepare_course_data.py). + **Optional arguments**: - + | Argument | Description | - |-------------------------------------|---------------------------------------------| + |-------------------------------------|---------------------------------------------| | **‑‑course‑name** | Name of the course to display on the chart. | -3. [task_duplicates.py](src/plots/task_duplicates.py) allows you to plot line charts how many duplicate submissions students sent during tasks solving. +3. [task_duplicates.py](src/plots/task_duplicates.py) allows you to plot line charts how many duplicate submissions + students sent during tasks solving. Execute one of the following commands with necessary arguments: ```bash @@ -181,16 +232,18 @@ docker run hyperstyle-analysis-prod: poetry run tasktracker_task_filter ```bash docker run hyperstyle-analysis-prod: poetry run plot_task_duplicates [arguments] ``` - + **Required arguments**: - - - `preprocessed_course_data_path` — Path to .csv file with preprocessed data by [data_processing.py](src/processing/data_processing.py). - - `course_structure_path` — Path to .csv file with the course structure gathered by [prepare_course_data.py](src/processing/prepare_course_data.py). 
- + + - `preprocessed_course_data_path` — Path to .csv file with preprocessed data + by [data_processing.py](src/processing/data_processing.py). + - `course_structure_path` — Path to .csv file with the course structure gathered + by [prepare_course_data.py](src/processing/prepare_course_data.py). + **Optional arguments**: - + | Argument | Description | - |-------------------------------------|---------------------------------------------| + |-------------------------------------|---------------------------------------------| | **‑‑course‑name** | Name of the course to display on the chart. | Charts plotted with this module can be found in [this section](#visualization). @@ -201,11 +254,14 @@ Charts plotted with this module can be found in [this section](#visualization). [This module](src/test_logs) allows you to run tests and parse its output to perform further analysis. -**Note**: This module only works with courses that have a structure like in [Kotlin Onboarding](https://github.com/jetbrains-academy/kotlin-onboarding-introduction)! -This means that your task module name should have a specific format. For example, assume you have a task in the folder `Introduction/LastPush/CompleteTheProject`, +**Note**: This module only works with courses that have a structure like +in [Kotlin Onboarding](https://github.com/jetbrains-academy/kotlin-onboarding-introduction)! +This means that your task module name should have a specific format. For example, assume you have a task in the +folder `Introduction/LastPush/CompleteTheProject`, then your module should be named `Introduction-LastPush-CompleteTheProject` -1. [tests_runner.py](src/test_logs/tests_runner.py) allows you to run tests via Gradle and save Gradle logs to perform further parsing. +1. [tests_runner.py](src/test_logs/tests_runner.py) allows you to run tests via Gradle and save Gradle logs to perform + further parsing. Execute one of the following commands with necessary arguments: ```bash @@ -215,16 +271,16 @@ then your module should be named `Introduction-LastPush-CompleteTheProject` ```bash docker run hyperstyle-analysis-prod: poetry run gradle_tests_runner [arguments] ``` - + **Required arguments**: - - `submissions_path` — Path to .csv file with submissions. - - `course_sources_path` — Path to course sources. - - `logs_output_path` — Path to the folder to store logs. - + - `submissions_path` — Path to .csv file with submissions. + - `course_sources_path` — Path to course sources. + - `logs_output_path` — Path to the folder to store logs. + **Optional arguments**: - + | Argument | Description | - |--------------------------------------------------|-----------------------------------------------------------------------------| + |--------------------------------------------------|-----------------------------------------------------------------------------| | **‑‑timeout** | Timeout in seconds for subprocess to be executed. | | **‑‑n‑cpu** | Number of CPUs to use for parallel execution. | | **‑‑force‑ignore‑tests** | Force to ignore substitution of test files if they are visible to the user. | @@ -233,28 +289,29 @@ then your module should be named `Introduction-LastPush-CompleteTheProject` 2. [logs_parser.py](src/test_logs/logs_parser.py) allows you to parse Gradle logs into json strings: - - Gradle exceptions will match the structure of the [`ExceptionData`](src/models/edu_logs.py) dataclass. 
For example: - ```json - { - "path": "src/main/kotlin/Main.kt", - "line_number": 4, - "column_number": 9, - "message": "Conflicting declarations: val firstUserAnswer: String, val firstUserAnswer: String" - } - ``` - - Gradle tests will match the structure of the [`TestData`](src/models/edu_logs.py) dataclass. - ```json - { - "class_name": "Test", - "test": "testCountExactMatchesFunction()", - "method_name": "testCountExactMatchesFunction()", - "duration": "0s", - "result": "passed", - "error_class": null, - "message": null - } - ``` - + - Gradle exceptions will match the structure of the [`ExceptionData`](src/models/edu_logs.py) dataclass. For + example: + ```json + { + "path": "src/main/kotlin/Main.kt", + "line_number": 4, + "column_number": 9, + "message": "Conflicting declarations: val firstUserAnswer: String, val firstUserAnswer: String" + } + ``` + - Gradle tests will match the structure of the [`TestData`](src/models/edu_logs.py) dataclass. + ```json + { + "class_name": "Test", + "test": "testCountExactMatchesFunction()", + "method_name": "testCountExactMatchesFunction()", + "duration": "0s", + "result": "passed", + "error_class": null, + "message": null + } + ``` + Execute one of the following commands with necessary arguments: ```bash poetry run gradle_logs_parser [arguments] @@ -263,18 +320,18 @@ then your module should be named `Introduction-LastPush-CompleteTheProject` ```bash docker run hyperstyle-analysis-prod: poetry run gradle_logs_parser [arguments] ``` - + **Required arguments**: - - `submissions_path` — Path to .csv file with submissions. - - `gradle_logs_path` — Path to a folder with gradle logs. - + - `submissions_path` — Path to .csv file with submissions. + - `gradle_logs_path` — Path to a folder with gradle logs. + **Optional arguments**: - + | Argument | Description | - |-------------------------|------------------------------| + |-------------------------|------------------------------| | **‑‑debug** | Run the script in debug mode | -Charts plotted with this module can be found in [this section](#visualization). +Charts plotted with this module can be found in [this section](#visualization). --- @@ -283,25 +340,29 @@ Charts plotted with this module can be found in [this section](#visualization). [This module](src/inspections) allows you to gather IntelliJ inspections and also contains functions that are necessary for their visualization. -[gathering.py](src/inspections/gathering.py) allows you to gather IntelliJ inspections. +[gathering.py](src/inspections/gathering.py) allows you to gather IntelliJ inspections. Execute one of the following commands with necessary arguments: + ```bash poetry run inpsections_gathering [arguments] ``` + or + ```bash docker run hyperstyle-analysis-prod: poetry run inpsections_gathering [arguments] ``` **Required arguments**: + - `submissions_path` — Path to .csv file with submissions. - `--tool-path` — Path to the Hyperstyle entry point. -- `--language-version` — Language version of code snippets. +- `--language-version` — Language version of code snippets. - `--host` — Code quality server address. - `--port` — Code quality server port. - + **Optional arguments**: - + | Argument | Description | |------------------------------------------------|----------------------------------------------------------| | **‑‑venv** | Path to venv to run the tool. | @@ -312,80 +373,90 @@ docker run hyperstyle-analysis-prod: poetry run inpsections_gathering [ Charts plotted with this module can be found in [this section](#visualization). 
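For readers of the parsed-log format shown in the test-logs section above, a minimal sketch of loading one of those json strings back into Python may help. The `TestData` stand-in below only mirrors the field names from the json example; the real dataclass lives in [edu_logs.py](src/models/edu_logs.py) and may differ in details.

```python
import json
from dataclasses import dataclass
from typing import Optional


# Stand-in mirroring the TestData json example above; the actual dataclass
# in src/models/edu_logs.py may define extra fields or stricter types.
@dataclass
class TestData:
    class_name: str
    test: str
    method_name: str
    duration: str
    result: str
    error_class: Optional[str] = None
    message: Optional[str] = None


raw = (
    '{"class_name": "Test", "test": "testCountExactMatchesFunction()", '
    '"method_name": "testCountExactMatchesFunction()", "duration": "0s", '
    '"result": "passed", "error_class": null, "message": null}'
)

# json.loads yields a dict whose keys match the dataclass fields one-to-one.
test = TestData(**json.loads(raw))
print(test.result)  # passed
```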
--- + # Visualization -[This module](src/visualization) contains [Streamlit](https://streamlit.io/) app that visualizes all statistics gathered from students' submissions. +[This module](src/visualization) contains [Streamlit](https://streamlit.io/) app that visualizes all statistics gathered +from students' submissions. To run the app, you need to execute the following command: + ```bash poetry run streamlit run ./visualization/README.py ``` The app consists of several pages: + 1. [README](src/visualization/README.py) contains an application description and input fields for common resources: - 1. Submissions path — Path to .csv file with submissions, parsed test logs and inspections. - 2. Course structure path — Path to .csv file with course structure. + 1. Submissions path — Path to .csv file with submissions, parsed test logs and inspections. + 2. Course structure path — Path to .csv file with course structure. + +2. [General stats](src/visualization/pages/1_General_stats.py) contains charts from + the [Simple general statistics](#simple-general-statistics) module: + - Task attempts: -2. [General stats](src/visualization/pages/1_General_stats.py) contains charts from the [Simple general statistics](#simple-general-statistics) module: - - Task attempts: + - + - Task solving: - - Task solving: + - + - Task duplicates: - - Task duplicates: + - +3. [Group tests timeline](src/visualization/pages/2_Group_tests_timeline.py) contains charts from + the [Test logs analysis](#test-logs-analysis) module: + - General tests timeline: -3. [Group tests timeline](src/visualization/pages/2_Group_tests_timeline.py) contains charts from the [Test logs analysis](#test-logs-analysis) module: - - General tests timeline: - - - - - Parametrized tests timeline: + - + - Parametrized tests timeline: -4. [Aggregated timeline stats](src/visualization/pages/3_Aggregated_timeline_stats.py) contains charts from the [Test logs analysis](#test-logs-analysis) module: - - Average tests timeline: - - + - - Tests chain: - - +4. [Aggregated timeline stats](src/visualization/pages/3_Aggregated_timeline_stats.py) contains charts from + the [Test logs analysis](#test-logs-analysis) module: + - Average tests timeline: -5. [Inspections stats](src/visualization/pages/4_Inspections_stats.py) contains charts from the [Inspections analysis](#inspections-analysis) module: - - Inspections frequency: + - - - - Inspections fixing examples: - - - -6. [Submissions viewer](src/visualization/pages/5_Submissions_viewer.py) allows you to view user submissions within some group. + - Tests chain: + + + +5. [Inspections stats](src/visualization/pages/4_Inspections_stats.py) contains charts from + the [Inspections analysis](#inspections-analysis) module: + - Inspections frequency: + + + + - Inspections fixing examples: + + + +6. [Submissions viewer](src/visualization/pages/5_Submissions_viewer.py) allows you to view user submissions within some + group. -7. [Automated analysis](src/visualization/pages/6_Automated_analysis.py) allows you to automatically analyse submissions' data. +7. [Automated analysis](src/visualization/pages/6_Automated_analysis.py) allows you to automatically analyse + submissions' data. + + - [Median attempts analysis](src/visualization/pages/automated_analysis/median_attempts_analysis.py) + allows you to find suspicious tasks whose median number of attempts is not less than a threshold + and view group submissions from these tasks. 
+ + - - [Median attempts analysis](src/visualization/pages/automated_analysis/median_attempts_analysis.py) - allows you to find suspicious tasks whose median number of attempts is not less than a threshold - and view group submissions from these tasks. + - [Failed users analysis](src/visualization/pages/automated_analysis/failed_users_analysis.py) + allows you to find suspicious tasks whose relative number of failed users is not less than a threshold + and view group submissions from these tasks. - + - - [Failed users analysis](src/visualization/pages/automated_analysis/failed_users_analysis.py) - allows you to find suspicious tasks whose relative number of failed users is not less than a threshold - and view group submissions from these tasks. - - - - - [Median test attempts analysis](src/visualization/pages/automated_analysis/median_test_attempts_analysis.py) - allows you to find suspicious tests whose median number of attempts is not less than a threshold - and view group submissions that contain these tests. + - [Median test attempts analysis](src/visualization/pages/automated_analysis/median_test_attempts_analysis.py) + allows you to find suspicious tests whose median number of attempts is not less than a threshold + and view group submissions that contain these tests. - \ No newline at end of file + \ No newline at end of file diff --git a/jba/src/processing/tasktracker_edu_validation.py b/jba/src/processing/tasktracker_edu_validation.py new file mode 100644 index 0000000..447a39c --- /dev/null +++ b/jba/src/processing/tasktracker_edu_validation.py @@ -0,0 +1,85 @@ +import argparse +from pathlib import Path +from typing import Dict + +import pandas as pd + +from core.src.utils.df_utils import read_df, write_df + +USER_COLUMN = "user" +ID_COLUMN = "id" +IDX_COLUMN = "id_x" +EMAIL_COLUMN = "email" +RESEARCH_ID_COLUMN = "research_id" +DF_IN_EDU_FILENAME = 'df_in_edu.csv' +DF_NOT_IN_EDU_FILENAME = 'df_not_in_edu.csv' + + +def configure_parser(parser: argparse.ArgumentParser) -> None: + parser.add_argument( + 'filtered_data', + type=lambda value: Path(value).absolute(), + help='Path to the data filtered by tasktracker_task_filter', + ) + + parser.add_argument( + 'destination_path', + type=lambda value: Path(value).absolute(), + help='Path of the directory to save divided data', + ) + + parser.add_argument( + 'users_file', + type=lambda value: Path(value).absolute(), + help='Tasktracker users.csv file', + ) + + parser.add_argument( + 'researches_file', + type=lambda value: Path(value).absolute(), + help='Tasktracker researches.csv file', + ) + + parser.add_argument( + 'edu_file', + type=lambda value: Path(value).absolute(), + help='Edu csv file', + ) + + +def research_to_email(users_path: Path, researches_path: Path) -> Dict[str, str]: + users = read_df(users_path) + researches = read_df(researches_path) + return pd.merge(researches, users, left_on=USER_COLUMN, right_on=ID_COLUMN, how="left").set_index(IDX_COLUMN)[ + EMAIL_COLUMN].to_dict() + + +# noqa: WPS320 +def split_dataframe(filtered_df: pd.DataFrame, edu_df: pd.DataFrame, res_to_email: Dict[str, str]) -> ( + pd.DataFrame, pd.DataFrame): + filtered_df[EMAIL_COLUMN] = filtered_df[RESEARCH_ID_COLUMN].map(res_to_email) + edu_emails = edu_df[EMAIL_COLUMN].unique() + df_in_edu = filtered_df[filtered_df[EMAIL_COLUMN].isin(edu_emails)] + df_not_in_edu = filtered_df[~filtered_df[EMAIL_COLUMN].isin(edu_emails)] + return df_in_edu, df_not_in_edu + + +def validate(filtered_data: Path, edu_file: Path, destination_path: Path, res_to_email: 
Dict[str, str]) -> None:
+    filtered_df = read_df(filtered_data)
+    edu_df = read_df(edu_file)
+    df_in_edu, df_not_in_edu = split_dataframe(filtered_df, edu_df, res_to_email)
+    write_df(df_in_edu, destination_path / DF_IN_EDU_FILENAME)
+    write_df(df_not_in_edu, destination_path / DF_NOT_IN_EDU_FILENAME)
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    configure_parser(parser)
+
+    args = parser.parse_args()
+    res_to_email = research_to_email(args.users_file, args.researches_file)
+    validate(args.filtered_data, args.edu_file, args.destination_path, res_to_email)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/pyproject.toml b/pyproject.toml
index 8de27b2..a0407b7 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -85,6 +85,7 @@ run_hyperstyle = 'data_labelling.src.hyperstyle.evaluate:main'
 collect_course_structure = 'jba.src.processing.collect_course_structure:main'
 tasktracker_content_collector = 'jba.src.processing.tasktracker_content_collector:main'
 tasktracker_task_filter = 'jba.src.processing.tasktracker_task_filter:main'
+tasktracker_edu_validation = 'jba.src.processing.tasktracker_edu_validation:main'
 process_course_data = 'jba.src.processing.data_processing:main'
 plot_task_attempt = 'jba.src.plots.task_attempt:main'
 plot_task_solving = 'jba.src.plots.task_solving:main'

From 709b015e79174e304d348432d95c8531134e9c7c Mon Sep 17 00:00:00 2001
From: mikrise2
Date: Thu, 29 Feb 2024 16:15:34 +0100
Subject: [PATCH 4/6] fixed flake8

---
 spellcheck_whitelist.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/spellcheck_whitelist.txt b/spellcheck_whitelist.txt
index 29cef8c..b16ace6 100644
--- a/spellcheck_whitelist.txt
+++ b/spellcheck_whitelist.txt
@@ -1,3 +1,4 @@
+IDX
 TASKTRACKER
 WPS336
 arange

From 85ba0dc43d2991a16ce9e5c4d31ee6b15dcd5bc7 Mon Sep 17 00:00:00 2001
From: mikrise2
Date: Thu, 29 Feb 2024 16:19:48 +0100
Subject: [PATCH 5/6] fixed flake8 (1)

---
 jba/src/processing/tasktracker_edu_validation.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/jba/src/processing/tasktracker_edu_validation.py b/jba/src/processing/tasktracker_edu_validation.py
index 447a39c..d80a88a 100644
--- a/jba/src/processing/tasktracker_edu_validation.py
+++ b/jba/src/processing/tasktracker_edu_validation.py
@@ -54,8 +54,7 @@ def research_to_email(users_path: Path, researches_path: Path) -> Dict[str, str]
         EMAIL_COLUMN].to_dict()
 
 
-# noqa: WPS320
-def split_dataframe(filtered_df: pd.DataFrame, edu_df: pd.DataFrame, res_to_email: Dict[str, str]) -> (
+def split_dataframe(filtered_df: pd.DataFrame, edu_df: pd.DataFrame, res_to_email: Dict[str, str]) -> (  # noqa: WPS320
         pd.DataFrame, pd.DataFrame):
     filtered_df[EMAIL_COLUMN] = filtered_df[RESEARCH_ID_COLUMN].map(res_to_email)
     edu_emails = edu_df[EMAIL_COLUMN].unique()

From 65838c9868510b75834c3d65102a1d5b7f993919 Mon Sep 17 00:00:00 2001
From: mikrise2
Date: Fri, 1 Mar 2024 11:40:25 +0100
Subject: [PATCH 6/6] fix comments

---
 jba/README.md                                    | 14 +++++++-------
 jba/src/processing/tasktracker_edu_validation.py |  8 ++++----
 2 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/jba/README.md b/jba/README.md
index 1d90323..95bd489 100644
--- a/jba/README.md
+++ b/jba/README.md
@@ -117,7 +117,7 @@ docker run hyperstyle-analysis-prod: poetry run tasktracker_content_col
 - `course_sources_path` — Path to course sources to extract course structure.
 - `destination_path` — Path to directory where yaml file will be created.
 
-4.[tasktracker_task_filter.py](src/processing/tasktracker_task_filter.py) filtering data by the start research date.
+4. [tasktracker_task_filter.py](src/processing/tasktracker_task_filter.py) filtering data by the start research date. Grouping by package name. ### Usage @@ -140,7 +140,7 @@ docker run hyperstyle-analysis-prod: poetry run tasktracker_task_filter - `destination_path` — Path of the file to save filtered data. - `start_date` — Start of the research in the DD.MM.YYYY format. -5.[tasktracker_edu_validation.py](src/processing/tasktracker_edu_validation.py) Divide filtered tasktracker file to 2 +5. [tasktracker_edu_validation.py](src/processing/tasktracker_edu_validation.py) Divide filtered tasktracker file to 2 files - the first with emails that have been presented in the edu csv file, and the second file without them. ### Usage @@ -192,7 +192,7 @@ docker run hyperstyle-analysis-prod: poetry run tasktracker_edu_validat Optional arguments: | Argument | Description | - |-------------------------------------|---------------------------------------------| + |-------------------------------------|---------------------------------------------| | **‑‑course‑name** | Name of the course to display on the chart. | @@ -218,7 +218,7 @@ docker run hyperstyle-analysis-prod: poetry run tasktracker_edu_validat **Optional arguments**: | Argument | Description | - |-------------------------------------|---------------------------------------------| + |-------------------------------------|---------------------------------------------| | **‑‑course‑name** | Name of the course to display on the chart. | 3. [task_duplicates.py](src/plots/task_duplicates.py) allows you to plot line charts how many duplicate submissions @@ -243,7 +243,7 @@ docker run hyperstyle-analysis-prod: poetry run tasktracker_edu_validat **Optional arguments**: | Argument | Description | - |-------------------------------------|---------------------------------------------| + |-------------------------------------|---------------------------------------------| | **‑‑course‑name** | Name of the course to display on the chart. | Charts plotted with this module can be found in [this section](#visualization). @@ -280,7 +280,7 @@ then your module should be named `Introduction-LastPush-CompleteTheProject` **Optional arguments**: | Argument | Description | - |--------------------------------------------------|-----------------------------------------------------------------------------| + |--------------------------------------------------|-----------------------------------------------------------------------------| | **‑‑timeout** | Timeout in seconds for subprocess to be executed. | | **‑‑n‑cpu** | Number of CPUs to use for parallel execution. | | **‑‑force‑ignore‑tests** | Force to ignore substitution of test files if they are visible to the user. | @@ -328,7 +328,7 @@ then your module should be named `Introduction-LastPush-CompleteTheProject` **Optional arguments**: | Argument | Description | - |-------------------------|------------------------------| + |-------------------------|------------------------------| | **‑‑debug** | Run the script in debug mode | Charts plotted with this module can be found in [this section](#visualization). 
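The filter described in the README entries above boils down to two pandas operations: a date cut-off and a regex extraction of the package name. Below is a minimal, self-contained sketch of those same calls from tasktracker_task_filter.py; the toy fragments, dates, and the 01.02.2024 start date are invented for illustration, while the column names, date format, and PACKAGE_PATTERN come from the script.

```python
import pandas as pd
from datetime import datetime
from pytz import UTC

# Pattern as defined in tasktracker_task_filter.py.
PACKAGE_PATTERN = r'package ([\w\.]+)'

# Toy document data: fragments and dates are invented for this example.
data = pd.DataFrame({
    'date': pd.to_datetime(['2024-01-10T12:00:00+00:00', '2024-03-05T09:30:00+00:00']),
    'fragment': [
        'package jetbrains.course.warmup\n\nfun main() {}',
        'package jetbrains.course.almost.done\n\nfun main() {}',
    ],
})

# Step 1: keep only records submitted after the research start date (DD.MM.YYYY).
start = datetime.strptime('01.02.2024', '%d.%m.%Y').replace(tzinfo=UTC)
filtered = data[data['date'] > start].copy()

# Step 2: derive the task name from the fragment's package declaration,
# so that records can later be grouped per task.
filtered['task'] = filtered['fragment'].str.extract(PACKAGE_PATTERN)
print(filtered[['date', 'task']])  # one row left: jetbrains.course.almost.done
```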
diff --git a/jba/src/processing/tasktracker_edu_validation.py b/jba/src/processing/tasktracker_edu_validation.py index d80a88a..96482f9 100644 --- a/jba/src/processing/tasktracker_edu_validation.py +++ b/jba/src/processing/tasktracker_edu_validation.py @@ -1,6 +1,6 @@ import argparse from pathlib import Path -from typing import Dict +from typing import Dict, Tuple import pandas as pd @@ -54,8 +54,8 @@ def research_to_email(users_path: Path, researches_path: Path) -> Dict[str, str] EMAIL_COLUMN].to_dict() -def split_dataframe(filtered_df: pd.DataFrame, edu_df: pd.DataFrame, res_to_email: Dict[str, str]) -> ( # noqa: WPS320 - pd.DataFrame, pd.DataFrame): +def split_dataframe(filtered_df: pd.DataFrame, edu_df: pd.DataFrame, res_to_email: Dict[str, str]) \ + -> Tuple[pd.DataFrame, pd.DataFrame]: filtered_df[EMAIL_COLUMN] = filtered_df[RESEARCH_ID_COLUMN].map(res_to_email) edu_emails = edu_df[EMAIL_COLUMN].unique() df_in_edu = filtered_df[filtered_df[EMAIL_COLUMN].isin(edu_emails)] @@ -63,7 +63,7 @@ def split_dataframe(filtered_df: pd.DataFrame, edu_df: pd.DataFrame, res_to_emai return df_in_edu, df_not_in_edu -def validate(filtered_data: Path, edu_file: Path, destination_path: Path, res_to_email: Dict[str, str]) -> None: +def validate(filtered_data: Path, edu_file: Path, destination_path: Path, res_to_email: Dict[str, str]): filtered_df = read_df(filtered_data) edu_df = read_df(edu_file) df_in_edu, df_not_in_edu = split_dataframe(filtered_df, edu_df, res_to_email)
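As a closing illustration of the email-validation flow in its final form, the sketch below replays the research_to_email and split_dataframe logic on toy frames. All ids and emails are invented; the column names match the constants in tasktracker_edu_validation.py. It also shows why IDX_COLUMN is set to 'id_x': both input frames carry an 'id' column, so the merge suffixes the researches' own id.

```python
import pandas as pd

# Toy users.csv / researches.csv contents; ids and emails are invented.
users = pd.DataFrame({'id': [1, 2], 'email': ['a@example.com', 'b@example.com']})
researches = pd.DataFrame({'id': [10, 20], 'user': [1, 2]})

# research_to_email(): after the merge, the researches' 'id' becomes 'id_x'
# (and the users' 'id' becomes 'id_y'), hence IDX_COLUMN = 'id_x'.
res_to_email = (
    pd.merge(researches, users, left_on='user', right_on='id', how='left')
    .set_index('id_x')['email']
    .to_dict()
)
print(res_to_email)  # {10: 'a@example.com', 20: 'b@example.com'}

# Toy filtered tasktracker data and edu export.
filtered = pd.DataFrame({'research_id': [10, 20]})
edu = pd.DataFrame({'email': ['a@example.com']})

# split_dataframe(): map research ids to emails, then split on membership
# in the edu file's email column.
filtered['email'] = filtered['research_id'].map(res_to_email)
in_edu = filtered[filtered['email'].isin(edu['email'].unique())]
not_in_edu = filtered[~filtered['email'].isin(edu['email'].unique())]
print(len(in_edu), len(not_in_edu))  # 1 1
```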