From 9aeec7daba8004ba9ca925ca594adad08e7d6f7e Mon Sep 17 00:00:00 2001 From: WojtekMs <62173407+WojtekMs@users.noreply.github.com> Date: Tue, 5 Dec 2023 13:47:46 +0100 Subject: [PATCH] Add parsing categories from file Prepopulate categories file with basic categories. NOTE: It is expected that the user will modify categories.json according to their own needs (to get the best experience from this app) --- .gitignore | 1 + pyproject.toml | 4 +- src/banker/__main__.py | 54 +++++-- src/banker/common/naming.py | 5 + .../parser/interfaces/categories_parser.py | 9 ++ src/banker/parser/json_categories_parser.py | 90 +++++++++++ src/banker/parser/payment_type_parser.py | 14 ++ src/banker/resources/__init__.py | 0 src/banker/resources/categories.json | 152 ++++++++++++++++++ .../parser/test_data/json_categories.py | 50 ++++++ .../parser/test_json_categories_parser.py | 85 ++++++++++ .../banker/parser/test_payment_type_parser.py | 20 +++ 12 files changed, 470 insertions(+), 14 deletions(-) create mode 100644 src/banker/parser/interfaces/categories_parser.py create mode 100644 src/banker/parser/json_categories_parser.py create mode 100644 src/banker/parser/payment_type_parser.py create mode 100644 src/banker/resources/__init__.py create mode 100644 src/banker/resources/categories.json create mode 100644 tests/banker/parser/test_data/json_categories.py create mode 100644 tests/banker/parser/test_json_categories_parser.py create mode 100644 tests/banker/parser/test_payment_type_parser.py diff --git a/.gitignore b/.gitignore index 0c17061..0ac677a 100644 --- a/.gitignore +++ b/.gitignore @@ -6,3 +6,4 @@ __pycache__ .idea unmatched_transactions.html *.xlsx +output \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index 52735ca..8deb5b6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -34,7 +34,9 @@ dependencies = [ "html5lib", "beautifulsoup4", "py-moneyed", - "openpyxl" + "openpyxl", + "semver", + "importlib-resources" ] [project.optional-dependencies] # 
Optional diff --git a/src/banker/__main__.py b/src/banker/__main__.py index 15d3a66..85d900c 100644 --- a/src/banker/__main__.py +++ b/src/banker/__main__.py @@ -1,30 +1,58 @@ -from banker.analyzer.analyze import analyze_transactions, deduce_month_year -from banker.data.category import Category, PaymentType import argparse +import os.path + +from importlib_resources import files + +from banker.analyzer.analyze import analyze_transactions, deduce_month_year +from banker.data.category import Category +from banker.data.transaction import Transaction from banker.formatter.month_year_formatter import format_month_year from banker.parser.html_transactions_parser import HtmlTransactionsParser from banker.formatter.html_transactions_formatter import HtmlTransactionsFormatter +from banker.parser.interfaces.categories_parser import ICategoriesParser +from banker.parser.interfaces.transactions_parser import ITransactionsParser +from banker.parser.json_categories_parser import JsonCategoriesParser from banker.writer.excel_categories_writer import ExcelCategoriesWriter +def get_supported_categories(categories_parser: ICategoriesParser, categories_filepath: str) -> list[Category]: + with open(categories_filepath, "r") as file: + return categories_parser.parse_categories(file.read()) + + +def get_transactions(transactions_parser: ITransactionsParser, transactions_filepath: str) -> list[Transaction]: + with open(transactions_filepath, "r") as transactions_file: + return transactions_parser.parse_transactions(transactions_file.read()) + + +def save_to_file(filepath: str, content: str): + with open(filepath, "w") as file: + file.write(content) + + def main(): - supported_categories = [ - Category(name="Kaufland", payment_type=PaymentType.Household, matching_regexes=[r"KAUFLAND PL"])] transactions_parser = HtmlTransactionsParser() + categories_parser = JsonCategoriesParser() transactions_formatter = HtmlTransactionsFormatter() categories_writer = ExcelCategoriesWriter() parser = 
argparse.ArgumentParser() parser.add_argument("html_file") + parser.add_argument("--categories_file", default=files('banker.resources').joinpath('categories.json')) + parser.add_argument("--output_directory", default=files('banker.resources').joinpath('output')) args = parser.parse_args() - with open(args.html_file, "rb") as input_file: - all_transactions = transactions_parser.parse_transactions(input_file.read().decode('utf-8')) - analyze_result = analyze_transactions(all_transactions, supported_categories) - formatted_transactions = transactions_formatter.format_transactions(analyze_result.unmatched_transactions) - with open("unmatched_transactions.html", "w") as transactions_file: - transactions_file.write(formatted_transactions) - month_year = deduce_month_year(all_transactions) - categories_writer.write_categories(analyze_result.matched_categories, "autogen_budget.xlsx", - format_month_year(month_year)) + os.makedirs(args.output_directory, exist_ok=True) + output_unmatched_transactions_filepath = os.path.join(args.output_directory, "unmatched_transactions.html") + output_matched_categories_filepath = os.path.join(args.output_directory, "autogen_budget.xlsx") + + all_transactions = get_transactions(transactions_parser, args.html_file) + month_year = deduce_month_year(all_transactions) + supported_categories = get_supported_categories(categories_parser, args.categories_file) + analyze_result = analyze_transactions(all_transactions, supported_categories) + formatted_transactions = transactions_formatter.format_transactions(analyze_result.unmatched_transactions) + + save_to_file(output_unmatched_transactions_filepath, formatted_transactions) + categories_writer.write_categories(analyze_result.matched_categories, output_matched_categories_filepath, + format_month_year(month_year)) diff --git a/src/banker/common/naming.py b/src/banker/common/naming.py index b6ba158..82e955b 100644 --- a/src/banker/common/naming.py +++ b/src/banker/common/naming.py @@ -3,3 +3,8 @@ 
TRANSACTION_COL_NAME_DESCRIPTION = "Opis" TRANSACTION_COL_NAME_VALUE = "Kwota" +CATEGORIES_KEY_NAME_VERSION = "version" +CATEGORIES_KEY_NAME_CATEGORIES = "categories" +CATEGORIES_KEY_NAME_CATEGORY_NAME = "name" +CATEGORIES_KEY_NAME_CATEGORY_PAYMENT_TYPE = "payment_type" +CATEGORIES_KEY_NAME_CATEGORY_REGEXES = "matching_regexes" diff --git a/src/banker/parser/interfaces/categories_parser.py b/src/banker/parser/interfaces/categories_parser.py new file mode 100644 index 0000000..d962e0e --- /dev/null +++ b/src/banker/parser/interfaces/categories_parser.py @@ -0,0 +1,9 @@ +from abc import ABC, abstractmethod + +from banker.data.category import Category + + +class ICategoriesParser(ABC): + @abstractmethod + def parse_categories(self, content: str) -> list[Category]: + raise NotImplementedError("Method not implemented in subclass") diff --git a/src/banker/parser/json_categories_parser.py b/src/banker/parser/json_categories_parser.py new file mode 100644 index 0000000..613914a --- /dev/null +++ b/src/banker/parser/json_categories_parser.py @@ -0,0 +1,90 @@ +import logging +import semver +import json + +from banker.common.naming import CATEGORIES_KEY_NAME_VERSION, CATEGORIES_KEY_NAME_CATEGORIES, \ + CATEGORIES_KEY_NAME_CATEGORY_NAME, CATEGORIES_KEY_NAME_CATEGORY_PAYMENT_TYPE, CATEGORIES_KEY_NAME_CATEGORY_REGEXES +from banker.data.category import Category +from banker.parser.interfaces.categories_parser import ICategoriesParser +from banker.parser.payment_type_parser import parse_payment_type + + +class CategoriesVersionMissing(Exception): + def __str__(self): + return f"Key {CATEGORIES_KEY_NAME_VERSION} is missing in categories JSON file" + + +class CategoriesVersionInvalid(Exception): + def __init__(self, version: str): + self.__version = version + + def __str__(self): + return f"Categories version has invalid format, " \ + f"expected semantic versioning e.g: 1.0.0, actual: {self.__version}" + + +class CategoriesVersionUnsupported(Exception): + def __init__(self, 
supported_version: semver.Version, current_version: semver.Version): + self.__supported_version = supported_version + self.__current_version = current_version + + def __str__(self): + return f"Categories version is unsupported by application, " \ + f"supported version: {self.__supported_version}, current version: {self.__current_version}" + + +class CategoryNameDuplicate(Exception): + def __init__(self, name: str): + self.__name = name + + def __str__(self): + return f"Categories names must be unique, but this category name is used multiple times: {self.__name}" + + +class JsonCategoriesParser(ICategoriesParser): + def __init__(self): + self.__supported_version = semver.Version(major=1, minor=0, patch=0) + self.__logger = logging.getLogger("JsonCategoriesParser") + + def __validate_version(self, json_dict: dict): + version = json_dict.get(CATEGORIES_KEY_NAME_VERSION) + if version is None: + raise CategoriesVersionMissing() + if not semver.Version.is_valid(version): + raise CategoriesVersionInvalid(version) + version = semver.Version.parse(version) + if not self.__supported_version.is_compatible(version): + raise CategoriesVersionUnsupported(self.__supported_version, version) + + def __contains_required_keys(self, category: dict) -> bool: + required_keys = [CATEGORIES_KEY_NAME_CATEGORY_NAME, CATEGORIES_KEY_NAME_CATEGORY_PAYMENT_TYPE, + CATEGORIES_KEY_NAME_CATEGORY_REGEXES] + for required_key in required_keys: + if required_key not in category: + self.__logger.info(f"Category object key missing: {required_key}") + return False + return True + + def __valid_payment_type(self, category: dict) -> bool: + if parse_payment_type(category[CATEGORIES_KEY_NAME_CATEGORY_PAYMENT_TYPE]) is None: + self.__logger.info("Invalid payment type") + return False + return True + + def parse_categories(self, content: str) -> list[Category]: + json_dict = json.loads(content) + self.__validate_version(json_dict) + + result = {} + for category in json_dict.get(CATEGORIES_KEY_NAME_CATEGORIES, 
[]): + if not self.__contains_required_keys(category): + continue + if not self.__valid_payment_type(category): + continue + name = category[CATEGORIES_KEY_NAME_CATEGORY_NAME] + if name in result: + raise CategoryNameDuplicate(name) + payment_type = parse_payment_type(category[CATEGORIES_KEY_NAME_CATEGORY_PAYMENT_TYPE]) + matching_regexes = category[CATEGORIES_KEY_NAME_CATEGORY_REGEXES] + result[name] = Category(name, payment_type, matching_regexes) + return list(result.values()) diff --git a/src/banker/parser/payment_type_parser.py b/src/banker/parser/payment_type_parser.py new file mode 100644 index 0000000..8cd59d9 --- /dev/null +++ b/src/banker/parser/payment_type_parser.py @@ -0,0 +1,14 @@ +from banker.data.category import PaymentType + + +def parse_payment_type(payment_type: str) -> PaymentType | None: + match payment_type: + case 'household': + return PaymentType.Household + case 'recurring': + return PaymentType.Recurring + case 'occasional': + return PaymentType.Occasional + case 'optional': + return PaymentType.Optional + return None diff --git a/src/banker/resources/__init__.py b/src/banker/resources/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/banker/resources/categories.json b/src/banker/resources/categories.json new file mode 100644 index 0000000..a274a07 --- /dev/null +++ b/src/banker/resources/categories.json @@ -0,0 +1,152 @@ +{ + "version": "1.0.0", + "categories": [ + { + "name": "Kaufland", + "payment_type": "household", + "matching_regexes": [ + "KAUFLAND PL" + ] + }, + { + "name": "Bilety PKP", + "payment_type": "occasional", + "matching_regexes": [ + "intercity\\.pl" + ] + }, + { + "name": "Leclerc", + "payment_type": "household", + "matching_regexes": [ + "eLeclerc" + ] + }, + { + "name": "Biedronka", + "payment_type": "household", + "matching_regexes": [ + "BIEDRONKA" + ] + }, + { + "name": "Obuwie", + "payment_type": "occasional", + "matching_regexes": [ + "eobuwie\\.com\\.pl" + ] + }, + { + "name": "Netto", + 
"payment_type": "household", + "matching_regexes": [ + "NETTO" + ] + }, + { + "name": "Paliwo", + "payment_type": "household", + "matching_regexes": [ + "ORLEN" + ] + }, + { + "name": "Darowizny", + "payment_type": "optional", + "matching_regexes": [ + "DAROWIZNA" + ] + }, + { + "name": "Carrefour", + "payment_type": "household", + "matching_regexes": [ + "CARREFOUR" + ] + }, + { + "name": "Piekarnie", + "payment_type": "household", + "matching_regexes": [ + "(?i)piekarnia" + ] + }, + { + "name": "Drogerie", + "payment_type": "household", + "matching_regexes": [ + "HEBE" + ] + }, + { + "name": "Pralnie", + "payment_type": "occasional", + "matching_regexes": [ + "PRALNIA" + ] + }, + { + "name": "Bilety MPK Wrocław", + "payment_type": "household", + "matching_regexes": [ + "URBANCARD" + ] + }, + { + "name": "Spotify", + "payment_type": "recurring", + "matching_regexes": [ + "Spotify" + ] + }, + { + "name": "Action", + "payment_type": "household", + "matching_regexes": [ + "Action" + ] + }, + { + "name": "Lidl", + "payment_type": "household", + "matching_regexes": [ + "LIDL" + ] + }, + { + "name": "RTV Euro AGD", + "payment_type": "occasional", + "matching_regexes": [ + "EURO\\-NET" + ] + }, + { + "name": "Abonament telefoniczny", + "payment_type": "recurring", + "matching_regexes": [ + "24\\.play\\.pl" + ] + }, + { + "name": "Castorama", + "payment_type": "occasional", + "matching_regexes": [ + "CASTORAMA" + ] + }, + { + "name": "McDonalds", + "payment_type": "optional", + "matching_regexes": [ + "MCDONALDS" + ] + }, + { + "name": "Lody", + "payment_type": "optional", + "matching_regexes": [ + "(?i)lodziarnia" + ] + } + ] +} \ No newline at end of file diff --git a/tests/banker/parser/test_data/json_categories.py b/tests/banker/parser/test_data/json_categories.py new file mode 100644 index 0000000..a866c7d --- /dev/null +++ b/tests/banker/parser/test_data/json_categories.py @@ -0,0 +1,50 @@ +# update this value in case supported version by parser is changed 
+CURRENTLY_SUPPORTED_VERSION_BY_PARSER = "1.0.0" + +MISSING_VERSION_JSON_CATEGORIES_LITERAL = """{"categories": []}""" +INVALID_VERSION_JSON_CATEGORIES_LITERAL_1 = """{"version": "1", "categories": []}""" +INVALID_VERSION_JSON_CATEGORIES_LITERAL_2 = """{"version": "one", "categories": []}""" +INVALID_VERSION_JSON_CATEGORIES_LITERAL_3 = """{"version": "alpha", "categories": []}""" +INCOMPATIBLE_VERSION_JSON_CATEGORIES_LITERAL = """{"version": "0.1.0", "categories": []}""" + +ONE_CATEGORY_JSON_CATEGORIES_LITERAL = f"""{{ +"version": "{CURRENTLY_SUPPORTED_VERSION_BY_PARSER}", +"categories": [ +{{"name": "Kaufland", "payment_type": "household", "matching_regexes": ["KAUFLAND PL"]}} +]}}""" + +MANY_CATEGORIES_JSON_CATEGORIES_LITERAL = f"""{{ +"version": "{CURRENTLY_SUPPORTED_VERSION_BY_PARSER}", +"categories": [ +{{"name": "Kaufland", "payment_type": "household", "matching_regexes": ["KAUFLAND PL"]}}, +{{"name": "Internet", "payment_type": "recurring", "matching_regexes": ["(?i)vectra"]}}, +{{"name": "Shoes", "payment_type": "occasional", "matching_regexes": ["shoes", "adidas"]}}, +{{"name": "Cafe", "payment_type": "optional", "matching_regexes": ["Klar", "Starbucks"]}}, +{{"name": "Empty", "payment_type": "optional", "matching_regexes": []}} +]}}""" + +SOME_INVALID_CATEGORIES_JSON_CATEGORIES_LITERAL = f"""{{ +"version": "{CURRENTLY_SUPPORTED_VERSION_BY_PARSER}", +"categories": [ +{{"name": "Kaufland", "payment_type": "household", "matching_regexes": ["KAUFLAND PL"]}}, +{{"noname": "Internet", "payment_type": "recurring", "matching_regexes": ["(?i)vectra"]}}, +{{"name": "Shoes", "bad_payment": "occasional", "matching_regexes": ["shoes", "adidas"]}}, +{{"name": "Cafe", "payment_type": "optional", "regxs": ["Klar", "Starbucks"]}} +]}}""" + +INVALID_PAYMENT_TYPE_JSON_CATEGORIES_LITERAL = f"""{{ +"version": "{CURRENTLY_SUPPORTED_VERSION_BY_PARSER}", +"categories": [ +{{"name": "Kaufland", "payment_type": "household", "matching_regexes": ["KAUFLAND PL"]}}, +{{"name": 
"Internet", "payment_type": "unknown", "matching_regexes": ["(?i)vectra"]}}, +{{"name": "Shoes", "payment_type": "", "matching_regexes": ["(?i)vectra"]}}, +{{"name": "Cafe", "payment_type": "bad", "matching_regexes": ["(?i)vectra"]}} +]}}""" + +DUPLICATE_NAMES_JSON_CATEGORIES_LITERAL = f"""{{ +"version": "{CURRENTLY_SUPPORTED_VERSION_BY_PARSER}", +"categories": [ +{{"name": "Kaufland", "payment_type": "household", "matching_regexes": ["KAUFLAND PL"]}}, +{{"name": "Internet", "payment_type": "recurring", "matching_regexes": ["(?i)vectra"]}}, +{{"name": "Kaufland", "payment_type": "optional", "matching_regexes": ["Starbucks"]}} +]}}""" diff --git a/tests/banker/parser/test_json_categories_parser.py b/tests/banker/parser/test_json_categories_parser.py new file mode 100644 index 0000000..d8503f3 --- /dev/null +++ b/tests/banker/parser/test_json_categories_parser.py @@ -0,0 +1,85 @@ +import pytest + +from banker.data.category import Category, PaymentType +from banker.parser.json_categories_parser import JsonCategoriesParser, CategoriesVersionMissing, \ + CategoriesVersionInvalid, CategoriesVersionUnsupported, CategoryNameDuplicate +from tests.banker.parser.test_data.json_categories import MISSING_VERSION_JSON_CATEGORIES_LITERAL, \ + INCOMPATIBLE_VERSION_JSON_CATEGORIES_LITERAL, INVALID_VERSION_JSON_CATEGORIES_LITERAL_1, \ + INVALID_VERSION_JSON_CATEGORIES_LITERAL_2, INVALID_VERSION_JSON_CATEGORIES_LITERAL_3, \ + ONE_CATEGORY_JSON_CATEGORIES_LITERAL, MANY_CATEGORIES_JSON_CATEGORIES_LITERAL, \ + SOME_INVALID_CATEGORIES_JSON_CATEGORIES_LITERAL, INVALID_PAYMENT_TYPE_JSON_CATEGORIES_LITERAL, \ + DUPLICATE_NAMES_JSON_CATEGORIES_LITERAL + + +@pytest.fixture +def json_categories_parser_sut(): + return JsonCategoriesParser() + + +def test_given_json_file_without_version_when_parse_categories_then_raise_error(json_categories_parser_sut): + with pytest.raises(CategoriesVersionMissing): + json_categories_parser_sut.parse_categories(MISSING_VERSION_JSON_CATEGORIES_LITERAL) + + 
+@pytest.mark.parametrize('invalid_version', + [INVALID_VERSION_JSON_CATEGORIES_LITERAL_1, INVALID_VERSION_JSON_CATEGORIES_LITERAL_2, + INVALID_VERSION_JSON_CATEGORIES_LITERAL_3]) +def test_given_json_file_with_invalid_version_when_parse_categories_then_raise_error(json_categories_parser_sut, + invalid_version): + with pytest.raises(CategoriesVersionInvalid): + json_categories_parser_sut.parse_categories(invalid_version) + + +def test_given_json_file_with_incompatible_version_when_parse_categories_then_raise_error(json_categories_parser_sut): + with pytest.raises(CategoriesVersionUnsupported): + json_categories_parser_sut.parse_categories(INCOMPATIBLE_VERSION_JSON_CATEGORIES_LITERAL) + + +def test_given_json_file_with_duplicate_category_names_when_parse_categories_then_raise_error( + json_categories_parser_sut): + with pytest.raises(CategoryNameDuplicate): + json_categories_parser_sut.parse_categories(DUPLICATE_NAMES_JSON_CATEGORIES_LITERAL) + + +def test_given_json_file_with_one_category_when_parse_categories_then_return_category(json_categories_parser_sut): + expected_result = [Category(name="Kaufland", payment_type=PaymentType.Household, matching_regexes=["KAUFLAND PL"])] + + actual_result = json_categories_parser_sut.parse_categories(ONE_CATEGORY_JSON_CATEGORIES_LITERAL) + + assert actual_result == expected_result + + +def test_given_json_file_with_many_categories_when_parse_categories_then_return_categories(json_categories_parser_sut): + expected_result = [ + Category(name="Kaufland", payment_type=PaymentType.Household, matching_regexes=["KAUFLAND PL"]), + Category(name="Internet", payment_type=PaymentType.Recurring, matching_regexes=["(?i)vectra"]), + Category(name="Shoes", payment_type=PaymentType.Occasional, matching_regexes=["shoes", "adidas"]), + Category(name="Cafe", payment_type=PaymentType.Optional, matching_regexes=["Klar", "Starbucks"]), + Category(name="Empty", payment_type=PaymentType.Optional, matching_regexes=[]), + ] + + actual_result = 
json_categories_parser_sut.parse_categories(MANY_CATEGORIES_JSON_CATEGORIES_LITERAL) + + assert actual_result == expected_result + + +def test_given_json_file_with_some_invalid_categories_when_parse_categories_then_return_valid_categories( + json_categories_parser_sut): + expected_result = [ + Category(name="Kaufland", payment_type=PaymentType.Household, matching_regexes=["KAUFLAND PL"]), + ] + + actual_result = json_categories_parser_sut.parse_categories(SOME_INVALID_CATEGORIES_JSON_CATEGORIES_LITERAL) + + assert actual_result == expected_result + + +def test_given_json_file_with_invalid_payment_type_when_parse_categories_then_skip( + json_categories_parser_sut): + expected_result = [ + Category(name="Kaufland", payment_type=PaymentType.Household, matching_regexes=["KAUFLAND PL"]), + ] + + actual_result = json_categories_parser_sut.parse_categories(INVALID_PAYMENT_TYPE_JSON_CATEGORIES_LITERAL) + + assert actual_result == expected_result diff --git a/tests/banker/parser/test_payment_type_parser.py b/tests/banker/parser/test_payment_type_parser.py new file mode 100644 index 0000000..220b29f --- /dev/null +++ b/tests/banker/parser/test_payment_type_parser.py @@ -0,0 +1,20 @@ +import pytest + +from banker.data.category import PaymentType +from banker.parser.payment_type_parser import parse_payment_type + + +@pytest.mark.parametrize('invalid_payment_type_str', ["invalid", "HOUSEHOLD", "duck", "213979", "", '', "Household"]) +def test_given_invalid_payment_type_str_when_parse_payment_type_then_return_none(invalid_payment_type_str): + assert parse_payment_type(invalid_payment_type_str) is None + + +@pytest.mark.parametrize('payment_type_str, expected_result', [("household", PaymentType.Household), + ("recurring", PaymentType.Recurring), + ("occasional", PaymentType.Occasional), + ("optional", PaymentType.Optional)]) +def test_given_valid_payment_type_str_when_parse_payment_type_then_return_payment_type(payment_type_str, + expected_result): + actual_result = 
parse_payment_type(payment_type_str) + + assert actual_result == expected_result