diff --git a/notebooks/db_analysis.ipynb b/notebooks/db_analysis.ipynb index 99111a3..df2e0cd 100644 --- a/notebooks/db_analysis.ipynb +++ b/notebooks/db_analysis.ipynb @@ -9,23 +9,23 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ - "from database_handler import DatabaseHandler" + "from stocksense.database_handler import DatabaseHandler" ] }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "\u001b[32m2024-11-15 15:59:57.517\u001b[0m | \u001b[32m\u001b[1mSUCCESS \u001b[0m | \u001b[36mdatabase_handler.schema\u001b[0m:\u001b[36mcreate_tables\u001b[0m:\u001b[36m122\u001b[0m - \u001b[32m\u001b[1mTables created successfully\u001b[0m\n" + "\u001b[32m2024-11-18 15:44:57.298\u001b[0m | \u001b[32m\u001b[1mSUCCESS \u001b[0m | \u001b[36mstocksense.database_handler.schema\u001b[0m:\u001b[36mcreate_tables\u001b[0m:\u001b[36m122\u001b[0m - \u001b[32m\u001b[1mTables created successfully\u001b[0m\n" ] } ], @@ -35,7 +35,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 4, "metadata": {}, "outputs": [ { @@ -48,30 +48,30 @@ " white-space: pre-wrap;\n", "}\n", "\n", - "shape: (940, 6)
[HTML preview of the 940 × 6 stock table, old and new revisions; same rows as the text/plain diff below]
" ], "text/plain": [ "shape: (940, 6)\n", - "┌───────┬──────────────────┬────────────────────────┬─────────────┬────────────┬────────┐\n", - "│ tic ┆ name ┆ sector ┆ last_update ┆ spx_status ┆ active │\n", - "│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │\n", - "│ str ┆ str ┆ str ┆ date ┆ i64 ┆ i64 │\n", - "╞═══════╪══════════════════╪════════════════════════╪═════════════╪════════════╪════════╡\n", - "│ ABI ┆ null ┆ Industrials ┆ 2005-01-01 ┆ 0 ┆ 0 │\n", - "│ ABKFQ ┆ null ┆ Financials ┆ 2005-01-01 ┆ 0 ┆ 0 │\n", - "│ ABS ┆ null ┆ Consumer Staples ┆ 2005-01-01 ┆ 0 ┆ 0 │\n", - "│ ACV ┆ null ┆ Consumer Staples ┆ 2005-01-01 ┆ 0 ┆ 0 │\n", - "│ ANRZQ ┆ null ┆ Materials ┆ 2005-01-01 ┆ 0 ┆ 0 │\n", - "│ … ┆ … ┆ … ┆ … ┆ … ┆ … │\n", - "│ SW ┆ Smurfit WestRock ┆ Materials ┆ 2024-11-01 ┆ 1 ┆ 1 │\n", - "│ DELL ┆ null ┆ Information Technology ┆ 2024-11-14 ┆ 1 ┆ 1 │\n", - "│ MMM ┆ 3M ┆ Industrials ┆ 2024-11-14 ┆ 1 ┆ 1 │\n", - "│ AOS ┆ A. O. Smith ┆ Industrials ┆ 2024-11-14 ┆ 1 ┆ 1 │\n", - "│ ABT ┆ Abbott ┆ Health Care ┆ 2024-11-14 ┆ 1 ┆ 1 │\n", - "└───────┴──────────────────┴────────────────────────┴─────────────┴────────────┴────────┘" + "┌───────┬─────────────┬────────────────────────┬─────────────┬────────────┬────────┐\n", + "│ tic ┆ name ┆ sector ┆ last_update ┆ spx_status ┆ active │\n", + "│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │\n", + "│ str ┆ str ┆ str ┆ date ┆ i64 ┆ i64 │\n", + "╞═══════╪═════════════╪════════════════════════╪═════════════╪════════════╪════════╡\n", + "│ ABI ┆ null ┆ Industrials ┆ 2005-01-01 ┆ 0 ┆ 0 │\n", + "│ ABKFQ ┆ null ┆ Financials ┆ 2005-01-01 ┆ 0 ┆ 0 │\n", + "│ ABS ┆ null ┆ Consumer Staples ┆ 2005-01-01 ┆ 0 ┆ 0 │\n", + "│ ACV ┆ null ┆ Consumer Staples ┆ 2005-01-01 ┆ 0 ┆ 0 │\n", + "│ ANRZQ ┆ null ┆ Materials ┆ 2005-01-01 ┆ 0 ┆ 0 │\n", + "│ … ┆ … ┆ … ┆ … ┆ … ┆ … │\n", + "│ DELL ┆ null ┆ Information Technology ┆ 2024-11-14 ┆ 1 ┆ 1 │\n", + "│ MMM ┆ 3M ┆ Industrials ┆ 2024-11-14 ┆ 1 ┆ 1 │\n", + "│ AOS ┆ A. O. Smith ┆ Industrials ┆ 2024-11-14 ┆ 1 ┆ 1 │\n", + "│ ABT ┆ Abbott ┆ Health Care ┆ 2024-11-14 ┆ 1 ┆ 1 │\n", + "│ AAPL ┆ Apple Inc. ┆ Information Technology ┆ 2024-11-17 ┆ 1 ┆ 1 │\n", + "└───────┴─────────────┴────────────────────────┴─────────────┴────────────┴────────┘" ] }, - "execution_count": 17, + "execution_count": 4, "metadata": {}, "output_type": "execute_result" } diff --git a/notebooks/mock_data.ipynb b/notebooks/mock_data.ipynb index bdf7906..276db33 100644 --- a/notebooks/mock_data.ipynb +++ b/notebooks/mock_data.ipynb @@ -2,14 +2,14 @@ "cells": [ { "cell_type": "code", - "execution_count": 3, + "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "\u001b[32m2024-11-17 17:17:05.118\u001b[0m | \u001b[32m\u001b[1mSUCCESS \u001b[0m | \u001b[36mstocksense.database_handler.schema\u001b[0m:\u001b[36mcreate_tables\u001b[0m:\u001b[36m122\u001b[0m - \u001b[32m\u001b[1mTables created successfully\u001b[0m\n" + "\u001b[32m2024-11-17 19:03:35.835\u001b[0m | \u001b[32m\u001b[1mSUCCESS \u001b[0m | \u001b[36mstocksense.database_handler.schema\u001b[0m:\u001b[36mcreate_tables\u001b[0m:\u001b[36m122\u001b[0m - \u001b[32m\u001b[1mTables created successfully\u001b[0m\n" ] } ], @@ -27,7 +27,55 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "shape: (79, 27)
[HTML preview of the 79 × 27 AAPL financial-data frame; same rows as the text/plain output below]
" + ], + "text/plain": [ + "shape: (79, 27)\n", + "┌──────┬────────────┬────────────┬──────────┬───┬────────┬────────┬──────────┬──────────────┐\n", + "│ tic ┆ datadate ┆ rdq ┆ saleq ┆ … ┆ dvq ┆ capxq ┆ icaptq ┆ surprise_pct │\n", + "│ --- ┆ --- ┆ --- ┆ --- ┆ ┆ --- ┆ --- ┆ --- ┆ --- │\n", + "│ str ┆ date ┆ date ┆ f64 ┆ ┆ f64 ┆ f64 ┆ f64 ┆ f64 │\n", + "╞══════╪════════════╪════════════╪══════════╪═══╪════════╪════════╪══════════╪══════════════╡\n", + "│ AAPL ┆ 2005-03-31 ┆ 2005-04-13 ┆ 3243.0 ┆ … ┆ 0.0 ┆ 101.0 ┆ 6386.0 ┆ 0.397 │\n", + "│ AAPL ┆ 2005-06-30 ┆ 2005-07-13 ┆ 3520.0 ┆ … ┆ 0.0 ┆ 63.0 ┆ 6821.0 ┆ 0.18 │\n", + "│ AAPL ┆ 2005-09-30 ┆ 2005-10-11 ┆ 3678.0 ┆ … ┆ 0.0 ┆ 96.0 ┆ 7466.0 ┆ 0.021 │\n", + "│ AAPL ┆ 2005-12-31 ┆ 2006-01-18 ┆ 5749.0 ┆ … ┆ 0.0 ┆ 82.0 ┆ 8380.0 ┆ 0.071 │\n", + "│ AAPL ┆ 2006-03-31 ┆ 2006-04-19 ┆ 4359.0 ┆ … ┆ 0.0 ┆ 193.0 ┆ 8682.0 ┆ 0.082 │\n", + "│ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … │\n", + "│ AAPL ┆ 2023-09-30 ┆ 2023-11-02 ┆ 89498.0 ┆ … ┆ 3758.0 ┆ 2163.0 ┆ 173234.0 ┆ 0.049 │\n", + "│ AAPL ┆ 2023-12-31 ┆ 2024-02-01 ┆ 119575.0 ┆ … ┆ 3825.0 ┆ 2392.0 ┆ 182140.0 ┆ 0.039 │\n", + "│ AAPL ┆ 2024-03-31 ┆ 2024-05-02 ┆ 90753.0 ┆ … ┆ 3710.0 ┆ 1996.0 ┆ 178784.0 ┆ 0.02 │\n", + "│ AAPL ┆ 2024-06-30 ┆ 2024-08-01 ┆ 85777.0 ┆ … ┆ 3895.0 ┆ 2151.0 ┆ 168012.0 ┆ 0.0399 │\n", + "│ AAPL ┆ 2024-09-30 ┆ 2024-10-31 ┆ 94930.0 ┆ … ┆ 3804.0 ┆ 2908.0 ┆ 163579.0 ┆ 0.0237 │\n", + "└──────┴────────────┴────────────┴──────────┴───┴────────┴────────┴──────────┴──────────────┘" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "db.fetch_financial_data(\"AAPL\")\n", + "# fetch all required data" + ] + }, + { + "cell_type": "code", + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ @@ -74,6 +122,43 @@ "vix_data.write_parquet(FIXTURE_PATH / \"vix_data.parquet\")" ] }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "shape: (3, 6)
[HTML preview of the 3 × 6 stock fixture table; same rows as the text/plain output below]
" + ], + "text/plain": [ + "shape: (3, 6)\n", + "┌──────┬────────────┬────────────────────────┬─────────────┬────────────┬────────┐\n", + "│ tic ┆ name ┆ sector ┆ last_update ┆ spx_status ┆ active │\n", + "│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │\n", + "│ str ┆ str ┆ str ┆ date ┆ i64 ┆ i64 │\n", + "╞══════╪════════════╪════════════════════════╪═════════════╪════════════╪════════╡\n", + "│ AAPL ┆ Apple Inc. ┆ Information Technology ┆ 2024-11-01 ┆ 1 ┆ 1 │\n", + "│ COST ┆ Costco ┆ Consumer Staples ┆ 2024-11-01 ┆ 1 ┆ 1 │\n", + "│ PFE ┆ Pfizer ┆ Health Care ┆ 2024-11-01 ┆ 1 ┆ 1 │\n", + "└──────┴────────────┴────────────────────────┴─────────────┴────────────┴────────┘" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [] + }, { "cell_type": "code", "execution_count": 3, diff --git a/pyproject.toml b/pyproject.toml index 9c73f23..70a6abf 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -75,5 +75,5 @@ skip-magic-trailing-comma = false line-ending = "auto" [project.scripts] -stocksense-app = "app.home:main" -stocksense = "main:main" +stocksense-app = "stocksense.app.home:main" +stocksense = "stocksense.main:main" diff --git a/stocksense/app/pages/analytics.py b/stocksense/app/pages/analytics.py index fcf8cde..dc9fa79 100644 --- a/stocksense/app/pages/analytics.py +++ b/stocksense/app/pages/analytics.py @@ -1,14 +1,12 @@ -import datetime as dt -from pathlib import Path - import pandas as pd import plotly.express as px import plotly.graph_objects as go -import polars as pl import streamlit as st -from database_handler import DatabaseHandler from plotly.subplots import make_subplots +from stocksense.database_handler import DatabaseHandler +from stocksense.pipeline import clean, engineer_features + pd.options.mode.chained_assignment = None # default='warn' MILLION = 1000000 @@ -44,17 +42,9 @@ def load_processed_data(): Read most recently processed dataset. 
""" - directory_path = Path("data/1_work_data/processed") - csv_files = directory_path.glob("*.csv") - - date_files = [ - (file, dt.datetime.strptime(file.stem.split("_")[-1], "%Y-%m-%d")) for file in csv_files - ] - if date_files: - most_recent_file = max(date_files, key=lambda x: x[1])[0] - return pl.read_csv(most_recent_file, try_parse_dates=True).to_pandas() - else: - raise FileNotFoundError + data = engineer_features() + data = clean(data) + return data.to_pandas() @st.cache_data(show_spinner="Fetching stock data...", max_entries=10) diff --git a/stocksense/app/pages/overview.py b/stocksense/app/pages/overview.py index 269b46a..810abcf 100644 --- a/stocksense/app/pages/overview.py +++ b/stocksense/app/pages/overview.py @@ -2,7 +2,8 @@ import pandas as pd import plotly.express as px import streamlit as st -from database_handler import DatabaseHandler + +from stocksense.database_handler import DatabaseHandler pd.set_option("future.no_silent_downcasting", True) diff --git a/stocksense/main.py b/stocksense/main.py index a8f1447..c04d1e1 100644 --- a/stocksense/main.py +++ b/stocksense/main.py @@ -1,5 +1,6 @@ import click +from stocksense.config import config from stocksense.model import ModelHandler from stocksense.pipeline import ETL, clean, engineer_features @@ -14,7 +15,7 @@ def main(update, train, score): """ if update: - etl_handler = ETL() + etl_handler = ETL(config, stocks=["AAPL"]) etl_handler.update_index_listings() etl_handler.extract() if train: diff --git a/stocksense/model/genetic_algorithm.py b/stocksense/model/genetic_algorithm.py index d18a029..ad25e7b 100644 --- a/stocksense/model/genetic_algorithm.py +++ b/stocksense/model/genetic_algorithm.py @@ -2,9 +2,10 @@ import polars as pl import pygad -from config import config from loguru import logger +from stocksense.config import config + from .xgboost_model import XGBoostModel diff --git a/stocksense/model/model_handler.py b/stocksense/model/model_handler.py index 3ba9c47..a390090 100644 --- a/stocksense/model/model_handler.py +++ b/stocksense/model/model_handler.py @@ -3,9 +3,10 @@ from pathlib import Path import polars as pl -from config import config from loguru import logger +from stocksense.config import config + from .genetic_algorithm import GeneticAlgorithm, fitness_function_wrapper from .xgboost_model import XGBoostModel diff --git a/stocksense/model/xgboost_model.py b/stocksense/model/xgboost_model.py index 512d411..daca1b2 100644 --- a/stocksense/model/xgboost_model.py +++ b/stocksense/model/xgboost_model.py @@ -2,7 +2,6 @@ import sklearn.metrics as skm import xgboost as xgb -from config import config class XGBoostModel: @@ -24,7 +23,7 @@ def __init__(self, params=None, scale=1.0): "scale_pos_weight": scale, "eval_metric": "logloss", "nthread": -1, - "seed": config.model.seed, + "seed": 100, } ) self.model = None diff --git a/stocksense/pipeline/etl.py b/stocksense/pipeline/etl.py index 46a4ba5..c7910ad 100644 --- a/stocksense/pipeline/etl.py +++ b/stocksense/pipeline/etl.py @@ -7,7 +7,6 @@ from loguru import logger from tqdm import tqdm -from stocksense.config import config from stocksense.database_handler import DatabaseHandler from .scraper import Scraper @@ -23,13 +22,13 @@ class ETL: transformation and DB ingestion processes. 
""" - def __init__(self, stocks: Optional[list[str]] = None): - self.db = DatabaseHandler() - self.db_schema = config.database.db_schema - self.base_date = config.scraping.base_date - self.fin_source = "yfinance" - self.historical_data_path = DATA_PATH / "interim" - self.stocks = stocks or self._set_default_stocks() + def __init__(self, config, stocks: Optional[list[str]] = None): + self.db: DatabaseHandler = DatabaseHandler() + self.db_schema: dict = config.database.db_schema + self.base_date: str = config.scraping.base_date + self.fin_source: str = "yfinance" + self.historical_data_path: Path = DATA_PATH / "interim" + self.stocks: list[str] = stocks or self._set_default_stocks() def _set_default_stocks(self) -> list[str]: """ @@ -137,7 +136,7 @@ def extract(self) -> None: raise ValueError("No stocks assigned for ETL process.") self.extract_sp_500() self.extract_vix() - self._extract_all_stocks() + self.extract_all_stocks() def extract_sp_500(self) -> None: """ @@ -167,17 +166,17 @@ def extract_vix(self) -> None: logger.error("VIX data extraction FAILED") return - def _extract_all_stocks(self) -> None: + def extract_all_stocks(self) -> None: """ Extract data for all assigned stocks. """ pl_bar = tqdm(total=len(self.stocks), desc="Stock", leave=True) for tic in self.stocks: - self._extract_stock_data(tic) + self.extract_stock_data(tic) pl_bar.update(1) pl_bar.close() - def _extract_stock_data(self, tic: str) -> bool: + def extract_stock_data(self, tic: str) -> bool: """ Extract updated data for a single stock, including market, financial and insider trading data. If no financial data is found for the last 2 years, diff --git a/stocksense/pipeline/scraper.py b/stocksense/pipeline/scraper.py index a4b551e..dd54644 100644 --- a/stocksense/pipeline/scraper.py +++ b/stocksense/pipeline/scraper.py @@ -12,7 +12,6 @@ from stocksense.config import config -# Suppress logging from the yfinance and requests libraries logging.getLogger("yfinance").setLevel(logging.CRITICAL) @@ -27,17 +26,20 @@ class Scraper: """ def __init__(self, tic, source): - self.tic = tic - self.source = source - self.session = self._get_session() + self.tic: str = tic + self.source: str = source + self.session: CachedLimiterSession = self._get_session() if self.source == "yfinance": - self.handler = self._get_yfinance_handler() + self.handler: yf.Ticker = self._get_yfinance_handler() def _get_session(self): """ Create session for yfinance queries. - :return CachedLimiterSession: Session object. + Returns + ------- + CachedLimiterSession + Session object. """ session = CachedLimiterSession( limiter=Limiter(RequestRate(2, Duration.SECOND * 5)), @@ -80,8 +82,15 @@ def _get_stock_info_yfinance(self) -> dict: """ Scrape current info using yfinance. - :raises Exception: no info available. - :return dict: current stock info. + Returns + ------- + dict + Current stock info. + + Raises + ------ + Exception + No info available. """ data = self.handler.info @@ -102,14 +111,27 @@ def _get_fundamental_data_yfinance( self, start_date: dt.date, end_date: dt.date ) -> pl.DataFrame: """ - Scraps fundamental data from Yahoo Finance using yfinance lib, searching + Scrape fundamental data from Yahoo Finance using yfinance lib, searching for financial records released between two dates. - :param dt.date start_date: starting date. - :param dt.date end_date: ending date - :raises Exception: no financial records are available. - :return pl.DataFrame: financial report data from yfinance. + Parameters + ---------- + start_date : dt.date + Starting date. 
+ end_date : dt.date + Ending date. + + Returns + ------- + pl.DataFrame + Financial report data from yfinance. + + Raises + ------ + Exception + No financial data available for date interval. """ + fields_to_keep = config.scraping.yahoo # retrieve 3 main financial documents @@ -128,11 +150,7 @@ def _get_fundamental_data_yfinance( for c in list(fields_to_keep.keys()): if c not in df.columns: - df = df.with_columns( - [ - pl.lit(None).alias(c), - ] - ) + df = df.with_columns(pl.lit(None).alias(c)) df = df.select(list(fields_to_keep.keys())) df = df.rename(fields_to_keep) @@ -185,7 +203,6 @@ def get_stock_info(self): Scrape current stock info. """ if self.source == "yfinance": - # scrape stock current info from yfinance return self._get_stock_info_yfinance() else: raise Exception("Other methods not implemented") @@ -297,7 +314,7 @@ def scrape_sp500_stock_info() -> pl.DataFrame: """ List S&P500 stock info from wiki page and return Polars dataframe. """ - resp = requests.get("http://en.wikipedia.org/wiki/List_of_S%26P_500_companies") + resp = requests.get("https://en.wikipedia.org/wiki/List_of_S%26P_500_companies") soup = bs(resp.text, "lxml") table = soup.find("table", id="constituents") diff --git a/tests/conftest.py b/tests/conftest.py index dff6345..4818eb9 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,3 +1,5 @@ +import datetime as dt + import polars as pl import pytest @@ -42,3 +44,28 @@ def processed_data() -> pl.DataFrame: @pytest.fixture def cleaned_data() -> pl.DataFrame: return pl.read_parquet(FIXTURES_DIR / "cleaned_data.parquet") + + +@pytest.fixture +def mock_stock_data(): + return pl.DataFrame( + { + "tic": ["AAPL", "OLD", "EXISTING"], + "name": ["Apple Inc.", "Old Corp", "Existing Corp"], + "sector": ["Technology", "Finance", "Healthcare"], + "last_update": [dt.datetime.now().date()] * 3, + "spx_status": [1, 1, 1], + "active": [1, 1, 1], + } + ) + + +@pytest.fixture +def mock_active_data(): + return pl.DataFrame( + { + "tic": ["AAPL", "NEW", "EXISTING"], + "name": ["Apple Inc.", "New Corp", "Existing Corp"], + "sector": ["Technology", "Finance", "Healthcare"], + } + ) diff --git a/tests/test_etl.py b/tests/test_etl.py new file mode 100644 index 0000000..885d4dc --- /dev/null +++ b/tests/test_etl.py @@ -0,0 +1,33 @@ +import polars as pl + +from stocksense.config import config +from stocksense.pipeline import ETL + + +def test_update_index_listings(mocker, mock_stock_data, mock_active_data): + etl = ETL(config) + + # mock database + mock_db = mocker.Mock() + mock_db.fetch_stock.return_value = mock_stock_data + mock_db.update_stock.return_value = True + mock_db.insert_stock.return_value = True + etl.db = mock_db + + # mock scraper + mocker.patch( + "stocksense.pipeline.scraper.Scraper.scrape_sp500_stock_info", return_value=mock_active_data + ) + + # Run update + etl.update_index_listings() + + # verify delisted stock was updated + mock_db.update_stock.assert_any_call("OLD", {"spx_status": 0}) + + # verify new stock was added + mock_db.insert_stock.assert_called_once() + insert_call_df = mock_db.insert_stock.call_args[0][0] + assert "NEW" in insert_call_df["tic"].to_list() + assert insert_call_df.filter(pl.col("tic") == "NEW")["spx_status"][0] == 1 + assert insert_call_df.filter(pl.col("tic") == "NEW")["active"][0] == 1
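
Usage note: a minimal sketch of the package-qualified API after this change, mirroring the updated main.py and test_etl.py — the config object is now injected into ETL rather than imported inside it, and the single-ticker stock list below is illustrative only, not part of the diff.

    from stocksense.config import config
    from stocksense.database_handler import DatabaseHandler
    from stocksense.pipeline import ETL

    # ETL now receives config explicitly (see stocksense/pipeline/etl.py).
    etl = ETL(config, stocks=["AAPL"])  # illustrative single-ticker run
    etl.update_index_listings()         # refresh S&P 500 membership flags
    etl.extract()                       # S&P 500 index, VIX and per-stock data

    # Inspect the ingested data, as done in notebooks/mock_data.ipynb.
    db = DatabaseHandler()
    print(db.fetch_financial_data("AAPL").shape)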