diff --git a/notebooks/db_analysis.ipynb b/notebooks/db_analysis.ipynb
index 99111a3..df2e0cd 100644
--- a/notebooks/db_analysis.ipynb
+++ b/notebooks/db_analysis.ipynb
@@ -9,23 +9,23 @@
},
{
"cell_type": "code",
- "execution_count": 15,
+ "execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
- "from database_handler import DatabaseHandler"
+ "from stocksense.database_handler import DatabaseHandler"
]
},
{
"cell_type": "code",
- "execution_count": 18,
+ "execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
- "\u001b[32m2024-11-15 15:59:57.517\u001b[0m | \u001b[32m\u001b[1mSUCCESS \u001b[0m | \u001b[36mdatabase_handler.schema\u001b[0m:\u001b[36mcreate_tables\u001b[0m:\u001b[36m122\u001b[0m - \u001b[32m\u001b[1mTables created successfully\u001b[0m\n"
+ "\u001b[32m2024-11-18 15:44:57.298\u001b[0m | \u001b[32m\u001b[1mSUCCESS \u001b[0m | \u001b[36mstocksense.database_handler.schema\u001b[0m:\u001b[36mcreate_tables\u001b[0m:\u001b[36m122\u001b[0m - \u001b[32m\u001b[1mTables created successfully\u001b[0m\n"
]
}
],
@@ -35,7 +35,7 @@
},
{
"cell_type": "code",
- "execution_count": 17,
+ "execution_count": 4,
"metadata": {},
"outputs": [
{
@@ -48,30 +48,30 @@
" white-space: pre-wrap;\n",
"}\n",
"\n",
- "shape: (940, 6)
tic | name | sector | last_update | spx_status | active |
---|
str | str | str | date | i64 | i64 |
"ABI" | null | "Industrials" | 2005-01-01 | 0 | 0 |
"ABKFQ" | null | "Financials" | 2005-01-01 | 0 | 0 |
"ABS" | null | "Consumer Staples" | 2005-01-01 | 0 | 0 |
"ACV" | null | "Consumer Staples" | 2005-01-01 | 0 | 0 |
"ANRZQ" | null | "Materials" | 2005-01-01 | 0 | 0 |
… | … | … | … | … | … |
"SW" | "Smurfit WestRock" | "Materials" | 2024-11-01 | 1 | 1 |
"DELL" | null | "Information Technology" | 2024-11-14 | 1 | 1 |
"MMM" | "3M" | "Industrials" | 2024-11-14 | 1 | 1 |
"AOS" | "A. O. Smith" | "Industrials" | 2024-11-14 | 1 | 1 |
"ABT" | "Abbott" | "Health Care" | 2024-11-14 | 1 | 1 |
"
+ "shape: (940, 6)tic | name | sector | last_update | spx_status | active |
---|
str | str | str | date | i64 | i64 |
"ABI" | null | "Industrials" | 2005-01-01 | 0 | 0 |
"ABKFQ" | null | "Financials" | 2005-01-01 | 0 | 0 |
"ABS" | null | "Consumer Staples" | 2005-01-01 | 0 | 0 |
"ACV" | null | "Consumer Staples" | 2005-01-01 | 0 | 0 |
"ANRZQ" | null | "Materials" | 2005-01-01 | 0 | 0 |
… | … | … | … | … | … |
"DELL" | null | "Information Technology" | 2024-11-14 | 1 | 1 |
"MMM" | "3M" | "Industrials" | 2024-11-14 | 1 | 1 |
"AOS" | "A. O. Smith" | "Industrials" | 2024-11-14 | 1 | 1 |
"ABT" | "Abbott" | "Health Care" | 2024-11-14 | 1 | 1 |
"AAPL" | "Apple Inc." | "Information Technology" | 2024-11-17 | 1 | 1 |
"
],
"text/plain": [
"shape: (940, 6)\n",
- "┌───────┬──────────────────┬────────────────────────┬─────────────┬────────────┬────────┐\n",
- "│ tic ┆ name ┆ sector ┆ last_update ┆ spx_status ┆ active │\n",
- "│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │\n",
- "│ str ┆ str ┆ str ┆ date ┆ i64 ┆ i64 │\n",
- "╞═══════╪══════════════════╪════════════════════════╪═════════════╪════════════╪════════╡\n",
- "│ ABI ┆ null ┆ Industrials ┆ 2005-01-01 ┆ 0 ┆ 0 │\n",
- "│ ABKFQ ┆ null ┆ Financials ┆ 2005-01-01 ┆ 0 ┆ 0 │\n",
- "│ ABS ┆ null ┆ Consumer Staples ┆ 2005-01-01 ┆ 0 ┆ 0 │\n",
- "│ ACV ┆ null ┆ Consumer Staples ┆ 2005-01-01 ┆ 0 ┆ 0 │\n",
- "│ ANRZQ ┆ null ┆ Materials ┆ 2005-01-01 ┆ 0 ┆ 0 │\n",
- "│ … ┆ … ┆ … ┆ … ┆ … ┆ … │\n",
- "│ SW ┆ Smurfit WestRock ┆ Materials ┆ 2024-11-01 ┆ 1 ┆ 1 │\n",
- "│ DELL ┆ null ┆ Information Technology ┆ 2024-11-14 ┆ 1 ┆ 1 │\n",
- "│ MMM ┆ 3M ┆ Industrials ┆ 2024-11-14 ┆ 1 ┆ 1 │\n",
- "│ AOS ┆ A. O. Smith ┆ Industrials ┆ 2024-11-14 ┆ 1 ┆ 1 │\n",
- "│ ABT ┆ Abbott ┆ Health Care ┆ 2024-11-14 ┆ 1 ┆ 1 │\n",
- "└───────┴──────────────────┴────────────────────────┴─────────────┴────────────┴────────┘"
+ "┌───────┬─────────────┬────────────────────────┬─────────────┬────────────┬────────┐\n",
+ "│ tic ┆ name ┆ sector ┆ last_update ┆ spx_status ┆ active │\n",
+ "│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │\n",
+ "│ str ┆ str ┆ str ┆ date ┆ i64 ┆ i64 │\n",
+ "╞═══════╪═════════════╪════════════════════════╪═════════════╪════════════╪════════╡\n",
+ "│ ABI ┆ null ┆ Industrials ┆ 2005-01-01 ┆ 0 ┆ 0 │\n",
+ "│ ABKFQ ┆ null ┆ Financials ┆ 2005-01-01 ┆ 0 ┆ 0 │\n",
+ "│ ABS ┆ null ┆ Consumer Staples ┆ 2005-01-01 ┆ 0 ┆ 0 │\n",
+ "│ ACV ┆ null ┆ Consumer Staples ┆ 2005-01-01 ┆ 0 ┆ 0 │\n",
+ "│ ANRZQ ┆ null ┆ Materials ┆ 2005-01-01 ┆ 0 ┆ 0 │\n",
+ "│ … ┆ … ┆ … ┆ … ┆ … ┆ … │\n",
+ "│ DELL ┆ null ┆ Information Technology ┆ 2024-11-14 ┆ 1 ┆ 1 │\n",
+ "│ MMM ┆ 3M ┆ Industrials ┆ 2024-11-14 ┆ 1 ┆ 1 │\n",
+ "│ AOS ┆ A. O. Smith ┆ Industrials ┆ 2024-11-14 ┆ 1 ┆ 1 │\n",
+ "│ ABT ┆ Abbott ┆ Health Care ┆ 2024-11-14 ┆ 1 ┆ 1 │\n",
+ "│ AAPL ┆ Apple Inc. ┆ Information Technology ┆ 2024-11-17 ┆ 1 ┆ 1 │\n",
+ "└───────┴─────────────┴────────────────────────┴─────────────┴────────────┴────────┘"
]
},
- "execution_count": 17,
+ "execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
diff --git a/notebooks/mock_data.ipynb b/notebooks/mock_data.ipynb
index bdf7906..276db33 100644
--- a/notebooks/mock_data.ipynb
+++ b/notebooks/mock_data.ipynb
@@ -2,14 +2,14 @@
"cells": [
{
"cell_type": "code",
- "execution_count": 3,
+ "execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
- "\u001b[32m2024-11-17 17:17:05.118\u001b[0m | \u001b[32m\u001b[1mSUCCESS \u001b[0m | \u001b[36mstocksense.database_handler.schema\u001b[0m:\u001b[36mcreate_tables\u001b[0m:\u001b[36m122\u001b[0m - \u001b[32m\u001b[1mTables created successfully\u001b[0m\n"
+ "\u001b[32m2024-11-17 19:03:35.835\u001b[0m | \u001b[32m\u001b[1mSUCCESS \u001b[0m | \u001b[36mstocksense.database_handler.schema\u001b[0m:\u001b[36mcreate_tables\u001b[0m:\u001b[36m122\u001b[0m - \u001b[32m\u001b[1mTables created successfully\u001b[0m\n"
]
}
],
@@ -27,7 +27,55 @@
},
{
"cell_type": "code",
- "execution_count": 2,
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
shape: (79, 27)tic | datadate | rdq | saleq | cogsq | xsgaq | niq | ebitdaq | cshoq | actq | atq | cheq | rectq | invtq | ppentq | lctq | dlttq | ltq | req | seqq | oancfq | ivncfq | fincfq | dvq | capxq | icaptq | surprise_pct |
---|
str | date | date | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 |
"AAPL" | 2005-03-31 | 2005-04-13 | 3243.0 | 2235.0 | 570.0 | 286.0 | 438.0 | 823.137 | 9007.0 | 10111.0 | 7057.0 | 1242.0 | 164.0 | 742.0 | 3352.0 | 0.0 | 3725.0 | 3261.0 | 6386.0 | 1311.0 | -2432.0 | 406.0 | 0.0 | 101.0 | 6386.0 | 0.397 |
"AAPL" | 2005-06-30 | 2005-07-13 | 3520.0 | 2430.0 | 618.0 | 319.0 | 472.0 | 827.981 | 9376.0 | 10488.0 | 7526.0 | 1101.0 | 193.0 | 764.0 | 3123.0 | 0.0 | 3667.0 | 3584.0 | 6821.0 | 472.0 | 305.0 | 63.0 | 0.0 | 63.0 | 6821.0 | 0.18 |
"AAPL" | 2005-09-30 | 2005-10-11 | 3678.0 | 2592.0 | 618.0 | 428.0 | 468.0 | 835.019 | 10300.0 | 11551.0 | 8261.0 | 1312.0 | 165.0 | 817.0 | 3484.0 | 0.0 | 4085.0 | 4005.0 | 7466.0 | 752.0 | -429.0 | 74.0 | 0.0 | 96.0 | 7466.0 | 0.021 |
"AAPL" | 2005-12-31 | 2006-01-18 | 5749.0 | 4133.0 | 814.0 | 565.0 | 802.0 | 845.617 | 12162.0 | 14181.0 | 8707.0 | 2413.0 | 244.0 | 855.0 | 5060.0 | 0.0 | 5801.0 | 4565.0 | 8380.0 | 283.0 | 93.0 | 283.0 | 0.0 | 82.0 | 8380.0 | 0.071 |
"AAPL" | 2006-03-31 | 2006-04-19 | 4359.0 | 3012.0 | 768.0 | 410.0 | 579.0 | 849.188 | 11286.0 | 13911.0 | 8226.0 | 1984.0 | 204.0 | 1005.0 | 4456.0 | 0.0 | 5229.0 | 4668.0 | 8682.0 | -125.0 | 2462.0 | -141.0 | 0.0 | 193.0 | 8682.0 | 0.082 |
… | … | … | … | … | … | … | … | … | … | … | … | … | … | … | … | … | … | … | … | … | … | … | … | … | … | … |
"AAPL" | 2023-09-30 | 2023-11-02 | 89498.0 | 49071.0 | 6151.0 | 22956.0 | 30653.0 | 15550.061 | 143566.0 | 352583.0 | 61555.0 | 60985.0 | 6331.0 | 43715.0 | 145308.0 | 95281.0 | 290437.0 | -214.0 | 62146.0 | 21598.0 | 2394.0 | -23153.0 | 3758.0 | 2163.0 | 173234.0 | 0.049 |
"AAPL" | 2023-12-31 | 2024-02-01 | 119575.0 | 64720.0 | 6786.0 | 33916.0 | 43221.0 | 15460.223 | 143692.0 | 353514.0 | 73100.0 | 50102.0 | 6511.0 | 43666.0 | 133973.0 | 95088.0 | 279414.0 | 8242.0 | 74100.0 | 39895.0 | 1927.0 | -30585.0 | 3825.0 | 2392.0 | 182140.0 | 0.039 |
"AAPL" | 2024-03-31 | 2024-05-02 | 90753.0 | 48482.0 | 6468.0 | 23636.0 | 30736.0 | 15337.686 | 128416.0 | 337411.0 | 67150.0 | 41150.0 | 6232.0 | 43546.0 | 123822.0 | 91831.0 | 263217.0 | 4339.0 | 74194.0 | 22690.0 | -310.0 | -30433.0 | 3710.0 | 1996.0 | 178784.0 | 0.02 |
"AAPL" | 2024-06-30 | 2024-08-01 | 85777.0 | 46099.0 | 6320.0 | 21448.0 | 28202.0 | 15222.259 | 125435.0 | 331612.0 | 61801.0 | 43172.0 | 6165.0 | 44502.0 | 131624.0 | 86196.0 | 264904.0 | -4726.0 | 66708.0 | 28858.0 | -127.0 | -36017.0 | 3895.0 | 2151.0 | 168012.0 | 0.0399 |
"AAPL" | 2024-09-30 | 2024-10-31 | 94930.0 | 51051.0 | 6523.0 | 14736.0 | 32502.0 | 15116.786 | 152987.0 | 364980.0 | 65171.0 | 66243.0 | 7286.0 | 45680.0 | 176392.0 | 85750.0 | 308030.0 | -19154.0 | 56950.0 | 26811.0 | 1445.0 | -24948.0 | 3804.0 | 2908.0 | 163579.0 | 0.0237 |
"
+ ],
+ "text/plain": [
+ "shape: (79, 27)\n",
+ "┌──────┬────────────┬────────────┬──────────┬───┬────────┬────────┬──────────┬──────────────┐\n",
+ "│ tic ┆ datadate ┆ rdq ┆ saleq ┆ … ┆ dvq ┆ capxq ┆ icaptq ┆ surprise_pct │\n",
+ "│ --- ┆ --- ┆ --- ┆ --- ┆ ┆ --- ┆ --- ┆ --- ┆ --- │\n",
+ "│ str ┆ date ┆ date ┆ f64 ┆ ┆ f64 ┆ f64 ┆ f64 ┆ f64 │\n",
+ "╞══════╪════════════╪════════════╪══════════╪═══╪════════╪════════╪══════════╪══════════════╡\n",
+ "│ AAPL ┆ 2005-03-31 ┆ 2005-04-13 ┆ 3243.0 ┆ … ┆ 0.0 ┆ 101.0 ┆ 6386.0 ┆ 0.397 │\n",
+ "│ AAPL ┆ 2005-06-30 ┆ 2005-07-13 ┆ 3520.0 ┆ … ┆ 0.0 ┆ 63.0 ┆ 6821.0 ┆ 0.18 │\n",
+ "│ AAPL ┆ 2005-09-30 ┆ 2005-10-11 ┆ 3678.0 ┆ … ┆ 0.0 ┆ 96.0 ┆ 7466.0 ┆ 0.021 │\n",
+ "│ AAPL ┆ 2005-12-31 ┆ 2006-01-18 ┆ 5749.0 ┆ … ┆ 0.0 ┆ 82.0 ┆ 8380.0 ┆ 0.071 │\n",
+ "│ AAPL ┆ 2006-03-31 ┆ 2006-04-19 ┆ 4359.0 ┆ … ┆ 0.0 ┆ 193.0 ┆ 8682.0 ┆ 0.082 │\n",
+ "│ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … │\n",
+ "│ AAPL ┆ 2023-09-30 ┆ 2023-11-02 ┆ 89498.0 ┆ … ┆ 3758.0 ┆ 2163.0 ┆ 173234.0 ┆ 0.049 │\n",
+ "│ AAPL ┆ 2023-12-31 ┆ 2024-02-01 ┆ 119575.0 ┆ … ┆ 3825.0 ┆ 2392.0 ┆ 182140.0 ┆ 0.039 │\n",
+ "│ AAPL ┆ 2024-03-31 ┆ 2024-05-02 ┆ 90753.0 ┆ … ┆ 3710.0 ┆ 1996.0 ┆ 178784.0 ┆ 0.02 │\n",
+ "│ AAPL ┆ 2024-06-30 ┆ 2024-08-01 ┆ 85777.0 ┆ … ┆ 3895.0 ┆ 2151.0 ┆ 168012.0 ┆ 0.0399 │\n",
+ "│ AAPL ┆ 2024-09-30 ┆ 2024-10-31 ┆ 94930.0 ┆ … ┆ 3804.0 ┆ 2908.0 ┆ 163579.0 ┆ 0.0237 │\n",
+ "└──────┴────────────┴────────────┴──────────┴───┴────────┴────────┴──────────┴──────────────┘"
+ ]
+ },
+ "execution_count": 3,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "db.fetch_financial_data(\"AAPL\")\n",
+ "# fetch all required data"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
@@ -74,6 +122,43 @@
"vix_data.write_parquet(FIXTURE_PATH / \"vix_data.parquet\")"
]
},
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
shape: (3, 6)tic | name | sector | last_update | spx_status | active |
---|
str | str | str | date | i64 | i64 |
"AAPL" | "Apple Inc." | "Information Technology" | 2024-11-01 | 1 | 1 |
"COST" | "Costco" | "Consumer Staples" | 2024-11-01 | 1 | 1 |
"PFE" | "Pfizer" | "Health Care" | 2024-11-01 | 1 | 1 |
"
+ ],
+ "text/plain": [
+ "shape: (3, 6)\n",
+ "┌──────┬────────────┬────────────────────────┬─────────────┬────────────┬────────┐\n",
+ "│ tic ┆ name ┆ sector ┆ last_update ┆ spx_status ┆ active │\n",
+ "│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │\n",
+ "│ str ┆ str ┆ str ┆ date ┆ i64 ┆ i64 │\n",
+ "╞══════╪════════════╪════════════════════════╪═════════════╪════════════╪════════╡\n",
+ "│ AAPL ┆ Apple Inc. ┆ Information Technology ┆ 2024-11-01 ┆ 1 ┆ 1 │\n",
+ "│ COST ┆ Costco ┆ Consumer Staples ┆ 2024-11-01 ┆ 1 ┆ 1 │\n",
+ "│ PFE ┆ Pfizer ┆ Health Care ┆ 2024-11-01 ┆ 1 ┆ 1 │\n",
+ "└──────┴────────────┴────────────────────────┴─────────────┴────────────┴────────┘"
+ ]
+ },
+ "execution_count": 4,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": []
+ },
{
"cell_type": "code",
"execution_count": 3,
diff --git a/pyproject.toml b/pyproject.toml
index 9c73f23..70a6abf 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -75,5 +75,5 @@ skip-magic-trailing-comma = false
line-ending = "auto"
[project.scripts]
-stocksense-app = "app.home:main"
-stocksense = "main:main"
+stocksense-app = "stocksense.app.home:main"
+stocksense = "stocksense.main:main"
diff --git a/stocksense/app/pages/analytics.py b/stocksense/app/pages/analytics.py
index fcf8cde..dc9fa79 100644
--- a/stocksense/app/pages/analytics.py
+++ b/stocksense/app/pages/analytics.py
@@ -1,14 +1,12 @@
-import datetime as dt
-from pathlib import Path
-
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
-import polars as pl
import streamlit as st
-from database_handler import DatabaseHandler
from plotly.subplots import make_subplots
+from stocksense.database_handler import DatabaseHandler
+from stocksense.pipeline import clean, engineer_features
+
pd.options.mode.chained_assignment = None # default='warn'
MILLION = 1000000
@@ -44,17 +42,9 @@ def load_processed_data():
Read most recently processed dataset.
"""
- directory_path = Path("data/1_work_data/processed")
- csv_files = directory_path.glob("*.csv")
-
- date_files = [
- (file, dt.datetime.strptime(file.stem.split("_")[-1], "%Y-%m-%d")) for file in csv_files
- ]
- if date_files:
- most_recent_file = max(date_files, key=lambda x: x[1])[0]
- return pl.read_csv(most_recent_file, try_parse_dates=True).to_pandas()
- else:
- raise FileNotFoundError
+ data = engineer_features()
+ data = clean(data)
+ return data.to_pandas()
@st.cache_data(show_spinner="Fetching stock data...", max_entries=10)
diff --git a/stocksense/app/pages/overview.py b/stocksense/app/pages/overview.py
index 269b46a..810abcf 100644
--- a/stocksense/app/pages/overview.py
+++ b/stocksense/app/pages/overview.py
@@ -2,7 +2,8 @@
import pandas as pd
import plotly.express as px
import streamlit as st
-from database_handler import DatabaseHandler
+
+from stocksense.database_handler import DatabaseHandler
pd.set_option("future.no_silent_downcasting", True)
diff --git a/stocksense/main.py b/stocksense/main.py
index a8f1447..c04d1e1 100644
--- a/stocksense/main.py
+++ b/stocksense/main.py
@@ -1,5 +1,6 @@
import click
+from stocksense.config import config
from stocksense.model import ModelHandler
from stocksense.pipeline import ETL, clean, engineer_features
@@ -14,7 +15,7 @@ def main(update, train, score):
"""
if update:
- etl_handler = ETL()
+ etl_handler = ETL(config, stocks=["AAPL"])
etl_handler.update_index_listings()
etl_handler.extract()
if train:
diff --git a/stocksense/model/genetic_algorithm.py b/stocksense/model/genetic_algorithm.py
index d18a029..ad25e7b 100644
--- a/stocksense/model/genetic_algorithm.py
+++ b/stocksense/model/genetic_algorithm.py
@@ -2,9 +2,10 @@
import polars as pl
import pygad
-from config import config
from loguru import logger
+from stocksense.config import config
+
from .xgboost_model import XGBoostModel
diff --git a/stocksense/model/model_handler.py b/stocksense/model/model_handler.py
index 3ba9c47..a390090 100644
--- a/stocksense/model/model_handler.py
+++ b/stocksense/model/model_handler.py
@@ -3,9 +3,10 @@
from pathlib import Path
import polars as pl
-from config import config
from loguru import logger
+from stocksense.config import config
+
from .genetic_algorithm import GeneticAlgorithm, fitness_function_wrapper
from .xgboost_model import XGBoostModel
diff --git a/stocksense/model/xgboost_model.py b/stocksense/model/xgboost_model.py
index 512d411..daca1b2 100644
--- a/stocksense/model/xgboost_model.py
+++ b/stocksense/model/xgboost_model.py
@@ -2,7 +2,6 @@
import sklearn.metrics as skm
import xgboost as xgb
-from config import config
class XGBoostModel:
@@ -24,7 +23,7 @@ def __init__(self, params=None, scale=1.0):
"scale_pos_weight": scale,
"eval_metric": "logloss",
"nthread": -1,
- "seed": config.model.seed,
+ "seed": 100,
}
)
self.model = None
diff --git a/stocksense/pipeline/etl.py b/stocksense/pipeline/etl.py
index 46a4ba5..c7910ad 100644
--- a/stocksense/pipeline/etl.py
+++ b/stocksense/pipeline/etl.py
@@ -7,7 +7,6 @@
from loguru import logger
from tqdm import tqdm
-from stocksense.config import config
from stocksense.database_handler import DatabaseHandler
from .scraper import Scraper
@@ -23,13 +22,13 @@ class ETL:
transformation and DB ingestion processes.
"""
- def __init__(self, stocks: Optional[list[str]] = None):
- self.db = DatabaseHandler()
- self.db_schema = config.database.db_schema
- self.base_date = config.scraping.base_date
- self.fin_source = "yfinance"
- self.historical_data_path = DATA_PATH / "interim"
- self.stocks = stocks or self._set_default_stocks()
+ def __init__(self, config, stocks: Optional[list[str]] = None) -> None:
+ self.db: DatabaseHandler = DatabaseHandler()
+ self.db_schema: dict = config.database.db_schema  # config must expose database.db_schema
+ self.base_date: str = config.scraping.base_date  # config must expose scraping.base_date
+ self.fin_source: str = "yfinance"
+ self.historical_data_path: Path = DATA_PATH / "interim"
+ self.stocks: list[str] = stocks or self._set_default_stocks()  # None or [] -> default list
def _set_default_stocks(self) -> list[str]:
"""
@@ -137,7 +136,7 @@ def extract(self) -> None:
raise ValueError("No stocks assigned for ETL process.")
self.extract_sp_500()
self.extract_vix()
- self._extract_all_stocks()
+ self.extract_all_stocks()
def extract_sp_500(self) -> None:
"""
@@ -167,17 +166,17 @@ def extract_vix(self) -> None:
logger.error("VIX data extraction FAILED")
return
- def _extract_all_stocks(self) -> None:
+ def extract_all_stocks(self) -> None:
"""
Extract data for all assigned stocks.
"""
pl_bar = tqdm(total=len(self.stocks), desc="Stock", leave=True)
for tic in self.stocks:
- self._extract_stock_data(tic)
+ self.extract_stock_data(tic)
pl_bar.update(1)
pl_bar.close()
- def _extract_stock_data(self, tic: str) -> bool:
+ def extract_stock_data(self, tic: str) -> bool:
"""
Extract updated data for a single stock, including market, financial and
insider trading data. If no financial data is found for the last 2 years,
diff --git a/stocksense/pipeline/scraper.py b/stocksense/pipeline/scraper.py
index a4b551e..dd54644 100644
--- a/stocksense/pipeline/scraper.py
+++ b/stocksense/pipeline/scraper.py
@@ -12,7 +12,6 @@
from stocksense.config import config
-# Suppress logging from the yfinance and requests libraries
logging.getLogger("yfinance").setLevel(logging.CRITICAL)
@@ -27,17 +26,20 @@ class Scraper:
"""
def __init__(self, tic, source):
- self.tic = tic
- self.source = source
- self.session = self._get_session()
+ self.tic: str = tic
+ self.source: str = source
+ self.session: CachedLimiterSession = self._get_session()
if self.source == "yfinance":
- self.handler = self._get_yfinance_handler()
+ self.handler: yf.Ticker = self._get_yfinance_handler()
def _get_session(self):
"""
Create session for yfinance queries.
- :return CachedLimiterSession: Session object.
+ Returns
+ -------
+ CachedLimiterSession
+ Session object.
"""
session = CachedLimiterSession(
limiter=Limiter(RequestRate(2, Duration.SECOND * 5)),
@@ -80,8 +82,15 @@ def _get_stock_info_yfinance(self) -> dict:
"""
Scrape current info using yfinance.
- :raises Exception: no info available.
- :return dict: current stock info.
+ Returns
+ -------
+ dict
+ Current stock info.
+
+ Raises
+ ------
+ Exception
+ No info available.
"""
data = self.handler.info
@@ -102,14 +111,27 @@ def _get_fundamental_data_yfinance(
self, start_date: dt.date, end_date: dt.date
) -> pl.DataFrame:
"""
- Scraps fundamental data from Yahoo Finance using yfinance lib, searching
+ Scrape fundamental data from Yahoo Finance using yfinance lib, searching
for financial records released between two dates.
- :param dt.date start_date: starting date.
- :param dt.date end_date: ending date
- :raises Exception: no financial records are available.
- :return pl.DataFrame: financial report data from yfinance.
+ Parameters
+ ----------
+ start_date : dt.date
+ Starting date.
+ end_date : dt.date
+ Ending date.
+
+ Returns
+ -------
+ pl.DataFrame
+ Financial report data from yfinance.
+
+ Raises
+ ------
+ Exception
+ No financial data available for date interval.
"""
+
fields_to_keep = config.scraping.yahoo
# retrieve 3 main financial documents
@@ -128,11 +150,7 @@ def _get_fundamental_data_yfinance(
for c in list(fields_to_keep.keys()):
if c not in df.columns:
- df = df.with_columns(
- [
- pl.lit(None).alias(c),
- ]
- )
+ df = df.with_columns(pl.lit(None).alias(c))
df = df.select(list(fields_to_keep.keys()))
df = df.rename(fields_to_keep)
@@ -185,7 +203,6 @@ def get_stock_info(self):
Scrape current stock info.
"""
if self.source == "yfinance":
- # scrape stock current info from yfinance
return self._get_stock_info_yfinance()
else:
raise Exception("Other methods not implemented")
@@ -297,7 +314,7 @@ def scrape_sp500_stock_info() -> pl.DataFrame:
"""
List S&P500 stock info from wiki page and return Polars dataframe.
"""
- resp = requests.get("http://en.wikipedia.org/wiki/List_of_S%26P_500_companies")
+ resp = requests.get("https://en.wikipedia.org/wiki/List_of_S%26P_500_companies")
soup = bs(resp.text, "lxml")
table = soup.find("table", id="constituents")
diff --git a/tests/conftest.py b/tests/conftest.py
index dff6345..4818eb9 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -1,3 +1,5 @@
+import datetime as dt
+
import polars as pl
import pytest
@@ -42,3 +44,28 @@ def processed_data() -> pl.DataFrame:
@pytest.fixture
def cleaned_data() -> pl.DataFrame:
return pl.read_parquet(FIXTURES_DIR / "cleaned_data.parquet")
+
+
+@pytest.fixture
+def mock_stock_data() -> pl.DataFrame:  # stock table that tests/test_etl.py serves from the mocked DB
+ return pl.DataFrame(
+ {
+ "tic": ["AAPL", "OLD", "EXISTING"],  # "OLD" does not appear in mock_active_data
+ "name": ["Apple Inc.", "Old Corp", "Existing Corp"],
+ "sector": ["Technology", "Finance", "Healthcare"],
+ "last_update": [dt.datetime.now().date()] * 3,  # every row is stamped with the current date
+ "spx_status": [1, 1, 1],
+ "active": [1, 1, 1],
+ }
+ )
+
+
+@pytest.fixture
+def mock_active_data() -> pl.DataFrame:  # listing that tests/test_etl.py returns from the patched scraper
+ return pl.DataFrame(
+ {
+ "tic": ["AAPL", "NEW", "EXISTING"],  # "NEW" does not appear in mock_stock_data
+ "name": ["Apple Inc.", "New Corp", "Existing Corp"],
+ "sector": ["Technology", "Finance", "Healthcare"],
+ }
+ )
diff --git a/tests/test_etl.py b/tests/test_etl.py
new file mode 100644
index 0000000..885d4dc
--- /dev/null
+++ b/tests/test_etl.py
@@ -0,0 +1,33 @@
+import polars as pl
+
+from stocksense.config import config
+from stocksense.pipeline import ETL
+
+
+def test_update_index_listings(mocker, mock_stock_data, mock_active_data):
+ etl = ETL(config)
+
+ # mock database
+ mock_db = mocker.Mock()
+ mock_db.fetch_stock.return_value = mock_stock_data
+ mock_db.update_stock.return_value = True
+ mock_db.insert_stock.return_value = True
+ etl.db = mock_db
+
+ # mock scraper
+ mocker.patch(
+ "stocksense.pipeline.scraper.Scraper.scrape_sp500_stock_info", return_value=mock_active_data
+ )
+
+ # Run update
+ etl.update_index_listings()
+
+ # verify delisted stock was updated
+ mock_db.update_stock.assert_any_call("OLD", {"spx_status": 0})
+
+ # verify new stock was added
+ mock_db.insert_stock.assert_called_once()
+ insert_call_df = mock_db.insert_stock.call_args[0][0]
+ assert "NEW" in insert_call_df["tic"].to_list()
+ assert insert_call_df.filter(pl.col("tic") == "NEW")["spx_status"][0] == 1
+ assert insert_call_df.filter(pl.col("tic") == "NEW")["active"][0] == 1