diff --git a/alembic/versions/2023_12_04_1338-46c271f8f06f_remove_article_exclude_reason_field_.py b/alembic/versions/2023_12_04_1338-46c271f8f06f_remove_article_exclude_reason_field_.py new file mode 100644 index 0000000..b6c927b --- /dev/null +++ b/alembic/versions/2023_12_04_1338-46c271f8f06f_remove_article_exclude_reason_field_.py @@ -0,0 +1,59 @@ +"""remove article_exclude_reason field from page table and add new article fields + +Revision ID: 46c271f8f06f +Revises: e8f9a954be12 # noqa: W291 +Create Date: 2023-12-04 13:38:09.372485 + +""" +import sqlalchemy as sa +from sqlalchemy.dialects import postgresql + +from alembic import op + +# revision identifiers, used by Alembic. +revision = "46c271f8f06f" +down_revision = "e8f9a954be12" +branch_labels = None +depends_on = None + + +def upgrade() -> None: + # ### commands auto generated by Alembic - please adjust! ### + sa.Enum("ENGLISH", "SPANISH", name="language").create(op.get_bind()) + op.add_column( + "article", + sa.Column("language", postgresql.ENUM("ENGLISH", "SPANISH", name="language", create_type=False), nullable=False), + ) + op.add_column("article", sa.Column("is_in_house_content", sa.Boolean(), nullable=False)) + op.drop_column("page", "article_exclude_reason") + sa.Enum("NOT_ARTICLE", "NOT_IN_HOUSE_ARTICLE", "TEST_ARTICLE", "HAS_EXCLUDED_TAG", name="articleexcludereason").drop( + op.get_bind() + ) + # ### end Alembic commands ### + + +def downgrade() -> None: + # ### commands auto generated by Alembic - please adjust! ### + sa.Enum( + "NOT_ARTICLE", "NOT_IN_HOUSE_ARTICLE", "TEST_ARTICLE", "HAS_EXCLUDED_TAG", name="articleexcludereason" + ).create(op.get_bind()) + op.add_column( + "page", + sa.Column( + "article_exclude_reason", + postgresql.ENUM( + "NOT_ARTICLE", + "NOT_IN_HOUSE_ARTICLE", + "TEST_ARTICLE", + "HAS_EXCLUDED_TAG", + name="articleexcludereason", + create_type=False, + ), + autoincrement=False, + nullable=True, + ), + ) + op.drop_column("article", "is_in_house_content") + op.drop_column("article", "language") + sa.Enum("ENGLISH", "SPANISH", name="language").drop(op.get_bind()) + # ### end Alembic commands ### diff --git a/article_rec_db/models/__init__.py b/article_rec_db/models/__init__.py index cb659e1..a660d49 100644 --- a/article_rec_db/models/__init__.py +++ b/article_rec_db/models/__init__.py @@ -1,20 +1,15 @@ __all__ = [ + "SQLModel", "Article", "Page", - "SQLModel", - "ArticleExcludeReason", "Embedding", - "StrategyType", - "MAX_EMBEDDING_DIMENSIONS", "Execution", "Recommendation", - "StrategyRecommendationType", ] from .article import Article -from .embedding import MAX_DIMENSIONS as MAX_EMBEDDING_DIMENSIONS from .embedding import Embedding -from .execution import Execution, StrategyRecommendationType, StrategyType +from .execution import Execution from .helpers import SQLModel -from .page import ArticleExcludeReason, Page +from .page import Page from .recommendation import Recommendation diff --git a/article_rec_db/models/article.py b/article_rec_db/models/article.py index fe1ab6c..9d65f48 100644 --- a/article_rec_db/models/article.py +++ b/article_rec_db/models/article.py @@ -1,8 +1,8 @@ from datetime import datetime -from typing import Annotated, Any +from enum import StrEnum +from typing import Annotated from uuid import UUID -from sqlalchemy import event from sqlmodel import Column, Field, Relationship, String, UniqueConstraint from article_rec_db.sites import SiteName @@ -11,6 +11,16 @@ from .page import Page +class Language(StrEnum): + """ + Languages an article can have, in the IETF tag format + """ + + # Add more as needed + ENGLISH = "en" + SPANISH = "es" + + class Article(SQLModel, UpdateTracked, table=True): __table_args__ = (UniqueConstraint("site", "id_in_site", name="article_site_idinsite_unique"),) @@ -19,6 +29,8 @@ class Article(SQLModel, UpdateTracked, table=True): id_in_site: str # ID of article in the partner site's internal system title: str published_at: datetime + language: Language = Language.ENGLISH + is_in_house_content: bool = True # An article is always a page, but a page is not always an article page: Page = Relationship(back_populates="article") @@ -54,12 +66,3 @@ class Article(SQLModel, UpdateTracked, table=True): "lazy": "joined", }, ) - - -@event.listens_for(Article, "before_insert") -def validate_page_is_not_excluded(mapper: Any, connection: Any, target: Article) -> None: - # If page is none, the foreign key constraint will throw; see the test_article_without_page test - if target.page is not None: - assert ( - target.page.article_exclude_reason is None - ), "Page has a non-null article_exclude_reason, so it cannot be added as an article" diff --git a/article_rec_db/models/embedding.py b/article_rec_db/models/embedding.py index dbe05de..ab64712 100644 --- a/article_rec_db/models/embedding.py +++ b/article_rec_db/models/embedding.py @@ -9,13 +9,13 @@ from .helpers import AutoUUIDPrimaryKey, CreationTracked, SQLModel # The maximum number of dimensions that the vector can have. Vectors with fewer dimensions will be padded with zeros. -MAX_DIMENSIONS = 384 +MAX_EMBEDDING_DIMENSIONS = 384 class Embedding(SQLModel, AutoUUIDPrimaryKey, CreationTracked, table=True): article_id: Annotated[UUID, Field(foreign_key="article.page_id")] execution_id: Annotated[UUID, Field(foreign_key="execution.id")] - vector: Annotated[list[float], Field(sa_column=Column(Vector(MAX_DIMENSIONS)))] + vector: Annotated[list[float], Field(sa_column=Column(Vector(MAX_EMBEDDING_DIMENSIONS)))] # An embedding always corresonds to an article article: Article = Relationship(back_populates="embeddings") diff --git a/article_rec_db/models/execution.py b/article_rec_db/models/execution.py index 5bf8c01..ff3d8a1 100644 --- a/article_rec_db/models/execution.py +++ b/article_rec_db/models/execution.py @@ -19,7 +19,7 @@ class StrategyRecommendationType(StrEnum): class Execution(SQLModel, AutoUUIDPrimaryKey, CreationTracked, table=True): """ - Log of training job executions, each with respect to a single strategy. + Log of training job task executions, each with respect to a single strategy. """ strategy: StrategyType diff --git a/article_rec_db/models/page.py b/article_rec_db/models/page.py index 60e69f2..7c3f1e7 100644 --- a/article_rec_db/models/page.py +++ b/article_rec_db/models/page.py @@ -1,4 +1,3 @@ -from enum import StrEnum from typing import Annotated from uuid import UUID, uuid4 @@ -8,17 +7,9 @@ from .helpers import AutoUUIDPrimaryKey, SQLModel, UpdateTracked -class ArticleExcludeReason(StrEnum): - NOT_ARTICLE = "not_article" - NOT_IN_HOUSE_ARTICLE = "not_in_house_article" - TEST_ARTICLE = "test_article" - HAS_EXCLUDED_TAG = "has_excluded_tag" - - class Page(SQLModel, AutoUUIDPrimaryKey, UpdateTracked, table=True): id: Annotated[UUID, Field(default_factory=uuid4, primary_key=True)] url: Annotated[HttpUrl, Field(sa_column=Column(String, unique=True))] - article_exclude_reason: ArticleExcludeReason | None = None # An article is always a page, but a page is not always an article # Techinically SQLModel considers Page the "many" in the many-to-one relationship, so this list will only ever have at most one element diff --git a/article_rec_db/models/recommendation.py b/article_rec_db/models/recommendation.py index 4b93993..38affb6 100644 --- a/article_rec_db/models/recommendation.py +++ b/article_rec_db/models/recommendation.py @@ -44,7 +44,7 @@ class Recommendation(SQLModel, AutoUUIDPrimaryKey, CreationTracked, table=True): ), ] - # A recommendation always corresponds to a job execution + # A recommendation always corresponds to a task execution execution: Execution = Relationship(back_populates="recommendations") # A default recommendation always corresponds to a target article, but not necessarily to a source article diff --git a/pyproject.toml b/pyproject.toml index 32090ca..df4a7c9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,7 +1,7 @@ [tool.poetry] description = "Database models and migrations for the Local News Lab's article recommendation system" name = "article-rec-db" -version = "0.0.5" +version = "0.0.6" authors = ["Duy Nguyen "] license = "MIT" readme = "README.md" diff --git a/tests/integration/models/test_article.py b/tests/integration/models/test_article.py index 370af55..5aa494a 100644 --- a/tests/integration/models/test_article.py +++ b/tests/integration/models/test_article.py @@ -5,25 +5,17 @@ from sqlalchemy.exc import IntegrityError from sqlmodel import Session, func, select -from article_rec_db.models import ( - MAX_EMBEDDING_DIMENSIONS, - Article, - ArticleExcludeReason, - Embedding, - Execution, - Page, - Recommendation, - StrategyRecommendationType, - StrategyType, -) -from article_rec_db.sites import AFRO_LA, DALLAS_FREE_PRESS +from article_rec_db.models import Article, Embedding, Execution, Page, Recommendation +from article_rec_db.models.article import Language +from article_rec_db.models.embedding import MAX_EMBEDDING_DIMENSIONS +from article_rec_db.models.execution import StrategyRecommendationType, StrategyType +from article_rec_db.sites import DALLAS_FREE_PRESS def test_add_article_with_page(refresh_tables, engine): # This is how we would add a page that is also an article page = Page( url="https://dallasfreepress.com/example-article/", - article_exclude_reason=None, ) article_published_at = datetime.utcnow() article = Article( @@ -31,6 +23,7 @@ def test_add_article_with_page(refresh_tables, engine): id_in_site="1234", title="Example Article", published_at=article_published_at, + language=Language.SPANISH, page=page, ) @@ -42,7 +35,6 @@ def test_add_article_with_page(refresh_tables, engine): assert isinstance(page.db_created_at, datetime) assert page.db_updated_at is None assert page.url == "https://dallasfreepress.com/example-article/" - assert page.article_exclude_reason is None assert len(page.article) == 1 assert page.article[0] is article @@ -53,6 +45,8 @@ def test_add_article_with_page(refresh_tables, engine): assert article.id_in_site == "1234" assert article.title == "Example Article" assert article.published_at == article_published_at + assert article.language == Language.SPANISH + assert article.is_in_house_content is True assert article.page is page assert len(article.embeddings) == 0 assert len(article.recommendations_where_this_is_source) == 0 @@ -84,69 +78,12 @@ def test_add_article_without_page(refresh_tables, engine): assert num_articles == 0 -def test_add_article_excluded_from_page_side(refresh_tables, engine): - article = Article( - site=AFRO_LA.name, - id_in_site="1234", - title="Actually a Home Page and Not an Article", - published_at=datetime.utcnow(), - ) - page = Page( - id=uuid4(), - url="https://afrolanews.org/", - article_exclude_reason=ArticleExcludeReason.NOT_ARTICLE, - article=[article], - ) - - with Session(engine) as session: - session.add(page) - - with pytest.raises( - AssertionError, match=r"Page has a non-null article_exclude_reason, so it cannot be added as an article" - ): - session.commit() - - # Check that nothing is written - session.rollback() - num_articles = session.exec(select(func.count(Article.page_id))).one() - assert num_articles == 0 - - -def test_add_article_excluded_from_article_side(refresh_tables, engine): - page = Page( - url="https://afrolanews.org/", - article_exclude_reason=ArticleExcludeReason.NOT_ARTICLE, - ) - article = Article( - site=AFRO_LA.name, - id_in_site="1234", - title="Actually a Home Page and Not an Article", - published_at=datetime.utcnow(), - page=page, - ) - - with Session(engine) as session: - session.add(article) - - with pytest.raises( - AssertionError, match=r"Page has a non-null article_exclude_reason, so it cannot be added as an article" - ): - session.commit() - - # Check that nothing is written - session.rollback() - num_articles = session.exec(select(func.count(Article.page_id))).one() - assert num_articles == 0 - - def test_add_articles_duplicate_site_and_id_in_site(refresh_tables, engine): page1 = Page( url="https://dallasfreepress.com/example-article/", - article_exclude_reason=None, ) page2 = Page( url="https://dallasfreepress.com/example-article-2/", - article_exclude_reason=None, ) id_in_site = "1234" article1 = Article( @@ -185,7 +122,6 @@ def test_add_articles_duplicate_site_and_id_in_site(refresh_tables, engine): def test_update_article(refresh_tables, engine): page = Page( url="https://dallasfreepress.com/example-article/", - article_exclude_reason=None, ) article = Article( site=DALLAS_FREE_PRESS.name, @@ -215,12 +151,10 @@ def test_delete_article(refresh_tables, engine): page1 = Page( id=page_id1, url="https://dallasfreepress.com/example-article-1/", - article_exclude_reason=None, ) page2 = Page( id=page_id2, url="https://dallasfreepress.com/example-article-2/", - article_exclude_reason=None, ) article1 = Article( site=DALLAS_FREE_PRESS.name, diff --git a/tests/integration/models/test_embedding.py b/tests/integration/models/test_embedding.py index 9868c6b..c44fbe4 100644 --- a/tests/integration/models/test_embedding.py +++ b/tests/integration/models/test_embedding.py @@ -5,16 +5,9 @@ import pytest from sqlmodel import Session, func, select -from article_rec_db.models import ( - MAX_EMBEDDING_DIMENSIONS, - Article, - Embedding, - Execution, - Page, - Recommendation, - StrategyRecommendationType, - StrategyType, -) +from article_rec_db.models import Article, Embedding, Execution, Page, Recommendation +from article_rec_db.models.embedding import MAX_EMBEDDING_DIMENSIONS +from article_rec_db.models.execution import StrategyRecommendationType, StrategyType from article_rec_db.sites import DALLAS_FREE_PRESS @@ -26,7 +19,6 @@ def rng() -> np.random.Generator: def test_add_embedding(refresh_tables, engine, rng): page = Page( url="https://dallasfreepress.com/example-article/", - article_exclude_reason=None, ) article = Article( site=DALLAS_FREE_PRESS.name, @@ -67,15 +59,12 @@ def test_add_embedding(refresh_tables, engine, rng): def test_select_embeddings_knn(refresh_tables, engine, rng): page1 = Page( url="https://dallasfreepress.com/example-article/", - article_exclude_reason=None, ) page2 = Page( url="https://dallasfreepress.com/example-article-2/", - article_exclude_reason=None, ) page3 = Page( url="https://dallasfreepress.com/example-article-3/", - article_exclude_reason=None, ) article1 = Article( site=DALLAS_FREE_PRESS.name, @@ -154,12 +143,10 @@ def test_delete_embedding(refresh_tables, engine): page1 = Page( id=page_id1, url="https://dallasfreepress.com/example-article-1/", - article_exclude_reason=None, ) page2 = Page( id=page_id2, url="https://dallasfreepress.com/example-article-2/", - article_exclude_reason=None, ) article1 = Article( site=DALLAS_FREE_PRESS.name, diff --git a/tests/integration/models/test_execution.py b/tests/integration/models/test_execution.py index bf4cd63..cb6f8d4 100644 --- a/tests/integration/models/test_execution.py +++ b/tests/integration/models/test_execution.py @@ -3,16 +3,9 @@ from sqlmodel import Session, func, select -from article_rec_db.models import ( - MAX_EMBEDDING_DIMENSIONS, - Article, - Embedding, - Execution, - Page, - Recommendation, - StrategyRecommendationType, - StrategyType, -) +from article_rec_db.models import Article, Embedding, Execution, Page, Recommendation +from article_rec_db.models.embedding import MAX_EMBEDDING_DIMENSIONS +from article_rec_db.models.execution import StrategyRecommendationType, StrategyType from article_rec_db.sites import DALLAS_FREE_PRESS @@ -39,12 +32,10 @@ def test_delete_execution(refresh_tables, engine): page1 = Page( id=page_id1, url="https://dallasfreepress.com/example-article-1/", - article_exclude_reason=None, ) page2 = Page( id=page_id2, url="https://dallasfreepress.com/example-article-2/", - article_exclude_reason=None, ) article1 = Article( site=DALLAS_FREE_PRESS.name, diff --git a/tests/integration/models/test_page.py b/tests/integration/models/test_page.py index bc4b8b5..6048d36 100644 --- a/tests/integration/models/test_page.py +++ b/tests/integration/models/test_page.py @@ -5,24 +5,15 @@ from sqlalchemy.exc import IntegrityError from sqlmodel import Session, func, select -from article_rec_db.models import ( - MAX_EMBEDDING_DIMENSIONS, - Article, - ArticleExcludeReason, - Embedding, - Execution, - Page, - Recommendation, - StrategyRecommendationType, - StrategyType, -) +from article_rec_db.models import Article, Embedding, Execution, Page, Recommendation +from article_rec_db.models.embedding import MAX_EMBEDDING_DIMENSIONS +from article_rec_db.models.execution import StrategyRecommendationType, StrategyType from article_rec_db.sites import DALLAS_FREE_PRESS def test_add_page_not_article(refresh_tables, engine): page = Page( - url="https://afrolanews.org/", - article_exclude_reason=ArticleExcludeReason.NOT_ARTICLE, + url="https://afrolanews.org/", # Home page, so not an article ) with Session(engine) as session: session.add(page) @@ -32,18 +23,15 @@ def test_add_page_not_article(refresh_tables, engine): assert isinstance(page.db_created_at, datetime) assert page.db_updated_at is None assert page.url == "https://afrolanews.org/" - assert page.article_exclude_reason == ArticleExcludeReason.NOT_ARTICLE assert len(page.article) == 0 def test_add_pages_duplicate_url(refresh_tables, engine): page1 = Page( url="https://dallasfreepress.com/example-article/", - article_exclude_reason=None, ) page2 = Page( url="https://dallasfreepress.com/example-article/", - article_exclude_reason=None, ) with Session(engine) as session: session.add(page1) @@ -66,7 +54,6 @@ def test_add_pages_duplicate_url(refresh_tables, engine): def test_update_page(refresh_tables, engine): page = Page( url="https://dallasfreepress.com/example-article/", - article_exclude_reason=None, ) with Session(engine) as session: session.add(page) @@ -89,12 +76,10 @@ def test_delete_page(refresh_tables, engine): page1 = Page( id=page_id1, url="https://dallasfreepress.com/example-article-1/", - article_exclude_reason=None, ) page2 = Page( id=page_id2, url="https://dallasfreepress.com/example-article-2/", - article_exclude_reason=None, ) article1 = Article( site=DALLAS_FREE_PRESS.name, diff --git a/tests/integration/models/test_recommendation.py b/tests/integration/models/test_recommendation.py index 60f5da8..5f76d83 100644 --- a/tests/integration/models/test_recommendation.py +++ b/tests/integration/models/test_recommendation.py @@ -5,23 +5,15 @@ from sqlalchemy.exc import IntegrityError from sqlmodel import Session, func, select -from article_rec_db.models import ( - MAX_EMBEDDING_DIMENSIONS, - Article, - Embedding, - Execution, - Page, - Recommendation, - StrategyRecommendationType, - StrategyType, -) +from article_rec_db.models import Article, Embedding, Execution, Page, Recommendation +from article_rec_db.models.embedding import MAX_EMBEDDING_DIMENSIONS +from article_rec_db.models.execution import StrategyRecommendationType, StrategyType from article_rec_db.sites import DALLAS_FREE_PRESS def test_add_default_recommendation(refresh_tables, engine): page = Page( url="https://dallasfreepress.com/example-article/", - article_exclude_reason=None, ) article = Article( site=DALLAS_FREE_PRESS.name, @@ -60,7 +52,6 @@ def test_add_default_recommendation(refresh_tables, engine): def test_add_default_recommendation_with_nonnull_source_id(refresh_tables, engine): page = Page( url="https://dallasfreepress.com/example-article/", - article_exclude_reason=None, ) article = Article( site=DALLAS_FREE_PRESS.name, @@ -93,12 +84,10 @@ def test_add_recommendation_source_target_interchangeable(refresh_tables, engine page1 = Page( id=UUID(int=1), # smaller ID url="https://dallasfreepress.com/example-article-1/", - article_exclude_reason=None, ) page2 = Page( id=UUID(int=2), # larger ID url="https://dallasfreepress.com/example-article-2/", - article_exclude_reason=None, ) article1 = Article( @@ -156,7 +145,6 @@ def test_add_recommendation_source_target_interchangeable(refresh_tables, engine def test_add_recommendation_source_target_interchangeable_no_source(refresh_tables, engine): page = Page( url="https://dallasfreepress.com/example-article/", - article_exclude_reason=None, ) article = Article( site=DALLAS_FREE_PRESS.name, @@ -191,12 +179,10 @@ def test_add_recommendation_source_target_interchangeable_wrong_order_recommenda page1 = Page( id=UUID(int=1), # smaller ID url="https://dallasfreepress.com/example-article-1/", - article_exclude_reason=None, ) page2 = Page( id=UUID(int=2), # larger ID url="https://dallasfreepress.com/example-article-2/", - article_exclude_reason=None, ) article1 = Article( @@ -240,12 +226,10 @@ def test_add_recommendation_source_target_interchangeable_wrong_order_article_si page1 = Page( id=UUID(int=1), # smaller ID url="https://dallasfreepress.com/example-article-1/", - article_exclude_reason=None, ) page2 = Page( id=UUID(int=2), # larger ID url="https://dallasfreepress.com/example-article-2/", - article_exclude_reason=None, ) article1 = Article( @@ -290,7 +274,6 @@ def test_add_recommendation_source_target_interchangeable_wrong_order_article_si def test_add_recommendation_invalid_score(refresh_tables, engine): page = Page( url="https://dallasfreepress.com/example-article/", - article_exclude_reason=None, ) article = Article( site=DALLAS_FREE_PRESS.name, @@ -319,7 +302,6 @@ def test_add_recommendation_invalid_score(refresh_tables, engine): def test_add_recommendations_duplicate(refresh_tables, engine): page = Page( url="https://dallasfreepress.com/example-article/", - article_exclude_reason=None, ) article = Article( site=DALLAS_FREE_PRESS.name, @@ -358,12 +340,10 @@ def test_delete_recommendation(refresh_tables, engine): page1 = Page( id=page_id1, url="https://dallasfreepress.com/example-article-1/", - article_exclude_reason=None, ) page2 = Page( id=page_id2, url="https://dallasfreepress.com/example-article-2/", - article_exclude_reason=None, ) article1 = Article( site=DALLAS_FREE_PRESS.name,