Skip to content

Commit

Permalink
Remove article_exclude_reason column (#17)
Browse files Browse the repository at this point in the history
* [remove] article_exclude_reason

* [change] update tests

* [change] bump version

* [add] article language column

* [add] alembic revision
  • Loading branch information
duynguyen158 authored Dec 4, 2023
1 parent bf7ad76 commit cd6a12e
Show file tree
Hide file tree
Showing 13 changed files with 102 additions and 177 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
"""remove article_exclude_reason field from page table and add new article fields
Revision ID: 46c271f8f06f
Revises: e8f9a954be12 # noqa: W291
Create Date: 2023-12-04 13:38:09.372485
"""
import sqlalchemy as sa
from sqlalchemy.dialects import postgresql

from alembic import op

# revision identifiers, used by Alembic.
revision = "46c271f8f06f"
down_revision = "e8f9a954be12"
branch_labels = None
depends_on = None


def upgrade() -> None:
# ### commands auto generated by Alembic - please adjust! ###
sa.Enum("ENGLISH", "SPANISH", name="language").create(op.get_bind())
op.add_column(
"article",
sa.Column("language", postgresql.ENUM("ENGLISH", "SPANISH", name="language", create_type=False), nullable=False),
)
op.add_column("article", sa.Column("is_in_house_content", sa.Boolean(), nullable=False))
op.drop_column("page", "article_exclude_reason")
sa.Enum("NOT_ARTICLE", "NOT_IN_HOUSE_ARTICLE", "TEST_ARTICLE", "HAS_EXCLUDED_TAG", name="articleexcludereason").drop(
op.get_bind()
)
# ### end Alembic commands ###


def downgrade() -> None:
# ### commands auto generated by Alembic - please adjust! ###
sa.Enum(
"NOT_ARTICLE", "NOT_IN_HOUSE_ARTICLE", "TEST_ARTICLE", "HAS_EXCLUDED_TAG", name="articleexcludereason"
).create(op.get_bind())
op.add_column(
"page",
sa.Column(
"article_exclude_reason",
postgresql.ENUM(
"NOT_ARTICLE",
"NOT_IN_HOUSE_ARTICLE",
"TEST_ARTICLE",
"HAS_EXCLUDED_TAG",
name="articleexcludereason",
create_type=False,
),
autoincrement=False,
nullable=True,
),
)
op.drop_column("article", "is_in_house_content")
op.drop_column("article", "language")
sa.Enum("ENGLISH", "SPANISH", name="language").drop(op.get_bind())
# ### end Alembic commands ###
11 changes: 3 additions & 8 deletions article_rec_db/models/__init__.py
Original file line number Diff line number Diff line change
@@ -1,20 +1,15 @@
__all__ = [
"SQLModel",
"Article",
"Page",
"SQLModel",
"ArticleExcludeReason",
"Embedding",
"StrategyType",
"MAX_EMBEDDING_DIMENSIONS",
"Execution",
"Recommendation",
"StrategyRecommendationType",
]

from .article import Article
from .embedding import MAX_DIMENSIONS as MAX_EMBEDDING_DIMENSIONS
from .embedding import Embedding
from .execution import Execution, StrategyRecommendationType, StrategyType
from .execution import Execution
from .helpers import SQLModel
from .page import ArticleExcludeReason, Page
from .page import Page
from .recommendation import Recommendation
25 changes: 14 additions & 11 deletions article_rec_db/models/article.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
from datetime import datetime
from typing import Annotated, Any
from enum import StrEnum
from typing import Annotated
from uuid import UUID

from sqlalchemy import event
from sqlmodel import Column, Field, Relationship, String, UniqueConstraint

from article_rec_db.sites import SiteName
Expand All @@ -11,6 +11,16 @@
from .page import Page


class Language(StrEnum):
"""
Languages an article can have, in the IETF tag format
"""

# Add more as needed
ENGLISH = "en"
SPANISH = "es"


class Article(SQLModel, UpdateTracked, table=True):
__table_args__ = (UniqueConstraint("site", "id_in_site", name="article_site_idinsite_unique"),)

Expand All @@ -19,6 +29,8 @@ class Article(SQLModel, UpdateTracked, table=True):
id_in_site: str # ID of article in the partner site's internal system
title: str
published_at: datetime
language: Language = Language.ENGLISH
is_in_house_content: bool = True

# An article is always a page, but a page is not always an article
page: Page = Relationship(back_populates="article")
Expand Down Expand Up @@ -54,12 +66,3 @@ class Article(SQLModel, UpdateTracked, table=True):
"lazy": "joined",
},
)


@event.listens_for(Article, "before_insert")
def validate_page_is_not_excluded(mapper: Any, connection: Any, target: Article) -> None:
# If page is none, the foreign key constraint will throw; see the test_article_without_page test
if target.page is not None:
assert (
target.page.article_exclude_reason is None
), "Page has a non-null article_exclude_reason, so it cannot be added as an article"
4 changes: 2 additions & 2 deletions article_rec_db/models/embedding.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,13 +9,13 @@
from .helpers import AutoUUIDPrimaryKey, CreationTracked, SQLModel

# The maximum number of dimensions that the vector can have. Vectors with fewer dimensions will be padded with zeros.
MAX_DIMENSIONS = 384
MAX_EMBEDDING_DIMENSIONS = 384


class Embedding(SQLModel, AutoUUIDPrimaryKey, CreationTracked, table=True):
article_id: Annotated[UUID, Field(foreign_key="article.page_id")]
execution_id: Annotated[UUID, Field(foreign_key="execution.id")]
vector: Annotated[list[float], Field(sa_column=Column(Vector(MAX_DIMENSIONS)))]
vector: Annotated[list[float], Field(sa_column=Column(Vector(MAX_EMBEDDING_DIMENSIONS)))]

# An embedding always corresonds to an article
article: Article = Relationship(back_populates="embeddings")
Expand Down
2 changes: 1 addition & 1 deletion article_rec_db/models/execution.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ class StrategyRecommendationType(StrEnum):

class Execution(SQLModel, AutoUUIDPrimaryKey, CreationTracked, table=True):
"""
Log of training job executions, each with respect to a single strategy.
Log of training job task executions, each with respect to a single strategy.
"""

strategy: StrategyType
Expand Down
9 changes: 0 additions & 9 deletions article_rec_db/models/page.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
from enum import StrEnum
from typing import Annotated
from uuid import UUID, uuid4

Expand All @@ -8,17 +7,9 @@
from .helpers import AutoUUIDPrimaryKey, SQLModel, UpdateTracked


class ArticleExcludeReason(StrEnum):
NOT_ARTICLE = "not_article"
NOT_IN_HOUSE_ARTICLE = "not_in_house_article"
TEST_ARTICLE = "test_article"
HAS_EXCLUDED_TAG = "has_excluded_tag"


class Page(SQLModel, AutoUUIDPrimaryKey, UpdateTracked, table=True):
id: Annotated[UUID, Field(default_factory=uuid4, primary_key=True)]
url: Annotated[HttpUrl, Field(sa_column=Column(String, unique=True))]
article_exclude_reason: ArticleExcludeReason | None = None

# An article is always a page, but a page is not always an article
# Techinically SQLModel considers Page the "many" in the many-to-one relationship, so this list will only ever have at most one element
Expand Down
2 changes: 1 addition & 1 deletion article_rec_db/models/recommendation.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ class Recommendation(SQLModel, AutoUUIDPrimaryKey, CreationTracked, table=True):
),
]

# A recommendation always corresponds to a job execution
# A recommendation always corresponds to a task execution
execution: Execution = Relationship(back_populates="recommendations")

# A default recommendation always corresponds to a target article, but not necessarily to a source article
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
[tool.poetry]
description = "Database models and migrations for the Local News Lab's article recommendation system"
name = "article-rec-db"
version = "0.0.5"
version = "0.0.6"
authors = ["Duy Nguyen <hello.duyknguyen@gmail.com>"]
license = "MIT"
readme = "README.md"
Expand Down
82 changes: 8 additions & 74 deletions tests/integration/models/test_article.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,32 +5,25 @@
from sqlalchemy.exc import IntegrityError
from sqlmodel import Session, func, select

from article_rec_db.models import (
MAX_EMBEDDING_DIMENSIONS,
Article,
ArticleExcludeReason,
Embedding,
Execution,
Page,
Recommendation,
StrategyRecommendationType,
StrategyType,
)
from article_rec_db.sites import AFRO_LA, DALLAS_FREE_PRESS
from article_rec_db.models import Article, Embedding, Execution, Page, Recommendation
from article_rec_db.models.article import Language
from article_rec_db.models.embedding import MAX_EMBEDDING_DIMENSIONS
from article_rec_db.models.execution import StrategyRecommendationType, StrategyType
from article_rec_db.sites import DALLAS_FREE_PRESS


def test_add_article_with_page(refresh_tables, engine):
# This is how we would add a page that is also an article
page = Page(
url="https://dallasfreepress.com/example-article/",
article_exclude_reason=None,
)
article_published_at = datetime.utcnow()
article = Article(
site=DALLAS_FREE_PRESS.name,
id_in_site="1234",
title="Example Article",
published_at=article_published_at,
language=Language.SPANISH,
page=page,
)

Expand All @@ -42,7 +35,6 @@ def test_add_article_with_page(refresh_tables, engine):
assert isinstance(page.db_created_at, datetime)
assert page.db_updated_at is None
assert page.url == "https://dallasfreepress.com/example-article/"
assert page.article_exclude_reason is None
assert len(page.article) == 1
assert page.article[0] is article

Expand All @@ -53,6 +45,8 @@ def test_add_article_with_page(refresh_tables, engine):
assert article.id_in_site == "1234"
assert article.title == "Example Article"
assert article.published_at == article_published_at
assert article.language == Language.SPANISH
assert article.is_in_house_content is True
assert article.page is page
assert len(article.embeddings) == 0
assert len(article.recommendations_where_this_is_source) == 0
Expand Down Expand Up @@ -84,69 +78,12 @@ def test_add_article_without_page(refresh_tables, engine):
assert num_articles == 0


def test_add_article_excluded_from_page_side(refresh_tables, engine):
article = Article(
site=AFRO_LA.name,
id_in_site="1234",
title="Actually a Home Page and Not an Article",
published_at=datetime.utcnow(),
)
page = Page(
id=uuid4(),
url="https://afrolanews.org/",
article_exclude_reason=ArticleExcludeReason.NOT_ARTICLE,
article=[article],
)

with Session(engine) as session:
session.add(page)

with pytest.raises(
AssertionError, match=r"Page has a non-null article_exclude_reason, so it cannot be added as an article"
):
session.commit()

# Check that nothing is written
session.rollback()
num_articles = session.exec(select(func.count(Article.page_id))).one()
assert num_articles == 0


def test_add_article_excluded_from_article_side(refresh_tables, engine):
page = Page(
url="https://afrolanews.org/",
article_exclude_reason=ArticleExcludeReason.NOT_ARTICLE,
)
article = Article(
site=AFRO_LA.name,
id_in_site="1234",
title="Actually a Home Page and Not an Article",
published_at=datetime.utcnow(),
page=page,
)

with Session(engine) as session:
session.add(article)

with pytest.raises(
AssertionError, match=r"Page has a non-null article_exclude_reason, so it cannot be added as an article"
):
session.commit()

# Check that nothing is written
session.rollback()
num_articles = session.exec(select(func.count(Article.page_id))).one()
assert num_articles == 0


def test_add_articles_duplicate_site_and_id_in_site(refresh_tables, engine):
page1 = Page(
url="https://dallasfreepress.com/example-article/",
article_exclude_reason=None,
)
page2 = Page(
url="https://dallasfreepress.com/example-article-2/",
article_exclude_reason=None,
)
id_in_site = "1234"
article1 = Article(
Expand Down Expand Up @@ -185,7 +122,6 @@ def test_add_articles_duplicate_site_and_id_in_site(refresh_tables, engine):
def test_update_article(refresh_tables, engine):
page = Page(
url="https://dallasfreepress.com/example-article/",
article_exclude_reason=None,
)
article = Article(
site=DALLAS_FREE_PRESS.name,
Expand Down Expand Up @@ -215,12 +151,10 @@ def test_delete_article(refresh_tables, engine):
page1 = Page(
id=page_id1,
url="https://dallasfreepress.com/example-article-1/",
article_exclude_reason=None,
)
page2 = Page(
id=page_id2,
url="https://dallasfreepress.com/example-article-2/",
article_exclude_reason=None,
)
article1 = Article(
site=DALLAS_FREE_PRESS.name,
Expand Down
Loading

0 comments on commit cd6a12e

Please sign in to comment.