From 7b101ce6e365e44621a7e8e2fb7113581d548e40 Mon Sep 17 00:00:00 2001
From: Alex Cannan
Date: Sun, 30 Jul 2023 23:36:41 -0400
Subject: [PATCH] beginnings of a neo4j driver

---
Review notes (this section is ignored by `git am`; hunk sizes are unchanged):
- Count query (README and get_stats): three bare MATCH clauses build a
  cartesian product, so every COUNT came back as the product of the three
  label counts (and 0 for all of them when any label was empty). Each label
  is now aggregated in its own step, using OPTIONAL MATCH so an empty label
  yields 0 instead of dropping the row.
- put_article: the publisher MERGEs now run before UNWIND. UNWIND over an
  empty author list produces zero rows, which previously skipped the
  publisher node and the PUBLISHED_BY relationship.
- README upsert snippet: still merges the publisher after UNWIND. Moving it
  needs lines above the visible hunk, so it is left for a follow-up.
- NOTE(review): __aenter__ assigns the driver inside
  `async with AsyncGraphDatabase.driver(...)`; the rest of that method is
  outside this patch -- confirm the driver is still open when callers use it.
- NOTE(review): `ParsedArticle.publisher` calls urlparse; confirm
  articlesa/types.py imports it.

 articlesa/neo/README.md   | 11 +++++++++
 articlesa/neo/__init__.py | 52 +++++++++++++++++++++++++++++++++++----
 articlesa/neo/__main__.py | 26 +++++++++++---------
 articlesa/types.py        |  7 +++++-
 4 files changed, 78 insertions(+), 18 deletions(-)

diff --git a/articlesa/neo/README.md b/articlesa/neo/README.md
index d3f8e79..7bd2745 100644
--- a/articlesa/neo/README.md
+++ b/articlesa/neo/README.md
@@ -32,6 +32,8 @@ WITH article
 UNWIND $authors AS author_name
 MERGE (author:Author {name: author_name})
 MERGE (article)-[:AUTHORED_BY]->(author)
+MERGE (publisher:Publisher {netloc: $publisher})
+MERGE (article)-[:PUBLISHED_BY]->(publisher)
 ```
 
 #### Get article that matches url
@@ -40,3 +42,12 @@ MERGE (article)-[:AUTHORED_BY]->(author)
 MATCH (article:Article {url: $url})
 RETURN article
 ```
+
+#### Get counts of all articles, authors, and publishers
+
+```cypher
+MATCH (article:Article) WITH COUNT(article) AS articleCount
+OPTIONAL MATCH (author:Author) WITH articleCount, COUNT(author) AS authorCount
+OPTIONAL MATCH (publisher:Publisher)
+RETURN articleCount, authorCount, COUNT(publisher) AS publisherCount
+```
\ No newline at end of file
diff --git a/articlesa/neo/__init__.py b/articlesa/neo/__init__.py
index 66d1e99..136b3c9 100644
--- a/articlesa/neo/__init__.py
+++ b/articlesa/neo/__init__.py
@@ -7,19 +7,21 @@
 
 import os
 
-from neo4j import AsyncGraphDatabase, AsyncDriver
+from neo4j import AsyncGraphDatabase, AsyncDriver, EagerResult
 
+from articlesa.types import ParsedArticle
 
-class Neo4JDriver():
+
+class Neo4JArticleDriver():
     """
-    Neo4JDriver is an async context manager for interacting with the neo4j database.
+    Neo4JArticleDriver is an async context manager for interacting with the neo4j database.
 
-    You can do a lot of things with it, I promise.
+    All reads and writes to the database happen within this context manager.
     """
     _driver: AsyncDriver
     uri: str = os.environ.get("NEO4J_URI", "bolt://localhost:7687")
 
-    async def __aenter__(self) -> 'Neo4JDriver':
+    async def __aenter__(self) -> 'Neo4JArticleDriver':
         """Enter the async context manager and return the driver."""
         async with AsyncGraphDatabase.driver(self.uri) as driver:
             self._driver = driver
@@ -28,3 +30,43 @@ async def __aenter__(self) -> 'Neo4JDriver':
     async def __aexit__(self, exc_type, exc_value, traceback) -> None:  # noqa
         """Exit the async context manager."""
         await self._driver.close()
+
+    async def get_stats(self) -> EagerResult:
+        """Count article, author and publisher nodes, aggregating each label separately."""
+        query = """\
+        MATCH (article:Article) WITH COUNT(article) AS articleCount
+        OPTIONAL MATCH (author:Author) WITH articleCount, COUNT(author) AS authorCount
+        OPTIONAL MATCH (publisher:Publisher)
+        RETURN articleCount, authorCount, COUNT(publisher) AS publisherCount
+        """
+        return await self._driver.execute_query(query)
+
+    async def put_article(self, parsed_article: ParsedArticle) -> None:
+        """
+        Upsert a parsed article together with its publisher and author nodes.
+
+        The publisher is merged before UNWIND so an empty author list still links it.
+        """
+        query = """\
+        MERGE (article:Article {url: $url})
+        SET article.title = $title,
+            article.links = $links,
+            article.published = $published,
+            article.parsedAtUtc = $parsedAtUtc
+        MERGE (publisher:Publisher {netloc: $publisher})
+        MERGE (article)-[:PUBLISHED_BY]->(publisher)
+        WITH article
+        UNWIND $authors AS author_name
+        MERGE (author:Author {name: author_name})
+        MERGE (article)-[:AUTHORED_BY]->(author)
+        """
+        await self._driver.execute_query(
+            query,
+            url=parsed_article.url,
+            title=parsed_article.title,
+            links=parsed_article.links,
+            published=parsed_article.published,
+            parsedAtUtc=parsed_article.parsedAtUtc,
+            authors=parsed_article.authors,
+            publisher=parsed_article.publisher,
+        )
diff --git a/articlesa/neo/__main__.py b/articlesa/neo/__main__.py
index 5760ad6..586d90c 100644
--- a/articlesa/neo/__main__.py
+++ b/articlesa/neo/__main__.py
@@ -1,20 +1,22 @@
+"""Entrypoint for random neo4j tasks/tests."""
 import asyncio
 from pprint import pprint
 
-from articlesa.neo import Neo4JDriver
+from articlesa.neo import Neo4JArticleDriver
 
+article_urls = [
+    "https://apnews.com/article/washington-virginia-maryland-loud-boom-crash-military-jet-biden-joint-base-andrews-7116356c23f2ade0d6c842159e261f1b",
+    "https://www.thegatewaypundit.com/2019/11/revealed-adam-schiff-connected-to-both-companies-named-in-7-4-billion-burisma-us-ukraine-corruption-case/",
+]
 
-async def main():
-    query = '''\
-    CREATE (article:FakeArticle {
-        title: "Your Article Title",
-        content: "Lorem ipsum dolor sit amet, consectetur adipiscing elit...",
-        published_date: "2023-07-25"
-    })
-    '''
-    async with Neo4JDriver() as driver:
-        response = await driver._driver.execute_query(query)
-        pprint(response.summary.__dict__)
+
+async def main() -> None:  # noqa: D103
+    from articlesa.worker.parse import parse_article, ParsedArticle
+    async with Neo4JArticleDriver() as driver:
+        article = ParsedArticle.parse_obj(await parse_article(article_urls[0]))
+        await driver.put_article(article)
+        response = await driver.get_stats()
+        pprint(response.summary.__dict__)  # noqa: T203
 
 
 asyncio.run(main())
diff --git a/articlesa/types.py b/articlesa/types.py
index 0543ab2..0ac3ed5 100644
--- a/articlesa/types.py
+++ b/articlesa/types.py
@@ -84,11 +84,16 @@ class ParsedArticle(BaseModel):
     text: str
     authors: list[str]
     links: list[str]
-    published: str
+    published: str  # isoformat
     parsedAtUtc: datetime
     urlhash: Optional[str] = None
     depth: Optional[int] = None
 
+    @property
+    def publisher(self) -> str:
+        """Return the publisher's netloc."""
+        return urlparse(self.url).netloc
+
 
 class StreamEvent(Enum):
     """SSE event types."""