-
Notifications
You must be signed in to change notification settings - Fork 87
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Browse files
Browse the repository at this point in the history
) This PR does two things: - Add a combined generate-extract command, fixes #158 - Add cell type templates, fixes #159 ## Generate-Extract `ontogpt generate-extract -m gpt-4 -t cell_type "Acinar Cell Of Salivary Gland"` This does two things: 1. asks GPT to generate a summary of the cell type 2. parses/extracts knowledge from that cell type This resuscitates the original HALO idea. We could in principle **directly generate an entire knowledgebase in structured form from the latent GPT KB** Example output: ```yaml extracted_object: cell_type: Acinar cell of a salivary gland parents: - CL:0000066 subtypes: - CL:0000313 - CL:0000319 localizations: - UBERON:0001044 - UBERON:0009842 diseases: - AUTO:Sj%C3%B6gren%27s%20syndrome - MONDO:0021357 named_entities: - id: CL:0000066 label: Epithelial cell - id: CL:0000313 label: Serous cells - id: CL:0000319 label: Mucous cells - id: UBERON:0001044 label: Salivary gland - id: UBERON:0009842 label: Acinus - id: AUTO:Sj%C3%B6gren%27s%20syndrome label: Sjögren's syndrome - id: MONDO:0021357 label: Salivary gland tumors ``` ## Cell Type Templates This PR also demonstrates using subclasses for more refined subtypes Compare the two: 1. `ontogpt generate-extract -m gpt-4 -t cell_type "L2/3 Intratelencephalic Projecting Glutamatergic Neuron Of The Primary Motor Cortex"` 2. `ontogpt generate-extract -m gpt-4 -t cell_type.InterneuronDocument "L2/3 Intratelencephalic Projecting Glutamatergic Neuron Of The Primary Motor Cortex"` The first uses the generic base class. 
The second uses a subclass designed for interneurons, which has an extra slot for projection fields. Example output: ```yaml extracted_object: cell_type: L2/3 Intratelencephalic Projecting Glutamatergic Neuron of the Primary Motor Cortex range: Not mentioned parents: - AUTO:excitatory%20neuron subtypes: - AUTO:Not%20mentioned localizations: - UBERON:0000956 - UBERON:0001384 genes: - AUTO:Not%20mentioned diseases: - MONDO:0005180 - MONDO:0020128 projects_to_or_from: - UBERON:0001893 named_entities: - id: UBERON:0001893 label: telencephalon - id: AUTO:excitatory%20neuron label: excitatory neuron - id: AUTO:Not%20mentioned label: Not mentioned - id: UBERON:0000956 label: cerebral cortex - id: UBERON:0001384 label: primary motor cortex - id: MONDO:0005180 label: Parkinson's disease - id: MONDO:0020128 label: motor neuron disease ```
- Loading branch information
Showing
6 changed files
with
387 additions
and
6 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,203 @@ | ||
from __future__ import annotations | ||
from datetime import datetime, date | ||
from enum import Enum | ||
from typing import List, Dict, Optional, Any, Union, Literal | ||
from pydantic import BaseModel as BaseModel, Field | ||
from linkml_runtime.linkml_model import Decimal | ||
|
||
# Version markers for this module. Both hold the literal string "None"
# (not the ``None`` singleton).
metamodel_version = "None"
version = "None"
|
||
# pydantic BaseModel subclass that declares a ``__weakref__`` slot.
# NOTE(review): presumably this exists so model instances can be weakly
# referenced -- confirm against the generator that emitted this module.
class WeakRefShimBaseModel(BaseModel):
    __slots__ = '__weakref__'
|
||
# Common parent for every model in this module. The pydantic model config is
# passed as class keyword arguments, so all subclasses share these settings.
class ConfiguredBaseModel(
    WeakRefShimBaseModel,
    validate_assignment=True,  # re-validate when an attribute is assigned
    validate_all=True,  # validate field defaults as well
    underscore_attrs_are_private=True,
    extra='forbid',  # reject fields that are not declared on the model
    arbitrary_types_allowed=True,
):
    pass
|
||
|
||
# String-valued enumeration of brain region identifiers. Only one
# placeholder member ("dummy") is declared here.
class BrainRegionIdentifier(str, Enum):
    dummy = "dummy"
|
||
|
||
# String-valued enumeration of markers for absent data. Each member's value
# is identical to its name.
class NullDataOptions(str, Enum):
    UNSPECIFIED_METHOD_OF_ADMINISTRATION = "UNSPECIFIED_METHOD_OF_ADMINISTRATION"
    NOT_APPLICABLE = "NOT_APPLICABLE"
    NOT_MENTIONED = "NOT_MENTIONED"
|
||
|
||
|
||
# Generic document describing a single cell type. Every field is optional:
# scalar fields default to None and list fields default to a fresh empty list.
class CellTypeDocument(ConfiguredBaseModel):
    cell_type: Optional[str] = Field(None, description="""the name of the cell type described""")
    # NOTE: ``range`` shadows the builtin inside this class body; the name is
    # kept because it is the field name callers and serialized data use.
    range: Optional[str] = Field(None)
    parents: Optional[List[str]] = Field(default_factory=list, description="""categorization""")
    subtypes: Optional[List[str]] = Field(default_factory=list)
    localizations: Optional[List[str]] = Field(default_factory=list)
    genes: Optional[List[str]] = Field(default_factory=list)
    diseases: Optional[List[str]] = Field(default_factory=list)
|
||
|
||
|
||
# CellTypeDocument subclass that adds ``projects_to_or_from``. The remaining
# fields repeat the parent's declarations unchanged.
class InterneuronDocument(CellTypeDocument):
    projects_to_or_from: Optional[List[str]] = Field(default_factory=list, description="""Brain structures from which this cell type projects into or receives projections from""")
    cell_type: Optional[str] = Field(None, description="""the name of the cell type described""")
    range: Optional[str] = Field(None)
    parents: Optional[List[str]] = Field(default_factory=list, description="""categorization""")
    subtypes: Optional[List[str]] = Field(default_factory=list)
    localizations: Optional[List[str]] = Field(default_factory=list)
    genes: Optional[List[str]] = Field(default_factory=list)
    diseases: Optional[List[str]] = Field(default_factory=list)
|
||
|
||
|
||
class ExtractionResult(ConfiguredBaseModel):
    """
    A result of extracting knowledge on text
    """
    # Fields describing the input.
    input_id: Optional[str] = Field(None)
    input_title: Optional[str] = Field(None)
    input_text: Optional[str] = Field(None)
    # Fields describing the completion and what was extracted from it.
    raw_completion_output: Optional[str] = Field(None)
    prompt: Optional[str] = Field(None)
    # Typed as Any, so no particular model class is enforced for the payload.
    extracted_object: Optional[Any] = Field(None, description="""The complex objects extracted from the text""")
    named_entities: Optional[List[Any]] = Field(default_factory=list, description="""Named entities extracted from the text""")
|
||
|
||
|
||
# Base model for an identified, optionally labelled entity.
class NamedEntity(ConfiguredBaseModel):
    # NOTE(review): annotated as ``str`` but given a default of None --
    # confirm whether ``id`` is meant to be required.
    id: str = Field(None, description="""A unique identifier for the named entity""")
    label: Optional[str] = Field(None, description="""The label (name) of the named thing""")
|
||
|
||
|
||
# NamedEntity subclass for genes; repeats the inherited ``id`` and ``label``
# declarations without changes.
class Gene(NamedEntity):
    id: str = Field(None, description="""A unique identifier for the named entity""")
    label: Optional[str] = Field(None, description="""The label (name) of the named thing""")
|
||
|
||
|
||
# NamedEntity subclass for pathways; repeats the inherited ``id`` and
# ``label`` declarations without changes.
class Pathway(NamedEntity):
    id: str = Field(None, description="""A unique identifier for the named entity""")
    label: Optional[str] = Field(None, description="""The label (name) of the named thing""")
|
||
|
||
|
||
# NamedEntity subclass for anatomical structures; repeats the inherited
# ``id`` and ``label`` declarations without changes.
class AnatomicalStructure(NamedEntity):
    id: str = Field(None, description="""A unique identifier for the named entity""")
    label: Optional[str] = Field(None, description="""The label (name) of the named thing""")
|
||
|
||
|
||
# AnatomicalStructure subclass for brain regions; repeats the inherited
# ``id`` and ``label`` declarations without changes.
class BrainRegion(AnatomicalStructure):
    id: str = Field(None, description="""A unique identifier for the named entity""")
    label: Optional[str] = Field(None, description="""The label (name) of the named thing""")
|
||
|
||
|
||
# NamedEntity subclass for cell types; repeats the inherited ``id`` and
# ``label`` declarations without changes.
class CellType(NamedEntity):
    id: str = Field(None, description="""A unique identifier for the named entity""")
    label: Optional[str] = Field(None, description="""The label (name) of the named thing""")
|
||
|
||
|
||
# NamedEntity subclass for diseases; repeats the inherited ``id`` and
# ``label`` declarations without changes.
class Disease(NamedEntity):
    id: str = Field(None, description="""A unique identifier for the named entity""")
    label: Optional[str] = Field(None, description="""The label (name) of the named thing""")
|
||
|
||
|
||
# NamedEntity subclass for drugs; repeats the inherited ``id`` and ``label``
# declarations without changes.
class Drug(NamedEntity):
    id: str = Field(None, description="""A unique identifier for the named entity""")
    label: Optional[str] = Field(None, description="""The label (name) of the named thing""")
|
||
|
||
|
||
# Field-less parent class; in this module it is the base of Triple.
class CompoundExpression(ConfiguredBaseModel):
    # ``pass`` replaces the original bare ``None`` expression statement, which
    # served only as a placeholder body.
    pass
|
||
|
||
|
||
class Triple(CompoundExpression):
    """
    Abstract parent for Relation Extraction tasks
    """
    subject: Optional[str] = Field(None)
    predicate: Optional[str] = Field(None)
    # NOTE: ``object`` shadows the builtin inside this class body; the name is
    # kept because it is the field name callers and serialized data use.
    object: Optional[str] = Field(None)
    qualifier: Optional[str] = Field(None, description="""A qualifier for the statements, e.g. \"NOT\" for negation""")
    subject_qualifier: Optional[str] = Field(None, description="""An optional qualifier or modifier for the subject of the statement, e.g. \"high dose\" or \"intravenously administered\"""")
    object_qualifier: Optional[str] = Field(None, description="""An optional qualifier or modifier for the object of the statement, e.g. \"severe\" or \"with additional complications\"""")
|
||
|
||
|
||
# Pairs an optional publication with a list of triples.
class TextWithTriples(ConfiguredBaseModel):
    # ``Publication`` is defined later in this module; the annotation works
    # because of ``from __future__ import annotations`` and is resolved by the
    # ``update_forward_refs()`` call at the end of the file.
    publication: Optional[Publication] = Field(None)
    triples: Optional[List[Triple]] = Field(default_factory=list)
|
||
|
||
|
||
# NamedEntity subclass for relationship types; repeats the inherited ``id``
# and ``label`` declarations without changes.
class RelationshipType(NamedEntity):
    id: str = Field(None, description="""A unique identifier for the named entity""")
    label: Optional[str] = Field(None, description="""The label (name) of the named thing""")
|
||
|
||
|
||
# Metadata and text of a publication. Every field is an optional string that
# defaults to None.
class Publication(ConfiguredBaseModel):
    id: Optional[str] = Field(None, description="""The publication identifier""")
    title: Optional[str] = Field(None, description="""The title of the publication""")
    abstract: Optional[str] = Field(None, description="""The abstract of the publication""")
    combined_text: Optional[str] = Field(None)
    full_text: Optional[str] = Field(None, description="""The full text of the publication""")
|
||
|
||
|
||
# Holds a subject text together with an object id and object text. Every
# field is an optional string that defaults to None.
class AnnotatorResult(ConfiguredBaseModel):
    subject_text: Optional[str] = Field(None)
    object_id: Optional[str] = Field(None)
    object_text: Optional[str] = Field(None)
|
||
|
||
|
||
|
||
# Update forward refs
# see https://pydantic-docs.helpmanual.io/usage/postponed_annotations/
# Annotations in this module are postponed (``from __future__ import
# annotations``), so each model is asked to resolve its annotations once
# all classes have been defined.
CellTypeDocument.update_forward_refs()
InterneuronDocument.update_forward_refs()
ExtractionResult.update_forward_refs()
NamedEntity.update_forward_refs()
Gene.update_forward_refs()
Pathway.update_forward_refs()
AnatomicalStructure.update_forward_refs()
BrainRegion.update_forward_refs()
CellType.update_forward_refs()
Disease.update_forward_refs()
Drug.update_forward_refs()
CompoundExpression.update_forward_refs()
Triple.update_forward_refs()
TextWithTriples.update_forward_refs()
RelationshipType.update_forward_refs()
Publication.update_forward_refs()
AnnotatorResult.update_forward_refs()
|
Oops, something went wrong.