From 80681cc2d31d39087a6d89f7c53026790dbc1ca6 Mon Sep 17 00:00:00 2001 From: sathvik Date: Sat, 31 Aug 2024 15:01:48 +0530 Subject: [PATCH] changes to main.py --- Community_Generation/communitySummary.py | 2 +- Graph_Generation/graph_extraction.py | 2 +- Graph_Retrieval/context_based_node_retrieval.py | 2 +- config.yaml | 11 ++++++++--- main.py | 7 ++++--- readme.md | 10 ++++++++++ 6 files changed, 25 insertions(+), 9 deletions(-) diff --git a/Community_Generation/communitySummary.py b/Community_Generation/communitySummary.py index bdb3959..68aaebd 100644 --- a/Community_Generation/communitySummary.py +++ b/Community_Generation/communitySummary.py @@ -139,7 +139,7 @@ def update_communities(self): response=self.chain.invoke({"input_text":community_description}) if isinstance(response,AIMessage): response=response.content - print(response) + response=response[response.find("{"):response.rfind("}")]+"}" community_class=Community.model_validate(from_json(response,allow_partial=True)) diff --git a/Graph_Generation/graph_extraction.py b/Graph_Generation/graph_extraction.py index 69f142b..8a87bba 100644 --- a/Graph_Generation/graph_extraction.py +++ b/Graph_Generation/graph_extraction.py @@ -64,7 +64,7 @@ def _extract_relations(self,entities,input_text,retries=3)->Relation: chain=self.templates[2] | self.llm response=chain.invoke({"entities":entities,"input_text":input_text}) - if isinstance(response,langchain_core.messages.ai.AIMessage): + if isinstance(response,langchain_core.messages.ai.AIMessage) or not isinstance(response,str): response=response.content response=response[response.find("{"):response.rfind("}")+1] diff --git a/Graph_Retrieval/context_based_node_retrieval.py b/Graph_Retrieval/context_based_node_retrieval.py index 063c8e1..35adcf5 100644 --- a/Graph_Retrieval/context_based_node_retrieval.py +++ b/Graph_Retrieval/context_based_node_retrieval.py @@ -26,7 +26,7 @@ def __init__(self, llm, graph,node2vec_model_path,data_dir="node_data",community self.vectorstore=vectorstore self.community=CommunitySummary(self.graph,self.llm,self.community_data,self.create) - print("Communities",os.listdir(self.community_data)) + def setup(self): if self.create: diff --git a/config.yaml b/config.yaml index 0c70c34..6ee3bbd 100644 --- a/config.yaml +++ b/config.yaml @@ -1,14 +1,18 @@ -data_path: # folder or file containing the data +data_path: data +chunk_path: chunk chunk_size: 512 chunk_overlap: 128 api_key: # openai api key or groq api key. Use key if provided else uses ollama model specified. -server: # Groq or OpenAI or Ollama -model: # specify the model to use. +server: # Groq or OpenAI or Ollama +model: # specify the model to use. 
+embedding_server: #ollama or huggingface or openai +embedding_model: # embedding model temperature: 0.5 use_sentence_embeddings: False #True if you want to use sentence embeddings node2vec_model_path: ./model/node2vec.model sentence_model_path: ./model/sentence.model node2vec_embeddings_path: ./embeddings/node2vec_embeddings.npy +vectorstore_path: ./model/vectorstore graph_file_path: ./graph/graph.pkl collection_name: node_data node_data_dir: ./node_data @@ -18,3 +22,4 @@ sentence_embeddings_path: embeddings/sentence_embeddings.npy node_names_path: ./embeddings/node_names.npy sentence_model_name: all-MiniLM-L6-v2 faiss_model_path: ./model/faiss.index + diff --git a/main.py b/main.py index 748b914..bed819c 100644 --- a/main.py +++ b/main.py @@ -77,6 +77,7 @@ def initialize_embedding_model(config): sentence_model_name = config.get("sentence_model_name") use_sentence_embeddings = config.get("use_sentence_embeddings") chunk_path=config.get("chunk_path") +vectorstore_path=config.get("vectorstore_path") @@ -96,7 +97,7 @@ def main(create_graph): data = DataLoader(path=config["data_path"], chunk_size=chunk_size, chunk_overlap=chunk_overlap).load() NxData = PrepareDataForNX().execute(data, chain) graph = nx.Graph() - vectorstore=VectorStore(embedding=initialize_embedding_model(config),persist_dir=node_vectorstore_path,collection_name=collection_name,create=True,documents=data,metadata=[{"chunk_id":i} for i in range(len(data))]) + vectorstore=VectorStore(embedding=initialize_embedding_model(config),persist_dir=vectorstore_path,collection_name=collection_name,create=True,documents=data,metadata=[{"chunk_id":i} for i in range(len(data))]) graph.add_nodes_from(NxData[0]) graph.add_edges_from(NxData[1]) @@ -161,10 +162,10 @@ def main(create_graph): updates="\n".join(updates) updates=DataLoader(path=None,chunk_overlap=chunk_overlap,chunk_size=chunk_size).load_text(updates) - start_chunk=len(VectorStore(embedding=initialize_embedding_model(config),persist_dir=node_vectorstore_path,collection_name=collection_name,create=False,update=False).get_vectorstore().get()["documents"]) + start_chunk=len(VectorStore(embedding=initialize_embedding_model(config),persist_dir=vectorstore_path,collection_name=collection_name,create=False,update=False).get_vectorstore().get()["documents"]) - vectorstore=VectorStore(embedding=initialize_embedding_model(config),persist_dir=node_vectorstore_path,collection_name=collection_name,create=False,update=True,documents=updates,metadata=[{"chunk_id":i} for i in range(start_chunk,start_chunk+len(updates))]) + vectorstore=VectorStore(embedding=initialize_embedding_model(config),persist_dir=vectorstore_path,collection_name=collection_name,create=False,update=True,documents=updates,metadata=[{"chunk_id":i} for i in range(start_chunk,start_chunk+len(updates))]) sync.syncTempFolder() chain=GraphExtractionChain(llm=llm) diff --git a/readme.md b/readme.md index 4a9a6ca..06017b2 100644 --- a/readme.md +++ b/readme.md @@ -1,4 +1,5 @@ [![Knowledge graph workflow](https://github.com/Sathvik21S21Rao/KnowledgeGraph/actions/workflows/main.yml/badge.svg)](https://github.com/Sathvik21S21Rao/KnowledgeGraph/actions/workflows/main.yml) + # GraphRAG GraphRAG is a **Python project** that uses **graph-based methods** for information retrieval. It uses **language models** and **embeddings** to create and interact with a **graph of data**. 
@@ -82,6 +83,15 @@ When running `main.py`, you will be asked whether you want to create a **new gra
 Once the application is running, you can interact with it by typing **queries** into the console. The application will respond with information retrieved from the graph based on your query. To end the conversation, type **'exit'**.
+## Future scope
+
+1. An interactive UI for a seamless user experience
+2. Integrating vector search alongside graph search
+3. Allowing users to maintain multiple graphs
+4. Shifting from in-memory graph computation to disk-based retrieval
+
+## Tested to work with Groq (Llama 3.1 70B) and Ollama embeddings (nomic-embed-text)
+
 ## Visualisation of graph
 - Built using the data in [testing_data.py](https://github.com/Sathvik21S21Rao/KnowledgeGraph/blob/main/testing_data.py)
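
The edits to `communitySummary.py` and `graph_extraction.py` above both apply the same trick: slicing the raw LLM response down to its outermost `{...}` span before JSON parsing, since chat models often wrap JSON in prose or code fences. Below is a minimal standalone sketch of that pattern; the `extract_json_block` helper name is illustrative, not a function defined in the repo.

```python
# Minimal sketch of the brace-slicing pattern this patch applies in
# communitySummary.py and graph_extraction.py. extract_json_block is
# an illustrative name, not a helper defined in this repo.

def extract_json_block(response: str) -> str:
    """Keep only the substring from the first '{' to the last '}', inclusive."""
    start = response.find("{")
    end = response.rfind("}")
    if start == -1 or end < start:
        raise ValueError("no JSON object found in response")
    return response[start:end + 1]

raw = 'Here is the JSON you asked for: {"name": "GraphRAG"}. Let me know!'
print(extract_json_block(raw))  # -> {"name": "GraphRAG"}
```

Note that the two files spell the slice differently: `graph_extraction.py` ends the slice at `rfind("}") + 1`, while `communitySummary.py` slices up to `rfind("}")` and re-appends `"}"`. The two forms are equivalent.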
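The new `embedding_server` and `embedding_model` keys in `config.yaml` are consumed by `initialize_embedding_model` in `main.py`, whose body falls outside this hunk. The sketch below shows how such a dispatcher is commonly written with the standard LangChain wrappers; the class choices and keyword arguments are assumptions, not the repo's confirmed implementation.

```python
# Hypothetical shape of main.py's initialize_embedding_model, dispatching
# on the embedding_server key added to config.yaml in this patch. The
# wrapper classes below are standard langchain-community / langchain-openai
# APIs; the repo's actual implementation may differ.
import yaml

def initialize_embedding_model(config: dict):
    server = (config.get("embedding_server") or "").lower()
    model = config.get("embedding_model")
    if server == "ollama":
        from langchain_community.embeddings import OllamaEmbeddings
        return OllamaEmbeddings(model=model)
    if server == "huggingface":
        from langchain_community.embeddings import HuggingFaceEmbeddings
        return HuggingFaceEmbeddings(model_name=model)
    if server == "openai":
        from langchain_openai import OpenAIEmbeddings
        return OpenAIEmbeddings(model=model, api_key=config.get("api_key"))
    raise ValueError(f"unsupported embedding_server: {server!r}")

with open("config.yaml") as f:
    embeddings = initialize_embedding_model(yaml.safe_load(f))
```

With the configuration the readme reports as tested, this would resolve to Ollama's `nomic-embed-text`, i.e. `embedding_server: ollama` and `embedding_model: nomic-embed-text` in `config.yaml`.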
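One subtle piece of the update path in `main.py`: before appending new chunks, it counts the documents already persisted in the vector store and numbers the new `chunk_id` metadata from that offset, so IDs stay unique across runs. A rough sketch of the same offset logic follows, written against `chromadb` directly because the repo's `VectorStore` wrapper is not shown in this patch.

```python
# Rough sketch of main.py's start_chunk offset logic, using chromadb
# directly; the repo wraps this in its own VectorStore class.
import chromadb

client = chromadb.PersistentClient(path="./model/vectorstore")  # vectorstore_path from config.yaml
collection = client.get_or_create_collection("node_data")       # collection_name from config.yaml

# Count what is already stored so the new chunks continue the numbering.
start_chunk = collection.count()

updates = ["new chunk one", "new chunk two"]  # freshly split text chunks
collection.add(
    documents=updates,
    metadatas=[{"chunk_id": i} for i in range(start_chunk, start_chunk + len(updates))],
    ids=[str(i) for i in range(start_chunk, start_chunk + len(updates))],
)
```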