From 80681cc2d31d39087a6d89f7c53026790dbc1ca6 Mon Sep 17 00:00:00 2001 From: sathvik Date: Sat, 31 Aug 2024 15:01:48 +0530 Subject: [PATCH] changes to main.py --- Community_Generation/communitySummary.py | 2 +- Graph_Generation/graph_extraction.py | 2 +- Graph_Retrieval/context_based_node_retrieval.py | 2 +- config.yaml | 11 ++++++++--- main.py | 7 ++++--- readme.md | 10 ++++++++++ 6 files changed, 25 insertions(+), 9 deletions(-) diff --git a/Community_Generation/communitySummary.py b/Community_Generation/communitySummary.py index bdb3959..68aaebd 100644 --- a/Community_Generation/communitySummary.py +++ b/Community_Generation/communitySummary.py @@ -139,7 +139,7 @@ def update_communities(self): response=self.chain.invoke({"input_text":community_description}) if isinstance(response,AIMessage): response=response.content - print(response) + response=response[response.find("{"):response.rfind("}")]+"}" community_class=Community.model_validate(from_json(response,allow_partial=True)) diff --git a/Graph_Generation/graph_extraction.py b/Graph_Generation/graph_extraction.py index 69f142b..8a87bba 100644 --- a/Graph_Generation/graph_extraction.py +++ b/Graph_Generation/graph_extraction.py @@ -64,7 +64,7 @@ def _extract_relations(self,entities,input_text,retries=3)->Relation: chain=self.templates[2] | self.llm response=chain.invoke({"entities":entities,"input_text":input_text}) - if isinstance(response,langchain_core.messages.ai.AIMessage): + if isinstance(response,langchain_core.messages.ai.AIMessage) or not isinstance(response,str): response=response.content response=response[response.find("{"):response.rfind("}")+1] diff --git a/Graph_Retrieval/context_based_node_retrieval.py b/Graph_Retrieval/context_based_node_retrieval.py index 063c8e1..35adcf5 100644 --- a/Graph_Retrieval/context_based_node_retrieval.py +++ b/Graph_Retrieval/context_based_node_retrieval.py @@ -26,7 +26,7 @@ def __init__(self, llm, graph,node2vec_model_path,data_dir="node_data",community self.vectorstore=vectorstore self.community=CommunitySummary(self.graph,self.llm,self.community_data,self.create) - print("Communities",os.listdir(self.community_data)) + def setup(self): if self.create: diff --git a/config.yaml b/config.yaml index 0c70c34..6ee3bbd 100644 --- a/config.yaml +++ b/config.yaml @@ -1,14 +1,18 @@ -data_path: # folder or file containing the data +data_path: data +chunk_path: chunk chunk_size: 512 chunk_overlap: 128 api_key: # openai api key or groq api key. Use key if provided else uses ollama model specified. -server: # Groq or OpenAI or Ollama -model: # specify the model to use. +server: # Groq or OpenAI or Ollama +model: # specify the model to use. 
+embedding_server: #ollama or huggingface or openai +embedding_model: # embedding model temperature: 0.5 use_sentence_embeddings: False #True if you want to use sentence embeddings node2vec_model_path: ./model/node2vec.model sentence_model_path: ./model/sentence.model node2vec_embeddings_path: ./embeddings/node2vec_embeddings.npy +vectorstore_path: ./model/vectorstore graph_file_path: ./graph/graph.pkl collection_name: node_data node_data_dir: ./node_data @@ -18,3 +22,4 @@ sentence_embeddings_path: embeddings/sentence_embeddings.npy node_names_path: ./embeddings/node_names.npy sentence_model_name: all-MiniLM-L6-v2 faiss_model_path: ./model/faiss.index + diff --git a/main.py b/main.py index 748b914..bed819c 100644 --- a/main.py +++ b/main.py @@ -77,6 +77,7 @@ def initialize_embedding_model(config): sentence_model_name = config.get("sentence_model_name") use_sentence_embeddings = config.get("use_sentence_embeddings") chunk_path=config.get("chunk_path") +vectorstore_path=config.get("vectorstore_path") @@ -96,7 +97,7 @@ def main(create_graph): data = DataLoader(path=config["data_path"], chunk_size=chunk_size, chunk_overlap=chunk_overlap).load() NxData = PrepareDataForNX().execute(data, chain) graph = nx.Graph() - vectorstore=VectorStore(embedding=initialize_embedding_model(config),persist_dir=node_vectorstore_path,collection_name=collection_name,create=True,documents=data,metadata=[{"chunk_id":i} for i in range(len(data))]) + vectorstore=VectorStore(embedding=initialize_embedding_model(config),persist_dir=vectorstore_path,collection_name=collection_name,create=True,documents=data,metadata=[{"chunk_id":i} for i in range(len(data))]) graph.add_nodes_from(NxData[0]) graph.add_edges_from(NxData[1]) @@ -161,10 +162,10 @@ def main(create_graph): updates="\n".join(updates) updates=DataLoader(path=None,chunk_overlap=chunk_overlap,chunk_size=chunk_size).load_text(updates) - start_chunk=len(VectorStore(embedding=initialize_embedding_model(config),persist_dir=node_vectorstore_path,collection_name=collection_name,create=False,update=False).get_vectorstore().get()["documents"]) + start_chunk=len(VectorStore(embedding=initialize_embedding_model(config),persist_dir=vectorstore_path,collection_name=collection_name,create=False,update=False).get_vectorstore().get()["documents"]) - vectorstore=VectorStore(embedding=initialize_embedding_model(config),persist_dir=node_vectorstore_path,collection_name=collection_name,create=False,update=True,documents=updates,metadata=[{"chunk_id":i} for i in range(start_chunk,start_chunk+len(updates))]) + vectorstore=VectorStore(embedding=initialize_embedding_model(config),persist_dir=vectorstore_path,collection_name=collection_name,create=False,update=True,documents=updates,metadata=[{"chunk_id":i} for i in range(start_chunk,start_chunk+len(updates))]) sync.syncTempFolder() chain=GraphExtractionChain(llm=llm) diff --git a/readme.md b/readme.md index 4a9a6ca..06017b2 100644 --- a/readme.md +++ b/readme.md @@ -1,4 +1,5 @@ [![Knowledge graph workflow](https://github.com/Sathvik21S21Rao/KnowledgeGraph/actions/workflows/main.yml/badge.svg)](https://github.com/Sathvik21S21Rao/KnowledgeGraph/actions/workflows/main.yml) + # GraphRAG GraphRAG is a **Python project** that uses **graph-based methods** for information retrieval. It uses **language models** and **embeddings** to create and interact with a **graph of data**. 
@@ -82,6 +83,15 @@ When running `main.py`, you will be asked whether you want to create a **new gra
 Once the application is running, you can interact with it by typing **queries** into the console. The application will respond with information retrieved from the graph based on your query. To end the conversation, type **'exit'**.
+## Future scope
+
+1. An interactive UI for a seamless user experience
+2. Integrating vector search alongside graph search
+3. Allowing users to maintain multiple graphs
+4. Shifting from in-memory graph computation to disk-based retrieval
+
+## Tested to work with Groq (Llama 3.1 70B) and Ollama embeddings (nomic-embed-text)
+
 ## Visualisation of graph
 - Built using the data in [testing_data.py](https://github.com/Sathvik21S21Rao/KnowledgeGraph/blob/main/testing_data.py)
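
The edits to `communitySummary.py` and `graph_extraction.py` above both apply the same trick: slicing the raw LLM response down to its outermost `{...}` span before JSON parsing, since chat models often wrap JSON in prose or code fences. Below is a minimal standalone sketch of that pattern; the `extract_json_block` helper name is illustrative, not a function defined in the repo.

```python
# Minimal sketch of the brace-slicing pattern this patch applies in
# communitySummary.py and graph_extraction.py. extract_json_block is
# an illustrative name, not a helper defined in this repo.

def extract_json_block(response: str) -> str:
    """Keep only the substring from the first '{' to the last '}', inclusive."""
    start = response.find("{")
    end = response.rfind("}")
    if start == -1 or end < start:
        raise ValueError("no JSON object found in response")
    return response[start:end + 1]

raw = 'Here is the JSON you asked for: {"name": "GraphRAG"}. Let me know!'
print(extract_json_block(raw))  # -> {"name": "GraphRAG"}
```

Note that the two files spell the slice differently: `graph_extraction.py` ends the slice at `rfind("}") + 1`, while `communitySummary.py` slices up to `rfind("}")` and re-appends `"}"`. The two forms are equivalent.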
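The new `embedding_server` and `embedding_model` keys in `config.yaml` are consumed by `initialize_embedding_model` in `main.py`, whose body falls outside this hunk. The sketch below shows how such a dispatcher is commonly written with the standard LangChain wrappers; the class choices and keyword arguments are assumptions, not the repo's confirmed implementation.

```python
# Hypothetical shape of main.py's initialize_embedding_model, dispatching
# on the embedding_server key added to config.yaml in this patch. The
# wrapper classes below are standard langchain-community / langchain-openai
# APIs; the repo's actual implementation may differ.
import yaml

def initialize_embedding_model(config: dict):
    server = (config.get("embedding_server") or "").lower()
    model = config.get("embedding_model")
    if server == "ollama":
        from langchain_community.embeddings import OllamaEmbeddings
        return OllamaEmbeddings(model=model)
    if server == "huggingface":
        from langchain_community.embeddings import HuggingFaceEmbeddings
        return HuggingFaceEmbeddings(model_name=model)
    if server == "openai":
        from langchain_openai import OpenAIEmbeddings
        return OpenAIEmbeddings(model=model, api_key=config.get("api_key"))
    raise ValueError(f"unsupported embedding_server: {server!r}")

with open("config.yaml") as f:
    embeddings = initialize_embedding_model(yaml.safe_load(f))
```

With the configuration the readme reports as tested, this would resolve to Ollama's `nomic-embed-text`, i.e. `embedding_server: ollama` and `embedding_model: nomic-embed-text` in `config.yaml`.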
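One subtle piece of the update path in `main.py`: before appending new chunks, it counts the documents already persisted in the vector store and numbers the new `chunk_id` metadata from that offset, so IDs stay unique across runs. A rough sketch of the same offset logic follows, written against `chromadb` directly because the repo's `VectorStore` wrapper is not shown in this patch.

```python
# Rough sketch of main.py's start_chunk offset logic, using chromadb
# directly; the repo wraps this in its own VectorStore class.
import chromadb

client = chromadb.PersistentClient(path="./model/vectorstore")  # vectorstore_path from config.yaml
collection = client.get_or_create_collection("node_data")       # collection_name from config.yaml

# Count what is already stored so the new chunks continue the numbering.
start_chunk = collection.count()

updates = ["new chunk one", "new chunk two"]  # freshly split text chunks
collection.add(
    documents=updates,
    metadatas=[{"chunk_id": i} for i in range(start_chunk, start_chunk + len(updates))],
    ids=[str(i) for i in range(start_chunk, start_chunk + len(updates))],
)
```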