Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Performance Metrics #38

Open
wants to merge 54 commits into
base: main
Choose a base branch
from
Open
Changes from 9 commits
Commits
Show all changes
54 commits
Select commit Hold shift + click to select a range
5a2e0be
adds timeit functionality to chat and query functions
jeisenman23 Nov 14, 2024
86e2a51
reformatting time metrics
jeisenman23 Nov 14, 2024
93a7d97
trying to fix lint
jeisenman23 Nov 15, 2024
4e4dc4b
fixing lint
jeisenman23 Nov 15, 2024
fab34d9
reducing complexity
jeisenman23 Nov 15, 2024
785ec4d
removing white spaces
jeisenman23 Nov 15, 2024
b0429a4
fixing lint
jeisenman23 Nov 15, 2024
a41b030
fixing return statement
jeisenman23 Nov 15, 2024
dba79f8
fixing docs
jeisenman23 Nov 15, 2024
dc4579a
removing timeit as optional argument
jeisenman23 Nov 21, 2024
bfdd13f
changing performance metrics according to stream mode:
jeisenman23 Nov 21, 2024
96be266
update test
jeisenman23 Nov 21, 2024
6d37186
change test to fit performance metrics
jeisenman23 Dec 10, 2024
4aa6fb2
change test to fit performance metrics
jeisenman23 Dec 10, 2024
0ac1ca9
adding back query into chat
jeisenman23 Dec 10, 2024
f8c3d1a
removing await
jeisenman23 Dec 10, 2024
4018c59
fixing test
jeisenman23 Dec 20, 2024
4ca2970
fixing test
jeisenman23 Dec 20, 2024
34c58c0
removing whitespace
jeisenman23 Dec 20, 2024
abd7000
fixing space
jeisenman23 Dec 20, 2024
d94403c
finicky flake8 error fix
jeisenman23 Dec 20, 2024
63070ef
fixing elm tests
jeisenman23 Dec 20, 2024
20fb602
ensuring test cases
jeisenman23 Dec 20, 2024
15c417a
reversing - statement
jeisenman23 Dec 20, 2024
0186fdf
removing whitespace
jeisenman23 Dec 20, 2024
d4a9bf0
removing whitespace
jeisenman23 Dec 20, 2024
fc88d7a
fixing line issue
jeisenman23 Dec 27, 2024
878e4f0
fixing osti bug
jeisenman23 Jan 2, 2025
b496352
adding spaces for engineer query
jeisenman23 Jan 2, 2025
991d3e3
adding spaces for chat function
jeisenman23 Jan 2, 2025
af86612
remove trailing whitespaces
jeisenman23 Jan 2, 2025
ae37420
Merge branch 'main' into time
jeisenman23 Jan 2, 2025
3232c02
fixing OSTI bug
jeisenman23 Jan 6, 2025
5334535
removing comments for flake
jeisenman23 Jan 6, 2025
8f4cdcc
making line shorter
jeisenman23 Jan 6, 2025
8d15faa
line too long
jeisenman23 Jan 6, 2025
8348af3
adding blank line
jeisenman23 Jan 6, 2025
667fe05
rerun of actions
jeisenman23 Jan 6, 2025
80be899
changing first
jeisenman23 Jan 6, 2025
564f5d4
attempting to fix osti
jeisenman23 Jan 6, 2025
bc196f1
attempt to fix OSTI in multiple envs
jeisenman23 Jan 6, 2025
022f734
removing test and fixing test
jeisenman23 Jan 6, 2025
371dcc2
inputting local change that works
jeisenman23 Jan 6, 2025
6549134
fixing lint
jeisenman23 Jan 6, 2025
ab7449c
debug statement
jeisenman23 Jan 6, 2025
96c3b87
attempt to fix escape sequence
jeisenman23 Jan 6, 2025
24b41c9
attempting to fix str
jeisenman23 Jan 6, 2025
7f55760
fixing escape
jeisenman23 Jan 6, 2025
4eac5dc
getting get pages to work
jeisenman23 Jan 7, 2025
2ed61b9
clean code
jeisenman23 Jan 7, 2025
7819a13
fixing linter
jeisenman23 Jan 7, 2025
80c92b6
fixing linter
jeisenman23 Jan 7, 2025
0e6b9f9
fixing linter
jeisenman23 Jan 7, 2025
18a4465
fixing over indent
jeisenman23 Jan 7, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
52 changes: 36 additions & 16 deletions elm/wizard.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
ELM energy wizard
"""
from abc import ABC, abstractmethod
from time import perf_counter
import copy
import os
import json
Expand Down Expand Up @@ -61,10 +62,12 @@ def query_vector_db(self, query, limit=100):
ranked strings/scores outputs.
"""

def engineer_query(self, query, token_budget=None, new_info_threshold=0.7,
convo=False):
def engineer_query(self, query,
token_budget=None,
new_info_threshold=0.7,
convo=False,
timeit=False):
grantbuster marked this conversation as resolved.
Show resolved Hide resolved
"""Engineer a query for GPT using the corpus of information

Parameters
----------
query : str
Expand All @@ -87,6 +90,9 @@ def engineer_query(self, query, token_budget=None, new_info_threshold=0.7,
references : list
The list of references (strs) used in the engineered prompt is
returned here
vector_query_time : float
grantbuster marked this conversation as resolved.
Show resolved Hide resolved
If timeit is True, the elapsed time of the query to the
vector database is also returned.
"""

self.messages.append({"role": "user", "content": query})
Expand All @@ -99,9 +105,10 @@ def engineer_query(self, query, token_budget=None, new_info_threshold=0.7,
query = '\n\n'.join(query)

token_budget = token_budget or self.token_budget

start_time = perf_counter()
strings, _, idx = self.query_vector_db(query)

end_time = perf_counter()
vector_query_time = end_time - start_time
message = copy.deepcopy(self.MODEL_INSTRUCTION)
question = f"\n\nQuestion: {query}"
used_index = []
Expand All @@ -125,7 +132,8 @@ def engineer_query(self, query, token_budget=None, new_info_threshold=0.7,
message = message + question
used_index = np.array(used_index)
references = self.make_ref_list(used_index)

if timeit:
return message, references, used_index, vector_query_time
return message, references, used_index

@abstractmethod
Expand All @@ -152,10 +160,10 @@ def chat(self, query,
token_budget=None,
new_info_threshold=0.7,
print_references=False,
return_chat_obj=False):
return_chat_obj=False,
timeit=False):
grantbuster marked this conversation as resolved.
Show resolved Hide resolved
"""Answers a query by doing a semantic search of relevant text with
embeddings and then sending engineered query to the LLM.

Parameters
----------
query : str
Expand Down Expand Up @@ -184,7 +192,8 @@ def chat(self, query,
valid ref_col.
return_chat_obj : bool
Flag to only return the ChatCompletion from OpenAI API.

timeit : bool
    Flag to also return performance metrics (timings) for the API calls.
Returns
grantbuster marked this conversation as resolved.
Show resolved Hide resolved
-------
response : str
Expand All @@ -195,12 +204,15 @@ def chat(self, query,
references : list
If debug is True, the list of references (strs) used in the
engineered prompt is returned here
performance : dict
If timeit is True, returns dictionary with keys of total_chat_time,
chat_completion_time and vectordb_query_time.
"""

start_chat_time = perf_counter()
out = self.engineer_query(query, token_budget=token_budget,
new_info_threshold=new_info_threshold,
convo=convo)
query, references, _ = out
convo=convo, timeit=True)
query, references, _, vector_query_time = out

messages = [{"role": "system", "content": self.MODEL_ROLE},
{"role": "user", "content": query}]
Expand All @@ -209,18 +221,18 @@ def chat(self, query,
messages=messages,
temperature=temperature,
stream=stream)
start_completion_time = perf_counter()

response = self._client.chat.completions.create(**kwargs)

finish_completion_time = perf_counter()
chat_completion_time = finish_completion_time - start_completion_time
if return_chat_obj:
return response, query, references

if stream:
for chunk in response:
chunk_msg = chunk.choices[0].delta.content or ""
response_message += chunk_msg
print(chunk_msg, end='')

else:
response_message = response.choices[0].message.content

Expand All @@ -234,7 +246,15 @@ def chat(self, query,
response_message += ref_msg
if stream:
print(ref_msg)

end_time = perf_counter()
total_chat_time = end_time - start_chat_time
performance = {
"total_chat_time": total_chat_time,
"chat_completion_time": chat_completion_time,
grantbuster marked this conversation as resolved.
Show resolved Hide resolved
"vectordb_query_time": vector_query_time
}
if timeit and debug:
return response_message, query, references, performance
if debug:
return response_message, query, references
else:
Expand Down
Loading