From fd7d63b3270f83ed76752a507807a459d8995dd5 Mon Sep 17 00:00:00 2001 From: Blaister9 Date: Thu, 3 Oct 2024 10:57:29 -0500 Subject: [PATCH] =?UTF-8?q?Implementaci=C3=B3n=20de=20chatbot=20Laura?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- backend/chatbot_laura/chatbot_logic.py | 156 +++++++++++++------------ backend/chatbot_laura/views.py | 38 +++--- 2 files changed, 101 insertions(+), 93 deletions(-) diff --git a/backend/chatbot_laura/chatbot_logic.py b/backend/chatbot_laura/chatbot_logic.py index 8c2e437..4d6c462 100644 --- a/backend/chatbot_laura/chatbot_logic.py +++ b/backend/chatbot_laura/chatbot_logic.py @@ -1,88 +1,96 @@ from openai import OpenAI - import pandas as pd - import json - import faiss - import numpy as np - import os - import pickle - from django.conf import settings +import pandas as pd +import json +import faiss +import numpy as np +import os +import pickle +from django.conf import settings +from dotenv import load_dotenv - client = OpenAI(api_key=settings.OPENAI_API_KEY) +load_dotenv() - embedding_file = os.path.join(settings.BASE_DIR, "chatbot_laura", "embeddings.pkl") - index_file = os.path.join(settings.BASE_DIR, "chatbot_laura", "faiss_index.index") +# Leer la API key de OpenAI desde las variables de entorno +OPENAI_API_KEY = os.getenv("OPENAI_API_KEY") +client = OpenAI(api_key=OPENAI_API_KEY) - def get_embedding(text, model="text-embedding-ada-002"): - response = client.embeddings.create(input=text, model=model) - return response.data[0].embedding +# Actualizar las rutas de los archivos +data_dir = os.path.join(settings.BASE_DIR, "data") +embedding_file = os.path.join(data_dir, "embeddings.pkl") +index_file = os.path.join(data_dir, "faiss_index.index") +json_file = os.path.join(data_dir, "preguntas_respuestas_procesadasV1.json") - def save_embeddings(embeddings, file_name): - with open(file_name, 'wb') as f: - pickle.dump(embeddings, f) +def get_embedding(text, model="text-embedding-ada-002"): + response = client.embeddings.create(input=text, model=model) + return response.data[0].embedding - def load_embeddings(file_name): - with open(file_name, 'rb') as f: - return pickle.load(f) +def save_embeddings(embeddings, file_name): + with open(file_name, 'wb') as f: + pickle.dump(embeddings, f) - def process_data(json_file): - with open(json_file, "r") as file: - data = json.load(file) +def load_embeddings(file_name): + with open(file_name, 'rb') as f: + return pickle.load(f) - processed_data = [] - for item in data: - if item['type'] == 'qa': - text_for_embedding = f"{item['content']['pregunta']} {item['content']['respuesta']}" - elif item['type'] == 'info': - text_for_embedding = f"{item['content']['titulo']} {item['content'].get('descripcion', '')}" - else: - continue +def process_data(json_file): + with open(json_file, "r") as file: + data = json.load(file) - processed_data.append({ - 'text_for_embedding': text_for_embedding, - 'full_content': item['content'], - 'type': item['type'], - 'url': item.get('url', ''), - 'metadata': item.get('metadata', {}) - }) + processed_data = [] + for item in data: + if item['type'] == 'qa': + text_for_embedding = f"{item['content']['pregunta']} {item['content']['respuesta']}" + elif item['type'] == 'info': + text_for_embedding = f"{item['content']['titulo']} {item['content'].get('descripcion', '')}" + else: + continue - return pd.DataFrame(processed_data) + processed_data.append({ + 'text_for_embedding': text_for_embedding, + 'full_content': item['content'], + 'type': item['type'], + 'url': item.get('url', ''), + 'metadata': item.get('metadata', {}) + }) - def initialize_or_load_index(df): - if os.path.exists(embedding_file) and os.path.exists(index_file): - embeddings = load_embeddings(embedding_file) - index = faiss.read_index(index_file) - else: - df['embedding'] = df['text_for_embedding'].apply(lambda x: get_embedding(x)) - embedding_matrix = np.array(df['embedding'].tolist()).astype('float32') - embedding_matrix /= np.linalg.norm(embedding_matrix, axis=1)[:, None] - - index = faiss.IndexFlatIP(embedding_matrix.shape[1]) - index.add(embedding_matrix) - - save_embeddings(df['embedding'].tolist(), embedding_file) - faiss.write_index(index, index_file) - embeddings = df['embedding'].tolist() + return pd.DataFrame(processed_data) - return index, embeddings +def initialize_or_load_index(df): + if os.path.exists(embedding_file) and os.path.exists(index_file): + embeddings = load_embeddings(embedding_file) + index = faiss.read_index(index_file) + else: + df['embedding'] = df['text_for_embedding'].apply(lambda x: get_embedding(x)) + embedding_matrix = np.array(df['embedding'].tolist()).astype('float32') + embedding_matrix /= np.linalg.norm(embedding_matrix, axis=1)[:, None] + + index = faiss.IndexFlatIP(embedding_matrix.shape[1]) + index.add(embedding_matrix) + + save_embeddings(df['embedding'].tolist(), embedding_file) + faiss.write_index(index, index_file) + embeddings = df['embedding'].tolist() - def search(query, df, index, k=3): - query_embedding = np.array(get_embedding(query)).astype('float32') - query_embedding /= np.linalg.norm(query_embedding) - D, I = index.search(np.array([query_embedding]), k) - - results = [] - for i in range(k): - result = df.iloc[I[0][i]] - results.append({ - 'content': result['full_content'], - 'url': result['url'], - 'type': result['type'], - 'metadata': result['metadata'], - 'similarity_score': float(D[0][i]) - }) - - return results + return index, embeddings - # Initialize data and index - df = process_data(os.path.join(settings.BASE_DIR, "chatbot_laura", "preguntas_respuestas_procesadasV1.json")) - index, embeddings = initialize_or_load_index(df) \ No newline at end of file +def search(query, df, index, k=3): + query_embedding = np.array(get_embedding(query)).astype('float32') + query_embedding /= np.linalg.norm(query_embedding) + D, I = index.search(np.array([query_embedding]), k) + + results = [] + for i in range(k): + result = df.iloc[I[0][i]] + results.append({ + 'content': result['full_content'], + 'url': result['url'], + 'type': result['type'], + 'metadata': result['metadata'], + 'similarity_score': float(D[0][i]) + }) + + return results + +# Initialize data and index +df = process_data(json_file) +index, embeddings = initialize_or_load_index(df) \ No newline at end of file diff --git a/backend/chatbot_laura/views.py b/backend/chatbot_laura/views.py index e32dde9..069c414 100644 --- a/backend/chatbot_laura/views.py +++ b/backend/chatbot_laura/views.py @@ -1,26 +1,26 @@ from django.http import JsonResponse - from channels.generic.websocket import AsyncWebsocketConsumer - import json - from .chatbot_logic import search, df, index +from channels.generic.websocket import AsyncWebsocketConsumer +import json +from .chatbot_logic import search, df, index - class ChatConsumer(AsyncWebsocketConsumer): - async def connect(self): - await self.accept() +class ChatConsumer(AsyncWebsocketConsumer): + async def connect(self): + await self.accept() - async def disconnect(self, close_code): - pass + async def disconnect(self, close_code): + pass - async def receive(self, text_data): - text_data_json = json.loads(text_data) - query = text_data_json['message'] + async def receive(self, text_data): + text_data_json = json.loads(text_data) + query = text_data_json['message'] - results = search(query, df, index) + results = search(query, df, index) - await self.send(text_data=json.dumps({ - 'message': results - })) + await self.send(text_data=json.dumps({ + 'message': results + })) - def search_view(request): - query = request.GET.get('query', '') - results = search(query, df, index) - return JsonResponse({'results': results}) \ No newline at end of file +def search_view(request): + query = request.GET.get('query', '') + results = search(query, df, index) + return JsonResponse({'results': results}) \ No newline at end of file