Commit
Showing 2 changed files with 101 additions and 93 deletions.

chatbot_laura/chatbot_logic.py: the OpenAI client is now configured from an environment variable loaded with python-dotenv instead of settings.OPENAI_API_KEY, and the data files move from the chatbot_laura app directory to a shared data/ directory.
@@ -1,88 +1,96 @@
 from openai import OpenAI
 import pandas as pd
 import json
 import faiss
 import numpy as np
 import os
 import pickle
 from django.conf import settings
+from dotenv import load_dotenv

-client = OpenAI(api_key=settings.OPENAI_API_KEY)
-
-embedding_file = os.path.join(settings.BASE_DIR, "chatbot_laura", "embeddings.pkl")
-index_file = os.path.join(settings.BASE_DIR, "chatbot_laura", "faiss_index.index")
+load_dotenv()
+
+# Read the OpenAI API key from the environment variables
+OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
+client = OpenAI(api_key=OPENAI_API_KEY)
+
+# Update the file paths
+data_dir = os.path.join(settings.BASE_DIR, "data")
+embedding_file = os.path.join(data_dir, "embeddings.pkl")
+index_file = os.path.join(data_dir, "faiss_index.index")
+json_file = os.path.join(data_dir, "preguntas_respuestas_procesadasV1.json")
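
With load_dotenv() in place, the key presumably lives in a .env file at the project root rather than in Django settings. A minimal sketch of that file (the value is a placeholder, and the file should stay out of version control):

# .env
OPENAI_API_KEY=sk-your-key-here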
 def get_embedding(text, model="text-embedding-ada-002"):
     response = client.embeddings.create(input=text, model=model)
     return response.data[0].embedding

 def save_embeddings(embeddings, file_name):
     with open(file_name, 'wb') as f:
         pickle.dump(embeddings, f)

 def load_embeddings(file_name):
     with open(file_name, 'rb') as f:
         return pickle.load(f)

 def process_data(json_file):
     with open(json_file, "r") as file:
         data = json.load(file)

     processed_data = []
     for item in data:
         if item['type'] == 'qa':
             text_for_embedding = f"{item['content']['pregunta']} {item['content']['respuesta']}"
         elif item['type'] == 'info':
             text_for_embedding = f"{item['content']['titulo']} {item['content'].get('descripcion', '')}"
         else:
             continue

         processed_data.append({
             'text_for_embedding': text_for_embedding,
             'full_content': item['content'],
             'type': item['type'],
             'url': item.get('url', ''),
             'metadata': item.get('metadata', {})
         })

     return pd.DataFrame(processed_data)
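
For reference, here is the shape of input that process_data expects, inferred from the field accesses above; the values are invented for illustration:

[
  {
    "type": "qa",
    "content": {"pregunta": "¿Cómo solicito una beca?", "respuesta": "Llena el formulario en el portal de estudiantes."},
    "url": "https://example.edu/becas",
    "metadata": {"tema": "becas"}
  },
  {
    "type": "info",
    "content": {"titulo": "Horarios de atención", "descripcion": "Lunes a viernes, 8:00 a 17:00"}
  }
]

Items with any other type are skipped, and url and metadata are optional for both types.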
 def initialize_or_load_index(df):
     if os.path.exists(embedding_file) and os.path.exists(index_file):
         embeddings = load_embeddings(embedding_file)
         index = faiss.read_index(index_file)
     else:
         df['embedding'] = df['text_for_embedding'].apply(lambda x: get_embedding(x))
         embedding_matrix = np.array(df['embedding'].tolist()).astype('float32')
         embedding_matrix /= np.linalg.norm(embedding_matrix, axis=1)[:, None]

         index = faiss.IndexFlatIP(embedding_matrix.shape[1])
         index.add(embedding_matrix)

         save_embeddings(df['embedding'].tolist(), embedding_file)
         faiss.write_index(index, index_file)
         embeddings = df['embedding'].tolist()

     return index, embeddings
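
Because each row of the embedding matrix is L2-normalized before being added to an IndexFlatIP, the inner-product scores returned by the index are cosine similarities. A small self-contained check of that equivalence, using synthetic vectors instead of OpenAI embeddings:

import faiss
import numpy as np

vecs = np.random.rand(5, 8).astype('float32')
vecs /= np.linalg.norm(vecs, axis=1)[:, None]  # unit-length rows, as above

index = faiss.IndexFlatIP(vecs.shape[1])
index.add(vecs)

D, I = index.search(vecs[0:1], 1)  # query with a stored vector
print(I[0][0], D[0][0])            # prints 0 and ~1.0: cosine of a vector with itself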
 def search(query, df, index, k=3):
     query_embedding = np.array(get_embedding(query)).astype('float32')
     query_embedding /= np.linalg.norm(query_embedding)
     D, I = index.search(np.array([query_embedding]), k)

     results = []
     for i in range(k):
         result = df.iloc[I[0][i]]
         results.append({
             'content': result['full_content'],
             'url': result['url'],
             'type': result['type'],
             'metadata': result['metadata'],
             'similarity_score': float(D[0][i])
         })

     return results
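
Called directly, search returns a list of plain dictionaries that serialize cleanly to JSON. A hypothetical call (the query is an example, and the scores depend on the indexed data):

results = search("¿Cómo solicito una beca?", df, index)
for r in results:
    print(round(r['similarity_score'], 3), r['type'], r['url'])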
 # Initialize data and index
-df = process_data(os.path.join(settings.BASE_DIR, "chatbot_laura", "preguntas_respuestas_procesadasV1.json"))
+df = process_data(json_file)
 index, embeddings = initialize_or_load_index(df)

The second changed file defines the Channels consumer and the search view (its filename is not shown here). The visible lines are identical on both sides of the hunk, so the file appears once below.
@@ -1,26 +1,26 @@
 from django.http import JsonResponse
 from channels.generic.websocket import AsyncWebsocketConsumer
 import json
 from .chatbot_logic import search, df, index

 class ChatConsumer(AsyncWebsocketConsumer):
     async def connect(self):
         await self.accept()

     async def disconnect(self, close_code):
         pass

     async def receive(self, text_data):
         text_data_json = json.loads(text_data)
         query = text_data_json['message']

         results = search(query, df, index)

         await self.send(text_data=json.dumps({
             'message': results
         }))
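
One way to exercise the consumer end to end is with the websockets package. The URL below is an assumption: this commit does not show the Channels routing, so substitute whatever path the project maps to ChatConsumer:

import asyncio
import json
import websockets

async def ask(question):
    # Hypothetical URL; depends on the project's routing configuration.
    async with websockets.connect("ws://localhost:8000/ws/chat/") as ws:
        await ws.send(json.dumps({'message': question}))
        reply = json.loads(await ws.recv())
        return reply['message']  # the list of results built by search()

print(asyncio.run(ask("¿Cómo solicito una beca?")))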
 def search_view(request):
     query = request.GET.get('query', '')
     results = search(query, df, index)
     return JsonResponse({'results': results})
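
search_view can be smoke-tested with Django's test client; the URL pattern is an assumption, since this commit does not include the project's urls.py:

from django.test import Client

client = Client()
response = client.get("/chat/search/", {"query": "becas"})  # hypothetical route
print(response.json()["results"])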