app.py
from flask import Flask, request, jsonify
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline, BitsAndBytesConfig
from flask_cors import CORS
from huggingface_hub import login
import requests
import os
import time
import json
import re
# Load environment variables from .env file
from dotenv import load_dotenv
load_dotenv()
app = Flask(__name__)
CORS(app)
# Environment variables for Hugging Face Token and Dr Droid Token
hf_token = os.getenv('HF_TOKEN')
dr_droid_token = os.getenv('DR_DROID_TOKEN')
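# A minimal .env supplying these values might look like this
# (illustrative placeholders, not real credentials):
#   HF_TOKEN=hf_xxxxxxxxxxxxxxxxxxxx
#   DR_DROID_TOKEN=<your Dr Droid API token>
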
headers = {
'Authorization': f'Bearer {dr_droid_token}',
}
# Trigger the named Dr Droid workflows via the local executor API
def execute_workflows(workflow_names):
    for workflow_name in workflow_names:
        json_data = {
            'workflow_name': workflow_name,
        }
        requests.post('http://localhost/executor/workflows/api/execute', headers=headers, json=json_data)

workflow_list = ['Fine Tuning Data - Integrity Checks', 'App Data Validation', 'Batch Inference Performance Debugging']
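# Note: the workflow names above are assumed to match workflows already configured
# in the Dr Droid instance behind the executor endpoint used in execute_workflows().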
# Log in to the Hugging Face Hub with your token
login(token=hf_token)
model_id = "ShubhamBhardwaj994/medical_research_llama_3.1-8b"
# Configure for 4-bit quantization
quantization_config = BitsAndBytesConfig(load_in_4bit=True)
# Load the model with 4-bit quantization
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=quantization_config,
    device_map="auto",
)
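# Note: 4-bit loading through BitsAndBytesConfig generally requires the
# bitsandbytes package and a CUDA-capable GPU to be available.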
# Initialize the tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_id)
# Create a text generation pipeline
text_pipeline = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    device_map="auto",
    model_kwargs={"torch_dtype": torch.bfloat16}
)
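# Note: passing a list of chat messages (as done in the endpoint below) relies on
# chat-template support in recent transformers releases; the pipeline then returns
# the full conversation, including the new assistant turn, under "generated_text".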
# Load the JSONL file and return the list of entries
def load_jsonl_file(file_path):
    entries = []
    with open(file_path, 'r') as file:
        for line in file:
            entry = json.loads(line.strip())
            entries.append(entry)
    return entries
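# Each line of the JSONL dataset is expected to be a JSON object with
# "instruction", "input", and "output" keys, e.g. (illustrative):
#   {"instruction": "...", "input": "...", "output": "..."}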
@app.route('/finetuned_llm', methods=['POST'])
def finetuned_llm():
    data = request.get_json()
    if not data or 'content' not in data:
        return jsonify({"error": "Invalid input"}), 400

    # Extract the structured input, falling back to empty strings for missing fields
    content = data['content']
    instruction = content.get('instruction', "")
    input_content = content.get('input', "")  # Use an empty string if input is None

    # Prepare the chat-style messages structure
    messages = [
        {"role": "system", "content": instruction},
        {"role": "user", "content": input_content or ""}
    ]

    # Load JSONL entries
    jsonl_entries = load_jsonl_file('fine_tuning_dataset.jsonl')  # Replace with your actual file path

    # Find the matching entry in the JSONL file
    expected_output = None
    for entry in jsonl_entries:
        if entry['instruction'] == instruction and entry['input'] == input_content:
            expected_output = entry['output']
            break

    # Measure the time taken to generate the response
    start_time = time.time()

    try:
        # Generate the response
        outputs = text_pipeline(
            messages,
            max_new_tokens=50,
            do_sample=True,
            temperature=0.6,
            top_p=0.9,
        )
        end_time = time.time()
        elapsed_time = end_time - start_time  # Measured for reference; not included in the response

        # Ensure outputs are as expected
        if outputs and isinstance(outputs, list) and "generated_text" in outputs[0]:
            generated_text = outputs[0]["generated_text"]

            # Extract the first 'assistant' role message
            assistant_response = next(
                (message["content"] for message in generated_text if message["role"] == "assistant"),
                "No answer found."
            )

            # Truncate at the stop word, if present
            stop_word = "<|im_end|>"
            stop_word_index = assistant_response.lower().find(stop_word.lower())
            if stop_word_index != -1:
                assistant_response = assistant_response[:stop_word_index]

            # Keep only the content after the first colon, if any
            colon_index = assistant_response.find(':')
            if colon_index != -1:
                assistant_response = assistant_response[colon_index + 1:]

            # Clean the assistant response
            assistant_response = assistant_response.replace("\n", " ")    # Replace newline characters with spaces
            assistant_response = re.sub(r"\s+", " ", assistant_response)  # Normalize whitespace
            assistant_response = assistant_response.strip()               # Trim leading and trailing whitespace

            # Compare with the expected output; on mismatch, kick off the debugging workflows
            is_match = assistant_response == expected_output
            if not is_match:
                execute_workflows(workflow_list)
        else:
            assistant_response = "Error: Unable to generate text."
            is_match = False
    except Exception as e:
        print(f"Error during text generation: {e}")
        assistant_response = "Error: Exception during processing."
        is_match = False

    return jsonify({
        "answer": assistant_response,
        "expected": expected_output,
        "match": is_match
    })
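
# Example request (illustrative values; assumes the server is running locally on port 8000):
#   curl -X POST http://localhost:8000/finetuned_llm \
#        -H "Content-Type: application/json" \
#        -d '{"content": {"instruction": "...", "input": "..."}}'
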
if __name__ == '__main__':
    app.run(host='0.0.0.0', port=8000)