-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathsummarizer_v2.py
151 lines (128 loc) · 4.85 KB
/
summarizer_v2.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
import os
import re
import json
import ollama
from dotenv import load_dotenv
# Pull settings from a .env file into the process environment before they
# are read below.
load_dotenv()

# Host name or IP address of the Ollama server (no scheme, no port); the
# "http://" prefix and port 11434 are added when the client is created.
ollama_host = os.getenv("REMOTE_OLLAMA_HOST")
if ollama_host is None:
    # Without this check the string concatenation below fails with an opaque
    # "can only concatenate str (not NoneType)" TypeError at import time.
    raise RuntimeError(
        "REMOTE_OLLAMA_HOST is not set; define it in the environment "
        "or in a .env file"
    )
print(f"ollama_host: {ollama_host}")

# Shared client used by every request this module sends to Ollama.
client = ollama.Client(host=f"http://{ollama_host}:11434")
# Either an optional run of text closed by one or more terminators
# ('.', '?', '!'), or a final run of text that has no terminator at all.
_SENTENCE_RE = re.compile(r'[^.?!]*[.?!]+|[^.?!]+')


def split_into_sentences(text):
    """
    Split text into sentence-like pieces on '.', '?' and '!'.

    Each piece keeps its closing punctuation, and a run of terminators
    ("...", "!!", "?!") stays attached to the sentence it ends instead of
    being torn off. Trailing text without a terminator becomes the last
    piece. Pieces are returned in the order they occur in the input, with
    surrounding whitespace trimmed and empty pieces dropped.

    This is a deliberately simple splitter: abbreviations and decimal points
    ("e.g.", "3.14") are still treated as sentence ends.

    Args:
        text: The text to split. Empty or None yields an empty list.

    Returns:
        A list of non-empty, whitespace-trimmed sentence strings.
    """
    if not text:
        return []
    # A single left-to-right scan keeps every piece in document order; no
    # separate "leftover" pass is needed because the pattern's second
    # alternative already captures unterminated trailing text.
    trimmed = (piece.strip() for piece in _SENTENCE_RE.findall(text))
    return [piece for piece in trimmed if piece]
def chunk_transcript(transcript, max_words_per_chunk=4000):
    """
    Split a transcript into chunks of at most max_words_per_chunk words.

    1) Split the transcript into sentences.
    2) Pack whole sentences into a chunk until the next sentence would push
       it past the word limit, then start a new chunk.
    3) If a single sentence is longer than the limit, split that sentence by
       words into limit-sized pieces.

    Chunks are always emitted in transcript order: any sentences already
    buffered are flushed before an over-long sentence is split, so its pieces
    never jump ahead of the text that precedes it.

    Args:
        transcript: The full text to chunk.
        max_words_per_chunk: Upper bound on whitespace-separated words per
            chunk. Must be at least 1.

    Returns:
        A list of chunk strings; words within a chunk are joined by single
        spaces.

    Raises:
        ValueError: If max_words_per_chunk is less than 1 (a non-positive
            limit could never make progress on a non-empty sentence).
    """
    if max_words_per_chunk < 1:
        raise ValueError("max_words_per_chunk must be a positive integer")

    chunks = []
    pending_words = []  # words of the chunk currently being assembled

    for sentence in split_into_sentences(transcript):
        words = sentence.split()

        if len(words) > max_words_per_chunk:
            # Flush what is buffered first so the output stays in order.
            if pending_words:
                chunks.append(" ".join(pending_words))
                pending_words = []
            # The sentence alone exceeds the limit: slice it by words.
            chunks.extend(
                " ".join(words[start:start + max_words_per_chunk])
                for start in range(0, len(words), max_words_per_chunk)
            )
            continue

        # Start a new chunk when this sentence would overflow the current one.
        if len(pending_words) + len(words) > max_words_per_chunk:
            chunks.append(" ".join(pending_words))
            pending_words = []
        pending_words.extend(words)

    # Whatever is still buffered forms the final chunk.
    if pending_words:
        chunks.append(" ".join(pending_words))
    return chunks
# ----------------------------
# 2) IMPROVED PROMPT ENGINEERING
# ----------------------------
def build_prompts_for_chunk(chunk_text):
    """
    Build the four summarization prompts for one chunk of text.

    The returned dict maps, in this order:
    - "concise": a short summary of the main idea
    - "key_topics": the main topics or themes as bullet points
    - "takeaways": key lessons the reader should remember
    - "comprehensive": thorough notes with examples, references and quotes

    Every prompt is its instruction followed by a "TEXT:" line and the chunk
    itself, with leading and trailing whitespace stripped from the result.
    """
    instructions = (
        (
            "concise",
            "You are an expert summarizer. Read the following text and produce a concise summary\n"
            "(no more than 150 words) covering the main idea only:",
        ),
        (
            "key_topics",
            "You are an expert note-taker. From the following text, list the main topics or themes\n"
            "(with short bullet points), focusing on clarity and coverage:",
        ),
        (
            "takeaways",
            "You are a teaching assistant. From the text below, list the key takeaways or lessons\n"
            "the reader should remember. Focus on clarity and practical insights, in short bullet points:",
        ),
        (
            "comprehensive",
            "You are a meticulous researcher. Provide a comprehensive set of notes about\n"
            "the following text, capturing major points, examples, references, or quotes.\n"
            "Organize your notes with headings or bullet points. Aim for thoroughness:",
        ),
    )
    # Strip after interpolation so whitespace at the end of the chunk is
    # trimmed along with the template's own padding.
    return {
        key: f"{instruction}\nTEXT:\n{chunk_text}".strip()
        for key, instruction in instructions
    }
def ollama_generate_chunk(model_name, prompt):
    """
    Send one prompt to Ollama and return the reply text.

    The prompt is sent as a single user message through the module-level
    chat client. The reply's message content is returned with surrounding
    whitespace stripped; a reply without content yields "".

    This is best-effort: any exception raised while calling the model or
    reading the reply is printed and "" is returned instead of propagating.
    """
    user_message = {"role": "user", "content": prompt}
    try:
        reply = client.chat(model=model_name, messages=[user_message])
        message = reply.get("message", {})
        return message.get("content", "").strip()
    except Exception as exc:
        # Deliberately broad: a failed request must not abort the whole run.
        print(f"Ollama request failed: {exc}")
        return ""