TextSummarizer.py
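"""Streamlit app offering two text-summarization modes: an extractive
summarizer built on NLTK word-frequency scoring, and an abstractive
summarizer built on a pretrained T5 model from Hugging Face transformers."""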
from heapq import nlargest
from string import punctuation

import nltk
import streamlit as st  # type: ignore
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from transformers import T5Tokenizer, T5ForConditionalGeneration  # type: ignore

# The NLTK stop-word list and tokenizer models are data packages that must
# be downloaded once; without these calls the first run raises a LookupError.
nltk.download("stopwords", quiet=True)
nltk.download("punkt", quiet=True)

stopWords = set(stopwords.words("english"))
st.title("Text Summarizer")
input_text = st.text_area("Enter your text to summarize:", height=200)
col1, col2, col3 = st.columns(3)
with col1:
    option = st.selectbox(
        "Choose a summarization technique",
        (
            "Extractive",
            "Abstractive",
        ),
    )
text = input_text
if st.button("Summarize"):
    if option == "Extractive":
        # Frequency-based extractive summarization: score each sentence by
        # the summed frequency of its non-stop-word, non-punctuation tokens.
        words = [word.lower() for word in word_tokenize(text)]
        freqTable = dict()
        for word in words:
            if word not in stopWords and word not in punctuation:
                freqTable[word] = freqTable.get(word, 0) + 1
        sentences = sent_tokenize(text)
        sentenceValue = dict()
        for sentence in sentences:
            for word, freq in freqTable.items():
                if word in sentence.lower():
                    sentenceValue[sentence] = sentenceValue.get(sentence, 0) + freq
        if sentenceValue:
            # Normalize scores to [0, 1] by the highest sentence score.
            maxValue = max(sentenceValue.values())
            for sentence in sentenceValue:
                sentenceValue[sentence] = sentenceValue[sentence] / maxValue
            # Keep the top 35% of sentences, but always at least one.
            length = max(1, int(len(sentenceValue) * 0.35))
            summary = nlargest(length, sentenceValue, key=sentenceValue.get)
            st.write(" ".join(summary))
        else:
            st.warning("Please enter some text to summarize.")
    else:
        # Abstractive summarization with a pretrained T5 model.
        model_name = "t5-small"
        tokenizer = T5Tokenizer.from_pretrained(model_name)
        model = T5ForConditionalGeneration.from_pretrained(model_name)
        # T5 expects a task prefix; truncate to the model's 512-token limit.
        inputs = tokenizer.encode("summarize: " + text, return_tensors="pt",
                                  truncation=True, max_length=512)
        # Cap the summary at roughly 40% of the input length, measured in
        # tokens rather than characters, and keep it above min_length.
        mlength = max(45, int(inputs.shape[1] * 0.40))
        summary_ids = model.generate(inputs, max_length=mlength, min_length=40,
                                     length_penalty=1.0, num_beams=10,
                                     early_stopping=True)
        Summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
        st.write(Summary)
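
# A typical invocation, assuming the dependencies are installed
# (streamlit, nltk, transformers, torch, and sentencepiece, which
# T5Tokenizer needs for its SentencePiece vocabulary):
#
#     streamlit run TextSummarizer.py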