Update app.py
Browse files
app.py
CHANGED
@@ -16,6 +16,7 @@ import docx2txt
|
|
16 |
from io import StringIO
|
17 |
from PyPDF2 import PdfFileReader
|
18 |
import warnings
|
|
|
19 |
warnings.filterwarnings("ignore")
|
20 |
|
21 |
|
@@ -63,6 +64,28 @@ def article_text_extractor(url: str):
|
|
63 |
chunks[chunk_id] = " ".join(chunks[chunk_id])
|
64 |
|
65 |
return article_header, chunks
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
66 |
|
67 |
def preprocess_plain_text(x):
|
68 |
|
@@ -85,6 +108,7 @@ def extract_pdf(file):
|
|
85 |
for i in range(count):
|
86 |
page = pdfReader.getPage(i)
|
87 |
all_text += page.extractText()
|
|
|
88 |
|
89 |
return all_text
|
90 |
|
@@ -199,11 +223,11 @@ if is_url:
|
|
199 |
|
200 |
elif upload_doc:
|
201 |
|
202 |
-
clean_text = preprocess_plain_text(extract_text_from_file(upload_doc))
|
203 |
|
204 |
else:
|
205 |
|
206 |
-
clean_text = preprocess_plain_text(plain_text)
|
207 |
|
208 |
summarize = st.button("Summarize")
|
209 |
|
|
|
16 |
from io import StringIO
|
17 |
from PyPDF2 import PdfFileReader
|
18 |
import warnings
|
19 |
+
from nltk import sent_tokenize
|
20 |
warnings.filterwarnings("ignore")
|
21 |
|
22 |
|
|
|
64 |
chunks[chunk_id] = " ".join(chunks[chunk_id])
|
65 |
|
66 |
return article_header, chunks
|
67 |
+
|
68 |
+
def chunk_clean_text(text, max_words=500, tokenizer=None):
    """Split *text* into chunks of whole sentences, each within a word budget.

    Sentences are packed greedily, in order, into the current chunk. A new
    chunk is started when adding the next sentence would push the chunk past
    ``max_words``. Sentences are never split, so a single sentence longer
    than ``max_words`` becomes a chunk of its own.

    Args:
        text: The text to chunk. An empty or ``None`` value yields ``[]``.
        max_words: Maximum number of words per chunk (default 500).
        tokenizer: Callable mapping a string to an iterable of sentence
            strings. Defaults to the ``sent_tokenize`` imported at module
            level.

    Returns:
        A list of strings, one per chunk, each made of the chunk's sentences
        joined by single spaces.
    """
    if not text:
        return []

    if tokenizer is None:
        # Resolved at call time so the module-level import stays the default.
        tokenizer = sent_tokenize

    chunks = []
    current_sentences = []
    current_words = 0

    for sentence in tokenizer(text):
        # Words are counted by splitting on single spaces, which keeps the
        # chunk boundaries identical to the previous implementation.
        word_count = len(sentence.split(" "))

        # Close the running chunk only if it already holds something;
        # otherwise an oversized first sentence would emit an empty chunk.
        if current_sentences and current_words + word_count > max_words:
            chunks.append(" ".join(current_sentences))
            current_sentences = []
            current_words = 0

        current_sentences.append(sentence)
        current_words += word_count

    if current_sentences:
        chunks.append(" ".join(current_sentences))

    return chunks
|
89 |
|
90 |
def preprocess_plain_text(x):
|
91 |
|
|
|
108 |
for i in range(count):
|
109 |
page = pdfReader.getPage(i)
|
110 |
all_text += page.extractText()
|
111 |
+
|
112 |
|
113 |
return all_text
|
114 |
|
|
|
223 |
|
224 |
elif upload_doc:
|
225 |
|
226 |
+
clean_text = chunk_clean_text(preprocess_plain_text(extract_text_from_file(upload_doc)))
|
227 |
|
228 |
else:
|
229 |
|
230 |
+
clean_text = chunk_clean_text(preprocess_plain_text(plain_text))
|
231 |
|
232 |
summarize = st.button("Summarize")
|
233 |
|