Spaces:

Soumen
/

Text-Summarization-and-NLP-tasks

Running

App Files Files Community

Soumen committed on Sep 8, 2023

Commit

a2c3102

•

1 Parent(s): 9d5dc1c

Update app.py

Browse files

Files changed (1) hide show

app.py +29 -18

app.py CHANGED Viewed

@@ -20,7 +20,10 @@ st.set_page_config(page_title="Summarization Tool", layout="wide", initial_sideb
 import torch
 import docx2txt
 from PIL import Image
-from PyPDF2 import PdfFileReader
 from pdf2image import convert_from_bytes
 import pdfplumber
 #from line_cor import mark_region
@@ -96,24 +99,32 @@ def main():
         st.session_state["photo"]="not done"
     if st.session_state["photo"]=="done" or message:
         if uploaded_photo and uploaded_photo.type=='application/pdf':
-            text=None
-            #file = uploaded_photo.read() # Read the data
-            #image_result = open(uploaded_photo.name, 'wb') # creates a writable image and later we can write the decoded result
-            #image_result.write(file)
-            tet = read_pdf_with_pdfplumber(uploaded_photo)
-            #tet = pytesseract.image_to_string(img, lang="ben") if st.checkbox("Mark to see Bangla Image's Text") else pytesseract.image_to_string(img)
-            values = st.slider('Select a approximate number of lines to see and summarize',value=[0, len(tet)//(7*100)])
-            text = tet[values[0]*7*10:values[1]*7*100] if values[0]!=len(tet)//(7*100) else tet[len(tet)//(7*100):]
             st.text("Selected text for summarize: ")
-            if st.button("BENGALI"):
-                st.success(text)
-                st.text("Summarized text: ")
-                bansum(text)
-            if st.button("ENGLISH"):
-                st.success(text)
-                st.text("Summarized text: ")
-                engsum(text)
         elif uploaded_photo and uploaded_photo.type !='application/pdf':
             text=None
             img = Image.open(uploaded_photo)

 import torch
 import docx2txt
 from PIL import Image
+from langchain.document_loaders import PyPDFLoader
+from langchain.text_splitter import CharacterTextSplitter
+import tempfile
+#from PyPDF2 import PdfFileReader
 from pdf2image import convert_from_bytes
 import pdfplumber
 #from line_cor import mark_region
         st.session_state["photo"]="not done"
     if st.session_state["photo"]=="done" or message:
         if uploaded_photo and uploaded_photo.type=='application/pdf':
+            text=[]
+            loader = None
+            #tet = read_pdf(uploaded_photo)
+            with tempfile.NamedTemporaryFile(delete=False) as temp_file:
+                temp_file.write(uploaded_photo.read())
+                temp_file_path = temp_file.name
+            loader = PyPDFLoader(temp_file_path)
+            if loader:
+                text.extend(loader.load())
+                os.remove(temp_file_path)
+            text_splitter = CharacterTextSplitter(separator="\n", chunk_size=1000, chunk_overlap=100, length_function=len)
+            text_chunks = text_splitter.split_documents(text)
+           # values = st.slider('Select a approximate number of lines to see and summarize',value=[0, len(tet)//(7*100)])
+            #text = tet[values[0]*7*10:values[1]*7*100] if values[0]!=len(tet)//(7*100) else tet[len(tet)//(7*100):]
             st.text("Selected text for summarize: ")
+            # if st.button("BENGALI"):
+            #     st.success(text)
+            #     st.text("Summarized text: ")
+            #     bansum(text)
+            #if st.button("ENGLISH"):
+            st.success(type(text_chunks))
+            st.success(text_chunks[0])
+            st.text("Summarized text: ")
+            engsum(text_chunks[0])
         elif uploaded_photo and uploaded_photo.type !='application/pdf':
             text=None
             img = Image.open(uploaded_photo)