Spaces:

Soumen
/

Text-Summarization-and-NLP-tasks

Running

App Files Files Community

Soumen committed on Sep 8, 2023

Commit

29e33a8

1 Parent(s): 40f693b

Update app.py

Browse files

Files changed (1) hide show

app.py +36 -46

app.py CHANGED Viewed

@@ -20,10 +20,7 @@ st.set_page_config(page_title="Summarization Tool", layout="wide", initial_sideb
 import torch
 import docx2txt
 from PIL import Image
-from langchain.document_loaders import PyPDFLoader
-from langchain.text_splitter import CharacterTextSplitter
-import tempfile
-#from PyPDF2 import PdfFileReader
 from pdf2image import convert_from_bytes
 import pdfplumber
 #from line_cor import mark_region
@@ -43,26 +40,26 @@ headers1 = {"Authorization": "Bearer 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"}
 API_URL2 = "https://api-inference.huggingface.co/models/gpt2"  # Hugging Face Inference API URL for the gpt2 model
 headers2 = {"Authorization": "Bearer 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"}  # NOTE(review): bearer token is hard-coded in source; it should be revoked and read from a secret/environment variable instead
-# def read_pdf(file):
-# #     images=pdf2image.convert_from_path(file)
-# #     # print(type(images))
-#     pdfReader = PdfFileReader(file)
-#     count = pdfReader.numPages
-#     all_page_text = " "
-#     for i in range(count):
-#         page = pdfReader.getPage(i)
-#         all_page_text += page.extractText()+" "
-#     return all_page_text
-def read_pdf_with_pdfplumber(file):
-    # Open the uploaded PDF file with pdfplumber
-    with pdfplumber.open(file) as pdf:
-        extracted_text = ''
-        for page in pdf.pages:
-            extracted_text += page.extract_text()
-    # Display the extracted text
-    #st.text(extracted_text)
-    return extracted_text
 def engsum(output):
     def query(payload):
@@ -99,31 +96,24 @@ def main():
         st.session_state["photo"]="not done"
     if st.session_state["photo"]=="done" or message:
         if uploaded_photo and uploaded_photo.type=='application/pdf':
-            text=[]
-            loader = None
-            #tet = read_pdf(uploaded_photo)
-            with tempfile.NamedTemporaryFile(delete=False) as temp_file:
-                temp_file.write(uploaded_photo.read())
-                temp_file_path = temp_file.name
-            loader = PyPDFLoader(temp_file_path)
-            if loader:
-                text.extend(loader.load())
-                os.remove(temp_file_path)
-            text_splitter = CharacterTextSplitter(separator="\n", chunk_size=1000, chunk_overlap=100, length_function=len)
-            text_chunks = text_splitter.split_documents(text)
-           # values = st.slider('Select a approximate number of lines to see and summarize',value=[0, len(tet)//(7*100)])
-            #text = tet[values[0]*7*10:values[1]*7*100] if values[0]!=len(tet)//(7*100) else tet[len(tet)//(7*100):]
             st.text("Selected text for summarize: ")
-            # if st.button("BENGALI"):
-            #     st.success(text)
-            #     st.text("Summarized text: ")
-            #     bansum(text)
-            #if st.button("ENGLISH"):
-            st.success(type(text_chunks))
-            st.success(text_chunks[0])
-            st.text("Summarized text: ")
-            engsum(text_chunks[0])
         elif uploaded_photo and uploaded_photo.type !='application/pdf':
             text=None

 import torch
 import docx2txt
 from PIL import Image
+from PyPDF2 import PdfFileReader
 from pdf2image import convert_from_bytes
 import pdfplumber
 #from line_cor import mark_region
 API_URL2 = "https://api-inference.huggingface.co/models/gpt2"  # Hugging Face Inference API URL for the gpt2 model
 headers2 = {"Authorization": "Bearer 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"}  # NOTE(review): bearer token is hard-coded in source; it should be revoked and read from a secret/environment variable instead
+def read_pdf(file):  # Return the concatenated text of every page of the PDF `file` as one string.
+#     images=pdf2image.convert_from_path(file)
+#     # print(type(images))
+    pdfReader = PdfFileReader(file)  # PyPDF2 reader built directly from the argument passed by the caller
+    count = pdfReader.numPages  # number of pages to iterate over
+    all_page_text = " "  # accumulator starts as a single space, so the result always begins with a space
+    for i in range(count):  # visit pages in order, by index
+        page = pdfReader.getPage(i)
+        all_page_text += page.extractText()+" "  # append this page's text followed by one separating space
+    return all_page_text  # NOTE(review): PdfFileReader/numPages/getPage/extractText are legacy PyPDF2 names, replaced in newer releases by PdfReader/len(reader.pages)/reader.pages[i]/extract_text() - confirm the pinned PyPDF2 version
+# def read_pdf_with_pdfplumber(file):
+#     # Open the uploaded PDF file with pdfplumber
+#     with pdfplumber.open(file) as pdf:
+#         extracted_text = ''
+#         for page in pdf.pages:
+#             extracted_text += page.extract_text()
+#     # Display the extracted text
+#     #st.text(extracted_text)
+#     return extracted_text
 def engsum(output):
     def query(payload):
         st.session_state["photo"]="not done"
     if st.session_state["photo"]=="done" or message:
         if uploaded_photo and uploaded_photo.type=='application/pdf':
+            tet = read_pdf(uploaded_photo)
+            # with tempfile.NamedTemporaryFile(delete=False) as temp_file:
+            #     temp_file.write(uploaded_photo.read())
+            #     temp_file_path = temp_file.name
+            # loader = PyPDFLoader(temp_file_path)
+            # if loader:
+            #     text.extend(loader.load())
+            #     os.remove(temp_file_path)
+            # text_splitter = CharacterTextSplitter(separator="\n", chunk_size=1000, chunk_overlap=100, length_function=len)
+            # text_chunks = text_splitter.split_documents(text)
+            values = st.slider('Select a approximate number of lines to see and summarize',value=[0, len(tet)//(7*100)])
+            text = tet[values[0]*7*10:values[1]*10*100] if values[0]!=len(tet)//(10*100) else tet[len(tet)//(10*100):]
             st.text("Selected text for summarize: ")
+            #st.success(type(text_chunks))
+            st.success(text)
+            st.text("Summarized Text: ")
+            engsum(text)
         elif uploaded_photo and uploaded_photo.type !='application/pdf':
             text=None