Spaces:

Soumen
/

Text-Summarization-and-NLP-tasks

Running

App Files Files Community

Soumen committed on Sep 6, 2023

Commit

1a9fa41

1 Parent(s): f176b84

Update app.py

Browse files

Files changed (1) hide show

app.py +12 -17

app.py CHANGED Viewed

@@ -55,22 +55,17 @@ def read_pdf(file):
 #         text = pytesseract.image_to_string(image_name, lang="ben") if st.checkbox("Mark to see Bangla Image's Text") else pytesseract.image_to_string(image_name)
         all_page_text += page.extractText()+" "
     return all_page_text
-# def read_pdf_with_pdfplumber(file):
-#     all_page_text=" "
-# #     all_page_text = ""
-#     with pdfplumber.open(file) as pdf:
-# 	    page = pdf.pages[0]
-#     ge=page.to_image()
-#     img = Image.open(ge)
-#     img = img.save("img.png")
-#     image_name = cv2.imread("img.png")
-# # get co-ordinates to c
-# #         return page.extract_text()
-# #         get co-ordinates to cr
-# # # get co-ordinates to cr
-#     text = pytesseract.image_to_string(image_name, lang="ben") if st.checkbox("Mark to see Bangla Image's Text") else pytesseract.image_to_string(image_name)
-#     all_page_text += text + " " #page.extractText()
-#     return all_page_text
 st.title("NLP APPLICATION")
 #@st.cache_resource(experimental_allow_widgets=True)
 def main():
@@ -97,7 +92,7 @@ def main():
             #file = uploaded_photo.read() # Read the data
             #image_result = open(uploaded_photo.name, 'wb') # creates a writable image and later we can write the decoded result
             #image_result.write(file)
-            tet = read_pdf(uploaded_photo)
             #tet = pytesseract.image_to_string(img, lang="ben") if st.checkbox("Mark to see Bangla Image's Text") else pytesseract.image_to_string(img)
             values = st.slider('Select a approximate number of lines to see and summarize',value=[0, len(tet)//(7*10)])
             text = tet[values[0]*7*10:values[1]*7*10] if values[0]!=len(tet)//(7*10) else tet[len(tet)//(7*10):]

 #         text = pytesseract.image_to_string(image_name, lang="ben") if st.checkbox("Mark to see Bangla Image's Text") else pytesseract.image_to_string(image_name)
         all_page_text += page.extractText()+" "
     return all_page_text
+def read_pdf_with_pdfplumber(file):
+    """Extract the text of every page of a PDF with pdfplumber.
+
+    The extracted text is also rendered in the Streamlit app via ``st.text``.
+
+    Args:
+        file: The PDF source; passed straight to ``pdfplumber.open``.
+
+    Returns:
+        str: The text of all pages concatenated in page order (empty string
+        when nothing could be extracted).
+    """
+    # Open the uploaded PDF file with pdfplumber
+    with pdfplumber.open(file) as pdf:
+        # ``or ""`` guards against extract_text() yielding None (some
+        # pdfplumber versions appear to do so for pages without text),
+        # which would otherwise make the concatenation raise TypeError.
+        extracted_text = "".join(page.extract_text() or "" for page in pdf.pages)
+    # Display the extracted text
+    st.text(extracted_text)
+    # Hand the text back: the caller takes len() of and slices the result,
+    # so returning nothing (None) would crash it.
+    return extracted_text
 st.title("NLP APPLICATION")
 #@st.cache_resource(experimental_allow_widgets=True)
 def main():
             #file = uploaded_photo.read() # Read the data
             #image_result = open(uploaded_photo.name, 'wb') # creates a writable image and later we can write the decoded result
             #image_result.write(file)
+            tet = read_pdf_with_pdfplumber(uploaded_photo)
             #tet = pytesseract.image_to_string(img, lang="ben") if st.checkbox("Mark to see Bangla Image's Text") else pytesseract.image_to_string(img)
             values = st.slider('Select a approximate number of lines to see and summarize',value=[0, len(tet)//(7*10)])
             text = tet[values[0]*7*10:values[1]*7*10] if values[0]!=len(tet)//(7*10) else tet[len(tet)//(7*10):]