Soumen committed on
Commit
29e33a8
1 Parent(s): 40f693b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +36 -46
app.py CHANGED
@@ -20,10 +20,7 @@ st.set_page_config(page_title="Summarization Tool", layout="wide", initial_sideb
20
  import torch
21
  import docx2txt
22
  from PIL import Image
23
- from langchain.document_loaders import PyPDFLoader
24
- from langchain.text_splitter import CharacterTextSplitter
25
- import tempfile
26
- #from PyPDF2 import PdfFileReader
27
  from pdf2image import convert_from_bytes
28
  import pdfplumber
29
  #from line_cor import mark_region
@@ -43,26 +40,26 @@ headers1 = {"Authorization": "Bearer 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"}
43
  API_URL2 = "https://api-inference.huggingface.co/models/gpt2"
44
  headers2 = {"Authorization": "Bearer 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"}
45
 
46
- # def read_pdf(file):
47
- # # images=pdf2image.convert_from_path(file)
48
- # # # print(type(images))
49
- # pdfReader = PdfFileReader(file)
50
- # count = pdfReader.numPages
51
- # all_page_text = " "
52
- # for i in range(count):
53
- # page = pdfReader.getPage(i)
54
- # all_page_text += page.extractText()+" "
55
- # return all_page_text
56
- def read_pdf_with_pdfplumber(file):
57
- # Open the uploaded PDF file with pdfplumber
58
- with pdfplumber.open(file) as pdf:
59
- extracted_text = ''
60
- for page in pdf.pages:
61
- extracted_text += page.extract_text()
62
 
63
- # Display the extracted text
64
- #st.text(extracted_text)
65
- return extracted_text
66
 
67
  def engsum(output):
68
  def query(payload):
@@ -99,31 +96,24 @@ def main():
99
  st.session_state["photo"]="not done"
100
  if st.session_state["photo"]=="done" or message:
101
  if uploaded_photo and uploaded_photo.type=='application/pdf':
102
- text=[]
103
- loader = None
104
- #tet = read_pdf(uploaded_photo)
105
- with tempfile.NamedTemporaryFile(delete=False) as temp_file:
106
- temp_file.write(uploaded_photo.read())
107
- temp_file_path = temp_file.name
108
 
109
- loader = PyPDFLoader(temp_file_path)
110
- if loader:
111
- text.extend(loader.load())
112
- os.remove(temp_file_path)
113
- text_splitter = CharacterTextSplitter(separator="\n", chunk_size=1000, chunk_overlap=100, length_function=len)
114
- text_chunks = text_splitter.split_documents(text)
115
- # values = st.slider('Select a approximate number of lines to see and summarize',value=[0, len(tet)//(7*100)])
116
- #text = tet[values[0]*7*10:values[1]*7*100] if values[0]!=len(tet)//(7*100) else tet[len(tet)//(7*100):]
117
  st.text("Selected text for summarize: ")
118
- # if st.button("BENGALI"):
119
- # st.success(text)
120
- # st.text("Summarized text: ")
121
- # bansum(text)
122
- #if st.button("ENGLISH"):
123
- st.success(type(text_chunks))
124
- st.success(text_chunks[0])
125
- st.text("Summarized text: ")
126
- engsum(text_chunks[0])
127
 
128
  elif uploaded_photo and uploaded_photo.type !='application/pdf':
129
  text=None
 
20
  import torch
21
  import docx2txt
22
  from PIL import Image
23
+ from PyPDF2 import PdfFileReader
 
 
 
24
  from pdf2image import convert_from_bytes
25
  import pdfplumber
26
  #from line_cor import mark_region
 
40
  API_URL2 = "https://api-inference.huggingface.co/models/gpt2"
41
  headers2 = {"Authorization": "Bearer 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"}
42
 
43
def read_pdf(file):
    """Extract the text of every page of a PDF into one string.

    Args:
        file: Whatever PyPDF2's reader accepts (a path or a binary
            file-like object). The visible caller passes the uploaded
            file object straight through.

    Returns:
        A string that starts with a single space and contains each
        page's extracted text followed by a single space, in page order.
    """
    try:
        # PdfReader and the snake_case page API exist from PyPDF2 1.28 on.
        # The legacy names used previously (PdfFileReader, numPages,
        # getPage, extractText) raise DeprecationError in PyPDF2 >= 3.0.
        from PyPDF2 import PdfReader
    except ImportError:
        # Older PyPDF2 that only ships the legacy camelCase API.
        from PyPDF2 import PdfFileReader

        legacy_reader = PdfFileReader(file)
        page_texts = [
            legacy_reader.getPage(index).extractText()
            for index in range(legacy_reader.numPages)
        ]
    else:
        page_texts = [page.extract_text() for page in PdfReader(file).pages]

    # Same layout as before: leading space, then "<page text> " per page.
    # Joining once avoids repeated string concatenation in a loop.
    return " " + "".join(text + " " for text in page_texts)
53
+ # def read_pdf_with_pdfplumber(file):
54
+ # # Open the uploaded PDF file with pdfplumber
55
+ # with pdfplumber.open(file) as pdf:
56
+ # extracted_text = ''
57
+ # for page in pdf.pages:
58
+ # extracted_text += page.extract_text()
59
 
60
+ # # Display the extracted text
61
+ # #st.text(extracted_text)
62
+ # return extracted_text
63
 
64
  def engsum(output):
65
  def query(payload):
 
96
  st.session_state["photo"]="not done"
97
  if st.session_state["photo"]=="done" or message:
98
  if uploaded_photo and uploaded_photo.type=='application/pdf':
99
+ tet = read_pdf(uploaded_photo)
100
+ # with tempfile.NamedTemporaryFile(delete=False) as temp_file:
101
+ # temp_file.write(uploaded_photo.read())
102
+ # temp_file_path = temp_file.name
 
 
103
 
104
+ # loader = PyPDFLoader(temp_file_path)
105
+ # if loader:
106
+ # text.extend(loader.load())
107
+ # os.remove(temp_file_path)
108
+ # text_splitter = CharacterTextSplitter(separator="\n", chunk_size=1000, chunk_overlap=100, length_function=len)
109
+ # text_chunks = text_splitter.split_documents(text)
110
+ values = st.slider('Select a approximate number of lines to see and summarize',value=[0, len(tet)//(7*100)])
111
+ text = tet[values[0]*7*10:values[1]*10*100] if values[0]!=len(tet)//(10*100) else tet[len(tet)//(10*100):]
112
  st.text("Selected text for summarize: ")
113
+ #st.success(type(text_chunks))
114
+ st.success(text)
115
+ st.text("Summarized Text: ")
116
+ engsum(text)
 
 
 
 
 
117
 
118
  elif uploaded_photo and uploaded_photo.type !='application/pdf':
119
  text=None