EmreYY20 committed on
Commit
46193fd
1 Parent(s): 7aa7a44

integrate extractive model in streamlit

Browse files
Files changed (2) hide show
  1. app.py +18 -57
  2. extractive_model.py +50 -0
app.py CHANGED
@@ -1,58 +1,19 @@
1
  import streamlit as st
2
- import pandas as pd
3
- import PyPDF2
4
- import base64
5
- from summarizer import bert_summarizer, simple_summarizer
6
-
7
- summarizer = bert_summarizer
8
-
9
- # Set page to wide mode
10
- st.set_page_config(layout="wide")
11
-
12
- # Function to handle file upload and return its content
13
def load_pdf(file):
    """Return the text of every page of a PDF as one string.

    Args:
        file: The PDF source handed to ``PyPDF2.PdfReader``; the caller in
            this app passes the object returned by ``st.file_uploader``.

    Returns:
        str: The extracted text of all pages, one page per line group
        (pages are joined with a newline). Pages that yield no text
        contribute an empty string.
    """
    pdf_reader = PyPDF2.PdfReader(file)
    # `or ""` keeps the join from failing if a page produces no text, and the
    # newline separator stops the last word of one page from fusing with the
    # first word of the next.
    return "\n".join(page.extract_text() or "" for page in pdf_reader.pages)
19
-
20
- # Main app
21
def main():
    """Render the three-column summarizer page.

    Left column: a selector for the summarizer type. Middle column: a text
    box, a PDF uploader and a "Summarize" button. Right column: the most
    recent result kept in ``st.session_state.summary``.

    When the button is pressed, an uploaded PDF takes precedence over typed
    text. If neither is provided, a hint is shown and the stored result is
    left untouched.
    """
    st.title("Streamlit App")

    # Layout: 3 columns
    col1, col2, col3 = st.columns([1, 3, 2], gap="large")

    # Left column: Dropdown menu
    with col1:
        dropdown_options = ['Abstractive', 'Extractive']
        # The selection is not consumed yet; the widget is rendered only.
        st.selectbox("Choose type of summerizer:", dropdown_options)

    # Middle column: Text input and File uploader
    with col2:
        user_input = st.text_input("Enter your text here:")
        uploaded_file = st.file_uploader("Upload a PDF", type="pdf")
        if st.button("Summarize"):
            # TODO: run the text through `summarizer`; for now the raw input
            # is echoed back as the "summary".
            if uploaded_file is not None:
                file_content = load_pdf(uploaded_file)
                st.write("PDF uploaded successfully.")
                st.session_state.summary = file_content
            elif user_input and user_input.strip():
                # An untouched text box is an empty string, not None, so test
                # for actual content rather than `is not None`.
                st.session_state.summary = user_input
            else:
                # Nothing to summarize: show the hint and do not touch the
                # stored summary (there is no value to store).
                st.write("Upload a PDF or put in your text!")

    # Right column: Displaying text after pressing 'Summarize'
    with col3:
        st.write("Output:")
        if 'summary' in st.session_state:
            st.write(st.session_state.summary)


if __name__ == "__main__":
    main()
 
1
import os
import tempfile

import streamlit as st

from extractive_model import summarize_pdf_with_textrank

st.title("PDF Summarization App")

pdf_file = st.file_uploader("Upload a PDF file", type=["pdf"])
summary_length = st.slider("Select the number of sentences for the summary", 1, 20, 10)

if pdf_file is not None and st.button("Summarize"):
    # The summarizer takes a file path, so spill the upload to disk first.
    # A uniquely named temporary file (instead of a fixed "temp_pdf.pdf")
    # avoids overwriting another run's upload and lets us clean up afterwards.
    with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp:
        tmp.write(pdf_file.getbuffer())
        temp_path = tmp.name

    try:
        # Pass the slider value through; otherwise the summarizer silently
        # falls back to its default sentence count.
        summary = summarize_pdf_with_textrank(temp_path, sentences_count=summary_length)
    finally:
        os.remove(temp_path)

    # Display summary
    st.write("Summary:")
    st.write(summary)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
extractive_model.py CHANGED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """from sumy.parsers.plaintext import PlaintextParser
2
+ from sumy.nlp.tokenizers import Tokenizer
3
+ from sumy.summarizers.lsa import LsaSummarizer
4
+ from sumy.summarizers.lex_rank import LexRankSummarizer
5
+ from sumy.summarizers.text_rank import TextRankSummarizer
6
+ from pysummarization.nlpbase.auto_abstractor import AutoAbstractor
7
+ from pysummarization.tokenizabledoc.simple_tokenizer import SimpleTokenizer
8
+ from pysummarization.abstractabledoc.top_n_rank_abstractor import TopNRankAbstractor
9
+ from sumy.nlp.stemmers import Stemmer
10
+ from sumy.utils import get_stop_words"""
11
+
12
+ import PyPDF2
13
+ from sumy.parsers.plaintext import PlaintextParser
14
+ from sumy.nlp.tokenizers import Tokenizer
15
+ from sumy.summarizers.text_rank import TextRankSummarizer
16
+
17
def summarize_pdf_with_textrank(pdf_path, sentences_count=10):
    """
    Summarizes the content of a PDF file using the TextRank algorithm.

    Args:
        pdf_path (str): Path to the PDF file.
        sentences_count (int): Number of sentences for the summary.

    Returns:
        str: The selected sentences, one per line. If no text could be
        extracted from the PDF, a human-readable failure message is
        returned instead of a summary.
    """
    # Extract text page by page while the file is still open.
    with open(pdf_path, "rb") as pdf_file:
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        page_texts = [page.extract_text() or "" for page in pdf_reader.pages]

    # Join pages with a newline so the last word of one page does not fuse
    # with the first word of the next (which would also hide a sentence
    # boundary that falls on a page break).
    pdf_text = "\n".join(page_texts)

    # Check if text extraction was successful
    if not pdf_text.strip():
        return "Text extraction from PDF failed or PDF is empty."

    # Create a parser for the extracted text
    parser = PlaintextParser.from_string(pdf_text, Tokenizer("english"))

    # Use TextRank for summarization
    text_rank_summarizer = TextRankSummarizer()
    text_rank_summary = text_rank_summarizer(parser.document, sentences_count=sentences_count)

    # Compile summary into a single string
    return "\n".join(str(sentence) for sentence in text_rank_summary)