mishrasahil934 committed on
Commit
baec4bc
·
verified ·
1 Parent(s): 648594a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +11 -20
app.py CHANGED
@@ -27,26 +27,20 @@ def extract_text_from_pdf(pdf_path):
27
 
28
  # Web Scraping Function
29
  def scrape_article(url):
30
- response = requests.get(url, timeout=10)
31
- response.raise_for_status() # Raise an error if the request fails
32
- soup = BeautifulSoup(response.content, 'html.parser')
33
 
34
- # Extract the main content (common tags for articles)
35
- paragraphs = soup.find_all('p')
36
- article_text = "\n".join([para.get_text() for para in paragraphs])
37
 
38
- if not article_text.strip():
39
- raise ValueError("Unable to extract content from the page.")
40
- return article_text
41
-
42
- # Function to split long text into smaller chunks
43
- def split_text(input_text, max_tokens=512):
44
- words = input_text.split()
45
- return [" ".join(words[i:i + max_tokens]) for i in range(0, len(words), max_tokens)]
46
 
47
  # LLM pipeline for summarization
48
  def llm_pipeline(input_text):
49
- chunks = split_text(input_text, max_tokens=512) # Split text into chunks
50
  pipe_sum = pipeline(
51
  'summarization',
52
  model=base_model,
@@ -54,11 +48,8 @@ def llm_pipeline(input_text):
54
  max_length=500,
55
  min_length=50,
56
  )
57
- summaries = []
58
- for chunk in chunks:
59
- result = pipe_sum(chunk)
60
- summaries.append(result[0]['summary_text'])
61
- return " ".join(summaries)
62
 
63
  @st.cache_data
64
  # Function to display the PDF
 
27
 
28
  # Web Scraping Function
29
  def scrape_article(url):
30
+ response = requests.get(url, timeout=10)
31
+ response.raise_for_status() # Raise an error if the request fails
32
+ soup = BeautifulSoup(response.content, 'html.parser')
33
 
34
+ # Extract the main content (common tags for articles)
35
+ paragraphs = soup.find_all('p')
36
+ article_text = "\n".join([para.get_text() for para in paragraphs])
37
 
38
+ if not article_text.strip():
39
+ raise ValueError("Unable to extract content from the page.")
40
+ return article_text
 
 
 
 
 
41
 
42
  # LLM pipeline for summarization
43
  def llm_pipeline(input_text):
 
44
  pipe_sum = pipeline(
45
  'summarization',
46
  model=base_model,
 
48
  max_length=500,
49
  min_length=50,
50
  )
51
+ result = pipe_sum(input_text)
52
+ return result[0]['summary_text']
 
 
 
53
 
54
  @st.cache_data
55
  # Function to display the PDF