Makima57 committed
Commit 05f7c2a · verified · 1 Parent(s): 92a289d

Update app.py

Files changed (1): app.py +31 -15
app.py CHANGED
@@ -2,7 +2,7 @@ import streamlit as st
 from googlesearch import search
 import requests
 from bs4 import BeautifulSoup
-import chunk # Import the chunking functions from chunk.py
+import chunk # Importing the chunk module
 
 # Function to perform Google search and return the first two links
 def google_search(query):
@@ -29,11 +29,11 @@ def scrape_text(webpage_content):
     try:
         soup = BeautifulSoup(webpage_content, 'html.parser')
         for script in soup(["script", "style"]):
-            script.decompose()
-        text = soup.get_text()
-        lines = (line.strip() for line in text.splitlines())
-        chunks = (phrase.strip() for line in lines for phrase in line.split(" "))
-        text = '\n'.join(chunk for chunk in chunks if chunk)
+            script.decompose() # Remove unnecessary elements
+        text = soup.get_text() # Get raw text
+        lines = (line.strip() for line in text.splitlines()) # Strip lines
+        chunks = (phrase.strip() for line in lines for phrase in line.split(" ")) # Split and clean
+        text = '\n'.join(chunk for chunk in chunks if chunk) # Join cleaned text
         return text
     except Exception as e:
         st.error(f"Failed to scrape text from webpage content: {e}")
@@ -48,23 +48,39 @@ query = st.text_input("Enter search query", "")
 # Button to trigger search
 if st.button("Search"):
     if query:
-        first_two_links = google_search(query)
+        first_two_links = google_search(query) # Get first two links
         if first_two_links:
             for i, link in enumerate(first_two_links, 1):
-                st.success(f"Link {i}: [Click here]({link})")
-
+                st.success(f"Link {i}: [Click here]({link})") # Display links
+
                 # Fetch webpage content
                 webpage_content = fetch_webpage_content(link)
                 if webpage_content:
                     # Scrape text from webpage content
                     scraped_text = scrape_text(webpage_content)
-                    if scraped_text:
+
+                    if scraped_text: # Ensure scraped_text is not empty
+                        st.write(f"Scraped Content for Link {i}:")
+                        st.text(scraped_text[:500]) # Display first 500 characters of the content
+
                         # Chunk the scraped text using chunk.py
                         chunked_text = chunk.chunk_text(scraped_text)
+
+                        if chunked_text: # Ensure chunked_text is not empty
+                            st.write(f"Chunked Data for Link {i}:")
+                            for chunk_part in chunked_text:
+                                st.write(chunk_part) # Display each chunk
 
-                        st.write(f"Chunked Data for Link {i}:")
-                        for chunk_part in chunked_text:
-                            st.write(chunk_part)
+                            # Save and download chunked data using chunk.py
+                            chunk.save_and_download_chunked_data(chunked_text, file_name=f"chunked_data_link_{i}.txt")
+                        else:
+                            st.warning("No chunked data available")
+                    else:
+                        st.warning("No content scraped from this link")
+        else:
+            st.warning("No results found")
+    else:
+        st.error("Please enter a query")
 
-                        # Save and download chunked data using the function from chunk.py
-                        chunk.save_and_download_chunked_data(chunked_text, file_name=f"chunked_data_link_{i}.txt")
+
+
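
Note: the bodies of google_search and fetch_webpage_content fall outside the hunks above; only their signatures and call sites are visible. A minimal sketch consistent with how app.py calls them, assuming the googlesearch-python package (whose search() accepts num_results) and app.py's existing imports of search, requests, and st:

def google_search(query):
    # Hypothetical sketch -- return the first two result URLs, or an empty list on failure.
    try:
        return list(search(query, num_results=2))
    except Exception as e:
        st.error(f"Failed to perform Google search: {e}")
        return []

def fetch_webpage_content(link):
    # Hypothetical sketch -- fetch raw HTML for a result link; None signals a failed fetch.
    try:
        response = requests.get(link, timeout=10)
        response.raise_for_status()
        return response.text
    except Exception as e:
        st.error(f"Failed to fetch webpage content: {e}")
        return None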
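
Likewise, chunk.py itself is not part of this commit, so chunk_text and save_and_download_chunked_data are only visible by name. A minimal sketch of the interface app.py assumes -- the fixed word-count chunking and the st.download_button wiring are illustrative guesses, not the repository's actual implementation:

# chunk.py (hypothetical sketch -- only the two function names come from the diff)
import streamlit as st

def chunk_text(text, max_words=200):
    # Split the scraped text into chunks of at most max_words words each.
    words = text.split()
    return [' '.join(words[i:i + max_words]) for i in range(0, len(words), max_words)]

def save_and_download_chunked_data(chunked_text, file_name="chunked_data.txt"):
    # Join the chunks and offer them through a Streamlit download button.
    data = '\n\n'.join(chunked_text)
    st.download_button(label=f"Download {file_name}", data=data, file_name=file_name, mime="text/plain")

Returning a list from chunk_text matches how app.py both iterates over chunked_text and tests its truthiness in the if chunked_text: guard added by this commit.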