Makima57 committed (verified)
Commit df2d3aa · Parent: 330cdb8

Update app.py

Files changed (1)
  1. app.py  +27 -26
app.py CHANGED
@@ -2,13 +2,19 @@ import streamlit as st
 from googlesearch import search
 import requests
 from bs4 import BeautifulSoup
-import chunk  # Import the chunking function from chunk.py
+import chunk  # Import the chunking functionality from app2.py
 
 # Function to perform Google search and return the first two links
 def google_search(query):
     try:
-        search_results = search(query, num_results=2)  # Get first two results
-        first_two_links = [next(search_results, None), next(search_results, None)]
+        query = query + "/t site:https://medium.com/"
+        search_results = search(query, num_results=10)  # Get up to 10 results
+        first_two_links = []
+        for i, link in enumerate(search_results):
+            if i < 2:
+                first_two_links.append(link)
+            else:
+                break
         return first_two_links
     except Exception as e:
         st.error(f"An error occurred: {e}")
@@ -28,11 +34,15 @@ def fetch_webpage_content(url):
 def scrape_text(webpage_content):
     try:
         soup = BeautifulSoup(webpage_content, 'html.parser')
+        # Remove all script and style elements
         for script in soup(["script", "style"]):
             script.decompose()
         text = soup.get_text()
+        # Break the text into lines and remove leading/trailing spaces
         lines = (line.strip() for line in text.splitlines())
+        # Break multi-headlines into a line each
         chunks = (phrase.strip() for line in lines for phrase in line.split(" "))
+        # Drop blank lines
         text = '\n'.join(chunk for chunk in chunks if chunk)
         return text
     except Exception as e:
@@ -40,7 +50,7 @@ def scrape_text(webpage_content):
         return None
 
 # Streamlit app UI
-st.title("Search and Chunk Webpage Content")
+st.title("Search Link Finder")
 
 # Input field for search query
 query = st.text_input("Enter search query", "")
@@ -50,8 +60,8 @@ if st.button("Search"):
     if query:
         first_two_links = google_search(query)
         if first_two_links:
-            for i, link in enumerate(first_two_links, 1):
-                st.success(f"Link {i}: [Click here]({link})")
+            for i, link in enumerate(first_two_links):
+                st.success(f"Link {i+1}: [Click here]({link})")
 
                 # Fetch webpage content
                 webpage_content = fetch_webpage_content(link)
@@ -59,28 +69,19 @@ if st.button("Search"):
                 # Scrape text from webpage content
                 scraped_text = scrape_text(webpage_content)
                 if scraped_text:
-                    # Chunk the scraped text using chunk.py
-                    chunked_text = chunk.chunk_text(scraped_text)
-
-                    chunk.display_chunks(chunked_text)
-                    # Save chunked data to a .txt file for later use
-                    file_name = f"chunked_data_link_{i}.txt"
-                    with open(file_name, "w") as f:
-                        f.write("\n---\n".join(chunked_text))  # Separate chunks by a line break and delimiter
-
-                    st.write(f"Chunked Data for Link {i}:")
-                    for chunk_part in chunked_text:
-                        st.write(chunk_part)
-
-                    # Provide a unique key for each download button
+                    st.write(f"Scraped Content from Link {i+1} (Chunked):")
+
+                    # Call the chunking function from app2.py
+                    chunk.display_chunks(scraped_text)
+
+                    # Option to download the entire scraped content
                     st.download_button(
-                        label=f"Download Chunked Webpage Content for Link {i}",
-                        data="\n---\n".join(chunked_text),
-                        file_name=file_name,
-                        mime="text/plain",
-                        key=f"download_button_{i}"  # Unique key for each button
+                        label=f"Download Full Webpage Content from Link {i+1}",
+                        data=scraped_text,
+                        file_name=f"webpage_content_{i+1}.txt",
+                        mime="text/plain"
                     )
                 else:
                     st.warning("No results found")
     else:
-        st.error("Please enter a query")
+        st.error("Please enter a query")
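A note on the rewritten query in the first hunk: the num_results keyword matches the googlesearch-python package on PyPI (the commit shows no requirements file, so the exact dependency is an assumption). The appended "/t site:https://medium.com/" reads like a stray escape plus a full URL; the site: operator conventionally takes a bare domain after a space, as in this minimal sketch (the query string is purely illustrative):

from googlesearch import search  # assumed: the googlesearch-python package

# Conventional form of a site-restricted query: a space before the operator
# and a bare domain. The committed code appends "/t site:https://medium.com/"
# instead, which Google treats as part of the search terms.
results = search("streamlit tutorial site:medium.com", num_results=10)
for url in results:
    print(url)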
 
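The new code depends on chunk.display_chunks, but the chunk module itself is not part of this commit, and the comments disagree on where it lives (chunk.py vs. app2.py). A minimal sketch of a compatible chunk.py, assuming fixed-size word chunking rendered with Streamlit; the max_words default and both function bodies are assumptions, not the repository's implementation:

import streamlit as st

def chunk_text(text, max_words=200):
    # Split raw text into chunks of at most max_words words each.
    # The 200-word default is assumed, not taken from the repo.
    words = text.split()
    return [" ".join(words[i:i + max_words])
            for i in range(0, len(words), max_words)]

def display_chunks(text):
    # Chunk the raw text and render each piece in the Streamlit app,
    # matching the new call site: chunk.display_chunks(scraped_text).
    for i, part in enumerate(chunk_text(text), 1):
        st.write(f"Chunk {i}:")
        st.write(part)

One caveat: a local chunk.py shadows Python's standard-library chunk module (deprecated in 3.11, removed in 3.13), so the import resolves to the local file; renaming the module would avoid surprises on older interpreters.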