Peter Vandenabeele committed
Commit a92d81b
1 Parent(s): aa936e5

Clean up scraping to eliminate scripts and style, but keep other tags in order

Files changed (2):
  1. app.py +7 -7
  2. scrape_website.py +31 -46
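
The idea behind the change: instead of whitelisting a handful of text tags (p, h1-h6, li) and concatenating their contents, the new code drops only the <script> and <style> subtrees and then joins every remaining string in document order via soup.stripped_strings. A minimal sketch of that technique, on a made-up HTML snippet (nothing here is from the repo):

from bs4 import BeautifulSoup

# Toy page: one style block and one script block mixed into real content.
html = """<html>
  <head><style>p { color: red; }</style></head>
  <body><h1>Title</h1><script>alert("hi");</script><p>Body text.</p></body>
</html>"""

soup = BeautifulSoup(html, "html.parser")
for tag in soup(["style", "script"]):  # soup(...) is shorthand for soup.find_all(...)
    tag.decompose()                    # delete the tag and everything under it

print(" ".join(soup.stripped_strings))  # -> Title Body text.

Because stripped_strings walks the parse tree top to bottom, the surviving text comes out in the order it appears on the page, which is the "keep other tags in order" part of this commit.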
app.py CHANGED
@@ -2,7 +2,7 @@ import torch
 from peft import PeftModel
 import transformers
 import gradio as gr
-from scrape_website import process_webpage
+from scrape_website import process_webpages
 assert (
     "LlamaTokenizer" in transformers._import_structure["models.llama"]
 ), "LLaMA is now in HuggingFace's main branch.\nPlease reinstall it: pip uninstall transformers && pip install git+https://github.com/huggingface/transformers.git"
@@ -59,14 +59,14 @@ else:
 
 def generate_prompt(instruction, input=None):
     if input:
-        return f"""Below is an url that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
+        return f"""Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
 ### Instruction:
 {instruction}
 ### Input:
 {input}
 ### Response:"""
     else:
-        return f"""Below is an url that describes a task. Write a response that appropriately completes the request.
+        return f"""Below is an instruction that describes a task. Write a response that appropriately completes the request.
 ### Instruction:
 {instruction}
 ### Response:"""
@@ -80,7 +80,7 @@ if torch.__version__ >= "2":
 
 def evaluate(
     instruction,
-    url,
+    urls_string,
     temperature=0.1,
     top_p=0.75,
     top_k=40,
@@ -88,7 +88,7 @@ def evaluate(
     max_new_tokens=128,
     **kwargs,
 ):
-    content = process_webpage(url=url)
+    content = process_webpages(urls=urls_string.split())
     # avoid GPU memory overflow
     with torch.no_grad():
         torch.cuda.empty_cache()
@@ -122,8 +122,8 @@ g = gr.Interface(
         gr.components.Textbox(
             lines=2, label="FAQ", placeholder="Ask me anything about this website?"
        ),
-        gr.components.Textbox(lines=1, label="Website URL", placeholder="https://www.meet-drift.ai/"),
-        # gr.components.Slider(minimum=0, maximum=1, value=0.1, label="Temperature"),
+        gr.components.Textbox(lines=2, label="Website URLs", placeholder="https://www.example.org/ https://www.example.com/"),
+        gr.components.Slider(minimum=0, maximum=1, value=0.1, label="Temperature"),
         # gr.components.Slider(minimum=0, maximum=1, value=0.75, label="Top p"),
         # gr.components.Slider(minimum=0, maximum=100, step=1, value=40, label="Top k"),
         # gr.components.Slider(minimum=1, maximum=4, step=1, value=4, label="Beams"),
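
Note the knock-on change in evaluate: the single url argument becomes urls_string, and the whitespace-separated contents of the "Website URLs" textbox are split in Python before the scraper is called. Roughly (values taken from the new placeholder text):

urls_string = "https://www.example.org/ https://www.example.com/"
urls = urls_string.split()
# ['https://www.example.org/', 'https://www.example.com/']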
scrape_website.py CHANGED
@@ -1,45 +1,21 @@
 import requests
 from bs4 import BeautifulSoup
+from typing import List
 
-TOKEN_CUT_OFF = 2500
+CHARACTER_CUT_OFF = 20000
 
-def process_webpage(url:str):
-    # A set to keep track of visited pages
-    visited_pages = set()
-
-    text_list = []
-
-    # A function to recursively get all child pages
-    def get_child_pages(url):
-        # Make a GET request to the page and get the HTML content
-        response = requests.get(url)
-        html_content = response.content
-
-        # Parse the HTML content using BeautifulSoup
-        soup = BeautifulSoup(html_content, "html.parser")
-
-        # Get all the text content from the relevant HTML tags
-        text_content = ""
-        for tag in ["p", "h1", "h2", "h3", "h4", "h5", "h6", "li"]:
-            for element in soup.find_all(tag):
-                text_content += element.get_text() + " "
-
-        # Add the page to the set of visited pages
-        text_content = f"page {url} contains: " + text_content
-        visited_pages.add(url)
-
-        # Find all the child links and recursively get their text content
-        for link in soup.find_all("a"):
-            href = link.get("href")
-            if href and href not in visited_pages and url in href:
-                get_child_pages(href)
-
-        text_list.append(text_content)
-
-    # Get the text content of the landing page
-    # get_child_pages(url)
-
-    # Make a GET request to the page and get the HTML content
+
+def remove_tags(soup: BeautifulSoup) -> str:
+    for data in soup(['style', 'script']):
+        # Remove tags
+        data.decompose()
+
+    # return data by retrieving the tag content
+    return ' '.join(soup.stripped_strings)
+
+
+def read_webpage(url:str) -> str:
+    print(f"Getting the response from url : {url})")
     response = requests.get(url)
     html_content = response.content
 
@@ -47,20 +23,29 @@ def process_webpage(url:str):
     soup = BeautifulSoup(html_content, "html.parser")
 
     # Get all the text content from the relevant HTML tags
-    text_content = ""
-    for tag in ["p", "h1", "h2", "h3", "h4", "h5", "h6", "li"]:
-        for element in soup.find_all(tag):
-            text_content += element.get_text() + " "
+    text_content = remove_tags(soup)
+
+    # for tag in ["p", "h1", "h2", "h3", "h4", "h5", "h6", "li", "div"]:
+    #     for element in soup.find_all(tag):
+    #         text_content += element.get_text() + " "
 
-    # # make main page as first item
-    # text_list.reverse()
-    # text_list_cut_off = text_list[:TOKEN_CUT_OFF]
-    # page_content = "\n".join(text_list_cut_off)
-    # # Print the text content of the landing page and all child pages
-    # print(page_content)
-    # return page_content
     print(text_content)
     return text_content
 
+def process_webpages(urls:List[str]):
+    # A set to keep track of visited pages
+    visited_pages = set()
+    aggregated_text = ""
+    for url in urls:
+        visited_pages.add(url)
+        aggregated_text += f"\nGetting the content of {url}:\n"
+        aggregated_text += read_webpage(url)
+
+    return aggregated_text[:CHARACTER_CUT_OFF]
+
+
 if __name__ == '__main__':
-    process_webpage(url="https://www.meet-drift.ai/")
+    print(process_webpages(urls=[
+        "https://www.example.org",
+        "https://www.example.com",
+    ]))
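
The rewritten module can be smoke-tested without network access by stubbing requests.get. A sketch under the assumption that scrape_website.py is on the import path; the fake HTML below is mine, not from the repo:

from unittest import mock

import scrape_website

# Assumed test page: the <style>/<script> bodies should not survive scraping.
fake_html = (b"<html><head><style>p { color: red; }</style></head>"
             b"<body><p>Hello</p><script>var x = 1;</script></body></html>")

with mock.patch.object(scrape_website.requests, "get") as fake_get:
    fake_get.return_value.content = fake_html
    text = scrape_website.process_webpages(urls=["https://www.example.org"])

print(text)  # ends with "Hello"; nothing from the style/script blocks remains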