Spaces: Runtime error

Peter Vandenabeele committed • Commit a92d81b
1 Parent(s): aa936e5

Clean up scraping to eliminate scripts and style, but keep other tags in order

Files changed:
- app.py (+7 -7)
- scrape_website.py (+31 -46)
app.py CHANGED

@@ -2,7 +2,7 @@ import torch
 from peft import PeftModel
 import transformers
 import gradio as gr
-from scrape_website import …
+from scrape_website import process_webpages
 assert (
     "LlamaTokenizer" in transformers._import_structure["models.llama"]
 ), "LLaMA is now in HuggingFace's main branch.\nPlease reinstall it: pip uninstall transformers && pip install git+https://github.com/huggingface/transformers.git"

@@ -59,14 +59,14 @@ else:
 
 def generate_prompt(instruction, input=None):
     if input:
-        return f"""Below is an …
+        return f"""Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
 ### Instruction:
 {instruction}
 ### Input:
 {input}
 ### Response:"""
     else:
-        return f"""Below is an …
+        return f"""Below is an instruction that describes a task. Write a response that appropriately completes the request.
 ### Instruction:
 {instruction}
 ### Response:"""

@@ -80,7 +80,7 @@ if torch.__version__ >= "2":
 
 def evaluate(
     instruction,
-    …
+    urls_string,
     temperature=0.1,
     top_p=0.75,
     top_k=40,

@@ -88,7 +88,7 @@ def evaluate(
     max_new_tokens=128,
     **kwargs,
 ):
-    content = …
+    content = process_webpages(urls=urls_string.split())
     # avoid GPU memory overflow
     with torch.no_grad():
         torch.cuda.empty_cache()

@@ -122,8 +122,8 @@ g = gr.Interface(
         gr.components.Textbox(
             lines=2, label="FAQ", placeholder="Ask me anything about this website?"
         ),
-        gr.components.Textbox(lines=…
-        …
+        gr.components.Textbox(lines=2, label="Website URLs", placeholder="https://www.example.org/ https://www.example.com/"),
+        gr.components.Slider(minimum=0, maximum=1, value=0.1, label="Temperature"),
         # gr.components.Slider(minimum=0, maximum=1, value=0.75, label="Top p"),
         # gr.components.Slider(minimum=0, maximum=100, step=1, value=40, label="Top k"),
         # gr.components.Slider(minimum=1, maximum=4, step=1, value=4, label="Beams"),
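
Note on the app.py changes: the commit adds a second positional Gradio input (the "Website URLs" textbox) and re-enables the temperature slider, so the order of the `inputs` list has to line up with the parameters of `evaluate(instruction, urls_string, temperature, ...)`. Below is a minimal, self-contained sketch of that wiring; the scraper and the model call are stubbed out here, since they are not part of this hunk, so this is an illustration rather than the committed app.py.

import gradio as gr


def process_webpages(urls):
    # Stand-in for scrape_website.process_webpages (assumption: it returns one text blob).
    return " ".join(f"page {u} contains ..." for u in urls)


def evaluate(instruction, urls_string, temperature=0.1):
    # The "Website URLs" textbox delivers a single whitespace-separated string,
    # which is split into a list of URLs before scraping, as in the committed code.
    content = process_webpages(urls=urls_string.split())
    # A real implementation would build a prompt and run the model here.
    return f"(temperature={temperature}) {instruction}\n{content}"


demo = gr.Interface(
    fn=evaluate,
    inputs=[
        gr.components.Textbox(lines=2, label="FAQ", placeholder="Ask me anything about this website?"),
        gr.components.Textbox(lines=2, label="Website URLs", placeholder="https://www.example.org/ https://www.example.com/"),
        gr.components.Slider(minimum=0, maximum=1, value=0.1, label="Temperature"),
    ],
    outputs="text",
)

if __name__ == "__main__":
    demo.launch()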
scrape_website.py CHANGED

@@ -1,45 +1,21 @@
 import requests
 from bs4 import BeautifulSoup
+from typing import List
 
+CHARACTER_CUT_OFF = 20000
 
-def process_webpage(url:str):
-    # A set to keep track of visited pages
-    visited_pages = set()
-
-    text_list = []
-
-    # A function to recursively get all child pages
-    def get_child_pages(url):
-        # Make a GET request to the page and get the HTML content
-        response = requests.get(url)
-        html_content = response.content
-
-        # Parse the HTML content using BeautifulSoup
-        soup = BeautifulSoup(html_content, "html.parser")
-
-        # Get all the text content from the relevant HTML tags
-        text_content = ""
-        for tag in ["p", "h1", "h2", "h3", "h4", "h5", "h6", "li"]:
-            for element in soup.find_all(tag):
-                text_content += element.get_text() + " "
-
-        # Add the page to the set of visited pages
-        text_content = f"page {url} contains: " + text_content
-        visited_pages.add(url)
-        …
-                get_child_pages(href)
-
-    # Get the text content of the landing page
-    # get_child_pages(url)
+
+def remove_tags(soup: BeautifulSoup) -> str:
+    for data in soup(['style', 'script']):
+        # Remove tags
+        data.decompose()
+
+    # return data by retrieving the tag content
+    return ' '.join(soup.stripped_strings)
+
+
+def read_webpage(url:str) -> str:
+    print(f"Getting the response from url : {url})")
     response = requests.get(url)
     html_content = response.content
 

@@ -47,20 +23,29 @@ def process_webpage(url:str):
     soup = BeautifulSoup(html_content, "html.parser")
 
     # Get all the text content from the relevant HTML tags
-    text_content = …
-    …
+    text_content = remove_tags(soup)
+
+    # for tag in ["p", "h1", "h2", "h3", "h4", "h5", "h6", "li", "div"]:
+    #     for element in soup.find_all(tag):
+    #         text_content += element.get_text() + " "
 
-    # # make main page as first item
-    # text_list.reverse()
-    # text_list_cut_off = text_list[:TOKEN_CUT_OFF]
-    # page_content = "\n".join(text_list_cut_off)
-    # # Print the text content of the landing page and all child pages
-    # print(page_content)
-    # return page_content
     print(text_content)
     return text_content
 
+
+def process_webpages(urls:List[str]):
+    # A set to keep track of visited pages
+    visited_pages = set()
+    aggregated_text = ""
+    for url in urls:
+        visited_pages.add(url)
+        aggregated_text += f"\nGetting the content of {url}:\n"
+        aggregated_text += read_webpage(url)
+
+    return aggregated_text[:CHARACTER_CUT_OFF]
+
+
 if __name__ == '__main__':
-    …
+    print(process_webpages(urls=[
+        "https://www.example.org",
+        "https://www.example.com",
+    ]))
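
Note on the scrape_website.py changes: the core of the cleanup is `remove_tags`, which, instead of whitelisting a fixed set of tags, decomposes the `style` and `script` elements and then joins `soup.stripped_strings`, keeping the text of all remaining tags in document order. A small, self-contained check of that behaviour on an inline HTML snippet (the snippet itself is made up for illustration):

from bs4 import BeautifulSoup

html = """
<html>
  <head><style>body { color: red; }</style></head>
  <body>
    <h1>Title</h1>
    <script>console.log("skip me");</script>
    <p>First paragraph.</p>
    <li>An item</li>
  </body>
</html>
"""

soup = BeautifulSoup(html, "html.parser")
for data in soup(["style", "script"]):
    data.decompose()  # drop the tag and everything inside it

print(" ".join(soup.stripped_strings))
# Expected output: Title First paragraph. An item

The script and style contents never reach the model prompt, while headings, paragraphs, and list items survive in their original order, which is what the commit message describes. The `CHARACTER_CUT_OFF` constant then truncates the aggregated text of all pages to 20,000 characters before it is returned to app.py.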