import os
import concurrent.futures

import requests
from bs4 import BeautifulSoup
import google.generativeai as genai
from langchain.llms import OpenAI
from langchain.chat_models import ChatOpenAI
from langchain.agents.agent_types import AgentType
# from langchain_experimental.agents.agent_toolkits import create_csv_agent
from llama_index.llms import OpenAI  # NOTE: shadows the langchain OpenAI import above
from llama_index import VectorStoreIndex, SimpleDirectoryReader
from llama_index import StorageContext, load_index_from_storage

# The OpenAI key is expected in the environment:
# os.environ["OPENAI_API_KEY"]
# Base URL of the help-centre section to scrape
base_url = 'https://help.storemate.cloud/docs/reports/'


def fetch_web_data(url):
    try:
        # Send a GET request to the URL
        response = requests.get(url)
        # Parse the page content with BeautifulSoup
        soup = BeautifulSoup(response.content, 'html.parser')
        # The <h1> holds the article title
        title = soup.find('h1').get_text().strip()
        # The article body is the second <div> that follows the title
        section = soup.find('h1').find_next('div').find_next('div')
        # Extract the text content from the section and append the source link
        section_text = section.get_text().strip()
        section_text = section_text + f"\nMore detail link: {url}"
        # Save the data into a text file named after the article title
        with open(f"user_guide/{title}.txt", "w") as file:
            file.write(f"{title}\n{section_text}")
    except Exception as e:
        print(f"Failed to fetch data from {url}: {e}")
def get_base_links():
    # Send a GET request to the base URL
    response = requests.get(base_url)
    # Parse the page content with BeautifulSoup
    soup = BeautifulSoup(response.content, 'html.parser')
    # Find all <a> tags with href attributes
    links = soup.find_all('a', href=True)
    # Collect all links that point into the docs section
    valid_links = []
    for link in links:
        href = link['href']
        if href.startswith("https://help.storemate.cloud/docs/"):
            valid_links.append(href)
    print("Base links collected")
    # Use ThreadPoolExecutor to fetch web data in parallel
    with concurrent.futures.ThreadPoolExecutor() as executor:
        executor.map(fetch_web_data, valid_links)
def update_user_guide():
    # Ensure the output directory exists before scraping
    os.makedirs("user_guide", exist_ok=True)
    get_base_links()
    # try:
    #     storage_context = StorageContext.from_defaults(persist_dir="llama_index")
    #     index = load_index_from_storage(storage_context=storage_context)
    #     print("loaded")
    # except:
    # Build a fresh vector index from the scraped guide pages and persist it
    documents = SimpleDirectoryReader("user_guide").load_data()
    index = VectorStoreIndex.from_documents(documents)
    index.storage_context.persist("llama_index")
    print("index created")
    return "done"