# user_guide_sync.py
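"""Synchronise the Storemate help-centre user guide into a local vector index.

The script collects the article links found on the help page, saves each
article's text under the user_guide/ directory, and builds a llama_index
vector index that is persisted to the llama_index/ directory.
"""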
import os
import concurrent.futures

import requests
from bs4 import BeautifulSoup

import google.generativeai as genai
from langchain.chat_models import ChatOpenAI
from langchain.agents.agent_types import AgentType
# from langchain_experimental.agents.agent_toolkits import create_csv_agent

from llama_index.llms import OpenAI
from llama_index import VectorStoreIndex, SimpleDirectoryReader
from llama_index import StorageContext, load_index_from_storage

# The OpenAI API key is expected to be set in the environment:
# os.environ["OPENAI_API_KEY"]
# URL of the help page whose article links will be scraped
base_url = 'https://help.storemate.cloud/docs/reports/'
def fetch_web_data(url):
    try:
        # Send a GET request to the URL
        response = requests.get(url, timeout=30)

        # Parse the page content with BeautifulSoup
        soup = BeautifulSoup(response.content, 'html.parser')

        # The article title is the first <h1>; the body sits two <div>s below it
        title = soup.find('h1').get_text().strip()
        section = soup.find('h1').find_next('div').find_next('div')

        # Extract the text content from the section and append the source link
        section_text = section.get_text().strip()
        section_text = section_text + f"\nMore detail link: {url}"

        # Save the article into a text file (slashes in the title would break the path)
        safe_title = title.replace('/', '-')
        with open(f"user_guide/{safe_title}.txt", "w", encoding="utf-8") as file:
            file.write(f"{title}\n{section_text}")
    except Exception as e:
        print(f"Failed to fetch data from {url}: {e}")
def get_base_links():
    # Send a GET request to the base URL
    response = requests.get(base_url, timeout=30)

    # Parse the page content with BeautifulSoup
    soup = BeautifulSoup(response.content, 'html.parser')

    # Find all <a> tags with href attributes and keep the documentation links
    links = soup.find_all('a', href=True)
    valid_links = []
    for link in links:
        href = link['href']
        if href.startswith("https://help.storemate.cloud/docs/"):
            valid_links.append(href)
    print("Base links collected")

    # Make sure the output directory exists before the workers write into it
    os.makedirs("user_guide", exist_ok=True)

    # Use ThreadPoolExecutor to fetch the article pages in parallel
    with concurrent.futures.ThreadPoolExecutor() as executor:
        executor.map(fetch_web_data, valid_links)
def update_user_guide():
    get_base_links()

    # The index is rebuilt from scratch on every sync.  Loading the previously
    # persisted index would instead look like:
    #   storage_context = StorageContext.from_defaults(persist_dir="llama_index")
    #   index = load_index_from_storage(storage_context=storage_context)
    documents = SimpleDirectoryReader("user_guide").load_data()
    index = VectorStoreIndex.from_documents(documents)
    index.storage_context.persist("llama_index")
    print("index created")
    return "done"