SuperExpert / tools /advanced_scraper.py
JarvisChan630's picture
first commit
75309ed
raw
history blame
1.64 kB
import os
from langchain_community.document_loaders import AsyncChromiumLoader
from langchain_community.document_transformers import BeautifulSoupTransformer
from langchain_community.document_loaders import PyPDFLoader
from langchain_core.messages import AIMessage
from fake_useragent import UserAgent
# Pick one random user-agent string at import time and publish it through the
# USER_AGENT environment variable for the remainder of the process.
# NOTE(review): presumably consumed by the langchain loaders used below when
# they issue requests — confirm against the installed langchain_community version.
ua = UserAgent()
os.environ["USER_AGENT"] = ua.random
def scraper(url: str, doc_type: str) -> dict:
    """Fetch a web page or a PDF and return its text wrapped in an ``AIMessage``.

    Args:
        url: Address of the page to render when ``doc_type`` is ``"html"``,
            or the location handed to ``PyPDFLoader`` when it is ``"pdf"``.
        doc_type: ``"html"`` or ``"pdf"`` (exact, lower-case match). Any other
            value produces an "unsupported document type" message.

    Returns:
        A dict with two keys: ``"source"`` (the ``url`` argument, unchanged)
        and ``"content"`` (an ``AIMessage``). On success the message carries
        the extracted text; on failure it carries an error description.
        Exceptions raised while loading or transforming are caught and
        reported through the message rather than propagated.
    """
    if doc_type == "html":
        try:
            loaded = AsyncChromiumLoader([url]).load()
            # Reduce the rendered page to the text of its <p> elements.
            transformed = BeautifulSoupTransformer().transform_documents(
                loaded, tags_to_extract=["p"]
            )
            # A single URL was requested, so only the first document is used.
            # Build the result once and reuse it for both the print and the
            # return value instead of constructing two identical messages.
            result = {
                "source": url,
                "content": AIMessage(transformed[0].page_content),
            }
            print(result)
            return result
        except Exception as e:
            # Best-effort contract: report the failure to the caller as text.
            return {"source": url, "content": AIMessage(f"Error scraping website: {e}")}
    elif doc_type == "pdf":
        try:
            pages = PyPDFLoader(url).load_and_split()
            # load_and_split() yields Document chunks, which are not valid
            # message content. Join their text so the PDF branch returns a
            # plain-string message, just like the HTML branch does.
            text = "\n\n".join(page.page_content for page in pages)
            return {"source": url, "content": AIMessage(text)}
        except Exception as e:
            return {"source": url, "content": AIMessage(f"Error scraping PDF: {e}")}
    else:
        return {
            "source": url,
            "content": AIMessage(
                "Unsupported document type, supported types are 'html' and 'pdf'."
            ),
        }
if __name__ == "__main__":
    # Manual smoke run: scrape a single documentation page as HTML.
    # The return value is intentionally discarded here.
    scraper(
        url="https://python.langchain.com/v0.1/docs/modules/data_connection/document_loaders/pdf/",
        doc_type="html",
    )