# Page-status residue captured with this file (not part of the module): "Spaces: Running / Running"
import datetime | |
import uuid | |
from langchain.embeddings import OpenAIEmbeddings | |
from langchain.vectorstores import Chroma | |
from langchain.text_splitter import RecursiveCharacterTextSplitter | |
import os | |
from langchain.document_loaders import WebBaseLoader, TextLoader, Docx2txtLoader, PyMuPDFLoader | |
from whatsapp_chat_custom import WhatsAppChatLoader # use this instead of from langchain.document_loaders import WhatsAppChatLoader | |
from collections import deque | |
import re | |
from bs4 import BeautifulSoup | |
import requests | |
from urllib.parse import urlparse | |
import mimetypes | |
from pathlib import Path | |
import tiktoken | |
# Anchored regex used to recognise absolute http(s) links.
HTTP_URL_PATTERN = r'^http[s]*://.+'

# Load the extension -> MIME type table before it is read below.
mimetypes.init()

# Every known file extension whose MIME major type is image, video or audio.
# Kept as a tuple so it can be handed straight to str.endswith().
media_files = tuple(
    ext
    for ext, mime_type in mimetypes.types_map.items()
    if mime_type.split('/')[0] in ['image', 'video', 'audio']
)

# Links containing any of these substrings are dropped by the crawler.
filter_strings = ['/email-protection#']
def transformApi(api_key=''):
    """Resolve the API key that should be used for a request.

    Args:
        api_key: Key supplied by the caller. May be ``None`` or empty.

    Returns:
        * the string ``'Null'`` when no key was supplied;
        * the value of the ``OPENAI_API_KEY`` environment variable when the
          supplied key equals the (non-empty) ``TEMP_PWD`` environment
          variable;
        * otherwise ``api_key`` unchanged.
    """
    # Reject missing keys before touching the environment. Comparing first
    # would let ``None`` match an unset TEMP_PWD (os.getenv returns None) and
    # hand out the server's own key.
    if api_key is None or api_key == '':
        return 'Null'

    temp_pwd = os.getenv("TEMP_PWD")
    # Only a configured, non-empty password unlocks the server key.
    if temp_pwd and api_key == temp_pwd:
        return os.getenv("OPENAI_API_KEY")

    return api_key
def get_hyperlinks(url, timeout=10):
    """Fetch ``url`` and return the raw ``href`` of every ``<a>`` tag on it.

    Args:
        url: Page to download.
        timeout: Seconds passed to ``requests.get`` so an unresponsive server
            cannot block the caller indefinitely.

    Returns:
        A list of href strings exactly as they appear in the page. An empty
        list is returned when the request fails, the response status is
        4xx/5xx, or the response is not ``text/html``.
    """
    try:
        reqs = requests.get(url, timeout=timeout)
        # The header may be absent; treat that as "not HTML" instead of
        # relying on an AttributeError being swallowed below.
        content_type = reqs.headers.get('Content-Type') or ''
        if not content_type.startswith("text/html") or 400 <= reqs.status_code < 600:
            return []
        soup = BeautifulSoup(reqs.text, 'html.parser')
    except Exception as e:
        # Deliberately broad: fetching is best-effort, so any failure on one
        # page is reported and that page simply contributes no links.
        print(e)
        return []

    return [link.get('href') for link in soup.find_all('a', href=True)]
# Function to get the hyperlinks from a URL that are within the same domain
def get_domain_hyperlinks(local_domain, url):
    """Return the de-duplicated links found on ``url`` that stay on ``local_domain``.

    Args:
        local_domain: Host (netloc) the crawl is restricted to. A leading
            ``www.`` is ignored when comparing hosts.
        url: Page whose links are collected via ``get_hyperlinks``.

    Returns:
        A list of unique absolute URLs. Absolute http(s) links are kept only
        when their host matches ``local_domain``; other links are treated as
        relative and rebuilt against ``https://<local_domain>/`` (or against
        ``url`` itself when ``url`` contains ``wp-content/uploads``). Links
        containing any entry of ``filter_strings`` are dropped.
    """
    def _bare_host(host):
        # Strip only a *leading* "www." - a plain str.replace would also
        # mangle hosts that merely contain that substring.
        host = host.lower()
        return host[4:] if host.startswith('www.') else host

    # In-page anchors, bare query strings and non-navigable pseudo-links.
    skipped_prefixes = ("#", '?', 'mailto:', 'javascript:', 'tel:', 'data:')
    wanted_host = _bare_host(local_domain)

    clean_links = set()
    for link in set(get_hyperlinks(url)):
        link = link.strip()
        # Protocol-relative link ("//host/path"): give it the page's scheme so
        # it goes through the same-domain check instead of being glued onto
        # the local domain as if it were a path.
        if link.startswith('//'):
            link = (urlparse(url).scheme or 'https') + ':' + link

        clean_link = None
        # If the link is a URL, check if it is within the same domain
        if re.search(HTTP_URL_PATTERN, link):
            if _bare_host(urlparse(link).netloc) == wanted_host:
                clean_link = link
        # If the link is not a URL, treat it as a relative link
        else:
            if link.startswith("/"):
                link = link[1:]
            elif link.lower().startswith(skipped_prefixes):
                continue
            if 'wp-content/uploads' in url:
                clean_link = url + "/" + link
            else:
                clean_link = "https://" + local_domain + "/" + link

        if clean_link is not None:
            clean_link = clean_link.strip().rstrip('/').replace('/../', '/')
            if not any(x in clean_link for x in filter_strings):
                clean_links.add(clean_link)

    # Return the list of hyperlinks that are within the same domain
    return list(clean_links)
# this function will get you a list of all the URLs from the base URL
def crawl(url, local_domain, prog=None, max_links=100):
    """Collect same-domain URLs reachable from ``url``.

    Args:
        url: Starting page; always part of the result.
        local_domain: Host the crawl is restricted to (forwarded to
            ``get_domain_hyperlinks``).
        prog: Optional progress callback, invoked as
            ``prog(1, desc='Crawling: <url>')`` after each page is processed.
        max_links: Stop as soon as this many distinct URLs have been seen.
            Defaults to 100, the limit that used to be hard-coded.

    Returns:
        The set of URLs seen so far (including ``url``).
    """
    # URLs that have already been seen (no duplicates)
    seen = {url}
    if len(seen) >= max_links:
        return seen

    # URLs still to visit. Entries are taken from the same end they are
    # appended to, so the most recently discovered link is visited first.
    pending = deque([url])
    while pending:
        url_pop = pending.pop()
        for link in get_domain_hyperlinks(local_domain, url_pop):
            if link not in seen:
                pending.append(link)
                seen.add(link)
                if len(seen) >= max_links:
                    return seen
        if prog is not None:
            prog(1, desc=f'Crawling: {url_pop}')
    return seen
def ingestURL(documents, url, crawling=True, prog=None):
    """Load the page(s) behind ``url`` and append them to ``documents``.

    Args:
        documents: List that loaded documents are appended to (mutated in place).
        url: Starting URL; trailing slashes are removed before use.
        crawling: When true, ``crawl`` is used to discover further pages on
            the same host; otherwise only ``url`` itself is loaded.
        prog: Optional progress callback called as ``prog(fraction, desc=...)``.

    Returns:
        The same ``documents`` list. It is returned untouched when ``url`` has
        no host part or does not start with ``http``.
    """
    url = url.rstrip('/')
    # The host part of the URL limits the crawl to a single site.
    local_domain = urlparse(url).netloc
    if not local_domain or not url.startswith('http'):
        return documents
    print('Loading URL', url)

    if not crawling:
        links = set([url])
    else:
        # crawl to get other webpages from this URL
        if prog is not None:
            prog(0, desc=f'Crawling: {url}')
        links = crawl(url, local_domain, prog)
        if prog is not None:
            prog(1, desc=f'Crawling: {url}')

    # Sort links into PDFs and ordinary pages; media files are discarded.
    pdf_links = []
    c_links = []
    for candidate in links:
        if candidate.endswith('.pdf'):
            pdf_links.append(candidate)
            continue
        if candidate.endswith(media_files):
            continue
        c_links.append(candidate)

    if prog is not None:
        prog(0.5, desc=f'Ingesting: {url}')

    # Ordinary pages are loaded in one batch by WebBaseLoader.
    if c_links:
        documents.extend(WebBaseLoader(c_links).load())

    # Remote PDFs are loaded one by one; each page's 'source' metadata is
    # overwritten with the loader's own ``source`` attribute.
    for pdf_link in pdf_links:
        pdf_loader = PyMuPDFLoader(pdf_link)
        pages = pdf_loader.load()
        for page in pages:
            page.metadata['source'] = pdf_loader.source
        documents.extend(pages)
    return documents
def ingestFiles(documents, files_list, prog=None):
    """Load every supported file in ``files_list`` and append it to ``documents``.

    The loader is chosen from the file name (extension match is
    case-insensitive):

    * ``.pdf``                      -> ``PyMuPDFLoader``
    * ``.txt`` (non-WhatsApp)       -> ``TextLoader``
    * ``.doc`` / ``.docx``          -> ``Docx2txtLoader``
    * ``WhatsApp Chat with*.csv``   -> ``WhatsAppChatLoader``
      (convert WhatsApp TXT exports to CSV using https://whatstk.streamlit.app/)

    Any other path is skipped silently.

    Args:
        documents: List that loaded documents are appended to (mutated in place).
        files_list: Iterable of file path strings.
        prog: Optional progress callback, called as
            ``prog(1, desc='Loaded file: <file name>')`` for each loaded file.

    Returns:
        The same ``documents`` list.
    """
    for fPath in files_list:
        doc = None
        lowered = fPath.lower()
        is_whatsapp = 'WhatsApp Chat with' in fPath

        if lowered.endswith('.pdf'):
            doc = PyMuPDFLoader(fPath).load()
        elif lowered.endswith('.txt') and not is_whatsapp:
            doc = TextLoader(fPath).load()
        elif lowered.endswith(('.doc', '.docx')):
            doc = Docx2txtLoader(fPath).load()
        elif is_whatsapp and lowered.endswith('.csv'):
            doc = WhatsAppChatLoader(fPath).load()

        # ``doc`` may be None (unsupported file) or an empty list (loader
        # produced nothing); both are skipped without indexing into it.
        if doc and doc[0].page_content:
            if prog is not None:
                # Report the file name only, not the whole path.
                prog(1, desc='Loaded file: ' + os.path.basename(fPath))
            print('Loaded file:', fPath)
            documents.extend(doc)
    return documents
def data_ingestion(inputDir=None, file_list=None, url_list=None, prog=None):
    """Gather documents from a directory, explicit files and URLs.

    Args:
        inputDir: Optional directory; every file below it (recursively) is
            offered to ``ingestFiles``.
        file_list: Optional iterable of file paths for ``ingestFiles``.
        url_list: Optional iterable of URLs, each handed to ``ingestURL``.
        prog: Optional progress callback forwarded to the file-list and URL
            ingestion steps (not to the directory step).

    Returns:
        A list of loaded documents. For every document whose ``source``
        metadata does not mention ``'WhatsApp Chat with'``, ``page_content``
        is stripped and has newlines (real and literal ``\\n``) replaced by
        spaces.
    """
    documents = []

    # Ingestion from Input Directory
    if inputDir is not None:
        files = [str(x) for x in Path(inputDir).glob('**/*') if x.is_file()]
        documents = ingestFiles(documents, files)

    if file_list:
        documents = ingestFiles(documents, file_list, prog)

    # Ingestion from URLs - also try https://python.langchain.com/docs/integrations/document_loaders/recursive_url_loader
    for url in url_list or []:
        documents = ingestURL(documents, url, prog=prog)

    # Cleanup documents
    for x in documents:
        # A document without 'source' metadata is cleaned like any other
        # non-WhatsApp document instead of raising KeyError.
        source = str(x.metadata.get('source') or '')
        if 'WhatsApp Chat with' not in source:
            # The last replace collapses double spaces; replacing a single
            # space with a single space would be a no-op.
            x.page_content = (
                x.page_content.strip()
                .replace('\n', ' ')
                .replace('\\n', ' ')
                .replace('  ', ' ')
            )
    return documents
def split_docs(documents):
    """Split ``documents`` into overlapping chunks.

    Uses ``RecursiveCharacterTextSplitter`` with 2500-character chunks and a
    250-character overlap, and returns whatever ``split_documents`` yields.
    """
    # Author's sizing note: a chunk size of 4000 makes around 1k tokens per
    # doc; with k=4 that means about 4k tokens of input to the LLM.
    splitter = RecursiveCharacterTextSplitter(chunk_overlap=250, chunk_size=2500)
    return splitter.split_documents(documents)
def getSourcesFromMetadata(metadata, sourceOnly=True, sepFileUrl=True):
    """Summarise the distinct sources found in a list of metadata dicts.

    Args:
        metadata: List of metadata dicts from all documents; ``None`` entries
            are ignored. Each dict must contain a ``'source'`` key.
        sourceOnly: When true, each entry is just the source (file name for
            paths, full string for anything containing ``'http'``). When
            ``False``, entries look like ``'source: <src>, page: <n>, title: <t>'``
            (only the ``page`` and ``title`` keys are included, when not None).
        sepFileUrl: When true, the listing is split into a ``Files:`` and a
            ``URLs:`` section; otherwise a single numbered list is produced.

    Returns:
        A ``(text, count)`` tuple: the numbered, case-insensitively sorted
        listing and the number of distinct entries.
    """
    def _numbered(entries):
        # Case-insensitive order; the raw string breaks ties so the output
        # does not depend on set iteration order.
        ordered = sorted(entries, key=lambda s: (s.casefold(), s))
        return '\n'.join(f"{i+1}) {x}" for i, x in enumerate(ordered))

    setSrc = set()
    for x in metadata:
        # Skip missing metadata instead of recording an empty source, which
        # would show up as a blank line and inflate the count.
        if x is None:
            continue
        source = x['source']
        # Local paths are reduced to their file name; URLs are kept whole.
        source = source.rsplit('/', 1)[-1] if 'http' not in source else source
        if sourceOnly == False:  # noqa: E712 - keeps the original's 0/False equivalence
            notSource = [
                f"{k}: {v}"
                for k, v in x.items()
                if v is not None and k in ('page', 'title')
            ]
            setSrc.add(', '.join([f'source: {source}'] + notSource))
        else:
            setSrc.add(source)

    if not sepFileUrl:
        return _numbered(setSrc), len(setSrc)

    src_files = _numbered(x for x in setSrc if 'http' not in x)
    src_urls = _numbered(x for x in setSrc if 'http' in x)
    src_files = 'Files:\n' + src_files if src_files else ''
    src_urls = 'URLs:\n' + src_urls if src_urls else ''
    newLineSep = '\n\n' if src_files and src_urls else ''
    return src_files + newLineSep + src_urls, len(setSrc)
def getVsDict(embeddingFunc, docs, vsDict=None):
    """Create (or reuse) a Chroma store and replace its contents with ``docs``.

    Args:
        embeddingFunc: Embedding function handed to ``Chroma`` when a new
            client has to be created.
        docs: Documents to add to the store.
        vsDict: Optional dict holding ``'chromaClient'`` (and ``'chromaDir'``).
            When given it is updated in place and reused; when omitted a
            fresh dict is created for this call.

    Returns:
        The dict containing ``'chromaClient'`` and, if the client was created
        here, ``'chromaDir'`` (a new ``./vecstore/<uuid>`` directory).
    """
    # A new dict per call: a shared default dict would make unrelated callers
    # reuse one client and clear each other's documents.
    if vsDict is None:
        vsDict = {}

    # create chroma client if doesnt exist
    client = vsDict.get('chromaClient')
    if client is None:
        vsDict['chromaDir'] = './vecstore/' + str(uuid.uuid1())
        client = Chroma(embedding_function=embeddingFunc, persist_directory=vsDict['chromaDir'])
        vsDict['chromaClient'] = client

    # clear chroma client before adding new docs
    if client._collection.count() > 0:
        client.delete(client.get()['ids'])

    # add new docs to chroma client
    client.add_documents(docs)
    print('vectorstore count:', client._collection.count(), 'at', datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
    return vsDict
# used for Hardcoded documents only - not uploaded by user (userData_vecStore is separate function)
def localData_vecStore(openApiKey=None, inputDir=None, file_list=None, url_list=None, vsDict=None):
    """Ingest the given sources, embed them and load them into a Chroma store.

    Args:
        openApiKey: API key passed to ``OpenAIEmbeddings``.
        inputDir: Optional directory forwarded to ``data_ingestion``.
        file_list: Optional list of file paths forwarded to ``data_ingestion``.
        url_list: Optional list of URLs forwarded to ``data_ingestion``.
        vsDict: Optional existing vector-store dict to reuse (updated in
            place by ``getVsDict``); a fresh dict is used when omitted.

    Returns:
        The vector-store dict returned by ``getVsDict``, or ``{}`` when no
        documents could be ingested.
    """
    documents = data_ingestion(inputDir, file_list or [], url_list or [])
    if not documents:
        return {}
    docs = split_docs(documents)

    # Embeddings
    embeddings = OpenAIEmbeddings(openai_api_key=openApiKey)

    # A fresh dict per call avoids sharing (and clearing) one store between
    # callers that did not pass their own ``vsDict``.
    vsDict_hd = getVsDict(embeddings, docs, {} if vsDict is None else vsDict)

    # get sources from metadata
    listing, n_sources = getSourcesFromMetadata(vsDict_hd['chromaClient'].get()['metadatas'])
    src_str = str(n_sources) + ' source document(s) successfully loaded in vector store.' + '\n\n' + listing
    print(src_str)
    return vsDict_hd
def num_tokens_from_string(string, encoding_name = "cl100k_base"):
    """Return how many tokens ``string`` encodes to under the tiktoken encoding ``encoding_name``."""
    return len(tiktoken.get_encoding(encoding_name).encode(string))