File size: 1,549 Bytes
4e00df7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
from .common import process_file
from langchain.document_loaders import UnstructuredHTMLLoader
import requests
import re
import unicodedata
import tempfile
import os
import streamlit as st
from streamlit.runtime.uploaded_file_manager import UploadedFileRec, UploadedFile

def process_html(vector_store, file, stats_db):
    """Process an HTML file by delegating to ``process_file``.

    Forwards ``vector_store`` and ``file`` unchanged, together with the
    ``UnstructuredHTMLLoader`` class, the ``".html"`` string and ``stats_db``
    (as a keyword argument), and returns whatever ``process_file`` returns.
    """
    return process_file(
        vector_store,
        file,
        UnstructuredHTMLLoader,
        ".html",
        stats_db=stats_db,
    )


def get_html(url, timeout=30):
    """Fetch ``url`` with an HTTP GET and return the response body as text.

    Args:
        url: Address to request.
        timeout: Seconds to wait for the server before giving up. Passed
            straight to ``requests.get``; without it an unresponsive server
            would block the caller indefinitely.

    Returns:
        The response text when the status code is 200, otherwise ``None``.

    Raises:
        requests.exceptions.RequestException: Propagated unchanged for
            connection failures and timeouts.
    """
    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        return None
    return response.text

def create_html_file(url, content):
    """Write ``content`` to a temporary ``.html`` file and wrap it as an upload.

    The file name is ``slugify(url) + ".html"`` and the file is created in the
    directory returned by ``tempfile.gettempdir()``. The file is not removed
    here; ``delete_tempfile`` can be used for that.

    Args:
        url: Source URL, used only to derive the file name.
        content: HTML text to store.

    Returns:
        A ``(uploaded_file, temp_file_path)`` tuple: an ``UploadedFile`` built
        from the bytes of the written file, and the path of that file.
    """
    file_name = slugify(url) + ".html"
    temp_file_path = os.path.join(tempfile.gettempdir(), file_name)

    # Explicit UTF-8 so arbitrary page content can be written regardless of
    # the platform's default locale encoding.
    with open(temp_file_path, 'w', encoding='utf-8') as temp_file:
        temp_file.write(content)

    # Read back through a context manager so the handle is closed promptly
    # instead of being left to the garbage collector.
    with open(temp_file_path, 'rb') as saved_file:
        data = saved_file.read()

    record = UploadedFileRec(id=None, name=file_name, type='text/html', data=data)
    uploaded_file = UploadedFile(record)

    return uploaded_file, temp_file_path

def delete_tempfile(temp_file_path, url, ret):
    """Remove the temporary file and optionally report the outcome in the UI.

    Args:
        temp_file_path: Path of the file to delete.
        url: URL shown in the status message.
        ret: When truthy, a status line is written with ``st.write``; when
            falsy, nothing is written to the Streamlit page.

    An ``OSError`` raised inside the ``try`` body is printed to stdout and
    not re-raised.
    """
    notify = bool(ret)
    try:
        os.remove(temp_file_path)
        if notify:
            st.write(f"✅ Content saved... {url}  ")
    except OSError as e:
        print(f"Error while deleting the temporary file: {str(e)}")
        if notify:
            st.write(f"❌ Error while saving content... {url}  ")

def slugify(text):
    """Turn ``text`` into a lowercase, hyphen-separated ASCII slug.

    Steps, in order: NFKD-normalise and drop every non-ASCII character,
    remove anything that is not a word character, whitespace or hyphen,
    trim surrounding whitespace and lowercase, then collapse each run of
    hyphens/whitespace into a single hyphen.
    """
    decomposed = unicodedata.normalize('NFKD', text)
    ascii_only = decomposed.encode('ascii', 'ignore').decode('utf-8')
    cleaned = re.sub(r'[^\w\s-]', '', ascii_only)
    cleaned = cleaned.strip().lower()
    return re.sub(r'[-\s]+', '-', cleaned)