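# Streamlit RAG app: upload PDFs, index them with llama_index, and answer
# questions with an OpenAI model. Assuming the file is saved as app.py, launch
# it with: streamlit run app.py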
import streamlit as st
import os
from llama_index.core import SimpleDirectoryReader, Document, VectorStoreIndex
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.llms.openai import OpenAI
from llama_index.core.node_parser import TokenTextSplitter
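# Dependency note (assumed environment): pip install streamlit llama-index
# llama-index-llms-openai llama-index-embeddings-openai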

# Initial configuration and environment variables
st.title("Search PDFs")

# User input for the OpenAI API key
st.write("Get your OpenAI API key")
OPENAI_API_KEY = st.text_input('OpenAI API Key', type='password')

if OPENAI_API_KEY:
    os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY

    # Main function
    def main():
        st.write("Streamlit application running correctly")

        # Set up the required folders
        source_data_folder = "MisDatos"
        path_db = "VectorDB"
        os.makedirs(source_data_folder, exist_ok=True)
        os.makedirs(path_db, exist_ok=True)

        # Upload PDF files and save each one into the source data folder
        uploaded_files = st.file_uploader("Upload PDF files", type="pdf", accept_multiple_files=True)
        if uploaded_files:
            for uploaded_file in uploaded_files:
                with open(os.path.join(source_data_folder, uploaded_file.name), "wb") as f:
                    f.write(uploaded_file.getbuffer())
            st.write(f"{len(uploaded_files)} files uploaded successfully")

            # Read and parse the PDFs
            loader = SimpleDirectoryReader(source_data_folder)
            data_on_pdf = loader.load_data()
            st.write(f"Loaded {len(data_on_pdf)} documents")

            # Preprocess text: collapse runs of whitespace into single spaces
            def preprocess_text(text):
                cleaned_text = ' '.join(text.split())
                return cleaned_text
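
            # Quick illustration (comment only):
            #   preprocess_text("a\n  b\t c")  ->  "a b c"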

            # Split the documents into 1024-token chunks with a 200-token
            # overlap, so context is preserved across chunk boundaries
            text_splitter = TokenTextSplitter(
                chunk_size=1024,
                chunk_overlap=200
            )
            splits = []
            for doc in data_on_pdf:
                cleaned_text = preprocess_text(doc.text)
                split_docs = text_splitter.split_text(cleaned_text)
                split_docs = [Document(text=chunk, metadata=doc.metadata) for chunk in split_docs]
                splits.extend(split_docs)
            st.write(f"Created {len(splits)} chunks")

            # Embeddings
            embeddings_model = OpenAIEmbedding(model="text-embedding-ada-002")

            # Build the vector index; the correct keyword is embed_model, not embedding_model
            index = VectorStoreIndex.from_documents(splits, embed_model=embeddings_model)
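
            # Optional sketch (not in the original code): the VectorDB folder
            # created above is otherwise unused; the index could be persisted
            # there with llama_index's storage context, e.g.:
            #   index.storage_context.persist(persist_dir=path_db)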

            # Create the query engine
            query_engine = index.as_query_engine()

            # LLM configuration; the llama_index OpenAI wrapper takes model=, not model_name=
            llm = OpenAI(api_key=OPENAI_API_KEY, model="gpt-3.5-turbo-0125", temperature=0.8)

            # RAG pipeline: retrieve relevant chunks, then ask the LLM with them as context
            def rag_pipeline(question):
                response = query_engine.query(question)
                # The retrieved chunks are exposed as response.source_nodes
                context = "\n\n".join(node.node.get_content() for node in response.source_nodes)
                prompt = f"Context:\n{context}\n\nQuestion: {question}\nAnswer:"
                # llama_index LLMs are invoked with .complete(), not called directly
                llm_response = llm.complete(prompt)
                return llm_response.text
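
            # Design note: query_engine.query() already returns an LLM-synthesized
            # answer, so str(response) could be shown directly; the explicit prompt
            # above presumably keeps the final prompt format under the app's control.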

            # User interaction
            pregunta = st.text_area("Ask a question about the documents:")
            if pregunta:
                try:
                    response = rag_pipeline(pregunta)
                    st.markdown(response)
                except Exception as e:
                    st.error(f"Error while processing the question: {e}")

    main()
else:
    st.write("Please provide the API key to continue.")