from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
import streamlit as st
import tiktoken

from src.utils import load_config_values
from src.dev_llm import FakeLLM
from src.pdfParser import extract_text_from_pdf


# Load in model and pipeline configuration values
system_message, context_message, model_id = load_config_values(
    config_keys=[
        "system_message",
        "context_message",
        "model_id",
    ]
)

# Tokeniser for the configured model, used to estimate prompt token counts
openai_tokeniser = tiktoken.encoding_for_model(model_id)

# TODO: Change this to reflect the prod model rather than the dev model
# Initialise fake values and a fake LLM to test out the full pipeline
tmp_llm = FakeLLM()
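
# A possible production setup (untested sketch): swap FakeLLM for a real chat
# model. This assumes the separate langchain-openai package and an
# OPENAI_API_KEY in the environment; `prod_llm` would then replace `tmp_llm`
# in the chain built below.
#
#   from langchain_openai import ChatOpenAI
#   prod_llm = ChatOpenAI(model=model_id, temperature=0)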

# Prompt template built from the configured context message; its placeholders
# are filled by the mapping at the start of the chain below
prompt = ChatPromptTemplate.from_template(
    template=context_message,
)

# Map the single value passed to chain.invoke() onto the prompt's input
# variables. Note: `uploaded_text` is a module-level name that is only
# assigned once a PDF has been uploaded, so the chain must not be invoked
# before then.
chain = (
    {
        "system_message": lambda x: system_message,
        "pdf_text": lambda x: uploaded_text,
        "data_to_extract": RunnablePassthrough(),
    }
    | prompt
    | tmp_llm
)

# Free-text description of the data the user wants extracted from the PDF
user_input_1 = st.text_input(label="input")


uploaded_doc = st.file_uploader(
    label="Upload PDF:",
    type=".pdf",
)
if uploaded_doc is not None:
    uploaded_text = extract_text_from_pdf(uploaded_doc)

    # Run the extraction chain on the user's request and show the raw output
    st.text(chain.invoke(user_input_1))

    # Estimate the total prompt size in tokens: the PDF text plus the system
    # and context messages
    text_length = len(openai_tokeniser.encode(uploaded_text))
    system_message_length = len(openai_tokeniser.encode(system_message))
    context_message_length = len(openai_tokeniser.encode(context_message))
    st.text(text_length + system_message_length + context_message_length)
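
    # Possible follow-up (illustrative sketch only): warn when the combined
    # prompt would not fit the model's context window. The limit below is an
    # assumed placeholder, not a value taken from the config.
    #
    #   ASSUMED_CONTEXT_WINDOW = 8192
    #   total_tokens = text_length + system_message_length + context_message_length
    #   if total_tokens > ASSUMED_CONTEXT_WINDOW:
    #       st.warning("The PDF is too long to fit in the model's context window.")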