# pdf-to-table / app_langchain.py
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
import streamlit as st
import tiktoken
from src.utils import load_config_values
from src.dev_llm import FakeLLM
from src.pdfParser import extract_text_from_pdf

# Load model and pipeline configuration values
system_message, context_message, model_id = load_config_values(
    config_keys=[
        "system_message",
        "context_message",
        "model_id",
    ]
)
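
# The config source itself is defined in src.utils; something like the
# following YAML is assumed here (keys matching config_keys above), purely
# as an illustrative sketch -- the real values live in the repo's config:
#
#   system_message: "You are a data-extraction assistant."
#   context_message: "{system_message}\n\nExtract {data_to_extract} from:\n{pdf_text}"
#   model_id: "gpt-4o-mini"

# Tokeniser used to estimate how many tokens the assembled prompt will consume.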
openai_tokeniser = tiktoken.encoding_for_model(model_id)
# TODO: Change this to reflect the prod model rather than the dev model
# Initialise a fake LLM to test out the full pipeline
tmp_llm = FakeLLM()
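
# The production swap might look like this (assuming the langchain-openai
# package; left commented out while the fake LLM is in use):
# from langchain_openai import ChatOpenAI
# tmp_llm = ChatOpenAI(model=model_id)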

prompt = ChatPromptTemplate.from_template(
    template=context_message,
)
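
# Map the prompt variables: the user's query passes straight through as
# "data_to_extract", while "system_message" and "pdf_text" are injected via
# lambdas. Note that "pdf_text" resolves uploaded_text lazily at invoke
# time, so uploaded_text must be assigned before chain.invoke() runs.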
chain = (
    {
        "system_message": lambda x: system_message,
        "pdf_text": lambda x: uploaded_text,
        "data_to_extract": RunnablePassthrough(),
    }
    | prompt
    | tmp_llm
)

user_input_1 = st.text_input(label="Data to extract")
uploaded_doc = st.file_uploader(
    label="Upload PDF:",
    type=".pdf",
)
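
# Once a PDF is uploaded: extract its text, run the chain on the user's
# query, and show a rough total token count (document + system message +
# context message) for the configured model.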
if uploaded_doc is not None:
    uploaded_text = extract_text_from_pdf(uploaded_doc)
    st.text(chain.invoke(user_input_1))
    text_length = len(openai_tokeniser.encode(uploaded_text))
    system_message_length = len(openai_tokeniser.encode(system_message))
    context_message_length = len(openai_tokeniser.encode(context_message))
    st.text(text_length + system_message_length + context_message_length)
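
    # A possible follow-up (sketch only, not part of the original pipeline):
    # guard against overflowing the model's context window before sending a
    # real request. The limit below is an assumed placeholder -- check the
    # actual model's documented maximum.
    # MAX_CONTEXT_TOKENS = 128_000  # assumption, not read from config
    # total_tokens = text_length + system_message_length + context_message_length
    # if total_tokens > MAX_CONTEXT_TOKENS:
    #     st.warning("Inputs may exceed the model's context window.")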