# RAG / app.py
import tempfile
import os
import tiktoken
import streamlit as st
from llama_index.core import (
VectorStoreIndex,
Settings,
)
from llama_parse import LlamaParse
from streamlit_pdf_viewer import pdf_viewer
class MistralTokens:
"""
Returns tokens for MistralAI models.
See: https://docs.mistral.ai/guides/tokenization/
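
    Example (a minimal sketch, assuming `mistral_common` is installed and the
    model name is a valid Mistral API model):

        tokenizer = MistralTokens("mistral-large-latest")
        num_tokens = len(tokenizer("How many tokens is this?"))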
"""
def __init__(self, llm_name):
from mistral_common.tokens.tokenizers.mistral import MistralTokenizer
if 'open-mistral-nemo' in llm_name:
self.tokenizer = MistralTokenizer.v3(is_tekken=True)
else:
# This might work for all models, but their documentation is unclear.
self.tokenizer = MistralTokenizer.from_model(llm_name)
def __call__(self, input):
"""This returns all the tokens indices in a list since LlamaIndex seems to count by calling `len()` on the tokenizer function."""
from mistral_common.protocol.instruct.messages import UserMessage
from mistral_common.protocol.instruct.request import ChatCompletionRequest
return self.tokenizer.encode_chat_completion(
ChatCompletionRequest(
tools=[],
messages=[
UserMessage(content=input)
]
)
).tokens
class GeminiTokens:
"""
Returns tokens for Gemini models.
See: https://medium.com/google-cloud/counting-gemini-text-tokens-locally-with-the-vertex-ai-sdk-78979fea6244
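
    Example (a minimal sketch, assuming the `vertexai` SDK with local tokenizer
    support is installed):

        tokenizer = GeminiTokens("gemini-1.5-flash")
        num_tokens = len(tokenizer("How many tokens is this?"))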
"""
def __init__(self, llm_name):
from vertexai.preview import tokenization
self.tokenizer = tokenization.get_tokenizer_for_model(llm_name)
def __call__(self, input):
"""This returns all the tokens in a list since LlamaIndex seems to count by calling `len()` on the tokenizer function."""
tokens = []
        for token_info in self.tokenizer.compute_tokens(input).token_info_list:
            tokens += token_info.tokens
return tokens
def main():
with st.sidebar:
st.title('Document Summarization and QA System')
# Select Provider
provider = st.selectbox(
label="Select LLM Provider",
options=['google', 'huggingface', 'mistralai', 'openai'],
index=3
)
# Select LLM
if provider == 'google':
llm_list = ['gemini-1.0-pro', 'gemini-1.5-flash', 'gemini-1.5-pro']
elif provider == 'huggingface':
llm_list = []
elif provider == 'mistralai':
llm_list = ["mistral-large-latest", "open-mistral-nemo-latest"]
elif provider == 'openai':
llm_list = ['gpt-3.5-turbo', 'gpt-4', 'gpt-4-turbo', 'gpt-4o', 'gpt-4o-mini']
else:
llm_list = []
if provider == 'huggingface':
llm_name = st.text_input(
"Enter LLM namespace/model-name",
value="HuggingFaceH4/zephyr-7b-alpha",
)
# Also give the user the option for different embedding models, too
embed_name = st.text_input(
label="Enter embedding namespace/model-name",
value="BAAI/bge-small-en-v1.5",
)
else:
llm_name = st.selectbox(
label="Select LLM Model",
options=llm_list,
index=0
)
# Temperature
temperature = st.slider(
"Temperature",
min_value=0.0,
max_value=1.0,
value=0.0,
step=0.05,
)
# Enter Parsing API Key
parse_key = st.text_input(
"Enter your LlamaParse API Key",
value=None
)
# Enter LLM API Key
llm_key = st.text_input(
"Enter your LLM provider API Key",
value=None,
)
# Create LLM
# Global tokenization needs to be consistent with LLM for token counting
# https://docs.llamaindex.ai/en/stable/module_guides/models/llms/
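        # Settings.tokenizer must be a callable that returns a list, since LlamaIndex
        # derives token counts by calling `len()` on its output; the wrapper classes
        # above adapt each provider's native tokenizer to this convention.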
if llm_key is not None:
if provider == 'google':
from llama_index.llms.gemini import Gemini
from llama_index.embeddings.gemini import GeminiEmbedding
max_output_tokens = 8192 # https://firebase.google.com/docs/vertex-ai/gemini-models
os.environ['GOOGLE_API_KEY'] = str(llm_key)
Settings.llm = Gemini(
model=f"models/{llm_name}",
token=os.environ.get("GOOGLE_API_KEY"),
temperature=temperature,
max_tokens=max_output_tokens
)
Settings.tokenizer = GeminiTokens(llm_name)
Settings.num_output = max_output_tokens
Settings.embed_model = GeminiEmbedding(
model_name="models/text-embedding-004", api_key=os.environ.get("GOOGLE_API_KEY") #, title="this is a document"
)
if llm_name == 'gemini-1.0-pro':
total_token_limit = 32760
else:
                    total_token_limit = 1_000_000  # 1M-token context window for the Gemini 1.5 models
Settings.context_window = total_token_limit - max_output_tokens # Gemini counts total tokens
elif provider == 'huggingface':
if llm_name is not None and embed_name is not None:
from llama_index.llms.huggingface_api import HuggingFaceInferenceAPI
from llama_index.embeddings.huggingface import HuggingFaceInferenceAPIEmbedding
from transformers import AutoTokenizer
max_output_tokens = 2048 # Just a generic value
os.environ['HFTOKEN'] = str(llm_key)
Settings.llm = HuggingFaceInferenceAPI(
model_name=llm_name,
token=os.environ.get("HFTOKEN"),
temperature=temperature,
max_tokens=max_output_tokens
)
                    Settings.tokenizer = AutoTokenizer.from_pretrained(
                        llm_name,
                        token=os.environ.get("HFTOKEN"),
                    ).encode  # expose a list-returning callable so LlamaIndex's len()-based counting works
Settings.num_output = max_output_tokens
Settings.embed_model = HuggingFaceInferenceAPIEmbedding(
model_name=embed_name
)
Settings.context_window = 4096 # Just a generic value
elif provider == 'mistralai':
from llama_index.llms.mistralai import MistralAI
from llama_index.embeddings.mistralai import MistralAIEmbedding
max_output_tokens = 8192 # Based on internet consensus since this is not well documented
os.environ['MISTRAL_API_KEY'] = str(llm_key)
Settings.llm = MistralAI(
model=llm_name,
temperature=temperature,
max_tokens=max_output_tokens,
random_seed=42,
safe_mode=True
)
Settings.tokenizer = MistralTokens(llm_name)
Settings.num_output = max_output_tokens
Settings.embed_model = MistralAIEmbedding(
model_name="mistral-embed",
api_key=os.environ.get("MISTRAL_API_KEY")
)
Settings.context_window = 128000 # 128k for flagship models - doesn't seem to count input tokens
elif provider == 'openai':
from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding
# https://platform.openai.com/docs/models/gpt-4-turbo-and-gpt-4
if llm_name == 'gpt-3.5-turbo':
max_output_tokens = 4096
context_window = 16385
elif llm_name == 'gpt-4' :
max_output_tokens = 8192
context_window = 8192
                elif llm_name == 'gpt-4-turbo':
max_output_tokens = 4096
context_window = 128000
elif llm_name == 'gpt-4o':
max_output_tokens = 4096
context_window = 128000
elif llm_name == 'gpt-4o-mini':
max_output_tokens = 16384
context_window = 128000
os.environ["OPENAI_API_KEY"] = str(llm_key)
Settings.llm = OpenAI(
model=llm_name,
temperature=temperature,
max_tokens=max_output_tokens
)
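                # tiktoken's `encode` returns a list of token ids, so `len()` of its
                # output gives the token count LlamaIndex expects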
Settings.tokenizer = tiktoken.encoding_for_model(llm_name).encode
Settings.num_output = max_output_tokens
Settings.embed_model = OpenAIEmbedding()
Settings.context_window = context_window
else:
raise NotImplementedError(f"{provider} is not supported yet")
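        # Upload a PDF; LlamaParse extracts its text, which becomes the RAG context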
uploaded_file = st.file_uploader(
"Choose a PDF file to upload",
type=['pdf'],
accept_multiple_files=False
)
parsed_document = None
if uploaded_file is not None:
# Parse the file
parser = LlamaParse(
api_key=parse_key, # Can also be set in your env as LLAMA_CLOUD_API_KEY
result_type="text" # "markdown" and "text" are available
)
# Create a temporary directory to save the file then load and parse it
temp_dir = tempfile.TemporaryDirectory()
temp_filename = os.path.join(temp_dir.name, uploaded_file.name)
with open(temp_filename, "wb") as f:
f.write(uploaded_file.getvalue())
parsed_document = parser.load_data(temp_filename)
temp_dir.cleanup()
col1, col2 = st.columns(2)
with col2:
tab1, tab2 = st.tabs(["Uploaded File", "Parsed File",])
with tab1:
if uploaded_file is not None: # Display the pdf
bytes_data = uploaded_file.getvalue()
pdf_viewer(input=bytes_data, width=700)
with tab2:
            if parsed_document is not None: # Show the raw parsing result
st.write(parsed_document)
with col1:
st.markdown(
"""
# Instructions
1. Obtain an [API Key](https://cloud.llamaindex.ai/api-key) from LlamaParse to parse your document.
2. Obtain a similar API Key from your preferred LLM provider. Note, if you are using [Hugging Face](https://huggingface.co/models) you may need to request access to a model if it is gated.
3. Make selections at the left and upload a document to use as context.
4. Begin asking questions below!
"""
)
st.divider()
        prompt_txt = 'You are a trusted scientific expert that only responds truthfully to inquiries. Summarize this document in 3-5 sentences.'
prompt = st.text_area(
label="Enter your query.",
key="prompt_widget",
value=prompt_txt
)
run = st.button("Answer", type="primary")
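        # Build an in-memory vector index over the parsed document using the globally
        # configured embedding model, then answer the query with the selected LLM.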
if parsed_document is not None and run:
index = VectorStoreIndex.from_documents(parsed_document)
query_engine = index.as_query_engine()
response = query_engine.query(prompt)
st.write(response.response)
if __name__ == '__main__':
# Global configurations
# from llama_index.core import set_global_handler
# set_global_handler("langfuse")
# Also add API Key for this if using
st.set_page_config(layout="wide")
main()