# Hugging Face file-view metadata: jojortz — "add timeout to model run" (commit 26197e0); raw / history blame / 11.9 kB
import base64
import json
import os
import requests
import anthropic
import openai
from dotenv import load_dotenv
from pathlib import Path
from llama_parse import LlamaParse
from llama_index.core import SimpleDirectoryReader
from unstructured.partition.auto import partition
from preprocessors.preprocessor import PdfPreprocessor
from postprocessors.postprocessor import ClaudePostprocessor, GPTPostprocessor
# Load API keys (OPENAI_API_KEY, ANTHROPIC_API_KEY, etc.) from a local .env file.
load_dotenv()
class Model:
BASE_URL: str | None = None
API_KEY: str | None = None
MODEL: str | None = None
REQUIRES_OPENAI: bool = False
REQUIRES_ANTHROPIC: bool = False
PROMPT: str = "Convert these images to markdown"
def __init_subclass__(cls) -> None:
"""Initialize subclass."""
super().__init_subclass__()
def __init__(self):
if self.REQUIRES_OPENAI:
if not self.API_KEY:
raise ValueError("Model api key is not provided")
if not self.MODEL:
raise ValueError("Model name is not provided")
if self.BASE_URL:
self._client = openai.OpenAI(
base_url=self.BASE_URL,
api_key=self.API_KEY,
)
else:
self._client = openai.OpenAI(api_key=self.API_KEY)
elif self.REQUIRES_ANTHROPIC:
if not self.API_KEY:
raise ValueError("Model api key is not provided")
if not self.MODEL:
raise ValueError("Model name is not provided")
self._client = anthropic.Anthropic(
api_key=self.API_KEY,
)
def run(self, file_path: str) -> str:
"""Extract model.
Args:
file_path: path to file to extract
Returns:
str: output markdown
"""
raise NotImplementedError("Model extract method is not implemented")
class CambioVQA0713(Model):
    """Cambio VQA model served from an OpenAI-compatible endpoint."""

    BASE_URL = "http://44.242.239.38:8000/v1"
    # SECURITY(review): hard-coded credential committed to source — move to an
    # environment variable like the other models in this file.
    API_KEY = "Cambioml2024!"
    MODEL = "cambiollm-dust-preview-0713"
    REQUIRES_OPENAI = True
    # When True, request beam search (temperature 0, best-of-2).
    USE_BEAM_SEARCH = True

    def __init__(self):
        """Init."""
        super().__init__()

    def run(self, file_path: str) -> str:
        """Extract data in real-time.

        Args:
            file_path (str): The path to the file to be parsed.

        Returns:
            str: The extracted markdown, or an error string on failure.
        """
        try:
            pdf_preprocessor = PdfPreprocessor()
            file_contents = pdf_preprocessor.run(file_path)
            # NOTE: only the first page image is sent to the model; any
            # remaining pages returned by the preprocessor are ignored.
            # (A dead loop that built image parts for every page but never
            # used them was removed here.)
            messages = [
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "text",
                            "text": "Convert this image to markdown\nOutput figures\nOutput charts\nOutput tables\nOutput footnotes\nOutput headers\nOutput footers\nOutput page nums",
                        },
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": f"data:image/jpeg;base64,{file_contents[0]}",
                            },
                        },
                    ],
                }
            ]
            print('Cambio Model - ready to run: ', json.dumps(messages[0])[:200])
            if self.USE_BEAM_SEARCH:
                response = self._client.chat.completions.create(
                    model=self.MODEL,
                    messages=messages,
                    top_p=1,
                    temperature=0,
                    extra_body={
                        "top_k": -1,
                        "use_beam_search": True,
                        "best_of": 2,
                    },
                )
            else:
                response = self._client.chat.completions.create(
                    model=self.MODEL,
                    messages=messages,
                    max_tokens=1024,
                    temperature=0.3,
                    top_p=0.7,
                    extra_body={
                        "top_k": 20,
                    },
                )
            print('Cambio Model - response: ', response.choices[0].message.content)
            return response.choices[0].message.content
        except Exception as e:
            print(f"Error processing input: {str(e)}")
            return f"Error processing with CambioVQA0713: {str(e)}"
class AnyParserModel(Model):
    """AnyParser real-time extraction API client."""

    BASE_URL = "https://k7u1c342dc.execute-api.us-west-2.amazonaws.com/v1/extract"
    API_KEY = os.getenv('ANYPARSER_RT_API_KEY')

    def run(self, file_path: str) -> str:
        """Extract data in real-time.

        Args:
            file_path (str): The path to the file to be parsed.

        Returns:
            str: The extracted markdown, or an error string. (The original
            returned 2-tuples on error paths despite the ``-> str``
            annotation; error details are now folded into one string, matching
            the other models in this file.)
        """
        path = Path(file_path)
        file_extension = path.suffix.lower().lstrip(".")
        # Fail fast on missing input.
        if not path.is_file():
            return "Error: File does not exist"
        # Only PDF and DOCX are accepted by the endpoint.
        if file_extension not in ("pdf", "docx"):
            return "Error: Unsupported file type"
        # Encode the file content in base64 for the JSON payload.
        encoded_file = base64.b64encode(path.read_bytes()).decode("utf-8")
        payload = {
            "file_content": encoded_file,
            "file_type": file_extension,
        }
        headers = {
            "Content-Type": "application/json",
            "x-api-key": self.API_KEY,
        }
        # Send the POST request; the timeout bounds how long a stuck
        # server can hang the caller.
        response = requests.post(
            self.BASE_URL, headers=headers, data=json.dumps(payload), timeout=30
        )
        if response.status_code != 200:
            return f"Error: {response.status_code}\nResponse: {response.text}"
        try:
            response_data = response.json()
            # The API returns one markdown string per page.
            return "\n".join(response_data["markdown"])
        except json.JSONDecodeError:
            return f"Error: Invalid JSON response\nResponse: {response.text}"
class LlamaParseModel(Model):
    """PDF extraction backed by the LlamaParse cloud service."""

    BASE_URL = None
    API_KEY = os.getenv('LLAMA_CLOUD_API_KEY')

    def __init__(self):
        """Init."""
        super().__init__()
        # The LlamaParse client cannot work without a cloud key.
        if not self.API_KEY:
            raise ValueError("The API key is required. Please set the LLAMA_CLOUD_API_KEY environment variable.")

    def run(self, file_path: str) -> str:
        """Extract data in real-time.

        Args:
            file_path (str): The path to the file to be parsed.

        Returns:
            str: The extracted data.
        """
        try:
            markdown_parser = LlamaParse(
                result_type="markdown",
                num_workers=4,
                verbose=True,
                language="en",
            )
            # Route .pdf files through the LlamaParse extractor.
            reader = SimpleDirectoryReader(
                input_files=[file_path],
                file_extractor={".pdf": markdown_parser},
            )
            docs = reader.load_data()
            # Join document texts with a blank line between them.
            return "\n\n".join(doc.text for doc in docs)
        except Exception as e:
            print(f"Error processing input: {str(e)}")
            return f"Error processing with LlamaParse: {str(e)}"
class UnstructuredModel(Model):
    """Extraction via the `unstructured` auto-partitioner (no API key needed)."""

    BASE_URL = None
    API_KEY = None

    def __init__(self):
        """Init."""
        super().__init__()

    def run(self, file_path: str) -> str:
        """Extract data in real-time.

        Args:
            file_path (str): The path to the file to be parsed.

        Returns:
            str: The extracted data.
        """
        try:
            elements = partition(file_path)
            # Keep only elements that carry text, one per line.
            texts = [el.text for el in elements if el.text]
            if not texts:
                # Handle case where no content is parsed
                return "No content parsed"
            return "\n".join(texts)
        except Exception as e:
            return f"Error processing UnstructuredModel: {str(e)}"
class GPTModel(Model):
    """Vision extraction through OpenAI gpt-4o-mini with GPT post-processing."""

    BASE_URL = None
    API_KEY = os.getenv("OPENAI_API_KEY")
    MODEL = "gpt-4o-mini"
    REQUIRES_OPENAI = True

    def __init__(self):
        """Init."""
        super().__init__()

    def run(self, file_path: str) -> str:
        """Extract data in real-time.

        Args:
            file_path (str): The path to the file to be parsed.

        Returns:
            str: The extracted data.
        """
        try:
            preprocessor = PdfPreprocessor()
            postprocessor = GPTPostprocessor()
            page_images = preprocessor.run(file_path)
            # One image_url part per page, as a base64 JPEG data URL.
            image_parts = [
                {
                    "type": "image_url",
                    "image_url": {"url": f"data:image/jpeg;base64,{img}"},
                }
                for img in page_images
            ]
            messages = [
                {
                    "role": "user",
                    "content": [{"type": "text", "text": self.PROMPT}, *image_parts],
                }
            ]
            response = self._client.chat.completions.create(
                model=self.MODEL,
                messages=messages,
            )
            return postprocessor.run(response.choices[0].message.content)
        except Exception as e:
            print(f"Error processing input: {str(e)}")
            return f"Error processing with GPTModel: {str(e)}"
class ClaudeModel(Model):
    """Vision extraction through Claude 3.5 Sonnet with Claude post-processing."""

    # NOTE(review): BASE_URL is never used — Model.__init__ builds the
    # Anthropic client without a base_url, so requests go to the default
    # Anthropic endpoint. Confirm whether this override is still intended.
    BASE_URL = "http://103.114.163.134:3000/v1/"
    API_KEY = os.getenv("ANTHROPIC_API_KEY")
    MODEL = "claude-3-5-sonnet-20240620"
    REQUIRES_ANTHROPIC = True

    def __init__(self):
        """Init."""
        super().__init__()

    def run(self, file_path: str) -> str:
        """Extract data in real-time.

        Args:
            file_path (str): The path to the file to be parsed.

        Returns:
            str: The extracted data, or an error string on failure.
        """
        try:
            prompt = self.PROMPT
            pdf_preprocessor = PdfPreprocessor()
            claude_postprocessor = ClaudePostprocessor()
            file_contents = pdf_preprocessor.run(file_path)
            # One base64 JPEG image block per page.
            contents = []
            for content in file_contents:
                contents.append(
                    {
                        "type": "image",
                        "source": {
                            "type": "base64",
                            "media_type": "image/jpeg",
                            "data": content,
                        }
                    })
            messages = [
                {"role": "user", "content": [
                    {"type": "text", "text": prompt},
                    *contents,
                ]}
            ]
            # Use the configured MODEL attribute instead of a duplicated
            # string literal so the class attribute stays the single
            # source of truth for the model name.
            response = self._client.messages.create(
                model=self.MODEL, max_tokens=1024, messages=messages
            )
            print('-----------\n\n***Anthropic Response:\n\n ', response.content[0].text)
            return claude_postprocessor.run(response.content[0].text)
        except Exception as e:
            return f"Error processing ClaudeModel: {str(e)}"