    markdown = f"# {document['Title']}\n\n"
    for section in document['Sections']:
        section_number = section['SectionNumber']
        section_title = section['Title']
        markdown += f"## {section_number}. {section_title}\n\n"
        markdown += f"{section['Content']}\n\n"
        for subsection in section.get('Subsections', []):
            subsection_number = subsection['SectionNumber']
            subsection_title = subsection['Title']
            markdown += f"### {subsection_number} {subsection_title}\n\n"
            markdown += f"{subsection['Content']}\n\n"
        markdown += "\n"
    return markdown
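
# For reference, the outline shape this function walks (inferred from the keys
# accessed above; the exact field types are an assumption, not a fixed schema):
#
# {
#     "Title": "Document title",
#     "Sections": [
#         {
#             "SectionNumber": "1",
#             "Title": "Introduction",
#             "Content": "Section body...",
#             "Subsections": [
#                 {"SectionNumber": "1.1", "Title": "Scope", "Content": "..."}
#             ],
#         }
#     ],
# }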

async def load_documents(documents: List[UploadFile]) -> List[str]:
    """
    Load and parse documents using LlamaParse.

    Args:
        documents (List[UploadFile]): List of uploaded document files.

    Returns:
        List[str]: List of parsed document contents.
    """
    parser = LlamaParse(
        api_key=os.getenv("LLAMA_PARSE_API_KEY"),
        result_type="markdown",
        num_workers=4,
        verbose=True,
        language="en",
    )
    # Save uploaded files temporarily
    temp_files = []
    for doc in documents:
        temp_file_path = f"/tmp/{doc.filename}"
        with open(temp_file_path, "wb") as buffer:
            content = await doc.read()
            buffer.write(content)
        temp_files.append(temp_file_path)
    try:
        # Use LlamaParse to extract content
        print(f"processing files {temp_files}")
        parsed_documents = await parser.aload_data(temp_files)
        documents_list = [doc.text for doc in parsed_documents]
        return documents_list
    finally:
        # Clean up temporary files
        for temp_file in temp_files:
            os.remove(temp_file)
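
# NOTE: the parser reads its key from the LLAMA_PARSE_API_KEY environment
# variable, and the temp-file step assumes a writable POSIX /tmp. A more
# portable variant (a sketch, not the code used above) would lean on tempfile
# while keeping the file extension so LlamaParse can detect the file type:
#
#   suffix = os.path.splitext(doc.filename)[1]
#   with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
#       tmp.write(await doc.read())
#       temp_files.append(tmp.name)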

router = APIRouter()


class JsonDocumentResponse(BaseModel):
    json_document: Dict


# class JsonDocumentRequest(BaseModel):
#     query: str
#     template: bool = False
#     images: Optional[List[UploadFile]] = File(None)
#     documents: Optional[List[UploadFile]] = File(None)
#     conversation_id: str = ""


class MarkdownDocumentRequest(BaseModel):
    json_document: Dict
    query: str
    template: bool = False
    conversation_id: str = ""
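
# Example request body for MarkdownDocumentRequest (illustrative values only):
#
# {
#     "json_document": {"Title": "...", "Sections": [...]},
#     "query": "Generate a technical design document for ...",
#     "template": false,
#     "conversation_id": "abc123"
# }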

MESSAGE_DELIMITER = b"\n---DELIMITER---\n"


def yield_message(message):
    message_json = json.dumps(message, ensure_ascii=False).encode('utf-8')
    return message_json + MESSAGE_DELIMITER
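
# On the consumer side (a sketch, assuming an httpx streaming client), the
# delimiter lets a reader recover whole JSON messages from the byte stream:
#
#   buffer = b""
#   async with client.stream("POST", url, json=payload) as response:
#       async for chunk in response.aiter_bytes():
#           buffer += chunk
#           while MESSAGE_DELIMITER in buffer:
#               raw, buffer = buffer.split(MESSAGE_DELIMITER, 1)
#               message = json.loads(raw)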

async def generate_document_stream(
    document_generator: DocumentGenerator,
    document_outline: Dict,
    query: str,
    template: bool = False,
    conversation_id: str = "",
):
    document_generator.document_outline = document_outline
    db_manager = DatabaseManager()
    overall_objective = query
    document_layout = json.dumps(document_generator.document_outline, indent=2)

    cache_key = f"image_context_{conversation_id}"
    image_context = await FastAPICache.get_backend().get(cache_key)

    SECTION_PROMPT_SYSTEM = (
        DOCUMENT_SECTION_PROMPT_SYSTEM
        if not template
        else DOCUMENT_TEMPLATE_SECTION_PROMPT_SYSTEM
    )
    document_generator.content_messages = [
        {
            "role": "system",
            "content": SECTION_PROMPT_SYSTEM.format(
                overall_objective=overall_objective,
                document_layout=document_layout
            )
        }
    ]
    if image_context:
        document_generator.content_messages[0]["content"] += f"