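"""FastAPI service that indexes documents: it reads a file from S3, extracts its text,
splits it into semantic chunks, embeds the chunks, and stores the embeddings in Qdrant."""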
import logging
import os
import time
from typing import List, Optional

from dotenv import load_dotenv
from fastapi import FastAPI, Header, HTTPException
from pydantic import BaseModel

from utils.embedding_utils import read_document, cumulative_semantic_chunking, embed_chunks
from utils.qdrant_utils import store_embeddings
from utils.s3_utils import read_s3_file

# Load environment variables from a .env file so the settings below resolve
load_dotenv()

API_KEY = os.getenv('X_API_KEY')
bucket_name = os.getenv('bucket_name')

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

app = FastAPI()


class Metadata(BaseModel):
    mime_type: str
    file_size_bytes: str
    file_format: str


class DocumentIndexRequest(BaseModel):
    metadata: Metadata
    bucket_key: str
    user_id: str
    org_id: str
    file_id: int
    data_source_id: int


@app.get("/")
async def root():
    return {"message": "Welcome to the Document Indexing API!"}


@app.post("/api/document-index")
async def document_index(request: DocumentIndexRequest, x_api_key: str = Header(...)):
    logging.info(f"Received request: {request}")
    start_time = time.time()

    # Reject requests that do not present the expected API key
    if x_api_key != API_KEY:
        logging.warning("Unauthorized access attempt with invalid API key.")
        raise HTTPException(status_code=401, detail="Unauthorized")

    try:
        # Fetch the raw file, its metadata, and its format from S3
        content, metadata, file_format = read_s3_file(bucket_name, request.bucket_key)
        logging.info(f"File {request.bucket_key} retrieved from S3 with format {file_format}.")

        # Extract plain text from the document
        text_content = read_document(content, file_id=request.file_id, file_format=file_format)
        logging.info(f"Text content extracted from file {request.bucket_key}.")

        # Split the text into semantically coherent chunks
        chunks = cumulative_semantic_chunking(text_content, max_chunk_size=512, similarity_threshold=0.8)
        logging.info(f"Text content chunked into {len(chunks)} chunks.")

        # Embed each chunk into a vector
        embeddings, total_tokens = embed_chunks(chunks)
        logging.info(f"Text content embedded into vectors. Total tokens used: {total_tokens}.")

        # Persist the chunks and their embeddings together with ownership metadata
        store_embeddings(
            chunks=chunks,
            embeddings=embeddings,
            user_id=request.user_id,
            data_source_id=request.data_source_id,
            file_id=request.file_id,
            organization_id=request.org_id,
            s3_bucket_key=request.bucket_key,
            total_tokens=total_tokens
        )

        logging.info(f"Embeddings for {request.bucket_key} stored successfully with metadata: {metadata}")
        logging.info(f"Embeddings stored successfully using {total_tokens} tokens")
        time_taken = time.time() - start_time
        logging.info(f"Time taken to process and embed the document: {time_taken:.2f} seconds")

        return {"message": "Embeddings stored successfully"}

    except FileNotFoundError as e:
        logging.error(f"File not found: {str(e)}")
        raise HTTPException(status_code=404, detail=str(e))
    except PermissionError as e:
        logging.error(f"Permission error: {str(e)}")
        raise HTTPException(status_code=403, detail=str(e))
    except HTTPException as e:
        logging.error(f"HTTP error: {str(e.detail)}")
        raise
    except Exception as e:
        logging.error(f"Error processing file {request.bucket_key}: {str(e)}")
        raise HTTPException(status_code=500, detail=str(e))


if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=8000)
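
# Example request for local testing (hypothetical values; assumes the service runs on
# localhost:8000 and X_API_KEY is set to "secret" in the environment or .env file):
#
#   import requests
#   resp = requests.post(
#       "http://localhost:8000/api/document-index",
#       headers={"x-api-key": "secret"},
#       json={
#           "metadata": {"mime_type": "application/pdf", "file_size_bytes": "1024", "file_format": "pdf"},
#           "bucket_key": "docs/example.pdf",
#           "user_id": "user-1",
#           "org_id": "org-1",
#           "file_id": 1,
#           "data_source_id": 10,
#       },
#   )
#   print(resp.json())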