vishanth10 committed
Commit 0181645
1 Parent(s): 2ef1eee

init commit from repo

__pycache__/app.cpython-311.pyc ADDED
Binary file (6.11 kB)
 
app.py ADDED
@@ -0,0 +1,110 @@
## API Call
# app.py
## FastAPI entry point that receives the document-index POST body

from fastapi import FastAPI, Header, HTTPException
from pydantic import BaseModel
import logging
import time
import os
from dotenv import load_dotenv

from utils.s3_utils import read_s3_file
from utils.embedding_utils import read_document, cumulative_semantic_chunking, embed_chunks
from utils.qdrant_utils import store_embeddings

load_dotenv()

# Retrieve the API key and target bucket from the environment
API_KEY = os.getenv('X_API_KEY')
bucket_name = os.getenv('bucket_name')  # e.g. "document-ingestion-drive-dev"

# Initialize logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

app = FastAPI()

class Metadata(BaseModel):
    mime_type: str
    file_size_bytes: str
    file_format: str

class DocumentIndexRequest(BaseModel):
    metadata: Metadata
    bucket_key: str
    user_id: str
    org_id: str
    file_id: int
    data_source_id: int

@app.get("/")
async def root():
    return {"message": "Welcome to the Document Indexing API!"}

@app.post("/api/document-index")
async def document_index(request: DocumentIndexRequest, x_api_key: str = Header(...)):
    logging.info(f"Received request: {request}")
    start_time = time.time()

    # Check that the API key in the header matches the one in the environment
    if x_api_key != API_KEY:
        logging.warning("Unauthorized access attempt with invalid API key.")
        raise HTTPException(status_code=401, detail="Unauthorized")

    try:
        # Read the file from S3
        content, metadata, file_format = read_s3_file(bucket_name, request.bucket_key)
        logging.info(f"File {request.bucket_key} retrieved from S3 with format {file_format}.")

        # Extract text content from the document
        text_content = read_document(content, file_id=request.file_id, file_format=file_format)
        logging.info(f"Text content extracted from file {request.bucket_key}.")

        # Split the text into chunks using cumulative semantic chunking
        chunks = cumulative_semantic_chunking(text_content, max_chunk_size=512, similarity_threshold=0.8)
        logging.info(f"Text content chunked into {len(chunks)} chunks.")

        # Embed the chunks
        embeddings, total_tokens = embed_chunks(chunks)
        logging.info(f"Text content embedded into vectors. Total tokens used: {total_tokens}.")

        # Store the embeddings in Qdrant
        store_embeddings(
            chunks=chunks,
            embeddings=embeddings,
            user_id=request.user_id,
            data_source_id=request.data_source_id,
            file_id=request.file_id,
            organization_id=request.org_id,
            s3_bucket_key=request.bucket_key,
            total_tokens=total_tokens
        )

        logging.info(f"Embeddings for {request.bucket_key} stored successfully with metadata: {metadata}")
        logging.info(f"Embeddings stored successfully with {total_tokens} tokens")
        time_taken = time.time() - start_time
        logging.info(f"Time taken to process and embed the document: {time_taken:.2f} seconds")

        return {"message": "Embeddings stored successfully"}

    except FileNotFoundError as e:
        logging.error(f"File not found: {str(e)}")
        raise HTTPException(status_code=404, detail=str(e))
    except PermissionError as e:
        logging.error(f"Permission error: {str(e)}")
        raise HTTPException(status_code=403, detail=str(e))
    except HTTPException as e:
        logging.error(f"HTTP error: {str(e.detail)}")
        raise
    except Exception as e:
        logging.error(f"Error processing file {request.bucket_key}: {str(e)}")
        raise HTTPException(status_code=500, detail=str(e))

# Run the FastAPI app
if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=8000)
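
Not part of the commit, but a minimal sketch of how a client could call the endpoint above, assuming the service runs locally on port 8000 and X_API_KEY is exported in the caller's environment; the bucket key and IDs are placeholders:

# example_client.py (hypothetical)
import os
import requests

payload = {
    "metadata": {"mime_type": "application/pdf", "file_size_bytes": "204800", "file_format": "pdf"},
    "bucket_key": "uploads/example-document.pdf",  # placeholder S3 key
    "user_id": "user-123",
    "org_id": "org-456",
    "file_id": 1,
    "data_source_id": 10,
}

# FastAPI exposes the x_api_key Header parameter as the "x-api-key" HTTP header.
response = requests.post(
    "http://localhost:8000/api/document-index",
    json=payload,
    headers={"x-api-key": os.environ["X_API_KEY"]},
)
print(response.status_code, response.json())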
utils/__pycache__/embedding_utils.cpython-311.pyc ADDED
Binary file (8.2 kB)
 
utils/__pycache__/qdrant_utils.cpython-311.pyc ADDED
Binary file (3.67 kB)
 
utils/__pycache__/s3_utils.cpython-311.pyc ADDED
Binary file (3.74 kB)
 
utils/embedding_utils.py ADDED
@@ -0,0 +1,139 @@
# embedding_utils.py

from sentence_transformers import SentenceTransformer
from llama_index.core import SimpleDirectoryReader
from huggingface_hub import login
from typing import List, Tuple
from dotenv import load_dotenv
from docx import Document
import numpy as np
import os
import tempfile
import logging

# Load environment variables from .env file
load_dotenv()

# Load Hugging Face token from environment variable
huggingface_token = os.getenv('HUGGINGFACE_HUB_TOKEN')

if huggingface_token:
    login(token=huggingface_token, add_to_git_credential=True)
else:
    raise ValueError("Hugging Face token is not set. Please set the HUGGINGFACE_HUB_TOKEN environment variable.")

# Load the embedding model and tokenizer with authentication
# model_name = 'nvidia/NV-Embed-v1'
model_name = 'nomic-ai/nomic-embed-text-v1.5'
model = SentenceTransformer(model_name, trust_remote_code=True)
model.max_seq_length = 4096
model.tokenizer.padding_side = "right"

def read_document(file_content: bytes, file_id: int, file_format: str) -> str:
    """Extract text content from a document file depending on its format."""
    try:
        # Create a temporary directory to save the file
        with tempfile.TemporaryDirectory() as temp_dir:
            file_path = os.path.join(temp_dir, f"document_{file_id}.{file_format}")

            # Save the content to the file
            with open(file_path, 'wb') as temp_file:
                temp_file.write(file_content)

            # Handle different file formats
            if file_format.lower() == 'docx':
                text_content = extract_text_from_docx(file_path)
            elif file_format.lower() == 'pdf':
                text_content = extract_text_from_pdf(file_path)
            elif file_format.lower() in ['txt', 'md', 'csv']:
                reader = SimpleDirectoryReader(input_files=[file_path])
                documents = reader.load_data()
                text_content = documents[0].text if documents else ''
            else:
                raise ValueError(f"Unsupported file format: {file_format}")

            if text_content:
                return text_content
            else:
                raise ValueError("No content extracted from the document.")

    except Exception as e:
        logging.error(f"Error reading document: {e}")
        raise

def extract_text_from_docx(file_path: str) -> str:
    """Extract text from a DOCX file."""
    try:
        doc = Document(file_path)
        full_text = [para.text for para in doc.paragraphs]
        return '\n'.join(full_text)
    except Exception as e:
        logging.error(f"Error extracting text from DOCX file: {e}")
        raise

def extract_text_from_pdf(file_path: str) -> str:
    """Extract text from a PDF file."""
    import pdfplumber
    try:
        with pdfplumber.open(file_path) as pdf:
            full_text = [page.extract_text() or '' for page in pdf.pages]
        return '\n'.join(full_text).strip()
    except Exception as e:
        logging.error(f"Error extracting text from PDF file: {e}")
        raise

def cumulative_semantic_chunking(text: str, max_chunk_size: int, similarity_threshold: float,
                                 embedding_model: SentenceTransformer = model) -> List[str]:
    """Cumulative semantic chunking using sentence embeddings."""
    # Naive sentence split on periods; empty fragments are dropped
    sentences = [s.strip() for s in text.split('.') if s.strip()]
    if not sentences:
        return []

    # Encode all sentences at once
    sentence_embeddings = embedding_model.encode(sentences)

    chunks = []
    current_chunk = sentences[0]
    current_embedding = sentence_embeddings[0]

    for sentence, embedding in zip(sentences[1:], sentence_embeddings[1:]):
        combined_chunk = current_chunk + '. ' + sentence
        # Length-weighted average of the current chunk embedding and the new sentence embedding
        combined_embedding = (
            current_embedding * len(current_chunk.split()) + embedding * len(sentence.split())
        ) / (len(current_chunk.split()) + len(sentence.split()))

        # Cosine similarity between the current chunk and the candidate combined chunk
        similarity = np.dot(current_embedding, combined_embedding) / (
            np.linalg.norm(current_embedding) * np.linalg.norm(combined_embedding)
        )

        if similarity >= similarity_threshold and len(combined_chunk) <= max_chunk_size:
            current_chunk = combined_chunk
            current_embedding = combined_embedding
        else:
            chunks.append(current_chunk.strip())
            current_chunk = sentence
            current_embedding = embedding

    if current_chunk:
        chunks.append(current_chunk.strip())

    return chunks

def embed_chunks(chunks: List[str]) -> Tuple[List[np.ndarray], int]:
    """Embed the chunks using the SentenceTransformer model and return the embeddings along with the total token count."""
    total_tokens = 0
    embeddings = []

    for chunk in chunks:
        tokens = model.tokenizer.encode(chunk, add_special_tokens=False)
        total_tokens += len(tokens)
        embedding = model.encode(chunk)
        embeddings.append(embedding)

    return embeddings, total_tokens
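
A small usage sketch for the chunking and embedding helpers above, assuming HUGGINGFACE_HUB_TOKEN is set so the module-level model loads; the sample text is illustrative only:

# Hypothetical usage of cumulative_semantic_chunking and embed_chunks
from utils.embedding_utils import cumulative_semantic_chunking, embed_chunks

sample_text = (
    "Qdrant stores dense vectors for similarity search. "
    "Cumulative semantic chunking keeps adjacent sentences together while their embeddings stay similar. "
    "A dissimilar sentence starts a new chunk."
)

chunks = cumulative_semantic_chunking(sample_text, max_chunk_size=512, similarity_threshold=0.8)
embeddings, total_tokens = embed_chunks(chunks)

print(f"{len(chunks)} chunks, {total_tokens} tokens, dimension {len(embeddings[0])}")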
utils/qdrant_utils.py ADDED
@@ -0,0 +1,72 @@
# qdrant_utils.py contains utility functions to interact with Qdrant, a vector similarity search engine.

from qdrant_client import QdrantClient
from qdrant_client.http import models
import os
import logging
import uuid

# Initialize logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

qdrant_client = QdrantClient(
    url=os.getenv('QDRANT_URL'),
    api_key=os.getenv('QDRANT_API_KEY')
)

def create_collection_if_not_exists(collection_name, vector_size):
    try:
        # Check whether the collection already exists
        collections = qdrant_client.get_collections().collections
        if not any(collection.name == collection_name for collection in collections):
            # Create the collection if it doesn't exist
            qdrant_client.create_collection(
                collection_name=collection_name,
                vectors_config=models.VectorParams(size=vector_size, distance=models.Distance.COSINE)
            )
            logging.info(f"Created new collection: {collection_name}")
        else:
            logging.info(f"Collection {collection_name} already exists")
    except Exception as e:
        logging.error(f"Error creating collection: {str(e)}")
        raise

def store_embeddings(chunks, embeddings, user_id, data_source_id, file_id, organization_id, s3_bucket_key, total_tokens):
    try:
        collection_name = "embed"  # Name of the collection in Qdrant
        vector_size = len(embeddings[0])

        # Ensure the collection exists
        create_collection_if_not_exists(collection_name, vector_size)

        # Prepare points for Qdrant
        points = []
        for i, (chunk, embedding) in enumerate(zip(chunks, embeddings)):
            chunk_id = str(uuid.uuid4())  # Generate a unique ID for each chunk
            points.append(
                models.PointStruct(
                    id=chunk_id,
                    vector=embedding.tolist(),  # Convert numpy array to list
                    payload={
                        "user_id": user_id,
                        "data_source_id": data_source_id,
                        "file_id": file_id,
                        "organization_id": organization_id,
                        "chunk_index": i,
                        "chunk_text": chunk,
                        "s3_bucket_key": s3_bucket_key,
                        "total_tokens": total_tokens
                    }
                )
            )

        # Store the embeddings in Qdrant
        qdrant_client.upsert(
            collection_name=collection_name,
            points=points
        )
        logging.info(f"Successfully stored {len(points)} embeddings")
    except Exception as e:
        logging.error(f"Error storing embeddings in Qdrant: {str(e)}")
        raise
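
store_embeddings only writes points; below is a hedged sketch of how they might later be searched back from the same "embed" collection, filtering on the user_id payload field. The query text and filter value are placeholders, and the same Nomic model from embedding_utils is reused for the query vector:

# Hypothetical similarity search against the "embed" collection
import os
from qdrant_client import QdrantClient
from qdrant_client.http import models
from utils.embedding_utils import model  # reuse the embedding model defined above

client = QdrantClient(url=os.getenv('QDRANT_URL'), api_key=os.getenv('QDRANT_API_KEY'))
query_vector = model.encode("example search query").tolist()

hits = client.search(
    collection_name="embed",
    query_vector=query_vector,
    query_filter=models.Filter(
        must=[models.FieldCondition(key="user_id", match=models.MatchValue(value="user-123"))]
    ),
    limit=5,
)
for hit in hits:
    print(hit.score, hit.payload.get("chunk_text", "")[:80])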
utils/s3_utils.py ADDED
@@ -0,0 +1,74 @@
# s3_utils.py -- helpers for reading uploaded documents from S3

import boto3
import os
import logging
from botocore.exceptions import NoCredentialsError, ClientError

# Initialize logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

aws_access_key_id = os.getenv('AWS_ACCESS_KEY_ID')
aws_secret_access_key = os.getenv('AWS_SECRET_ACCESS_KEY')

def get_s3_client():
    try:
        s3_client = boto3.client(
            's3',
            aws_access_key_id=aws_access_key_id,
            aws_secret_access_key=aws_secret_access_key,
            region_name='us-west-2'
        )
        logging.info("S3 client initialized successfully.")
        return s3_client
    except NoCredentialsError as e:
        logging.error(f"Failed to initialize S3 client: {str(e)}")
        raise
    except Exception as e:
        logging.error(f"Failed to initialize S3 client: {str(e)}")
        raise

def read_s3_file(bucket_name, key):
    try:
        logging.info(f"Reading file from S3: bucket={bucket_name}, key={key}")
        s3 = get_s3_client()
        response = s3.get_object(Bucket=bucket_name, Key=key)

        content = response['Body'].read()
        metadata = response.get('Metadata', {})

        # Attempt to get the file format from custom metadata
        file_format = metadata.get('file_format')  # Assuming 'file_format' is set as custom metadata
        if not file_format:
            # Fall back to the Content-Type header if 'file_format' is not set in metadata
            content_type = response.get('ContentType')
            if content_type:
                if 'word' in content_type:
                    file_format = 'docx'
                elif 'pdf' in content_type:
                    file_format = 'pdf'
                elif 'text' in content_type:
                    file_format = 'txt'
                else:
                    file_format = 'unknown'
            else:
                raise ValueError("File format could not be determined from metadata or Content-Type.")

        logging.info(f"File read successfully from S3: bucket={bucket_name}, key={key}, format={file_format}")
        return content, metadata, file_format
    except ClientError as e:
        # A missing object surfaces as a ClientError with code NoSuchKey (or 404)
        if e.response.get('Error', {}).get('Code') in ('NoSuchKey', '404'):
            logging.error(f"File not found in S3: bucket={bucket_name}, key={key}")
            raise FileNotFoundError(f"File not found: bucket={bucket_name}, key={key}")
        logging.error(f"Error reading file from S3: {str(e)}")
        raise
    except NoCredentialsError:
        logging.error("AWS credentials not found.")
        raise PermissionError("AWS credentials not found.")
    except Exception as e:
        logging.error(f"Error reading file from S3: {str(e)}")
        raise
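
Taken together, the modules in this commit read the following environment variables. A hypothetical .env with placeholder values is sketched below; the dev bucket name comes from a comment in the source, and real credentials should never be committed:

# .env (placeholders only)
X_API_KEY=replace-with-shared-api-key
bucket_name=document-ingestion-drive-dev
HUGGINGFACE_HUB_TOKEN=hf_xxxxxxxxxxxxxxxx
QDRANT_URL=https://your-qdrant-host:6333
QDRANT_API_KEY=replace-with-qdrant-key
AWS_ACCESS_KEY_ID=replace-with-access-key-id
AWS_SECRET_ACCESS_KEY=replace-with-secret-access-key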