vishanth10 committed
Commit 0181645
1 Parent(s): 2ef1eee

init commit from repo

__pycache__/app.cpython-311.pyc ADDED
Binary file (6.11 kB)
 
app.py ADDED
@@ -0,0 +1,110 @@
## API Call
# app.py
## FastAPI entry point that receives the document-index POST body

from fastapi import FastAPI, Header, HTTPException
from pydantic import BaseModel
import logging
import time
import os
from dotenv import load_dotenv

from utils.s3_utils import read_s3_file
from utils.embedding_utils import read_document, cumulative_semantic_chunking, embed_chunks
from utils.qdrant_utils import store_embeddings

load_dotenv()

# Retrieve the API key and target bucket from the environment
API_KEY = os.getenv('X_API_KEY')
bucket_name = os.getenv('bucket_name')  # e.g. "document-ingestion-drive-dev"

# Initialize logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

app = FastAPI()

class Metadata(BaseModel):
    mime_type: str
    file_size_bytes: str
    file_format: str

class DocumentIndexRequest(BaseModel):
    metadata: Metadata
    bucket_key: str
    user_id: str
    org_id: str
    file_id: int
    data_source_id: int

@app.get("/")
async def root():
    return {"message": "Welcome to the Document Indexing API!"}

@app.post("/api/document-index")
async def document_index(request: DocumentIndexRequest, x_api_key: str = Header(...)):
    logging.info(f"Received request: {request}")
    start_time = time.time()

    # Check that the API key in the header matches the one in the environment
    if x_api_key != API_KEY:
        logging.warning("Unauthorized access attempt with invalid API key.")
        raise HTTPException(status_code=401, detail="Unauthorized")

    try:
        # Read the file from S3
        content, metadata, file_format = read_s3_file(bucket_name, request.bucket_key)
        logging.info(f"File {request.bucket_key} retrieved from S3 with format {file_format}.")

        # Extract text content from the document
        text_content = read_document(content, file_id=request.file_id, file_format=file_format)
        logging.info(f"Text content extracted from file {request.bucket_key}.")

        # Split the text into chunks using cumulative semantic chunking
        chunks = cumulative_semantic_chunking(text_content, max_chunk_size=512, similarity_threshold=0.8)
        logging.info(f"Text content chunked into {len(chunks)} chunks.")

        # Embed the chunks
        embeddings, total_tokens = embed_chunks(chunks)
        logging.info(f"Text content embedded into vectors. Total tokens used: {total_tokens}.")

        # Store the embeddings in Qdrant
        store_embeddings(
            chunks=chunks,
            embeddings=embeddings,
            user_id=request.user_id,
            data_source_id=request.data_source_id,
            file_id=request.file_id,
            organization_id=request.org_id,
            s3_bucket_key=request.bucket_key,
            total_tokens=total_tokens
        )

        logging.info(f"Embeddings for {request.bucket_key} stored successfully with metadata: {metadata}")
        logging.info(f"Embeddings stored successfully with {total_tokens} tokens")
        time_taken = time.time() - start_time
        logging.info(f"Time taken to process and embed the document: {time_taken:.2f} seconds")

        return {"message": "Embeddings stored successfully"}

    except FileNotFoundError as e:
        logging.error(f"File not found: {str(e)}")
        raise HTTPException(status_code=404, detail=str(e))
    except PermissionError as e:
        logging.error(f"Permission error: {str(e)}")
        raise HTTPException(status_code=403, detail=str(e))
    except HTTPException as e:
        logging.error(f"HTTP error: {str(e.detail)}")
        raise
    except Exception as e:
        logging.error(f"Error processing file {request.bucket_key}: {str(e)}")
        raise HTTPException(status_code=500, detail=str(e))

# Run the FastAPI app
if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=8000)
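
Not part of the commit, but a minimal sketch of how a client could call the endpoint above, assuming the service runs locally on port 8000 and X_API_KEY is exported in the caller's environment; the bucket key and IDs are placeholders:

# example_client.py (hypothetical)
import os
import requests

payload = {
    "metadata": {"mime_type": "application/pdf", "file_size_bytes": "204800", "file_format": "pdf"},
    "bucket_key": "uploads/example-document.pdf",  # placeholder S3 key
    "user_id": "user-123",
    "org_id": "org-456",
    "file_id": 1,
    "data_source_id": 10,
}

# FastAPI exposes the x_api_key Header parameter as the "x-api-key" HTTP header.
response = requests.post(
    "http://localhost:8000/api/document-index",
    json=payload,
    headers={"x-api-key": os.environ["X_API_KEY"]},
)
print(response.status_code, response.json())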
utils/__pycache__/embedding_utils.cpython-311.pyc ADDED
Binary file (8.2 kB)
 
utils/__pycache__/qdrant_utils.cpython-311.pyc ADDED
Binary file (3.67 kB)
 
utils/__pycache__/s3_utils.cpython-311.pyc ADDED
Binary file (3.74 kB)
 
utils/embedding_utils.py ADDED
@@ -0,0 +1,139 @@
# embedding_utils.py

from sentence_transformers import SentenceTransformer
from llama_index.core import SimpleDirectoryReader
from huggingface_hub import login
from typing import List, Tuple
from dotenv import load_dotenv
from docx import Document
import numpy as np
import os
import tempfile
import logging

# Load environment variables from .env file
load_dotenv()

# Load Hugging Face token from environment variable
huggingface_token = os.getenv('HUGGINGFACE_HUB_TOKEN')

if huggingface_token:
    login(token=huggingface_token, add_to_git_credential=True)
else:
    raise ValueError("Hugging Face token is not set. Please set the HUGGINGFACE_HUB_TOKEN environment variable.")

# Load the embedding model and tokenizer with authentication
# model_name = 'nvidia/NV-Embed-v1'
model_name = 'nomic-ai/nomic-embed-text-v1.5'
model = SentenceTransformer(model_name, trust_remote_code=True)
model.max_seq_length = 4096
model.tokenizer.padding_side = "right"

def read_document(file_content: bytes, file_id: int, file_format: str) -> str:
    """Extract text content from a document file depending on its format."""
    try:
        # Create a temporary directory to save the file
        with tempfile.TemporaryDirectory() as temp_dir:
            file_path = os.path.join(temp_dir, f"document_{file_id}.{file_format}")

            # Save the content to the file
            with open(file_path, 'wb') as temp_file:
                temp_file.write(file_content)

            # Handle different file formats
            if file_format.lower() == 'docx':
                text_content = extract_text_from_docx(file_path)
            elif file_format.lower() == 'pdf':
                text_content = extract_text_from_pdf(file_path)
            elif file_format.lower() in ['txt', 'md', 'csv']:
                reader = SimpleDirectoryReader(input_files=[file_path])
                documents = reader.load_data()
                text_content = documents[0].text if documents else ''
            else:
                raise ValueError(f"Unsupported file format: {file_format}")

            if text_content:
                return text_content
            else:
                raise ValueError("No content extracted from the document.")

    except Exception as e:
        logging.error(f"Error reading document: {e}")
        raise

def extract_text_from_docx(file_path: str) -> str:
    """Extract text from a DOCX file."""
    try:
        doc = Document(file_path)
        full_text = [para.text for para in doc.paragraphs]
        return '\n'.join(full_text)
    except Exception as e:
        logging.error(f"Error extracting text from DOCX file: {e}")
        raise

def extract_text_from_pdf(file_path: str) -> str:
    """Extract text from a PDF file."""
    import pdfplumber
    try:
        with pdfplumber.open(file_path) as pdf:
            full_text = [page.extract_text() or '' for page in pdf.pages]
        return '\n'.join(full_text).strip()
    except Exception as e:
        logging.error(f"Error extracting text from PDF file: {e}")
        raise

def cumulative_semantic_chunking(text: str, max_chunk_size: int, similarity_threshold: float,
                                 embedding_model: SentenceTransformer = model) -> List[str]:
    """Cumulative semantic chunking using sentence embeddings."""
    # Naive sentence split on periods; empty fragments are dropped
    sentences = [s.strip() for s in text.split('.') if s.strip()]
    if not sentences:
        return []

    # Encode all sentences at once
    sentence_embeddings = embedding_model.encode(sentences)

    chunks = []
    current_chunk = sentences[0]
    current_embedding = sentence_embeddings[0]

    for sentence, embedding in zip(sentences[1:], sentence_embeddings[1:]):
        combined_chunk = current_chunk + '. ' + sentence
        # Length-weighted average of the current chunk embedding and the new sentence embedding
        combined_embedding = (
            current_embedding * len(current_chunk.split()) + embedding * len(sentence.split())
        ) / (len(current_chunk.split()) + len(sentence.split()))

        # Cosine similarity between the current chunk and the candidate combined chunk
        similarity = np.dot(current_embedding, combined_embedding) / (
            np.linalg.norm(current_embedding) * np.linalg.norm(combined_embedding)
        )

        if similarity >= similarity_threshold and len(combined_chunk) <= max_chunk_size:
            current_chunk = combined_chunk
            current_embedding = combined_embedding
        else:
            chunks.append(current_chunk.strip())
            current_chunk = sentence
            current_embedding = embedding

    if current_chunk:
        chunks.append(current_chunk.strip())

    return chunks

def embed_chunks(chunks: List[str]) -> Tuple[List[np.ndarray], int]:
    """Embed the chunks using the SentenceTransformer model and return the embeddings along with the total token count."""
    total_tokens = 0
    embeddings = []

    for chunk in chunks:
        tokens = model.tokenizer.encode(chunk, add_special_tokens=False)
        total_tokens += len(tokens)
        embedding = model.encode(chunk)
        embeddings.append(embedding)

    return embeddings, total_tokens
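
A small usage sketch for the chunking and embedding helpers above, assuming HUGGINGFACE_HUB_TOKEN is set so the module-level model loads; the sample text is illustrative only:

# Hypothetical usage of cumulative_semantic_chunking and embed_chunks
from utils.embedding_utils import cumulative_semantic_chunking, embed_chunks

sample_text = (
    "Qdrant stores dense vectors for similarity search. "
    "Cumulative semantic chunking keeps adjacent sentences together while their embeddings stay similar. "
    "A dissimilar sentence starts a new chunk."
)

chunks = cumulative_semantic_chunking(sample_text, max_chunk_size=512, similarity_threshold=0.8)
embeddings, total_tokens = embed_chunks(chunks)

print(f"{len(chunks)} chunks, {total_tokens} tokens, dimension {len(embeddings[0])}")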
utils/qdrant_utils.py ADDED
@@ -0,0 +1,72 @@
# qdrant_utils.py contains utility functions to interact with Qdrant, a vector similarity search engine.

from qdrant_client import QdrantClient
from qdrant_client.http import models
import os
import logging
import uuid

# Initialize logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

qdrant_client = QdrantClient(
    url=os.getenv('QDRANT_URL'),
    api_key=os.getenv('QDRANT_API_KEY')
)

def create_collection_if_not_exists(collection_name, vector_size):
    try:
        # Check whether the collection already exists
        collections = qdrant_client.get_collections().collections
        if not any(collection.name == collection_name for collection in collections):
            # Create the collection if it doesn't exist
            qdrant_client.create_collection(
                collection_name=collection_name,
                vectors_config=models.VectorParams(size=vector_size, distance=models.Distance.COSINE)
            )
            logging.info(f"Created new collection: {collection_name}")
        else:
            logging.info(f"Collection {collection_name} already exists")
    except Exception as e:
        logging.error(f"Error creating collection: {str(e)}")
        raise

def store_embeddings(chunks, embeddings, user_id, data_source_id, file_id, organization_id, s3_bucket_key, total_tokens):
    try:
        collection_name = "embed"  # Name of the collection in Qdrant
        vector_size = len(embeddings[0])

        # Ensure the collection exists
        create_collection_if_not_exists(collection_name, vector_size)

        # Prepare points for Qdrant
        points = []
        for i, (chunk, embedding) in enumerate(zip(chunks, embeddings)):
            chunk_id = str(uuid.uuid4())  # Generate a unique ID for each chunk
            points.append(
                models.PointStruct(
                    id=chunk_id,
                    vector=embedding.tolist(),  # Convert numpy array to list
                    payload={
                        "user_id": user_id,
                        "data_source_id": data_source_id,
                        "file_id": file_id,
                        "organization_id": organization_id,
                        "chunk_index": i,
                        "chunk_text": chunk,
                        "s3_bucket_key": s3_bucket_key,
                        "total_tokens": total_tokens
                    }
                )
            )

        # Store the embeddings in Qdrant
        qdrant_client.upsert(
            collection_name=collection_name,
            points=points
        )
        logging.info(f"Successfully stored {len(points)} embeddings")
    except Exception as e:
        logging.error(f"Error storing embeddings in Qdrant: {str(e)}")
        raise
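
store_embeddings only writes points; below is a hedged sketch of how they might later be searched back from the same "embed" collection, filtering on the user_id payload field. The query text and filter value are placeholders, and the same Nomic model from embedding_utils is reused for the query vector:

# Hypothetical similarity search against the "embed" collection
import os
from qdrant_client import QdrantClient
from qdrant_client.http import models
from utils.embedding_utils import model  # reuse the embedding model defined above

client = QdrantClient(url=os.getenv('QDRANT_URL'), api_key=os.getenv('QDRANT_API_KEY'))
query_vector = model.encode("example search query").tolist()

hits = client.search(
    collection_name="embed",
    query_vector=query_vector,
    query_filter=models.Filter(
        must=[models.FieldCondition(key="user_id", match=models.MatchValue(value="user-123"))]
    ),
    limit=5,
)
for hit in hits:
    print(hit.score, hit.payload.get("chunk_text", "")[:80])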
utils/s3_utils.py ADDED
@@ -0,0 +1,74 @@
# s3_utils.py -- helpers for reading uploaded documents from S3

import boto3
import os
import logging
from botocore.exceptions import NoCredentialsError, ClientError

# Initialize logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

aws_access_key_id = os.getenv('AWS_ACCESS_KEY_ID')
aws_secret_access_key = os.getenv('AWS_SECRET_ACCESS_KEY')

def get_s3_client():
    try:
        s3_client = boto3.client(
            's3',
            aws_access_key_id=aws_access_key_id,
            aws_secret_access_key=aws_secret_access_key,
            region_name='us-west-2'
        )
        logging.info("S3 client initialized successfully.")
        return s3_client
    except NoCredentialsError as e:
        logging.error(f"Failed to initialize S3 client: {str(e)}")
        raise
    except Exception as e:
        logging.error(f"Failed to initialize S3 client: {str(e)}")
        raise

def read_s3_file(bucket_name, key):
    try:
        logging.info(f"Reading file from S3: bucket={bucket_name}, key={key}")
        s3 = get_s3_client()
        response = s3.get_object(Bucket=bucket_name, Key=key)

        content = response['Body'].read()
        metadata = response.get('Metadata', {})

        # Attempt to get the file format from custom metadata
        file_format = metadata.get('file_format')  # Assuming 'file_format' is set as custom metadata
        if not file_format:
            # Fall back to the Content-Type header if 'file_format' is not set in metadata
            content_type = response.get('ContentType')
            if content_type:
                if 'word' in content_type:
                    file_format = 'docx'
                elif 'pdf' in content_type:
                    file_format = 'pdf'
                elif 'text' in content_type:
                    file_format = 'txt'
                else:
                    file_format = 'unknown'
            else:
                raise ValueError("File format could not be determined from metadata or Content-Type.")

        logging.info(f"File read successfully from S3: bucket={bucket_name}, key={key}, format={file_format}")
        return content, metadata, file_format
    except ClientError as e:
        # A missing object surfaces as a ClientError with code NoSuchKey (or 404)
        if e.response.get('Error', {}).get('Code') in ('NoSuchKey', '404'):
            logging.error(f"File not found in S3: bucket={bucket_name}, key={key}")
            raise FileNotFoundError(f"File not found: bucket={bucket_name}, key={key}")
        logging.error(f"Error reading file from S3: {str(e)}")
        raise
    except NoCredentialsError:
        logging.error("AWS credentials not found.")
        raise PermissionError("AWS credentials not found.")
    except Exception as e:
        logging.error(f"Error reading file from S3: {str(e)}")
        raise
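
Taken together, the modules in this commit read the following environment variables. A hypothetical .env with placeholder values is sketched below; the dev bucket name comes from a comment in the source, and real credentials should never be committed:

# .env (placeholders only)
X_API_KEY=replace-with-shared-api-key
bucket_name=document-ingestion-drive-dev
HUGGINGFACE_HUB_TOKEN=hf_xxxxxxxxxxxxxxxx
QDRANT_URL=https://your-qdrant-host:6333
QDRANT_API_KEY=replace-with-qdrant-key
AWS_ACCESS_KEY_ID=replace-with-access-key-id
AWS_SECRET_ACCESS_KEY=replace-with-secret-access-key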