"""Helpers for creating an S3 client and reading objects from S3."""

import logging
import os
import urllib.parse

import boto3
from botocore.exceptions import ClientError, NoCredentialsError

#bucket_name = "document-ingestion-drive-dev"

# Credentials are read from the environment once, at import time. When a
# variable is unset the value is None and is passed to boto3 as-is.
aws_access_key_id = os.getenv('AWS_ACCESS_KEY_ID')
aws_secret_access_key = os.getenv('AWS_SECRET_ACCESS_KEY')


def get_s3_client(region_name='us-west-2'):
    """Create and return a boto3 S3 client.

    The client is built from the module-level ``aws_access_key_id`` and
    ``aws_secret_access_key`` values.

    Args:
        region_name: AWS region the client is bound to. Defaults to
            ``'us-west-2'``, the region this module previously hard-coded.

    Returns:
        The client object returned by ``boto3.client('s3', ...)``.

    Raises:
        Exception: Any error raised while creating the client is logged and
            re-raised unchanged.
    """
    try:
        s3_client = boto3.client(
            's3',
            aws_access_key_id=aws_access_key_id,
            aws_secret_access_key=aws_secret_access_key,
            region_name=region_name,
        )
    except Exception as e:
        # A single handler is enough: the previous NoCredentialsError and
        # generic handlers logged the same message and re-raised.
        logging.error("Failed to initialize S3 client: %s", e)
        raise
    logging.info("S3 client initialized successfully.")
    return s3_client


def _detect_file_format(metadata, content_type):
    """Return the file format from custom metadata or the Content-Type.

    The ``file_format`` metadata entry wins when it is present and non-empty.
    Otherwise the Content-Type is matched by substring: ``word`` -> ``docx``,
    ``pdf`` -> ``pdf``, ``text`` -> ``txt``, anything else -> ``unknown``.

    Raises:
        ValueError: If neither source yields a format.
    """
    file_format = metadata.get('file_format')
    if file_format:
        return file_format
    if not content_type:
        raise ValueError(
            "File format could not be determined from metadata or Content-Type."
        )
    # Compare in lowercase so a differently-cased Content-Type still matches.
    lowered = content_type.lower()
    for marker, detected in (('word', 'docx'), ('pdf', 'pdf'), ('text', 'txt')):
        if marker in lowered:
            return detected
    return 'unknown'


def read_s3_file(bucket_name, key):
    """Read an object from S3 and determine its file format.

    Args:
        bucket_name: Name of the S3 bucket.
        key: Key of the object inside the bucket.

    Returns:
        A ``(content, metadata, file_format)`` tuple: the value returned by
        reading the response body, the response's ``Metadata`` mapping
        (``{}`` when absent), and the detected format string.

    Raises:
        FileNotFoundError: If S3 reports the error code ``NoSuchKey``.
        PermissionError: If a ``NoCredentialsError`` is raised.
        ClientError: Any other S3 client error, logged and re-raised.
        ValueError: If the format cannot be determined from the metadata or
            the Content-Type.
        Exception: Any other error is logged and re-raised unchanged.
    """
    try:
        logging.info("Reading file from S3: bucket=%s, key=%s", bucket_name, key)
        s3 = get_s3_client()
        response = s3.get_object(Bucket=bucket_name, Key=key)
        body = response['Body']
        try:
            content = body.read()
        finally:
            # Close the stream even when the read fails part-way.
            body.close()
        metadata = response.get('Metadata', {})
        file_format = _detect_file_format(metadata, response.get('ContentType'))
        logging.info(
            "File read successfully from S3: bucket=%s, key=%s, format=%s",
            bucket_name, key, file_format,
        )
        return content, metadata, file_format
    except NoCredentialsError as e:
        logging.error("AWS credentials not found.")
        raise PermissionError("AWS credentials not found.") from e
    except ClientError as e:
        # Match on the service error code instead of `s3.exceptions.NoSuchKey`:
        # `s3` is unbound when get_s3_client() itself fails, and evaluating
        # that attribute in an `except` clause would raise UnboundLocalError
        # and hide the real error.
        if e.response.get('Error', {}).get('Code') == 'NoSuchKey':
            logging.error("File not found in S3: bucket=%s, key=%s", bucket_name, key)
            raise FileNotFoundError(
                f"File not found: bucket={bucket_name}, key={key}"
            ) from e
        logging.error("Error reading file from S3: %s", e)
        raise
    except Exception as e:
        logging.error("Error reading file from S3: %s", e)
        raise


# Initialize logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')