import numpy as np


# TODO: Move exception classes into separate file
class DimensionalityMismatchError(ValueError):
    """Raised when the dimensions of query and corpus vectors don't match."""


class ZeroVectorError(ValueError):
    """Raised when a zero vector is encountered."""


class EmptyInputError(ValueError):
    """Raised when the input arrays are empty."""


def cosine_similarity(
    query_vector: np.ndarray, corpus_vectors: np.ndarray
) -> np.ndarray:
    """
    Calculate cosine similarity between a query vector and each corpus vector.

    Args:
        query_vector: Vectorized prompt query of shape (1, D).
        corpus_vectors: Vectorized prompt corpus of shape (N, D).

    Returns:
        The vector of shape (N,) with values in range [-1, 1] where 1 is max
        similarity i.e., two vectors point in the same direction. Results are
        clipped to that range so floating-point rounding cannot leak values
        such as 1.0000000000000002 to callers.

    Raises:
        EmptyInputError: If query_vector or corpus_vectors are empty. This is
            checked before any shape validation.
        DimensionalityMismatchError: If query_vector is not of shape (1, D),
            corpus_vectors is not 2-D, or their feature dimensions differ.
        ZeroVectorError: If query_vector is a zero vector or any corpus vector
            is a zero vector.

    Note:
        - This implementation assumes the use of SentenceTransformer with the
          "all-MiniLM-L6-v2" model.
        - SentenceTransformer embeddings are unlikely to produce zero vectors,
          even for empty or irrelevant inputs.
        - However, checks for zero vectors are included to handle potential
          edge cases and ensure robustness for future modifications or
          alternative embedding models.
    """
    # Emptiness is checked first: an empty query (e.g. shape (0, D) or (0,))
    # would otherwise be misreported as a dimensionality mismatch.
    if query_vector.size == 0 or corpus_vectors.size == 0:
        raise EmptyInputError("query_vector and corpus_vectors must not be empty.")

    # Check rank before indexing shape[1]; a 1-D input would otherwise escape
    # as a bare IndexError instead of the documented exception type.
    if query_vector.ndim != 2 or query_vector.shape[0] != 1:
        raise DimensionalityMismatchError(
            f"query_vector must have shape (1, D), but got shape {query_vector.shape}."
        )
    if corpus_vectors.ndim != 2:
        raise DimensionalityMismatchError(
            f"corpus_vectors must have shape (N, D), but got shape {corpus_vectors.shape}."
        )
    if query_vector.shape[1] != corpus_vectors.shape[1]:
        raise DimensionalityMismatchError(
            f"query_vector shape {query_vector.shape} does not match corpus_vectors shape {corpus_vectors.shape}."
        )

    # Compute query norm and check for zero vector
    query_norm = np.linalg.norm(query_vector, axis=1)[0]
    if query_norm == 0:
        raise ZeroVectorError("query_vector must not be a zero vector.")

    # Any zero corpus vector is an error. An alternative policy would be to
    # filter zero vectors out and raise only if all corpus vectors are zero.
    corpus_norms = np.linalg.norm(corpus_vectors, axis=1)
    if np.any(corpus_norms == 0):
        raise ZeroVectorError("corpus_vectors must not contain zero vectors.")

    # Compute cosine similarity: (N, D) @ (D, 1) -> (N, 1) -> (N,)
    dot_products = np.dot(corpus_vectors, query_vector.T).flatten()
    similarities = dot_products / (query_norm * corpus_norms)

    # Rounding can push mathematically-bounded values slightly outside [-1, 1].
    return np.clip(similarities, -1.0, 1.0)