Spaces:
Sleeping
Sleeping
code cleaned up
Browse files- utils/helper_functions.py +130 -18
utils/helper_functions.py
CHANGED
@@ -19,7 +19,18 @@ openai.api_key = os.environ["OPENAI_API_KEY"]
|
|
19 |
|
20 |
|
21 |
def merge_dataframes(dataframes: List[pd.DataFrame]) -> pd.DataFrame:
|
22 |
-
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
23 |
# Concatenate the list of dataframes
|
24 |
combined_dataframe = pd.concat(
|
25 |
dataframes, ignore_index=True
|
@@ -64,21 +75,50 @@ def call_chatgpt(prompt: str) -> str:
|
|
64 |
|
65 |
|
66 |
def openai_text_embedding(prompt: str) -> str:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
67 |
return openai.Embedding.create(input=prompt, model="text-embedding-ada-002")[
|
68 |
"data"
|
69 |
-
][0][
|
|
|
|
|
70 |
|
71 |
|
72 |
def calculate_sts_openai_score(sentence1: str, sentence2: str) -> float:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
73 |
# Compute sentence embeddings
|
74 |
embedding1 = openai_text_embedding(sentence1) # Flatten the embedding array
|
75 |
embedding2 = openai_text_embedding(sentence2) # Flatten the embedding array
|
76 |
|
77 |
-
# Convert to
|
78 |
embedding1 = np.asarray(embedding1)
|
79 |
embedding2 = np.asarray(embedding2)
|
80 |
|
81 |
# Calculate cosine similarity between the embeddings
|
|
|
82 |
similarity_score = 1 - cosine(embedding1, embedding2)
|
83 |
|
84 |
return similarity_score
|
@@ -88,11 +128,29 @@ def add_dist_score_column(
|
|
88 |
dataframe: pd.DataFrame,
|
89 |
sentence: str,
|
90 |
) -> pd.DataFrame:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
91 |
dataframe["stsopenai"] = dataframe["questions"].apply(
|
92 |
lambda x: calculate_sts_openai_score(str(x), sentence)
|
93 |
)
|
94 |
|
|
|
95 |
sorted_dataframe = dataframe.sort_values(by="stsopenai", ascending=False)
|
|
|
|
|
96 |
return sorted_dataframe.iloc[:5, :]
|
97 |
|
98 |
|
@@ -181,21 +239,75 @@ def llama2_7b_ysa(prompt: str) -> str:
|
|
181 |
|
182 |
|
183 |
def quantize_to_4bit(arr: Union[np.ndarray, Any]) -> np.ndarray:
|
184 |
-
"""
|
185 |
-
|
186 |
-
|
187 |
-
|
188 |
-
|
189 |
-
|
190 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
191 |
|
192 |
|
193 |
def quantized_influence(arr1: np.ndarray, arr2: np.ndarray) -> float:
|
194 |
-
"""
|
195 |
-
|
196 |
-
|
197 |
-
|
198 |
-
|
199 |
-
|
200 |
-
|
201 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
19 |
|
20 |
|
21 |
def merge_dataframes(dataframes: List[pd.DataFrame]) -> pd.DataFrame:
|
22 |
+
"""
|
23 |
+
Merges a list of pandas DataFrames into a single DataFrame.
|
24 |
+
|
25 |
+
This function concatenates the given DataFrames and filters the resulting DataFrame to only include the columns 'context', 'questions', and 'answers'.
|
26 |
+
|
27 |
+
Parameters:
|
28 |
+
dataframes (List[pd.DataFrame]): A list of DataFrames to be merged.
|
29 |
+
|
30 |
+
Returns:
|
31 |
+
pd.DataFrame: The concatenated DataFrame containing only the specified columns.
|
32 |
+
"""
|
33 |
+
|
34 |
# Concatenate the list of dataframes
|
35 |
combined_dataframe = pd.concat(
|
36 |
dataframes, ignore_index=True
|
|
|
75 |
|
76 |
|
77 |
def openai_text_embedding(prompt: str) -> List[float]:
    """
    Retrieve the text embedding for a given prompt using OpenAI's
    "text-embedding-ada-002" model.

    Parameters:
        prompt (str): The text input for which to generate an embedding.

    Returns:
        List[float]: The embedding vector for the input text.
        (The original annotation said ``str``, but the API response field
        ``data[0]["embedding"]`` is a list of floats — callers here feed it
        straight into ``np.asarray``.)
    """
    # Call the OpenAI API and extract the embedding vector from the response.
    response = openai.Embedding.create(
        input=prompt, model="text-embedding-ada-002"
    )
    return response["data"][0]["embedding"]
96 |
|
97 |
|
98 |
def calculate_sts_openai_score(sentence1: str, sentence2: str) -> float:
    """
    Compute the Semantic Textual Similarity (STS) of two sentences using
    OpenAI text embeddings.

    Each sentence is embedded via ``openai_text_embedding`` and the cosine
    similarity between the two vectors is returned; larger values indicate
    more similar sentences.

    Parameters:
        sentence1 (str): First sentence to compare.
        sentence2 (str): Second sentence to compare.

    Returns:
        float: Cosine similarity between the two sentence embeddings.
    """
    # Embed both sentences and convert to NumPy arrays for the distance call.
    vec_a = np.asarray(openai_text_embedding(sentence1))
    vec_b = np.asarray(openai_text_embedding(sentence2))

    # scipy's `cosine` returns a distance, so similarity = 1 - distance.
    return 1 - cosine(vec_a, vec_b)
|
|
|
def add_dist_score_column(
    dataframe: pd.DataFrame,
    sentence: str,
) -> pd.DataFrame:
    """
    Score every question in the DataFrame against a reference sentence and
    return the five best matches.

    An ``stsopenai`` column is added to ``dataframe`` holding the STS score
    between ``sentence`` and each entry of the ``questions`` column; the
    frame is then sorted by that column in descending order.

    Parameters:
        dataframe (pd.DataFrame): DataFrame containing a ``questions`` column.
        sentence (str): Sentence to score each question against.

    Returns:
        pd.DataFrame: The top 5 rows by ``stsopenai`` score, highest first.
    """
    # Score each question against the reference sentence.
    scores = dataframe["questions"].apply(
        lambda question: calculate_sts_openai_score(str(question), sentence)
    )
    dataframe["stsopenai"] = scores

    # Highest-similarity rows first; keep only the best five.
    ranked = dataframe.sort_values(by="stsopenai", ascending=False)
    return ranked.iloc[:5, :]
|
155 |
|
156 |
|
|
|
239 |
|
240 |
|
241 |
def quantize_to_4bit(arr: Union[np.ndarray, Any]) -> np.ndarray:
    """
    Quantize an array to 4-bit integers (0-15) via min-max normalization.

    The input is converted to a numpy ndarray if necessary, normalized to the
    [0, 1] range, scaled to [0, 15], and rounded to the nearest integer
    (NumPy's round-half-to-even).

    Parameters:
        arr (Union[np.ndarray, Any]): An array, or anything convertible to a
            numpy ndarray.

    Returns:
        np.ndarray: Integer ndarray with values in the range 0-15. A constant
            input (max == min) maps entirely to 0.

    Examples:
        >>> quantize_to_4bit([0, 128, 255])
        array([ 0,  8, 15])
    """
    if not isinstance(arr, np.ndarray):  # Accept lists/tuples/scalars too
        arr = np.array(arr)

    arr_min = arr.min()
    arr_max = arr.max()

    # Bug fix: a constant array made (arr_max - arr_min) == 0, so the
    # division produced NaN and the int cast was invalid. Map it to zeros.
    if arr_max == arr_min:
        return np.zeros(arr.shape, dtype=int)

    # Normalize array values to [0, 1]
    normalized_arr = (arr - arr_min) / (arr_max - arr_min)

    # Scale to the 4-bit range [0, 15] and convert to integer
    return np.round(normalized_arr * 15).astype(int)
|
272 |
|
273 |
|
274 |
def quantized_influence(arr1: np.ndarray, arr2: np.ndarray) -> float:
    """
    Compute a weighted influence measure of ``arr1`` on ``arr2`` based on
    their 4-bit quantized representations.

    Both inputs are quantized with ``quantize_to_4bit``. For each distinct
    quantized value of the first array, the mean deviation of the matching
    second-array entries from the global mean is squared and weighted by the
    squared group size. The average of these weighted terms is normalized by
    the standard deviation of the second array's quantized values.

    Parameters:
        arr1 (np.ndarray): First input array (the grouping signal).
        arr2 (np.ndarray): Second input array (the measured signal).

    Returns:
        float: The normalized, weighted influence value.

    Note:
        Relies on the module-level ``quantize_to_4bit`` helper for the 4-bit
        conversion of both inputs.
    """
    q1 = quantize_to_4bit(arr1)  # 4-bit version of the grouping array
    q2 = quantize_to_4bit(arr2)  # 4-bit version of the measured array

    # Global mean of the quantized second array.
    global_mean = np.mean(q2)

    # One term per distinct quantized value of the first array:
    # (mean deviation of the group from the global mean)^2 * (group size)^2.
    weighted_terms = []
    for level in np.unique(q1):
        group = q2[q1 == level]
        deviation = np.mean(group - global_mean)
        weighted_terms.append((deviation ** 2) * len(group) ** 2)

    # Normalize the average weighted term by the spread of the quantized data.
    return np.mean(weighted_terms) / np.std(q2)
|