# NOTE(review): the original file began with non-code extraction artifacts
# ("Spaces:" / "Runtime error"); replaced with this comment so the module parses.
import os
from argparse import ArgumentParser
from typing import List

import numpy as np
import pandas as pd
import torch
from sentence_transformers import SentenceTransformer, InputExample, losses
from sklearn.metrics.pairwise import cosine_similarity
from torch.utils.data import DataLoader
def load_data(dataset_dir: str, data_split: str, list_of_langs: List[str]) -> List[InputExample]:
    """Load sentence-pair similarity examples for the given languages.

    Expects one CSV per language at ``{dataset_dir}/{lang}/{lang}_{data_split}.csv``
    with columns ``sentence1``, ``sentence2`` and a numeric ``label``.
    Languages whose file is missing are skipped with a warning.

    Returns a flat list of InputExample objects across all languages.
    """
    data_list: List[InputExample] = []
    for lang in list_of_langs:
        data_path = os.path.join(dataset_dir, lang, f"{lang}_{data_split}.csv")
        if not os.path.exists(data_path):
            print(f"{data_split} data for {lang} does not exist")
            continue
        df = pd.read_csv(data_path)
        # Iterate the columns in lockstep instead of indexing three parallel lists.
        for sent1, sent2, score in zip(df["sentence1"], df["sentence2"], df["label"]):
            data_list.append(InputExample(texts=[sent1, sent2], label=float(score)))
    return data_list
# --- Configuration and fine-tuning ------------------------------------------
dataset_dir = "data"
list_of_langs = ["eng"]

train_examples = load_data(dataset_dir, "train", list_of_langs)
test_examples = load_data(dataset_dir, "test", list_of_langs)

# Only the training split needs a DataLoader; evaluation below encodes
# sentences directly, so no test DataLoader is built (the original created
# one that was never used).
train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=16)

device = "cuda" if torch.cuda.is_available() else "cpu"
print(device)

model = SentenceTransformer("sentence-transformers/LaBSE", device=device)
loss_function = losses.CosineSimilarityLoss(model=model)

model.fit(
    train_objectives=[(train_dataloader, loss_function)],
    epochs=10,
    warmup_steps=100,
    output_path="semrel_baselines/models/finetuned_esp_labse",
)
def test_model(test_examples): | |
sentence_1s = [ex.texts[0] for ex in test_examples] | |
sentence_2s = [ex.texts[1] for ex in test_examples] | |
scores = [ex.label for ex in test_examples] | |
# Calculate embeddings | |
embeddings1 = model.encode(sentence_1s, convert_to_tensor=True) | |
embeddings2 = model.encode(sentence_2s, convert_to_tensor=True) | |
# Calculate cosine similarity | |
cos_sim = cosine_similarity(embeddings1.cpu(), embeddings2.cpu()) | |
cos_sim_scores = [cos_sim[i, i] for i in range(len(cos_sim))] | |
spearman_corr = np.corrcoef(scores, cos_sim_scores)[0, 1] | |
return spearman_corr | |
# Report correlation on both splits. The values are correlations in [-1, 1],
# not percentages, so the original's trailing '%' was misleading and is dropped.
train_corr = test_model(train_examples)
test_corr = test_model(test_examples)
print(f"Train Spearman correlation: {train_corr:.2f}, Test Spearman correlation: {test_corr:.2f}")
# Save the predictions to submission.csv
sentence_1s = [ex.texts[0] for ex in test_examples]
sentence_2s = [ex.texts[1] for ex in test_examples]
embeddings1 = model.encode(sentence_1s, convert_to_tensor=True).cpu().numpy()
embeddings2 = model.encode(sentence_2s, convert_to_tensor=True).cpu().numpy()

# Row-wise cosine similarity of matched pairs; avoids building the full
# N x N similarity matrix of which only the diagonal was used. (The gold
# labels were also gathered here originally but never written out.)
pair_norms = np.linalg.norm(embeddings1, axis=1) * np.linalg.norm(embeddings2, axis=1)
cos_sim_scores = ((embeddings1 * embeddings2).sum(axis=1) / pair_norms).tolist()

results_df = pd.DataFrame({
    "sentence1": sentence_1s,
    "sentence2": sentence_2s,
    "label": cos_sim_scores,
})
result_path = "submission.csv"
results_df.to_csv(result_path, index=False)
print(f"Results saved to {result_path}")