File size: 3,216 Bytes
d06496c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
"""
This file contains the code for the embeddings. 
    Tested models as follows:
        - Ransaka/SinhalaRoberta
        - keshan/SinhalaBERTo
This file used Ransaka/SinhalaRoberta model for the embeddings.

You can download the model from huggingface.co
    - https://huggingface.co/Ransaka/SinhalaRoberta
    - https://huggingface.co/keshan/SinhalaBERTo

You can download dataset from kaggle.com
    - https://www.kaggle.com/datasets/ransakaravihara/hiru-news-set3

"""
import random
import numpy as np
import pandas as pd

import torch
from sentence_transformers import SentenceTransformer, models,util

model_id = "Ransaka/SinhalaRoberta"

def load_and_process_data(file_path: str) -> list:
    """Load the news CSV at *file_path* and return a shuffled list of
    cleaned headlines.

    Only the ``Headline`` column is used for the embeddings; rows with a
    missing headline are dropped.

    Args:
        file_path: path to a CSV file with a ``Headline`` column.

    Returns:
        A randomly shuffled list of cleaned headline strings.
    """

    def processor(text: str) -> str:
        """Strip the most common noise tokens observed in the dataset."""
        # Zero-width joiner and non-breaking space are scraping artifacts;
        # the Sinhala tokens are boilerplate media labels, not headline
        # content (presumably "photos"/"video" captions — confirm against
        # the dataset).
        for junk in ("\u200d", "Read More..", "ඡායාරූප", "\xa0", "වීඩියෝ", "()"):
            text = text.replace(junk, "")
        return text

    df = pd.read_csv(file_path)
    # Only drop rows whose headline is missing: the previous blanket
    # dropna() also discarded rows with NaN in unrelated columns, losing
    # perfectly good headlines.
    df = df.dropna(subset=["Headline"])
    df["Headline"] = df["Headline"].apply(processor)

    # Only headlines are used for the embeddings.
    sentences = df["Headline"].tolist()
    random.shuffle(sentences)
    return sentences

def load_model(model_id: str) -> SentenceTransformer:
    """Build a ``SentenceTransformer`` from a Hugging Face model id.

    Wraps the raw transformer with a pooling layer so token-level
    outputs are reduced to one fixed-size sentence vector.
    """
    # NOTE(review): 514 matches RoBERTa's max_position_embeddings, but the
    # usable sequence length for RoBERTa models is typically 512 — confirm
    # against the chosen checkpoint.
    transformer = models.Transformer(model_id, max_seq_length=514)
    pooling = models.Pooling(transformer.get_word_embedding_dimension())
    return SentenceTransformer(modules=[transformer, pooling])

def get_embeddings(model: SentenceTransformer, sentences: list) -> list:
    """Encode *sentences* into embedding vectors with *model*.

    Thin wrapper around ``model.encode`` kept for a uniform pipeline API.
    """
    embeddings = model.encode(sentences)
    return embeddings

def save_embeddings(embeddings: list, file_path: str):
    """Persist *embeddings* to *file_path* in NumPy ``.npy`` format.

    Note: ``np.save`` appends the ``.npy`` suffix when *file_path*
    does not already end with it.
    """
    np.save(file_path, embeddings)

def load_embeddings(file_path: str) -> list:
    """Read embeddings previously written with ``save_embeddings``.

    Returns the stored array (a ``numpy.ndarray``; the ``list``
    annotation is kept for interface compatibility).
    """
    data = np.load(file_path)
    return data

def get_similar(model: "SentenceTransformer", embeddings: list, query: str, top_k: int = 5) -> list:
    """Return the *top_k* most cosine-similar embeddings to *query*.

    Args:
        model: encoder used to embed the query text.
        embeddings: precomputed sentence embeddings, one row per sentence
            (array-like; converted to a float32 tensor).
        query: text to search for.
        top_k: maximum number of results; clamped to the number of
            embeddings so small corpora do not raise.

    Returns:
        A ``torch.topk`` result with ``values`` (cosine scores) and
        ``indices`` (row indices into *embeddings*) — note: indices, not
        the sentence strings themselves.
    """
    query_embedding = torch.as_tensor(
        np.asarray(model.encode([query])[0]), dtype=torch.float32
    )
    corpus = torch.as_tensor(np.asarray(embeddings), dtype=torch.float32)
    # Cosine similarity of the query against every corpus row; equivalent
    # to util.pytorch_cos_sim(query_embedding, embeddings)[0].
    cos_scores = torch.nn.functional.cosine_similarity(
        query_embedding.unsqueeze(0), corpus
    )
    # Clamp k: torch.topk raises when k exceeds the number of candidates.
    top_results = torch.topk(cos_scores, k=min(top_k, cos_scores.shape[0]))
    return top_results

if __name__ == "__main__":  
    file_path = r"data\top_cluster_dataset.csv"

    #load and process data
    sentences = load_and_process_data(file_path)
    model = load_model(model_id)

    #get embeddings
    embeddings = get_embeddings(model, sentences)
    save_embeddings(embeddings, r"data\embeddings.npy")