ColBERTv2-mmarco-de-0.1

This is a German ColBERT implementation based on colbert-ir/colbertv2.0.

As I'm limited on GPU resources, training did not run all the way through; "only" 10 checkpoints were trained.
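
A minimal usage sketch with RAGatouille, assuming the checkpoint is published as domci/ColBERTv2-mmarco-de-0.1 (the passages and query below are illustrative, not from the training data):

from ragatouille import RAGPretrainedModel

# Load the trained checkpoint from the Hub.
RAG = RAGPretrainedModel.from_pretrained("domci/ColBERTv2-mmarco-de-0.1")

# Build a small ColBERT index over a German collection.
RAG.index(
    collection=[
        "Berlin ist die Hauptstadt von Deutschland.",
        "Der Rhein fließt durch mehrere europäische Länder.",
    ],
    index_name="demo-de",
)

# Retrieve the best-matching passages for a German query.
print(RAG.search(query="Was ist die Hauptstadt von Deutschland?", k=2))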

Code

My code is probably a mess, but YOLO!

Data prep

from datasets import load_dataset
from ragatouille import RAGTrainer
from tqdm.notebook import tqdm
from concurrent.futures import ThreadPoolExecutor
import concurrent.futures

SAMPLE_SIZE = -1  # -1 = use the full mMARCO training set



def int_to_string(number):
    """Render a sample size as a short label for the model name, e.g. 1000000 -> "1M"; negative means the full set."""
    if number < 0:
        return "full"
    elif number < 1000:
        return str(number)
    elif number < 1000000:
        return f"{number // 1000}K"
    else:
        return f"{number // 1000000}M"

def process_chunk(chunk):
    # Turn a column-wise chunk into [query, positive, negative] triplets.
    return [list(item) for item in zip(chunk["query"], chunk["positive"], chunk["negative"])]

def chunked_iterable(iterable, chunk_size):
    """Yield successive chunks from iterable."""
    for i in range(0, len(iterable), chunk_size):
        yield iterable[i:i + chunk_size]

def process_dataset_concurrently(dataset, chunksize=1000):
    with ThreadPoolExecutor() as executor:
        # Wrap the dataset with tqdm for real-time updates
        wrapped_dataset = tqdm(chunked_iterable(dataset, chunksize), total=(len(dataset) + chunksize - 1) // chunksize)
        # Submit each chunk to the executor
        futures = [executor.submit(process_chunk, chunk) for chunk in wrapped_dataset]
        results = []
        for future in concurrent.futures.as_completed(futures):
            results.extend(future.result())
        return results

dataset = load_dataset('unicamp-dl/mmarco', 'german', trust_remote_code=True)


# Shuffle the dataset and seed for reproducibility if needed
shuffled_dataset = dataset['train'].shuffle(seed=42)


if SAMPLE_SIZE > 0:
    sampled_dataset = shuffled_dataset.select(range(SAMPLE_SIZE))
else:
    sampled_dataset = shuffled_dataset


triplets = process_dataset_concurrently(sampled_dataset, chunksize=10000)
trainer = RAGTrainer(model_name=f"ColBERT-mmacro-de-{int_to_string(SAMPLE_SIZE)}", pretrained_model_name="dbmdz/bert-base-german-cased", language_code="de",)
trainer.prepare_training_data(raw_data=triplets, mine_hard_negatives=False)
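
For reference, prepare_training_data expects raw_data as a list of [query, positive passage, negative passage] triplets, which is exactly what process_chunk produces. A toy example (the texts are made up, not taken from mMARCO):

toy_triplets = [
    [
        "wie hoch ist der eiffelturm",                        # query
        "Der Eiffelturm ist rund 330 Meter hoch.",            # positive passage
        "Der Louvre ist das meistbesuchte Museum in Paris.",  # negative passage
    ],
]
trainer.prepare_training_data(raw_data=toy_triplets, mine_hard_negatives=False)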

Training

from ragatouille import RAGTrainer
from pathlib import Path


def int_to_string(number):
    """Render a sample size as a short label for the model name, e.g. 1000000 -> "1M"."""
    if number < 1000:
        return str(number)
    elif number < 1000000:
        return f"{number // 1000}K"
    else:
        return f"{number // 1000000}M"



SAMPLE_SIZE = 1000000  # only used to label the model name ("1M")


trainer = RAGTrainer(model_name=f"ColBERT-mmacro-de-{int_to_string(SAMPLE_SIZE)}", pretrained_model_name="dbmdz/bert-base-german-cased", language_code="de",)

trainer.data_dir = Path("/kaggle/input/mmarco-de-10m")  # reuse the triples prepared in the data prep step (uploaded as a Kaggle dataset)

trainer.train(batch_size=32,
    nbits=4, # How many bits will the trained model use when compressing indexes
    maxsteps=500000, # Maximum steps hard stop
    use_ib_negatives=True, # Use in-batch negative to calculate loss
    dim=128, # How many dimensions per embedding. 128 is the default and works well.
    learning_rate=5e-6, # Learning rate, small values ([3e-6,3e-5] work best if the base model is BERT-like, 5e-6 is often the sweet spot)
    doc_maxlen=256, # Maximum document length. Because of how ColBERT works, smaller chunks (128-256) work very well.
    use_relu=False, # Disable ReLU -- doesn't improve performance
    warmup_steps="auto", # Defaults to 10%
    )
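
For intuition on the dim and doc_maxlen settings: ColBERT keeps one embedding per token and scores a query against a document with a MaxSim sum (late interaction). A toy sketch with random tensors standing in for real, L2-normalised embeddings:

import torch

query_emb = torch.randn(32, 128)   # 32 query tokens, dim=128 per embedding
doc_emb = torch.randn(256, 128)    # up to doc_maxlen=256 document tokens

# For each query token, take its best-matching document token, then sum over query tokens.
score = (query_emb @ doc_emb.T).max(dim=1).values.sum()
print(score)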