I wrote a script to pretrain a model using this on an Alpaca-formatted dataset like the one below. It takes way too much RAM for me to run, though.
https://huggingface.co/datasets/Replete-AI/Everything_Instruct
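As far as I can tell, most of the memory goes into the per-token hypervector table that CustomDataset builds (on top of the whole dataset being read into RAM up front). A rough back-of-envelope sketch; the vocabulary size is purely a guess, the real number depends on the dataset:

# Rough estimate of the token_hvs table size (vocab_size is made up).
hd_dim = 5000
vocab_size = 1_000_000                # hypothetical whitespace vocabulary size
bytes_per_hv = hd_dim * 8             # np.random.choice([-1, 1], ...) returns int64
total_gb = vocab_size * bytes_per_hv / 1e9
print(f"~{total_gb:.0f} GB just for the hypervector table")  # ~40 GB with these guesses

The script itself: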
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from tqdm import tqdm
import numpy as np
import json
class HDComputing:
    """Minimal hyperdimensional computing helpers (bipolar hypervectors)."""
    def __init__(self, dim):
        self.dim = dim

    def random_hv(self):
        # Random bipolar hypervector; np.random.choice returns int64 here.
        return np.random.choice([-1, 1], size=self.dim)

    def bind(self, hv1, hv2):
        # Binding is element-wise multiplication.
        return hv1 * hv2

    def bundle(self, hvs):
        # Bundling is a majority vote: sum the hypervectors, then take the sign.
        return np.sign(np.sum(hvs, axis=0))
class HDCNNClassifier(nn.Module):
    """Small MLP head that consumes a bundled hypervector."""
    def __init__(self, dim, num_classes):
        super().__init__()
        self.fc1 = nn.Linear(dim, 512)
        self.activation = nn.ReLU()
        self.dropout = nn.Dropout(p=0.5)
        self.fc2 = nn.Linear(512, num_classes)

    def forward(self, x):
        out = self.fc1(x)
        out = self.activation(out)
        out = self.dropout(out)
        out = self.fc2(out)
        return out
class CustomDataset(Dataset):
    def __init__(self, data, hd_computer, max_seq_len):
        self.data = data
        self.hd_computer = hd_computer
        self.max_seq_len = max_seq_len
        self.vocab = self.build_vocab()
        # One random hypervector per vocabulary token; with a large vocab this
        # table dominates memory (dim int64 values per token).
        self.token_hvs = {token: self.hd_computer.random_hv() for token in self.vocab}

    def build_vocab(self):
        # Whitespace vocabulary over instruction, input and output of every example.
        vocab = set()
        for item in self.data:
            text = f"{item['instruction']} {item['input']} {item['output']}"
            vocab.update(text.split())
        return list(vocab)

    def encode_text(self, text):
        # Truncate to max_seq_len tokens, look up each token's hypervector
        # (unknown tokens get a fresh random one) and bundle them into a single vector.
        tokens = text.split()[:self.max_seq_len]
        hvs = [self.token_hvs.get(token, self.hd_computer.random_hv()) for token in tokens]
        return self.hd_computer.bundle(hvs)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data[idx]
        text = f"{item['instruction']} {item['input']} {item['output']}"
        encoded = self.encode_text(text)
        # Every example gets the dummy label 0.
        return torch.tensor(encoded, dtype=torch.float32), 0
class ProgressBar:
    def __init__(self, total_steps):
        self.pbar = tqdm(total=total_steps, desc="Training Progress", unit="step")

    def update(self):
        self.pbar.update(1)

    def close(self):
        self.pbar.close()
if __name__ == "__main__":
    # Load the dataset (one JSON object per line); the whole file ends up in RAM.
    dataset_path = "E:/DATASETS/Everything-Instruct.json"
    dataset = []
    with open(dataset_path, 'r', encoding='utf-8') as f:
        for line in f:
            dataset.append(json.loads(line.strip()))

    hd_dim = 5000
    hd_computer = HDComputing(hd_dim)
    custom_dataset = CustomDataset(dataset, hd_computer, max_seq_len=8192)

    batch_size = 32
    dataloader = DataLoader(custom_dataset, batch_size=batch_size, shuffle=True)

    num_classes = 1
    model = HDCNNClassifier(hd_dim, num_classes)

    num_epochs = 5
    learning_rate = 2e-4
    optimizer = AdamW(model.parameters(), lr=learning_rate)
    criterion = nn.MSELoss()

    # One step per batch, including the final partial batch of each epoch.
    total_steps = len(dataloader) * num_epochs
    progress_bar = ProgressBar(total_steps)

    model.train()
    for epoch in range(num_epochs):
        for batch in dataloader:
            inputs, labels = batch
            outputs = model(inputs)
            loss = criterion(outputs, labels.float().unsqueeze(1))
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            progress_bar.update()
    progress_bar.close()

    torch.save(model.state_dict(), "E:/models/HD_model.pth")
    print("Training completed and model saved.")