CyberNative AI for CyberSecurity | Q/A Evaluation | Colibri_8b_v0.1 scored 74/100!
#1
by
CyberNative
- opened
We tested Colibri_8b_v0.1 on the Cybersecurity Evaluation Dataset, which measures the performance of cybersecurity models.
Correct (74.0): 370 | Incorrect: 130
We excluded the evaluation dataset from training to prevent contamination. However, some contamination is still possible because of the way our training dataset was created.
eval_colibri_exl2.py
# Module-level setup: seeds the RNG, loads the model and builds the shared
# `generator` used by ai_complete(). All of this runs at import time.
import time
import random
# Seed Python's global RNG from the wall clock so each run starts differently.
random.seed(time.time())
import torch
# Report the installed PyTorch version when the module is imported.
print(f"PyTorch version: {torch.__version__}")
from exllamav2 import(
ExLlamaV2,
ExLlamaV2Config,
ExLlamaV2Cache,
ExLlamaV2Tokenizer,
)
from exllamav2.generator import (
ExLlamaV2BaseGenerator,
ExLlamaV2Sampler
)
# CyberNative-AI/Colibri_8b_v0.1
# Directory handed to the config as `model_dir`.
og_model_dir = "Colibri_8b_v0.1"
# Token id that ai_complete() passes to the generator as `stop_token`.
# NOTE(review): presumably the id of the ChatML "<|im_end|>" token added on top
# of the base vocabulary — confirm against the model's tokenizer files.
stop_tkn_id = 128256
config = ExLlamaV2Config()
config.debug_mode = True
config.model_dir = og_model_dir
# NOTE(review): presumably reads the model's own configuration from model_dir — confirm.
config.prepare()
# Assigned after prepare(); NOTE(review): presumably so it overrides the
# sequence length prepare() picked up from the model — confirm.
config.max_seq_len = 8192
model = ExLlamaV2(config)
# The cache is created lazily and then passed to load_autosplit(), which loads the model.
# NOTE(review): presumably the cache is allocated while the weights are split
# across the available devices — confirm.
cache = ExLlamaV2Cache(model, lazy = True)
model.load_autosplit(cache)
tokenizer = ExLlamaV2Tokenizer(config)
# Initialize generator
# Shared by every ai_complete() call in this module.
generator = ExLlamaV2BaseGenerator(model, cache, tokenizer)
def ai_complete(system_prompt, user_message, assistant_pre_message="", max_gen=4096, temperature=0.3):
    """Generate a completion for one ChatML-formatted exchange.

    Builds a prompt made of a system turn, a user turn and an open assistant
    turn, samples from the module-level ``generator`` and returns the text
    with the echoed prompt removed.

    Args:
        system_prompt: Text of the system turn; surrounding whitespace is
            stripped before use.
        user_message: Text of the user turn; surrounding whitespace is
            stripped before use.
        assistant_pre_message: Text appended verbatim after the assistant
            header so the model continues from it.
        max_gen: Maximum number of tokens to generate.
        temperature: Sampling temperature. ``top_p`` is fixed at 0.7.

    Returns:
        The generated text with the first occurrence of the prompt removed
        and surrounding whitespace stripped.
    """
    settings = ExLlamaV2Sampler.Settings()
    settings.temperature = temperature
    settings.top_p = 0.7

    time_begin = time.time()
    system_prompt = system_prompt.strip()
    user_message = user_message.strip()
    prompt = f"<|im_start|>system\n{system_prompt}<|im_end|>\n<|im_start|>user\n{user_message}<|im_end|>\n<|im_start|>assistant\n" + assistant_pre_message

    # Warm the generator up once per process instead of on every call; the
    # evaluation loop calls this function once per question.
    if not getattr(ai_complete, "_warmed_up", False):
        generator.warmup()
        ai_complete._warmed_up = True

    # Fresh seed per call, derived from the clock and two random draws.
    random_seed = time_begin * random.randint(1, 1000) / random.randint(1, 1000)
    og_output = generator.generate_simple(
        prompt,
        settings,
        num_tokens=max_gen,
        seed=random_seed,
        stop_token=stop_tkn_id,
        decode_special_tokens=True,
        encode_special_tokens=True,
    )
    # Remove only the first occurrence of the prompt (the echo) so that a
    # completion which happens to repeat the prompt text is left intact.
    return og_output.replace(prompt, "", 1).strip()
QA_EXTRACTOR_PROMPT = """You are Colibri, an advanced cybersecurity AI assistant developed by CyberNative AI. Answer with correct answer option only, do not say anything else."""


def Eval_QA(question):
    """Ask the model one evaluation question and return its chosen option.

    The assistant turn is pre-filled with "The correct answer is:" and
    generation is capped at 5 tokens at temperature 0.1, so the reply is
    expected to begin with the option label.

    Args:
        question: The question text sent as the user message.

    Returns:
        The first whitespace-delimited token of the reply (e.g. ``"B)"``), or
        an empty string when the reply is empty.
    """
    assistant_pre = "The correct answer is:"
    reply = ai_complete(QA_EXTRACTOR_PROMPT, question, assistant_pre, max_gen=5, temperature=0.1)
    # split() with no argument breaks on any whitespace, so a label followed
    # by a newline or tab is isolated just like one followed by a space.
    tokens = reply.split()
    return tokens[0] if tokens else ""
run_eval_cybersec_colibri.py
# https://huggingface.co/datasets/CyberNative/CyberSecurityEval
eval_dataset_file = "cybersec_qa_eval_500_pairs.jsonl"

import jsonlines

# Read every question/answer record from the JSONL evaluation file.
with jsonlines.open(eval_dataset_file) as reader:
    qa_pairs = list(reader)
print(len(qa_pairs))

# Imported only after the dataset has been read: importing this module loads
# the model.
import eval_colibri_exl2


def _normalize(text):
    """Drop '.' and ')' characters, lower-case and trim the text for comparison."""
    return text.replace(".", "").replace(")", "").lower().strip()


SCORE_CORRECT = 0
SCORE_INCORRECT = 0
for pair in qa_pairs:
    print("===")
    question = pair["question"]
    # Reference answers may carry the same lead-in the model is primed with.
    answer = pair["answer"].replace("The correct answer is: ", "")
    print(f"Question: {question}")
    colibri_answer = eval_colibri_exl2.Eval_QA(question)
    print(f"OG Answer: {answer} | Colibri Answer: {colibri_answer}")

    # replace . and ) from answers
    is_correct = _normalize(answer) == _normalize(colibri_answer)
    if is_correct:
        print("### Correct")
        SCORE_CORRECT += 1
    else:
        print("### Incorrect")
        SCORE_INCORRECT += 1

    # Running tally; the last line printed is the final score.
    total_scored = SCORE_CORRECT + SCORE_INCORRECT
    correct_percent = (SCORE_CORRECT / total_scored) * 100
    print(f"Correct ({correct_percent}): {SCORE_CORRECT} | Incorrect: {SCORE_INCORRECT}")