# Helper funcs for LLM_XXXXX.py
import functools
import json
import os
import threading
import time

import GPUtil
import psutil
import tiktoken
import torch
from langchain_core.output_parsers.format_instructions import JSON_FORMAT_INSTRUCTIONS
from transformers import AutoTokenizer


def remove_colons_and_double_apostrophes(text):
    """Return *text* with every colon and every double-quote character removed."""
    return text.replace(":", "").replace("\"", "")


@functools.lru_cache(maxsize=1)
def _load_mistral_tokenizer():
    """Load the Mistral tokenizer once and reuse it on later calls.

    A failed load raises and is therefore not cached, so the next call retries.
    """
    return AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1")


def count_tokens(string, vendor, model_name):
    """Count the tokens in *string* plus the JSON format instructions.

    Args:
        string: Prompt text; ``JSON_FORMAT_INSTRUCTIONS`` is appended before
            counting.
        vendor: ``'mistral'`` selects the Hugging Face Mistral tokenizer; any
            other value selects the tiktoken encoding for *model_name*.
        model_name: Model name passed to ``tiktoken.encoding_for_model``
            (ignored for the ``'mistral'`` vendor).

    Returns:
        The token count, or ``0`` if tokenization fails for any reason
        (the error is printed; this is a deliberate best-effort).
    """
    full_string = string + JSON_FORMAT_INSTRUCTIONS

    try:
        if vendor == 'mistral':
            return len(_load_mistral_tokenizer().tokenize(full_string))
        encoding = tiktoken.encoding_for_model(model_name)
        return len(encoding.encode(full_string))
    except Exception as e:
        print(f"An error occurred: {e}")
        return 0


class SystemLoadMonitor():
    """Track peak CPU, RAM and (if CUDA is available) GPU usage in a background thread.

    Call ``start_monitoring_usage()`` before the work to be measured and
    ``stop_monitoring_report_usage()`` afterwards; the latter logs the elapsed
    time and the recorded maxima through the supplied logger.
    """

    def __init__(self, logger) -> None:
        """Store *logger* (must provide ``.info``) and initialise the counters."""
        self.monitoring_thread = None
        self.logger = logger
        # Running maxima plus the 'monitoring' flag polled by the worker loop.
        self.gpu_usage = {'max_cpu_usage': 0, 'max_load': 0, 'max_vram_usage': 0, "max_ram_usage": 0, 'monitoring': True}
        self.start_time = None
        self.has_GPU = torch.cuda.is_available()
        self.monitor_interval = 2  # seconds between samples
        # Lets stop_monitoring_report_usage() wake the worker immediately
        # instead of waiting out a full sleep interval.
        self._stop_event = threading.Event()

    def start_monitoring_usage(self):
        """Reset the recorded maxima and start the background sampling thread."""
        # Re-arm the flag and clear old maxima so the monitor can be reused
        # for several measurements; update in place to keep the same dict.
        self.gpu_usage.update({'max_cpu_usage': 0, 'max_load': 0, 'max_vram_usage': 0, "max_ram_usage": 0, 'monitoring': True})
        self._stop_event.clear()
        self.start_time = time.time()
        # Daemon thread: a forgotten stop call must not keep the process alive.
        self.monitoring_thread = threading.Thread(
            target=self.monitor_usage,
            args=(self.monitor_interval,),
            daemon=True,
        )
        self.monitoring_thread.start()

    def monitor_usage(self, interval):
        """Sample usage every *interval* seconds until monitoring is stopped.

        Runs in the worker thread. Updates the maxima stored in
        ``self.gpu_usage``.
        """
        # The first non-blocking cpu_percent() call returns a meaningless 0.0;
        # prime it here, in the sampling thread, so later calls measure the
        # time elapsed since the previous call.
        psutil.cpu_percent(interval=None)

        while self.gpu_usage['monitoring']:
            # GPU monitoring
            if self.has_GPU:
                for gpu in GPUtil.getGPUs():
                    self.gpu_usage['max_load'] = max(self.gpu_usage['max_load'], gpu.load)
                    # Convert memory usage to GB
                    memory_usage_gb = gpu.memoryUsed / 1024.0
                    self.gpu_usage['max_vram_usage'] = max(self.gpu_usage.get('max_vram_usage', 0), memory_usage_gb)

            # RAM monitoring
            ram_usage = psutil.virtual_memory().used / (1024.0 ** 3)  # Get RAM usage in GB
            self.gpu_usage['max_ram_usage'] = max(self.gpu_usage.get('max_ram_usage', 0), ram_usage)

            # Wait for the next sample; returns early when stop is requested.
            self._stop_event.wait(interval)

            # CPU monitoring: utilisation over the wait that just ended.
            cpu_usage = psutil.cpu_percent(interval=None)
            self.gpu_usage['max_cpu_usage'] = max(self.gpu_usage.get('max_cpu_usage', 0), cpu_usage)

    def stop_monitoring_report_usage(self):
        """Stop the sampling thread and log elapsed time and peak usage."""
        if self.monitoring_thread is None or self.start_time is None:
            # Nothing was started, so there is nothing to join or report.
            self.logger.info("System load monitoring was not started; nothing to report")
            return

        # Take the timestamp before joining so thread shutdown time is not
        # counted as inference time.
        elapsed_time = time.time() - self.start_time

        self.gpu_usage['monitoring'] = False
        self._stop_event.set()
        self.monitoring_thread.join()

        self.logger.info(f"Inference Time: {round(elapsed_time,2)} seconds")

        self.logger.info(f"Max CPU Usage: {round(self.gpu_usage['max_cpu_usage'],2)}%")
        self.logger.info(f"Max RAM Usage: {round(self.gpu_usage['max_ram_usage'],2)}GB")

        if self.has_GPU:
            self.logger.info(f"Max GPU Load: {round(self.gpu_usage['max_load']*100,2)}%")
            self.logger.info(f"Max GPU Memory Usage: {round(self.gpu_usage['max_vram_usage'],2)}GB")