Spaces:
Running
Running
Major update. Support for 15 LLMs, World Flora Online taxonomy validation, geolocation, 2 OCR methods, significant UI changes, stability improvements, consistent JSON parsing
e91ac58
# Helper funcs for LLM_XXXXX.py | |
import tiktoken, json, os | |
from langchain_core.output_parsers.format_instructions import JSON_FORMAT_INSTRUCTIONS | |
from transformers import AutoTokenizer | |
import GPUtil | |
import time | |
import psutil | |
import threading | |
import torch | |
def remove_colons_and_double_apostrophes(text):
    """Return ``text`` with every colon and double-quote character removed.

    Despite the name, the second character stripped is the double quote
    (``"``); single apostrophes are left untouched.
    """
    unwanted = (":", "\"")
    cleaned = text
    for char in unwanted:
        # Each pass deletes all occurrences of one unwanted character.
        cleaned = cleaned.replace(char, "")
    return cleaned
# Hugging Face tokenizers keyed by repo id. Loading a tokenizer is expensive
# (disk and possibly network access), so each one is loaded once and reused.
_HF_TOKENIZER_CACHE = {}


def _get_hf_tokenizer(repo_id):
    """Return the tokenizer for ``repo_id``, loading it only on first use."""
    tokenizer = _HF_TOKENIZER_CACHE.get(repo_id)
    if tokenizer is None:
        # A failed load raises before anything is stored, so a later call
        # simply tries again.
        tokenizer = AutoTokenizer.from_pretrained(repo_id)
        _HF_TOKENIZER_CACHE[repo_id] = tokenizer
    return tokenizer


def _count_tiktoken_tokens(full_string, model_name):
    """Count tokens with the tiktoken encoding registered for ``model_name``."""
    encoding = tiktoken.encoding_for_model(model_name)
    return len(encoding.encode(full_string))


def count_tokens(string, vendor, model_name):
    """Count the tokens in ``string`` plus the JSON format instructions.

    ``JSON_FORMAT_INSTRUCTIONS`` is appended to ``string`` before counting.

    Args:
        string: The text to measure.
        vendor: When equal to ``'mistral'`` the count comes from the
            "mistralai/Mistral-7B-v0.1" Hugging Face tokenizer; any other
            value is counted with tiktoken.
        model_name: Model name handed to ``tiktoken.encoding_for_model``
            (not used on the ``'mistral'`` path).

    Returns:
        The token count, or 0 if the tokenizer could not be obtained or the
        count failed for any reason (the error is printed, not raised).
    """
    full_string = string + JSON_FORMAT_INSTRUCTIONS

    try:
        if vendor == 'mistral':
            # Reuse the cached tokenizer instead of reloading it per call.
            tokenizer = _get_hf_tokenizer("mistralai/Mistral-7B-v0.1")
            return len(tokenizer.tokenize(full_string))
        return _count_tiktoken_tokens(full_string, model_name)
    except Exception as e:
        # Best effort: token counting must never abort the caller.
        print(f"An error occurred: {e}")
        return 0
class SystemLoadMonitor():
    """Record peak CPU, RAM and GPU usage on a background thread.

    Call ``start_monitoring_usage()`` before the work to be measured and
    ``stop_monitoring_report_usage()`` afterwards; the latter logs the
    elapsed time and the peaks through the supplied logger. GPU figures are
    collected only when ``torch.cuda.is_available()`` was true at
    construction time.

    The same instance can be started and stopped repeatedly. Peak values in
    ``gpu_usage`` are not reset between runs.
    """

    def __init__(self, logger) -> None:
        """Store the logger and initialise the usage record.

        Args:
            logger: Object whose ``info(message)`` method receives the report.
        """
        self.monitoring_thread = None
        self.logger = logger
        # Running maxima plus the 'monitoring' flag that keeps the sampler alive.
        self.gpu_usage = {'max_cpu_usage': 0, 'max_load': 0, 'max_vram_usage': 0, "max_ram_usage": 0, 'monitoring': True}
        self.start_time = None
        self.has_GPU = torch.cuda.is_available()
        # Seconds between two samples.
        self.monitor_interval = 2
        # Set on stop so the sampler wakes up immediately instead of
        # finishing a full sleep of `monitor_interval` seconds.
        self._stop_event = threading.Event()

    def start_monitoring_usage(self):
        """Start the timer and launch the background sampling thread."""
        # Re-arm both stop signals; without this a second start on the same
        # instance would spawn a thread that exits without sampling.
        self._stop_event.clear()
        self.gpu_usage['monitoring'] = True
        self.start_time = time.time()
        self.monitoring_thread = threading.Thread(target=self.monitor_usage, args=(self.monitor_interval,))
        self.monitoring_thread.start()

    def monitor_usage(self, interval):
        """Sample usage every ``interval`` seconds until monitoring is stopped.

        Runs on the background thread and updates the maxima in
        ``self.gpu_usage`` in place.

        Args:
            interval: Seconds to wait between two samples.
        """
        while self.gpu_usage['monitoring']:
            # GPU monitoring
            if self.has_GPU:
                for gpu in GPUtil.getGPUs():
                    self.gpu_usage['max_load'] = max(self.gpu_usage['max_load'], gpu.load)
                    # Convert memory usage to GB
                    memory_usage_gb = gpu.memoryUsed / 1024.0
                    self.gpu_usage['max_vram_usage'] = max(self.gpu_usage.get('max_vram_usage', 0), memory_usage_gb)

            # RAM monitoring
            ram_usage = psutil.virtual_memory().used / (1024.0 ** 3)  # Get RAM usage in GB
            self.gpu_usage['max_ram_usage'] = max(self.gpu_usage.get('max_ram_usage', 0), ram_usage)

            # CPU monitoring: interval=None is non-blocking and reports usage
            # since the previous call.
            cpu_usage = psutil.cpu_percent(interval=None)
            self.gpu_usage['max_cpu_usage'] = max(self.gpu_usage.get('max_cpu_usage', 0), cpu_usage)

            # Behaves like time.sleep(interval) but returns as soon as a stop
            # is requested.
            self._stop_event.wait(interval)

    def stop_monitoring_report_usage(self):
        """Stop the sampling thread and log elapsed time and peak usage.

        If monitoring was never started, a single informational message is
        logged and nothing else happens.
        """
        if self.monitoring_thread is None or self.start_time is None:
            self.logger.info("System load monitoring was not started; nothing to report")
            return

        # Take the timestamp before stopping the thread so that waiting for
        # the sampler to finish does not inflate the reported time.
        elapsed_time = time.time() - self.start_time

        self.gpu_usage['monitoring'] = False
        self._stop_event.set()
        self.monitoring_thread.join()

        self.logger.info(f"Inference Time: {round(elapsed_time,2)} seconds")

        self.logger.info(f"Max CPU Usage: {round(self.gpu_usage['max_cpu_usage'],2)}%")
        self.logger.info(f"Max RAM Usage: {round(self.gpu_usage['max_ram_usage'],2)}GB")
        if self.has_GPU:
            # 'max_load' is multiplied by 100 to print it as a percentage.
            self.logger.info(f"Max GPU Load: {round(self.gpu_usage['max_load']*100,2)}%")
            self.logger.info(f"Max GPU Memory Usage: {round(self.gpu_usage['max_vram_usage'],2)}GB")