Spaces:

PharynxAI
/

LLM_FinetuneR

Paused

App Files Files Community

Diksha2001 committed on Dec 2, 2024

Commit

12b2deb

verified ·

1 Parent(s): ec45c15

Delete llm_evaluation.py

Browse files

Files changed (1) hide show

llm_evaluation.py +0 -149

llm_evaluation.py DELETED Viewed

@@ -1,149 +0,0 @@
-import json
-from sentence_transformers import SentenceTransformer, util
-import nltk
-from openai import OpenAI
-import os
-from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
-import time
-import asyncio
-import logging
-import sys
# Configure logging
logging.basicConfig(level=logging.INFO)
# Download the tokenizer data needed by nltk.word_tokenize.
# Older NLTK releases load the 'punkt' models; recent releases look up
# 'punkt_tab' instead, so both are requested. Requesting an identifier the
# installed NLTK does not know only reports an error and returns False.
nltk.download('punkt')
nltk.download('punkt_tab')
def load_input_data():
    """Load the job description passed as a JSON string in ``sys.argv[1]``.

    Returns:
        The decoded JSON value.

    Exits:
        Terminates the process with status 1 (after logging an error) when
        the argument is missing or is not valid JSON.
    """
    # Guard against a missing argument so the failure is reported the same
    # way as malformed JSON instead of surfacing as a raw IndexError.
    if len(sys.argv) < 2:
        logging.error("Missing JSON input: expected it as the first command line argument")
        sys.exit(1)
    try:
        return json.loads(sys.argv[1])
    except json.JSONDecodeError as e:
        logging.error(f"Failed to decode JSON input: {e}")
        sys.exit(1)
def initialize_openai_client(api_key, base_url):
    """Build an ``OpenAI`` client for the given credentials and endpoint.

    Args:
        api_key: Key forwarded to the ``OpenAI`` constructor.
        base_url: Endpoint URL forwarded to the ``OpenAI`` constructor.

    Returns:
        The constructed ``OpenAI`` client instance.
    """
    client_options = {"api_key": api_key, "base_url": base_url}
    return OpenAI(**client_options)
def load_model():
    """Instantiate the sentence-embedding model used for semantic scoring.

    Returns:
        A ``SentenceTransformer`` created from the 'all-MiniLM-L6-v2' name.
    """
    return SentenceTransformer('all-MiniLM-L6-v2')
def evaluate_semantic_similarity(expected_response, model_response, semantic_model):
    """Return the cosine similarity between the embeddings of two texts.

    Args:
        expected_response: Reference text.
        model_response: Text produced by the model under evaluation.
        semantic_model: Object whose ``encode(text, convert_to_tensor=True)``
            yields the embedding for a text.

    Returns:
        The similarity as a plain Python number (``.item()`` of the result
        of ``util.pytorch_cos_sim``).
    """
    # Encode the reference first, then the candidate, one text per call.
    reference_vec, candidate_vec = (
        semantic_model.encode(text, convert_to_tensor=True)
        for text in (expected_response, model_response)
    )
    return util.pytorch_cos_sim(reference_vec, candidate_vec).item()
def evaluate_bleu(expected_response, model_response):
    """Compute a smoothed sentence-level BLEU score for a model response.

    Both texts are lower-cased and tokenized with ``nltk.word_tokenize``; the
    expected response is used as the single reference.

    Args:
        expected_response: Reference text.
        model_response: Candidate text to score against the reference.

    Returns:
        The value returned by NLTK's ``sentence_bleu`` using smoothing
        ``method1``.
    """
    reference, hypothesis = (
        nltk.word_tokenize(text.lower())
        for text in (expected_response, model_response)
    )
    bleu = nltk.translate.bleu_score
    return bleu.sentence_bleu(
        [reference],
        hypothesis,
        smoothing_function=bleu.SmoothingFunction().method1,
    )
async def create_with_retries(client, **kwargs):
    """Call ``client.chat.completions.create`` and retry when it fails.

    Args:
        client: Object exposing ``chat.completions.create``. The call may
            return either the response directly or an awaitable resolving
            to it; both are handled.
        **kwargs: Forwarded to ``create``. A ``timeout`` supplied by the
            caller is kept; otherwise a default of 60 seconds is used.

    Returns:
        The response returned by ``create``.

    Raises:
        Exception: After 3 failed attempts, chained to the last error.
    """
    import inspect  # local import: only needed to detect awaitable results

    max_retries = 3  # Retry up to 3 times
    retry_delay = 5  # Retry delay in seconds
    # Apply the default timeout only when the caller did not pass one.
    # Passing it as a second explicit keyword next to **kwargs raises
    # TypeError ("multiple values for keyword argument 'timeout'").
    kwargs.setdefault("timeout", 60)
    for attempt in range(1, max_retries + 1):
        try:
            response = client.chat.completions.create(**kwargs)
            # Async clients hand back an awaitable; sync clients return the
            # finished response, which must not be awaited.
            if inspect.isawaitable(response):
                response = await response
            return response
        except Exception as e:  # Any failure is treated as retryable
            if attempt < max_retries:
                logging.error(f"Attempt {attempt}/{max_retries} failed: {e}. Retrying...")
                await asyncio.sleep(retry_delay)  # Wait before retrying
            else:
                logging.error(f"API request failed after {max_retries} attempts: {e}")
                # Request payload is logged at debug level to aid diagnosis
                logging.debug(f"Request data: {kwargs}")
                raise Exception("API request failed after retries") from e
async def evaluate_model(data, model_name, client, semantic_model):
    """Query the model for every dataset entry and report average scores.

    Args:
        data: Iterable of mappings, each with a ``'prompt'`` and a
            ``'response'`` (expected answer) key.
        model_name: Model identifier; requested as ``PharynxAI/<model_name>``.
        client: Client object passed through to ``create_with_retries``.
        semantic_model: Embedding model passed to
            ``evaluate_semantic_similarity``.

    Returns:
        A dict with ``'average_semantic_score'`` and ``'average_bleu_score'``.
        Both are 0 when no entry could be scored.

    Side effects:
        Prints the averages and a JSON dump of the result dict to stdout,
        and logs the averages.
    """
    semantic_scores = []
    bleu_scores = []
    for entry in data:
        prompt = entry['prompt']
        expected_response = entry['response']
        # Create a chat completion using OpenAI API
        # NOTE(review): two system messages are sent, the first one blank;
        # this looks unintentional. Confirm before removing, since it
        # changes the request seen by the model.
        response = await create_with_retries(
            client,
            model=f"PharynxAI/{model_name}",
            messages=[
                {"role": "system", "content": " "},
                {"role": "system", "content": "You are a helpful assistant."},
                {"role": "user", "content": prompt}
            ],
            temperature=0.7,
            max_tokens=200,
            timeout=400
        )
        # Ensure the response contains choices
        if not response.choices:
            logging.error(f"No choices returned for prompt: {prompt}. Skipping this entry.")
            continue
        model_response = response.choices[0].message.content  # Extract model's response
        # The content field can be None; scoring would then fail on
        # ``.lower()``, so such entries are skipped like empty choices.
        if model_response is None:
            logging.error(f"No message content returned for prompt: {prompt}. Skipping this entry.")
            continue
        # Evaluate scores
        semantic_score = evaluate_semantic_similarity(expected_response, model_response, semantic_model)
        semantic_scores.append(semantic_score)
        bleu_score = evaluate_bleu(expected_response, model_response)
        bleu_scores.append(bleu_score)
    # Calculate average scores (0 when nothing was scored)
    avg_semantic_score = sum(semantic_scores) / len(semantic_scores) if semantic_scores else 0
    avg_bleu_score = sum(bleu_scores) / len(bleu_scores) if bleu_scores else 0
    print(f"Average Semantic Similarity: {avg_semantic_score:.4f}")
    print(f"Average BLEU Score: {avg_bleu_score:.4f}")
    # Create comprehensive results dictionary
    evaluation_results = {
        'average_semantic_score': avg_semantic_score,
        'average_bleu_score': avg_bleu_score
    }
    # Print results to stdout for capturing in handler
    print(json.dumps(evaluation_results))
    logging.info("\nOverall Average Scores:")
    logging.info(f"Average Semantic Similarity: {avg_semantic_score:.4f}")
    logging.info(f"Average BLEU Score: {avg_bleu_score:.4f}")
    return evaluation_results
async def main():
    """Run the evaluation end to end.

    Reads the job description from the command line, builds the API client,
    loads the embedding model and the dataset in ``output_json.json``, then
    evaluates the model.

    Environment:
        RUNPOD_API_KEY: API key for the endpoint (required).
        RUNPOD_BASE_URL: Optional override of the OpenAI-compatible
            endpoint URL.

    Exits:
        Terminates the process with status 1 when ``RUNPOD_API_KEY`` is not
        set.
    """
    # Local import: the request is awaited from async code, so the async
    # client is needed. The synchronous ``OpenAI`` client returns a finished
    # response that cannot be awaited.
    from openai import AsyncOpenAI

    # Load input data
    input_data = load_input_data()
    model_name = input_data["model_name"]
    # Credentials come from the environment; API keys must never be
    # committed to source control.
    api_key = os.environ.get("RUNPOD_API_KEY")
    if not api_key:
        logging.error("RUNPOD_API_KEY environment variable is not set")
        sys.exit(1)
    client = AsyncOpenAI(
        api_key=api_key,
        base_url=os.environ.get(
            "RUNPOD_BASE_URL",
            "https://api.runpod.ai/v2/6vg8gj8ia9vd1w/openai/v1",
        ),
    )
    # Load pre-trained models
    semantic_model = load_model()
    # Load the dataset (a JSON list of prompt/response entries)
    with open('output_json.json', 'r', encoding='utf-8') as f:
        data = json.load(f)
    # Run the evaluation asynchronously
    await evaluate_model(data, model_name, client, semantic_model)


# Start the event loop only when executed as a script, so importing this
# module does not trigger an evaluation run.
if __name__ == "__main__":
    asyncio.run(main())