Diksha2001 committed
Commit 12b2deb · verified · 1 Parent(s): ec45c15

Delete llm_evaluation.py

Files changed (1)
  1. llm_evaluation.py +0 -149
llm_evaluation.py DELETED
@@ -1,149 +0,0 @@
- import json
- import asyncio
- import logging
- import os
- import sys
-
- import nltk
- from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
- from openai import AsyncOpenAI
- from sentence_transformers import SentenceTransformer, util
-
- # Configure logging
- logging.basicConfig(level=logging.INFO)
-
- # Download necessary NLTK resources
- nltk.download('punkt')
-
-
- def load_input_data():
-     """Load input data from the first command-line argument (a JSON string)."""
-     try:
-         return json.loads(sys.argv[1])
-     except json.JSONDecodeError as e:
-         logging.error(f"Failed to decode JSON input: {e}")
-         sys.exit(1)
-
-
- def initialize_openai_client(api_key, base_url):
-     """Initialize the asynchronous OpenAI client."""
-     return AsyncOpenAI(api_key=api_key, base_url=base_url)
-
-
- def load_model():
-     """Load the pre-trained Sentence-BERT model used for semantic evaluation."""
-     return SentenceTransformer('all-MiniLM-L6-v2')
-
-
- def evaluate_semantic_similarity(expected_response, model_response, semantic_model):
-     """Evaluate semantic similarity via Sentence-BERT embeddings and cosine similarity."""
-     expected_embedding = semantic_model.encode(expected_response, convert_to_tensor=True)
-     model_embedding = semantic_model.encode(model_response, convert_to_tensor=True)
-     similarity_score = util.pytorch_cos_sim(expected_embedding, model_embedding)
-     return similarity_score.item()
-
-
- def evaluate_bleu(expected_response, model_response):
-     """Evaluate the BLEU score using NLTK's sentence_bleu with smoothing."""
-     expected_tokens = nltk.word_tokenize(expected_response.lower())
-     model_tokens = nltk.word_tokenize(model_response.lower())
-     smoothing_function = SmoothingFunction().method1
-     return sentence_bleu([expected_tokens], model_tokens, smoothing_function=smoothing_function)
-
-
- async def create_with_retries(client, **kwargs):
-     """Retry mechanism for handling transient server errors asynchronously."""
-     max_retries = 3  # Retry up to 3 times
-     retry_delay = 5  # Delay between retries, in seconds
-     kwargs.setdefault("timeout", 60)  # Default timeout unless the caller supplies one
-
-     for attempt in range(max_retries):
-         try:
-             # Attempt the API request; return the response if successful
-             return await client.chat.completions.create(**kwargs)
-         except Exception as e:
-             if attempt < max_retries - 1:
-                 logging.error(f"Attempt {attempt + 1}/{max_retries} failed: {e}. Retrying...")
-                 await asyncio.sleep(retry_delay)  # Wait before retrying
-             else:
-                 logging.error(f"API request failed after {max_retries} attempts: {e}")
-                 logging.debug(f"Request data: {kwargs}")  # Additional debugging information
-                 raise Exception("API request failed after retries") from e
-
-
- async def evaluate_model(data, model_name, client, semantic_model):
-     """Evaluate the model on each prompt/response pair and report average scores."""
-     semantic_scores = []
-     bleu_scores = []
-
-     for entry in data:
-         prompt = entry['prompt']
-         expected_response = entry['response']
-
-         # Create a chat completion using the OpenAI-compatible API
-         response = await create_with_retries(
-             client,
-             model=f"PharynxAI/{model_name}",
-             messages=[
-                 {"role": "system", "content": "You are a helpful assistant."},
-                 {"role": "user", "content": prompt}
-             ],
-             temperature=0.7,
-             max_tokens=200,
-             timeout=400
-         )
-
-         # Skip entries for which the API returned no choices
-         if not response.choices:
-             logging.error(f"No choices returned for prompt: {prompt}. Skipping this entry.")
-             continue
-
-         model_response = response.choices[0].message.content  # Extract the model's reply
-
-         # Score the reply against the expected response
-         semantic_scores.append(evaluate_semantic_similarity(expected_response, model_response, semantic_model))
-         bleu_scores.append(evaluate_bleu(expected_response, model_response))
-
-     # Calculate average scores (0 if every entry was skipped)
-     avg_semantic_score = sum(semantic_scores) / len(semantic_scores) if semantic_scores else 0
-     avg_bleu_score = sum(bleu_scores) / len(bleu_scores) if bleu_scores else 0
-
-     # Create comprehensive results dictionary
-     evaluation_results = {
-         'average_semantic_score': avg_semantic_score,
-         'average_bleu_score': avg_bleu_score
-     }
-
-     # Print results to stdout as JSON so the handler can capture them
-     print(json.dumps(evaluation_results))
-
-     logging.info("\nOverall Average Scores:")
-     logging.info(f"Average Semantic Similarity: {avg_semantic_score:.4f}")
-     logging.info(f"Average BLEU Score: {avg_bleu_score:.4f}")
-
-     return evaluation_results
-
-
- async def main():
-     # Load input data
-     input_data = load_input_data()
-     model_name = input_data["model_name"]
-
-     # Initialize the client with a RunPod API key and endpoint URL; the key is
-     # read from the environment rather than hardcoded in the source
-     client = initialize_openai_client(
-         api_key=os.environ["RUNPOD_API_KEY"],
-         base_url="https://api.runpod.ai/v2/6vg8gj8ia9vd1w/openai/v1",
-     )
-
-     # Load pre-trained models
-     semantic_model = load_model()
-
-     # Load your dataset (replace with your actual JSON file)
-     with open('output_json.json', 'r') as f:
-         data = json.load(f)
-
-     # Run the evaluation asynchronously
-     await evaluate_model(data, model_name, client, semantic_model)
-
-
- # Start the event loop
- if __name__ == "__main__":
-     asyncio.run(main())
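
For reference, the deleted script took its configuration as a JSON string in argv[1] (only "model_name" is read) and loaded its dataset from output_json.json as a list of {"prompt", "response"} pairs. Below is a minimal driver sketch under those assumptions; the model name and dataset entries are hypothetical, and the RUNPOD_API_KEY environment variable is assumed to be set before the script runs.

import json
import subprocess

# Hypothetical dataset matching the shape evaluate_model() expects:
# a list of {"prompt": ..., "response": ...} pairs.
dataset = [
    {"prompt": "What is BLEU?",
     "response": "BLEU is an n-gram overlap metric for generated text."},
    {"prompt": "Define cosine similarity.",
     "response": "The cosine of the angle between two embedding vectors."},
]
with open("output_json.json", "w") as f:
    json.dump(dataset, f)

# The script reads a JSON config from argv[1]; only "model_name" is used.
config = {"model_name": "my-finetuned-model"}  # hypothetical model name
result = subprocess.run(
    ["python", "llm_evaluation.py", json.dumps(config)],
    capture_output=True,
    text=True,  # RUNPOD_API_KEY must be set in the inherited environment
)
print(result.stdout)  # the last line is the JSON results dict printed by evaluate_model()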