dh-mc committed
Commit 5b4cc8a · 1 Parent(s): 251efe4
Files changed (3)
  1. .env.example +2 -81
  2. app.py +29 -7
  3. eval_modules/utils.py +13 -2
.env.example CHANGED
@@ -1,81 +1,2 @@
- LLM_MODEL_TYPE=huggingface
- # LLM_MODEL_TYPE=openai
- # LLM_MODEL_TYPE=hftgi
- # LLM_MODEL_TYPE=ollama
- # LLM_MODEL_TYPE=google
- # LLM_MODEL_TYPE=vllm
-
- HUGGINGFACE_AUTH_TOKEN=
-
- HFTGI_SERVER_URL=
-
- OPENAI_API_KEY=
-
- GOOGLE_API_KEY=
-
- # if unset, default to "gpt-3.5-turbo"
- OPENAI_MODEL_NAME=
-
- # GEMINI_MODEL_NAME=gemini-1.5-pro-latest
-
- # OLLAMA_MODEL_NAME=orca2:7b
- # OLLAMA_MODEL_NAME=mistral:7b
- # OLLAMA_MODEL_NAME=gemma:7b
- # OLLAMA_MODEL_NAME=llama2:7b
- OLLAMA_MODEL_NAME=llama3:8b
-
- OLLAMA_RP=1.15
- HF_RP=1.15
-
- LANGCHAIN_DEBUG=false
- BATCH_SIZE=1
- APPLY_CHAT_TEMPLATE_FOR_RAG=true
-
- # cpu, mps or cuda:0 - if unset, use whatever detected
- HF_EMBEDDINGS_DEVICE_TYPE=
- HF_PIPELINE_DEVICE_TYPE=
-
- # uncomment one of the below to load corresponding quantized model
- # LOAD_QUANTIZED_MODEL=4bit
- # LOAD_QUANTIZED_MODEL=8bit
-
- QA_WITH_RAG=true
- # QA_WITH_RAG=false
-
- RETRIEVER_TYPE=questions_file
- # RETRIEVER_TYPE=vectorstore
-
- QUESTIONS_FILE_PATH="./data/datasets/ms_macro.json"
-
- DISABLE_MODEL_PRELOADING=true
- CHAT_HISTORY_ENABLED=false
- SHOW_PARAM_SETTINGS=false
- SHARE_GRADIO_APP=false
-
- # if unset, default to "hkunlp/instructor-xl"
- HF_EMBEDDINGS_MODEL_NAME="hkunlp/instructor-large"
-
- # number of cpu cores - used to set n_threads for GPT4ALL & LlamaCpp models
- NUMBER_OF_CPU_CORES=
-
- USING_TORCH_BFLOAT16=true
-
- # HUGGINGFACE_MODEL_NAME_OR_PATH="databricks/dolly-v2-3b"
- # HUGGINGFACE_MODEL_NAME_OR_PATH="databricks/dolly-v2-7b"
- # HUGGINGFACE_MODEL_NAME_OR_PATH="databricks/dolly-v2-12b"
- # HUGGINGFACE_MODEL_NAME_OR_PATH="TheBloke/wizardLM-7B-HF"
- # HUGGINGFACE_MODEL_NAME_OR_PATH="TheBloke/vicuna-7B-1.1-HF"
- # HUGGINGFACE_MODEL_NAME_OR_PATH="nomic-ai/gpt4all-j"
- # HUGGINGFACE_MODEL_NAME_OR_PATH="nomic-ai/gpt4all-falcon"
- # HUGGINGFACE_MODEL_NAME_OR_PATH="lmsys/fastchat-t5-3b-v1.0"
- # HUGGINGFACE_MODEL_NAME_OR_PATH="meta-llama/Llama-2-7b-chat-hf"
- # HUGGINGFACE_MODEL_NAME_OR_PATH="meta-llama/Llama-2-13b-chat-hf"
- # HUGGINGFACE_MODEL_NAME_OR_PATH="meta-llama/Llama-2-70b-chat-hf"
- # HUGGINGFACE_MODEL_NAME_OR_PATH="meta-llama/Meta-Llama-3-8B-Instruct"
- # HUGGINGFACE_MODEL_NAME_OR_PATH="meta-llama/Meta-Llama-3-70B-Instruct"
- # HUGGINGFACE_MODEL_NAME_OR_PATH="microsoft/Orca-2-7b"
- # HUGGINGFACE_MODEL_NAME_OR_PATH="microsoft/Orca-2-13b"
- HUGGINGFACE_MODEL_NAME_OR_PATH="google/gemma-1.1-2b-it"
- # HUGGINGFACE_MODEL_NAME_OR_PATH="google/gemma-1.1-7b-it"
- # HUGGINGFACE_MODEL_NAME_OR_PATH="microsoft/Phi-3-mini-128k-instruct"
- # HUGGINGFACE_MODEL_NAME_OR_PATH="mistralai/Mistral-7B-Instruct-v0.2"
+ HF_TOKEN=
+ MODEL_NAME=microsoft/Phi-3.5-mini-instruct
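
With this commit the example env file shrinks from 81 lines to two variables. A minimal sketch of how a script can consume them, assuming python-dotenv is installed; the fallback to .env.example and the default model name mirror what app.py does below:

    import os

    from dotenv import find_dotenv, load_dotenv

    # prefer a real .env, fall back to the checked-in example file
    env_file = find_dotenv(".env") or find_dotenv(".env.example")
    load_dotenv(env_file, override=False)

    hf_token = os.getenv("HF_TOKEN")  # Hugging Face access token (may be empty for public models)
    model_name = os.getenv("MODEL_NAME") or "microsoft/Phi-3.5-mini-instruct"  # default mirrors app.py
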
app.py CHANGED
@@ -1,10 +1,30 @@
  import json
  import os
+ import sys
+ import evaluate
  import gradio as gr
- from huggingface_hub import InferenceClient
- from eval_modules.utils import calc_bleu_rouge_scores
+ from dotenv import find_dotenv, load_dotenv
+ from huggingface_hub import InferenceClient, login
+
+ found_dotenv = find_dotenv(".env")
+
+ if len(found_dotenv) == 0:
+     found_dotenv = find_dotenv(".env.example")
+ print(f"loading env vars from: {found_dotenv}")
+ load_dotenv(found_dotenv, override=False)
+
+ path = os.path.dirname(found_dotenv)
+ print(f"Adding {path} to sys.path")
+ sys.path.append(path)
+
+ from eval_modules.utils import calc_perf_scores
  from eval_modules.calc_repetitions_v2e import detect_repetitions
 
+ model_name = os.getenv("MODEL_NAME") or "microsoft/Phi-3.5-mini-instruct"
+ hf_token = os.getenv("HF_TOKEN")
+
+ login(token=hf_token, add_to_git_credential=True)
+
  questions_file_path = os.getenv("QUESTIONS_FILE_PATH") or "./ms_macro.json"
 
  questions = json.loads(open(questions_file_path).read())
@@ -18,7 +38,8 @@ For more information on `huggingface_hub` Inference API support, please check th
  """
  # client = InferenceClient("HuggingFaceH4/zephyr-7b-beta")
  # client = InferenceClient("HuggingFaceH4/zephyr-7b-gemma-v0.1")
- client = InferenceClient("microsoft/Phi-3.5-mini-instruct")
+ # client = InferenceClient("microsoft/Phi-3.5-mini-instruct")
+ client = InferenceClient(model_name, token=hf_token)
 
 
  def chat(
@@ -74,11 +95,11 @@ def chat(
      answer = partial_text
      (whitespace_score, repetition_score, total_repetitions) = detect_repetitions(answer)
      partial_text += "\n\nRepetition Metrics:\n"
-     partial_text += f"1. Whitespace Score: {whitespace_score:.3f}\n"
-     partial_text += f"1. Repetition Score: {repetition_score:.3f}\n"
+     partial_text += f"1. EWC Repetition Score: {whitespace_score:.3f}\n"
+     partial_text += f"1. Text Repetition Score: {repetition_score:.3f}\n"
      partial_text += f"1. Total Repetitions: {total_repetitions:.3f}\n"
      partial_text += (
-         f"1. Non-Repetitive Ratio: {1 - total_repetitions / len(answer):.3f}\n"
+         f"1. Repetition Ratio: {total_repetitions / len(answer):.3f}\n"
      )
 
      if index >= 0: # RAG
@@ -87,11 +108,12 @@ def chat(
              if "wellFormedAnswers" in questions[index]
              else "answers"
          )
-         scores = calc_bleu_rouge_scores([answer], [questions[index][key]], debug=True)
+         scores = calc_perf_scores([answer], [questions[index][key]], debug=True)
 
          partial_text += "\n\n Performance Metrics:\n"
          partial_text += f'1. BLEU-1: {scores["bleu_scores"]["bleu"]:.3f}\n'
          partial_text += f'1. RougeL: {scores["rouge_scores"]["rougeL"]:.3f}\n'
+         partial_text += f'1. BERT-F1: {scores["bert_scores"]["f1"][0]:.3f}\n'
 
          partial_text += f"\n\nGround truth: {questions[index][key][0]}\n"
 
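As context for the InferenceClient change above, here is a hedged sketch of the serverless call the reworked chat loop builds on; the prompt and max_tokens value are made up, but chat_completion with stream=True is the huggingface_hub API that yields the incremental deltas app.py accumulates into partial_text:

    import os

    from huggingface_hub import InferenceClient

    client = InferenceClient(
        os.getenv("MODEL_NAME") or "microsoft/Phi-3.5-mini-instruct",
        token=os.getenv("HF_TOKEN"),
    )

    messages = [{"role": "user", "content": "What is MS MARCO?"}]  # hypothetical test prompt
    partial_text = ""
    for chunk in client.chat_completion(messages, max_tokens=256, stream=True):
        # each streamed chunk carries an incremental delta; content can be None on the final chunk
        partial_text += chunk.choices[0].delta.content or ""
    print(partial_text)
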
eval_modules/utils.py CHANGED
@@ -173,9 +173,10 @@ def ensure_model_is_downloaded(llm_model_type):
 
  bleu = evaluate.load("bleu")
  rouge = evaluate.load("rouge")
+ bert_score = evaluate.load("bertscore")
 
 
- def calc_bleu_rouge_scores(predictions, references, debug=False):
+ def calc_perf_scores(predictions, references, debug=False):
      if debug:
          print("predictions:", predictions)
          print("references:", references)
@@ -184,7 +185,17 @@ def calc_bleu_rouge_scores(predictions, references, debug=False):
          predictions=predictions, references=references, max_order=1
      )
      rouge_scores = rouge.compute(predictions=predictions, references=references)
-     result = {"bleu_scores": bleu_scores, "rouge_scores": rouge_scores}
+     bert_scores = bert_score.compute(
+         predictions=predictions,
+         references=references,
+         lang="en",
+         model_type="microsoft/deberta-large-mnli",
+     )
+     result = {
+         "bleu_scores": bleu_scores,
+         "rouge_scores": rouge_scores,
+         "bert_scores": bert_scores,
+     }
 
      if debug:
          print("result:", result)