Upload 24 files
- .github/ci_cd_pipeline.yml +76 -0
- Dockerfile +10 -0
- app.py +16 -0
- config/config.yaml +38 -0
- data/dataset.py +46 -0
- docker-compose.yml +39 -0
- evaluations/evaluate_models.py +62 -0
- evaluations/prompt_engineering.py +46 -0
- evaluations/rag.py +29 -0
- models/full_finetune_model.py +5 -0
- models/lora_model.py +8 -0
- models/pert_model.py +8 -0
- models/student_model.py +6 -0
- requirements.txt +37 -0
- scripts/deploy_model.py +16 -0
- scripts/distill_student.py +63 -0
- scripts/evaluate.py +44 -0
- scripts/train_full_finetune.py +42 -0
- scripts/train_lora.py +43 -0
- scripts/train_pert.py +43 -0
- tests/test_datasets.py +16 -0
- tests/test_metrics.py +13 -0
- tests/test_models.py +13 -0
- utils/monitor.py +21 -0
.github/ci_cd_pipeline.yml
ADDED
@@ -0,0 +1,76 @@
name: CI/CD Pipeline

on:
  push:
    branches:
      - main
  pull_request:
    branches:
      - main

jobs:
  build:
    runs-on: ubuntu-latest

    steps:
      - name: Checkout code
        uses: actions/checkout@v2

      - name: Set up Python
        uses: actions/setup-python@v2
        with:
          python-version: '3.8'

      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install -r requirements.txt

      - name: Run unit tests
        run: |
          python -m unittest discover tests

      - name: Train Full Fine-Tuning model
        run: |
          python scripts/train_full_finetune.py
        env:
          WANDB_API_KEY: ${{ secrets.WANDB_API_KEY }}

      - name: Train PERT model
        run: |
          python scripts/train_pert.py
        env:
          WANDB_API_KEY: ${{ secrets.WANDB_API_KEY }}

      - name: Train LoRA model
        run: |
          python scripts/train_lora.py
        env:
          WANDB_API_KEY: ${{ secrets.WANDB_API_KEY }}

      - name: Distill Student model
        run: |
          python scripts/distill_student.py
        env:
          WANDB_API_KEY: ${{ secrets.WANDB_API_KEY }}

      - name: Prompt Engineering Evaluation
        run: |
          python evaluations/prompt_engineering.py
        env:
          WANDB_API_KEY: ${{ secrets.WANDB_API_KEY }}

      - name: RAG Evaluation using LangChain
        run: |
          python evaluations/rag.py

      - name: Evaluate models
        run: |
          python scripts/evaluate.py

      - name: Deploy model
        if: ${{ success() }}
        run: |
          python scripts/deploy_model.py
        env:
          HF_AUTH_TOKEN: ${{ secrets.HF_AUTH_TOKEN }}
Dockerfile
ADDED
@@ -0,0 +1,10 @@
FROM python:3.8-slim

WORKDIR /app

COPY requirements.txt requirements.txt
RUN pip install -r requirements.txt

COPY . .

CMD ["python", "app.py"]
app.py
ADDED
@@ -0,0 +1,16 @@
from flask import Flask, jsonify, request
import mlflow.pyfunc

app = Flask(__name__)

# Load the model as a PyFuncModel.
model = mlflow.pyfunc.load_model(model_uri="models:/deployed_model/1")

@app.route('/predict', methods=['POST'])
def predict():
    data = request.get_json()
    predictions = model.predict(data)
    return jsonify(predictions.tolist())

if __name__ == '__main__':
    app.run(host='0.0.0.0', port=5000)
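
A minimal client for the /predict endpoint above, assuming the service runs on localhost:5000; the payload shape is an assumption and must match the deployed MLflow model's input signature:

import requests

# Hypothetical payload; the exact input schema depends on the model's signature.
payload = ["This is a fantastic movie.", "I did not enjoy this film."]

response = requests.post("http://localhost:5000/predict", json=payload)
response.raise_for_status()
print(response.json())  # e.g. a list of per-example predictions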
config/config.yaml
ADDED
@@ -0,0 +1,38 @@
training:
  num_epochs: 3
  batch_size: 16
  learning_rate: 2e-5

dataset:
  name: imdb
  split: train[:10%]

model:
  adapter:
    reduction_factor: 16
  lora:
    r: 4
    alpha: 32
  student:
    hidden_size: 384

evaluation:
  models:
    - bert-base-uncased
    - distilbert-base-uncased
    - roberta-base
    - gpt2
    - bart-base
    - electra-small-discriminator
    - t5-small
    - xlm-roberta-base
    - albert-base-v2
    - xlnet-base-cased
    - deberta-base
    - camembert-base
    - marianmt-en-de
    - m2m100_418M

wandb:
  project: fine_tuning_comparison
  entity: your_wandb_username
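
One pitfall when this file is read with pyyaml: 2e-5 has no decimal point, so the YAML 1.1 resolver loads learning_rate as a string rather than a float; the float() cast in the training scripts below reflects this. A minimal check:

import yaml

with open('config/config.yaml', 'r') as f:
    config = yaml.safe_load(f)

lr = config['training']['learning_rate']
print(type(lr))   # <class 'str'> under pyyaml: '2e-5' lacks a '.', so YAML 1.1 reads it as a string
print(float(lr))  # 2e-05 after an explicit cast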
data/dataset.py
ADDED
@@ -0,0 +1,46 @@
from datasets import load_dataset
from transformers import BertTokenizer

def load_and_tokenize_data(config):
    """
    Load and tokenize data based on the provided configuration.

    Args:
        config (dict): Configuration dictionary containing dataset and tokenizer details.

    Returns:
        tuple: A tuple containing the tokenized train and test datasets.
    """
    # Load the dataset
    dataset = load_dataset(config['dataset']['name'], split=config['dataset']['split'])
    dataset = dataset.train_test_split(test_size=0.2)
    train_dataset = dataset['train']
    test_dataset = dataset['test']

    # Initialize the tokenizer
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    # Define the tokenization function
    def tokenize_function(examples):
        return tokenizer(examples['text'], padding='max_length', truncation=True)

    # Apply tokenization to the train and test datasets
    train_dataset = train_dataset.map(tokenize_function, batched=True)
    test_dataset = test_dataset.map(tokenize_function, batched=True)

    # Set the format to PyTorch tensors
    train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
    test_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

    return train_dataset, test_dataset

# Example usage
if __name__ == "__main__":
    config = {
        'dataset': {
            'name': 'imdb',
            'split': 'train[:10%]'
        }
    }
    train_dataset, test_dataset = load_and_tokenize_data(config)
    print("Train dataset and Test dataset have been loaded and tokenized successfully.")
docker-compose.yml
ADDED
@@ -0,0 +1,39 @@
training:
  num_epochs: 3
  batch_size: 16
  learning_rate: 2e-5

dataset:
  name: imdb
  split: train[:10%]

model:
  adapter:
    reduction_factor: 16
  lora:
    r: 4
    alpha: 32
  student:
    hidden_size: 384

evaluation:
  models:
    - bert-base-uncased
    - distilbert-base-uncased
    - roberta-base
    - gpt2
    - bart-base
    - electra-small-discriminator
    - t5-small
    - xlm-roberta-base
    - albert-base-v2
    - xlnet-base-cased
    - deberta-base
    - camembert-base
    - marianmt-en-de
    - m2m100_418M

wandb:
  project: fine_tuning_comparison
  entity: your_wandb_username
evaluations/evaluate_models.py
ADDED
@@ -0,0 +1,62 @@
import wandb
import yaml
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments
from data.dataset import load_and_tokenize_data
from utils.monitor import measure_resources

# Load the configuration
with open('config/config.yaml', 'r') as f:
    config = yaml.safe_load(f)

# Initialize wandb
wandb.init(project=config['wandb']['project'], entity=config['wandb']['entity'])

# Load the data
train_dataset, test_dataset = load_and_tokenize_data(config)

def evaluate_model(model_name):
    # Load the model and tokenizer
    model = AutoModelForSequenceClassification.from_pretrained(model_name)
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # Re-tokenize the data with this model's tokenizer
    # (local names avoid shadowing the module-level datasets)
    model_train_dataset = train_dataset.map(lambda x: tokenizer(x['text'], padding='max_length', truncation=True), batched=True)
    model_test_dataset = test_dataset.map(lambda x: tokenizer(x['text'], padding='max_length', truncation=True), batched=True)

    # Define the training arguments
    training_args = TrainingArguments(
        output_dir=f'./results/{model_name}',
        num_train_epochs=config['training']['num_epochs'],
        per_device_train_batch_size=config['training']['batch_size'],
        per_device_eval_batch_size=config['training']['batch_size'],
        learning_rate=float(config['training']['learning_rate']),  # cast: pyyaml reads '2e-5' as a string
        evaluation_strategy='epoch',
        save_steps=10_000,
        save_total_limit=2,
        logging_dir='./logs',
        logging_steps=10,
    )

    # Create the Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=model_train_dataset,
        eval_dataset=model_test_dataset,
    )

    # Measure resources and train the model
    peak_memory, training_time = measure_resources(trainer, model_name)

    # Evaluate performance
    metrics = trainer.evaluate()

    wandb.log({
        'model_name': model_name,
        'peak_memory_MB': peak_memory,
        'training_time_seconds': training_time,
        **metrics
    })

# Evaluate each model
for model_name in config['evaluation']['models']:
    evaluate_model(model_name)
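
trainer.evaluate() above reports only the loss unless a compute_metrics callback is supplied; a sketch of one that adds the accuracy and F1 score the tests in tests/test_metrics.py look for, assuming scikit-learn is available:

import numpy as np
from sklearn.metrics import accuracy_score, f1_score

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return {
        'accuracy': accuracy_score(labels, predictions),
        'f1_score': f1_score(labels, predictions, average='weighted'),
    }

# Passed to the Trainer as: Trainer(..., compute_metrics=compute_metrics)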
evaluations/prompt_engineering.py
ADDED
@@ -0,0 +1,46 @@
import wandb
import yaml
from transformers import pipeline

# Load the configuration
with open('config/config.yaml', 'r') as f:
    config = yaml.safe_load(f)

# Initialize wandb
wandb.init(project=config['wandb']['project'], entity=config['wandb']['entity'])

# Load the fine-tuned model
model_name = "results_student"  # Replace with the path to the student model
tokenizer_name = "distilbert-base-uncased"

# Pipeline configuration
nlp = pipeline("text-classification", model=model_name, tokenizer=tokenizer_name)

# Simulated examples for evaluation
examples = [
    {"reference": "This is a great movie.", "candidate": "This is a fantastic movie."},
    {"reference": "I love this film.", "candidate": "I enjoy this movie."}
]

def evaluate_prompt(example, shots=0):
    prompt = example["candidate"]
    if shots == 1:
        prompt = "Classify the sentiment of the following text: " + prompt
    elif shots > 1:
        prompt = "Classify the sentiment of the following text based on these examples:\n" + \
                 "Example: This movie is terrible. -> Negative\n" + \
                 "Example: I love this movie. -> Positive\n" + \
                 prompt

    result = nlp(prompt)[0]
    return result

# Evaluate the prompts
for example in examples:
    for shots in [0, 1, 5]:
        result = evaluate_prompt(example, shots)
        wandb.log({
            'example': example['candidate'],
            'shots': shots,
            'result': result
        })
evaluations/rag.py
ADDED
@@ -0,0 +1,29 @@
from langchain.chains import RAGChain
from langchain.llms import HuggingFace
from langchain.retrievers import BM25Retriever
from langchain.prompts import PromptTemplate
import yaml

# Load the configuration
with open('config/config.yaml', 'r') as f:
    config = yaml.safe_load(f)

# Model configuration
llm = HuggingFace("distilbert-base-uncased")

# Retriever configuration
retriever = BM25Retriever.from_documents(["This is a great movie.", "I love this film."])

# Create the prompt template
template = PromptTemplate("Classify the sentiment of the following text: {text}")

# Create the RAG chain
rag_chain = RAGChain(llm=llm, retriever=retriever, prompt_template=template)

# Example texts to classify
texts = ["This is a fantastic movie.", "I enjoy this movie."]

# Use RAG to get classifications with context
for text in texts:
    result = rag_chain.run({"text": text})
    print(f"Text: {text}, Result: {result}")
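
RAGChain and the HuggingFace LLM wrapper used above do not appear in LangChain's documented API, so the script may not run as written. A sketch of the same retrieve-then-answer flow using documented classic-LangChain calls (RetrievalQA, HuggingFacePipeline, BM25Retriever.from_texts), assuming a small seq2seq model such as google/flan-t5-small stands in for the classifier:

from transformers import pipeline
from langchain.chains import RetrievalQA
from langchain.llms import HuggingFacePipeline
from langchain.retrievers import BM25Retriever

# HuggingFacePipeline expects a generation pipeline, not a classification one,
# hence the seq2seq model here (an assumption, not the author's choice).
generator = pipeline("text2text-generation", model="google/flan-t5-small")
llm = HuggingFacePipeline(pipeline=generator)

# BM25 retriever over the same toy documents as above.
retriever = BM25Retriever.from_texts(["This is a great movie.", "I love this film."])

# "stuff" simply concatenates the retrieved documents into the prompt.
qa_chain = RetrievalQA.from_chain_type(llm=llm, retriever=retriever, chain_type="stuff")

for text in ["This is a fantastic movie.", "I enjoy this movie."]:
    result = qa_chain.run(f"Classify the sentiment of the following text: {text}")
    print(f"Text: {text}, Result: {result}")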
models/full_finetune_model.py
ADDED
@@ -0,0 +1,5 @@
from transformers import BertForSequenceClassification

def get_full_finetune_model():
    model = BertForSequenceClassification.from_pretrained('bert-base-uncased')
    return model
models/lora_model.py
ADDED
@@ -0,0 +1,8 @@
from transformers import BertForSequenceClassification, LoRAConfig

def get_lora_model(config):
    model = BertForSequenceClassification.from_pretrained('bert-base-uncased')
    lora_config = LoRAConfig(r=config['model']['lora']['r'], alpha=config['model']['lora']['alpha'])
    model.add_lora('imdb_lora', config=lora_config)
    model.train_lora('imdb_lora')
    return model
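
LoRAConfig, add_lora, and train_lora are not part of the vanilla transformers API; a sketch of the same idea with the peft library, which does provide LoRA for sequence classification, reusing the r and alpha values from config/config.yaml:

from peft import LoraConfig, TaskType, get_peft_model
from transformers import BertForSequenceClassification

def get_lora_model_peft(config):
    model = BertForSequenceClassification.from_pretrained('bert-base-uncased')
    lora_config = LoraConfig(
        task_type=TaskType.SEQ_CLS,                    # keeps the classification head trainable
        r=config['model']['lora']['r'],                # low-rank dimension (4 in config.yaml)
        lora_alpha=config['model']['lora']['alpha'],   # scaling factor (32 in config.yaml)
    )
    # Wraps the model so only the LoRA matrices (and the classifier) are trained.
    return get_peft_model(model, lora_config)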
models/pert_model.py
ADDED
@@ -0,0 +1,8 @@
from transformers import BertForSequenceClassification, AdapterConfig

def get_pert_model(config):
    model = BertForSequenceClassification.from_pretrained('bert-base-uncased')
    adapter_config = AdapterConfig(mh_adapter=True, output_adapter=True, reduction_factor=config['model']['adapter']['reduction_factor'])
    model.add_adapter('imdb_adapter', config=adapter_config)
    model.train_adapter('imdb_adapter')
    return model
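
AdapterConfig, add_adapter, and train_adapter come from the adapter-transformers line of libraries (today the adapters package), not vanilla transformers; a sketch under that assumption, where adapters.init retrofits adapter support onto a plain model and BnConfig is the bottleneck-adapter configuration:

import adapters
from adapters import BnConfig
from transformers import BertForSequenceClassification

def get_pert_model_adapters(config):
    model = BertForSequenceClassification.from_pretrained('bert-base-uncased')
    adapters.init(model)  # retrofit adapter support onto the plain transformers model
    adapter_config = BnConfig(
        mh_adapter=True,        # adapter after the multi-head attention block
        output_adapter=True,    # adapter after the feed-forward block
        reduction_factor=config['model']['adapter']['reduction_factor'],
        non_linearity='relu',
    )
    model.add_adapter('imdb_adapter', config=adapter_config)
    model.train_adapter('imdb_adapter')  # freeze everything except the adapter weights
    return model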
models/student_model.py
ADDED
@@ -0,0 +1,6 @@
from transformers import DistilBertForSequenceClassification

def get_student_model(config):
    model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased')
    model.config.hidden_size = config['model']['student']['hidden_size']
    return model
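
Overwriting model.config.hidden_size after from_pretrained does not resize the loaded weights; a genuinely smaller student has to be built from a fresh config (and starts randomly initialized). A sketch, noting that DistilBERT calls the hidden width dim and that it must stay divisible by n_heads:

from transformers import DistilBertConfig, DistilBertForSequenceClassification

def get_small_student_model(config):
    hidden = config['model']['student']['hidden_size']  # 384 in config.yaml
    student_config = DistilBertConfig(
        dim=hidden,             # transformer width; 384 / 12 heads = 32-dim heads
        hidden_dim=4 * hidden,  # feed-forward width, keeping the usual 4x ratio
        n_heads=12,
        num_labels=2,           # binary sentiment labels for IMDB (assumption)
    )
    # Built from config, so the weight matrices really are the smaller size.
    return DistilBertForSequenceClassification(student_config)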
requirements.txt
ADDED
@@ -0,0 +1,37 @@
transformers
accelerate
torch
datasets
wandb
pyyaml
langchain
flask
mlflow
streamlit==1.32.1
fastapi==0.74.*
requests==2.27.*
uvicorn[standard]==0.17.*
sentencepiece==0.2.*
python-dotenv
wikipedia
tiktoken
neo4j
Pillow
PyPDF2
pydantic
sse-starlette
boto3
# missing from the langchain base image?
langchain-openai==0.2.4
langchain-community==0.3.3
langchain-google-genai==2.0.3
langchain-ollama==0.2.0
langchain-huggingface==0.1.1
langchain-aws==0.2.4
scripts/deploy_model.py
ADDED
@@ -0,0 +1,16 @@
from huggingface_hub import HfApi
import os

def deploy_model_to_huggingface(model_path, model_name):
    api = HfApi()
    api.upload_folder(
        folder_path=model_path,
        repo_id=model_name,
        commit_message="Deploying fine-tuned model",
        token=os.getenv("HF_AUTH_TOKEN")
    )

if __name__ == "__main__":
    model_path = "./results_student"
    model_name = "your_model_name"
    deploy_model_to_huggingface(model_path, model_name)
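
upload_folder assumes the target repository already exists on the Hub; a sketch that creates it first with create_repo (exist_ok avoids failing when it is already there), using the placeholder names from the script above:

from huggingface_hub import HfApi
import os

api = HfApi(token=os.getenv("HF_AUTH_TOKEN"))
# Create the repo if needed before uploading.
api.create_repo(repo_id="your_model_name", exist_ok=True)
api.upload_folder(
    folder_path="./results_student",
    repo_id="your_model_name",
    commit_message="Deploying fine-tuned model",
)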
scripts/distill_student.py
ADDED
@@ -0,0 +1,63 @@
import torch
import wandb
import yaml
from transformers import Trainer, TrainingArguments
from data.dataset import load_and_tokenize_data
from models.full_finetune_model import get_full_finetune_model
from models.student_model import get_student_model
from utils.monitor import measure_resources

# Load the configuration
with open('config/config.yaml', 'r') as f:
    config = yaml.safe_load(f)

# Initialize wandb
wandb.init(project=config['wandb']['project'], entity=config['wandb']['entity'])

# Load the data
train_dataset, test_dataset = load_and_tokenize_data(config)

# Load the teacher and student models
teacher_model = get_full_finetune_model()
student_model = get_student_model(config)

# Define the training arguments for distillation
training_args = TrainingArguments(
    output_dir='./results_student',
    num_train_epochs=config['training']['num_epochs'],
    per_device_train_batch_size=config['training']['batch_size'],
    per_device_eval_batch_size=config['training']['batch_size'],
    learning_rate=float(config['training']['learning_rate']),  # cast: pyyaml reads '2e-5' as a string
    evaluation_strategy='epoch',
    save_steps=10_000,
    save_total_limit=2,
    logging_dir='./logs',
    logging_steps=10,
)

# Define the distillation trainer
class DistillationTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        # Forward pass of teacher model
        with torch.no_grad():
            teacher_outputs = teacher_model(**inputs)

        # Forward pass of student model
        student_outputs = model(**inputs)

        # Compute distillation loss (KL divergence between the two output distributions)
        loss = torch.nn.functional.kl_div(
            torch.nn.functional.log_softmax(student_outputs.logits, dim=-1),
            torch.nn.functional.softmax(teacher_outputs.logits, dim=-1),
            reduction='batchmean'
        )
        return (loss, student_outputs) if return_outputs else loss

# Create the Trainer for distillation
trainer = DistillationTrainer(
    model=student_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

# Measure resources and train the student model
measure_resources(trainer, "Distillation")
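
A standalone sketch of the distillation loss above on dummy logits, making the argument order concrete (kl_div expects log-probabilities first, then target probabilities); the temperature is an addition not present in the script, commonly used to soften both distributions:

import torch
import torch.nn.functional as F

# Dummy logits: a batch of 2 examples, 2 classes.
student_logits = torch.tensor([[1.0, 0.5], [0.2, 1.3]])
teacher_logits = torch.tensor([[2.0, 0.1], [0.0, 2.5]])

T = 2.0  # softmax temperature (assumption, not in the trainer above)

# kl_div takes log-probabilities as input and probabilities as target;
# the T*T factor keeps gradient magnitudes comparable across temperatures.
loss = F.kl_div(
    F.log_softmax(student_logits / T, dim=-1),
    F.softmax(teacher_logits / T, dim=-1),
    reduction='batchmean'
) * (T * T)
print(loss.item())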
scripts/evaluate.py
ADDED
@@ -0,0 +1,44 @@
import wandb
from datasets import load_metric
from transformers import pipeline
import yaml

# Load the configuration
with open('config/config.yaml', 'r') as f:
    config = yaml.safe_load(f)

# Load the fine-tuned model
model_name = "results_student"  # Replace with the path to the student model
tokenizer_name = "distilbert-base-uncased"

# Evaluation configuration
bleu = load_metric("bleu")
rouge = load_metric("rouge")

# Initialize wandb
wandb.init(project=config['wandb']['project'], entity=config['wandb']['entity'])

def evaluate_model(model_name, tokenizer_name):
    nlp = pipeline("text-classification", model=model_name, tokenizer=tokenizer_name)

    # Simulated examples for evaluation
    examples = [
        {"reference": "This is a great movie.", "candidate": "This is a fantastic movie."},
        {"reference": "I love this film.", "candidate": "I enjoy this movie."}
    ]

    references = [e["reference"] for e in examples]
    candidates = [nlp(e["candidate"])[0]["label"] for e in examples]

    # Compute BLEU and ROUGE scores
    bleu_score = bleu.compute(predictions=candidates, references=references)
    rouge_score = rouge.compute(predictions=candidates, references=references)

    # Log the scores to wandb
    wandb.log({
        "bleu_score": bleu_score,
        "rouge_score": rouge_score
    })

# Evaluate the models
evaluate_model(model_name, tokenizer_name)
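
datasets.load_metric is deprecated and removed in recent releases of the datasets library; the replacement is the evaluate package. A sketch under that assumption, also illustrating that BLEU/ROUGE compare generated text against reference text (the strings here are placeholders), not classification labels:

import evaluate

# evaluate.load replaces datasets.load_metric.
bleu = evaluate.load("bleu")
rouge = evaluate.load("rouge")

predictions = ["This is a fantastic movie.", "I enjoy this movie."]
references = [["This is a great movie."], ["I love this film."]]  # BLEU allows multiple references per prediction

print(bleu.compute(predictions=predictions, references=references))
print(rouge.compute(predictions=predictions, references=[r[0] for r in references]))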
scripts/train_full_finetune.py
ADDED
@@ -0,0 +1,42 @@
import wandb
import yaml
from transformers import Trainer, TrainingArguments
from utils.monitor import measure_resources
from data.dataset import load_and_tokenize_data
from models.full_finetune_model import get_full_finetune_model

# Load the configuration
with open('config/config.yaml', 'r') as f:
    config = yaml.safe_load(f)

# Initialize wandb
wandb.init(project=config['wandb']['project'], entity=config['wandb']['entity'])

# Load the data
train_dataset, test_dataset = load_and_tokenize_data(config)

# Load the model
model = get_full_finetune_model()

# Define the training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=config['training']['num_epochs'],
    per_device_train_batch_size=config['training']['batch_size'],
    per_device_eval_batch_size=config['training']['batch_size'],
    learning_rate=float(config['training']['learning_rate']),  # cast: pyyaml reads '2e-5' as a string
    evaluation_strategy='epoch',
    save_steps=10_000,
    save_total_limit=2,
    logging_dir='./logs',
    logging_steps=10,
)

# Create the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

# Measure resources and train the model
measure_resources(trainer, "Full Fine-Tuning")
scripts/train_lora.py
ADDED
@@ -0,0 +1,43 @@
import wandb
import yaml
from transformers import Trainer, TrainingArguments
from utils.monitor import measure_resources
from data.dataset import load_and_tokenize_data
from models.lora_model import get_lora_model

# Load the configuration
with open('config/config.yaml', 'r') as f:
    config = yaml.safe_load(f)

# Initialize wandb
wandb.init(project=config['wandb']['project'], entity=config['wandb']['entity'])

# Load the data
train_dataset, test_dataset = load_and_tokenize_data(config)

# Load the model
model = get_lora_model(config)

# Define the training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=config['training']['num_epochs'],
    per_device_train_batch_size=config['training']['batch_size'],
    per_device_eval_batch_size=config['training']['batch_size'],
    learning_rate=float(config['training']['learning_rate']),  # cast: pyyaml reads '2e-5' as a string
    evaluation_strategy='epoch',
    save_steps=10_000,
    save_total_limit=2,
    logging_dir='./logs',
    logging_steps=10,
)

# Create the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

# Measure resources and train the model
measure_resources(trainer, "LoRA")
scripts/train_pert.py
ADDED
@@ -0,0 +1,43 @@
import wandb
import yaml
from transformers import Trainer, TrainingArguments
from utils.monitor import measure_resources
from data.dataset import load_and_tokenize_data
from models.pert_model import get_pert_model

# Load the configuration
with open('config/config.yaml', 'r') as f:
    config = yaml.safe_load(f)

# Initialize wandb
wandb.init(project=config['wandb']['project'], entity=config['wandb']['entity'])

# Load the data
train_dataset, test_dataset = load_and_tokenize_data(config)

# Load the model
model = get_pert_model(config)

# Define the training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=config['training']['num_epochs'],
    per_device_train_batch_size=config['training']['batch_size'],
    per_device_eval_batch_size=config['training']['batch_size'],
    learning_rate=float(config['training']['learning_rate']),  # cast: pyyaml reads '2e-5' as a string
    evaluation_strategy='epoch',
    save_steps=10_000,
    save_total_limit=2,
    logging_dir='./logs',
    logging_steps=10,
)

# Create the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

# Measure resources and train the model
measure_resources(trainer, "PERT (Adapters)")
tests/test_datasets.py
ADDED
@@ -0,0 +1,16 @@
import unittest
from data.datasets import load_dataset, preprocess_data

class TestDatasets(unittest.TestCase):

    def test_load_dataset(self):
        df = load_dataset('data/test.csv')
        self.assertIsNotNone(df)

    def test_preprocess_data(self):
        df = load_dataset('data/test.csv')
        preprocessed_df = preprocess_data(df)
        self.assertFalse(preprocessed_df.isnull().values.any())

if __name__ == '__main__':
    unittest.main()
tests/test_metrics.py
ADDED
@@ -0,0 +1,13 @@
import unittest
from evaluations.evaluate_models import evaluate_model
from models.full_finetune_model import train_full_finetune_model
from data.datasets import load_dataset, preprocess_data

class TestMetrics(unittest.TestCase):

    def test_evaluate_model(self):
        train_data = preprocess_data(load_dataset('data/train.csv'))
        val_data = preprocess_data(load_dataset('data/val.csv'))
        model = train_full_finetune_model(train_data, val_data)
        results = evaluate_model(model, val_data)
        self.assertIn('accuracy', results)
        self.assertIn('f1_score', results)

if __name__ == '__main__':
    unittest.main()
tests/test_models.py
ADDED
@@ -0,0 +1,13 @@
import unittest
from data.datasets import load_dataset, preprocess_data
from models.full_finetune_model import train_full_finetune_model

class TestModels(unittest.TestCase):

    def test_train_full_finetune_model(self):
        train_data = preprocess_data(load_dataset('data/train.csv'))
        val_data = preprocess_data(load_dataset('data/val.csv'))
        model = train_full_finetune_model(train_data, val_data)
        self.assertIsNotNone(model)

if __name__ == '__main__':
    unittest.main()
utils/monitor.py
ADDED
@@ -0,0 +1,21 @@
import time
import torch
import wandb

def measure_resources(trainer, method_name):
    start_time = time.time()
    cuda_available = torch.cuda.is_available()
    if cuda_available:
        torch.cuda.reset_peak_memory_stats()

    # Run training and record the metrics
    trainer.train()

    end_time = time.time()
    # Peak GPU memory in MB (0 on CPU-only machines, where the CUDA counters do not exist)
    peak_memory = torch.cuda.max_memory_allocated() / (1024 * 1024) if cuda_available else 0.0
    training_time = end_time - start_time

    # Log the metrics to wandb
    wandb.log({
        'method': method_name,
        'peak_memory_MB': peak_memory,
        'training_time_seconds': training_time
    })
    return peak_memory, training_time