jaothan committed on
Commit fa64206 · verified · 1 Parent(s): 666ef7e

Upload 24 files

.github/ci_cd_pipeline.yml ADDED
@@ -0,0 +1,76 @@
+ name: CI/CD Pipeline
+
+ on:
+   push:
+     branches:
+       - main
+   pull_request:
+     branches:
+       - main
+
+ jobs:
+   build:
+     runs-on: ubuntu-latest
+
+     steps:
+       - name: Checkout code
+         uses: actions/checkout@v2
+
+       - name: Set up Python
+         uses: actions/setup-python@v2
+         with:
+           python-version: '3.8'
+
+       - name: Install dependencies
+         run: |
+           python -m pip install --upgrade pip
+           pip install -r requirements.txt
+
+       - name: Run unit tests
+         run: |
+           python -m unittest discover tests
+
+       - name: Train Full Fine-Tuning model
+         run: |
+           python scripts/train_full_finetune.py
+         env:
+           WANDB_API_KEY: ${{ secrets.WANDB_API_KEY }}
+
+       - name: Train PERT model
+         run: |
+           python scripts/train_pert.py
+         env:
+           WANDB_API_KEY: ${{ secrets.WANDB_API_KEY }}
+
+       - name: Train LoRA model
+         run: |
+           python scripts/train_lora.py
+         env:
+           WANDB_API_KEY: ${{ secrets.WANDB_API_KEY }}
+
+       - name: Distill Student model
+         run: |
+           python scripts/distill_student.py
+         env:
+           WANDB_API_KEY: ${{ secrets.WANDB_API_KEY }}
+
+       - name: Prompt Engineering Evaluation
+         run: |
+           python evaluations/prompt_engineering.py
+         env:
+           WANDB_API_KEY: ${{ secrets.WANDB_API_KEY }}
+
+       - name: RAG Evaluation using LangChain
+         run: |
+           python evaluations/rag.py
+
+       - name: Evaluate models
+         run: |
+           python scripts/evaluate.py
+
+       - name: Deploy model
+         if: ${{ success() }}
+         run: |
+           python scripts/deploy_model.py
+         env:
+           HF_AUTH_TOKEN: ${{ secrets.HF_AUTH_TOKEN }}
Dockerfile ADDED
@@ -0,0 +1,10 @@
+ FROM python:3.8-slim
+
+ WORKDIR /app
+
+ COPY requirements.txt requirements.txt
+ RUN pip install -r requirements.txt
+
+ COPY . .
+
+ CMD ["python", "app.py"]
app.py ADDED
@@ -0,0 +1,16 @@
+ from flask import Flask, jsonify, request
+ import mlflow.pyfunc
+
+ app = Flask(__name__)
+
+ # Load the model as a PyFuncModel.
+ model = mlflow.pyfunc.load_model(model_uri="models:/deployed_model/1")
+
+ @app.route('/predict', methods=['POST'])
+ def predict():
+     data = request.get_json()
+     predictions = model.predict(data)
+     return jsonify(predictions.tolist())
+
+ if __name__ == '__main__':
+     app.run(host='0.0.0.0', port=5000)
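Once the service is running, the endpoint can be exercised with a request such as the following (a sketch; the exact payload shape depends on the deployed pyfunc model's input schema, so the `text` column used here is an assumption):

import requests

# Hypothetical payload: a column -> values mapping the pyfunc model can
# interpret; adjust to the deployed model's actual signature.
payload = {"text": ["This is a fantastic movie."]}
response = requests.post("http://localhost:5000/predict", json=payload)
print(response.json())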
config/config.yaml ADDED
@@ -0,0 +1,38 @@
+ training:
+   num_epochs: 3
+   batch_size: 16
+   learning_rate: 2e-5
+
+ dataset:
+   name: imdb
+   split: train[:10%]
+
+ model:
+   adapter:
+     reduction_factor: 16
+   lora:
+     r: 4
+     alpha: 32
+   student:
+     hidden_size: 384
+
+ evaluation:
+   models:
+     - bert-base-uncased
+     - distilbert-base-uncased
+     - roberta-base
+     - gpt2
+     - facebook/bart-base
+     - google/electra-small-discriminator
+     - t5-small
+     - xlm-roberta-base
+     - albert-base-v2
+     - xlnet-base-cased
+     - microsoft/deberta-base
+     - camembert-base
+     - Helsinki-NLP/opus-mt-en-de
+     - facebook/m2m100_418M
+
+ wandb:
+   project: fine_tuning_comparison
+   entity: your_wandb_username
data/dataset.py ADDED
@@ -0,0 +1,46 @@
+ from datasets import load_dataset
+ from transformers import BertTokenizer
+
+ def load_and_tokenize_data(config):
+     """
+     Load and tokenize data based on the provided configuration.
+
+     Args:
+         config (dict): Configuration dictionary containing dataset and tokenizer details.
+
+     Returns:
+         tuple: A tuple containing the tokenized train and test datasets.
+     """
+     # Load the dataset
+     dataset = load_dataset(config['dataset']['name'], split=config['dataset']['split'])
+     dataset = dataset.train_test_split(test_size=0.2)
+     train_dataset = dataset['train']
+     test_dataset = dataset['test']
+
+     # Initialize the tokenizer
+     tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
+
+     # Define the tokenization function
+     def tokenize_function(examples):
+         return tokenizer(examples['text'], padding='max_length', truncation=True)
+
+     # Apply tokenization to the train and test datasets
+     train_dataset = train_dataset.map(tokenize_function, batched=True)
+     test_dataset = test_dataset.map(tokenize_function, batched=True)
+
+     # Set the format to PyTorch tensors
+     train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
+     test_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
+
+     return train_dataset, test_dataset
+
+ # Example usage
+ if __name__ == "__main__":
+     config = {
+         'dataset': {
+             'name': 'imdb',
+             'split': 'train[:10%]'
+         }
+     }
+     train_dataset, test_dataset = load_and_tokenize_data(config)
+     print("Train dataset and Test dataset have been loaded and tokenized successfully.")
docker-compose.yml ADDED
@@ -0,0 +1,9 @@
+ # NOTE: the uploaded file duplicated config/config.yaml verbatim; a minimal
+ # service definition for the Flask app built by the Dockerfile (assumed intent):
+ version: "3.8"
+
+ services:
+   app:
+     build: .
+     ports:
+       - "5000:5000"
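With a compose file along those lines, `docker compose up --build` builds the image from the Dockerfile above and exposes the Flask prediction service on port 5000.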
evaluations/evaluate_models.py ADDED
@@ -0,0 +1,66 @@
+ import wandb
+ import yaml
+ from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments
+ from data.dataset import load_and_tokenize_data
+ from utils.monitor import measure_resources
+
+ # Load the configuration
+ with open('config/config.yaml', 'r') as f:
+     config = yaml.safe_load(f)
+
+ # Initialize wandb
+ wandb.init(project=config['wandb']['project'], entity=config['wandb']['entity'])
+
+ # Load the data
+ train_dataset, test_dataset = load_and_tokenize_data(config)
+
+ def evaluate_model(model_name):
+     # Load the model and tokenizer (note: seq2seq entries in the config,
+     # e.g. t5-small, will not load cleanly as sequence-classification models)
+     model = AutoModelForSequenceClassification.from_pretrained(model_name)
+     tokenizer = AutoTokenizer.from_pretrained(model_name)
+
+     # Re-tokenize the text with this model's own tokenizer, using local
+     # names so the module-level datasets stay intact
+     train_ds = train_dataset.map(lambda x: tokenizer(x['text'], padding='max_length', truncation=True), batched=True)
+     test_ds = test_dataset.map(lambda x: tokenizer(x['text'], padding='max_length', truncation=True), batched=True)
+
+     # Define the training arguments
+     training_args = TrainingArguments(
+         output_dir=f'./results/{model_name}',
+         num_train_epochs=config['training']['num_epochs'],
+         per_device_train_batch_size=config['training']['batch_size'],
+         per_device_eval_batch_size=config['training']['batch_size'],
+         evaluation_strategy='epoch',
+         save_steps=10_000,
+         save_total_limit=2,
+         logging_dir='./logs',
+         logging_steps=10,
+     )
+
+     # Create the Trainer
+     trainer = Trainer(
+         model=model,
+         args=training_args,
+         train_dataset=train_ds,
+         eval_dataset=test_ds,
+     )
+
+     # Measure resources while training the model
+     peak_memory, training_time = measure_resources(trainer, model_name)
+
+     # Evaluate performance
+     metrics = trainer.evaluate()
+
+     wandb.log({
+         'model_name': model_name,
+         'peak_memory_MB': peak_memory,
+         'training_time_seconds': training_time,
+         **metrics
+     })
+
+     return metrics
+
+ # Evaluate each model
+ for model_name in config['evaluation']['models']:
+     evaluate_model(model_name)
evaluations/prompt_engineering.py ADDED
@@ -0,0 +1,46 @@
+ import wandb
+ import yaml
+ from transformers import pipeline
+
+ # Load the configuration
+ with open('config/config.yaml', 'r') as f:
+     config = yaml.safe_load(f)
+
+ # Initialize wandb
+ wandb.init(project=config['wandb']['project'], entity=config['wandb']['entity'])
+
+ # Load the fine-tuned model
+ model_name = "results_student"  # Replace with the path to the student model
+ tokenizer_name = "distilbert-base-uncased"
+
+ # Set up the pipeline
+ nlp = pipeline("text-classification", model=model_name, tokenizer=tokenizer_name)
+
+ # Simulated examples for evaluation
+ examples = [
+     {"reference": "This is a great movie.", "candidate": "This is a fantastic movie."},
+     {"reference": "I love this film.", "candidate": "I enjoy this movie."}
+ ]
+
+ def evaluate_prompt(example, shots=0):
+     prompt = example["candidate"]
+     if shots == 1:
+         prompt = "Classify the sentiment of the following text: " + prompt
+     elif shots > 1:
+         prompt = "Classify the sentiment of the following text based on these examples:\n" + \
+                  "Example: This movie is terrible. -> Negative\n" + \
+                  "Example: I love this movie. -> Positive\n" + \
+                  prompt
+
+     result = nlp(prompt)[0]
+     return result
+
+ # Evaluate the prompts
+ for example in examples:
+     for shots in [0, 1, 5]:
+         result = evaluate_prompt(example, shots)
+         wandb.log({
+             'example': example['candidate'],
+             'shots': shots,
+             'result': result
+         })
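For reference, each `nlp(prompt)` call above returns a list of dicts such as `[{'label': 'LABEL_1', 'score': 0.98}]` (the standard transformers text-classification pipeline output, with label names taken from the model's id2label mapping), so the logged `result` field holds one label/score pair per prompt.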
evaluations/rag.py ADDED
@@ -0,0 +1,27 @@
+ # NOTE: LangChain has no RAGChain class; RetrievalQA is used below as the
+ # closest stock chain, and gpt2 stands in for the (non-generative) distilbert.
+ from langchain.chains import RetrievalQA
+ from langchain_huggingface import HuggingFacePipeline
+ from langchain_community.retrievers import BM25Retriever  # needs rank_bm25
+ import yaml
+
+ # Load the configuration
+ with open('config/config.yaml', 'r') as f:
+     config = yaml.safe_load(f)
+
+ # Model setup
+ llm = HuggingFacePipeline.from_model_id(model_id="gpt2", task="text-generation")
+
+ # Retriever setup over a toy corpus
+ retriever = BM25Retriever.from_texts(["This is a great movie.", "I love this film."])
+
+ # Build the RAG chain
+ rag_chain = RetrievalQA.from_chain_type(llm=llm, retriever=retriever)
+
+ # Example texts to classify
+ texts = ["This is a fantastic movie.", "I enjoy this movie."]
+
+ # Use RAG to obtain classifications with retrieved context
+ for text in texts:
+     result = rag_chain.invoke({"query": f"Classify the sentiment of the following text: {text}"})
+     print(f"Text: {text}, Result: {result}")
models/full_finetune_model.py ADDED
@@ -0,0 +1,5 @@
+ from transformers import BertForSequenceClassification
+
+ def get_full_finetune_model():
+     model = BertForSequenceClassification.from_pretrained('bert-base-uncased')
+     return model
models/lora_model.py ADDED
@@ -0,0 +1,11 @@
+ from transformers import BertForSequenceClassification
+ # transformers itself has no LoRAConfig; LoRA support comes from the peft library
+ from peft import LoraConfig, get_peft_model
+
+ def get_lora_model(config):
+     model = BertForSequenceClassification.from_pretrained('bert-base-uncased')
+     lora_config = LoraConfig(r=config['model']['lora']['r'],
+                              lora_alpha=config['model']['lora']['alpha'],
+                              task_type='SEQ_CLS')
+     model = get_peft_model(model, lora_config)
+     return model
models/pert_model.py ADDED
@@ -0,0 +1,11 @@
+ # Requires the adapter-transformers fork of transformers
+ # (pip install adapter-transformers), which provides AdapterConfig and
+ # the add_adapter / train_adapter methods used below.
+ from transformers import BertForSequenceClassification, AdapterConfig
+
+ def get_pert_model(config):
+     model = BertForSequenceClassification.from_pretrained('bert-base-uncased')
+     adapter_config = AdapterConfig(mh_adapter=True, output_adapter=True, reduction_factor=config['model']['adapter']['reduction_factor'])
+     model.add_adapter('imdb_adapter', config=adapter_config)
+     model.train_adapter('imdb_adapter')
+     return model
models/student_model.py ADDED
@@ -0,0 +1,8 @@
+ from transformers import DistilBertForSequenceClassification
+
+ def get_student_model(config):
+     model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased')
+     # Caveat: editing config.hidden_size after loading does not shrink the
+     # pretrained weights; see the note after this file for a smaller student.
+     model.config.hidden_size = config['model']['student']['hidden_size']
+     return model
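Mutating `config.hidden_size` on a loaded checkpoint does not actually shrink the network, so if a genuinely smaller student is intended, one option is to initialize a fresh DistilBERT from a reduced configuration instead — a sketch, where `dim` comes from `hidden_size` in config.yaml and the 4x feed-forward ratio and `num_labels=2` are assumptions:

from transformers import DistilBertConfig, DistilBertForSequenceClassification

def get_small_student_model(config):
    # Randomly initialized (not pretrained) student sized by the config;
    # dim must stay divisible by n_heads (384 / 12 = 32 here).
    student_config = DistilBertConfig(
        dim=config['model']['student']['hidden_size'],
        hidden_dim=4 * config['model']['student']['hidden_size'],
        n_heads=12,
        num_labels=2,
    )
    return DistilBertForSequenceClassification(student_config)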
requirements.txt ADDED
@@ -0,0 +1,32 @@
+ transformers
+ accelerate
+ torch
+ datasets
+ evaluate  # metrics backend for scripts/evaluate.py
+ peft  # LoRA support for models/lora_model.py
+ wandb
+ pyyaml
+ langchain
+ flask
+ mlflow
+ streamlit==1.32.1
+ fastapi==0.74.*
+ requests==2.27.*
+ uvicorn[standard]==0.17.*
+ sentencepiece==0.2.*
+ python-dotenv
+ wikipedia
+ tiktoken
+ neo4j
+ Pillow
+ PyPDF2
+ pydantic
+ sse-starlette
+ boto3
+ # missing from the langchain base image?
+ langchain-openai==0.2.4
+ langchain-community==0.3.3
+ langchain-google-genai==2.0.3
+ langchain-ollama==0.2.0
+ langchain-huggingface==0.1.1
+ langchain-aws==0.2.4
scripts/deploy_model.py ADDED
@@ -0,0 +1,17 @@
+ from huggingface_hub import HfApi
+ import os
+
+ def deploy_model_to_huggingface(model_path, model_name):
+     api = HfApi(token=os.getenv("HF_AUTH_TOKEN"))
+     # Make sure the target repo exists before uploading
+     api.create_repo(repo_id=model_name, exist_ok=True)
+     api.upload_folder(
+         folder_path=model_path,
+         repo_id=model_name,
+         commit_message="Deploying fine-tuned model",
+     )
+
+ if __name__ == "__main__":
+     model_path = "./results_student"
+     model_name = "your_model_name"
+     deploy_model_to_huggingface(model_path, model_name)
scripts/distill_student.py ADDED
@@ -0,0 +1,67 @@
+ import torch
+ import wandb
+ import yaml
+ from transformers import Trainer, TrainingArguments
+ from data.dataset import load_and_tokenize_data
+ from models.full_finetune_model import get_full_finetune_model
+ from models.student_model import get_student_model
+ from utils.monitor import measure_resources
+
+ # Load the configuration
+ with open('config/config.yaml', 'r') as f:
+     config = yaml.safe_load(f)
+
+ # Initialize wandb
+ wandb.init(project=config['wandb']['project'], entity=config['wandb']['entity'])
+
+ # Load the data
+ train_dataset, test_dataset = load_and_tokenize_data(config)
+
+ # Load the teacher and student models (note: the teacher is a freshly
+ # loaded bert-base-uncased; a meaningful distillation would use a
+ # fine-tuned checkpoint)
+ teacher_model = get_full_finetune_model()
+ teacher_model.eval()
+ student_model = get_student_model(config)
+
+ # Define the training arguments for distillation
+ training_args = TrainingArguments(
+     output_dir='./results_student',
+     num_train_epochs=config['training']['num_epochs'],
+     per_device_train_batch_size=config['training']['batch_size'],
+     per_device_eval_batch_size=config['training']['batch_size'],
+     evaluation_strategy='epoch',
+     save_steps=10_000,
+     save_total_limit=2,
+     logging_dir='./logs',
+     logging_steps=10,
+ )
+
+ # Define the distillation trainer
+ class DistillationTrainer(Trainer):
+     def compute_loss(self, model, inputs, return_outputs=False):
+         # Forward pass of the teacher model (no gradients needed)
+         with torch.no_grad():
+             teacher_outputs = teacher_model(**inputs)
+
+         # Forward pass of the student model
+         student_outputs = model(**inputs)
+
+         # Compute the distillation (KL-divergence) loss
+         loss = torch.nn.functional.kl_div(
+             torch.nn.functional.log_softmax(student_outputs.logits, dim=-1),
+             torch.nn.functional.softmax(teacher_outputs.logits, dim=-1),
+             reduction='batchmean'
+         )
+         return (loss, student_outputs) if return_outputs else loss
+
+ # Create the Trainer for distillation
+ trainer = DistillationTrainer(
+     model=student_model,
+     args=training_args,
+     train_dataset=train_dataset,
+     eval_dataset=test_dataset,
+ )
+
+ # Measure resources and train the student model
+ measure_resources(trainer, "Distillation")
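Knowledge-distillation losses usually soften both distributions with a temperature T and rescale by T²; a drop-in variant of the loss above under that convention (T = 2.0 is an assumed hyperparameter, not taken from this repo):

import torch.nn.functional as F

def distillation_loss(student_logits, teacher_logits, temperature=2.0):
    # Softened teacher/student distributions; the T**2 factor keeps
    # gradient magnitudes comparable across temperatures.
    return F.kl_div(
        F.log_softmax(student_logits / temperature, dim=-1),
        F.softmax(teacher_logits / temperature, dim=-1),
        reduction='batchmean'
    ) * temperature ** 2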
scripts/evaluate.py ADDED
@@ -0,0 +1,47 @@
+ import wandb
+ import evaluate  # replaces the deprecated datasets.load_metric
+ from transformers import pipeline
+ import yaml
+
+ # Load the configuration
+ with open('config/config.yaml', 'r') as f:
+     config = yaml.safe_load(f)
+
+ # Load the fine-tuned model
+ model_name = "results_student"  # Replace with the path to the student model
+ tokenizer_name = "distilbert-base-uncased"
+
+ # Evaluation setup
+ bleu = evaluate.load("bleu")
+ rouge = evaluate.load("rouge")
+
+ # Initialize wandb
+ wandb.init(project=config['wandb']['project'], entity=config['wandb']['entity'])
+
+ def evaluate_model(model_name, tokenizer_name):
+     nlp = pipeline("text-classification", model=model_name, tokenizer=tokenizer_name)
+
+     # Simulated examples for evaluation
+     examples = [
+         {"reference": "This is a great movie.", "candidate": "This is a fantastic movie."},
+         {"reference": "I love this film.", "candidate": "I enjoy this movie."}
+     ]
+
+     references = [e["reference"] for e in examples]
+     # Caveat: the pipeline returns classification labels (e.g. "POSITIVE"),
+     # so scoring them against reference sentences with BLEU/ROUGE is only a
+     # placeholder evaluation
+     candidates = [nlp(e["candidate"])[0]["label"] for e in examples]
+
+     # Compute the BLEU and ROUGE scores
+     bleu_score = bleu.compute(predictions=candidates, references=references)
+     rouge_score = rouge.compute(predictions=candidates, references=references)
+
+     # Log the scores to wandb
+     wandb.log({
+         "bleu_score": bleu_score,
+         "rouge_score": rouge_score
+     })
+
+ # Evaluate the models
+ evaluate_model(model_name, tokenizer_name)
scripts/train_full_finetune.py ADDED
@@ -0,0 +1,42 @@
+ import wandb
+ import yaml
+ from transformers import Trainer, TrainingArguments
+ from utils.monitor import measure_resources
+ from data.dataset import load_and_tokenize_data
+ from models.full_finetune_model import get_full_finetune_model
+
+ # Load the configuration
+ with open('config/config.yaml', 'r') as f:
+     config = yaml.safe_load(f)
+
+ # Initialize wandb
+ wandb.init(project=config['wandb']['project'], entity=config['wandb']['entity'])
+
+ # Load the data
+ train_dataset, test_dataset = load_and_tokenize_data(config)
+
+ # Load the model
+ model = get_full_finetune_model()
+
+ # Define the training arguments
+ training_args = TrainingArguments(
+     output_dir='./results',
+     num_train_epochs=config['training']['num_epochs'],
+     per_device_train_batch_size=config['training']['batch_size'],
+     per_device_eval_batch_size=config['training']['batch_size'],
+     evaluation_strategy='epoch',
+     save_steps=10_000,
+     save_total_limit=2,
+     logging_dir='./logs',
+     logging_steps=10,
+ )
+
+ # Create the Trainer
+ trainer = Trainer(
+     model=model,
+     args=training_args,
+     train_dataset=train_dataset,
+     eval_dataset=test_dataset,
+ )
+ # Measure resources and train the model
+ measure_resources(trainer, "Full Fine-Tuning")
scripts/train_lora.py ADDED
@@ -0,0 +1,43 @@
+ import wandb
+ import yaml
+ from transformers import Trainer, TrainingArguments
+ from utils.monitor import measure_resources
+ from data.dataset import load_and_tokenize_data
+ from models.lora_model import get_lora_model
+
+ # Load the configuration
+ with open('config/config.yaml', 'r') as f:
+     config = yaml.safe_load(f)
+
+ # Initialize wandb
+ wandb.init(project=config['wandb']['project'], entity=config['wandb']['entity'])
+
+ # Load the data
+ train_dataset, test_dataset = load_and_tokenize_data(config)
+
+ # Load the model
+ model = get_lora_model(config)
+
+ # Define the training arguments
+ training_args = TrainingArguments(
+     output_dir='./results',
+     num_train_epochs=config['training']['num_epochs'],
+     per_device_train_batch_size=config['training']['batch_size'],
+     per_device_eval_batch_size=config['training']['batch_size'],
+     evaluation_strategy='epoch',
+     save_steps=10_000,
+     save_total_limit=2,
+     logging_dir='./logs',
+     logging_steps=10,
+ )
+
+ # Create the Trainer
+ trainer = Trainer(
+     model=model,
+     args=training_args,
+     train_dataset=train_dataset,
+     eval_dataset=test_dataset,
+ )
+
+ # Measure resources and train the model
+ measure_resources(trainer, "LoRA")
scripts/train_pert.py ADDED
@@ -0,0 +1,43 @@
+ import wandb
+ import yaml
+ from transformers import Trainer, TrainingArguments
+ from utils.monitor import measure_resources
+ from data.dataset import load_and_tokenize_data
+ from models.pert_model import get_pert_model
+
+ # Load the configuration
+ with open('config/config.yaml', 'r') as f:
+     config = yaml.safe_load(f)
+
+ # Initialize wandb
+ wandb.init(project=config['wandb']['project'], entity=config['wandb']['entity'])
+
+ # Load the data
+ train_dataset, test_dataset = load_and_tokenize_data(config)
+
+ # Load the model
+ model = get_pert_model(config)
+
+ # Define the training arguments
+ training_args = TrainingArguments(
+     output_dir='./results',
+     num_train_epochs=config['training']['num_epochs'],
+     per_device_train_batch_size=config['training']['batch_size'],
+     per_device_eval_batch_size=config['training']['batch_size'],
+     evaluation_strategy='epoch',
+     save_steps=10_000,
+     save_total_limit=2,
+     logging_dir='./logs',
+     logging_steps=10,
+ )
+
+ # Create the Trainer
+ trainer = Trainer(
+     model=model,
+     args=training_args,
+     train_dataset=train_dataset,
+     eval_dataset=test_dataset,
+ )
+
+ # Measure resources and train the model
+ measure_resources(trainer, "PERT (Adapters)")
tests/test_datasets.py ADDED
@@ -0,0 +1,15 @@
+ import unittest
+ from data.dataset import load_and_tokenize_data
+
+ class TestDatasets(unittest.TestCase):
+
+     def test_load_and_tokenize_data(self):
+         # Small imdb slice to keep the test cheap
+         config = {'dataset': {'name': 'imdb', 'split': 'train[:1%]'}}
+         train_dataset, test_dataset = load_and_tokenize_data(config)
+         self.assertIsNotNone(train_dataset)
+         self.assertIsNotNone(test_dataset)
+         self.assertIn('input_ids', train_dataset.column_names)
+
+ if __name__ == '__main__':
+     unittest.main()
tests/test_metrics.py ADDED
@@ -0,0 +1,14 @@
+ import unittest
+ from evaluations.evaluate_models import evaluate_model
+
+ class TestMetrics(unittest.TestCase):
+
+     def test_evaluate_model(self):
+         # evaluate_model trains and then evaluates, so this is an
+         # expensive integration test rather than a true unit test
+         results = evaluate_model('distilbert-base-uncased')
+         self.assertIsNotNone(results)
+         self.assertIn('eval_loss', results)
+
+ if __name__ == '__main__':
+     unittest.main()
tests/test_models.py ADDED
@@ -0,0 +1,11 @@
+ import unittest
+ from models.full_finetune_model import get_full_finetune_model
+
+ class TestModels(unittest.TestCase):
+
+     def test_get_full_finetune_model(self):
+         model = get_full_finetune_model()
+         self.assertIsNotNone(model)
+
+ if __name__ == '__main__':
+     unittest.main()
utils/monitor.py ADDED
@@ -0,0 +1,24 @@
+ import time
+ import torch
+ import wandb
+
+ def measure_resources(trainer, method_name):
+     start_time = time.time()
+     if torch.cuda.is_available():
+         torch.cuda.reset_peak_memory_stats()
+
+     # Run training and record the metrics
+     trainer.train()
+
+     end_time = time.time()
+     # Peak GPU memory in MB (0 when running on CPU)
+     peak_memory = torch.cuda.max_memory_allocated() / (1024 * 1024) if torch.cuda.is_available() else 0.0
+     training_time = end_time - start_time
+
+     # Log the metrics to wandb
+     wandb.log({
+         'method': method_name,
+         'peak_memory_MB': peak_memory,
+         'training_time_seconds': training_time
+     })
+     return peak_memory, training_time