import gc
import unittest

import pytest
import torch
from transformers import AutoModelForCausalLM, AutoModelForSeq2SeqLM, AutoTokenizer, WhisperForConditionalGeneration

from peft import LoraConfig, PeftModel, get_peft_model
from peft.import_utils import is_bnb_available

from .testing_utils import require_bitsandbytes, require_torch_gpu, require_torch_multi_gpu


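# bitsandbytes is an optional dependency, so import the 8-bit LoRA layer only when it
# is actually available; this keeps the module importable on machines without bitsandbytes.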
if is_bnb_available():
    from peft.tuners.lora import Linear8bitLt


@require_torch_gpu
class PeftGPUCommonTests(unittest.TestCase):
    r"""
    A common tester for operations that have to run on a GPU, such as generation and loading models in 8-bit.
    """

    def setUp(self):
        self.seq2seq_model_id = "google/flan-t5-base"
        self.causal_lm_model_id = "facebook/opt-350m"
        self.audio_model_id = "openai/whisper-large"
        self.device = torch.device("cuda:0")

    def tearDown(self):
        r"""
        Efficient mechanism to free GPU memory after each test. Based on
        https://github.com/huggingface/transformers/issues/21094
        """
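        # Drop unreferenced Python objects so their CUDA tensors can be freed, release
        # PyTorch's cached CUDA blocks, then collect once more.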
        gc.collect()
        torch.cuda.empty_cache()
        gc.collect()

    @require_bitsandbytes
    def test_lora_bnb_quantization(self):
        r"""
        Test that 8-bit quantization with LoRA works as expected.
        """
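        # Load each base model quantized to 8-bit via bitsandbytes; device_map="auto"
        # lets accelerate decide where to place the weights.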
        whisper_8bit = WhisperForConditionalGeneration.from_pretrained(
            self.audio_model_id,
            device_map="auto",
            load_in_8bit=True,
        )

        opt_8bit = AutoModelForCausalLM.from_pretrained(
            self.causal_lm_model_id,
            device_map="auto",
            load_in_8bit=True,
        )

        flan_8bit = AutoModelForSeq2SeqLM.from_pretrained(
            self.seq2seq_model_id,
            device_map="auto",
            load_in_8bit=True,
        )
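        # Note that target module names differ per architecture: T5 attention projections
        # are named "q"/"v", while OPT and Whisper use "q_proj"/"v_proj".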
        flan_lora_config = LoraConfig(
            r=16, lora_alpha=32, target_modules=["q", "v"], lora_dropout=0.05, bias="none", task_type="SEQ_2_SEQ_LM"
        )

        opt_lora_config = LoraConfig(
            r=16,
            lora_alpha=32,
            target_modules=["q_proj", "v_proj"],
            lora_dropout=0.05,
            bias="none",
            task_type="CAUSAL_LM",
        )

        whisper_lora_config = LoraConfig(
            r=32, lora_alpha=64, target_modules=["q_proj", "v_proj"], lora_dropout=0.05, bias="none"
        )

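        # get_peft_model should replace each targeted projection with the bitsandbytes-backed
        # LoRA layer, which is what the isinstance checks below verify.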
        flan_8bit = get_peft_model(flan_8bit, flan_lora_config)
        self.assertTrue(
            isinstance(flan_8bit.base_model.model.encoder.block[0].layer[0].SelfAttention.q, Linear8bitLt)
        )

        opt_8bit = get_peft_model(opt_8bit, opt_lora_config)
        self.assertTrue(isinstance(opt_8bit.base_model.model.model.decoder.layers[0].self_attn.v_proj, Linear8bitLt))

        whisper_8bit = get_peft_model(whisper_8bit, whisper_lora_config)
        self.assertTrue(
            isinstance(whisper_8bit.base_model.model.model.decoder.layers[0].self_attn.v_proj, Linear8bitLt)
        )

    @pytest.mark.multi_gpu_tests
    @require_torch_multi_gpu
    def test_lora_causal_lm_multi_gpu_inference(self):
        r"""
        Test that LoRA can be used for inference on multiple GPUs.
        """
        lora_config = LoraConfig(
            r=16,
            lora_alpha=32,
            target_modules=["q_proj", "v_proj"],
            lora_dropout=0.05,
            bias="none",
            task_type="CAUSAL_LM",
        )

        model = AutoModelForCausalLM.from_pretrained(self.causal_lm_model_id, device_map="balanced")
        tokenizer = AutoTokenizer.from_pretrained(self.causal_lm_model_id)

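        # device_map="balanced" should shard the model across both GPUs, so the device
        # map is expected to reference exactly devices 0 and 1.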
        self.assertEqual(set(model.hf_device_map.values()), {0, 1})

        model = get_peft_model(model, lora_config)
        self.assertTrue(isinstance(model, PeftModel))

        dummy_input = "This is a dummy input:"
        input_ids = tokenizer(dummy_input, return_tensors="pt").input_ids.to(self.device)

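        # The inputs live on cuda:0; with a sharded model, accelerate's dispatch hooks
        # are expected to move activations between devices during generation.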
        _ = model.generate(input_ids=input_ids)

    @require_torch_multi_gpu
    @pytest.mark.multi_gpu_tests
    @require_bitsandbytes
    def test_lora_seq2seq_lm_multi_gpu_inference(self):
        r"""
        Test that LoRA can be used for inference on multiple GPUs - 8-bit version.
        """
        lora_config = LoraConfig(
            r=16, lora_alpha=32, target_modules=["q", "v"], lora_dropout=0.05, bias="none", task_type="SEQ_2_SEQ_LM"
        )

        model = AutoModelForSeq2SeqLM.from_pretrained(self.seq2seq_model_id, device_map="balanced", load_in_8bit=True)
        tokenizer = AutoTokenizer.from_pretrained(self.seq2seq_model_id)

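        # The 8-bit model should again be spread over both GPUs, and after wrapping with
        # LoRA the targeted projections should be the bitsandbytes-backed LoRA layer.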
        self.assertEqual(set(model.hf_device_map.values()), {0, 1})

        model = get_peft_model(model, lora_config)
        self.assertTrue(isinstance(model, PeftModel))
        self.assertTrue(isinstance(model.base_model.model.encoder.block[0].layer[0].SelfAttention.q, Linear8bitLt))

        dummy_input = "This is a dummy input:"
        input_ids = tokenizer(dummy_input, return_tensors="pt").input_ids.to(self.device)

        _ = model.generate(input_ids=input_ids)