supermomo668
/

doodle-musicgen

Inference Endpoints

Model card Files Files and versions Community

supermomo668 committed on Aug 6, 2023

Commit

800934b

•

1 Parent(s): a7b15b1

stable

Browse files

Files changed (1) hide show

handler.py +35 -17

handler.py CHANGED Viewed

@@ -9,7 +9,7 @@ from audiocraft.models import MusicGen
 import yaml
 import math
-# import torchaudio
 import torch
 def get_bip_bip(
@@ -46,13 +46,15 @@ class generator:
 			duration=self.conf['duration']
 		)
 		device = "cuda" if torch.cuda.is_available() else "cpu"
-		self.model.to(device)
-		self.sampling_rate = self.model.config.audio_encoder.sampling_rate
-	def preprocess(self, text, audio):
-		audio = audio[: int(len(audio) // self.conf['nth_slice_prompt'])]
-	def generate(self, text:list, audio: np.array, **kwargs):
 		"""
 		text: ["modern melodic electronic dance music", "80s blues track with groovy saxophone"]
 		audio (np.array)
@@ -64,14 +66,27 @@ class generator:
 		# 	padding=True,
 		# 	return_tensors="pt",
 		# )
-		output = self.model.generate_with_chroma(
-			descriptions=[
-				text
-			],
-			melody_wavs=audio,
-			melody_sample_rate=self.conf['sampling_rate'],
-			progress=True
-		)
 		return output
@@ -80,7 +95,7 @@ class EndpointHandler:
 		# load model and processor from path
 		# self.model = MusicGen.from_pretrained(
 		# 	path, torch_dtype=torch.float16).to("cuda")
-		self.generator = generator('.conf/generation_conf.yaml')
 	def __call__(self, data: Dict[str, Any]) -> Dict[str, str]:
 		"""
@@ -88,12 +103,15 @@ class EndpointHandler:
 				data (:dict:):
 						The payload with the text prompt and generation parameters.
 		"""
-		prompt_duration = 2
 		# process input
 		text = data.pop("text", data)
 		audio = data.pop("audio", data)
 		parameters = data.pop("parameters", None)
-		audio, sr = sf.read(io.BytesIO(audio))
 		output = self.generate(text, audio, sr)
 		# # pass inputs with all kwargs in data

 import yaml
 import math
+import torchaudio
 import torch
 def get_bip_bip(
 			duration=self.conf['duration']
 		)
 		device = "cuda" if torch.cuda.is_available() else "cpu"
+		# self.model.to(device)
+		self.sampling_rate = self.model.sample_rate   # config.audio_encoder.sampling_rate
+	def preprocess(self, text, audio=None):
+		if audio is not None:
+			audio = audio[: int(len(audio) // self.conf['nth_slice_prompt'])]
+		return text, audio
+	def generate(self, text:list, audio: np.array=None, **kwargs):
 		"""
 		text: ["modern melodic electronic dance music", "80s blues track with groovy saxophone"]
 		audio (np.array)
 		# 	padding=True,
 		# 	return_tensors="pt",
 		# )
+		if kwargs.get('sr'):
+			sr = kwargs.get('sr')
+		else:
+			sr = self.conf['sampling_rate']
+		print(f"Generating from: Text:{text is not None} | audio:{audio is not None}")
+		text, audio = self.preprocess(text, audio)
+		if self.conf['model'] == 'melody' and audio is not None:
+			output = self.model.generate_with_chroma(
+				descriptions=[
+					text
+				],
+				melody_wavs=audio,
+				melody_sample_rate=sr,
+				# progress=True
+			)
+		else:
+			output = self.model.generate_continuation(
+				get_bip_bip(0.125), # .expand(2, -1, -1),
+    		32000, text,
+		    # progress_bar=True
+			)
 		return output
 		# load model and processor from path
 		# self.model = MusicGen.from_pretrained(
 		# 	path, torch_dtype=torch.float16).to("cuda")
+		self.generator = generator(os.path.join(path, '.conf/generation_conf.yaml'))
 	def __call__(self, data: Dict[str, Any]) -> Dict[str, str]:
 		"""
 				data (:dict:):
 						The payload with the text prompt and generation parameters.
 		"""
+		# prompt_duration = 2
 		# process input
 		text = data.pop("text", data)
 		audio = data.pop("audio", data)
 		parameters = data.pop("parameters", None)
+		audio, sr = torchaudio.load(audio)
+		audio = audio.unsqueeze(0)
+		# audio, sr = sf.read(io.BytesIO(audio))
 		output = self.generate(text, audio, sr)
 		# # pass inputs with all kwargs in data