Ngaima Sandiman committed on
Commit
9cc3964
1 Parent(s): 749932e

Changed transformers version to fix issues.

Browse files
requirements.txt CHANGED
@@ -1,14 +1,15 @@
1
  -e git+https://github.com/facebookresearch/audiocraft.git@f83babff6b5e97f75562127c4cc8122229c8f099#egg=audiocraft
2
  phonemizer
 
 
 
 
 
3
  spaces
4
  huggingface-hub
5
  num2words
6
- transformers
7
  numpy
8
  pillow
9
  safetensors
10
  tokenizers
11
- torch==2.1.0
12
- torchaudio
13
- torchvision
14
  aeneas
 
1
  -e git+https://github.com/facebookresearch/audiocraft.git@f83babff6b5e97f75562127c4cc8122229c8f099#egg=audiocraft
2
  phonemizer
3
+ transformers==4.43.1
4
+ torch==2.1.1
5
+ numpy==2.0.1
6
+ torchaudio
7
+ torchvision
8
  spaces
9
  huggingface-hub
10
  num2words
 
11
  numpy
12
  pillow
13
  safetensors
14
  tokenizers
 
 
 
15
  aeneas
src/model/modules/imagecraft.py CHANGED
@@ -405,15 +405,10 @@ class ImageCraft(nn.Module):
405
  max_tokens=30,
406
  do_sample=False,
407
  output_type="file",
408
- return_output="speech",
409
  ):
410
- if return_output == "speech" or return_output is None:
411
- transcript = self._generate_caption(image, max_tokens, do_sample)
412
- speech = self._generate_speech(transcript, output_type)
413
- return transcript, speech
414
- else:
415
- transcript = self._generate_caption(image, max_tokens, do_sample)
416
- return transcript
417
 
418
  @classmethod
419
  def from_pretrained(
 
405
  max_tokens=30,
406
  do_sample=False,
407
  output_type="file",
 
408
  ):
409
+ transcript = self._generate_caption(image, max_tokens, do_sample)
410
+ speech = self._generate_speech(transcript, output_type)
411
+ return transcript, speech
 
 
 
 
412
 
413
  @classmethod
414
  def from_pretrained(
src/model/modules/imagecraftprocessor.py CHANGED
@@ -40,9 +40,6 @@ class ImageCraftProcessor:
40
  tokenizer.add_eos_token = False
41
 
42
  self.tokenizer = tokenizer
43
- # self.image_processor = SiglipImageProcessor.from_pretrained(
44
- # "google/siglip-so400m-patch14-384"
45
- # )
46
 
47
  def __call__(
48
  self,
@@ -55,9 +52,6 @@ class ImageCraftProcessor:
55
  len(images) == 1 and len(text) == 1
56
  ), f"Received {len(images)} images for {len(text)} prompts."
57
 
58
- # pixel_values = self.image_processor(images=images, return_tensors="pt")[
59
- # "pixel_values"
60
- # ]
61
  pixel_values = process_images(
62
  images,
63
  size=(self.image_size, self.image_size),
 
40
  tokenizer.add_eos_token = False
41
 
42
  self.tokenizer = tokenizer
 
 
 
43
 
44
  def __call__(
45
  self,
 
52
  len(images) == 1 and len(text) == 1
53
  ), f"Received {len(images)} images for {len(text)} prompts."
54
 
 
 
 
55
  pixel_values = process_images(
56
  images,
57
  size=(self.image_size, self.image_size),
src/utils/model_utils.py CHANGED
@@ -19,13 +19,11 @@ def get_model_inputs(
19
  processor: ImageCraftProcessor,
20
  prompt: str,
21
  image: Image,
22
- suffix: Optional[str] = None,
23
  device: str = "cuda",
24
  ):
25
  images = [image]
26
  prompts = [prompt]
27
- if suffix is not None:
28
- suffix = [suffix]
29
  model_inputs = processor(text=prompts, images=images)
30
  model_inputs = move_inputs_to_device(model_inputs, device)
31
  return model_inputs
@@ -38,36 +36,3 @@ def get_config(config_file="config.json"):
38
  config = ImageCraftConfig(**model_config_file)
39
 
40
  return config
41
-
42
-
43
- # def load_hf_model(model_path: str, device: str) -> Tuple[ImageCraft, AutoTokenizer]:
44
-
45
- # # Load the tokenizer
46
- # tokenizer = AutoTokenizer.from_pretrained(model_path, padding_side="right")
47
- # assert tokenizer.padding_side == "right"
48
-
49
- # # Find all the *.safetensors files
50
- # safetensors_files = glob.glob(os.path.join(model_path, "*.safetensors"))
51
-
52
- # # ... and load them one by one in the tensors dictionary
53
- # tensors = {}
54
- # for safetensors_file in safetensors_files:
55
- # with safe_open(safetensors_file, framework="pt", device="cpu") as f:
56
- # for key in f.keys():
57
- # tensors[key] = f.get_tensor(key)
58
-
59
- # # Load the model's config
60
- # with open(os.path.join(model_path, "config.json"), "r") as f:
61
- # model_config_file = json.load(f)
62
- # config = ImageCraftConfig(**model_config_file)
63
-
64
- # # Create the model using the configuration
65
- # model = ImageCraft(config).to(device)
66
-
67
- # # Load the state dict of the model
68
- # model.load_state_dict(tensors, strict=False)
69
-
70
- # # Tie weights
71
- # model.tie_weights()
72
-
73
- # return (model, tokenizer)
 
19
  processor: ImageCraftProcessor,
20
  prompt: str,
21
  image: Image,
 
22
  device: str = "cuda",
23
  ):
24
  images = [image]
25
  prompts = [prompt]
26
+
 
27
  model_inputs = processor(text=prompts, images=images)
28
  model_inputs = move_inputs_to_device(model_inputs, device)
29
  return model_inputs
 
36
  config = ImageCraftConfig(**model_config_file)
37
 
38
  return config