sander-wood committed on
Commit
60735e7
1 Parent(s): 1c87ac0

Delete run_inference.py

Browse files
Files changed (1) hide show
  1. run_inference.py +0 -186
run_inference.py DELETED
@@ -1,186 +0,0 @@
1
- import gradio as gr
2
- import torch
3
- import random
4
- from unidecode import unidecode
5
- from transformers import GPT2LMHeadModel
6
- from samplings import top_p_sampling, temperature_sampling
7
-
8
- device = torch.device("cpu")
9
-
10
- description = """
11
- <div>
12
- <a style="display:inline-block" href='https://github.com/suno-ai/bark'><img src='https://img.shields.io/github/stars/suno-ai/bark?style=social' /></a>
13
- <a style='display:inline-block' href='https://discord.gg/J2B2vsjKuE'><img src='https://dcbadge.vercel.app/api/server/J2B2vsjKuE?compact=true&style=flat' /></a>
14
- <a style="display:inline-block; margin-left: 1em" href="https://huggingface.co/spaces/suno/bark?duplicate=true"><img src="https://img.shields.io/badge/-Duplicate%20Space%20to%20skip%20the%20queue-blue?labelColor=white&style=flat&logo=data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAABAAAAAQCAYAAAAf8/9hAAAAAXNSR0IArs4c6QAAAP5JREFUOE+lk7FqAkEURY+ltunEgFXS2sZGIbXfEPdLlnxJyDdYB62sbbUKpLbVNhyYFzbrrA74YJlh9r079973psed0cvUD4A+4HoCjsA85X0Dfn/RBLBgBDxnQPfAEJgBY+A9gALA4tcbamSzS4xq4FOQAJgCDwV2CPKV8tZAJcAjMMkUe1vX+U+SMhfAJEHasQIWmXNN3abzDwHUrgcRGmYcgKe0bxrblHEB4E/pndMazNpSZGcsZdBlYJcEL9Afo75molJyM2FxmPgmgPqlWNLGfwZGG6UiyEvLzHYDmoPkDDiNm9JR9uboiONcBXrpY1qmgs21x1QwyZcpvxt9NS09PlsPAAAAAElFTkSuQmCC&logoWidth=14" alt="Duplicate Space"></a>
15
- </div>
16
- Bark is a universal text-to-audio model created by [Suno](www.suno.ai), with code publicly available [here](https://github.com/suno-ai/bark). \
17
- Bark can generate highly realistic, multilingual speech as well as other audio - including music, background noise and simple sound effects. \
18
- This demo should be used for research purposes only. Commercial use is strictly prohibited. \
19
- The model output is not censored and the authors do not endorse the opinions in the generated content. \
20
- Use at your own risk.
21
- """
22
-
23
- article = """
24
- ## 🌎 Foreign Language
25
- Bark supports various languages out-of-the-box and automatically determines language from input text. \
26
- When prompted with code-switched text, Bark will even attempt to employ the native accent for the respective languages in the same voice.
27
- Try the prompt:
28
- ```
29
- Buenos días Miguel. Tu colega piensa que tu alemán es extremadamente malo. But I suppose your english isn't terrible.
30
- ```
31
- ## 🤭 Non-Speech Sounds
32
- Below is a list of some known non-speech sounds, but we are finding more every day. \
33
- Please let us know if you find patterns that work particularly well on Discord!
34
- * [laughter]
35
- * [laughs]
36
- * [sighs]
37
- * [music]
38
- * [gasps]
39
- * [clears throat]
40
- * — or ... for hesitations
41
- * ♪ for song lyrics
42
- * capitalization for emphasis of a word
43
- * MAN/WOMAN: for bias towards speaker
44
- Try the prompt:
45
- ```
46
- " [clears throat] Hello, my name is Suno. And, uh — and I like pizza. [laughs] But I also have other interests such as... ♪ singing ♪."
47
- ```
48
- ## 🎶 Music
49
- Bark can generate all types of audio, and, in principle, doesn't see a difference between speech and music. \
50
- Sometimes Bark chooses to generate text as music, but you can help it out by adding music notes around your lyrics.
51
- Try the prompt:
52
- ```
53
- ♪ In the jungle, the mighty jungle, the lion barks tonight ♪
54
- ```
55
- ## 🧬 Voice Cloning
56
- Bark has the capability to fully clone voices - including tone, pitch, emotion and prosody. \
57
- The model also attempts to preserve music, ambient noise, etc. from input audio. \
58
- However, to mitigate misuse of this technology, we limit the audio history prompts to a limited set of Suno-provided, fully synthetic options to choose from.
59
- ## 👥 Speaker Prompts
60
- You can provide certain speaker prompts such as NARRATOR, MAN, WOMAN, etc. \
61
- Please note that these are not always respected, especially if a conflicting audio history prompt is given.
62
- Try the prompt:
63
- ```
64
- WOMAN: I would like an oatmilk latte please.
65
- MAN: Wow, that's expensive!
66
- ```
67
- ## Details
68
- Bark model by [Suno](https://suno.ai/), including official [code](https://github.com/suno-ai/bark) and model weights. \
69
- Gradio demo supported by 🤗 Hugging Face. Bark is licensed under a non-commercial license: CC-BY 4.0 NC, see details on [GitHub](https://github.com/suno-ai/bark).
70
- """
71
-
72
- # examples = [
73
- # "Jazz standard in Minor key with a swing feel.",
74
- # "Jazz standard in Major key with a fast tempo.",
75
- # "Jazz standard in Blues form with a soulful melody.",
76
- # "a painting of a starry night with the moon in the sky",
77
- # "a green field with a blue sky and clouds",
78
- # "a beach with a castle on top of it"
79
- # ]
80
-
81
class ABCTokenizer():
    """Character-level tokenizer with merged control-code tokens.

    Ordinary characters are mapped to their code point via ``ord``. Each
    control code listed in ``merged_tokens`` is collapsed into a single id,
    numbered from 128 upwards in list order.
    """

    def __init__(self):
        self.pad_token_id = 0
        self.bos_token_id = 2
        self.eos_token_id = 3
        # Control codes in id order: [SECS_1..8], [BARS_1..32], [SIM_0..10].
        self.merged_tokens = (
            ['[SECS_' + str(i + 1) + ']' for i in range(8)]
            + ['[BARS_' + str(i + 1) + ']' for i in range(32)]
            + ['[SIM_' + str(i) + ']' for i in range(11)]
        )

    def __len__(self):
        """Return the vocabulary size: 128 character ids plus the merged tokens."""
        return 128 + len(self.merged_tokens)

    def encode(self, text):
        """Tokenize ``text`` into a dict of ``input_ids`` and ``attention_mask`` tensors.

        The ids are wrapped in BOS/EOS by ``txt2ids``; the attention mask is
        all ones and has the same length as ``input_ids``.
        """
        input_ids = torch.tensor(self.txt2ids(text, self.merged_tokens))
        return {
            'input_ids': input_ids,
            'attention_mask': torch.tensor([1] * len(input_ids)),
        }

    def decode(self, ids, skip_special_tokens=False):
        """Turn an iterable of ids back into text.

        Ids >= 128 are rendered as their merged token unless
        ``skip_special_tokens`` is true. BOS and EOS ids are always dropped;
        every other id below 128 is rendered with ``chr``.
        """
        pieces = []
        for raw_id in ids:
            # Normalise tensor / numpy scalars so indexing and chr() always
            # receive a plain Python int.
            token_id = int(raw_id)
            if token_id >= 128:
                if not skip_special_tokens:
                    pieces.append(self.merged_tokens[token_id - 128])
            elif token_id != self.bos_token_id and token_id != self.eos_token_id:
                pieces.append(chr(token_id))
        return "".join(pieces)

    def txt2ids(self, text, merged_tokens):
        """Convert ``text`` to a list of ids wrapped in BOS ... EOS.

        Each character becomes a quoted code point (e.g. ``"91"``) so that a
        whole merged token can be substituted with one quoted id by plain
        string replacement, without ever matching part of a number. Tokens
        are substituted in ``merged_tokens`` order.
        """
        if not text:
            # ''.split(' ') yields [''], which would make int('') raise below.
            return [self.bos_token_id, self.eos_token_id]

        txt_ids = ' '.join("\"" + str(ord(c)) + "\"" for c in text)
        for t_idx, token in enumerate(merged_tokens):
            token_txt_ids = ' '.join("\"" + str(ord(c)) + "\"" for c in token)
            txt_ids = txt_ids.replace(token_txt_ids, "\"" + str(t_idx + 128) + "\"")

        # Strip the surrounding quotes from every entry before converting.
        body = [int(quoted[1:-1]) for quoted in txt_ids.split(' ')]
        return [self.bos_token_id] + body + [self.eos_token_id]
125
-
126
def _normalize_seed(seed):
    """Map textual "no seed" values ("", "None", any case) to ``None``; pass others through."""
    if isinstance(seed, str) and seed.strip().lower() in ("", "none"):
        return None
    return seed


def generate_abc(control_codes, prefix, num_tunes, max_length, top_p, temperature, seed):
    """Sample ``num_tunes`` continuations of ``control_codes + prefix``.

    Args:
        control_codes: Control-code string prepended to ``prefix``.
        prefix: Text the generated tunes start with.
        num_tunes: Number of tunes to sample (cast to ``int``).
        max_length: Maximum number of tokens sampled per tune (cast to ``int``).
        top_p: Forwarded to ``top_p_sampling``.
        temperature: Forwarded to ``temperature_sampling``.
        seed: Value passed to ``random.seed``. ``None``, an empty string or
            the text "None" (what the Seed textbox sends by default) all mean
            "unseeded".

    Returns:
        All tunes concatenated; each starts with an ``X:<n>`` line and is
        followed by a blank line. A tune that reaches ``max_length`` without
        an end-of-sequence token is returned as-is rather than dropped.
    """
    seed = _normalize_seed(seed)
    num_tunes = int(num_tunes)
    max_length = int(max_length)

    prefix = unidecode(control_codes + prefix)
    tokenizer = ABCTokenizer()
    model = GPT2LMHeadModel.from_pretrained('sander-wood/tunesformer').to(device)

    if prefix:
        # Drop the trailing EOS so the model continues the prompt.
        ids = tokenizer.encode(prefix)['input_ids'][:-1]
    else:
        ids = torch.tensor([tokenizer.bos_token_id])

    random.seed(seed)
    tunes = []

    for c_idx in range(num_tunes):
        header = "X:" + str(c_idx + 1) + "\n"
        print("\n" + header, end="")
        print(tokenizer.decode(ids[1:], skip_special_tokens=True), end="")
        input_ids = ids.unsqueeze(0)

        for t_idx in range(max_length):
            if seed is not None:
                # Derive a fresh per-step seed from the seeded stream.
                n_seed = random.randint(0, 1000000)
                random.seed(n_seed)
            else:
                n_seed = None

            # Inference only: skip building the autograd graph.
            with torch.no_grad():
                outputs = model(input_ids=input_ids.to(device))
            probs = outputs.logits[0][-1]
            probs = torch.nn.Softmax(dim=-1)(probs).cpu().detach().numpy()
            sampled_id = temperature_sampling(probs=top_p_sampling(probs,
                                                                   top_p=top_p,
                                                                   seed=n_seed,
                                                                   return_probs=True),
                                              seed=n_seed,
                                              temperature=temperature)
            input_ids = torch.cat((input_ids, torch.tensor([[sampled_id]])), 1)
            if sampled_id == tokenizer.eos_token_id:
                break
            print(tokenizer.decode([sampled_id], skip_special_tokens=True), end="")

        # Reached on EOS and on hitting max_length, so every tune is kept.
        # input_ids[0] (not squeeze()) stays iterable for a length-1 sequence.
        tunes.append(header + tokenizer.decode(input_ids[0], skip_special_tokens=True) + "\n\n")
        print("\n")

    return "".join(tunes)
171
-
172
# Input widgets, declared in the same order as generate_abc's parameters.
input_control_codes = gr.inputs.Textbox(
    lines=5,
    label="Control Codes",
    default="[SECS_2][BARS_9][SIM_3][BARS_9]",
)
input_prefix = gr.inputs.Textbox(
    lines=5,
    label="Prefix",
    default="L:1/8\nQ:1/4=114\nM:3/4\nK:D\nde | \"D\"",
)
input_num_tunes = gr.inputs.Slider(
    minimum=1, maximum=10, step=1, default=3, label="Number of Tunes",
)
input_max_length = gr.inputs.Slider(
    minimum=10, maximum=1000, step=10, default=500, label="Max Length",
)
input_top_p = gr.inputs.Slider(
    minimum=0.0, maximum=1.0, step=0.05, default=0.9, label="Top P",
)
input_temperature = gr.inputs.Slider(
    minimum=0.0, maximum=1.0, step=0.05, default=0.9, label="Temperature",
)
input_seed = gr.inputs.Textbox(lines=1, label="Seed", default="None")

# Single text area that receives the string returned by generate_abc.
output_abc = gr.outputs.Textbox(label="Generated Tunes")

# Build the interface and start serving it when the module is executed.
gr.Interface(
    generate_abc,
    [
        input_control_codes,
        input_prefix,
        input_num_tunes,
        input_max_length,
        input_top_p,
        input_temperature,
        input_seed,
    ],
    output_abc,
    title="TunesFormer: Forming Tunes with Control Codes",
    description=description,
    article=article,
).launch()