"""Gradio demo that synthesizes speech from text with a fine-tuned SpeechT5 model."""

import gradio as gr
import librosa
import numpy as np
import torch
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan

# Fine-tuned text-to-speech checkpoint; the vocoder turns the model's output
# into a waveform.
checkpoint = "mikhail-panzo/zlm-fil-ceb_b64_le5_s8000"
processor = SpeechT5Processor.from_pretrained(checkpoint)
model = SpeechT5ForTextToSpeech.from_pretrained(checkpoint)
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

# Fixed speaker embedding (512 values) used for every request.
spe = [
    -0.07156830281019211, -0.02638358250260353, 0.04677680879831314, 0.03367156162858009,
    -0.019285861402750015, -0.027602724730968475, -0.06917823851108551, -0.04798083007335663,
    0.05392879247665405, 0.021232446655631065, -0.08971265703439713, -0.05227137356996536,
    0.05129873752593994, 0.013318241573870182, 0.03634923696517944, 0.08183765411376953,
    -0.00023171716020442545, 0.02859080210328102, 0.013283342123031616, 0.008123684674501419,
    0.043886449187994, 0.0027501601725816727, -0.017617085948586464, -0.05051666498184204,
    -0.045256875455379486, -0.008817761205136776, -0.060129791498184204, 0.03523271158337593,
    0.06583297252655029, 0.030870387330651283, -0.002174368593841791, 0.0417785607278347,
    0.02979600802063942, -0.026019243523478508, 0.01613667607307434, -0.08343936502933502,
    0.028157565742731094, 0.05653348192572594, -0.04796013981103897, -0.06962310522794724,
    0.009127648547291756, -0.047096963971853256, 0.01307157427072525, 0.04382061958312988,
    0.017450829967856407, -0.12516599893569946, -0.020785771310329437, 0.012059417553246021,
    -0.05307995527982712, 0.059937842190265656, 0.013440544717013836, 0.010059639811515808,
    0.03526052460074425, 0.022384969517588615, -0.07405367493629456, 0.010072286240756512,
    0.016329325735569, 0.020433492958545685, 0.015963464975357056, 0.021014736965298653,
    0.0026973108761012554, -0.01101595163345337, -0.016263974830508232, 0.03479960188269615,
    0.006103777792304754, 0.01620979979634285, 0.011849654838442802, -0.029751954600214958,
    -0.07112693041563034, -0.06999105215072632, 0.003812580369412899, 0.011103129014372826,
    -0.008440849371254444, 0.018586942926049232, 0.030747711658477783, 0.04247446358203888,
    0.030141443014144897, 0.02545987069606781, -0.05753854662179947, -0.08931085467338562,
    -0.07035337388515472, -0.05370721220970154, -0.05576084554195404, -0.07075389474630356,
    -0.03469623997807503, -0.05239512026309967, -0.053311314433813095, 0.05533606931567192,
    0.025789743289351463, 0.059138841927051544, 0.02819150499999523, -0.0891687199473381,
    0.0012175287120044231, -0.06351031363010406, 0.02202356979250908, 0.004803054500371218,
    0.016199953854084015, 0.027727937325835228, -0.03267211467027664, -0.09165994077920914,
    0.024599507451057434, -0.07309572398662567, -0.0748375803232193, 0.008631294593214989,
    0.06571213901042938, -0.05350324138998985, 0.04521997645497322, 0.07352056354284286,
    0.026083452627062798, 0.006921460852026939, -0.08377747237682343, 0.03632069751620293,
    0.07667025923728943, 0.008553830906748772, 0.02307989075779915, 0.05463983863592148,
    -0.06686922162771225, -0.018010811880230904, -0.051589034497737885, -0.0008019809029065073,
    0.0075337993912398815, -0.046973567456007004, 0.01978345401585102, 0.03698739781975746,
    -0.06664503365755081, 0.06484026461839676, -0.08575504273176193, 0.025846021249890327,
    0.04178639501333237, 0.021939275786280632, 0.013616575859487057, 0.028715137392282486,
    0.031019965186715126, 0.04372192174196243, 0.028126144781708717, -0.062495749443769455,
    -0.03828506916761398, 0.03408420830965042, -0.060381341725587845, -0.02459966018795967,
    0.019428038969635963, 0.00387219013646245, -0.017680399119853973, 0.04297943040728569,
    -0.05092577263712883, 0.016452616080641747, 0.0021138808224350214, -0.0426134392619133,
    0.044136885553598404, -0.08143087476491928, 0.04640118032693863, -0.04849568009376526,
    -0.05265757068991661, 0.027860509231686592, 0.00819639302790165, 0.033552370965480804,
    0.026945138350129128, -0.08269435912370682, -0.07496176660060883, 0.04130542278289795,
    0.037412092089653015, 0.004340531770139933, 0.025952592492103577, -0.06261271983385086,
    0.012266535311937332, 0.010085804387927055, 0.043245233595371246, 0.031194033101201057,
    0.026592832058668137, 0.030830781906843185, -0.022560613229870796, -0.05544402450323105,
    0.017252881079912186, -0.08438578248023987, -0.06301704049110413, 0.04262023791670799,
    -0.021592646837234497, 0.02648831345140934, 0.06352358311414719, -0.06505220383405685,
    0.02598809450864792, -0.04680369794368744, 0.007821363396942616, 0.03760896623134613,
    -0.05752832442522049, -0.008889094926416874, 0.04277804121375084, 0.026508638635277748,
    -0.061276230961084366, -0.05674714222550392, -0.016216745600104332, 0.03860216587781906,
    -0.04358423501253128, 0.038634736090898514, 0.03843136876821518, -0.0011079608229920268,
    0.04906295984983444, 0.03465595468878746, 0.06316281855106354, 0.0056662471033632755,
    0.004153784364461899, 0.012478937394917011, -0.055613450706005096, 0.005748559255152941,
    0.01947004906833172, 0.01831246353685856, 0.03569320961833, -0.023230748251080513,
    0.04925297945737839, -0.04683214798569679, 0.004089402034878731, -0.06549534201622009,
    -0.0742681473493576, 0.05717084929347038, -0.03963439166545868, 0.028655095025897026,
    0.02070518024265766, -0.07004179060459137, 0.04068846255540848, 0.02499224804341793,
    -0.05228634551167488, 0.02812645025551319, -0.059428710490465164, -0.041417196393013,
    0.022787131369113922, 0.053258467465639114, 0.06774449348449707, 0.09881281107664108,
    0.04634056240320206, 0.0515253059566021, -0.026465322822332382, 0.04804180935025215,
    0.025090856477618217, -0.05057351663708687, -0.008302120491862297, -0.025504931807518005,
    0.023185551166534424, 0.015245426446199417, -0.002447819337248802, 0.052658431231975555,
    -0.052942872047424316, 0.025401152670383453, -0.10325764119625092, 0.02348904497921467,
    -0.03955519571900368, 0.01757780835032463, 0.05905766785144806, 0.053233884274959564,
    0.005684420466423035, 0.05429721251130104, 0.037649232894182205, 0.0199859831482172,
    0.010111426003277302, -0.05868203938007355, 0.014065082184970379, 0.029259104281663895,
    0.029922068119049072, 0.0034907329827547073, -0.05438638851046562, 0.059400808066129684,
    0.018886392936110497, -0.08010907471179962, 0.0244560819119215, 0.021696871146559715,
    0.002420783508569002, 0.029741715639829636, 0.04413748160004616, 0.03540794923901558,
    0.03500651195645332, -0.06278946995735168, 0.03048604167997837, -0.012842575088143349,
    0.013577396050095558, -0.029430564492940903, -0.06439021974802017, 0.044991765171289444,
    0.021648285910487175, 0.032694410532712936, -0.09407192468643188, -0.05216933414340019,
    -0.026286188513040543, 0.02807997167110443, 0.02027316205203533, -0.07291750609874725,
    -0.04755645990371704, 0.03232748061418533, -0.003975896630436182, 0.046563535928726196,
    -0.10469917207956314, 0.003908936865627766, 0.003936897963285446, 0.013791000470519066,
    -0.032840222120285034, 0.010787922888994217, -0.03470165655016899, 0.0065937950275838375,
    0.023391149938106537, 0.019401634112000465, 0.055823955684900284, 0.011392011307179928,
    0.05104760080575943, 0.0366746224462986, 0.01929694041609764, -0.07102135568857193,
    0.008217417635023594, 0.056153032928705215, 0.029541660100221634, 0.002925268141552806,
    0.03332659974694252, 0.027392012998461723, 0.07597773522138596, 0.04798854887485504,
    -0.025292834267020226, 0.028454342857003212, -0.08446473628282547, -0.06067478284239769,
    -0.0293490719050169, 0.015262295491993427, -0.0639355406165123, 0.05264747142791748,
    -0.05029783025383949, 0.02993948385119438, -0.03605620190501213, 0.004312526434659958,
    0.03202211111783981, -0.040626250207424164, 0.03632079064846039, 0.0696728304028511,
    -0.057659704238176346, -0.04758629575371742, -0.03098507970571518, 0.027839157730340958,
    0.005349151324480772, 0.04310200363397598, -0.07941040396690369, 0.05686274915933609,
    0.04460901394486427, -0.022722845897078514, 0.018491603434085846, -0.05551289767026901,
    0.03517179563641548, -0.05476537346839905, 0.03311903774738312, 0.012322511523962021,
    -0.07028210908174515, 0.029229193925857544, 0.03310919553041458, 0.009640451520681381,
    -0.044431671500205994, -0.06079164892435074, -0.05389634519815445, -0.04767846316099167,
    -0.06940813362598419, 0.10737128555774689, 0.019247161224484444, -0.045443810522556305,
    -0.008395393379032612, 0.0606759674847126, -0.04845020920038223, 0.008197005838155746,
    -0.1087162047624588, 0.030387507751584053, 0.00271149561740458, -0.007698268163949251,
    0.045697201043367386, -0.0035735955461859703, -0.06050946190953255, 0.010610714554786682,
    -0.001759917358867824, 0.01752687245607376, 0.015779901295900345, 0.04578718543052673,
    -0.07859539240598679, -0.003715761937201023, -0.011839083395898342, -0.003137079766020179,
    0.038235485553741455, 0.012716051191091537, 0.05720003321766853, 0.04342909902334213,
    -0.08046050369739532, -0.11780910193920135, 0.03214732185006142, 0.03455907478928566,
    0.029433822259306908, -0.05209251120686531, 0.03660985082387924, -0.055470164865255356,
    0.030609533190727234, 0.04391754791140556, 0.026429053395986557, 0.011438150890171528,
    -0.048309601843357086, 0.029823746532201767, 0.04564341530203819, 0.014016474597156048,
    0.029349613934755325, -0.08406650274991989, 0.03287172690033913, -0.006857582833617926,
    -0.01916584186255932, -0.06288760155439377, 0.026359854266047478, -0.013939019292593002,
    0.03585528954863548, 0.004547455348074436, 0.004690368194133043, -0.04211600124835968,
    -0.04825177043676376, -0.055354706943035126, -0.09765727818012238, 0.021828746423125267,
    0.015368342399597168, -0.016511408612132072, 0.03647870570421219, -0.028675753623247147,
    -0.009988737292587757, -0.0924590528011322, -0.14393165707588196, -0.04364530369639397,
    -0.04670170694589615, 0.016749542206525803, 0.022685164585709572, -0.05605410411953926,
    -0.05195891857147217, 0.03136495128273964, 0.05333014205098152, 0.016319043934345245,
    0.01992732845246792, -0.04337819293141365, -0.02839847281575203, -0.02022927813231945,
    -0.02641780860722065, 0.02930283546447754, 0.02049759030342102, 0.012575307860970497,
    0.043038059026002884, 0.01778305135667324, -0.05488140136003494, -0.05374419316649437,
    0.019190924242138863, 0.026568733155727386, 0.03294479846954346, 0.0037095528095960617,
    -0.014698716811835766, 0.02873227372765541, 0.02872776985168457, 0.012082632631063461,
    0.007403180934488773, -0.015899790450930595, 0.043887410312891006, 0.02078593336045742,
    0.03230574354529381, 0.035734377801418304, 0.011044871993362904, 0.016023660078644753,
    0.03472379595041275, -0.086632139980793, -0.05434061959385872, 0.03821862116456032,
    -0.014385567978024483, 0.01139583345502615, 0.05264813452959061, 0.03325875476002693,
    -0.04525246471166611, 0.03155100345611572, 0.022725047543644905, -0.05694980174303055,
    0.0347813256084919, 0.01891772262752056, -0.0030750874429941177, 0.04984341934323311,
    0.0012191605055704713, 0.0053940583020448685, -0.055852703750133514, -0.048437196761369705,
    0.007538347505033016, -0.005039219278842211, 0.060910243541002274, 0.03231583535671234,
    -0.06375285238027573, -0.04213077202439308, 0.014964931644499302, 0.031408704817295074,
    0.04302803426980972, 0.04147221893072128, -0.06594009697437286, -0.00978210847824812,
    -0.017827890813350677, -0.013089115731418133, -0.044347189366817474, 0.05275992304086685,
    -0.06434307992458344, 0.03398493304848671, 0.0006133221322670579, 0.01772407628595829,
    -0.0817960873246193, 0.007210501004010439, -0.07263652235269547, -0.07417140901088715,
    0.009443763643503189, 0.03782111033797264, -0.10852113366127014, 0.01312529481947422,
    -0.03526357561349869, -0.012330346740782261, -0.052082356065511703, 0.0159732885658741,
    0.06413475424051285, 0.01836198754608631, 0.03275763988494873, -0.05589434504508972,
]

# Built once at import time instead of on every request. `spe` is an in-memory
# list, so it is converted directly (np.load only reads files). The leading
# batch axis added by unsqueeze(0) is what generate_speech is given.
SPEAKER_EMBEDDING = torch.tensor(spe, dtype=torch.float32).unsqueeze(0)

# Sample rate reported to Gradio alongside the generated samples.
SAMPLE_RATE = 16000


def predict(text, speaker=None):
    """Synthesize speech for ``text`` and return it as a Gradio numpy audio tuple.

    Args:
        text: Text to synthesize. ``None``, empty, or whitespace-only text
            yields an empty clip instead of running the model.
        speaker: Ignored. Kept (now optional) so existing two-argument callers
            keep working while the single-input Gradio interface below can
            call ``predict(text)``. The fixed ``SPEAKER_EMBEDDING`` is always
            used.

    Returns:
        A ``(sample_rate, samples)`` tuple where ``sample_rate`` is 16000 and
        ``samples`` is an ``int16`` numpy array.
    """
    if not text or not text.strip():
        return (SAMPLE_RATE, np.zeros(0).astype(np.int16))

    inputs = processor(text=text, return_tensors="pt")

    # Limit input length to what the model's text positions allow.
    input_ids = inputs["input_ids"][..., : model.config.max_text_positions]

    speech = model.generate_speech(input_ids, SPEAKER_EMBEDDING, vocoder=vocoder)

    # Scale the float waveform to the 16-bit PCM range.
    speech = (speech.numpy() * 32767).astype(np.int16)
    return (SAMPLE_RATE, speech)


title = "CEB SEQ TTS"

description = """ This demo system is intended for survey purposes only. """

gr.Interface(
    fn=predict,
    inputs=[
        gr.Text(label="Input Text"),
    ],
    outputs=[
        gr.Audio(label="Generated Speech", type="numpy"),
    ],
    title=title,
    description=description,
).launch()