# clean.py — Hugging Face Space by abidlabs (commit 3c558b8, verified)
from huggingface_hub import InferenceClient
from pathlib import Path
import gradio as gr
import os
MODEL_NAME = "meta-llama/Meta-Llama-3-70b-Instruct"
def split_text_into_chunks(text, chunk_size=600):
    """Split *text* into chunks of at most *chunk_size* whitespace-separated words.

    Returns a list of strings; each chunk rejoins its words with single spaces,
    so runs of whitespace in the input are normalized.
    """
    words = text.split()
    chunks = []
    for start in range(0, len(words), chunk_size):
        chunks.append(' '.join(words[start:start + chunk_size]))
    return chunks
def clean_transcript(audio_file, options, prompt, transcript: str):
    """Clean a transcript with an LLM, streaming partial results to Gradio.

    Yields ``(text, download_button)`` tuples: intermediate yields carry
    ``(partial_text, None)``; the final yield carries the complete text plus
    an enabled DownloadButton pointing at the saved markdown file.

    Args:
        audio_file: Path to the source audio file; used for the markdown
            heading and to derive the output ``.md`` path.
        options: Cleanup options selected in the UI. When empty, the
            transcript is passed through unchanged (no LLM call is made).
        prompt: Instruction prepended to each transcript chunk sent to
            the model.
        transcript: Raw transcript text to clean.
    """
    text = f"### {Path(audio_file).with_suffix('').name}\n\n"
    if not options:
        # No cleanup requested: pass the transcript through untouched.
        text += transcript
    else:
        # Create the client once, not per chunk (it was rebuilt every
        # iteration in the original loop).
        client = InferenceClient(model=MODEL_NAME, token=os.getenv("HF_TOKEN"))
        for chunk in split_text_into_chunks(transcript):
            messages = [
                {"role": "user", "content": prompt + "\n" + chunk}
            ]
            for c in client.chat_completion(messages, max_tokens=1000, stream=True):
                token = c.choices[0].delta.content
                text += token or ""
                yield text, None
    # Persist the cleaned transcript as markdown next to the audio file.
    md_file = Path(audio_file).with_suffix('.md')
    md_file.write_text(text, encoding="utf-8")
    # BUG FIX: this function is a generator (it contains `yield`), so a
    # `return value` is discarded by Gradio — the final state, including the
    # enabled DownloadButton, must be yielded or it is never delivered
    # (and the empty-`options` path would emit nothing at all).
    yield text, gr.DownloadButton(interactive=True, value=md_file)