File size: 4,238 Bytes
201bb3e bca3d41 201bb3e bca3d41 201bb3e bca3d41 201bb3e bca3d41 201bb3e bca3d41 201bb3e |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 |
import pixeltable as pxt
from pixeltable.iterators import DocumentSplitter
from pixeltable.functions import openai
import os
import requests
import tempfile
import gradio as gr
def process_document(pdf_file, api_key, voice_choice, style_choice, chunk_size, temperature, max_tokens, system_prompt, progress=gr.Progress()):
    """Turn an uploaded PDF into narrated audio via Pixeltable + OpenAI.

    Pipeline: split the document into token-limited chunks, rewrite each
    chunk with a chat model (using the caller-supplied system prompt),
    convert the rewritten content into a TTS-friendly script, then
    synthesize speech for each chunk with the OpenAI TTS endpoint.

    Args:
        pdf_file: uploaded file object; must expose ``.name`` (a filesystem path).
        api_key: OpenAI API key; exported to the environment for Pixeltable's
            built-in OpenAI functions and used directly for the raw TTS request.
        voice_choice: OpenAI TTS voice name (e.g. "alloy").
        style_choice: free-text style hint interpolated into the script prompt.
        chunk_size: token limit per document chunk.
        temperature: sampling temperature for both chat-completion passes.
        max_tokens: completion cap for both chat-completion passes.
        system_prompt: system message for the first (content) rewrite pass.
        progress: Gradio progress callback (default supplied by Gradio).

    Returns:
        Tuple ``(display_data, audio_path, status)``: table rows for the UI,
        the path of the last chunk's synthesized mp3, and a status string.
        On any failure returns ``(None, None, "Error: ...")``.
    """
    try:
        os.environ['OPENAI_API_KEY'] = api_key
        progress(0.1, desc="Initializing...")

        # Start from a clean workspace so reruns don't collide with stale state.
        pxt.drop_dir('document_audio', force=True)
        pxt.create_dir('document_audio')

        docs = pxt.create_table(
            'document_audio.documents',
            {
                'document': pxt.Document,
                'voice': pxt.String,
                'style': pxt.String,
                'mode_prompt': pxt.String
            }
        )

        progress(0.2, desc="Processing document...")
        docs.insert([{
            'document': pdf_file.name,
            'voice': voice_choice,
            'style': style_choice,
            'mode_prompt': system_prompt
        }])

        # Token-limited chunking; each chunk row also carries the base-table columns.
        chunks = pxt.create_view(
            'document_audio.chunks',
            docs,
            iterator=DocumentSplitter.create(
                document=docs.document,
                separators='token_limit',
                limit=chunk_size
            )
        )

        progress(0.4, desc="Text processing...")
        # Pass 1: rewrite the raw chunk text according to the mode-specific prompt.
        chunks['content_response'] = openai.chat_completions(
            messages=[
                {
                    'role': 'system',
                    'content': docs.mode_prompt  # Use the mode-specific prompt
                },
                {'role': 'user', 'content': chunks.text}
            ],
            model='gpt-4o-mini-2024-07-18',
            max_tokens=max_tokens,
            temperature=temperature
        )
        chunks['content'] = chunks.content_response['choices'][0]['message']['content']

        progress(0.6, desc="Script generation...")
        # Pass 2: turn the rewritten content into a speakable narration script.
        chunks['script_response'] = openai.chat_completions(
            messages=[
                {
                    'role': 'system',
                    'content': f"""Convert content to audio script.
                    Style: {docs.style}
                    Format:
                    - Clear sentence structures
                    - Natural pauses (...)
                    - Term definitions when needed
                    - Proper transitions"""
                },
                {'role': 'user', 'content': chunks.content}
            ],
            model='gpt-4o-mini-2024-07-18',
            max_tokens=max_tokens,
            temperature=temperature
        )
        chunks['script'] = chunks.script_response['choices'][0]['message']['content']

        progress(0.8, desc="Audio synthesis...")

        @pxt.udf(return_type=pxt.Audio)
        def generate_audio(script: str, voice: str):
            """Synthesize one chunk's script via OpenAI TTS; return an mp3 path or None."""
            if not script or not voice:
                return None
            try:
                response = requests.post(
                    "https://api.openai.com/v1/audio/speech",
                    headers={"Authorization": f"Bearer {api_key}"},
                    json={"model": "tts-1", "input": script, "voice": voice},
                    # FIX: without a timeout a stalled request hangs the whole pipeline.
                    timeout=120
                )
                if response.status_code == 200:
                    # delete=False: the file must outlive this call — it is read
                    # back by path (Pixeltable audio column / Gradio player).
                    with tempfile.NamedTemporaryFile(delete=False, suffix='.mp3') as temp_file:
                        temp_file.write(response.content)
                    return temp_file.name
                # FIX: surface non-200 responses instead of silently producing no audio.
                print(f"Error in audio synthesis: HTTP {response.status_code}: {response.text}")
            except Exception as e:
                print(f"Error in audio synthesis: {e}")
            return None

        chunks['audio'] = generate_audio(chunks.script, docs.voice)

        # Only the final chunk's audio path is handed to the UI player.
        audio_path = chunks.select(chunks.audio).tail(1)['audio'][0]

        results = chunks.select(
            chunks.content,
            chunks.script
        ).collect()
        display_data = [
            [f"Segment {idx + 1}", row['content'], row['script']]
            for idx, row in enumerate(results)
        ]

        progress(1.0, desc="Complete")
        return display_data, audio_path, "Processing complete"
    except Exception as e:
        return None, None, f"Error: {str(e)}"