MJobe committed
Commit 1fab2aa
1 Parent(s): 4556b98

Update main.py

Files changed (1)
  1. main.py +15 -11
main.py CHANGED
@@ -10,6 +10,7 @@ from pdf2image import convert_from_bytes
 from pydub import AudioSegment
 import numpy as np
 import json
+import torchaudio
 
 app = FastAPI()
 
@@ -167,22 +168,25 @@ async def transcribe_and_match(
     contents = await file.read()
     audio = AudioSegment.from_file(BytesIO(contents))
 
-    # Convert AudioSegment to raw audio format in WAV
-    raw_audio = BytesIO()
-    audio.export(raw_audio, format="wav")
-    raw_audio.seek(0)
+    # Step 2: Export to WAV format and load with torchaudio
+    wav_buffer = BytesIO()
+    audio.export(wav_buffer, format="wav")
+    wav_buffer.seek(0)
 
-    # Load the raw audio into a NumPy array
-    samples = np.array(audio.get_array_of_samples()).astype(np.float64)
+    # Load audio using torchaudio
+    waveform, sample_rate = torchaudio.load(wav_buffer)
+
+    # Convert waveform to float64 if necessary
+    samples = waveform.numpy().astype(np.float64)
 
-    # Step 2: Use the speech-to-text model (expecting NumPy array of float64)
-    transcription_result = nlp_speech_to_text(raw_audio)
+    # Step 3: Use the speech-to-text model
+    transcription_result = nlp_speech_to_text(samples, sampling_rate=sample_rate)
     transcription_text = transcription_result['text']
 
-    # Step 3: Parse the field_data (which contains field names/IDs)
+    # Step 4: Parse the field_data (which contains field names/IDs)
     fields = json.loads(field_data)
 
-    # Step 4: Find the matching field for the transcription
+    # Step 5: Find the matching field for the transcription
     field_matches = {}
     for field in fields:
         field_label = field.get("field_label", "").lower()
@@ -192,7 +196,7 @@ async def transcribe_and_match(
         if field_label in transcription_text.lower():
            field_matches[field_id] = transcription_text
 
-    # Step 5: Return transcription + matched fields
+    # Step 6: Return transcription + matched fields
     return {
         "transcription": transcription_text,
         "matched_fields": field_matches