MJobe committed
Commit b3ae078
1 Parent(s): 6f772e9

Update main.py

Files changed (1):
  1. main.py +14 -10
main.py CHANGED
@@ -8,6 +8,7 @@ from starlette.middleware import Middleware
 from starlette.middleware.cors import CORSMiddleware
 from pdf2image import convert_from_bytes
 from pydub import AudioSegment
+import numpy as np
 
 app = FastAPI()
 
@@ -165,22 +166,25 @@ async def transcribe_and_match(
     contents = await file.read()
     audio = AudioSegment.from_file(BytesIO(contents))
 
-    # Optionally convert to wav if needed
-    wav_io = BytesIO()
-    audio.export(wav_io, format="wav")
-    wav_io.seek(0)
+    # Convert AudioSegment to a NumPy array
+    # First, export to raw audio format and then load into NumPy
+    raw_audio = BytesIO()
+    audio.export(raw_audio, format="wav")
+    raw_audio.seek(0)
 
-    # Transcribe the WAV audio file
-    transcription_result = nlp_speech_to_text(wav_io)
+    # Convert audio to samples as NumPy array
+    samples = np.array(audio.get_array_of_samples())
+
+    # Step 2: Use the speech-to-text model (expecting NumPy array)
+    transcription_result = nlp_speech_to_text(samples)
     transcription_text = transcription_result['text']
 
-    # Step 2: Parse the field_data (which contains field names/IDs)
+    # Step 3: Parse the field_data (which contains field names/IDs)
     import json
     fields = json.loads(field_data)
 
-    # Step 3: Find the matching field for the transcription
+    # Step 4: Find the matching field for the transcription
     field_matches = {}
-
     for field in fields:
         field_label = field.get("field_label", "").lower()
         field_id = field.get("field_id", "")
@@ -189,7 +193,7 @@ async def transcribe_and_match(
         if field_label in transcription_text.lower():
            field_matches[field_id] = transcription_text
 
-    # Step 4: Return transcription + matched fields
+    # Step 5: Return transcription + matched fields
     return {
         "transcription": transcription_text,
         "matched_fields": field_matches