MJobe committed
Commit 1fab2aa
1 Parent(s): 4556b98

Update main.py

Files changed (1)
  1. main.py +15 -11
main.py CHANGED
@@ -10,6 +10,7 @@ from pdf2image import convert_from_bytes
 from pydub import AudioSegment
 import numpy as np
 import json
+import torchaudio
 
 app = FastAPI()
 
@@ -167,22 +168,25 @@ async def transcribe_and_match(
     contents = await file.read()
     audio = AudioSegment.from_file(BytesIO(contents))
 
-    # Convert AudioSegment to raw audio format in WAV
-    raw_audio = BytesIO()
-    audio.export(raw_audio, format="wav")
-    raw_audio.seek(0)
+    # Step 2: Export to WAV format and load with torchaudio
+    wav_buffer = BytesIO()
+    audio.export(wav_buffer, format="wav")
+    wav_buffer.seek(0)
 
-    # Load the raw audio into a NumPy array
-    samples = np.array(audio.get_array_of_samples()).astype(np.float64)
+    # Load audio using torchaudio
+    waveform, sample_rate = torchaudio.load(wav_buffer)
+
+    # Convert waveform to float64 if necessary
+    samples = waveform.numpy().astype(np.float64)
 
-    # Step 2: Use the speech-to-text model (expecting NumPy array of float64)
-    transcription_result = nlp_speech_to_text(raw_audio)
+    # Step 3: Use the speech-to-text model
+    transcription_result = nlp_speech_to_text(samples, sampling_rate=sample_rate)
     transcription_text = transcription_result['text']
 
-    # Step 3: Parse the field_data (which contains field names/IDs)
+    # Step 4: Parse the field_data (which contains field names/IDs)
     fields = json.loads(field_data)
 
-    # Step 4: Find the matching field for the transcription
+    # Step 5: Find the matching field for the transcription
     field_matches = {}
     for field in fields:
         field_label = field.get("field_label", "").lower()
@@ -192,7 +196,7 @@ async def transcribe_and_match(
         if field_label in transcription_text.lower():
            field_matches[field_id] = transcription_text
 
-    # Step 5: Return transcription + matched fields
+    # Step 6: Return transcription + matched fields
     return {
         "transcription": transcription_text,
         "matched_fields": field_matches