Spaces:

MJobe
/

document-vqa-v2

Running

App Files Community

MJobe committed on Oct 22

Commit

4556b98

•

1 Parent(s): a191415

Update main.py

Browse files

Files changed (1) hide show

main.py +6 -7

main.py CHANGED Viewed

@@ -9,6 +9,7 @@ from starlette.middleware.cors import CORSMiddleware
 from pdf2image import convert_from_bytes
 from pydub import AudioSegment
 import numpy as np
 app = FastAPI()
@@ -166,21 +167,19 @@ async def transcribe_and_match(
         contents = await file.read()
         audio = AudioSegment.from_file(BytesIO(contents))
-        # Convert AudioSegment to a NumPy array
-        # First, export to raw audio format and then load into NumPy
         raw_audio = BytesIO()
         audio.export(raw_audio, format="wav")
         raw_audio.seek(0)
-        # Convert audio to samples as NumPy array (convert to float64)
         samples = np.array(audio.get_array_of_samples()).astype(np.float64)
         # Step 2: Use the speech-to-text model (expecting NumPy array of float64)
-        transcription_result = nlp_speech_to_text(samples)
         transcription_text = transcription_result['text']
         # Step 3: Parse the field_data (which contains field names/IDs)
-        import json
         fields = json.loads(field_data)
         # Step 4: Find the matching field for the transcription
@@ -189,7 +188,7 @@ async def transcribe_and_match(
             field_label = field.get("field_label", "").lower()
             field_id = field.get("field_id", "")
-            # Simple matching: if the transcribed text contains the field label (or something close)
             if field_label in transcription_text.lower():
                 field_matches[field_id] = transcription_text
@@ -200,7 +199,7 @@ async def transcribe_and_match(
         }
     except Exception as e:
-        return JSONResponse(content=f"Error processing audio or matching fields: {str(e)}", status_code=500)
 # Set up CORS middleware
 origins = ["*"]  # or specify your list of allowed origins

 from pdf2image import convert_from_bytes
 from pydub import AudioSegment
 import numpy as np
+import json
 app = FastAPI()
         contents = await file.read()
         audio = AudioSegment.from_file(BytesIO(contents))
+        # Convert AudioSegment to raw audio format in WAV
         raw_audio = BytesIO()
         audio.export(raw_audio, format="wav")
         raw_audio.seek(0)
+        # Load the raw audio into a NumPy array
         samples = np.array(audio.get_array_of_samples()).astype(np.float64)
         # Step 2: Use the speech-to-text model (expecting NumPy array of float64)
+        transcription_result = nlp_speech_to_text(raw_audio)
         transcription_text = transcription_result['text']
         # Step 3: Parse the field_data (which contains field names/IDs)
         fields = json.loads(field_data)
         # Step 4: Find the matching field for the transcription
             field_label = field.get("field_label", "").lower()
             field_id = field.get("field_id", "")
+            # Simple matching: if the transcribed text contains the field label
             if field_label in transcription_text.lower():
                 field_matches[field_id] = transcription_text
         }
     except Exception as e:
+        return JSONResponse(content={"error": f"Error processing audio or matching fields: {str(e)}"}, status_code=500)
 # Set up CORS middleware
 origins = ["*"]  # or specify your list of allowed origins