"""Gradio app: extract invoice fields with Qwen2-VL and save them to Azure Blob Storage.

Each uploaded invoice image is sent to the vision-language model with a fixed
extraction prompt; the free-text answer is parsed line by line into a fixed set
of columns, and the resulting table is uploaded as a Parquet file.
"""

import os
import re
from datetime import datetime
from io import BytesIO

import gradio as gr
import pandas as pd
import torch
from azure.storage.blob import BlobServiceClient
from qwen_vl_utils import process_vision_info
from transformers import AutoProcessor, Qwen2VLForConditionalGeneration

# Azure Storage Account details.
# SECURITY: the account key must never live in source control. It is read from
# the environment instead; the key that was previously committed in this file
# should be considered compromised and rotated.
STORAGE_ACCOUNT_NAME = os.environ.get("AZURE_STORAGE_ACCOUNT_NAME", "piointernaldestrg")
STORAGE_ACCOUNT_KEY = os.environ.get("AZURE_STORAGE_ACCOUNT_KEY")
CONTAINER_NAME = os.environ.get("AZURE_STORAGE_CONTAINER", "invoices")

MODEL_ID = "Qwen/Qwen2-VL-2B-Instruct-AWQ"

# Column order of every row produced by process_image_batch.
INVOICE_COLUMNS = [
    "invoice_number",
    "date",
    "place_of_invoice",
    "total_amount",
    "category_of_invoice",
]

# Built once: the prompt does not depend on the image being processed.
EXTRACTION_PROMPT = (
    "Please extract the following details from the invoice:\n"
    "- 'invoice_number'\n"
    "- 'date'\n"
    "- 'place of invoice (city)'\n"
    "- 'total amount'\n"
    "- 'category of invoice (like food, stay, travel, other)'"
)

# Substring triggers for the line parser. "number" already covers
# "invoice_number", "bill number", "estimate number" and "number in bold";
# "total" already covers "total amount" and "grand total".
_INVOICE_NUMBER_KEYWORDS = ("number",)
_TOTAL_KEYWORDS = ("total", "final amount", "balance due")

# Amount with two decimals, optionally with comma thousands separators
# (e.g. "1,234.50"), so the leading digits are not cut off.
_AMOUNT_RE = re.compile(r"\d+(?:,\d{3})*\.\d{2}")

# Initialize model and processor
model = Qwen2VLForConditionalGeneration.from_pretrained(MODEL_ID, torch_dtype="auto")
if torch.cuda.is_available():
    model.to("cuda")
processor = AutoProcessor.from_pretrained(MODEL_ID)


def _field_value(line):
    """Return the text after the first ':' in *line* (the whole line if none), stripped.

    Splitting on the first colon keeps values that themselves contain a colon
    (timestamps, "INV:123") intact.
    """
    _, separator, value = line.partition(":")
    return (value if separator else line).strip()


def _parse_invoice_text(text):
    """Map the model's free-text answer onto the INVOICE_COLUMNS fields.

    Each line is matched against keyword substrings in a fixed priority order
    (invoice number, date, place, total, category); the first match wins for
    that line. Fields that never match stay ``None``.
    """
    data = dict.fromkeys(INVOICE_COLUMNS)
    for line in text.split("\n"):
        lowered = line.lower()
        if any(keyword in lowered for keyword in _INVOICE_NUMBER_KEYWORDS):
            data["invoice_number"] = _field_value(line)
        elif "date" in lowered:
            data["date"] = process_date(_field_value(line))
        elif "place of invoice" in lowered:
            data["place_of_invoice"] = _field_value(line)
        elif any(keyword in lowered for keyword in _TOTAL_KEYWORDS):
            amounts = _AMOUNT_RE.findall(line)
            if amounts:
                # The last amount on the line is taken as the total.
                data["total_amount"] = amounts[-1]
        elif "category of invoice" in lowered:
            data["category_of_invoice"] = _field_value(line)
    return data


# Function to process a batch of images
def process_image_batch(model, processor, image_paths):
    """Run the extraction prompt on every image and return the parsed fields.

    Args:
        model: Loaded Qwen2-VL generation model.
        processor: Matching processor (chat template, tokenisation, decoding).
        image_paths: Iterable of image file paths; ``None`` is treated as empty.

    Returns:
        A ``pandas.DataFrame`` with one row per image and the columns in
        ``INVOICE_COLUMNS``. If an image fails, its row holds ``"Error"`` in
        the first four columns and the exception text in
        ``category_of_invoice``, so one bad file does not abort the batch.
    """
    results = []
    for image_path in image_paths or []:
        try:
            messages = [
                {
                    "role": "user",
                    "content": [
                        {"type": "image", "image": image_path},
                        {"type": "text", "text": EXTRACTION_PROMPT},
                    ],
                }
            ]
            text = processor.apply_chat_template(
                messages, tokenize=False, add_generation_prompt=True
            )
            image_inputs, video_inputs = process_vision_info(messages)
            inputs = processor(
                text=[text],
                images=image_inputs,
                videos=video_inputs,
                padding=True,
                return_tensors="pt",
            )
            inputs = inputs.to(model.device)
            generated_ids = model.generate(**inputs, max_new_tokens=128)
            # Drop the prompt tokens so only the newly generated answer is decoded.
            generated_ids_trimmed = [
                out_ids[len(in_ids):]
                for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
            ]
            output_text = processor.batch_decode(
                generated_ids_trimmed,
                skip_special_tokens=True,
                clean_up_tokenization_spaces=False,
            )
            results.append(_parse_invoice_text(output_text[0]))
        except Exception as e:  # deliberate best-effort: report per-image failure in the row
            results.append(
                {
                    "invoice_number": "Error",
                    "date": "Error",
                    "place_of_invoice": "Error",
                    "total_amount": "Error",
                    "category_of_invoice": str(e),
                }
            )
    # Explicit columns keep the schema stable even when no image was processed.
    return pd.DataFrame(results, columns=INVOICE_COLUMNS)


def _parse_with_formats(text, formats):
    """Return the first successful ``datetime.strptime`` parse of *text*, else ``None``."""
    for fmt in formats:
        try:
            return datetime.strptime(text, fmt)
        except ValueError:
            continue
    return None


# Function to process and format dates
def process_date(date_str):
    """Normalise a date string to ``DD/MM/YYYY`` where the format is recognised.

    * ``DD/MM/YYYY`` is returned unchanged.
    * ``DD Mon YYYY`` / ``DD Month YYYY`` becomes ``DD/MM/YYYY``.
    * ``DD Mon`` / ``DD Month`` (no year) becomes ``DD/MM/YYYY`` with the
      literal placeholder ``YYYY``.

    Anything else, including non-string input or text that cannot be parsed,
    is returned unchanged.
    """
    if not isinstance(date_str, str):
        return date_str
    if re.match(r"\d{2}/\d{2}/\d{4}", date_str):
        return date_str
    if re.match(r"\d{2} \w+ \d{4}", date_str):
        parsed = _parse_with_formats(date_str, ("%d %b %Y", "%d %B %Y"))
        return parsed.strftime("%d/%m/%Y") if parsed else date_str
    if re.match(r"\d{2} \w+", date_str):
        # Parse against a leap year so "29 Feb" is accepted; the year is discarded.
        parsed = _parse_with_formats(f"{date_str} 2000", ("%d %b %Y", "%d %B %Y"))
        return parsed.strftime("%d/%m") + "/YYYY" if parsed else date_str
    return date_str


# Upload extracted data to Azure Blob Storage as a Parquet file
def upload_to_azure_blob(df):
    """Serialise *df* to Parquet and upload it to the configured container.

    Returns:
        The blob URL (``str``) on success, or ``{"error": message}`` when the
        account key is not configured or any step of the upload fails.
    """
    if not STORAGE_ACCOUNT_KEY:
        return {
            "error": "AZURE_STORAGE_ACCOUNT_KEY is not set; cannot upload to Azure Blob Storage."
        }
    try:
        # Convert DataFrame to Parquet format
        parquet_buffer = BytesIO()
        df.to_parquet(parquet_buffer, index=False)

        # Create the BlobServiceClient object
        account_url = f"https://{STORAGE_ACCOUNT_NAME}.blob.core.windows.net"
        blob_service_client = BlobServiceClient(
            account_url=account_url,
            credential=STORAGE_ACCOUNT_KEY,
        )

        # Get the BlobClient object; the timestamp makes each upload a new blob.
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        blob_name = f"invoice_data_{timestamp}.parquet"
        blob_client = blob_service_client.get_blob_client(
            container=CONTAINER_NAME, blob=blob_name
        )

        # Upload the Parquet file
        blob_client.upload_blob(parquet_buffer.getvalue(), overwrite=True)

        # Return the file URL
        return f"{account_url}/{CONTAINER_NAME}/{blob_name}"
    except Exception as e:
        return {"error": str(e)}


# Gradio interface function
def gradio_interface(username, email, image_files):
    """Gradio callback: extract invoice data, upload it, and report the outcome.

    Args:
        username: Text entered in the "Username" box.
        email: Text entered in the "Email" box.
        image_files: File paths from the upload widget, or ``None`` when
            nothing was uploaded.

    Returns:
        A tuple ``(user_info, dataframe, status)`` where *status* is the
        Parquet URL line on success or a human-readable failure message.
    """
    user_info = f"Username: {username}\nEmail: {email}"
    df = process_image_batch(model, processor, image_files or [])
    if df.empty:
        return user_info, df, "No invoice images were uploaded; nothing was saved."

    upload_result = upload_to_azure_blob(df)
    if isinstance(upload_result, dict):
        # upload_to_azure_blob signals failure with an {"error": ...} dict.
        status = f"Upload failed: {upload_result.get('error', 'unknown error')}"
    else:
        status = f"Parquet File URL: {upload_result}"
    return user_info, df, status


# Define the Gradio interface
grpc_interface = gr.Interface(
    fn=gradio_interface,
    inputs=[
        gr.Textbox(label="Username"),
        gr.Textbox(label="Email"),
        gr.Files(label="Upload Invoice Images", type="filepath"),
    ],
    outputs=[
        gr.Textbox(label="User Info"),
        gr.Dataframe(label="Extracted Invoice Data"),
        gr.Textbox(label="Parquet File URL"),
    ],
    title="Invoice Extraction System",
    description="Upload invoices, extract details, and save to Azure Blob Storage.",
)

# Launch the Gradio interface
if __name__ == "__main__":
    grpc_interface.launch(share=True)