nehakothari committed on
Commit
6f08d64
·
verified ·
1 Parent(s): 9e0f607

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +179 -0
app.py ADDED
@@ -0,0 +1,179 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os
import re
from datetime import datetime
from io import BytesIO

import gradio as gr
import pandas as pd
import torch
from azure.storage.blob import BlobServiceClient
from qwen_vl_utils import process_vision_info
from transformers import AutoProcessor, Qwen2VLForConditionalGeneration
10
+
11
# Azure Storage Account details
# The account key is a credential and must not live in source control, so it is
# read from the environment (e.g. a deployment secret) instead of being embedded.
# NOTE(review): any key that was ever committed to this file should be rotated.
# The account and container names are not secrets; their defaults can be
# overridden through the environment as well.
STORAGE_ACCOUNT_NAME = os.environ.get("AZURE_STORAGE_ACCOUNT_NAME", "piointernaldestrg")
STORAGE_ACCOUNT_KEY = os.environ.get("AZURE_STORAGE_ACCOUNT_KEY")  # None when unset
CONTAINER_NAME = os.environ.get("AZURE_STORAGE_CONTAINER_NAME", "invoices")
15
+
16
# Initialize model and processor
# Both are loaded once at import time from the same checkpoint id so the
# tokenizer/image preprocessing always matches the weights.
_MODEL_ID = "Qwen/Qwen2-VL-2B-Instruct-AWQ"

processor = AutoProcessor.from_pretrained(_MODEL_ID)
model = Qwen2VLForConditionalGeneration.from_pretrained(_MODEL_ID, torch_dtype="auto")

# Move the weights to the GPU only when one is actually present.
if torch.cuda.is_available():
    model.to("cuda")
22
+
23
# Function to process a batch of images

# Column order of the frame returned by process_image_batch.
_INVOICE_COLUMNS = (
    "invoice_number",
    "date",
    "place_of_invoice",
    "total_amount",
    "category_of_invoice",
)

# Instruction sent to the model with every invoice image.
_INVOICE_PROMPT = (
    "Please extract the following details from the invoice:\n"
    "- 'invoice_number'\n"
    "- 'date'\n"
    "- 'place of invoice (city)'\n"
    "- 'total amount'\n"
    "- 'category of invoice (like food, stay, travel, other)'"
)

# A monetary amount with two decimals, optionally using "," as thousands
# separator (e.g. "1,234.56" or "99.00").
_AMOUNT_PATTERN = re.compile(r"\d{1,3}(?:,\d{3})+\.\d{2}|\d+\.\d{2}")


def _value_after_label(line):
    """Return the text after the first colon of *line* (the whole line if none).

    Splitting on the first colon keeps values that themselves contain a
    colon (times, URLs) intact.
    """
    return line.split(":", 1)[-1].strip()


def _parse_invoice_fields(text):
    """Map the model's line-oriented answer onto the invoice field dictionary."""
    fields = dict.fromkeys(_INVOICE_COLUMNS)
    for line in text.split("\n"):
        lowered = line.lower()
        # Branch order matters: the first matching keyword claims the line.
        if "number" in lowered:
            fields["invoice_number"] = _value_after_label(line)
        elif "date" in lowered:
            fields["date"] = process_date(_value_after_label(line))
        elif "place of invoice" in lowered:
            fields["place_of_invoice"] = _value_after_label(line)
        elif "total" in lowered or "final amount" in lowered or "balance due" in lowered:
            amounts = _AMOUNT_PATTERN.findall(line)
            if amounts:
                # The last amount on the line wins; separators are dropped so
                # the stored value is a plain decimal string.
                fields["total_amount"] = amounts[-1].replace(",", "")
        elif "category of invoice" in lowered:
            fields["category_of_invoice"] = _value_after_label(line)
    return fields


def process_image_batch(model, processor, image_paths, max_new_tokens=128):
    """Extract structured invoice fields from each image with the vision-language model.

    Args:
        model: Generation model; ``generate`` and ``device`` are used.
        processor: Matching processor; it is called directly and its
            ``apply_chat_template`` and ``batch_decode`` methods are used.
        image_paths: Iterable of image locations, each passed to the model as
            the ``image`` entry of a chat message. ``None`` is treated as an
            empty batch.
        max_new_tokens: Upper bound on the number of tokens generated per image.

    Returns:
        A ``pandas.DataFrame`` with one row per input image and the columns
        ``invoice_number``, ``date``, ``place_of_invoice``, ``total_amount``
        and ``category_of_invoice`` (present even for an empty batch). If
        handling an image raises, its row contains ``"Error"`` in the first
        four columns and the exception text in ``category_of_invoice``.
    """
    rows = []
    for image_path in image_paths or []:
        try:
            messages = [
                {
                    "role": "user",
                    "content": [
                        {"type": "image", "image": image_path},
                        {"type": "text", "text": _INVOICE_PROMPT},
                    ],
                }
            ]

            text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
            image_inputs, video_inputs = process_vision_info(messages)
            inputs = processor(
                text=[text],
                images=image_inputs,
                videos=video_inputs,
                padding=True,
                return_tensors="pt",
            )
            inputs = inputs.to(model.device)

            generated_ids = model.generate(**inputs, max_new_tokens=max_new_tokens)
            # Drop the prompt tokens so only the newly generated answer is decoded.
            generated_ids_trimmed = [
                out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
            ]
            output_text = processor.batch_decode(
                generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
            )

            rows.append(_parse_invoice_fields(output_text[0]))
        except Exception as e:
            # Best effort per image: one bad invoice must not abort the batch,
            # so the failure is reported in that image's row instead.
            rows.append({
                "invoice_number": "Error",
                "date": "Error",
                "place_of_invoice": "Error",
                "total_amount": "Error",
                "category_of_invoice": str(e),
            })

    return pd.DataFrame(rows, columns=list(_INVOICE_COLUMNS))
111
+
112
# Function to process and format dates
def _parse_day_month_year(text):
    """Parse ``"D Mon YYYY"`` or ``"D Month YYYY"``; return ``None`` if neither fits."""
    for fmt in ("%d %b %Y", "%d %B %Y"):
        try:
            return datetime.strptime(text, fmt)
        except ValueError:
            continue
    return None


def process_date(date_str):
    """Normalize a date string to ``DD/MM/YYYY`` when its format is recognized.

    Recognized inputs:
        * ``DD/MM/YYYY`` (as a prefix) -- returned unchanged.
        * ``D Mon YYYY`` / ``D Month YYYY`` -- converted to ``DD/MM/YYYY``.
        * ``D Mon`` / ``D Month`` (no year) -- converted to ``DD/MM/YYYY``
          with the literal placeholder ``YYYY`` as the year.

    Args:
        date_str: The raw date text.

    Returns:
        The normalized string, or ``date_str`` itself (including non-string
        values such as ``None``) when it cannot be interpreted.
    """
    if not isinstance(date_str, str):
        return date_str

    if re.match(r"\d{2}/\d{2}/\d{4}", date_str):
        return date_str

    if re.fullmatch(r"\d{1,2} \w+ \d{4}", date_str):
        parsed = _parse_day_month_year(date_str)
        if parsed is not None:
            return parsed.strftime("%d/%m/%Y")
        return date_str

    if re.fullmatch(r"\d{1,2} \w+", date_str):
        # Parse against a leap year so "29 Feb" is accepted; the year is
        # discarded and replaced by the "YYYY" placeholder.
        parsed = _parse_day_month_year(f"{date_str} 2000")
        if parsed is not None:
            return parsed.strftime("%d/%m") + "/YYYY"
        return date_str

    return date_str
127
+
128
# Upload extracted data to Azure Blob Storage as a Parquet file
def upload_to_azure_blob(df):
    """Serialize *df* to Parquet in memory and upload it as a timestamped blob.

    The blob is written to ``CONTAINER_NAME`` in the ``STORAGE_ACCOUNT_NAME``
    account under the name ``invoice_data_<YYYYmmdd_HHMMSS>.parquet`` and
    replaces any existing blob of the same name.

    Returns:
        The blob URL as a string on success, or ``{"error": <message>}`` if
        any step raises.
    """
    try:
        # Parquet bytes are produced in memory; nothing touches the local disk.
        buffer = BytesIO()
        df.to_parquet(buffer, index=False)
        payload = buffer.getvalue()

        service = BlobServiceClient(
            account_url=f"https://{STORAGE_ACCOUNT_NAME}.blob.core.windows.net",
            credential=STORAGE_ACCOUNT_KEY,
        )

        # One timestamp feeds both the blob name and the returned URL.
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        blob_name = f"invoice_data_{timestamp}.parquet"

        target = service.get_blob_client(container=CONTAINER_NAME, blob=blob_name)
        target.upload_blob(payload, overwrite=True)

        return f"https://{STORAGE_ACCOUNT_NAME}.blob.core.windows.net/{CONTAINER_NAME}/{blob_name}"
    except Exception as exc:
        return {"error": str(exc)}
152
+
153
# Gradio interface function
def gradio_interface(username, email, image_files):
    """Run extraction on the uploaded invoices and publish the result.

    Args:
        username: Free-text user name, echoed back in the first output.
        email: Free-text e-mail address, echoed back in the first output.
        image_files: Uploaded invoice image paths; ``None`` (nothing
            uploaded) is treated as an empty batch.

    Returns:
        A 3-tuple ``(user_info, df, upload_message)``: the echoed user
        details, the extracted-data frame, and either the Parquet blob URL or
        a message describing why the upload failed.
    """
    df = process_image_batch(model, processor, image_files or [])
    upload_result = upload_to_azure_blob(df)
    user_info = f"Username: {username}\nEmail: {email}"

    # upload_to_azure_blob signals failure with a dict; show that as an error
    # instead of presenting the dict's repr as if it were a URL.
    if isinstance(upload_result, dict):
        upload_message = f"Upload failed: {upload_result.get('error', 'unknown error')}"
    else:
        upload_message = f"Parquet File URL: {upload_result}"

    return user_info, df, upload_message
159
+
160
# Define the Gradio interface
# Despite its name, this object is a plain gr.Interface; the three inputs and
# three outputs line up positionally with gradio_interface's parameters and
# returned tuple.
grpc_interface = gr.Interface(
    title="Invoice Extraction System",
    description="Upload invoices, extract details, and save to Azure Blob Storage.",
    fn=gradio_interface,
    inputs=[
        gr.Textbox(label="Username"),
        gr.Textbox(label="Email"),
        gr.Files(label="Upload Invoice Images", type="filepath"),
    ],
    outputs=[
        gr.Textbox(label="User Info"),
        gr.Dataframe(label="Extracted Invoice Data"),
        gr.Textbox(label="Parquet File URL"),
    ],
)

# Launch the Gradio interface
# Only when executed as a script; importing this module does not start the UI.
if __name__ == "__main__":
    grpc_interface.launch(share=True)