anmolsahai commited on
Commit
3056140
·
1 Parent(s): 1ea97da
Files changed (2) hide show
  1. app.py +124 -92
  2. requirements.txt +4 -2
app.py CHANGED
@@ -1,95 +1,127 @@
 
 
 
 
 
 
 
1
  import streamlit as st
2
- from langchain_pipeline.py import pipeline
3
- import fitz # PyMuPDF
4
- from docx import Document
5
- from difflib import unified_diff
6
- import tempfile
7
- from docx.shared import RGBColor
8
- import re
9
- import subprocess
10
-
11
- def pdf_to_text_with_layout(pdf_file):
12
- doc = fitz.open(stream=pdf_file.read(), filetype="pdf")
13
- text = []
14
- for page_num in range(doc.page_count):
15
- page = doc.load_page(page_num)
16
- text.append(page.get_text("text"))
17
- return "\n".join(text)
18
- def clean_text(text):
19
- # Remove non-ASCII and control characters
20
- return ''.join(c for c in text if c.isprintable() and ord(c) < 65536)
21
- def text_to_word_with_formatting(text, word_path):
22
- doc = Document()
23
- for line in text.split("\n"):
24
- clean_line = clean_text(line)
25
- doc.add_paragraph(clean_line)
26
- doc.save(word_path)
27
- def apply_pipeline(file, model_name, balance_type, apsn_transactions, max_fees_per_day, min_overdrawn_fee, min_transaction_overdraft):
28
- return pipeline(
29
- file,
30
- model_name,
31
- balance_type,
32
- apsn_transactions,
33
- max_fees_per_day,
34
- min_overdrawn_fee,
35
- min_transaction_overdraft
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
36
  )
37
- def redline_changes(original_path, revised_path, output_path):
38
- # Using docxcompose to create a redlined document
39
- subprocess.run(['docxcompose', 'compose', original_path, revised_path, output_path])
40
- # Streamlit App
41
- st.title("Canarie AI Prototype")
42
- st.subheader("Finding the canarie in the coal mine")
43
- model_name = st.selectbox("Model", ["gemini-1.5-pro-001", "other-model-name"])
44
- balance_type = st.selectbox("Do you charge on available balance or ledger balance?", ["available balance", "ledger balance"])
45
- apsn_transactions = st.selectbox("Do you charge for APSN transactions?", ["yes", "no"])
46
- max_fees_per_day = st.number_input("How many overdraft fees per day can be charged?", min_value=0, max_value=10)
47
- min_overdrawn_fee = st.number_input("What is the minimum amount overdrawn to incur a fee?", min_value=0, max_value=500)
48
- min_transaction_overdraft = st.number_input("What is the minimum transaction amount to trigger an overdraft?", min_value=0, max_value=500)
49
- uploaded_file = st.file_uploader("Choose a file", type=["pdf"])
50
  if uploaded_file is not None:
51
- with st.spinner('Please wait ...'):
52
- try:
53
- # Extract text with layout preservation
54
- extracted_text = pdf_to_text_with_layout(uploaded_file)
55
-
56
- original_word_path = tempfile.NamedTemporaryFile(delete=False, suffix=".docx").name
57
- text_to_word_with_formatting(extracted_text, original_word_path)
58
- diff = apply_pipeline(
59
- uploaded_file,
60
- model_name,
61
- balance_type,
62
- apsn_transactions,
63
- max_fees_per_day,
64
- min_overdrawn_fee,
65
- min_transaction_overdraft
66
- )
67
- revised_word_path = tempfile.NamedTemporaryFile(delete=False, suffix=".docx").name
68
- text_to_word_with_formatting(diff, revised_word_path)
69
- redlined_output_path = tempfile.NamedTemporaryFile(delete=False, suffix=".docx").name
70
- redline_changes(original_word_path, revised_word_path, redlined_output_path)
71
- with open(original_word_path, "rb") as f:
72
- st.download_button(
73
- label="Download Original Document",
74
- data=f,
75
- file_name="original_document.docx",
76
- mime="application/vnd.openxmlformats-officedocument.wordprocessingml.document"
77
- )
78
-
79
- with open(revised_word_path, "rb") as f:
80
- st.download_button(
81
- label="Download Revised Document",
82
- data=f,
83
- file_name="revised_document.docx",
84
- mime="application/vnd.openxmlformats-officedocument.wordprocessingml.document"
85
- )
86
- with open(redlined_output_path, "rb") as f:
87
- st.download_button(
88
- label="Download Redlined Document",
89
- data=f,
90
- file_name="redlined_document.docx",
91
- mime="application/vnd.openxmlformats-officedocument.wordprocessingml.document"
92
- )
93
- st.success("Documents created successfully!")
94
- except Exception as e:
95
- st.exception(e)
 
1
+ import os
2
+ import base64
3
+ from pdfminer.high_level import extract_text
4
+ from langchain_core.prompts import PromptTemplate
5
+ from google.cloud import aiplatform
6
+ from google.cloud.aiplatform_v1 import ModelServiceClient
7
+ from google.cloud.aiplatform_v1.types import GenerateContentRequest, Document, GenerationConfig, SafetySettings, HarmCategory, HarmBlockThreshold
8
  import streamlit as st
9
+
10
+ # Initialize the Google AI Platform
11
+ aiplatform.init(project="akroda", location="us-central1")
12
+
13
+ # Define the documents (base64-encoded PDF content)
14
+ documents = [
15
+ Document(content_type="application/pdf", data=base64.b64decode("JVBERi0xLjQKJeODgxNz5dL1Jvb3QgMTU0IDAgUi9TaXplIDE2Nj4+CnN0YXJ0eHJlZgoyMTY0NjkKJSVFT0YK")),
16
+ Document(content_type="application/pdf", data=base64.b64decode("JVBERi0xLjQKJeLjz9MKNijU+PgpzdGFydHhyZWYKMTMxMDY0CiUlRU9GCg==")),
17
+ Document(content_type="application/pdf", data=base64.b64decode("JVBERi0xLjQKJeLjz9MKNiAwZDU0YTVlNzllMWRhYWY1ZDQ2YjI+XS9Sb290IDE3NyAwIFIvU2l6ZSAxODc+PgpzdGFydHhyZWYKMjA3NTk5CiUlRU9GCg==")),
18
+ Document(content_type="application/pdf", data=base64.b64decode("JVBERi0xLjQKJeLjz9ML1Jvb3QgMTg5IDAgUi9TaXplIDE5OT4+CnN0YXJ0eHJlZgoxOTgzNzMKJSVFT0YK")),
19
+ Document(content_type="application/pdf", data=base64.b64decode("JVBERi0xCcnCmVuZHN0cmVhbQplbmRvYmoKc3RhcnR4cmVmCjIwOTgyNQolJUVPRgo=")),
20
+ Document(content_type="application/pdf", data=base64.b64decode("JVBERi0xLj+CnN0YXJ0eHJlZgoyMTk5MDYKJSVFT0YK")),
21
+ Document(content_type="application/pdf", data=base64.b64decode("JVBERi0xLjQKJiUlRU9GCg==")),
22
+ Document(content_type="application/pdf", data=base64.b64decode("JVBERi0xLjQKJe90IDMwOCAwIFIvU2l6ZSAzMTg+PgpzdGFydHhyZWYKMjcwNzU3CiUlRU9GCg==")),
23
+ Document(content_type="application/pdf", data=base64.b64decode("JVBERi0xLjUNJeLjz9MNCjcgMCBvYmoNPDwvTGluZWFyaXplZCAxL0wgNjc1NzgvTyA5L0UgNjAyNDYvTiAxL1QgNjcyODcvSCBbIDQ4MyAxNTRdPj4NZW5kb2JxDSAgICAgICAgICAgICAgICAgICAgDQoyMiAwIG9iag08PC9EZWNvZGVQYXJtczw8L0NvbHVtbnMgNC9QcmVkaWN0b3IgMTI+Pi9GaWx0ZXIvRmxhdGVEZWNvZGUvSURbPDE3NzU4MkJFODc4MzRFQjNBOEM3RkIzQTgyRjFFMEFCPjw5MzI2Qjk4REM4NjQ2RTRCODI3MzZFQUEzOENEQjFBQj5dL0luZGV4WzcgMjhdL0luZm8gNiAwIFIvTGVuZ3RoIDgzL1ByZXYgNjcyODgvUm9vdCA4IDAgUi9TaXplIDM1L1R5cGUvWFPRg0K"))
24
+ ]
25
+
26
+ text1 = """
27
+ attached are several cases and a bank disclosure. Using the cases, please provide changes to the disclosure and keep as much formatting as possible and to ensure there are no legal contradictions between the content of the disclosure and the cases and please provide reasoning for each proposed change. Please also integrate the bank's policies into the disclosure. In the first sentence, please include a reference to the account agreement "for more information on overdrafts" and a placeholder for a URL.
28
+ Here are the answers to the bank's policy questions:
29
+ Do you charge on available balance or ledger balance?: {balance_type} (which should replace money in the first sentence)
30
+ Do you charge for APSN transactions?: {apsn_transactions}
31
+ How many overdraft fees per day can be charged?: {max_fees_per_day}
32
+ What is the minimum amount overdrawn to incur a fee?: ${min_overdrawn_fee}
33
+ What is the minimum transaction amount to trigger an overdraft?: ${min_transaction_overdraft}
34
+
35
+ Please output in the following format:
36
+ {{entire updated disclosure text with changes bolded}}
37
+ ------
38
+ {{reasons for each change listed and cases cited}}
39
+ """
40
+
41
+ prompt = PromptTemplate(
42
+ input_variables=["context", "disclosure", "balance_type", "apsn_transactions", "max_fees_per_day", "min_overdrawn_fee", "min_transaction_overdraft"],
43
+ template=text1,
44
+ )
45
+
46
+ # Placeholder values for the variables used in prompt formatting
47
+ legal_cases_context = "Provide the legal context here..."
48
+ disclosure_text = "Include the initial disclosure text here..."
49
+ balance_type = "available balance"
50
+ apsn_transactions = "yes"
51
+ max_fees_per_day = 3
52
+ min_overdrawn_fee = 5
53
+ min_transaction_overdraft = 1
54
+
55
+ # Base64 encode the disclosure text
56
+ encoded_disclosure_text = base64.b64encode(disclosure_text.encode()).decode()
57
+
58
+ val = prompt.format(
59
+ context=legal_cases_context,
60
+ disclosure=encoded_disclosure_text,
61
+ balance_type=balance_type,
62
+ apsn_transactions=apsn_transactions,
63
+ max_fees_per_day=max_fees_per_day,
64
+ min_overdrawn_fee=min_overdrawn_fee,
65
+ min_transaction_overdraft=min_transaction_overdraft,
66
+ )
67
+
68
+ generation_config = GenerationConfig(
69
+ max_output_tokens=8192,
70
+ temperature=1,
71
+ top_p=0.95,
72
+ )
73
+
74
+ safety_settings = SafetySettings(
75
+ harm_category_settings={
76
+ HarmCategory.HARM_CATEGORY_HATE_SPEECH: HarmBlockThreshold.BLOCK_MEDIUM_AND_ABOVE,
77
+ HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: HarmBlockThreshold.BLOCK_MEDIUM_AND_ABOVE,
78
+ HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: HarmBlockThreshold.BLOCK_MEDIUM_AND_ABOVE,
79
+ HarmCategory.HARM_CATEGORY_HARASSMENT: HarmBlockThreshold.BLOCK_MEDIUM_AND_ABOVE,
80
+ }
81
+ )
82
+
83
+ def generate(document_parts, prompt_text):
84
+ model_service_client = ModelServiceClient()
85
+ model_resource_name = model_service_client.model_path("akroda", "us-central1", "gemini-1.5-pro-001")
86
+ response = model_service_client.generate_content(
87
+ request=GenerateContentRequest(
88
+ model=model_resource_name,
89
+ documents=document_parts,
90
+ prompt=prompt_text,
91
+ generation_config=generation_config,
92
+ safety_settings=safety_settings,
93
+ )
94
  )
95
+ return response.generated_text
96
+
97
+ def pipeline(file, model_name, balance_type, apsn_transactions, max_fees_per_day, min_overdrawn_fee, min_transaction_overdraft):
98
+ document_parts = documents
99
+ response_text = generate(document_parts, val)
100
+ return response_text
101
+
102
+ # Streamlit Interface
103
+ st.title("Bank Disclosure Update Pipeline")
104
+ st.write("Upload your document and provide the necessary details to update the bank disclosure.")
105
+
106
+ uploaded_file = st.file_uploader("Choose a PDF file", type="pdf")
107
+
108
  if uploaded_file is not None:
109
+ file_details = {"FileName": uploaded_file.name, "FileType": uploaded_file.type}
110
+ st.write(file_details)
111
+
112
+ # Display the content of the uploaded file
113
+ content = extract_text(uploaded_file)
114
+ st.text(content)
115
+
116
+ # Placeholder for user inputs
117
+ balance_type = st.text_input("Balance Type", "available balance")
118
+ apsn_transactions = st.text_input("APSN Transactions", "yes")
119
+ max_fees_per_day = st.number_input("Max Fees Per Day", min_value=1, value=3)
120
+ min_overdrawn_fee = st.number_input("Min Overdrawn Fee ($)", min_value=0, value=5)
121
+ min_transaction_overdraft = st.number_input("Min Transaction Overdraft ($)", min_value=0, value=1)
122
+
123
+ if st.button("Generate Updated Disclosure"):
124
+ # Run the pipeline with the provided inputs
125
+ result = pipeline(uploaded_file, "gemini-1.5-pro-001", balance_type, apsn_transactions, max_fees_per_day, min_overdrawn_fee, min_transaction_overdraft)
126
+ st.write("Updated Disclosure:")
127
+ st.text(result)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
requirements.txt CHANGED
@@ -11,7 +11,6 @@ google_generativeai
11
  pdf2docx
12
  pymupdf
13
  python-docx
14
- streamlit
15
  pdfplumber
16
  python-docx
17
  redlines
@@ -20,4 +19,7 @@ langchain_core
20
  langchain_openai
21
  langchain_anthropic
22
  langchain_google_genai
23
- docxcompose
 
 
 
 
11
  pdf2docx
12
  pymupdf
13
  python-docx
 
14
  pdfplumber
15
  python-docx
16
  redlines
 
19
  langchain_openai
20
  langchain_anthropic
21
  langchain_google_genai
22
+ docxcompose
23
+ google-cloud-aiplatform==1.12.0
24
+ pdfminer.six==20201018
25
+ langchain-core