EE21 committed on
Commit
af90ec4
1 Parent(s): c8f75e7

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +17 -1
app.py CHANGED
@@ -1,4 +1,5 @@
1
  import streamlit as st
 
2
  from rouge import Rouge
3
  from datasets import load_dataset
4
  import PyPDF2
@@ -12,9 +13,24 @@ from keyphrase_extraction import extract_sentences_with_obligations
12
  # Load in ToS
13
  dataset = load_dataset("EE21/ToS-Summaries")
14
 
 
 
 
 
 
 
 
 
 
 
 
 
15
  # Extract titles or identifiers for the ToS
16
- tos_titles = [f"Document {i}" for i in range(len(dataset['train']))]
17
 
 
 
 
18
  # Set page to wide mode
19
  st.set_page_config(layout="wide")
20
 
 
1
  import streamlit as st
2
+ import re
3
  from rouge import Rouge
4
  from datasets import load_dataset
5
  import PyPDF2
 
13
  # Load in ToS
14
  dataset = load_dataset("EE21/ToS-Summaries")
15
 
16
# Building blocks for the organization-name heuristic below.
_CAP_WORD = r"[A-Z][a-zA-Z]*"
# Dotted suffixes cannot be followed by `\b` (a "." followed by a space is not
# a word boundary), so only the purely alphabetic suffixes get one.
_ORG_SUFFIX = r"(?:Inc\.|Corp\.|Ltd\.|Co\.|(?:Corporation|LLC|Limited|Company)\b)"
# Alternative 1: a run of capitalized words that ends in a corporate suffix.
# The run is lazy so that the suffix is not swallowed as "just another word".
# Alternative 2: a plain run of capitalized words (no suffix present).
# Both alternatives are tried at the same start position, so the
# suffix-terminated form wins whenever it is available.
_ORG_NAME_RE = re.compile(
    rf"\b{_CAP_WORD}(?:\s+{_CAP_WORD})*?\s+{_ORG_SUFFIX}"
    rf"|\b{_CAP_WORD}(?:\s+{_CAP_WORD})*"
)


def extract_organization_name(text) -> str:
    """Return a best-guess organization name found in *text*.

    This is a deliberately simple heuristic: it returns the first run of
    capitalized words in the text, including a trailing corporate suffix
    ("Inc.", "Corp.", "Ltd.", "Co.", "Corporation", "LLC", "Limited",
    "Company") when one directly follows the run. It does not verify that
    the words actually name an organization, so a document starting with
    "Terms of Service" yields "Terms". Adjust the pattern to the real
    document content if better titles are needed.

    Args:
        text: The document text to scan. ``None`` or an empty string is
            treated as "nothing found".

    Returns:
        The matched name with internal whitespace collapsed to single
        spaces and no leading/trailing whitespace, or
        ``"Unknown Organization"`` when no capitalized word is present.
    """
    if not text:
        return "Unknown Organization"

    match = _ORG_NAME_RE.search(text)
    if match is None:
        return "Unknown Organization"

    # The match may span line breaks or repeated spaces; normalize so the
    # result is usable as a one-line title.
    return " ".join(match.group().split())
27
+
28
  # Extract titles or identifiers for the ToS
29
+ #tos_titles = [f"Document {i}" for i in range(len(dataset['train']))]
30
 
31
# Title each ToS document with the organization name found in its plan text
tos_titles = list(
    extract_organization_name(entry['plan_text']) for entry in dataset['train']
)
32
+
33
+
34
  # Set page to wide mode
35
  st.set_page_config(layout="wide")
36