EE21 committed on
Commit
2313689
1 Parent(s): c9be414

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +18 -11
app.py CHANGED
@@ -7,28 +7,35 @@ from extractive_summarization import summarize_with_textrank, summarize_with_lsa
7
  from abstractive_summarization import summarize_with_bart_cnn, summarize_with_bart_ft, summarize_with_led, summarize_with_t5
8
  from keyword_extraction import extract_keywords
9
  from keyphrase_extraction import extract_sentences_with_obligations
 
 
10
  #from blanc import BlancHelp
11
 
12
 
13
  # Load in ToS
14
  dataset = load_dataset("EE21/ToS-Summaries")
15
 
16
- def extract_organization_name(text):
17
- # A simple regex pattern to identify organization names. This pattern looks for capitalized words, possibly followed by "Inc.", "Ltd.", etc.
18
- # This is a very basic pattern and might need to be adjusted based on the actual content of the documents.
19
- pattern = r"\b[A-Z][a-zA-Z]*(?:\s[A-Z][a-zA-Z]*)*\s(?:Inc\.|Corporation|Corp\.|LLC|Ltd\.|Limited|Co\.|Company)?\b"
20
 
21
- # Search for the pattern in the text
22
- match = re.search(pattern, text)
23
- if match:
24
- return match.group()
25
- else:
26
- return "Unknown Organization"
 
 
 
 
 
 
 
 
 
27
 
28
  # Extract titles or identifiers for the ToS
29
  #tos_titles = [f"Document {i}" for i in range(len(dataset['train']))]
30
 
31
- tos_titles = [extract_organization_name(doc['plain_text']) for doc in dataset['train']]
32
 
33
 
34
  # Set page to wide mode
 
7
  from abstractive_summarization import summarize_with_bart_cnn, summarize_with_bart_ft, summarize_with_led, summarize_with_t5
8
  from keyword_extraction import extract_keywords
9
  from keyphrase_extraction import extract_sentences_with_obligations
10
+ from transformers import AutoModelForTokenClassification, AutoTokenizer
11
+ import torch
12
  #from blanc import BlancHelp
13
 
14
 
15
  # Load in ToS
16
  dataset = load_dataset("EE21/ToS-Summaries")
17
 
 
 
 
 
18
 
19
+ model_name = "dbmdz/bert-large-cased-finetuned-conll03-english"
20
+ tokenizer = AutoTokenizer.from_pretrained(model_name)
21
+ model = AutoModelForTokenClassification.from_pretrained(model_name)
22
+
23
+ def extract_organization_names(text):
24
+ inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
25
+ outputs = model(**inputs)
26
+
27
+ predictions = torch.argmax(outputs.logits, dim=2)
28
+ entities = [tokenizer.convert_ids_to_tokens(inputs.input_ids[0][idx]) for idx, pred in enumerate(predictions[0]) if model.config.id2label[pred.item()] == 'B-ORG']
29
+ return " ".join(entities)
30
+
31
+ # Apply this function to your dataset
32
+ tos_titles = [extract_organization_names(doc['plain_text']) for doc in dataset['train']]
33
+
34
 
35
  # Extract titles or identifiers for the ToS
36
  #tos_titles = [f"Document {i}" for i in range(len(dataset['train']))]
37
 
38
+
39
 
40
 
41
  # Set page to wide mode