rondaravaol committed on
Commit 76f22a1 · 1 Parent(s): ac8e459

Adding inference example

Files changed (3):
  1. README.md +67 -1
  2. code/labels.py +0 -22
  3. code/utils.py +0 -19
README.md CHANGED
@@ -3,4 +3,70 @@ base_model:
 - microsoft/markuplm-base
 library_name: transformers
 pipeline_tag: token-classification
----
+---
+
+This is a fine-tuned proof-of-concept (PoC) of the [markuplm-base](https://huggingface.co/microsoft/markuplm-base) model for parsing news attributes from web pages: author, publication date, title, content, etc.
+
+**Inference example**
+
+The code accepts a URL as input, loads the web page, and returns a JSON object with the extracted data (author, publication date, title and content).
+
+```python
+
+id2label = {0: "none", 1:"title", 2:"content", 3:"author", 4: "date", 5: "header", 6: "footer", 7: "rail", 8: "advertisement", 9: "navigation"}
+
+def eval(url):
+    current_dir = os.path.dirname(os.path.abspath(__file__))
+
+    model_folder = os.path.join(current_dir, 'models')  # models folder is in the repository root
+    model_name = 'OxMarkupLM.pt'
+
+    processor = MarkupLMProcessor.from_pretrained("microsoft/markuplm-base")
+    processor.parse_html = False  # nodes and xpaths are extracted separately in utils
+
+    model_path = os.path.join(model_folder, model_name)
+
+    model = MarkupLMForTokenClassification.from_pretrained(
+        model_path, id2label=labels.id2label, label2id=labels.label2id
+    )
+
+    html = utils.clean_html(utils.get_html_content(url))
+    data = [utils.extract_nodes_and_feautures(html)]
+    example = utils.split_sliding_data(data, 10, 0)  # non-overlapping windows of 10 nodes
+
+    title, author, date, content = [], [], [], []
+    for splited in example:
+        nodes, xpaths = splited['nodes'], splited['xpaths']
+        encoding = processor(
+            nodes=nodes, xpaths=xpaths, return_offsets_mapping=True,
+            padding="max_length", truncation=True, max_length=512, return_tensors="pt"
+        )
+        offset_mapping = encoding.pop("offset_mapping")
+        with torch.no_grad():
+            logits = model(**encoding).logits
+
+        predictions = logits.argmax(-1)
+        processed_words = []
+
+        for pred_id, word_id, offset in zip(predictions[0].tolist(), encoding.word_ids(0), offset_mapping[0].tolist()):
+            if word_id is not None and offset[0] == 0:  # keep only the first sub-token of each word
+                if pred_id == 1:
+                    title.append(nodes[word_id])
+                elif pred_id == 2 and word_id not in processed_words:
+                    processed_words.append(word_id)
+                    content.append(nodes[word_id])
+                elif pred_id == 3:
+                    author.append(nodes[word_id])
+                elif pred_id == 4:
+                    date.append(nodes[word_id])
+
+    title = rank_titles(title, '\n'.join(content))  # rank_titles: project helper that picks the best title candidate
+    return {
+        "model_name": model_name,
+        "url": url,
+        "title": title,
+        "author": author,
+        "date": date,
+        "content": content,
+    }
+```
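Note that the committed snippet assumes the usual imports are already in place (`os`, `torch`, `MarkupLMProcessor`/`MarkupLMForTokenClassification` from `transformers`, plus the repository's `labels` and `utils` modules and a `rank_titles` helper defined elsewhere). A minimal, hypothetical invocation could then look like this (the URL is only a placeholder):

```python
# Hypothetical usage of the eval() helper from the README example above;
# the URL is a placeholder, not part of the repository.
import json

result = eval("https://example.com/some-news-article")
print(json.dumps(result, indent=2, ensure_ascii=False))
```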
code/labels.py CHANGED
@@ -7,28 +7,6 @@ id2label = {0: "none", 1:"title", 2:"content", 3:"author", 4: "date", 5: "header
 label2id = {label:id for id, label in id2label.items()}
 
 label_list = ["B-" + x for x in list(id2label.values())]
-print(label_list)
-
-def get_class_weights_tensor(dataset):
-    all_labels = [label for data_item in dataset for label in data_item['labels']]
-
-    # Count the frequency of each label
-    label_counter = Counter(all_labels)
-
-    # Calculate the class weights
-    total_count = sum(label_counter.values())
-    class_weights = {label: total_count / count for label, count in label_counter.items()}
-
-    # Normalize the weights
-    sum_weights = sum(class_weights.values())
-    normalized_class_weights = {label: weight / sum_weights for label, weight in class_weights.items()}
-
-    # Convert class weights to a tensor
-    class_weights_list = [normalized_class_weights[label] for label in sorted(normalized_class_weights.keys())]
-    class_weights_tensor = torch.tensor(class_weights_list, dtype=torch.float).to(device)
-
-    return class_weights_tensor
-
 
 def get_labels(predictions, references):
     # Transform predictions and references tensors to numpy arrays
code/utils.py CHANGED
@@ -2,29 +2,13 @@ import pandas as pd
 import requests
 from bs4 import BeautifulSoup
 import re
-import json
-import os
 from common import custom_feature_extraction_markuplm
-import glob
-
-def get_latest_file(directory):
-    # Get list of all files in the directory
-    list_of_files = glob.glob(os.path.join(directory, '*'))
-    print("Files in model folder\n" + str(list_of_files))
-
-    # Sort files based on creation time
-    latest_file = max(list_of_files, key=os.path.getctime)
-
-    return latest_file
 
 def split_sliding_data(items, window_size, overlap):
     new_data = []
     for obj in items:
-        #print (obj.keys(), '\n')
-        #print (obj, '\n')
         nodes = obj['nodes']
         num_elements = len(nodes)
-        #print(num_elements, '\n')
         counter = 0
         for i in range(0, num_elements, window_size - overlap):
             start = i
@@ -32,12 +16,10 @@ def split_sliding_data(items, window_size, overlap):
             #print (start, end)
             new_obj = {
                 'Index': obj['Index'] if 'Index' in obj else 0,
-                'Index2': counter,
                 'Url': obj['Url'] if 'Url' in obj else None,
                 'Path': obj['Path'] if 'Path' in obj else None,
                 'nodes': obj['nodes'][start:end],
                 'xpaths': obj['xpaths'][start:end],
-                'xpaths_simple': obj['xpaths_simple'][start:end],
                 'labels': obj['labels'][start:end] if 'labels' in obj else None,
             }
             counter= counter+1
@@ -89,7 +71,6 @@ def extract_nodes_and_feautures(html_content):
         row = {}
         row['nodes'] = encoding['nodes'][0]
         row['xpaths'] = encoding['xpaths'][0]
-        row['xpaths_simple'] = encoding['xpaths_simple'][0]
         row['labels'] = encoding['labels'][0]
         return row
     else:
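The README inference example relies on split_sliding_data to chunk long pages into 10-node windows, so here is a small sketch of the expected behaviour (the sample nodes and xpaths are placeholders made up for illustration):

```python
# Illustrative call to split_sliding_data as used by the inference example
# (window_size=10, overlap=0); node and xpath values are placeholders.
data = [{
    'nodes': [f"node {i}" for i in range(23)],
    'xpaths': [f"/html/body/div[{i}]" for i in range(23)],
}]
chunks = split_sliding_data(data, window_size=10, overlap=0)

# With 23 nodes and non-overlapping windows of 10, this should yield
# chunks of 10, 10 and 3 nodes.
print([len(c['nodes']) for c in chunks])  # -> [10, 10, 3]
```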