rondaravaol committed
Commit · 76f22a1 · Parent(s): ac8e459

Adding inference example

Browse files:
- README.md +67 -1
- code/labels.py +0 -22
- code/utils.py +0 -19
README.md
CHANGED

@@ -3,4 +3,70 @@ base_model:

The YAML front matter is unchanged:

```yaml
base_model:
- microsoft/markuplm-base
library_name: transformers
pipeline_tag: token-classification
---
```

Everything below is added by this commit.

This is a fine-tuned PoC of the [markuplm-base](https://huggingface.co/microsoft/markuplm-base) model for parsing news attributes from web pages: author, publication date, title, content, etc.

**Inference example**

The code accepts a URL as input, loads the web page, and returns JSON with the extracted data (author, publication date, title and content).

```python
import os

import torch
from transformers import MarkupLMForTokenClassification, MarkupLMProcessor

import labels  # repo module code/labels.py (defines id2label / label2id)
import utils   # repo module code/utils.py (HTML loading and node extraction)

id2label = {0: "none", 1: "title", 2: "content", 3: "author", 4: "date", 5: "header", 6: "footer", 7: "rail", 8: "advertisement", 9: "navigation"}


def eval(url):
    current_dir = os.path.dirname(os.path.abspath(__file__))

    model_folder = os.path.join(current_dir, 'models')  # models folder is in the repository root
    model_name = 'OxMarkupLM.pt'

    processor = MarkupLMProcessor.from_pretrained("microsoft/markuplm-base")
    processor.parse_html = False  # nodes and xpaths are supplied already parsed

    model_path = os.path.join(model_folder, model_name)

    model = MarkupLMForTokenClassification.from_pretrained(
        model_path, id2label=labels.id2label, label2id=labels.label2id
    )

    # Download the page, clean it, and extract text nodes with their xpaths
    html = utils.clean_html(utils.get_html_content(url))
    data = [utils.extract_nodes_and_feautures(html)]
    example = utils.split_sliding_data(data, 10, 0)

    title, author, date, content = [], [], [], []
    for chunk in example:
        nodes, xpaths = chunk['nodes'], chunk['xpaths']
        encoding = processor(
            nodes=nodes, xpaths=xpaths, return_offsets_mapping=True,
            padding="max_length", truncation=True, max_length=512, return_tensors="pt"
        )
        offset_mapping = encoding.pop("offset_mapping")
        with torch.no_grad():
            logits = model(**encoding).logits

        predictions = logits.argmax(-1)
        processed_words = []

        # Keep only the first token of each word (offset[0] == 0) and route
        # the node text by its predicted class
        for pred_id, word_id, offset in zip(predictions[0].tolist(), encoding.word_ids(0), offset_mapping[0].tolist()):
            if word_id is not None and offset[0] == 0:
                if pred_id == 1:
                    title.append(nodes[word_id])
                elif pred_id == 2 and word_id not in processed_words:
                    processed_words.append(word_id)
                    content.append(nodes[word_id])
                elif pred_id == 3:
                    author.append(nodes[word_id])
                elif pred_id == 4:
                    date.append(nodes[word_id])

    title = rank_titles(title, '\n'.join(content))  # rank_titles is a helper from this repository
    return {
        "model_name": model_name,
        "url": url,
        "title": title,
        "author": author,
        "date": date,
        "content": content,
    }
```
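A minimal usage sketch, assuming the snippet above has been run in a context where the repo helpers are importable and `models/OxMarkupLM.pt` is present; the article URL is a hypothetical placeholder:

```python
import json

# Hypothetical article URL; any news page should work in principle.
result = eval("https://example.com/news/some-article")
print(json.dumps(result, indent=2, ensure_ascii=False))
```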
code/labels.py
CHANGED

```diff
@@ -7,28 +7,6 @@ id2label = {0: "none", 1:"title", 2:"content", 3:"author", 4: "date", 5: "header
 label2id = {label:id for id, label in id2label.items()}
 
 label_list = ["B-" + x for x in list(id2label.values())]
-print(label_list)
-
-def get_class_weights_tensor(dataset):
-    all_labels = [label for data_item in dataset for label in data_item['labels']]
-
-    # Count the frequency of each label
-    label_counter = Counter(all_labels)
-
-    # Calculate the class weights
-    total_count = sum(label_counter.values())
-    class_weights = {label: total_count / count for label, count in label_counter.items()}
-
-    # Normalize the weights
-    sum_weights = sum(class_weights.values())
-    normalized_class_weights = {label: weight / sum_weights for label, weight in class_weights.items()}
-
-    # Convert class weights to a tensor
-    class_weights_list = [normalized_class_weights[label] for label in sorted(normalized_class_weights.keys())]
-    class_weights_tensor = torch.tensor(class_weights_list, dtype=torch.float).to(device)
-
-    return class_weights_tensor
-
 
 def get_labels(predictions, references):
     # Transform predictions and references tensos to numpy arrays
```
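For context, the deleted `get_class_weights_tensor` computed normalized inverse-frequency class weights. A small self-contained sketch of the same arithmetic on toy data (the `Counter`/`torch` imports and the CPU `device` are assumptions here, since the original relied on module-level state):

```python
from collections import Counter

import torch

device = torch.device("cpu")  # assumption; the original used a module-level `device`

dataset = [{'labels': [0, 0, 0, 2]}, {'labels': [2, 1]}]        # toy data
counts = Counter(l for item in dataset for l in item['labels'])  # {0: 3, 2: 2, 1: 1}
total = sum(counts.values())
weights = {l: total / c for l, c in counts.items()}              # rarer label -> larger weight
norm = sum(weights.values())
weights = {l: w / norm for l, w in weights.items()}              # normalize to sum to 1
weights_tensor = torch.tensor([weights[l] for l in sorted(weights)], dtype=torch.float).to(device)
print(weights_tensor)  # tensor([0.1818, 0.5455, 0.2727])
```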
code/utils.py
CHANGED

```diff
@@ -2,29 +2,13 @@ import pandas as pd
 import requests
 from bs4 import BeautifulSoup
 import re
-import json
-import os
 from common import custom_feature_extraction_markuplm
-import glob
-
-def get_latest_file(directory):
-    # Get list of all files in the directory
-    list_of_files = glob.glob(os.path.join(directory, '*'))
-    print("Files in model folder\n" + str(list_of_files))
-
-    # Sort files based on creation time
-    latest_file = max(list_of_files, key=os.path.getctime)
-
-    return latest_file
 
 def split_sliding_data(items, window_size, overlap):
     new_data = []
     for obj in items:
-        #print (obj.keys(), '\n')
-        #print (obj, '\n')
         nodes = obj['nodes']
         num_elements = len(nodes)
-        #print(num_elements, '\n')
         counter = 0
         for i in range(0, num_elements, window_size - overlap):
             start = i
@@ -32,12 +16,10 @@ def split_sliding_data(items, window_size, overlap):
             #print (start, end)
             new_obj = {
                 'Index': obj['Index'] if 'Index' in obj else 0,
-                'Index2': counter,
                 'Url': obj['Url'] if 'Url' in obj else None,
                 'Path': obj['Path'] if 'Path' in obj else None,
                 'nodes': obj['nodes'][start:end],
                 'xpaths': obj['xpaths'][start:end],
-                'xpaths_simple': obj['xpaths_simple'][start:end],
                 'labels': obj['labels'][start:end] if 'labels' in obj else None,
             }
             counter = counter + 1
@@ -89,7 +71,6 @@ def extract_nodes_and_feautures(html_content):
         row = {}
         row['nodes'] = encoding['nodes'][0]
         row['xpaths'] = encoding['xpaths'][0]
-        row['xpaths_simple'] = encoding['xpaths_simple'][0]
         row['labels'] = encoding['labels'][0]
         return row
     else:
```
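The surviving `split_sliding_data` chunks each page's node list into windows of `window_size` nodes advancing by `window_size - overlap`. A standalone sketch of that arithmetic on toy values (the hunk does not show the `end` computation, so `end = start + window_size` is an assumption; the inference example above calls it with `window_size=10, overlap=0`):

```python
# Toy illustration of the windowing in split_sliding_data:
# step = window_size - overlap; each window spans nodes[start:start + window_size].
nodes = [f"node{i}" for i in range(10)]
window_size, overlap = 4, 1

for start in range(0, len(nodes), window_size - overlap):
    print(nodes[start:start + window_size])
# ['node0', 'node1', 'node2', 'node3']
# ['node3', 'node4', 'node5', 'node6']
# ['node6', 'node7', 'node8', 'node9']
# ['node9']
```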