import requests
from transformers import Pipeline
def get_wikipedia_page_props(input_str: str):
    """
    Retrieve the Wikidata QID for a given Wikipedia page name from the
    specified language edition of Wikipedia.

    Args:
        input_str (str): The input string in the format "page_name >> language".

    Returns:
        str: The QID, or "NIL" if the QID cannot be found or the input is malformed.
    """
    try:
        # Split the prediction into a page title and a language code
        page_name, language = input_str.split(" >> ")
        page_name = page_name.strip()
        language = language.strip()
    except ValueError:
        # Malformed predictions cannot be linked, so treat them as NIL
        return "NIL"
    wikipedia_url = f"https://{language}.wikipedia.org/w/api.php"
    wikipedia_params = {
        "action": "query",
        "prop": "pageprops",
        "format": "json",
        "titles": page_name,
    }

    qid = "NIL"
    try:
        # Query the Wikipedia API for the page's Wikidata item ID
        response = requests.get(wikipedia_url, params=wikipedia_params, timeout=10)
        response.raise_for_status()
        data = response.json()

        pages = data.get("query", {}).get("pages", {})
        if pages:
            page_id = list(pages.keys())[0]
            page_props = pages[page_id].get("pageprops", {})
            qid = page_props.get("wikibase_item", qid)
    except Exception:
        # Network failures and unexpected payloads are treated as "not found"
        pass
    return qid
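

# Usage sketch (hypothetical inputs; "Q90" is the Wikidata item for the city
# of Paris, so a well-formed prediction resolves like this):
#
#     get_wikipedia_page_props("Paris >> fr")  # -> "Q90"
#     get_wikipedia_page_props("Paris")        # -> "NIL" (malformed input)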
def get_wikipedia_title(qid, language="en"):
    """
    Resolve a Wikidata QID to the corresponding Wikipedia page title and URL
    in the requested language, or ("NIL", "None") if no sitelink exists.
    """
    url = "https://www.wikidata.org/w/api.php"
    params = {
        "action": "wbgetentities",
        "format": "json",
        "ids": qid,
        "props": "sitelinks/urls",
        "sitefilter": f"{language}wiki",
    }

    response = requests.get(url, params=params, timeout=10)
    data = response.json()

    try:
        sitelink = data["entities"][qid]["sitelinks"][f"{language}wiki"]
        return sitelink["title"], sitelink["url"]
    except KeyError:
        return "NIL", "None"
class NelPipeline(Pipeline):
    def _sanitize_parameters(self, **kwargs):
        preprocess_kwargs = {}
        if "text" in kwargs:
            preprocess_kwargs["text"] = kwargs["text"]
        return preprocess_kwargs, {}, {}

    def preprocess(self, text, **kwargs):
        # Generate several candidate "page_name >> language" strings with
        # beam search so postprocessing has alternatives to fall back on
        outputs = self.model.generate(
            **self.tokenizer([text], return_tensors="pt"),
            num_beams=5,
            num_return_sequences=5,
            max_new_tokens=30,
        )
        wikipedia_predictions = self.tokenizer.batch_decode(
            outputs, skip_special_tokens=True
        )
        print(f"Decoded: {wikipedia_predictions}")
        return wikipedia_predictions

    def _forward(self, inputs):
        # Generation already happened in preprocess; pass candidates through
        return inputs
    def postprocess(self, outputs, **kwargs):
        """
        Resolve each decoded "page_name >> language" prediction to a Wikidata
        QID and a Wikipedia title/URL.

        :param outputs: the decoded predictions returned by preprocess
        :param kwargs: unused
        :return: a list of dicts with "title", "qid", and "url" keys
        """
        results = []
        for wikipedia_name in outputs:
            # Resolve the predicted page name to a Wikidata QID
            qid = get_wikipedia_page_props(wikipedia_name)
            print(f"{wikipedia_name} -- QID: {qid}")

            # Resolve the QID to an English Wikipedia title and URL
            title, url = get_wikipedia_title(qid)
            results.append({"title": title, "qid": qid, "url": url})
        return results
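

# Minimal end-to-end sketch. The checkpoint name below is a placeholder:
# any seq2seq model trained to emit "page_name >> language" strings would
# do, and the loading plumbing is an assumption, not part of this file.
#
#     from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
#
#     model = AutoModelForSeq2SeqLM.from_pretrained("your-org/your-nel-model")
#     tokenizer = AutoTokenizer.from_pretrained("your-org/your-nel-model")
#     nel = NelPipeline(model=model, tokenizer=tokenizer)
#     print(nel("Victor Hugo was born in Besançon."))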