from transformers import Pipeline
import numpy as np
import torch
import torch.nn.functional as F
import nltk
from nltk.chunk import conlltags2tree
from nltk import pos_tag
from nltk.tree import Tree
import requests
import re
import string

nltk.download("averaged_perceptron_tagger")
nltk.download("averaged_perceptron_tagger_eng")


def get_wikipedia_page_props(input_str: str):
    """
    Retrieves the Wikidata QID for a given Wikipedia page name from the
    specified language Wikipedia.

    Args:
        input_str (str): The input string in the format "page_name >> language".

    Returns:
        str: The QID, or "NIL" if the QID is not found.
    """
    try:
        # Split the prediction into page title and language code
        page_name, language = input_str.split(" >> ")
        page_name = page_name.strip()
        language = language.strip()
    except ValueError:
        return "Invalid input format. Use 'page_name >> language'."

    wikipedia_url = f"https://{language}.wikipedia.org/w/api.php"
    wikipedia_params = {
        "action": "query",
        "prop": "pageprops",
        "format": "json",
        "titles": page_name,
    }

    qid = "NIL"
    try:
        # Query the Wikipedia API for the page's Wikidata item
        response = requests.get(wikipedia_url, params=wikipedia_params)
        response.raise_for_status()
        data = response.json()

        if "pages" in data["query"]:
            page_id = list(data["query"]["pages"].keys())[0]
            if "pageprops" in data["query"]["pages"][page_id]:
                page_props = data["query"]["pages"][page_id]["pageprops"]
                if "wikibase_item" in page_props:
                    return page_props["wikibase_item"]
        return qid
    except Exception:
        return qid


def get_wikipedia_title(qid, language="en"):
    """
    Resolves a Wikidata QID to the corresponding Wikipedia title and URL
    in the requested language. Returns ("NIL", "None") if no sitelink exists.
    """
    url = "https://www.wikidata.org/w/api.php"
    params = {
        "action": "wbgetentities",
        "format": "json",
        "ids": qid,
        "props": "sitelinks/urls",
        "sitefilter": f"{language}wiki",
    }

    response = requests.get(url, params=params)
    data = response.json()

    try:
        title = data["entities"][qid]["sitelinks"][f"{language}wiki"]["title"]
        url = data["entities"][qid]["sitelinks"][f"{language}wiki"]["url"]
        return title, url
    except KeyError:
        return "NIL", "None"


class NelPipeline(Pipeline):

    def _sanitize_parameters(self, **kwargs):
        preprocess_kwargs = {}
        if "text" in kwargs:
            preprocess_kwargs["text"] = kwargs["text"]

        return preprocess_kwargs, {}, {}

    def preprocess(self, text, **kwargs):
        # Generate candidate "page_name >> language" strings with beam search
        outputs = self.model.generate(
            **self.tokenizer([text], return_tensors="pt"),
            num_beams=5,
            num_return_sequences=5,
            max_new_tokens=30,
        )
        wikipedia_predictions = self.tokenizer.batch_decode(
            outputs, skip_special_tokens=True
        )
        print(f"Decoded: {wikipedia_predictions}")

        return wikipedia_predictions

    def _forward(self, inputs):
        # Generation already happens in preprocess; pass the predictions through
        return inputs

    def postprocess(self, outputs, **kwargs):
        """
        Resolve each decoded prediction to a Wikidata QID and a Wikipedia
        title/URL, returning one result dict per candidate.

        :param outputs: decoded predictions produced by preprocess
        :param kwargs: unused
        :return: list of dicts with "title", "qid" and "url" keys
        """
        results = []
        for wikipedia_name in outputs:
            # Get the QID for the predicted page
            qid = get_wikipedia_page_props(wikipedia_name)
            print(f"{wikipedia_name} -- QID: {qid}")
            # Get the Wikipedia title and URL for that QID
            title, url = get_wikipedia_title(qid)
            results.append({"title": title, "qid": qid, "url": url})

        return results
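

# Minimal usage sketch: registers NelPipeline with the transformers pipeline
# registry and runs it on one sentence. The task name "generic-nel", the
# checkpoint "my-org/nel-model", and the [START]/[END] entity markers are
# illustrative assumptions, not values defined elsewhere in this file.
if __name__ == "__main__":
    from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
    from transformers.pipelines import PIPELINE_REGISTRY

    PIPELINE_REGISTRY.register_pipeline(
        "generic-nel",
        pipeline_class=NelPipeline,
        pt_model=AutoModelForSeq2SeqLM,
    )

    tokenizer = AutoTokenizer.from_pretrained("my-org/nel-model")
    model = AutoModelForSeq2SeqLM.from_pretrained("my-org/nel-model")
    nel_pipeline = NelPipeline(model=model, tokenizer=tokenizer, task="generic-nel")

    # Each result is a dict with "title", "qid" and "url" keys
    print(nel_pipeline("Paris is the capital of [START] France [END]."))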