coptic-english-translator / coptic_english_pipeline.py

Upload 13 files

2fddad0 verified 6 months ago

No virus

4.05 kB

	from typing import Dict
	import numpy as np
	import torch
	from transformers import Pipeline
	from transformers.utils import ModelOutput
	from transformers import pipeline
	from transformers.pipelines import PIPELINE_REGISTRY
	from transformers import AutoModelForSeq2SeqLM
	from huggingface_hub import Repository

	# Dialect tags: single Cyrillic letters that CopticEnglishPipeline.preprocess
	# prepends (followed by a space) to the transliterated input text.
	# NOTE(review): presumably these are the dialect markers the model was
	# trained with -- confirm against the model card before changing them.
	SAHIDIC_TAG = "з"  # used when from_bohairic is falsy (the default)
	BOHAIRIC_TAG = "б"  # used when from_bohairic is truthy

	from transformers import GenerationConfig

	# Generation settings passed to every ``model.generate`` call made by
	# ``CopticEnglishPipeline._forward``.
	GENERATION_CONFIG = GenerationConfig(
	    # --- output length bounds ---
	    # NOTE(review): both ``max_length`` and ``max_new_tokens`` are set; per the
	    # transformers docs ``max_new_tokens`` should take precedence -- confirm.
	    min_length=0,
	    max_length=20,
	    min_new_tokens=1,
	    max_new_tokens=128,
	    # --- beam search ---
	    num_beams=5,
	    num_beam_groups=1,
	    diversity_penalty=0.0,
	    early_stopping=True,
	    # --- sampling ---
	    do_sample=True,
	    temperature=1.0,
	    top_k=50,
	    top_p=0.95,
	    # --- returned structure ---
	    # ``_forward`` reads ``outputs.sequences`` and ``outputs.scores``.
	    return_dict_in_generate=True,
	    output_scores=True,
	)


	class CopticEnglishPipeline(Pipeline):
	    """Custom pipeline that translates Coptic text into English.

	    The input text is lower-cased, transliterated with ``greekify`` and
	    prefixed with a dialect tag before it is tokenized and handed to
	    ``self.model.generate``.

	    Supported keyword arguments:
	        from_bohairic: prefix the input with ``BOHAIRIC_TAG`` instead of
	            ``SAHIDIC_TAG``.
	        output_confidence: additionally return a confidence score for the
	            generated translation.
	    """

	    def _sanitize_parameters(self, **kwargs):
	        """Split user kwargs into (preprocess, forward, postprocess) dicts.

	        An explicit ``False`` is forwarded instead of being dropped, so a
	        call-time value can switch off a flag that was enabled when the
	        pipeline was constructed. ``None`` is treated as "not specified".
	        """
	        preprocess_kwargs = {}
	        if kwargs.get("from_bohairic") is not None:
	            preprocess_kwargs["from_bohairic"] = bool(kwargs["from_bohairic"])

	        forward_kwargs = {}
	        if kwargs.get("output_confidence") is not None:
	            forward_kwargs["output_confidence"] = bool(kwargs["output_confidence"])

	        # postprocess() takes no optional parameters.
	        return preprocess_kwargs, forward_kwargs, {}

	    def preprocess(self, text, from_bohairic=False):
	        """Normalize *text*, prepend the dialect tag and return its token ids.

	        Returns whatever ``self.tokenizer.encode(..., return_tensors="pt")``
	        produces for the tagged text.
	        """
	        text = greekify(text.lower())
	        dialect_tag = BOHAIRIC_TAG if from_bohairic else SAHIDIC_TAG
	        return self.tokenizer.encode(f"{dialect_tag} {text}", return_tensors="pt")

	    def _forward(self, input_tensors, output_confidence=False) -> tuple:
	        """Generate the translation and, optionally, a confidence score.

	        Returns:
	            A ``(translated_text, confidence)`` tuple. ``confidence`` is
	            ``None`` unless *output_confidence* is truthy; it is ``0.0`` when
	            the decoded translation contains no words.
	        """
	        outputs = self.model.generate(
	            # Drop tokens beyond the tokenizer's declared maximum length.
	            input_tensors[:, : self.tokenizer.model_max_length],
	            generation_config=GENERATION_CONFIG,
	        )

	        translated_text = self.tokenizer.decode(
	            outputs.sequences[0], skip_special_tokens=True
	        )

	        if not output_confidence:
	            return translated_text, None

	        num_words = len(translated_text.split())
	        if num_words == 0:
	            # An empty translation would otherwise divide by zero below and
	            # yield a NumPy warning plus 0.0 or nan; report 0.0 explicitly.
	            return translated_text, 0.0

	        # Highest softmax probability found in each generation step's scores.
	        # NOTE(review): ``.max()`` spans the whole score tensor of a step; with
	        # num_beams > 1 that presumably covers every beam, so it need not be
	        # the probability of the token in the returned sequence -- confirm.
	        confidences = [
	            torch.softmax(score, dim=-1).max().item() for score in outputs.scores
	        ]
	        # Scale the prediction probability by the number of words in the
	        # sentence: exp(sum of log-probabilities / word count).
	        scaled_probability = np.exp(sum(np.log(confidences)) / num_words)
	        return translated_text, scaled_probability

	    def postprocess(self, outputs):
	        """Turn the ``(text, confidence)`` tuple from ``_forward`` into a dict.

	        The ``"confidence"`` key is only present when a confidence was computed.
	        """
	        text, confidence = outputs
	        result = {"translation": text}
	        if confidence is not None:
	            result["confidence"] = confidence
	        return result


	# Character-level transliteration table used by ``greekify``.
	# Keys are lower-case Coptic letters; values are their replacements.
	COPTIC_TO_GREEK = {
	    # Coptic letters that are replaced by a Greek letter.
	    **dict(
	        zip(
	            "ⲁⲃⲅⲇⲉⲋⲍⲏⲑⲓⲕⲗⲙⲛⲝⲟⲡⲣⲥⲧⲩⲫⲭⲯⲱ",
	            "αβγδεϛζηθικλμνξοπρστυφχψω",
	        )
	    ),
	    # Remaining Coptic letters, replaced by a Latin letter.
	    **dict(zip("ϣϥϧϩϫϭϯ", "sfkhjct")),
	}


	def greekify(coptic_text):
	    """Return *coptic_text* lower-cased and transliterated character by character.

	    Each character is lower-cased on its own and then replaced via
	    ``COPTIC_TO_GREEK``; characters without an entry are kept (lower-cased).
	    """
	    lowered = (symbol.lower() for symbol in coptic_text)
	    return "".join(COPTIC_TO_GREEK.get(symbol, symbol) for symbol in lowered)


	if __name__ == "__main__":
	    # Script entry point: register the custom task, run one sample
	    # translation, then save the pipeline into a local repository directory.
	    task_name = "coptic-english-translation"
	    model_id = "megalaa/mul-cop-en-norm-group-greekified"
	    local_dir = "cop-eng-translation"

	    # Make the task name resolvable by ``pipeline()``.
	    PIPELINE_REGISTRY.register_pipeline(
	        task_name,
	        pipeline_class=CopticEnglishPipeline,
	        pt_model=AutoModelForSeq2SeqLM,
	        default={"pt": model_id},
	        type="text",
	    )

	    translator = pipeline(task_name, model=model_id)

	    # Smoke test: translate one sentence and print the result dict.
	    sample_result = translator(
	        "ⲛⲧⲟϥ ⲡⲉ ⲓⲏⲥⲟⲩⲥ ⲡⲉⲭⲣⲓⲥⲧⲟⲥ",
	        from_bohairic=False,
	        output_confidence=True,
	    )
	    print(sample_result)

	    # NOTE(review): ``clone_from`` presumably clones the hub repository into
	    # ``local_dir`` before the pipeline is saved there; nothing is pushed here.
	    repo = Repository(local_dir, clone_from=model_id)
	    translator.save_pretrained(local_dir)