Spaces:

Shakhovak
/

RU_ACCENT

Sleeping

RU_ACCENT / app.py

shakhovak

added examples

3e46eed 7 months ago

No virus

5.51 kB

	import json
	import os
	import re
	from os.path import join as join_path
	import gradio as gr

	from text_split import split_by_sentences


	class RUAccent:
	    """Dictionary-based stress (accent) placer for Russian text.

	    Two JSON dictionaries are read by :meth:`load`:

	    * ``file_norm.json`` -- maps a word to an entry with an ``"accent"`` field;
	    * ``file_omo.json`` -- maps a homograph to an entry with an
	      ``"acc_variants"`` field.
	    """

	    vowels = "аеёиоуыэюя"

	    # A token is either a word (optionally followed by "+"-prefixed parts,
	    # e.g. "за+мок") or a run of punctuation. The first alternative can match
	    # the empty string, which is why split_by_words drops empty matches.
	    _TOKEN_RE = re.compile(r"\w*(?:\+\w+)*|[^\w\s]+")

	    # Punctuation that must not be preceded by a space in the joined output.
	    _PUNCTUATION = "!\"#$%&'()*,./:;<=>?@[\\]^_`{|}~"
	    _SPACE_BEFORE_PUNC_RE = re.compile(" (?=[" + re.escape(_PUNCTUATION) + "])")

	    def __init__(self):
	        """Create an accentuator with no dictionaries loaded yet."""
	        self.omographs = None
	        self.accents = None
	        # load() does not read a "yo" dictionary; an empty mapping keeps
	        # _process_yo a harmless no-op instead of raising AttributeError.
	        self.yo_words = {}
	        self.workdir = os.getcwd()

	    def _read_dictionary(self, filename):
	        """Return the parsed JSON of ``<workdir>/dictionaries/<filename>``."""
	        path = join_path(self.workdir, "dictionaries", filename)
	        # The context manager guarantees the file handle is closed.
	        with open(path, encoding="utf-8") as file:
	            return json.load(file)

	    def load(self, custom_accent=None, custom_omographs=None):
	        """Load both dictionaries from disk and apply user overrides.

	        Args:
	            custom_accent: optional mapping merged over ``file_norm.json``.
	            custom_omographs: optional mapping merged over ``file_omo.json``.

	        Raises:
	            OSError: if a dictionary file cannot be opened.
	            json.JSONDecodeError: if a dictionary file is not valid JSON.
	        """
	        self.omographs = self._read_dictionary("file_omo.json")
	        self.omographs.update(custom_omographs or {})

	        self.accents = self._read_dictionary("file_norm.json")
	        self.accents.update(custom_accent or {})

	    def split_by_words(self, string):
	        """Lower-case ``string`` and split it into word and punctuation tokens.

	        Whitespace is discarded; a word keeps any "+"-prefixed parts attached
	        (``"за+мок"`` stays one token).
	        """
	        return [token for token in self._TOKEN_RE.findall(string.lower()) if token]

	    def process_all(self, text):
	        """Run the whole pipeline and place accents in ``text``.

	        The text is split into sentences with ``split_by_sentences`` (imported
	        from ``text_split``); each sentence is tokenised, checked for
	        homographs, accented and joined back into a string.

	        Args:
	            text: input string.

	        Returns:
	            A 3-tuple ``(accented_sentences, omographs, unknown_words)``:
	            the processed sentences, ``"word: variants"`` strings for every
	            homograph found, and the words missing from the dictionary.
	            The last two lists are de-duplicated in first-occurrence order.
	        """
	        accented_sentences = []
	        found_omographs = []
	        unknown_words = []

	        for sentence in split_by_sentences(text):
	            words = self.split_by_words(sentence)

	            sentence_omographs = self._process_omographs(words)
	            found_omographs.extend(sentence_omographs)

	            accented_words, sentence_unknown = self._process_accent(
	                words, sentence_omographs
	            )
	            unknown_words.extend(sentence_unknown)

	            accented_sentences.append(
	                self.delete_spaces_before_punc(" ".join(accented_words))
	            )

	        omograph_lines = [
	            f"{word}: {variants}"
	            for entry in found_omographs
	            for word, variants in entry.items()
	        ]
	        # dict.fromkeys removes duplicates while keeping a stable order
	        # (a plain set would reorder the results from run to run).
	        return (
	            accented_sentences,
	            list(dict.fromkeys(omograph_lines)),
	            list(dict.fromkeys(unknown_words)),
	        )

	    def _process_yo(self, text):
	        """Replace, in place, every token that has an entry in ``yo_words``.

	        Returns the same list object that was passed in.
	        """
	        for index, word in enumerate(text):
	            text[index] = self.yo_words.get(word, word)
	        return text

	    def _process_omographs(self, text):
	        """Return ``[{word: acc_variants}, ...]`` for each homograph in ``text``.

	        One single-key dict is produced per occurrence, in token order.
	        """
	        found = []
	        for word in text:
	            variants = self.omographs.get(word)
	            if variants:
	                found.append({word: variants["acc_variants"]})
	        return found

	    def _process_accent(self, text, founded_omographs):
	        """Replace known words in ``text`` (in place) with their accented form.

	        Args:
	            text: list of tokens; it is modified and returned.
	            founded_omographs: dicts produced by :meth:`_process_omographs`.

	        Returns:
	            ``(text, unknown_words)`` where ``unknown_words`` holds tokens
	            that are absent from the accent dictionary and contain more than
	            one vowel.
	        """
	        # Built once instead of once per token.
	        omograph_words = {word for entry in founded_omographs for word in entry}

	        unknown_words = []
	        for index, word in enumerate(text):
	            entry = self.accents.get(word, word)

	            # NOTE(review): as in the original logic, a homograph is left
	            # untouched only when it has no entry in the accent dictionary
	            # (the lookup result, not the word, is compared) -- confirm intent.
	            if isinstance(entry, str) and entry in omograph_words:
	                continue

	            if entry != word:
	                text[index] = entry["accent"]
	            elif sum(word.count(vowel) for vowel in RUAccent.vowels) > 1:
	                unknown_words.append(word)

	        return text, unknown_words

	    def delete_spaces_before_punc(self, text):
	        """Remove the single space that directly precedes a punctuation mark."""
	        return self._SPACE_BEFORE_PUNC_RE.sub("", text)


	# Shared accentuator instance; its dictionaries are read from disk right
	# here, when the module is executed.
	ru_accent = RUAccent()
	ru_accent.load()


	# --- Texts shown on the demo page ---------------------------------------
	title = "Демо для модели расстановки ударения на русском языке"

	description = (
	    "Для расстановки ударения необходимо ввести текст в поле ниже. "
	    "Алгоритм обработает текст и выдаст текст с ударениями, "
	    "а также 2 списка: омографы, если они есть в тексте и слов, "
	    "не найденных в словаре."
	)

	examples = [
	    "Я иду в замок повесить замок.",
	    "Таблетки дорогие, не докуписся, по тыще, да больше...",
	    "А главное, шо на вершине горы это было разрушены замка средневекового времён деспота Георгия, царица Ирина, его жена была там последней владетельницей этого замка.",
	]

	# One text box per value returned by ru_accent.process_all, in order:
	# processed text, homographs, words missing from the dictionary.
	outputs = [
	    gr.Textbox(label=caption)
	    for caption in ("Обработанный текст", "Омографы", "Нет в словаре")
	]

	# Defined at module level but not passed to gr.Interface below.
	theme = "huggingface"

	interface = gr.Interface(
	    title=title,
	    description=description,
	    examples=examples,
	    fn=ru_accent.process_all,
	    inputs=gr.Textbox(label="текст для расстановки ударения"),
	    outputs=outputs,
	)

	# Start the web UI only when the file is run as a script.
	if __name__ == "__main__":
	    interface.launch(debug=True, share=True)