# Copyright 2022 The OFA-Sys Team.
# All rights reserved.
# This source code is licensed under the Apache 2.0 license
# found in the LICENSE file in the root directory.

import logging
import re

import torch  # torch.LongTensor / torch.cat are used below
import torch.utils.data
from fairseq.data import FairseqDataset

logger = logging.getLogger(__name__)


class OFADataset(FairseqDataset):
    """Base dataset shared by OFA tasks: wraps a raw example source together
    with the BPE encoder and the source/target fairseq dictionaries used for
    text preprocessing and encoding."""

    def __init__(self, split, dataset, bpe, src_dict, tgt_dict):
        self.split = split
        self.dataset = dataset
        self.bpe = bpe
        self.src_dict = src_dict
        self.tgt_dict = tgt_dict
        # Cache the special-symbol ids and pre-built one-element tensors so
        # they can be concatenated onto encoded sequences cheaply.
        self.bos = src_dict.bos()
        self.eos = src_dict.eos()
        self.pad = src_dict.pad()
        self.bos_item = torch.LongTensor([self.bos])
        self.eos_item = torch.LongTensor([self.eos])

    def __len__(self):
        return len(self.dataset)
    def encode_text(self, text, length=None, append_bos=False, append_eos=False, use_bpe=True):
        """Optionally BPE-encode ``text``, map it to target-dictionary ids,
        truncate to ``length`` tokens, and append bos/eos if requested."""
        s = self.tgt_dict.encode_line(
            line=self.bpe.encode(text) if use_bpe else text,
            add_if_not_exist=False,
            append_eos=False
        ).long()
        if length is not None:
            s = s[:length]
        if append_bos:
            s = torch.cat([self.bos_item, s])
        if append_eos:
            s = torch.cat([s, self.eos_item])
        return s
    def pre_question(self, question, max_ques_words=None):
        """Normalise a question: lowercase, strip leading punctuation, turn
        '-' and '/' into spaces, collapse whitespace, and truncate to
        ``max_ques_words`` words."""
        question = question.lower().lstrip(",.!?*#:;~").replace('-', ' ').replace('/', ' ')

        question = re.sub(
            r"\s{2,}",
            ' ',
            question,
        )
        question = question.rstrip('\n')
        question = question.strip(' ')

        # truncate question
        question_words = question.split(' ')
        if max_ques_words is not None and len(question_words) > max_ques_words:
            question = ' '.join(question_words[:max_ques_words])

        return question
    def pre_caption(self, caption, max_words=None):
        """Normalise a caption the same way as ``pre_question``, and map the
        ``<person>`` placeholder token to the plain word ``person``."""
        caption = caption.lower().lstrip(",.!?*#:;~").replace('-', ' ').replace('/', ' ').replace('<person>', 'person')

        caption = re.sub(
            r"\s{2,}",
            ' ',
            caption,
        )
        caption = caption.rstrip('\n')
        caption = caption.strip(' ')

        # truncate caption
        caption_words = caption.split(' ')
        if max_words is not None and len(caption_words) > max_words:
            caption = ' '.join(caption_words[:max_words])

        return caption
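

# --- Usage sketch (illustrative addition, not part of the upstream file) ---
# Shows how the helpers above compose. ``_IdentityBPE`` is a hypothetical
# stand-in for OFA's real BPE wrapper (e.g. a GPT-2 BPE object), whose
# ``encode`` normally returns space-separated BPE token ids as a string.
if __name__ == "__main__":
    from fairseq.data import Dictionary

    class _IdentityBPE:
        def encode(self, text):
            return text  # identity "BPE" for illustration only

    d = Dictionary()
    for word in "a man riding a horse".split():
        d.add_symbol(word)

    ds = OFADataset(split="train", dataset=[], bpe=_IdentityBPE(),
                    src_dict=d, tgt_dict=d)
    caption = ds.pre_caption("A man -- riding  a horse", max_words=16)
    tokens = ds.encode_text(caption, append_bos=True, append_eos=True)
    # ``tokens`` is a 1-D LongTensor: [bos, <word ids>, eos]
    print(caption, tokens.tolist())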