# NOTE(review): The lines originally here were Hugging Face Spaces file-viewer
# chrome (status text, commit hashes, a line-number gutter) captured along with
# the source during scraping; they are not Python and have been reduced to this
# comment so the module can be imported.
import gradio as gr
import os
import shutil
from pypdf import PdfReader
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
import fitz
# Hugging Face repo that provides both the tokenizer and the model weights.
TOKENIZER_REPO = "MediaTek-Research/Breeze-7B-Instruct-v1_0"
# local_files_only=False permits downloading from the Hub on first run.
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_REPO,local_files_only=False,use_fast=True)
# Instruction prefix (Simplified Chinese): "Please convert the following text
# to Traditional Chinese:" — prepended to every chunk in tran_txt().
tran_hints = "请将以下的文字转为繁体:"
# BOS/EOS marker strings of the chat template; referenced only by the
# commented-out post-processing in tran_txt(), currently unused.
start_flag="<s>"
end_flag="</s>"
# device_map="auto" lets accelerate place layers on available devices
# (GPU when present); bfloat16 halves the memory footprint.
model = AutoModelForCausalLM.from_pretrained(
TOKENIZER_REPO,
device_map="auto",
local_files_only=False,
torch_dtype=torch.bfloat16
)
def generate(text):
    """Run one user turn through the Breeze chat model.

    Args:
        text: The user prompt; surrounding whitespace is stripped.

    Returns:
        str: The full decoded sequence (prompt echo and special tokens
        included — callers must post-process), or "" for blank input.
    """
    chat_data = []
    text = text.strip()
    if text:
        chat_data.append({"role": "user", "content": text})
    if not chat_data:
        # apply_chat_template cannot build a prompt from an empty
        # conversation; the original crashed here on blank input.
        return ""
    achat = tokenizer.apply_chat_template(chat_data, return_tensors="pt")
    # Move the input ids to wherever device_map="auto" dispatched the model;
    # the original left them on CPU, which fails when the model is on GPU.
    achat = achat.to(model.device)
    outputs = model.generate(
        achat,
        max_new_tokens=2048,
        # Near-deterministic decoding settings, kept as in the original.
        top_p=0.01,
        top_k=85,
        repetition_penalty=1.1,
        temperature=0,
    )
    return tokenizer.decode(outputs[0])
def tran_txt(input_txt):
    """Translate ``input_txt`` to Traditional Chinese via the LLM.

    Prepends the ``tran_hints`` instruction prefix and returns the raw
    model output; the prompt echo and special tokens are NOT stripped
    (post-processing that did so was abandoned and has been removed
    as dead code).
    """
    prompt = tran_hints + "\n" + input_txt.strip()
    result = generate(prompt)
    # Debug trace of the raw model output.
    print("tran_result=" + result)
    return result
def exec_tran(file):
    """Translate an uploaded PDF and write the result to a text file.

    Args:
        file: Path of the uploaded PDF as delivered by gr.UploadButton.

    Returns:
        str: Path of the generated ``*_result.txt`` file (the ``.pdf``
        suffix, when present, is replaced; otherwise the suffix is
        appended to the original name).
    """
    temp_file = upload_file(file)
    page_texts = read_paragraphs(temp_file)
    # Derive the output name. str.find returns -1 when ".pdf" is absent;
    # the original used str.index, which raises ValueError in that case
    # and made its fallback branch unreachable.
    base = str(file)
    dot = base.find('.pdf')
    if dot != -1:
        base = base[:dot]
    temp_result_file = base + "_result.txt"
    # Explicit encoding so CJK output does not depend on the platform default.
    with open(temp_result_file, 'w', encoding='utf-8') as fw:
        # (The original issued a throwaway tran_txt(tran_hints) warm-up call
        # whose result was discarded — removed to save a full model pass.)
        for page_content in page_texts:
            tran_result = tran_txt(page_content)
            fw.write(tran_result + "\n")
    return temp_result_file
def upload_file(file):
    """Copy an uploaded file into the local ./data staging folder.

    Creates the folder on first use and returns the destination path
    reported by shutil.copy.
    """
    staging_dir = "./data"
    os.makedirs(staging_dir, exist_ok=True)
    return shutil.copy(file, staging_dir)
def read_paragraphs(pdf_path):
    """Extract a PDF's text as a list of sentence-level chunks.

    Each page's plain text is split on the CJK full stop "。" (the
    delimiter itself is dropped) and blank fragments are discarded.

    Args:
        pdf_path: Path to the PDF file.

    Returns:
        list[str]: Non-blank fragments in document order.
    """
    paragraphs = []
    # Document is a context manager: the handle is closed even if extraction
    # raises (the original leaked it on error).
    with fitz.open(pdf_path) as document:
        for page in document:
            # "paragraphs" is not a documented get_text() mode in PyMuPDF;
            # "text" is the documented plain-text extraction option.
            text = page.get_text("text")
            paragraphs.extend(
                frag for frag in text.split('。') if frag.strip()
            )
    return paragraphs
def load_pdf_pages(filename):
    """Return each page's extracted text from the PDF, one entry per page."""
    reader = PdfReader(filename)
    return [page.extract_text() for page in reader.pages]
def exec_translate(file):
    """Stage the upload and read its pages.

    NOTE(review): the extracted pages are never used and nothing is
    returned; this function is not wired into the UI and appears to be
    an earlier draft superseded by exec_tran.
    """
    upload_file(file)
    page_texts = load_pdf_pages(file.name)
# Minimal Gradio UI: an upload button ("upload pdf file") feeds exec_tran,
# whose returned result path is exposed for download via a File component.
with gr.Blocks() as app:
    file_output = gr.File()
    upload_button = gr.UploadButton(
        "上传pdf文件", file_types=["pdf"], file_count="single"
    )
    upload_button.upload(exec_tran, upload_button, file_output)
app.launch()