JunzhaoSun committed · Commit 14f33b1 · 1 parent: 85a6cce

init
Files changed:
- app.py +93 -0
- requirements.txt +3 -0
app.py
ADDED
@@ -0,0 +1,93 @@
#!/usr/local/bin/python3
# -*- coding:utf-8 -*-

import gradio as gr
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

checkpoint = "gpt2-large"
# checkpoint = "/innev/open-ai/huggingface/models/gpt2-large"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForCausalLM.from_pretrained(checkpoint)


def generate(text):
    # text = 'Who was Jim Henson ? Jim Henson was a'

    # Encode the text; this example encodes to
    # [8241, 373, 5395, 367, 19069, 5633, 5395, 367, 19069, 373, 257]
    indexed_tokens = tokenizer.encode(text)
    # Convert to a PyTorch tensor:
    # tensor([[8241, 373, 5395, 367, 19069, 5633, 5395, 367, 19069, 373, 257]])
    # shape is torch.Size([1, 11])
    tokens_tensor = torch.tensor([indexed_tokens])

    # Put the model in evaluation mode, which deactivates dropout and
    # similar modules. (huggingface/transformers models default to eval mode.)
    model.eval()

    # Predict scores for all tokens
    with torch.no_grad():
        # Feeding in the input tensor yields the model's output.
        # outputs is a tuple; all huggingface/transformers models return tuples.
        # Here it has two elements: the first is the prediction scores
        # (pre-softmax, also called logits), the second is `past`, the
        # key/value pairs from the attention computation.
        # We only need the first element.
        outputs = model(tokens_tensor)
        # predictions has shape torch.Size([1, 11, 50257]):
        # the pre-softmax prediction score (the logits) for each of the 11 tokens
        predictions = outputs[0]

    # To predict the next word, use the logits of the last token
    # in the first batch entry.
    # predicted_index = 582, obtained as the index of the highest score
    predicted_index = torch.argmax(predictions[0, -1, :]).item()
    # Decode back into text
    predicted_text = tokenizer.decode(indexed_tokens + [predicted_index])
    # predicted_text = tokenizer.decode([predicted_index])
    # Decoded text: 'Who was Jim Henson? Jim Henson was a man'
    # The word 'man' was predicted successfully.

    return predicted_text


def doloop(prompts):
    text = prompts
    total = 1
    # Keep generating one token at a time until the text ends with a period
    # or 20 iterations have run; guard against empty input.
    while text and text[-1] != "." and total < 20:
        text = generate(text)
        print("Index %s: %s" % (total, text))
        total = total + 1

    return text, total


title = "GPT2 large"

description = """
A simple sentence-completion demo using the GPT2 model: enter the start of a sentence and it predicts the rest.

Uses the original model without fine-tuning. Only English input and output are supported.
"""

# Each example row needs one value per input component (a single text box
# here), so each row is a one-element list.
examples = [
    ["Who was Jim Henson ? Jim Henson was a"],
    ["My name is Julien and I like to"],
    ["My name is Thomas and my main"],
    ["My name is Mariama, my favorite"],
    ["My name is Clara and I am"],
]

gr.Interface(
    fn=doloop,
    inputs=gr.Text(label="Input prompt"),
    outputs=[
        gr.Text(label="Completed output"),
        gr.Text(label="Number of iterations"),
    ],
    title=title,
    description=description,
    examples=examples,
).launch()
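For reference, the token-by-token argmax loop in `generate`/`doloop` is hand-rolled greedy decoding. A minimal sketch of the same behaviour using the library's built-in `model.generate` (assuming the same `gpt2-large` checkpoint; this snippet is not part of the commit):

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("gpt2-large")
model = AutoModelForCausalLM.from_pretrained("gpt2-large")
model.eval()

# Encode the prompt as a batch of one sequence
input_ids = tokenizer.encode("Who was Jim Henson ? Jim Henson was a",
                             return_tensors="pt")
with torch.no_grad():
    # do_sample=False picks the argmax token at each step (greedy decoding),
    # matching the manual loop in app.py
    output_ids = model.generate(input_ids, max_new_tokens=20, do_sample=False,
                                pad_token_id=tokenizer.eos_token_id)
print(tokenizer.decode(output_ids[0]))

One practical difference: `doloop` stops at the first period or after 20 iterations, while `max_new_tokens` simply caps the number of generated tokens; `generate` also avoids re-encoding the full text on every step.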
requirements.txt
ADDED
@@ -0,0 +1,3 @@
git+https://github.com/huggingface/transformers.git
torch
gradio
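On Hugging Face Spaces, requirements.txt is installed automatically and app.py serves as the entry point; to try the demo locally, one would typically run `pip install -r requirements.txt` followed by `python app.py`.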