Update README.md
Browse files
README.md
CHANGED
@@ -52,6 +52,57 @@ print(summary)
|
|
52 |
|
53 |
包头警方发布一起利用AI实施电信诈骗典型案例:法人代表10分钟内被骗430万元
|
54 |
```
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
55 |
## Credits
|
56 |
This model is trained and maintained by KongYang from Shanghai Jiao Tong University. For any questions, please reach out to me on WeChat (ID: kongyang).
|
57 |
|
|
|
52 |
|
53 |
包头警方发布一起利用AI实施电信诈骗典型案例:法人代表10分钟内被骗430万元
|
54 |
```
|
55 |
+
|
56 |
+
## If you need a longer summary, refer to the following code 如果需要更长的摘要,参考如下代码:
|
57 |
+
|
58 |
+
```python
|
from transformers import MT5ForConditionalGeneration, T5Tokenizer

# Load the fine-tuned checkpoint once at module level so the model and its
# matching tokenizer can be reused across calls.
tokenizer_heack = T5Tokenizer.from_pretrained("heack/HeackMT5-ZhSum100k")
model_heack = MT5ForConditionalGeneration.from_pretrained("heack/HeackMT5-ZhSum100k")
|
63 |
+
|
64 |
+
|
65 |
+
def _split_text(text, length):
|
66 |
+
chunks = []
|
67 |
+
start = 0
|
68 |
+
while start < len(text):
|
69 |
+
if len(text) - start > length:
|
70 |
+
pos_forward = start + length
|
71 |
+
pos_backward = start + length
|
72 |
+
pos = start + length
|
73 |
+
while (pos_forward < len(text)) and (pos_backward >= 0) and (pos_forward < 20 + pos) and (pos_backward + 20 > pos) and text[pos_forward] not in {'.', '。',',',','} and text[pos_backward] not in {'.', '。',',',','}:
|
74 |
+
pos_forward += 1
|
75 |
+
pos_backward -= 1
|
76 |
+
if pos_forward - pos >= 20 and pos_backward <= pos - 20:
|
77 |
+
pos = start + length
|
78 |
+
elif text[pos_backward] in {'.', '。',',',','}:
|
79 |
+
pos = pos_backward
|
80 |
+
else:
|
81 |
+
pos = pos_forward
|
82 |
+
chunks.append(text[start:pos+1])
|
83 |
+
start = pos + 1
|
84 |
+
else:
|
85 |
+
chunks.append(text[start:])
|
86 |
+
break
|
87 |
+
# Combine last chunk with previous one if it's too short
|
88 |
+
if len(chunks) > 1 and len(chunks[-1]) < 100:
|
89 |
+
chunks[-2] += chunks[-1]
|
90 |
+
chunks.pop()
|
91 |
+
return chunks
|
92 |
+
|
93 |
+
def get_summary_heack(text, each_summary_length=150):
    """Summarize a long text by chunking it and summarizing each chunk.

    The input is cut into ~300-character pieces via ``_split_text``; each
    piece is summarized independently (beam search, 4 beams) and the
    partial summaries are joined with single spaces.
    """
    def _summarize_piece(piece):
        # Encode with the same "summarize: " task prefix used in training.
        token_ids = tokenizer_heack.encode(
            "summarize: " + piece,
            return_tensors='pt',
            max_length=512,
            truncation=True,
        )
        generated = model_heack.generate(
            token_ids,
            max_length=each_summary_length,
            num_beams=4,
            length_penalty=1.5,
            no_repeat_ngram_size=2,
        )
        return tokenizer_heack.decode(generated[0], skip_special_tokens=True)

    return " ".join(_summarize_piece(piece) for piece in _split_text(text, 300))
|
102 |
+
|
103 |
+
|
104 |
+
```
|
105 |
+
|
106 |
## Credits
|
107 |
This model is trained and maintained by KongYang from Shanghai Jiao Tong University. For any questions, please reach out to me at my WeChat ID: kongyang.
|
108 |
|