Spaces:
Runtime error
Runtime error
Commit
·
f6ab2e2
1
Parent(s):
42780ef
working on text splitting
Browse files
app.py
CHANGED
@@ -2,7 +2,7 @@ import os
|
|
2 |
from typing import AnyStr
|
3 |
import nltk
|
4 |
import streamlit as st
|
5 |
-
from transformers import pipeline
|
6 |
|
7 |
|
8 |
def main() -> None:
|
@@ -53,8 +53,10 @@ def main() -> None:
|
|
53 |
accumulated_lists = []
|
54 |
result_list = []
|
55 |
cumulative_token_length = 0
|
|
|
56 |
for sentence in sentences:
|
57 |
-
token_list = [token for token in nltk.word_tokenize(sentence)]
|
|
|
58 |
token_length = len(token_list)
|
59 |
if token_length + cumulative_token_length > split_token_length and result_list:
|
60 |
accumulated_lists.append(join_sentences(result_list))
|
@@ -68,6 +70,7 @@ def main() -> None:
|
|
68 |
return accumulated_lists
|
69 |
|
70 |
pipe = create_pipeline()
|
|
|
71 |
|
72 |
if "target_text" not in st.session_state:
|
73 |
st.session_state.target_text = ""
|
|
|
2 |
from typing import AnyStr
|
3 |
import nltk
|
4 |
import streamlit as st
|
5 |
+
from transformers import pipeline, AutoTokenizer
|
6 |
|
7 |
|
8 |
def main() -> None:
|
|
|
53 |
accumulated_lists = []
|
54 |
result_list = []
|
55 |
cumulative_token_length = 0
|
56 |
+
|
57 |
for sentence in sentences:
|
58 |
+
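# token_list = [token for token in nltk.word_tokenize(sentence)]
# NOTE(review): the tokenizer call below uses padding="max_length" with max_length=1024 and returns an encoding object rather than a list of tokens, so len(token_list) presumably no longer reflects this sentence's token count — confirm, and if so count the unpadded input_ids instead.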
|
59 |
+
token_list = tokenizer(sentence, max_length=1024, truncation=True, padding="max_length", return_tensors="pt")
|
60 |
token_length = len(token_list)
|
61 |
if token_length + cumulative_token_length > split_token_length and result_list:
|
62 |
accumulated_lists.append(join_sentences(result_list))
|
|
|
70 |
return accumulated_lists
|
71 |
|
72 |
pipe = create_pipeline()
|
73 |
+
tokenizer = AutoTokenizer.from_pretrained("ML-unipi/bart-large-tos")
|
74 |
|
75 |
if "target_text" not in st.session_state:
|
76 |
st.session_state.target_text = ""
|