tommasobaldi committed on
Commit
f6ab2e2
1 Parent(s): 42780ef

working on text splitting

Browse files
Files changed (1) hide show
  1. app.py +5 -2
app.py CHANGED
@@ -2,7 +2,7 @@ import os
2
  from typing import AnyStr
3
  import nltk
4
  import streamlit as st
5
- from transformers import pipeline
6
 
7
 
8
  def main() -> None:
@@ -53,8 +53,10 @@ def main() -> None:
53
  accumulated_lists = []
54
  result_list = []
55
  cumulative_token_length = 0
 
56
  for sentence in sentences:
57
- token_list = [token for token in nltk.word_tokenize(sentence)]
 
58
  token_length = len(token_list)
59
  if token_length + cumulative_token_length > split_token_length and result_list:
60
  accumulated_lists.append(join_sentences(result_list))
@@ -68,6 +70,7 @@ def main() -> None:
68
  return accumulated_lists
69
 
70
  pipe = create_pipeline()
 
71
 
72
  if "target_text" not in st.session_state:
73
  st.session_state.target_text = ""
 
2
  from typing import AnyStr
3
  import nltk
4
  import streamlit as st
5
+ from transformers import pipeline, AutoTokenizer
6
 
7
 
8
  def main() -> None:
 
53
  accumulated_lists = []
54
  result_list = []
55
  cumulative_token_length = 0
56
+
57
  for sentence in sentences:
58
+ # token_list = [token for token in nltk.word_tokenize(sentence)]
59
+ token_list = tokenizer(sentence, max_length=1024, truncation=True, padding="max_length", return_tensors="pt")
60
  token_length = len(token_list)
61
  if token_length + cumulative_token_length > split_token_length and result_list:
62
  accumulated_lists.append(join_sentences(result_list))
 
70
  return accumulated_lists
71
 
72
  pipe = create_pipeline()
73
+ tokenizer = AutoTokenizer.from_pretrained("ML-unipi/bart-large-tos")
74
 
75
  if "target_text" not in st.session_state:
76
  st.session_state.target_text = ""