tommasobaldi committed on
Commit
f6ab2e2
1 Parent(s): 42780ef

working on text splitting

Browse files
Files changed (1) hide show
  1. app.py +5 -2
app.py CHANGED
@@ -2,7 +2,7 @@ import os
2
  from typing import AnyStr
3
  import nltk
4
  import streamlit as st
5
- from transformers import pipeline
6
 
7
 
8
  def main() -> None:
@@ -53,8 +53,10 @@ def main() -> None:
53
  accumulated_lists = []
54
  result_list = []
55
  cumulative_token_length = 0
 
56
  for sentence in sentences:
57
- token_list = [token for token in nltk.word_tokenize(sentence)]
 
58
  token_length = len(token_list)
59
  if token_length + cumulative_token_length > split_token_length and result_list:
60
  accumulated_lists.append(join_sentences(result_list))
@@ -68,6 +70,7 @@ def main() -> None:
68
  return accumulated_lists
69
 
70
  pipe = create_pipeline()
 
71
 
72
  if "target_text" not in st.session_state:
73
  st.session_state.target_text = ""
 
2
  from typing import AnyStr
3
  import nltk
4
  import streamlit as st
5
+ from transformers import pipeline, AutoTokenizer
6
 
7
 
8
  def main() -> None:
 
53
  accumulated_lists = []
54
  result_list = []
55
  cumulative_token_length = 0
56
+
57
  for sentence in sentences:
58
+ # token_list = [token for token in nltk.word_tokenize(sentence)]
59
+ token_list = tokenizer(sentence, max_length=1024, truncation=True, padding="max_length", return_tensors="pt")
60
  token_length = len(token_list)
61
  if token_length + cumulative_token_length > split_token_length and result_list:
62
  accumulated_lists.append(join_sentences(result_list))
 
70
  return accumulated_lists
71
 
72
  pipe = create_pipeline()
73
+ tokenizer = AutoTokenizer.from_pretrained("ML-unipi/bart-large-tos")
74
 
75
  if "target_text" not in st.session_state:
76
  st.session_state.target_text = ""