tommasobaldi committed on
Commit
87fbf70
1 Parent(s): 69f90b2

working on text splitting

Browse files
Files changed (1) hide show
  1. app.py +18 -14
app.py CHANGED
@@ -97,29 +97,33 @@ def main() -> None:
97
  # with st.spinner("Summarizing in progress..."):
98
  # return tuple(summarizer.abstractive_summary(list(summary_sentence)))
99
 
100
def split_text(text: str) -> list:
    """Split *text* into blocks of consecutive sentences, each holding
    fewer than 500 tokens.

    The text is sentence-tokenized with NLTK; sentences are accumulated
    into a block until adding the next one would reach the token budget,
    at which point a new block is started.

    :param text: the raw input text to split.
    :return: list of text blocks (strings), in original order.
    """
    sentences = sent_tokenize(text, language="english")

    token_count = 0
    text_block = ""
    result = []
    for sentence in sentences:
        tokens = word_tokenize(sentence, language="english", preserve_line=True)
        if token_count + len(tokens) < 500:
            token_count += len(tokens)
            # Bug fix: `" ".join(sentence)` iterated the sentence's
            # characters and inserted a space between each of them.
            # Append the sentence as a whole instead.
            text_block = f"{text_block} {sentence}" if text_block else sentence
        else:
            result.append(text_block)
            text_block = sentence
            token_count = len(tokens)
    # Bug fix: the final block (possibly the only one) was never
    # appended, so short inputs returned an empty list.
    if text_block:
        result.append(text_block)
    return result
 
116
 
117
pipe = create_pipeline()

if summarize_button:
    # Bug fix: `target_text_input is not ""` tests object *identity*
    # against a string literal (implementation-defined, and a
    # SyntaxWarning on modern CPython). Use equality instead.
    if target_text_input != "":
        with st.spinner("Summarizing in progress..."):
            sentences = split_text(target_text_input)
            # Echo each block for now; summarization is still disabled.
            for sentence in sentences:
                st.text(sentence)
                #output = pipe(sentence)
 
97
  # with st.spinner("Summarizing in progress..."):
98
  # return tuple(summarizer.abstractive_summary(list(summary_sentence)))
99
 
100
def join_sentences(sentences: list) -> str:
    """Join a list of sentence strings into one space-separated string.

    :param sentences: sentences to join, in order.
    :return: the sentences concatenated with single spaces.
    """
    # str.join already accepts any iterable of strings; the identity
    # list comprehension was redundant.
    return " ".join(sentences)
102
 
103
def split_sentences_by_token_length(sentences: list, split_token_length: int) -> list:
    """Group consecutive sentences into chunks whose combined token
    count stays within *split_token_length*.

    Tokens are counted with NLTK's word tokenizer, ignoring '.' tokens.
    A sentence that would push the running chunk past the limit starts
    a new chunk (an oversized first sentence still forms its own chunk).

    :param sentences: ordered list of sentence strings.
    :param split_token_length: maximum token count per chunk.
    :return: list of chunk strings, each a space-joined run of sentences.
    """
    chunks = []
    current_chunk = []
    current_token_count = 0
    for sentence in sentences:
        # Token count for this sentence, excluding bare periods.
        sentence_token_count = sum(
            1 for token in nltk.word_tokenize(sentence) if token not in ['.']
        )
        would_overflow = (
            current_token_count + sentence_token_count > split_token_length
        )
        if would_overflow and current_chunk:
            # Flush the accumulated chunk and restart with this sentence.
            chunks.append(join_sentences(current_chunk))
            current_chunk = [sentence]
            current_token_count = sentence_token_count
        else:
            current_chunk.append(sentence)
            current_token_count += sentence_token_count
    if current_chunk:
        chunks.append(join_sentences(current_chunk))
    return chunks
120
 
121
pipe = create_pipeline()

if summarize_button:
    # Bug fix: `is not ""` compared identity against a literal; use
    # equality instead.
    if target_text_input != "":
        with st.spinner("Summarizing in progress..."):
            # Bug fix: split_sentences_by_token_length() takes a *list of
            # sentences* plus a token limit, but was called with only the
            # raw text (TypeError: missing required positional argument).
            # Sentence-tokenize first; 500 matches the limit the previous
            # implementation used — TODO confirm the desired chunk size.
            sentences = split_sentences_by_token_length(
                nltk.sent_tokenize(target_text_input), 500
            )
            for sentence in sentences:
                st.text(sentence)
                #output = pipe(sentence)