tommasobaldi committed on
Commit
87fbf70
1 Parent(s): 69f90b2

working on text splitting

Browse files
Files changed (1) hide show
  1. app.py +18 -14
app.py CHANGED
@@ -97,29 +97,33 @@ def main() -> None:
97
  # with st.spinner("Summarizing in progress..."):
98
  # return tuple(summarizer.abstractive_summary(list(summary_sentence)))
99
 
100
def split_text(text: str) -> list:
    """Split *text* into blocks of consecutive sentences, each holding
    fewer than 500 tokens.

    The text is sentence-tokenized with NLTK; sentences are accumulated
    into a block until adding the next one would reach the token budget,
    at which point a new block is started.

    :param text: the raw input text to split.
    :return: list of text blocks (strings), in original order.
    """
    sentences = sent_tokenize(text, language="english")

    token_count = 0
    text_block = ""
    result = []
    for sentence in sentences:
        tokens = word_tokenize(sentence, language="english", preserve_line=True)
        if token_count + len(tokens) < 500:
            token_count += len(tokens)
            # Bug fix: `" ".join(sentence)` iterated the sentence's
            # characters and inserted a space between each of them.
            # Append the sentence as a whole instead.
            text_block = f"{text_block} {sentence}" if text_block else sentence
        else:
            result.append(text_block)
            text_block = sentence
            token_count = len(tokens)
    # Bug fix: the final block (possibly the only one) was never
    # appended, so short inputs returned an empty list.
    if text_block:
        result.append(text_block)
    return result
 
116
 
117
pipe = create_pipeline()

if summarize_button:
    # Bug fix: `target_text_input is not ""` tests object *identity*
    # against a string literal (implementation-defined, and a
    # SyntaxWarning on modern CPython). Use equality instead.
    if target_text_input != "":
        with st.spinner("Summarizing in progress..."):
            sentences = split_text(target_text_input)
            # Echo each block for now; summarization is still disabled.
            for sentence in sentences:
                st.text(sentence)
                #output = pipe(sentence)
 
97
  # with st.spinner("Summarizing in progress..."):
98
  # return tuple(summarizer.abstractive_summary(list(summary_sentence)))
99
 
100
def join_sentences(sentences: list) -> str:
    """Join a list of sentence strings into one space-separated string.

    :param sentences: sentences to join, in order.
    :return: the sentences concatenated with single spaces.
    """
    # str.join already accepts any iterable of strings; the identity
    # list comprehension was redundant.
    return " ".join(sentences)
102
 
103
def split_sentences_by_token_length(sentences: list, split_token_length: int) -> list:
    """Group consecutive sentences into chunks whose combined token
    count stays within *split_token_length*.

    Tokens are counted with NLTK's word tokenizer, ignoring '.' tokens.
    A sentence that would push the running chunk past the limit starts
    a new chunk (an oversized first sentence still forms its own chunk).

    :param sentences: ordered list of sentence strings.
    :param split_token_length: maximum token count per chunk.
    :return: list of chunk strings, each a space-joined run of sentences.
    """
    chunks = []
    current_chunk = []
    current_token_count = 0
    for sentence in sentences:
        # Token count for this sentence, excluding bare periods.
        sentence_token_count = sum(
            1 for token in nltk.word_tokenize(sentence) if token not in ['.']
        )
        would_overflow = (
            current_token_count + sentence_token_count > split_token_length
        )
        if would_overflow and current_chunk:
            # Flush the accumulated chunk and restart with this sentence.
            chunks.append(join_sentences(current_chunk))
            current_chunk = [sentence]
            current_token_count = sentence_token_count
        else:
            current_chunk.append(sentence)
            current_token_count += sentence_token_count
    if current_chunk:
        chunks.append(join_sentences(current_chunk))
    return chunks
120
 
121
pipe = create_pipeline()

if summarize_button:
    # Bug fix: `is not ""` compared identity against a literal; use
    # equality instead.
    if target_text_input != "":
        with st.spinner("Summarizing in progress..."):
            # Bug fix: split_sentences_by_token_length() takes a *list of
            # sentences* plus a token limit, but was called with only the
            # raw text (TypeError: missing required positional argument).
            # Sentence-tokenize first; 500 matches the limit the previous
            # implementation used — TODO confirm the desired chunk size.
            sentences = split_sentences_by_token_length(
                nltk.sent_tokenize(target_text_input), 500
            )
            for sentence in sentences:
                st.text(sentence)
                #output = pipe(sentence)