Spaces:

mikeion
/

research_guru

Sleeping

App Files Community

mikeion committed on Mar 25, 2023

Commit

fd32063

1 Parent(s): 1c6a5e4

Update app.py

Browse files

Files changed (1) hide show

app.py +21 -36

app.py CHANGED Viewed

@@ -59,47 +59,32 @@ class Chatbot():
             _ = page.extract_text(visitor_text=visitor_body)
             print(f'Page {i} text", {page_text}')
-            # Instantiate variables to be used in the loop. The first text element in the page_text list is used to initialize the variables
-            prev_y = None
-            prev_font_size = None
-            paragraph = ''
-            # Iterate through the page_text list and add the text to the paragraph string.
-            # y_diff and font_size_diff are used to separate paragraphs into different elements in the paper_text list by
-            # checking the y coordinate and the font size of the current text element and comparing it to the previous text element
-            for idx, t in enumerate(page_text):
-                if prev_y is not None and prev_font_size is not None:
-                    y_diff = abs(t['y'] - prev_y)
-                    font_size_diff = abs(t['fontsize'] - prev_font_size)
-                    # y_diff > 10 and font_size_diff > 1 are used to separate paragraphs into different elements in the paper_text list
-                    if y_diff > 10 or font_size_diff > 1:
-                        # Add paragraph to paper_text when the y_diff is too large or the font size is too different
-                        # This is to separate paragraphs into different elements in the paper_text list
-                        # This is done by checking the y coordinate and the font size of the current text element
-                        paper_text.append({
-                            'fontsize': prev_font_size,
-                            'text': paragraph.strip(),
                             'page': i
                         })
-                        paragraph = ''
-                    else:
-                        y_diff = 0
-                        font_size_diff = 0
-                    # Add text to paragraph, and update the variables.
-                    paragraph += f" {t['text']}"
-                    prev_y = t['y']
-                    prev_font_size = t['fontsize']
-                    # Add last paragraph when reaching the end of the page_text
-                    if idx == len(page_text) - 1:
-                        paper_text.append({
-                            'fontsize': prev_font_size,
-                            'text': paragraph.strip(),
                             'page': i
                         })
         print("Done parsing paper")
         print(paper_text)

             _ = page.extract_text(visitor_text=visitor_body)
             print(f'Page {i} text", {page_text}')
+            blob_font_size = None
+            blob_text = ''
+            processed_text = []
+            for t in page_text:
+                if t['fontsize'] == blob_font_size:
+                    blob_text += f" {t['text']}"
+                    if len(blob_text) >= 2000:
+                        processed_text.append({
+                            'fontsize': blob_font_size,
+                            'text': blob_text,
                             'page': i
                         })
+                        blob_font_size = None
+                        blob_text = ''
+                else:
+                    if blob_font_size is not None and len(blob_text) >= 1:
+                        processed_text.append({
+                            'fontsize': blob_font_size,
+                            'text': blob_text,
                             'page': i
                         })
+                    blob_font_size = t['fontsize']
+                    blob_text = t['text']
+                paper_text += processed_text
         print("Done parsing paper")
         print(paper_text)