mikeion committed on
Commit
fd32063
·
1 Parent(s): 1c6a5e4

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +21 -36
app.py CHANGED
@@ -59,47 +59,32 @@ class Chatbot():
59
  _ = page.extract_text(visitor_text=visitor_body)
60
  print(f'Page {i} text", {page_text}')
61
 
62
- # Instantiate variables to be used in the loop. The first text element in the page_text list is used to initialize the variables
63
- prev_y = None
64
- prev_font_size = None
65
- paragraph = ''
66
 
67
- # Iterate through the page_text list and add the text to the paragraph string.
68
- # y_diff and font_size_diff are used to separate paragraphs into different elements in the paper_text list by
69
- # checking the y coordinate and the font size of the current text element and comparing it to the previous text element
70
- for idx, t in enumerate(page_text):
71
- if prev_y is not None and prev_font_size is not None:
72
- y_diff = abs(t['y'] - prev_y)
73
- font_size_diff = abs(t['fontsize'] - prev_font_size)
74
-
75
- # y_diff > 10 and font_size_diff > 1 are used to separate paragraphs into different elements in the paper_text list
76
- if y_diff > 10 or font_size_diff > 1:
77
- # Add paragraph to paper_text when the y_diff is too large or the font size is too different
78
- # This is to separate paragraphs into different elements in the paper_text list
79
- # This is done by checking the y coordinate and the font size of the current text element
80
- paper_text.append({
81
- 'fontsize': prev_font_size,
82
- 'text': paragraph.strip(),
83
  'page': i
84
  })
85
- paragraph = ''
86
- else:
87
- y_diff = 0
88
- font_size_diff = 0
89
- # Add text to paragraph, and update the variables.
90
- paragraph += f" {t['text']}"
91
- prev_y = t['y']
92
- prev_font_size = t['fontsize']
93
-
94
- # Add last paragraph when reaching the end of the page_text
95
-
96
- if idx == len(page_text) - 1:
97
- paper_text.append({
98
- 'fontsize': prev_font_size,
99
- 'text': paragraph.strip(),
100
  'page': i
101
  })
102
-
 
 
103
  print("Done parsing paper")
104
  print(paper_text)
105
 
 
59
  _ = page.extract_text(visitor_text=visitor_body)
60
  print(f'Page {i} text", {page_text}')
61
 
62
+
63
+ blob_font_size = None
64
+ blob_text = ''
65
+ processed_text = []
66
 
67
+ for t in page_text:
68
+ if t['fontsize'] == blob_font_size:
69
+ blob_text += f" {t['text']}"
70
+ if len(blob_text) >= 2000:
71
+ processed_text.append({
72
+ 'fontsize': blob_font_size,
73
+ 'text': blob_text,
 
 
 
 
 
 
 
 
 
74
  'page': i
75
  })
76
+ blob_font_size = None
77
+ blob_text = ''
78
+ else:
79
+ if blob_font_size is not None and len(blob_text) >= 1:
80
+ processed_text.append({
81
+ 'fontsize': blob_font_size,
82
+ 'text': blob_text,
 
 
 
 
 
 
 
 
83
  'page': i
84
  })
85
+ blob_font_size = t['fontsize']
86
+ blob_text = t['text']
87
+ paper_text += processed_text
88
  print("Done parsing paper")
89
  print(paper_text)
90