belgrano91 committed on
Commit 8228dae · 1 Parent(s): d32a483

Upload functions.py

Files changed (1)
  1. functions.py +226 -0
functions.py ADDED
@@ -0,0 +1,226 @@
+ #*********************************************************************
+
+ # This file could serve as a first building block of the project.
+ # For now it only contains functions used throughout the other files,
+ # but in the future it could hold more complex structures.
+
+ #*********************************************************************
+ import pdfplumber
+ import docx2txt
+ import os
+ import re
+ import numpy as np
+ import pandas as pd
+ import matplotlib.pyplot as plt
+ import seaborn as sns
+ from sentence_transformers import SentenceTransformer, models, util
+ import nltk
+ from nltk.tokenize import sent_tokenize, wordpunct_tokenize
+
+ nltk.download("punkt")
+
+
+ def reading_word(string):
+     # Extract the raw text from a Word (.docx) document.
+     text = docx2txt.process(string)
+     return text
+
+
+ def reading_pdf(string):
+     all_text = ""
+     with pdfplumber.open(string) as pdf:
+         for pdf_page in pdf.pages:
+             # Drop characters with a font size of 10 or more (headings, bold titles).
+             filtered = pdf_page.filter(lambda obj: not (obj["object_type"] == "char" and obj["size"] >= 10))
+             single_page_text = filtered.extract_text(x_tolerance=2)
+             # Separate each page's text with a newline; extract_text may return None on empty pages.
+             if single_page_text:
+                 all_text = all_text + '\n' + single_page_text
+     return all_text
+
+
+ def reading_file(string):
+     """
+     -----------------------------------------------------------------------------
+
+     This function takes as argument the file that we want to analyze. Depending on
+     the file type, a different Python library is used. For the moment only PDF and
+     Word (.docx) files are detected.
+
+     Returns: Long string with all the sentences in the document.
+
+     -----------------------------------------------------------------------------
+
+     Input:
+
+     string: path of the file we want to analyze
+
+     """
+     ext = os.path.splitext(string)[-1].lower()
+     if ext == ".pdf":
+         text = reading_pdf(string)
+     elif ext == ".docx":
+         text = reading_word(string)
+     else:
+         # The original printed a message here; raising avoids returning an undefined variable.
+         raise ValueError("Unknown file format: " + ext)
+     return text
+
+
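+ # A minimal usage sketch (the file path is illustrative, not part of the original code):
+ #
+ #   raw_text = reading_file("report.pdf")   # dispatches to reading_pdf() based on the extension
+
+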
+ def splitting(word: str, text):
+     # Split the text at the requested granularity: "line", "sentences", "paragraphs" or "words".
+     if word == "line":
+         tok_text = list(filter(lambda a: a != '', text.splitlines()))  # remove empty lines
+     elif word == "sentences":
+         tok_text = sent_tokenize(text)
+     elif word == "paragraphs":
+         tok_text = re.split(r'\n{2,}', text)
+         # Drop very short paragraphs (fewer than 50 characters).
+         tok_text = [i for i in tok_text if len(i) >= 50]
+     elif word == "words":
+         tok_text = wordpunct_tokenize(text)
+     else:
+         raise ValueError("Unknown splitting mode: " + word)
+     return tok_text
+
+
+ def filtering(text):
+     """
+     -----------------------------------------------------------------------------
+
+     This function takes as argument the string obtained in the reading step and
+     filters out undesired characters.
+
+     Potential things to filter: table of contents, titles, formulas, references, tables (?)
+
+     Returns: Long string with all the sentences in the document.
+
+     -----------------------------------------------------------------------------
+
+     Input:
+
+     string: string obtained in the previous reading step.
+
+     """
+     clean1 = re.sub(r"\d{1,}.\d{1,}.+", "", text)  # remove numbering from the table of contents
+     clean1 = re.sub(r"\w{1,} \w{1,} \.{4,} \d{1,}\d{1,}\n|\w{1,} \.{4,} \d{1,}\d{1,}\n|\w{1,} \w{1,} \w{1,} \.{4,} \d{1,}\d{1,}\n", "", clean1)  # remove table-of-contents entries
+     clean1 = re.sub(r" \n\d{1,} \n | \n\d{1,} \n \n |\d{1,}\. \w{1,} \w{1,}", "", clean1)
+     clean1 = re.sub(r"\.{4,} \d{1,}|\.{4,} Error! Bookmark not defined.", " ", clean1)  # filter the index
+     clean1 = re.sub(r"\n\n\n\n\n+|\n \n+", " ", clean1)  # filter long page jumps
+     clean1 = re.sub(r"\no |\n\uf0b7", "", clean1)  # filter list bullets
+     return clean1
+
+
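+ # A minimal sketch of how the cleaning step fits after reading (file path is
+ # illustrative, not part of the original code):
+ #
+ #   clean_text = filtering(reading_file("report.pdf"))
+ #   sentences = splitting("sentences", clean_text)
+
+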
+ def ctrlf(words: list, text):
+     # Return every sentence in `text` that contains one of the given words (like Ctrl+F).
+     b = []
+     for word in words:
+         a = re.findall(fr"[^.]* {word} [^.]*\.", text)
+         #a = re.findall(fr"(?i)\b{word}\b [^.]*\.", text)  # case-insensitive variant
+         b = b + a
+     return b
+
+
+ def everything_vs_word(query, corpus, model_name, number=5, score_function=util.cos_sim, ax=None):
+     """
+     -----------------------------------------------------------------------------
+
+     This function takes as arguments the query we want to compare, the corpus we
+     compare it against, the number of matches to show (5 by default), the model
+     used, and the metric used to compute the similarity (cosine similarity by
+     default).
+
+     Returns: Bar plot of the top matches.
+
+     -----------------------------------------------------------------------------
+
+     Input:
+
+     query: String
+     corpus: String or list of strings (usually the latter for a document --> list of sentences)
+     number: Int
+     model_name: String
+     score_function: Function
+     ax: Axis object
+
+     """
+     # load the model
+     model = SentenceTransformer(model_name)
+
+     # encode corpus and query with the model
+     corpus_embedding = model.encode(corpus, convert_to_tensor=True)
+     query_embedding = model.encode(query, convert_to_tensor=True)
+
+     # semantic search returns a list (one entry per query) of lists of dictionaries
+     hits = util.semantic_search(query_embedding, corpus_embedding, top_k=number, score_function=score_function)
+     hits = hits[0]
+
+     scoring = []
+     corp = []
+     for hit in hits:
+         scoring.append(hit['score'])
+         corp.append(corpus[hit['corpus_id']])
+
+     # dataframe for easier plotting
+     data = pd.DataFrame(np.column_stack([corp, scoring]), columns=['Expression', 'Score'])
+     data['Score'] = data['Score'].astype('float')
+     data = data.sort_values(by=['Score'], ascending=False)
+
+     return sns.barplot(data=data.reset_index(), ax=ax, x='Score', y='Expression')
+
+
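+ # A minimal usage sketch (the model name, query and file path are illustrative,
+ # not from the original code):
+ #
+ #   sentences = splitting("sentences", filtering(reading_file("report.pdf")))
+ #   everything_vs_word("climate change", sentences, model_name="all-MiniLM-L6-v2", number=5)
+ #   plt.show()
+
+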
+ def sim(query, corpus, model_name, number=5, score_function=util.cos_sim):
+     # Same search as everything_vs_word, but return the scored matches as a DataFrame
+     # instead of plotting them.
+     model = SentenceTransformer(model_name)
+
+     # encode corpus and query with the model
+     corpus_embedding = model.encode(corpus, convert_to_tensor=True)
+     query_embedding = model.encode(query, convert_to_tensor=True)
+
+     # semantic search returns a list (one entry per query) of lists of dictionaries
+     hits = util.semantic_search(query_embedding, corpus_embedding, top_k=number, score_function=score_function)
+     hits = hits[0]
+
+     scoring = []
+     corp = []
+     for hit in hits:
+         scoring.append(hit['score'])
+         corp.append(corpus[hit['corpus_id']])
+
+     data = pd.DataFrame(np.column_stack([corp, scoring]), columns=['Expression', 'Score'])
+     data['Score'] = data['Score'].astype('float')
+     data = data.sort_values(by=['Score'], ascending=False)
+     return data
+
+
+ def sim_2(query: list, corpus, model_name, threshold, number=5, score_function=util.cos_sim):
+     # Run sim() for each query and merge the results into a single ranked DataFrame.
+     frames = []
+     for q in query:
+         frames = frames + [sim(q, corpus, model_name=model_name, number=number, score_function=score_function)]
+
+     result = pd.concat(frames)
+     result = result.sort_values(by=['Score'], ascending=False)
+     result.drop_duplicates(subset=['Expression'], inplace=True)
+     return result
+
+
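+ # A minimal usage sketch (queries and model name are illustrative, not from the
+ # original code); `sentences` would come from the reading/splitting steps above:
+ #
+ #   ranked = sim_2(["budget", "timeline"], sentences, model_name="all-MiniLM-L6-v2",
+ #                  threshold=0.5, number=5)
+
+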
+ ############ EXTRA BALL ################
+ # TODO: detect the conclusion and get all the sentences of that paragraph for future use.
+ def conclusion():
+     return
+
+
+ ########## TODO: add a function with the distribution of the results per word