belgrano91 committed
Commit 8228dae · 1 Parent(s): d32a483
Upload functions.py
Files changed: functions.py (+226, -0)
functions.py
ADDED
@@ -0,0 +1,226 @@
#*********************************************************************
#
# This file could be a potential starting point of the project.
# For now it contains only functions used throughout the files, but
# in the future it could contain more complex structures.
#
#*********************************************************************
import pdfplumber
import docx2txt
import os
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sentence_transformers import SentenceTransformer, models, util
import nltk
from nltk.tokenize import sent_tokenize, wordpunct_tokenize
nltk.download("punkt")


def reading_word(string):
    # Extract the raw text of the .docx file located at the given path.
    text = docx2txt.process(string)
    return text


def reading_pdf(string):
    all_text = ""
    with pdfplumber.open(string) as pdf:
        for pdf_page in pdf.pages:
            # Drop large characters (titles/headings) before extracting the text.
            bold = pdf_page.filter(lambda obj: not (obj["object_type"] == "char" and obj["size"] >= 10))
            single_page_text = bold.extract_text(x_tolerance=2)
            # Separate each page's text with a newline.
            all_text = all_text + '\n' + single_page_text
    return all_text


def reading_file(string):
    """
    -----------------------------------------------------------------------------
    This function takes as argument the path of the file we want to analyze and
    dispatches to the appropriate Python library depending on the file type.
    For the moment only PDF and Word (.docx) files are supported.

    Returns: long string with all the sentences in the document.
    -----------------------------------------------------------------------------
    Input:

        string: path of the file we want to analyze
    """
    ext = os.path.splitext(string)[-1].lower()
    if ext == ".pdf":
        text = reading_pdf(string)
    elif ext == ".docx":
        text = reading_word(string)
    else:
        raise ValueError("Unknown file format.")
    return text

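
# --- Usage sketch (illustrative only; "report.pdf" is a placeholder path, not part of the project) ---
# raw_text = reading_file("report.pdf")   # dispatches to reading_pdf() or reading_word() by extension
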
def splitting(word: str, text):
    # Split the text into lines, sentences, paragraphs or words.
    if word == "line":
        tok_text = list(filter(lambda a: a != '', text.split('\n')))  # remove empty lines
    elif word == "sentences":
        tok_text = sent_tokenize(text)
    elif word == "paragraphs":
        tok_text = re.split(r'\n{2,}', text)
        # keep only paragraphs long enough to be meaningful
        tok_text = [i for i in tok_text if len(i) >= 50]
    elif word == "words":
        tok_text = wordpunct_tokenize(text)
    return tok_text


def filtering(text):
    """
    -----------------------------------------------------------------------------
    This function takes as argument the string obtained in the reading step and
    filters out undesired characters.

    Potential things to filter: table of contents, titles, formulas, references,
    tables (?).

    Returns: long string with all the sentences in the document.
    -----------------------------------------------------------------------------
    Input:

        text: string obtained in the previous reading step.
    """
    clean1 = re.sub(r"\d{1,}.\d{1,}.+", "", text)  # removing numbering from the table of contents
    clean1 = re.sub(r"\w{1,} \w{1,} \.{4,} \d{1,}\d{1,}\n|\w{1,} \.{4,} \d{1,}\d{1,}\n|\w{1,} \w{1,} \w{1,} \.{4,} \d{1,}\d{1,}\n", "", clean1)  # removing table-of-contents entries
    clean1 = re.sub(r" \n\d{1,} \n | \n\d{1,} \n \n |\d{1,}\. \w{1,} \w{1,}", "", clean1)  # removing page numbers and numbered headings
    clean1 = re.sub(r"\.{4,} \d{1,}|\.{4,} Error! Bookmark not defined.", " ", clean1)  # filtering the index
    clean1 = re.sub(r"\n\n\n\n\n+|\n \n+", " ", clean1)  # filtering long page jumps
    clean1 = re.sub(r"\no |\n\uf0b7", "", clean1)  # filtering bullet characters
    return clean1


def ctrlf(words: list, text):
    # Return every sentence of the text that contains one of the given words.
    b = []
    for word in words:
        a = re.findall(fr"[^.]* {word} [^.]*\.", text)
        # case-insensitive alternative:
        # a = re.findall(fr"(?i)\b{word}\b [^.]*\.", text)
        for i in range(len(a)):
            b = b + [a[i]]
    return b

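
# --- Usage sketch (illustrative only; raw_text comes from the reading sketch above,
#     and the keywords are placeholders, not project-specific terms) ---
# clean_text = filtering(raw_text)                                # strip table-of-contents noise, page jumps, bullets
# sentences = splitting("sentences", clean_text)                  # list of sentences
# matches = ctrlf(["sustainability", "emissions"], clean_text)    # sentences containing the keywords
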
def everything_vs_word(query, corpus, model_name, number=5, score_function=util.cos_sim, ax=None):
    """
    -----------------------------------------------------------------------------
    This function takes as arguments the query, the corpus we want to compare it
    against, the number of matches we want to show (5 by default), the model used,
    and the metric used to compute the similarity (cosine similarity by default).

    Returns: bar plot of the similarity scores.
    -----------------------------------------------------------------------------
    Input:

        query: string
        corpus: string or list of strings (usually the latter for a document --> list of sentences)
        number: int
        model_name: string
        score_function: function
        ax: matplotlib Axes object
    """
    # Load the sentence-transformers model.
    model = SentenceTransformer(model_name)

    # Encode corpus and query with the chosen model.
    corpus_embedding = model.encode(corpus, convert_to_tensor=True)
    query_embedding = model.encode(query, convert_to_tensor=True)

    # Semantic search gives a list of lists composed of dictionaries.
    hits = util.semantic_search(query_embedding, corpus_embedding, top_k=number, score_function=score_function)
    hits = hits[0]

    scoring = []
    corp = []
    for hit in hits:
        scoring.append(hit['score'])
        corp.append(corpus[hit['corpus_id']])

    # Build a dataframe for easier plotting.
    data = pd.DataFrame(np.column_stack([corp, scoring]),
                        columns=['Expression', 'Score'])
    data['Score'] = data['Score'].astype('float')
    data = data.sort_values(by=['Score'], ascending=False)

    return sns.barplot(data=data.reset_index(), ax=ax, x='Score', y='Expression')


def sim(query, corpus, model_name, number=5, score_function=util.cos_sim):
    # Same as everything_vs_word, but returns the scored dataframe instead of a plot.
    model = SentenceTransformer(model_name)

    # Encode corpus and query with the chosen model.
    corpus_embedding = model.encode(corpus, convert_to_tensor=True)
    query_embedding = model.encode(query, convert_to_tensor=True)

    # Semantic search gives a list of lists composed of dictionaries.
    hits = util.semantic_search(query_embedding, corpus_embedding, top_k=number, score_function=score_function)
    hits = hits[0]

    scoring = []
    corp = []
    for hit in hits:
        scoring.append(hit['score'])
        corp.append(corpus[hit['corpus_id']])

    # Build a dataframe for easier handling of the results.
    data = pd.DataFrame(np.column_stack([corp, scoring]),
                        columns=['Expression', 'Score'])
    data['Score'] = data['Score'].astype('float')
    data = data.sort_values(by=['Score'], ascending=False)
    return data


def sim_2(query: list, corpus, model_name, threshold, number=5, score_function=util.cos_sim):
    # Run sim() for every query and merge the results,
    # keeping each expression only once (with its best score).
    frames = []
    for q in query:
        frames = frames + [sim(q, corpus, model_name=model_name, number=number, score_function=score_function)]

    result = pd.concat(frames)
    result = result.sort_values(by=['Score'], ascending=False)
    result.drop_duplicates(subset=['Expression'], inplace=True)
    return result


############ EXTRA BALL ################
# Detecting the conclusion and getting all the sentences of that paragraph for future use.
def conclusion():
    return


########## Get a function with the distribution of the results per word
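
# --- Usage sketch (illustrative only) ---
# "all-MiniLM-L6-v2" is just one possible SentenceTransformer checkpoint, not a project requirement,
# and the queries below are placeholders; `sentences` comes from the splitting sketch above.
# ranking = sim("renewable energy targets", sentences,
#               model_name="all-MiniLM-L6-v2", number=5)   # dataframe with the 5 closest sentences
# merged = sim_2(["renewable energy targets", "carbon footprint"], sentences,
#                model_name="all-MiniLM-L6-v2", threshold=0.5, number=5)
# everything_vs_word("renewable energy targets", sentences,
#                    model_name="all-MiniLM-L6-v2", number=5)   # same search, but plotted as a bar chart
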