Spaces:
Runtime error
Runtime error
cryptocalypse
committed on
Commit
·
103c053
1
Parent(s):
e865108
libs entropy and read files
Browse files- lib/entropy.py +131 -0
- lib/files.py +31 -0
lib/entropy.py
ADDED
@@ -0,0 +1,131 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import sys
|
2 |
+
import math
|
3 |
+
|
4 |
+
class TextProcessor:
    """Entropy measurement and heuristic tokenisation of a piece of text.

    The instance stores the text in ``texto``. ``entropy``, ``magic_split``,
    ``get_subTokens`` and ``tokenize`` operate on that stored text, while
    ``common_string``, ``rotate_string`` and ``rotate_compare`` are pure
    helpers that only use their arguments.
    """

    def __init__(self, texto):
        """Store the text to analyse.

        Args:
            texto: The string the text-level methods operate on.
        """
        self.texto = texto

    def entropy(self):
        """Return per-symbol counts and the Shannon entropy of the text.

        Returns:
            A ``(counts, entropy)`` tuple. ``counts`` maps every character of
            the text to its number of occurrences; ``entropy`` is measured in
            bits per symbol. An empty text yields ``({}, 0)``.
        """
        simbolos = {}
        for caracter in self.texto:
            simbolos[caracter] = simbolos.get(caracter, 0) + 1

        total_caracteres = len(self.texto)
        entropia = 0
        for count in simbolos.values():
            probabilidad = count / total_caracteres
            entropia -= probabilidad * math.log2(probabilidad)

        return simbolos, entropia

    def common_string(self, cadena1, cadena2):
        """Return the longest substring shared by both arguments.

        When several common substrings have the maximal length, the one that
        starts earliest in ``cadena1`` is returned. If the strings share no
        character, the empty string is returned.

        Args:
            cadena1: First string; the result is a slice of this one.
            cadena2: Second string.
        """
        best_len = 0
        best_end = 0
        # prev[j] is the length of the common run ending at the previous
        # character of cadena1 and at cadena2[j - 1].
        prev = [0] * (len(cadena2) + 1)
        for end1, char1 in enumerate(cadena1, start=1):
            curr = [0] * (len(cadena2) + 1)
            for end2, char2 in enumerate(cadena2, start=1):
                if char1 == char2:
                    curr[end2] = prev[end2 - 1] + 1
                    # Strict '>' keeps the earliest occurrence on ties.
                    if curr[end2] > best_len:
                        best_len = curr[end2]
                        best_end = end1
            prev = curr

        return cadena1[best_end - best_len:best_end]

    def magic_split(self):
        """Pick the character that most plausibly acts as a separator.

        For every character that occurs more than once, the gaps between
        consecutive occurrences are collected. The spread of a character is
        the difference between its largest and smallest gap. Characters whose
        spread is 0 or 1 are discarded, and the character with the smallest
        remaining spread is returned. Ties are resolved deterministically in
        favour of the character whose first repetition appears earliest.

        Returns:
            The selected separator character.

        Raises:
            ValueError: If no character has a gap spread greater than 1.
        """
        last_seen = {}
        gaps = {}
        for position, symbol in enumerate(self.texto):
            if symbol in last_seen:
                gaps.setdefault(symbol, []).append(position - last_seen[symbol])
            last_seen[symbol] = position

        mins = {}
        for symbol, distances in gaps.items():
            spread = max(distances) - min(distances)
            if spread > 1:
                mins[symbol] = spread

        if not mins:
            raise ValueError(
                "magic_split: no symbol with a gap spread greater than 1 "
                "was found in the text"
            )

        return min(mins, key=mins.get)

    def rotate_string(self, string, n):
        """Return ``string`` rotated left by ``n`` positions.

        The rotation wraps around, so ``n`` may exceed the string length.
        An empty string is returned unchanged.
        """
        if not string:
            return string
        indice = n % len(string)
        return string[indice:] + string[:indice]

    def rotate_compare(self, tokiA, tokiB):
        """Return the best common substring over all rotations of one token.

        The lexicographically larger token is rotated through every one of
        its rotations and each rotation is compared with the other token via
        ``common_string``. Because the rotated token is chosen by comparing
        the two arguments, the result does not depend on argument order.

        Only common substrings longer than one character and shorter than
        the rotated token qualify; the first longest one wins.

        Returns:
            The selected common substring, or ``""`` if none qualifies.
        """
        if tokiA >= tokiB:
            tokA, tokB = tokiA, tokiB
        else:
            tokA, tokB = tokiB, tokiA
        # One pass per distinct rotation of the token that is being rotated.
        ltokA = len(tokA)

        best_r = ""
        for shift in range(ltokA):
            rot = self.common_string(self.rotate_string(tokA, shift), tokB)
            if 1 < len(rot) < ltokA and len(rot) > len(best_r):
                best_r = rot

        return best_r

    def get_subTokens(self, spl):
        """Collect candidate sub-tokens shared between chunks of the text.

        The text is split on ``spl`` and every pair of distinct chunks is
        passed to ``rotate_compare``.

        Args:
            spl: Separator used to split the stored text.

        Returns:
            The distinct ``rotate_compare`` results in order of discovery.
            The list may contain ``""`` when some pair shares nothing.
        """
        # rotate_compare is order-independent, so each unordered pair of
        # distinct chunks only needs to be evaluated once.
        chunks = list(dict.fromkeys(self.texto.split(spl)))
        found = {}
        for position, tok in enumerate(chunks):
            for tok2 in chunks[position + 1:]:
                found[self.rotate_compare(tok, tok2)] = None

        return list(found)

    def tokenize(self, spliter_optimo):
        """Split every chunk of the text around its longest known sub-token.

        Args:
            spliter_optimo: Separator used to cut the text into chunks.

        Returns:
            A dict mapping each chunk that contains a sub-token (other than
            the chunk itself) to ``" <before>-<token>-<after>"``. The split
            happens at the first occurrence of the longest such sub-token and
            ``<after>`` keeps the whole remainder of the chunk. Chunks
            without a matching sub-token are omitted.
        """
        tokens = [tok for tok in self.get_subTokens(spliter_optimo) if tok != ""]
        tokenized_sentence = {}
        for txt in self.texto.split(spliter_optimo):
            best_len = 0
            for tok in tokens:
                if len(tok) <= best_len or len(tok) >= len(txt):
                    continue
                # partition keeps everything after the first occurrence, so
                # no text is lost when the token appears more than once.
                before, found, after = txt.partition(tok)
                if found:
                    best_len = len(tok)
                    tokenized_sentence[txt] = " " + before + "-" + tok + "-" + after
        return tokenized_sentence
|
121 |
+
|
122 |
+
|
123 |
+
# Example usage: python entropy.py "<text to analyse>"
# Guarded so that importing this module does not read sys.argv or print.
if __name__ == "__main__":
    if len(sys.argv) < 2:
        # Exit with a usage hint instead of an IndexError traceback.
        sys.exit("usage: entropy.py TEXT")

    texto_ejemplo = sys.argv[1]

    text_processor = TextProcessor(texto_ejemplo)
    spliter_optimo = text_processor.magic_split()
    print("Spliter óptimo:", spliter_optimo)
    print(text_processor.entropy())
    print(text_processor.tokenize(spliter_optimo))
|
131 |
+
|
lib/files.py
ADDED
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
|
3 |
+
class TextFinder:
    """Search the files directly inside a folder for sentences containing a text."""

    def __init__(self, folder):
        """Remember the folder to search.

        Args:
            folder: Path of the directory whose regular files are scanned.
                Sub-directories are not descended into.
        """
        self.folder = folder

    @staticmethod
    def _sentence_bounds(content, index, length):
        """Return ``(start, end)`` of the sentence around a match.

        A sentence is delimited by a newline or a full stop. When no
        delimiter precedes the match the sentence starts at the beginning of
        ``content``; when none follows it, the sentence runs to the end.
        """
        # rfind returns -1 when nothing is found, which maps to start 0.
        start = max(content.rfind('\n', 0, index), content.rfind('.', 0, index)) + 1
        # Look for the closing delimiter after the match, so a delimiter
        # inside the searched text does not cut the match itself.
        after = index + length
        candidates = [
            pos
            for pos in (content.find('\n', after), content.find('.', after))
            if pos != -1
        ]
        end = min(candidates) if candidates else len(content)
        return start, end

    def find_matches(self, text):
        """Return the sentence surrounding every occurrence of ``text``.

        Files are visited in sorted name order and read as UTF-8; files that
        cannot be decoded as UTF-8 are skipped. One entry is produced per
        occurrence, so a sentence containing ``text`` twice appears twice.

        Args:
            text: The substring to look for. An empty string yields no
                matches.

        Returns:
            A list of stripped sentences, in file order and then in order of
            occurrence within each file.
        """
        if not text:
            return []

        matches = []
        for file in sorted(os.listdir(self.folder)):
            file_path = os.path.join(self.folder, file)
            if not os.path.isfile(file_path):
                continue
            try:
                with open(file_path, 'r', encoding='utf-8') as f:
                    content = f.read()
            except UnicodeDecodeError:
                # Not a UTF-8 text file (e.g. binary data): nothing to search.
                continue

            index = content.find(text)
            while index != -1:
                start, end = self._sentence_bounds(content, index, len(text))
                matches.append(content[start:end].strip())
                index = content.find(text, index + 1)

        return matches
|
25 |
+
|
26 |
+
# Demo: list every sentence in 'example_folder' that mentions the search text.
if __name__ == "__main__":
    finder = TextFinder(folder='example_folder')
    matches = finder.find_matches(text='text_to_find')
    print(matches)
|
31 |
+
|