seyia92coding committed
Commit cf8a101
1 Parent(s): b1fea4e
Upload fuzz.py

fuzz.py ADDED
@@ -0,0 +1,306 @@
#!/usr/bin/env python
# encoding: utf-8
from __future__ import unicode_literals
import platform
import warnings

try:
    from .StringMatcher import StringMatcher as SequenceMatcher
except ImportError:
    if platform.python_implementation() != "PyPy":
        warnings.warn('Using slow pure-python SequenceMatcher. Install python-Levenshtein to remove this warning')
    from difflib import SequenceMatcher

from . import utils


###########################
# Basic Scoring Functions #
###########################

@utils.check_for_none
@utils.check_for_equivalence
@utils.check_empty_string
def ratio(s1, s2):
    s1, s2 = utils.make_type_consistent(s1, s2)

    m = SequenceMatcher(None, s1, s2)
    return utils.intr(100 * m.ratio())


@utils.check_for_none
@utils.check_for_equivalence
@utils.check_empty_string
def partial_ratio(s1, s2):
    """Return the ratio of the most similar substring
    as a number between 0 and 100."""
    s1, s2 = utils.make_type_consistent(s1, s2)

    if len(s1) <= len(s2):
        shorter = s1
        longer = s2
    else:
        shorter = s2
        longer = s1

    m = SequenceMatcher(None, shorter, longer)
    blocks = m.get_matching_blocks()

    # each block represents a sequence of matching characters in a string
    # of the form (idx_1, idx_2, len)
    # the best partial match will block align with at least one of those blocks
    #   e.g. shorter = "abcd", longer = XXXbcdeEEE
    #   block = (1,3,3)
    #   best score === ratio("abcd", "Xbcd")
    scores = []
    for block in blocks:
        long_start = block[1] - block[0] if (block[1] - block[0]) > 0 else 0
        long_end = long_start + len(shorter)
        long_substr = longer[long_start:long_end]

        m2 = SequenceMatcher(None, shorter, long_substr)
        r = m2.ratio()
        if r > .995:
            return 100
        else:
            scores.append(r)

    return utils.intr(100 * max(scores))
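
# Illustrative usage sketch for the block-alignment logic above, kept as a
# doctest-style comment. The inputs come from the example in the comments;
# the score of 75 is what the difflib backend yields (ratio("abcd", "Xbcd"))
# and is assumed, not guaranteed, for other backends:
#
#   >>> partial_ratio("abcd", "XXXbcdeEEE")   # best-aligned window is "Xbcd"
#   75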


##############################
# Advanced Scoring Functions #
##############################

def _process_and_sort(s, force_ascii, full_process=True):
    """Return a cleaned string with tokens sorted."""
    # pull tokens
    ts = utils.full_process(s, force_ascii=force_ascii) if full_process else s
    tokens = ts.split()

    # sort tokens and join
    sorted_string = u" ".join(sorted(tokens))
    return sorted_string.strip()


# Sorted Token
#   find all alphanumeric tokens in the string
#   sort those tokens and take ratio of resulting joined strings
#   controls for unordered string elements
@utils.check_for_none
def _token_sort(s1, s2, partial=True, force_ascii=True, full_process=True):
    sorted1 = _process_and_sort(s1, force_ascii, full_process=full_process)
    sorted2 = _process_and_sort(s2, force_ascii, full_process=full_process)

    if partial:
        return partial_ratio(sorted1, sorted2)
    else:
        return ratio(sorted1, sorted2)


def token_sort_ratio(s1, s2, force_ascii=True, full_process=True):
    """Return a measure of the sequences' similarity between 0 and 100,
    but sorting the tokens before comparing.
    """
    return _token_sort(s1, s2, partial=False, force_ascii=force_ascii, full_process=full_process)
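
# Illustrative usage sketch, kept as a comment: sorting tokens makes word
# order irrelevant, so the two phrases below reduce to the same token string.
# The inputs are illustrative and not taken from this file:
#
#   >>> token_sort_ratio("new york mets vs atlanta braves",
#   ...                  "atlanta braves vs new york mets")
#   100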


def partial_token_sort_ratio(s1, s2, force_ascii=True, full_process=True):
    """Return the ratio of the most similar substring as a number between
    0 and 100, but sorting the tokens before comparing.
    """
    return _token_sort(s1, s2, partial=True, force_ascii=force_ascii, full_process=full_process)


@utils.check_for_none
def _token_set(s1, s2, partial=True, force_ascii=True, full_process=True):
    """Find all alphanumeric tokens in each string...
        - treat them as a set
        - construct two strings of the form:
            <sorted_intersection><sorted_remainder>
        - take ratios of those two strings
        - controls for unordered partial matches"""

    if not full_process and s1 == s2:
        return 100

    p1 = utils.full_process(s1, force_ascii=force_ascii) if full_process else s1
    p2 = utils.full_process(s2, force_ascii=force_ascii) if full_process else s2

    if not utils.validate_string(p1):
        return 0
    if not utils.validate_string(p2):
        return 0

    # pull tokens
    tokens1 = set(p1.split())
    tokens2 = set(p2.split())

    intersection = tokens1.intersection(tokens2)
    diff1to2 = tokens1.difference(tokens2)
    diff2to1 = tokens2.difference(tokens1)

    sorted_sect = " ".join(sorted(intersection))
    sorted_1to2 = " ".join(sorted(diff1to2))
    sorted_2to1 = " ".join(sorted(diff2to1))

    combined_1to2 = sorted_sect + " " + sorted_1to2
    combined_2to1 = sorted_sect + " " + sorted_2to1

    # strip
    sorted_sect = sorted_sect.strip()
    combined_1to2 = combined_1to2.strip()
    combined_2to1 = combined_2to1.strip()

    if partial:
        ratio_func = partial_ratio
    else:
        ratio_func = ratio

    pairwise = [
        ratio_func(sorted_sect, combined_1to2),
        ratio_func(sorted_sect, combined_2to1),
        ratio_func(combined_1to2, combined_2to1)
    ]
    return max(pairwise)


def token_set_ratio(s1, s2, force_ascii=True, full_process=True):
    return _token_set(s1, s2, partial=False, force_ascii=force_ascii, full_process=full_process)
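
# Illustrative usage sketch, kept as a comment: because the shared tokens form
# one of the compared strings, duplicated or extra words on one side cost
# little. Inputs are illustrative; the scores shown are what the difflib
# backend produces and are assumed for other backends:
#
#   >>> token_sort_ratio("fuzzy was a bear", "fuzzy fuzzy was a bear")
#   84
#   >>> token_set_ratio("fuzzy was a bear", "fuzzy fuzzy was a bear")
#   100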


def partial_token_set_ratio(s1, s2, force_ascii=True, full_process=True):
    return _token_set(s1, s2, partial=True, force_ascii=force_ascii, full_process=full_process)


###################
# Combination API #
###################

# q is for quick
def QRatio(s1, s2, force_ascii=True, full_process=True):
    """
    Quick ratio comparison between two strings.

    Runs full_process from utils on both strings.
    Short circuits if either of the strings is empty after processing.

    :param s1:
    :param s2:
    :param force_ascii: Allow only ASCII characters (Default: True)
    :param full_process: Process inputs, used here to avoid double processing in extract functions (Default: True)
    :return: similarity ratio
    """

    if full_process:
        p1 = utils.full_process(s1, force_ascii=force_ascii)
        p2 = utils.full_process(s2, force_ascii=force_ascii)
    else:
        p1 = s1
        p2 = s2

    if not utils.validate_string(p1):
        return 0
    if not utils.validate_string(p2):
        return 0

    return ratio(p1, p2)
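
# Illustrative usage sketch, kept as a comment, assuming utils.full_process
# lowercases the input and replaces non-alphanumeric characters with
# whitespace (its behaviour in fuzzywuzzy's utils module), so case and
# punctuation differences vanish before the comparison:
#
#   >>> QRatio("this is a test", "THIS is a TEST!")
#   100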


def UQRatio(s1, s2, full_process=True):
    """
    Unicode quick ratio.

    Calls QRatio with force_ascii set to False.

    :param s1:
    :param s2:
    :return: similarity ratio
    """
    return QRatio(s1, s2, force_ascii=False, full_process=full_process)


# w is for weighted
def WRatio(s1, s2, force_ascii=True, full_process=True):
    """
    Return a measure of the sequences' similarity between 0 and 100, using different algorithms.

    **Steps in the order they occur**

    #. Run full_process from utils on both strings
    #. Short circuit if this makes either string empty
    #. Take the ratio of the two processed strings (fuzz.ratio)
    #. Run checks to compare the length of the strings
        * If one of the strings is more than 1.5 times as long as the other
          use partial_ratio comparisons - scale partial results by 0.9
          (this makes sure only full results can return 100)
        * If one of the strings is over 8 times as long as the other
          instead scale by 0.6

    #. Run the other ratio functions
        * if using partial ratio functions call partial_ratio,
          partial_token_sort_ratio and partial_token_set_ratio
          scale all of these by the ratio based on length
        * otherwise call token_sort_ratio and token_set_ratio
        * all token based comparisons are scaled by 0.95
          (on top of any partial scalars)

    #. Take the highest value from these results
       round it and return it as an integer.

    :param s1:
    :param s2:
    :param force_ascii: Allow only ASCII characters
    :type force_ascii: bool
    :param full_process: Process inputs, used here to avoid double processing in extract functions (Default: True)
    :return:
    """

    if full_process:
        p1 = utils.full_process(s1, force_ascii=force_ascii)
        p2 = utils.full_process(s2, force_ascii=force_ascii)
    else:
        p1 = s1
        p2 = s2

    if not utils.validate_string(p1):
        return 0
    if not utils.validate_string(p2):
        return 0

    # should we look at partials?
    try_partial = True
    unbase_scale = .95
    partial_scale = .90

    base = ratio(p1, p2)
    len_ratio = float(max(len(p1), len(p2))) / min(len(p1), len(p2))

    # if strings are similar length, don't use partials
    if len_ratio < 1.5:
        try_partial = False

    # if one string is much much shorter than the other
    if len_ratio > 8:
        partial_scale = .6

    if try_partial:
        partial = partial_ratio(p1, p2) * partial_scale
        ptsor = partial_token_sort_ratio(p1, p2, full_process=False) \
            * unbase_scale * partial_scale
        ptser = partial_token_set_ratio(p1, p2, full_process=False) \
            * unbase_scale * partial_scale

        return utils.intr(max(base, partial, ptsor, ptser))
    else:
        tsor = token_sort_ratio(p1, p2, full_process=False) * unbase_scale
        tser = token_set_ratio(p1, p2, full_process=False) * unbase_scale

        return utils.intr(max(base, tsor, tser))
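
# Illustrative usage sketch, kept as a comment: for the pair below the
# processed strings differ in length by less than 1.5x, so the non-partial
# branch runs and the token_set term (scaled by 0.95) wins. The inputs are
# illustrative and the expected score of 95 is a hand-computed assumption:
#
#   >>> WRatio("this is a test", "this is a new test!!!")
#   95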


def UWRatio(s1, s2, full_process=True):
    """Return a measure of the sequences' similarity between 0 and 100,
    using different algorithms. Same as WRatio but preserving unicode.
    """
    return WRatio(s1, s2, force_ascii=False, full_process=full_process)