import math
import string
from collections import Counter


def maybe_is_text(s: str, thresh: float = 2.5) -> bool:
    """Return True when the character entropy of *s* exceeds *thresh*.

    Only characters in ``string.printable`` contribute to the entropy, but
    each probability is taken relative to the full length of *s*, so
    non-printable characters lower the score.

    Args:
        s: The string to inspect.
        thresh: Entropy (in bits) that must be strictly exceeded.

    Returns:
        False for an empty string, otherwise ``entropy > thresh``.
    """
    if not s:
        return False

    total = len(s)
    # Count every character in one pass instead of rescanning the string once
    # per printable character.
    counts = Counter(s)

    entropy = 0
    # Summing in ``string.printable`` order keeps the floating-point result
    # identical to a per-character ``s.count`` loop.
    for char in string.printable:
        p = counts[char] / total
        if p > 0:
            entropy += -p * math.log2(p)

    return entropy > thresh


def maybe_is_code(s: str) -> bool:
    """Return True when more than 10% of the characters in *s* are non-ASCII.

    NOTE(review): the name suggests code detection, but the check only
    measures non-ASCII density -- confirm the intended meaning with callers.

    Args:
        s: The string to inspect.

    Returns:
        False for an empty string, otherwise whether the non-ASCII ratio is
        strictly greater than 0.1.
    """
    if not s:
        return False

    # ASCII covers code points 0..127, so anything above 127 is non-ASCII
    # (this includes U+0080, which a ``> 128`` comparison would miss).
    non_ascii = sum(1 for char in s if ord(char) > 127)
    return non_ascii / len(s) > 0.1


def strings_similarity(s1: str, s2: str) -> float:
    """Return the Jaccard similarity of the whitespace-separated words.

    Args:
        s1: First string.
        s2: Second string.

    Returns:
        ``len(intersection) / len(union)`` of the two word sets, or 0 when
        either string is empty or neither contains any word.
    """
    if not s1 or not s2:
        return 0

    words1 = set(s1.split())
    words2 = set(s2.split())

    union = words1 | words2
    # Whitespace-only inputs are non-empty strings that yield no words; avoid
    # dividing by zero in that case.
    if not union:
        return 0

    return len(words1 & words2) / len(union)


def maybe_is_truncated(s: str) -> bool:
    """Return True when *s* does not end with ``.``, ``!``, ``?`` or ``"``.

    Args:
        s: The string to inspect.

    Returns:
        False for an empty string (same convention as the other predicates in
        this file) or when the last character is one of the terminal
        punctuation marks; True otherwise.
    """
    if not s:
        # Nothing to inspect; indexing the last character would fail here.
        return False

    return not s.endswith((".", "!", "?", '"'))


# NOTE(review): the visible source is cut off partway through the next
# definition, `maybe_is_html(s)`. Only this much of it is visible, so it is
# kept here verbatim as a comment rather than completed by guesswork:
#
#     def maybe_is_html(s):
#         if len(s) == 0:
#             return False
#         # check for html tags
#         if "