|
import cleantext |
|
import ftfy |
|
import json |
|
import hashlib |
|
from io import StringIO |
|
from datetime import datetime |
|
|
|
def clean_text(input):
    """Repair and normalise a piece of text.

    The text is first passed through ``ftfy.fix_text`` and the result is
    then handed to ``cleantext.clean`` with only the ``extra_spaces`` and
    ``lowercase`` options switched on; ``stemming``, ``stopwords``,
    ``numbers`` and ``punct`` are explicitly switched off.

    Args:
        input: The raw text to process.

    Returns:
        Whatever ``cleantext.clean`` returns for the repaired text.
    """
    # NOTE(review): some cleantext releases appear to expose a ``clean_all``
    # flag that defaults to True and may override the individual switches
    # below -- confirm against the installed version.
    cleaning_options = {
        "extra_spaces": True,
        "stemming": False,
        "stopwords": False,
        "lowercase": True,
        "numbers": False,
        "punct": False,
    }
    repaired = ftfy.fix_text(input)
    return cleantext.clean(repaired, **cleaning_options)
|
|
|
def df_to_csv(df):
    """Render ``df`` as CSV text.

    Args:
        df: An object exposing ``to_csv(buffer, index=False)``
            (presumably a pandas DataFrame -- verify against callers).

    Returns:
        The full CSV content written by ``df.to_csv`` as a single string,
        produced with ``index=False``.
    """
    with StringIO() as buffer:
        df.to_csv(buffer, index=False)
        # Rewind so the read below returns everything that was written.
        buffer.seek(0)
        return buffer.read()
|
|
|
def serialize_data(data):
    """Serialise ``data`` to a JSON string, stringifying datetimes.

    Args:
        data: The value to encode with ``json.dumps``.

    Returns:
        The JSON text. ``datetime`` instances that the encoder cannot
        handle natively are emitted as their ``str()`` form.
    """

    def _stringify_datetime(value):
        """Fallback encoder hook used for values json cannot encode."""
        if not isinstance(value, datetime):
            # NOTE(review): any other unsupported type falls through here
            # and is therefore encoded as JSON ``null`` rather than raising
            # TypeError -- confirm this silent fallback is intended.
            return None
        return str(value)

    encoded = json.dumps(data, default=_stringify_datetime)
    return encoded
|
|
|
def hash(input):
    """Return a SHA-1 hex digest derived from a JSON-encodable value.

    The value is wrapped in a one-element list and dumped to JSON with
    ``sort_keys=True``, so dictionaries that differ only in key order
    yield the same digest.

    Args:
        input: Any value ``json.dumps`` can encode.

    Returns:
        The hexadecimal SHA-1 digest of the encoded JSON text.
    """
    canonical_json = json.dumps([input], sort_keys=True)
    payload = canonical_json.encode()
    return hashlib.sha1(payload).hexdigest()