Spaces:
Runtime error
Runtime error
File size: 4,047 Bytes
00c6db8 a34ad6e 00c6db8 485f576 73b620f c19c9f8 00c6db8 8ad74f7 00c6db8 8ad74f7 00c6db8 8ad74f7 00c6db8 a27a834 00c6db8 a27a834 00c6db8 898101c 00c6db8 898101c 00c6db8 898101c 00c6db8 898101c 00c6db8 898101c 00c6db8 a27a834 00c6db8 a27a834 00c6db8 7fe3481 00c6db8 7fe3481 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 |
import re
from gensim.models.keyedvectors import KeyedVectors
from transformers import pipeline
import pickle
import numpy as np
import pandas as pd
# Word2vec embeddings used to turn words into vectors.
w2v = KeyedVectors.load('models/word2vec')
# Vocabulary as a set so out-of-vocabulary filtering is an O(1) membership test.
w2v_vocab = set(w2v.index_to_key)

# Pickled topic model; its predict / predict_proba outputs are paired with
# `labels` below.
# NOTE(review): unpickling executes code stored in the file -- only load model
# files that come from a trusted source.
with open('models/w2v_ovr_svc.sav', 'rb') as model_file:
    model = pickle.load(model_file)

# Zero-shot classifier; the prediction functions call it with the candidate
# labels 'positive' / 'negative' to obtain sentiment.
classifier = pipeline(
    "zero-shot-classification",
    model="facebook/bart-large-mnli",
    framework='pt',
)

# Topic names, matched positionally to the columns of the topic model output.
labels = [
    'communication', 'waiting time',
    'information', 'user interface',
    'facilities', 'location', 'price',
]

# Contents of sample.csv, round-tripped through pandas and encoded as UTF-8
# CSV bytes.
sample_file = pd.read_csv('sample.csv').to_csv(index=False).encode('utf-8')

print('utils imported!')
def get_sentiment_label_facebook(list_of_sent_dicts):
    """Return the sentiment label for one classification result.

    Args:
        list_of_sent_dicts: Mapping with a 'labels' sequence. Despite the
            parameter name it is indexed as a single mapping, not a list;
            only the first entry of 'labels' is inspected.

    Returns:
        'negative' if the first label is 'negative', otherwise 'positive'.
    """
    top_label = list_of_sent_dicts['labels'][0]
    return 'negative' if top_label == 'negative' else 'positive'
def get_single_prediction(text):
    """Predict topic probabilities and sentiment for a single piece of text.

    Args:
        text: Raw input string.

    Returns:
        A tuple ``(pred_prob, sentiment_prob)`` of DataFrames:
        ``pred_prob`` has columns 'topic' and 'probability' (one row per
        entry of ``labels``, sorted by ascending probability);
        ``sentiment_prob`` has columns 'sentiment' and 'probability', taken
        from the classifier result's 'labels' and 'scores'.

    Raises:
        ValueError: If no word of ``text`` is left after cleaning and
            removing out-of-vocabulary words, since there is nothing to
            vectorise.
    """
    # Lower-case, then drop everything except letters, digits and whitespace.
    cleaned = re.sub(r'[^0-9a-zA-Z\s]', '', text.lower())

    # Keep only words the embedding model knows.
    tokens = [word for word in cleaned.split() if word in w2v_vocab]
    if not tokens:
        # Averaging zero vectors yields NaN and an opaque reshape error
        # further down; fail early with an explicit message instead.
        raise ValueError('text contains no words from the word2vec vocabulary')
    cleaned = ' '.join(tokens)

    # Sentence vector = average of the word vectors.
    sentence_vector = np.mean([w2v[word] for word in tokens], axis=0)

    # Topic probabilities; reshape to a single-row matrix of whatever
    # dimensionality the embeddings have.
    results = model.predict_proba(sentence_vector.reshape(1, -1)).squeeze().round(2)
    pred_prob = pd.DataFrame(
        {'topic': labels, 'probability': results}
    ).sort_values('probability', ascending=True)

    # Sentiment is computed on the cleaned, in-vocabulary text.
    sentiment_results = classifier(
        cleaned,
        candidate_labels=['positive', 'negative'],
        hypothesis_template='The sentiment of this is {}',
    )
    sentiment_prob = pd.DataFrame({
        'sentiment': sentiment_results['labels'],
        'probability': sentiment_results['scores'],
    })
    return (pred_prob, sentiment_prob)
def get_multiple_predictions(csv):
    """Predict topics and sentiment for every row of a single-column CSV.

    Args:
        csv: Path or file-like object accepted by ``pandas.read_csv``. The
            file must have exactly one column; it is renamed to 'sequence'.

    Returns:
        UTF-8 encoded CSV bytes with the columns 'sequence', one column per
        entry of ``labels``, 'others' (1 when no topic was predicted, else 0)
        and 'sentiment'. Rows whose text is missing or contains no
        in-vocabulary words are appended at the end with empty prediction
        columns.
    """
    df = pd.read_csv(csv)
    df.columns = ['sequence']

    # Clean a copy of the text: lower-case, trim, and drop everything except
    # letters, digits and whitespace. Missing cells become '' so they are
    # routed to the invalid rows instead of crashing on .split().
    # regex=True is required: on pandas >= 2.0 str.replace defaults to a
    # literal replacement, which would leave punctuation in place.
    cleaned = (
        df['sequence']
        .fillna('')
        .astype(str)
        .str.lower()
        .str.strip()
        .str.replace(r'[^0-9a-zA-Z\s]', '', regex=True)
    )
    # Remove out-of-vocabulary words.
    df['sequence_clean'] = cleaned.apply(
        lambda seq: ' '.join(word for word in seq.split() if word in w2v_vocab)
    )

    # Split off rows that have nothing left to classify.
    is_invalid = df['sequence_clean'] == ''
    invalid = df.loc[is_invalid].drop(columns=['sequence_clean'])
    df = df.loc[~is_invalid].reset_index(drop=True)

    if df.empty:
        # Nothing to predict: return the invalid rows with the usual columns
        # rather than calling the models on empty input.
        output_columns = ['sequence', *labels, 'others', 'sentiment']
        return (
            invalid.reindex(columns=output_columns)
            .reset_index(drop=True)
            .to_csv(index=False)
            .encode('utf-8')
        )

    # Sentence vector = average of the word vectors, one row per sequence.
    sentence_vectors = pd.DataFrame([
        np.mean([w2v[word] for word in seq.split()], axis=0)
        for seq in df['sequence_clean']
    ])

    # Topic predictions, joined back to the original sequences by position.
    pred_results = pd.DataFrame(model.predict(sentence_vectors), columns=labels)
    final_results = df.join(pred_results)

    # 'others' flags rows for which no topic was predicted.
    final_results['others'] = (final_results[labels].max(axis=1) == 0).astype(int)

    # Sentiment label for each cleaned sequence.
    final_results['sentiment'] = final_results['sequence_clean'].apply(
        lambda seq: get_sentiment_label_facebook(
            classifier(
                seq,
                candidate_labels=['positive', 'negative'],
                hypothesis_template='The sentiment of this is {}',
            )
        )
    )
    final_results = final_results.drop(columns=['sequence_clean'])

    # Append invalid rows (their prediction columns stay empty).
    if invalid.empty:
        return final_results.to_csv(index=False).encode('utf-8')
    return (
        pd.concat([final_results, invalid])
        .reset_index(drop=True)
        .to_csv(index=False)
        .encode('utf-8')
    )