hongaik committed on
Commit
898101c
1 Parent(s): a34ad6e

update code

Browse files
.ipynb_checkpoints/tester-checkpoint.ipynb ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": null,
6
+ "id": "48c76726-b0a4-43e6-9f07-0199e0248d5e",
7
+ "metadata": {},
8
+ "outputs": [],
9
+ "source": []
10
+ },
11
+ {
12
+ "cell_type": "code",
13
+ "execution_count": 39,
14
+ "id": "bd2034e6-1187-4887-9ca7-8b9c0b5c9331",
15
+ "metadata": {},
16
+ "outputs": [],
17
+ "source": []
18
+ },
19
+ {
20
+ "cell_type": "code",
21
+ "execution_count": null,
22
+ "id": "1ac414f1-37dd-4642-867c-5520a16c1c86",
23
+ "metadata": {},
24
+ "outputs": [],
25
+ "source": []
26
+ }
27
+ ],
28
+ "metadata": {
29
+ "kernelspec": {
30
+ "display_name": "Python 3",
31
+ "language": "python",
32
+ "name": "python3"
33
+ },
34
+ "language_info": {
35
+ "codemirror_mode": {
36
+ "name": "ipython",
37
+ "version": 3
38
+ },
39
+ "file_extension": ".py",
40
+ "mimetype": "text/x-python",
41
+ "name": "python",
42
+ "nbconvert_exporter": "python",
43
+ "pygments_lexer": "ipython3",
44
+ "version": "3.8.8"
45
+ }
46
+ },
47
+ "nbformat": 4,
48
+ "nbformat_minor": 5
49
+ }
.ipynb_checkpoints/utils-checkpoint.py CHANGED
@@ -37,7 +37,8 @@ def get_single_prediction(text):
37
  text_vectors = np.mean([w2v[i] for i in text.split()], axis=0)
38
 
39
  # Make predictions
40
- results = model.predict(text_vectors)
 
41
 
42
  # Get sentiment
43
  sentiment = get_sentiment_label_facebook(classifier(text,
@@ -46,6 +47,10 @@ def get_single_prediction(text):
46
 
47
  # Consolidate results
48
  pred_labels = [labels[idx] for idx, tag in enumerate(results) if tag == 1]
 
 
 
 
49
  pred_labels.append(sentiment)
50
 
51
  return pred_labels
@@ -59,22 +64,27 @@ def get_multiple_predictions(csv):
59
  df['sequence'] = df['sequence'].str.replace('[^0-9a-zA-Z\s]','') #remove special char, punctuation
60
 
61
  # Remove OOV words
62
- df['sequence'] = df['sequence'].apply(lambda x: ' '.join([i for i in x.split() if i in w2v_vocab]))
63
 
64
  # Remove rows with blank string
65
- invalid = df[(pd.isna(df['sequence'])) | (df['sequence'] == '')]
 
66
 
 
67
  df.dropna(inplace=True)
68
- df = df[df['sequence'] != ''].reset_index(drop=True)
69
 
70
  # Vectorise text and store in new dataframe. Sentence vector = average of word vectors
71
- series_text_vectors = pd.DataFrame(df['sequence'].apply(lambda x: np.mean([w2v[i] for i in x.split()], axis=0)).values.tolist())
72
 
73
  # Get predictions
74
  pred_results = pd.DataFrame(model.predict(series_text_vectors), columns = labels)
75
 
76
  # Join back to original sequence
77
- final_results = df.join(series_text_vectors)
 
 
 
78
 
79
  # Get sentiment labels
80
  final_results['sentiment'] = final_results['sequence'].apply(lambda x: get_sentiment_label_facebook(classifier(x,
 
37
  text_vectors = np.mean([w2v[i] for i in text.split()], axis=0)
38
 
39
  # Make predictions
40
+ results = model.predict(text_vectors.reshape(1,300)).squeeze()
41
+ print(results)
42
 
43
  # Get sentiment
44
  sentiment = get_sentiment_label_facebook(classifier(text,
 
47
 
48
  # Consolidate results
49
  pred_labels = [labels[idx] for idx, tag in enumerate(results) if tag == 1]
50
+
51
+ if len(pred_labels) == 0:
52
+ pred_labels.append('others')
53
+
54
  pred_labels.append(sentiment)
55
 
56
  return pred_labels
 
64
  df['sequence'] = df['sequence'].str.replace('[^0-9a-zA-Z\s]','') #remove special char, punctuation
65
 
66
  # Remove OOV words
67
+ df['sequence_clean'] = df['sequence'].apply(lambda x: ' '.join([i for i in x.split() if i in w2v_vocab]))
68
 
69
  # Remove rows with blank string
70
+ invalid = df[(pd.isna(df['sequence_clean'])) | (df['sequence_clean'] == '')]
71
+ invalid.drop(columns=['sequence_clean'], inplace=True)
72
 
73
+ # Drop rows with blank string
74
  df.dropna(inplace=True)
75
+ df = df[df['sequence_clean'] != ''].reset_index(drop=True)
76
 
77
  # Vectorise text and store in new dataframe. Sentence vector = average of word vectors
78
+ series_text_vectors = pd.DataFrame(df['sequence_clean'].apply(lambda x: np.mean([w2v[i] for i in x.split()], axis=0)).values.tolist())
79
 
80
  # Get predictions
81
  pred_results = pd.DataFrame(model.predict(series_text_vectors), columns = labels)
82
 
83
  # Join back to original sequence
84
+ final_results = df.join(pred_results)
85
+ final_results.drop(columns=['sequence_clean'], inplace=True)
86
+ final_results['others'] = final_results[labels].max(axis=1)
87
+ final_results['others'] = final_results['others'].apply(lambda x: 1 if x == 0 else 0)
88
 
89
  # Get sentiment labels
90
  final_results['sentiment'] = final_results['sequence'].apply(lambda x: get_sentiment_label_facebook(classifier(x,
__pycache__/utils.cpython-38.pyc ADDED
Binary file (3.21 kB). View file
 
tester.ipynb ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": null,
6
+ "id": "48c76726-b0a4-43e6-9f07-0199e0248d5e",
7
+ "metadata": {},
8
+ "outputs": [],
9
+ "source": []
10
+ },
11
+ {
12
+ "cell_type": "code",
13
+ "execution_count": 39,
14
+ "id": "bd2034e6-1187-4887-9ca7-8b9c0b5c9331",
15
+ "metadata": {},
16
+ "outputs": [],
17
+ "source": []
18
+ },
19
+ {
20
+ "cell_type": "code",
21
+ "execution_count": null,
22
+ "id": "1ac414f1-37dd-4642-867c-5520a16c1c86",
23
+ "metadata": {},
24
+ "outputs": [],
25
+ "source": []
26
+ }
27
+ ],
28
+ "metadata": {
29
+ "kernelspec": {
30
+ "display_name": "Python 3",
31
+ "language": "python",
32
+ "name": "python3"
33
+ },
34
+ "language_info": {
35
+ "codemirror_mode": {
36
+ "name": "ipython",
37
+ "version": 3
38
+ },
39
+ "file_extension": ".py",
40
+ "mimetype": "text/x-python",
41
+ "name": "python",
42
+ "nbconvert_exporter": "python",
43
+ "pygments_lexer": "ipython3",
44
+ "version": "3.8.8"
45
+ }
46
+ },
47
+ "nbformat": 4,
48
+ "nbformat_minor": 5
49
+ }
utils.py CHANGED
@@ -37,7 +37,8 @@ def get_single_prediction(text):
37
  text_vectors = np.mean([w2v[i] for i in text.split()], axis=0)
38
 
39
  # Make predictions
40
- results = model.predict(text_vectors)
 
41
 
42
  # Get sentiment
43
  sentiment = get_sentiment_label_facebook(classifier(text,
@@ -46,6 +47,10 @@ def get_single_prediction(text):
46
 
47
  # Consolidate results
48
  pred_labels = [labels[idx] for idx, tag in enumerate(results) if tag == 1]
 
 
 
 
49
  pred_labels.append(sentiment)
50
 
51
  return pred_labels
@@ -59,22 +64,27 @@ def get_multiple_predictions(csv):
59
  df['sequence'] = df['sequence'].str.replace('[^0-9a-zA-Z\s]','') #remove special char, punctuation
60
 
61
  # Remove OOV words
62
- df['sequence'] = df['sequence'].apply(lambda x: ' '.join([i for i in x.split() if i in w2v_vocab]))
63
 
64
  # Remove rows with blank string
65
- invalid = df[(pd.isna(df['sequence'])) | (df['sequence'] == '')]
 
66
 
 
67
  df.dropna(inplace=True)
68
- df = df[df['sequence'] != ''].reset_index(drop=True)
69
 
70
  # Vectorise text and store in new dataframe. Sentence vector = average of word vectors
71
- series_text_vectors = pd.DataFrame(df['sequence'].apply(lambda x: np.mean([w2v[i] for i in x.split()], axis=0)).values.tolist())
72
 
73
  # Get predictions
74
  pred_results = pd.DataFrame(model.predict(series_text_vectors), columns = labels)
75
 
76
  # Join back to original sequence
77
- final_results = df.join(series_text_vectors)
 
 
 
78
 
79
  # Get sentiment labels
80
  final_results['sentiment'] = final_results['sequence'].apply(lambda x: get_sentiment_label_facebook(classifier(x,
 
37
  text_vectors = np.mean([w2v[i] for i in text.split()], axis=0)
38
 
39
  # Make predictions
40
+ results = model.predict(text_vectors.reshape(1,300)).squeeze()
41
+ print(results)
42
 
43
  # Get sentiment
44
  sentiment = get_sentiment_label_facebook(classifier(text,
 
47
 
48
  # Consolidate results
49
  pred_labels = [labels[idx] for idx, tag in enumerate(results) if tag == 1]
50
+
51
+ if len(pred_labels) == 0:
52
+ pred_labels.append('others')
53
+
54
  pred_labels.append(sentiment)
55
 
56
  return pred_labels
 
64
  df['sequence'] = df['sequence'].str.replace('[^0-9a-zA-Z\s]','') #remove special char, punctuation
65
 
66
  # Remove OOV words
67
+ df['sequence_clean'] = df['sequence'].apply(lambda x: ' '.join([i for i in x.split() if i in w2v_vocab]))
68
 
69
  # Remove rows with blank string
70
+ invalid = df[(pd.isna(df['sequence_clean'])) | (df['sequence_clean'] == '')]
71
+ invalid.drop(columns=['sequence_clean'], inplace=True)
72
 
73
+ # Drop rows with blank string
74
  df.dropna(inplace=True)
75
+ df = df[df['sequence_clean'] != ''].reset_index(drop=True)
76
 
77
  # Vectorise text and store in new dataframe. Sentence vector = average of word vectors
78
+ series_text_vectors = pd.DataFrame(df['sequence_clean'].apply(lambda x: np.mean([w2v[i] for i in x.split()], axis=0)).values.tolist())
79
 
80
  # Get predictions
81
  pred_results = pd.DataFrame(model.predict(series_text_vectors), columns = labels)
82
 
83
  # Join back to original sequence
84
+ final_results = df.join(pred_results)
85
+ final_results.drop(columns=['sequence_clean'], inplace=True)
86
+ final_results['others'] = final_results[labels].max(axis=1)
87
+ final_results['others'] = final_results['others'].apply(lambda x: 1 if x == 0 else 0)
88
 
89
  # Get sentiment labels
90
  final_results['sentiment'] = final_results['sequence'].apply(lambda x: get_sentiment_label_facebook(classifier(x,