Spaces:
Sleeping
Sleeping
File size: 3,459 Bytes
58d4ef5 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 |
from sys import argv
#import re
import nltk
from nltk.corpus import stopwords
import joblib
# NOTE(review): the triple-quoted string below is commented-out preprocessing
# code (punctuation / stop-word removal), kept for reference only — it is a
# bare string expression and is never executed. The Hebrew comment inside it
# says the cleaned text must be assigned back to the `new_text` variable if
# this code is ever re-enabled.
"""
# Remove punctuation and special characters
def remove_punctuation(text):
return re.sub(r'[^\w\s]', '', text)
# Function to remove custom stop words from text
def remove_custom_stopwords(text):
hebrew_stopwords = set(stopwords.words('hebrew'))
additional_stopwords = {'ืื ื', 'ืืชื', 'ืืช', 'ืื ืื ื', 'ืืชื', 'ืืชื', 'ืื', 'ืื'}
hebrew_stopwords.update(additional_stopwords)
return ' '.join(word for word in text.split() if word not in hebrew_stopwords)
# Preprocess the new text (remove punctuation and custom stop words)
# ืื ืจืืฆืื ืืืืืืจ ืืช ืืคืื ืงืฆืืื ืืื ืคืขืืื ืืฉ ืืืขืืืจ ืืช ืืืฉืชื ื ืืืจื ืืืฉืชื ื new_text
new_text_cleaned = remove_custom_stopwords(remove_punctuation(new_text))
"""
# Load the trained classifier from disk at import time.
# NOTE(review): both .pkl files must exist in the current working directory,
# or joblib.load raises FileNotFoundError — TODO confirm deployment layout.
loaded_classifier = joblib.load("is_this_bible_model.pkl")
# Load the TF-IDF vectorizer that was fitted during training; it must be the
# same vectorizer the classifier was trained with for predictions to be valid.
vectorizer = joblib.load("is_this_bible_vectorizer.pkl")
def parse_text(new_text):
    """Classify ``new_text`` as Bible / Other and print the result.

    Uses the module-level ``vectorizer`` and ``loaded_classifier`` loaded
    above. Prints the text, the predicted label, and the model's
    probability for the "Bible" class.

    Args:
        new_text: a single Hebrew text string to classify.
    """
    # Transform the new text using the fitted TF-IDF vectorizer
    # (transform expects an iterable of documents, hence the list).
    new_text_tfidf = vectorizer.transform([new_text])
    # Make predictions on the new text
    prediction = loaded_classifier.predict(new_text_tfidf)
    # Get the confidence score for the predicted class.
    # Index 1 is assumed to be the "Bible" class — TODO confirm this matches
    # the label encoding used at training time (loaded_classifier.classes_).
    probabilities = loaded_classifier.predict_proba(new_text_tfidf)
    confidence_score = probabilities[0, 1]
    # Print the prediction and the confidence score
    print(f"Text: {new_text} | Prediction: {'Bible' if prediction[0] == 1 else 'Other'} | Confidence Score: {confidence_score:.4f}")
# Built-in sample texts used when no CLI argument is supplied: a mix of
# biblical verses and modern Hebrew sentences to demo the classifier.
text_list = [
'ืื ื ืืืฉื ืคื ืืฉืงื ืืืงืื ืืช ืืขืืืื ืฉืืืง ืืืชืืื ืืช ืฉืื ื ืืชืืืง ืงืฉืืจื ืืคืืืชืื 2.4, ืฉืืื ืื ืืช ืื',
'ืืื ืืคื ืื ืื ืืฉืฉืืืขืื ืืฉืืจื ืฉืืื',
'ืืืื ืืขืช ืืืื ืืืคืฉ ืืช ืืจืืฉืืื ืื ืจืืช ืืืืืขืชืื ืืช ืื ืชืืขืืืชืื',
'ืืืื ืฉืขืืื ืืืืืชืื ื ืืื ื ืฉืื ืืื ืืืื ืขืื ืขืืื ื ืืืืืชืื ื',
'ืื ื ืืกืชืืืชื ืืฉืืื ืืชื ืฆืืืช ืืืื',
'ืืฆื ืืื ืืขื ืืืื ืฉืื ืืื ืืืืืฉื',
'ืืืื ืื ืฉืืจ ืืฆืืื ืืื ืืชืจ ืืืจืืฉืืื ืงืืืฉ ืืืืจ ืื',
'ืฉืืจ ืืฉืืจืื ืืฉืจ ืืฉืืื',
'ืืฉืงื ื ืื ืฉืืงืืช ืคืืื ืื ืืืืื ืืืืื ืืืื',
'ืืืื ืจืง ืืื ืฉืืื ืืืืื ืชืืื ืืฉืืื ืืืืจ ืืื ืขื ืืื ืืกืชื ืืื ืื ืฉืืฉื ืงืฆืืืช',
'ืื ืืขืฉื ืฉืื ืืื ืืขืฉื ืฉืื ืืขืื ืื ืื ื ืืืืจ ืืืืจืื',
'ืืืื ืืจื ืืื ื ืืขืจืืืืช ืืืืฉื ืืจืขืืช ืืื ืื ืืืืงืื ืฉืืฉื ืื',
'ืืืืจื ืื ืืืช ืืฉืจืื ืืืืืจ ืืืงืืชื ืื ืืืื ืืืช ืืฉืคืื ืืืกื ืืฉืจ ืืขืฉื ืืชื ืืืื ืืื ืืื',
'ืื ืื ืืฉื ื ืืืคื ืืื ื ืขืืืื ืืขืืงืจ ืื ืืืืื',
'ืืืืจ ืืช ืืื ืืฉืืช ืืงืืฉื',
'ืืืฉืื ืืขืงื ืืืืืื ืืคื ืื ืื ืขืฉืื ืืืื',
'ืื ืื ืืืจืฆื ืืืืืืืชื ืืืืืช ืืืื',
'ืขืืืื :ืืืจ ืืืืจ ืชื "ื ,ืืืืจืขืืช ืืืื ืืชื "ื ืงืจืืื']
# Entry point: if a text was passed on the command line, classify only it;
# otherwise run the classifier over every built-in sample text.
if argv[1:]:
    new_text = argv[1]
    parse_text(new_text)
else:
    for new_text in text_list:
        parse_text(new_text)
|