# is-this-bible / try_model.py
from sys import argv
import joblib

# These imports are needed only by the disabled preprocessing block below:
# import re
# import nltk
# from nltk.corpus import stopwords
"""
# Remove punctuation and special characters
def remove_punctuation(text):
return re.sub(r'[^\w\s]', '', text)
# Function to remove custom stop words from text
def remove_custom_stopwords(text):
hebrew_stopwords = set(stopwords.words('hebrew'))
additional_stopwords = {'ืื ื™', 'ืืชื”', 'ืืช', 'ืื ื—ื ื•', 'ืืชื', 'ืืชืŸ', 'ื”ื', 'ื”ืŸ'}
hebrew_stopwords.update(additional_stopwords)
return ' '.join(word for word in text.split() if word not in hebrew_stopwords)
# Preprocess the new text (remove punctuation and custom stop words)
# ืื ืจื•ืฆื™ื ืœื”ื—ื–ื™ืจ ืืช ื”ืคื•ื ืงืฆื™ื™ื” ื”ืœื ืคืขื™ืœื” ื™ืฉ ืœื”ืขื‘ื™ืจ ืืช ื”ืžืฉืชื ื” ืื—ืจื™ ื”ืžืฉืชื ื” new_text
new_text_cleaned = remove_custom_stopwords(remove_punctuation(new_text))
"""
# Load the trained model from the file
loaded_classifier = joblib.load("is_this_bible_model.pkl")
# Load the TF-IDF vectorizer used for training
vectorizer = joblib.load("is_this_bible_vectorizer.pkl")
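
# For reference: a minimal sketch of how a matching model/vectorizer pair could
# have been produced (assumptions: scikit-learn, a LogisticRegression classifier,
# and labeled lists `texts` / `labels`; the actual training code is not part of
# this file). Both artifacts must come from the same training run so that the
# TF-IDF vocabulary matches the feature dimension the classifier expects.
#
#   from sklearn.feature_extraction.text import TfidfVectorizer
#   from sklearn.linear_model import LogisticRegression
#   vec = TfidfVectorizer()
#   clf = LogisticRegression().fit(vec.fit_transform(texts), labels)  # 1 = Bible, 0 = Other
#   joblib.dump(clf, "is_this_bible_model.pkl")
#   joblib.dump(vec, "is_this_bible_vectorizer.pkl")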
def parse_text(new_text):
    # Transform the new text using the TF-IDF vectorizer
    new_text_tfidf = vectorizer.transform([new_text])
    # Make a prediction on the new text
    prediction = loaded_classifier.predict(new_text_tfidf)
    # Get the confidence score for the predicted class
    probabilities = loaded_classifier.predict_proba(new_text_tfidf)
    confidence_score = probabilities[0, 1]  # the confidence score for class "Bible" (index 1)
    # Print the prediction and the confidence score
    print(f"Text: {new_text} | Prediction: {'Bible' if prediction[0] == 1 else 'Other'} | Confidence Score: {confidence_score:.4f}")
text_list = [
    'ืื ื™ ื™ื•ืฉื‘ ืคื” ื‘ืฉืงื˜ ื•ืžืงืœืœ ืืช ื”ืขื•ื‘ื“ื” ืฉื—ืœืง ืžื”ืชื•ื›ื ื•ืช ืฉืื ื™ ืžืชื—ื–ืง ืงืฉื•ืจื” ืœืคื™ื™ืชื•ืŸ 2.4, ืฉืื™ืŸ ืœื” ืืช ื–ื”',
    'ื›ืžื” ื™ืคื” ื•ื ืื” ื›ืฉืฉื•ืžืขื™ื ื”ืฉื™ืจื” ืฉืœื”ื',
    'ื•ื”ื™ื” ื‘ืขืช ื”ื”ื™ื ืื—ืคืฉ ืืช ื™ืจื•ืฉืœื™ื ื‘ื ืจื•ืช ื•ื”ื•ื“ืขืชื™ื” ืืช ื›ืœ ืชื•ืขื‘ื•ืชื™ื”',
    'ื•ื”ื™ื ืฉืขืžื“ื” ืœืื‘ื•ืชื™ื ื• ื•ืœื ื• ืฉืœื ืื—ื“ ื‘ืœื‘ื“ ืขืžื“ ืขืœื™ื ื• ืœื›ืœื•ืชื™ื ื•',
    'ืื ื™ ื”ืกืชื›ืœืชื™ ืœืฉืžื™ื ืืชื” ืฆืœืœืช ื‘ืžื™ื',
    'ื”ืฆื‘ ื”ื•ื ื‘ืขืœ ื—ื™ื™ื ืฉื—ื™ ื‘ื™ื ื•ื‘ื™ื‘ืฉื”',
    'ื•ื”ื™ื” ื”ื ืฉืืจ ื‘ืฆื™ื•ืŸ ื•ื”ื ื•ืชืจ ื‘ื™ืจื•ืฉืœื™ื ืงื“ื•ืฉ ื™ืืžืจ ืœื•',
    'ืฉื™ืจ ื”ืฉื™ืจื™ื ืืฉืจ ืœืฉืœืžื”',
    'ื™ืฉืงื ื™ ืžื ืฉื™ืงื•ืช ืคื™ื”ื• ื›ื™ ื˜ื•ื‘ื™ื ื“ื•ื“ื™ืš ืžื™ื™ืŸ',
    'ื•ื”ื™ื” ืจืง ืžืœื ืฉืžื—ื” ื•ื—ื“ื•ื” ืชืžื™ื“ ื›ืฉื”ื™ื” ื’ื•ืžืจ ื”ืžื ืขืœ ื•ืžืŸ ื”ืกืชื ื”ื™ื” ืœื• ืฉืœืฉื” ืงืฆื•ื•ืช',
    'ื–ื” ืžืขืฉื” ืฉืœื• ื•ื–ื” ืžืขืฉื” ืฉืœื™ ื•ืขื•ื“ ืžื” ืœื ื• ืœื“ื‘ืจ ืžืื—ืจื™ื',
    'ื“ื•ื“ื™ ื™ืจื“ ืœื’ื ื• ืœืขืจื•ื’ื•ืช ื”ื‘ื•ืฉื ืœืจืขื•ืช ื‘ื’ื ื™ื ื•ืœืœืงื•ื˜ ืฉื•ืฉื ื™ื',
    'ื•ื™ืžืจื• ื‘ื™ ื‘ื™ืช ื™ืฉืจืืœ ื‘ืžื“ื‘ืจ ื‘ื—ืงื•ืชื™ ืœื ื”ืœื›ื• ื•ืืช ืžืฉืคื˜ื™ ืžืืกื• ืืฉืจ ื™ืขืฉื” ืืชื ื”ืื“ื ื•ื—ื™ ื‘ื”ื',
    'ื–ื” ืœื ืžืฉื ื” ืื•ืคื ื™ื™ื ื ืขืœื™ื™ื ื”ืขื™ืงืจ ื–ื” ื‘ื—ื™ื™ื',
    'ื–ื›ื•ืจ ืืช ื™ื•ื ื”ืฉื‘ืช ืœืงื“ืฉื•',
    'ื•ื™ืฉืœื— ื™ืขืงื‘ ืžืœืื›ื™ื ืœืคื ื™ื• ืืœ ืขืฉื™ื• ืื—ื™ื•',
    'ืœืš ืœืš ืžืืจืฆืš ื•ืžืžื•ืœื“ืชืš ื•ืžื‘ื™ืช ืื‘ื™ืš',
    'ืขื“ื›ื•ืŸ :ื“ื•ืจ ืœื“ื•ืจ ืชื "ืš ,ืžืื•ืจืขื•ืช ื‘ื–ืžืŸ ื”ืชื "ืš ืงืจื“ื™ื˜',
]
if argv[1:]:
    # Classify the single text passed on the command line
    new_text = argv[1]
    parse_text(new_text)
else:
    # No argument given: classify every example in text_list
    for new_text in text_list:
        parse_text(new_text)
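
# Example usage (hypothetical invocation and output; the confidence value shown
# is illustrative, not an actual model result):
#   $ python try_model.py "ื–ื›ื•ืจ ืืช ื™ื•ื ื”ืฉื‘ืช ืœืงื“ืฉื•"
#   Text: ื–ื›ื•ืจ ืืช ื™ื•ื ื”ืฉื‘ืช ืœืงื“ืฉื• | Prediction: Bible | Confidence Score: 0.9712
# Run with no argument to score every string in text_list.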