# is_this_bible: predict whether a Hebrew text snippet is Biblical or modern Hebrew.
# --- Imports and model loading -------------------------------------------
# NOTE(review): every original line ended with a stray " | |" artifact
# (broken paste/extraction); removed so the file actually parses.
from sys import argv
# import re
import nltk
from nltk.corpus import stopwords
import joblib

"""
Commented-out preprocessing helpers, kept for reference:

# Remove punctuation and special characters
def remove_punctuation(text):
    return re.sub(r'[^\w\s]', '', text)

# Function to remove custom stop words from text
def remove_custom_stopwords(text):
    hebrew_stopwords = set(stopwords.words('hebrew'))
    additional_stopwords = {'ืื ื', 'ืืชื', 'ืืช', 'ืื ืื ื', 'ืืชื', 'ืืชื', 'ืื', 'ืื'}
    hebrew_stopwords.update(additional_stopwords)
    return ' '.join(word for word in text.split() if word not in hebrew_stopwords)

# Preprocess the new text (remove punctuation and custom stop words).
# (Translated from Hebrew:) To restore these functions to activity, pass
# the new_text variable through them and feed the cleaned result back in:
new_text_cleaned = remove_custom_stopwords(remove_punctuation(new_text))
"""

# Load the trained classifier from disk (pickled with joblib at training time).
loaded_classifier = joblib.load("is_this_bible_model.pkl")
# Load the TF-IDF vectorizer used for training — must be the same fitted
# instance, otherwise the feature columns won't line up with the model.
vectorizer = joblib.load("is_this_bible_vectorizer.pkl")
def parse_text(new_text):
    """Classify one text snippet and print the verdict.

    Uses the module-level ``vectorizer`` and ``loaded_classifier``.
    Prints the input text, the predicted label ('Bible' for class 1,
    'Other' otherwise) and the model's confidence for the 'Bible' class.

    NOTE(review): the original lines carried trailing " | |" artifacts
    that broke the syntax; removed here.
    """
    # Transform the new text using the fitted TF-IDF vectorizer
    # (transform expects an iterable of documents, hence the list).
    new_text_tfidf = vectorizer.transform([new_text])
    # Predicted class label for the snippet
    prediction = loaded_classifier.predict(new_text_tfidf)
    # Per-class probabilities for the single input row
    probabilities = loaded_classifier.predict_proba(new_text_tfidf)
    # assumes column index 1 corresponds to the "Bible" class — matches
    # the label order used at training time; TODO confirm via classes_
    confidence_score = probabilities[0, 1]
    # Print the prediction and the confidence score
    print(f"Text: {new_text} | Prediction: {'Bible' if prediction[0] == 1 else 'Other'} | Confidence Score: {confidence_score:.4f}")
# Built-in demo inputs, used when no command-line argument is supplied:
# a mix of modern Hebrew snippets and Biblical-style verses, so a quick
# run exercises both classes. The last entry notes (in Hebrew) that the
# corpora were modern-Hebrew news vs. Tanakh text — TODO confirm.
text_list = [
'ืื ื ืืืฉื ืคื ืืฉืงื ืืืงืื ืืช ืืขืืืื ืฉืืืง ืืืชืืื ืืช ืฉืื ื ืืชืืืง ืงืฉืืจื ืืคืืืชืื 2.4, ืฉืืื ืื ืืช ืื',
'ืืื ืืคื ืื ืื ืืฉืฉืืืขืื ืืฉืืจื ืฉืืื',
'ืืืื ืืขืช ืืืื ืืืคืฉ ืืช ืืจืืฉืืื ืื ืจืืช ืืืืืขืชืื ืืช ืื ืชืืขืืืชืื',
'ืืืื ืฉืขืืื ืืืืืชืื ื ืืื ื ืฉืื ืืื ืืืื ืขืื ืขืืื ื ืืืืืชืื ื',
'ืื ื ืืกืชืืืชื ืืฉืืื ืืชื ืฆืืืช ืืืื',
'ืืฆื ืืื ืืขื ืืืื ืฉืื ืืื ืืืืืฉื',
'ืืืื ืื ืฉืืจ ืืฆืืื ืืื ืืชืจ ืืืจืืฉืืื ืงืืืฉ ืืืืจ ืื',
'ืฉืืจ ืืฉืืจืื ืืฉืจ ืืฉืืื',
'ืืฉืงื ื ืื ืฉืืงืืช ืคืืื ืื ืืืืื ืืืืื ืืืื',
'ืืืื ืจืง ืืื ืฉืืื ืืืืื ืชืืื ืืฉืืื ืืืืจ ืืื ืขื ืืื ืืกืชื ืืื ืื ืฉืืฉื ืงืฆืืืช',
'ืื ืืขืฉื ืฉืื ืืื ืืขืฉื ืฉืื ืืขืื ืื ืื ื ืืืืจ ืืืืจืื',
'ืืืื ืืจื ืืื ื ืืขืจืืืืช ืืืืฉื ืืจืขืืช ืืื ืื ืืืืงืื ืฉืืฉื ืื',
'ืืืืจื ืื ืืืช ืืฉืจืื ืืืืืจ ืืืงืืชื ืื ืืืื ืืืช ืืฉืคืื ืืืกื ืืฉืจ ืืขืฉื ืืชื ืืืื ืืื ืืื',
'ืื ืื ืืฉื ื ืืืคื ืืื ื ืขืืืื ืืขืืงืจ ืื ืืืืื',
'ืืืืจ ืืช ืืื ืืฉืืช ืืงืืฉื',
'ืืืฉืื ืืขืงื ืืืืืื ืืคื ืื ืื ืขืฉืื ืืืื',
'ืื ืื ืืืจืฆื ืืืืืืืชื ืืืืืช ืืืื',
'ืขืืืื :ืืืจ ืืืืจ ืชื "ื ,ืืืืจืขืืช ืืืื ืืชื "ื ืงืจืืื']
# Entry point: classify the command-line argument when one is supplied,
# otherwise run the classifier over every built-in demo snippet.
if len(argv) > 1:
    parse_text(argv[1])
else:
    for sample in text_list:
        parse_text(sample)