NHLOCAL's picture
first create
58d4ef5
raw
history blame
1.77 kB
from sys import argv
import nltk
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
import joblib
# Restore the trained classifier that was saved with joblib.
# NOTE: joblib files are pickle-based, so only load artifacts from a trusted source.
loaded_classifier = joblib.load("text_identification_model.pkl")
# Restore the TF-IDF vectorizer saved alongside the model.
vectorizer = joblib.load("text_identification_vectorizer.pkl")
# Human-readable names for the numeric class labels 0, 1 and 2.
categories = dict(enumerate(('Other', 'Bible', 'Talmud')))
def parse_text(new_text):
    """Classify ``new_text`` and print the prediction with its confidence.

    The text is vectorized with the module-level ``vectorizer`` and classified
    with ``loaded_classifier``. The numeric label is mapped to a name through
    ``categories``, and one summary line is printed to stdout.

    Args:
        new_text: The text to classify.

    Returns:
        None. The result is printed, not returned.
    """
    # Transform the new text using the TF-IDF vectorizer
    features = vectorizer.transform([new_text])
    # Predicted label for the single input row
    predicted_label = loaded_classifier.predict(features)[0]
    # Per-class probabilities for the single input row
    probabilities = loaded_classifier.predict_proba(features)[0]
    # Report the probability of the class that was actually predicted, not a
    # fixed column: predict_proba columns are ordered like ``classes_``.
    # Assumes the loaded model is a scikit-learn classifier exposing ``classes_``.
    class_index = list(loaded_classifier.classes_).index(predicted_label)
    confidence_score = probabilities[class_index]
    # Determine the predicted category label
    predicted_category = categories[predicted_label]
    # Print the prediction and the confidence score
    print(f"Text: {new_text} | Prediction: {predicted_category} | Confidence Score: {confidence_score:.4f}")
# Built-in Hebrew sample sentences, classified when no text is supplied on the
# command line. The literals are real Hebrew (UTF-8), not mis-decoded bytes.
text_list = [
    'כמה יפה ונאה כששומעים השירה שלהם',
    'חדשות הערב: שלושה אנשים נצאו טובעים בכינרת',
    'והיה בעת ההיא אחפש את ירושלים בנרות והודעתיה את כל תועבותיה',
    'ויאמר משה אל בני ישראל',
    'דאמר נשיא מביא שעיר תו הא דתנן',
    'אמר ליה אביי לרב זעירא',
    'ואיהו לא קא יהיב שעורא במשכא',
]
# With no command-line argument, classify every built-in sample;
# otherwise classify only the first argument.
if len(argv) < 2:
    for new_text in text_list:
        parse_text(new_text)
else:
    new_text = argv[1]
    parse_text(new_text)