File size: 2,863 Bytes
f7eb132 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 |
# -*- coding: utf-8 -*-
"""medicalsymptoms1.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1uRT7zfEMnu-tq74GyZoUUtAb-In4XtX8
"""
import pandas as pd
import re
import spacy
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report
from sklearn.linear_model import LogisticRegression
# Read the raw symptom/disease records; rows with any missing field are unusable
# downstream, so drop them immediately.
data = pd.read_csv('symptomssingle.csv').dropna()
def separate_symptoms_and_diseases(text):
    """Split one raw record string into ``(symptom_list, disease_name)``.

    The raw 'data' field is a run of ``{"symptoms":"..."}`` JSON-ish fragments
    followed by the disease label. The symptom texts are captured by regex;
    whatever remains after stripping the fragments (and the ``],`` delimiter)
    is taken to be the disease name.
    """
    symptom_pattern = re.compile(r'{"symptoms":"(.*?)"}')
    symptoms = symptom_pattern.findall(text)
    # Remove the whole run of symptom fragments, then the stray '],' separator.
    remainder = re.sub(r'(?:{"symptoms":".*?"},?)+', '', text).strip()
    disease = remainder.replace('],', '').strip()
    return symptoms, disease
# Apply the function to the data
# Each cell becomes a (symptoms_list, disease_str) tuple.
data['symptoms_and_diseases'] = data['data'].apply(separate_symptoms_and_diseases)
# Expand the tuples into two proper columns; reusing the original index keeps
# the new frame aligned row-for-row with `data`.
data[['symptoms', 'disease']] = pd.DataFrame(data['symptoms_and_diseases'].tolist(), index=data.index)
# The raw text and the intermediate tuple column are no longer needed.
data = data.drop(columns=['data', 'symptoms_and_diseases'])
# Load the spaCy model
# Small English pipeline: tokenizer, lemmatizer, stop-word flags used below.
nlp = spacy.load('en_core_web_sm')
# Preprocessing function
def preprocess(symptoms):
    """Normalize a list of symptom phrases into one space-separated string.

    Each phrase is run through the module-level spaCy pipeline ``nlp``;
    stop words and non-alphabetic tokens are dropped, and the remaining
    tokens are replaced by their lowercased lemmas.
    """
    cleaned = [
        ' '.join(
            tok.lemma_.lower()
            for tok in nlp(phrase)
            if not tok.is_stop and tok.is_alpha
        )
        for phrase in symptoms
    ]
    return ' '.join(cleaned)
# Normalize every symptom list into a single text feature per row.
data['symptoms_preprocessed'] = data['symptoms'].apply(preprocess)

# Hold out 20% of the rows for evaluation; fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    data['symptoms_preprocessed'],
    data['disease'],
    test_size=0.2,
    random_state=42,
)

# TF-IDF over unigrams + bigrams feeding a logistic-regression classifier.
pipeline = Pipeline(
    steps=[
        ('tfidf', TfidfVectorizer(ngram_range=(1, 2))),
        ('classifier', LogisticRegression(solver='liblinear', C=10)),
    ]
)

# Fit on the training split, then score the held-out split.
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)
print("Accuracy: ", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))
# NOTE: the original `!pip install joblib` line is IPython/Colab shell syntax
# and is a SyntaxError in a plain .py file; it is removed. joblib is already
# installed alongside scikit-learn, so a plain import suffices.
import joblib

# Persist the trained pipeline (TF-IDF vectorizer + classifier) to disk.
joblib.dump(pipeline, 'DiseasePredictionBasedonSymptoms.joblib')
import joblib

# Restore the persisted pipeline to confirm the saved artifact round-trips.
loaded_pipeline = joblib.load('DiseasePredictionBasedonSymptoms.joblib')

# A sample input must go through the same preprocessing as the training data.
example_phrase = "Skin Rash"
example_features = preprocess([example_phrase])
predicted = loaded_pipeline.predict([example_features])
print("Predicted disease:", predicted[0])