File size: 2,863 Bytes
f7eb132
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
# -*- coding: utf-8 -*-
"""medicalsymptoms1.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1uRT7zfEMnu-tq74GyZoUUtAb-In4XtX8
"""

import pandas as pd
import re
import spacy
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report
from sklearn.linear_model import LogisticRegression

# Read the raw symptom CSV and immediately discard any rows containing
# missing values so the parsing step below never sees NaN.
data = pd.read_csv('symptomssingle.csv').dropna()

def separate_symptoms_and_diseases(text):
    """Split one raw CSV row into (list of symptom strings, disease label).

    Each row embeds symptoms as ``{"symptoms":"..."}`` fragments followed by
    the disease name; the fragments are captured, then stripped out (together
    with a stray ``],`` separator) to leave the bare label.
    """
    symptom_list = re.findall(r'{"symptoms":"(.*?)"}', text)
    remainder = re.sub(r'(?:{"symptoms":".*?"},?)+', '', text).strip()
    # Drop the leftover '],' list terminator from the label text.
    label = remainder.replace('],', '').strip()
    return symptom_list, label

# Parse every raw row into a (symptoms, disease) tuple, expand the tuples
# into two proper columns, then discard the intermediate columns.
data['symptoms_and_diseases'] = data['data'].apply(separate_symptoms_and_diseases)
parsed = pd.DataFrame(data['symptoms_and_diseases'].tolist(), index=data.index)
data['symptoms'] = parsed[0]
data['disease'] = parsed[1]
data = data.drop(columns=['data', 'symptoms_and_diseases'])

# English spaCy pipeline used below for lemmatization and stop-word tagging.
nlp = spacy.load('en_core_web_sm')

def preprocess(symptoms):
    """Normalize a list of symptom phrases into one space-joined text string.

    Each phrase is run through the module-level spaCy pipeline ``nlp``;
    stop words and non-alphabetic tokens are dropped and the surviving
    tokens are replaced by their lowercased lemmas.
    """
    cleaned = []
    for phrase in symptoms:
        tokens = nlp(phrase)
        kept = [tok.lemma_.lower() for tok in tokens if tok.is_alpha and not tok.is_stop]
        cleaned.append(' '.join(kept))
    return ' '.join(cleaned)

# Normalize every symptom list into a single whitespace-joined text field.
data['symptoms_preprocessed'] = data['symptoms'].apply(preprocess)


# Hold out 20% of the rows for evaluation; fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    data['symptoms_preprocessed'],
    data['disease'],
    test_size=0.2,
    random_state=42,
)

# Text-classification pipeline: TF-IDF over unigrams + bigrams feeding a
# liblinear logistic regression (C=10, i.e. fairly weak regularization).
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(ngram_range=(1, 2))),
    ('classifier', LogisticRegression(solver='liblinear', C=10)),
])

# Fit on the training split.
pipeline.fit(X_train, y_train)

# Score the held-out split and report per-class metrics.
predictions = pipeline.predict(X_test)
print("Accuracy: ", accuracy_score(y_test, predictions))
print("Classification Report:\n", classification_report(y_test, predictions))

# NOTE(review): the notebook original ran "!pip install joblib" here.
# That IPython shell escape is a SyntaxError in a plain .py file, and the
# install is unnecessary anyway — joblib is a scikit-learn dependency, so
# a plain import is all that is needed.
import joblib

# Persist the fitted pipeline so it can be reloaded without retraining.
joblib.dump(pipeline, 'DiseasePredictionBasedonSymptoms.joblib')

import joblib

# Restore the trained pipeline that was persisted above.
loaded_pipeline = joblib.load('DiseasePredictionBasedonSymptoms.joblib')

# Example inference: push one raw symptom phrase through the same
# preprocessing used at training time, then classify it.
example_text = "Skin Rash"
example_features = preprocess([example_text])
predicted = loaded_pipeline.predict([example_features])

print("Predicted disease:", predicted[0])