Commit ff61d903 by Sanghoon

Upload New File

parent 0d436897
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Develop algorithm to predict whether a person has diabetes given health factors\n",
"\n",
"## Using Naive Bayes Classifier"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"#Prepare a clean R environment in work space.\n",
"rm(list=ls()) "
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"#Import our csv file data\n",
"data=read.csv(\"pima.csv\",header=TRUE) "
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"#Construct a training data set\n",
"TrainingPct=0.8 #Percent of data to train model on\n",
"TrainingSample=floor(TrainingPct*dim(data)[1]) #Number of observations to train the model on , #dim()[n] = Retrieve or set the n dimension of an object\n",
"TestSample=dim(data)[1]-TrainingSample #Number of observations to test the model on"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"TrainingData=data[1:TrainingSample,] #Get the training data\n",
"Diabetes_categ=unique(TrainingData$diabetes) #Categorize diabetes by taking unique elements of the column diabetes(which is 0 and 1 in this case)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"MeanMat=matrix(0,length(Diabetes_categ),dim(TrainingData)[2]-1) #Initialize matrix for mean values in training sample\n",
"SDMat=matrix(0,length(Diabetes_categ),dim(TrainingData)[2]-1) #Initialize matrix for standard deviations(st dev) in training sample\n",
"MargProb=rep(0,length(Diabetes_categ)) #Initialize vector for marginal probabilities\n"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"for (i in 1:length(Diabetes_categ)){ #Using for loop, loop through whether or not the person has diabetes\n",
"Data_categ=subset(TrainingData,TrainingData$diabetes==Diabetes_categ[i]) #Subset training sample based on whether or not the person has diabetes\n",
" for (j in 1:(dim(Data_categ)[2]-1)){ #Using for loop, loop through obtain mean, st dev, and marginal probability\n",
" mean_val=mean(Data_categ[,j]) #Calculates mean\n",
" sd_val=sd(Data_categ[,j]) #Calculates st dev\n",
" MeanMat[i,j]=mean_val\n",
" SDMat[i,j]=sd_val\n",
" MargProb[i]=dim(Data_categ)[1]/dim(TrainingData)[1] #Calculates marginal probability\n",
"}\n",
"}"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"ProbList=list(MeanMat=MeanMat,SDMat=SDMat,MargProb=MargProb) #Stores the training data (mean, sd, marg prob in a list)\n"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"#Construct a test sample\n",
"TestData=data[(TrainingSample+1):dim(data)[1],] #Select all except the training sample from the data\n",
"TestVec=TestData[1,]\n",
"AssignedMat=matrix(0,dim(TestData)[1],3)\n"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"#Construct a function NB classifier\n",
"pima_fn<-function(TestVec,ProbList){\n",
" \n",
" #Bring in training data as separate matrices and vectors - mean, st dev, and marg prob\n",
" MeanMat=ProbList$MeanMat\n",
" SDMat=ProbList$SDMat\n",
" MargProb=ProbList$MargProb\n",
" ProbTestMat=matrix(0,length(MargProb),length(TestVec))\n",
" \n",
" for (j in 1:length(TestVec)){ #Loop through the different elements of the patient (various variables)\n",
" \n",
" for (k in 1:length(ProbList$MargProb)){ #Loop through the options as to whether or not the patient has diabetes\n",
" \n",
" if (j<length(TestVec))\n",
" {ProbTestMat[k,j]=dnorm(as.numeric(TestVec[j]),MeanMat[k,j],SDMat[k,j])} #Calculate the normal density value\n",
" else\n",
" {ProbTestMat[k,j]=MargProb[k]} #Calculate marg prob\n",
" }\n",
" Probs=apply(ProbTestMat,1,prod) #Calculate the product across probabilities\n",
" ind=which.max(Probs) #Find which probability is higher\n",
" AssignedVec=c(Probs,Diabetes_categ[ind]) \n",
" }\n",
" return(list(AssignedVec=AssignedVec[1:2],AssignedCondition=AssignedVec[3])) #Elements returned as a list.\n",
"}"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"#Load NB classifier\n",
"\n",
"for (i in 1:dim(TestData)[1]){\n",
" \n",
" TestVec=TestData[i,1:(dim(TestData)[2]-1)]\n",
" result<-pima_fn(TestVec,ProbList)\n",
" AssignedMat[i,]=c(as.numeric(result$AssignedVec),result$AssignedCondition)\n",
"}\n"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"CheckMat=data.frame(cbind(TestData$diabetes,AssignedMat[,3]))\n",
"colnames(CheckMat)=c(\"Actual\",\"Assigned\")\n",
"Pct_Accuracy=sum(CheckMat$Actual==CheckMat$Assigned)/dim(TestData)[1] #Computes the percent accuracy\n"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[1] \"Classifier Percent Accuracy\"\n",
"[1] 0.721519\n"
]
}
],
"source": [
"print(\"Classifier Percent Accuracy\") #Print our accuracy as percent value.\n",
"print(Pct_Accuracy)\n"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
"#Executing function in a sample data set to predict likelihood of diabetes\n",
"Example=read.csv(file=\"Example_Diabetes.csv\",header=TRUE)\n",
"Ex1<-pima_fn(Example[1,],ProbList)\n",
"Ex2<-pima_fn(Example[2,],ProbList)\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "R",
"language": "R",
"name": "ir"
},
"language_info": {
"codemirror_mode": "r",
"file_extension": ".r",
"mimetype": "text/x-r-source",
"name": "R",
"pygments_lexer": "r",
"version": "3.4.1"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment