# Commit 137277c4 by Sanghoon

# Upload New File

# parent bffca245
# Develop an algorithm to predict whether a person has diabetes given health
# factors. The model is a Gaussian naive Bayes classifier: for every class it
# stores the mean and standard deviation of each feature plus the share of
# training rows that belong to the class.

# Prepare a clean R environment in the workspace.
# NOTE(review): this removes every object in the global environment of
# whoever sources the script.
rm(list=ls())
# Use setwd() to move to the folder that contains this script so the relative
# CSV paths resolve. This relies on the RStudio editor context (rstudioapi).
setwd(dirname(rstudioapi::getSourceEditorContext()$path))
# Import the CSV data.
data <- read.csv("pima.csv", header = TRUE)

# Construct the training data set: the first TrainingPct share of the rows,
# taken in file order (no shuffling).
TrainingPct <- 0.8 # Share of the data the model is trained on
TrainingSample <- floor(TrainingPct * nrow(data)) # Number of training rows
TestSample <- nrow(data) - TrainingSample # Number of test rows
TrainingData <- data[seq_len(TrainingSample), ] # Get the training data

# Class labels: the distinct values of the `diabetes` column, in order of
# first appearance. Row i of MeanMat/SDMat and entry i of MargProb describe
# Diabetes_categ[i].
Diabetes_categ <- unique(TrainingData$diabetes)
n_categ <- length(Diabetes_categ)
# Every column except the last is treated as a feature.
# NOTE(review): assumes `diabetes` is the last column of the file - confirm.
feature_cols <- seq_len(ncol(TrainingData) - 1L)

MeanMat <- matrix(0, n_categ, length(feature_cols)) # Per-class feature means
SDMat <- matrix(0, n_categ, length(feature_cols)) # Per-class feature st devs
MargProb <- rep(0, n_categ) # Per-class marginal probabilities
for (i in seq_len(n_categ)) {
  # Rows of the training sample that belong to class i (rows whose label
  # comparison is NA are left out, as subset() would do).
  in_categ <- which(TrainingData$diabetes == Diabetes_categ[i])
  categ_features <- TrainingData[in_categ, feature_cols, drop = FALSE]
  MeanMat[i, ] <- vapply(categ_features, mean, numeric(1))
  SDMat[i, ] <- vapply(categ_features, sd, numeric(1))
  # The marginal probability depends only on the class, so it is computed
  # once per class rather than once per feature.
  MargProb[i] <- length(in_categ) / nrow(TrainingData)
}
# Store the fitted parameters in a list. Categories records which label each
# row of MeanMat/SDMat (and each entry of MargProb) corresponds to.
ProbList <- list(
  MeanMat = MeanMat,
  SDMat = SDMat,
  MargProb = MargProb,
  Categories = Diabetes_categ
)

# Construct the test sample: every row after the training sample.
TestData <- data[(TrainingSample + 1):nrow(data), ]
TestVec <- TestData[1, ] # Placeholder; reassigned for each test row later on
# One row per test observation: one score per class, then the assigned class.
AssignedMat <- matrix(0, nrow(TestData), n_categ + 1L)
#Construct a function NB classifier
# pima_fn: classify one observation with a Gaussian naive Bayes model.
#
# Arguments:
#   TestVec  - one observation, either a one-row data frame or a vector. Its
#              leading entries are the feature values, in the same order as
#              the columns of ProbList$MeanMat. Entries beyond
#              ncol(ProbList$MeanMat), such as a trailing label column, are
#              ignored.
#   ProbList - list with MeanMat and SDMat (classes x features matrices of
#              per-class means and standard deviations), MargProb (per-class
#              marginal probabilities) and, optionally, Categories (the class
#              label belonging to each row). When Categories is absent the
#              labels are read from the global Diabetes_categ.
#
# Returns a list with
#   AssignedVec       - one score per class: the product of the normal
#                       densities of the features times the class's marginal
#                       probability.
#   AssignedCondition - the label of the class with the largest score (NA if
#                       no score can be ranked).
pima_fn <- function(TestVec, ProbList) {
  # Bring in the fitted parameters - mean, st dev, and marginal probability.
  MeanMat <- ProbList$MeanMat
  SDMat <- ProbList$SDMat
  MargProb <- ProbList$MargProb

  # Prefer labels stored with the model; fall back to the script-level
  # Diabetes_categ for models that do not carry them.
  categories <- ProbList[["Categories"]]
  if (is.null(categories)) {
    categories <- Diabetes_categ
  }

  # The number of features is fixed by the fitted model, not by the length of
  # TestVec. Deriving it from length(TestVec) dropped the last predictor
  # whenever the caller passed the features without a label column.
  n_features <- min(length(TestVec), ncol(MeanMat))
  feature_idx <- seq_len(n_features)
  feature_vals <- vapply(
    feature_idx,
    function(j) as.numeric(TestVec[j]),
    numeric(1)
  )

  # Score of each class: normal density of every feature under that class,
  # multiplied together and by the class's marginal probability.
  Probs <- vapply(
    seq_along(MargProb),
    function(k) {
      densities <- dnorm(feature_vals, MeanMat[k, feature_idx], SDMat[k, feature_idx])
      prod(c(densities, MargProb[k]))
    },
    numeric(1)
  )

  ind <- which.max(Probs) # Find which class has the highest score
  if (length(ind) == 0L) {
    # which.max() returns nothing when every score is NA; report NA instead
    # of a zero-length label.
    ind <- NA_integer_
  }

  # Scores stay numeric and the label keeps its own type (they are not
  # combined into a single vector, which would coerce one to the other).
  list(AssignedVec = Probs, AssignedCondition = categories[ind])
}
# Run the NB classifier on every observation of the test sample.
for (i in seq_len(nrow(TestData))) {
  # Pass the features only: every column except the last one.
  TestVec <- TestData[i, seq_len(ncol(TestData) - 1L)]
  result <- pima_fn(TestVec, ProbList)
  # Row layout: the class scores followed by the assigned class.
  AssignedMat[i, ] <- c(as.numeric(result$AssignedVec), result$AssignedCondition)
}

# Compare the true labels with the assigned ones. The assigned class is the
# last column of AssignedMat.
CheckMat <- data.frame(cbind(TestData$diabetes, AssignedMat[, ncol(AssignedMat)]))
colnames(CheckMat) <- c("Actual", "Assigned")
# Share of test rows classified correctly. Despite the "Percent" wording this
# is a fraction between 0 and 1, not a value between 0 and 100.
Pct_Accuracy <- sum(CheckMat$Actual == CheckMat$Assigned) / nrow(TestData)
print("Classifier Percent Accuracy")
print(Pct_Accuracy)

# Execute the function on a sample data set to predict likelihood of diabetes.
Example <- read.csv(file = "Example_Diabetes.csv", header = TRUE)
Ex1 <- pima_fn(Example[1, ], ProbList)
Ex2 <- pima_fn(Example[2, ], ProbList)
# Markdown is supported
# 0% or
# You are about to add 0 people to the discussion. Proceed with caution.
# Finish editing this message first!
# Please register or to comment