Upload New File

137277c4 · Sanghoon · bffca245 · 137277c4
Commit 137277c4 authored May 03, 2022 by Sanghoon
Show whitespace changes
Inline Side-by-side

Showing with 88 additions and 0 deletions

pima_code.R ...n Naive Bayes Classification/Diabetes Example/pima_code.R +88 -0

No files found.
--- a/Crash Course on Naive Bayes Classification/Diabetes Example/pima_code.R
+++ b/Crash Course on Naive Bayes Classification/Diabetes Example/pima_code.R
+# Develop an algorithm to predict whether a person has diabetes given health
+# factors (Naive Bayes with normal densities, see pima_fn below).
+
+# Prepare a clean R environment in the workspace so stale objects cannot leak
+# into the analysis.
+rm(list = ls())
+
+# Make the folder of this script the working directory so the relative CSV
+# paths below resolve. The RStudio API only exists inside an RStudio session,
+# so the call is guarded: when the script is run elsewhere (e.g. via Rscript)
+# the current working directory is kept and must already contain the data.
+if (requireNamespace("rstudioapi", quietly = TRUE) && rstudioapi::isAvailable()) {
+  setwd(dirname(rstudioapi::getSourceEditorContext()$path))
+}
+
+# Import the csv data. The code further down reads a `diabetes` column from it
+# and treats every column except the last one as a predictor.
+data <- read.csv("pima.csv", header = TRUE)
+
+# Construct a training data set
+TrainingPct <- 0.8  # Fraction of the rows used to train the model
+TrainingSample <- floor(TrainingPct * nrow(data))  # Number of observations to train on
+TestSample <- nrow(data) - TrainingSample  # Number of observations held out for testing
+
+# The first TrainingSample rows (file order, no shuffling) form the training
+# set. seq_len() keeps the selection empty when TrainingSample is 0, whereas
+# 1:0 would count down and pick row 1.
+TrainingData <- data[seq_len(TrainingSample), ]
+
+# Distinct values of the `diabetes` column among the training rows
+Diabetes_categ <- unique(TrainingData$diabetes)
+
+# Every column except the last is treated as a predictor (this assumes the
+# label column `diabetes` is the last column of the csv -- TODO confirm).
+n_features <- ncol(TrainingData) - 1
+
+MeanMat <- matrix(0, length(Diabetes_categ), n_features)  # Per-class predictor means
+SDMat <- matrix(0, length(Diabetes_categ), n_features)  # Per-class predictor standard deviations
+MargProb <- rep(0, length(Diabetes_categ))  # Per-class marginal probabilities
+
+for (i in seq_along(Diabetes_categ)) {
+  # Training rows whose label equals the i-th category
+  Data_categ <- subset(TrainingData, TrainingData$diabetes == Diabetes_categ[i])
+  features <- Data_categ[seq_len(n_features)]
+
+  # Row i of each matrix holds the statistics of category i, one column per predictor
+  MeanMat[i, ] <- vapply(features, mean, numeric(1))
+  SDMat[i, ] <- vapply(features, sd, numeric(1))
+
+  # The marginal probability depends only on the category, so it is computed
+  # once per category instead of once per predictor.
+  MargProb[i] <- nrow(Data_categ) / nrow(TrainingData)
+}
+
+# Store the fitted parameters in one list. `Categories` keeps the labels in the
+# same order as the rows of MeanMat/SDMat and the entries of MargProb.
+ProbList <- list(
+  MeanMat = MeanMat,
+  SDMat = SDMat,
+  MargProb = MargProb,
+  Categories = Diabetes_categ
+)
+
+# Build the hold-out (test) sample
+TestData <- data[seq(TrainingSample + 1, nrow(data)), ]  # Every row after the training rows
+TestVec <- TestData[1, ]  # First test observation (all columns)
+AssignedMat <- matrix(0, nrow = nrow(TestData), ncol = 3)  # One row per test observation, three result columns
+
+# Construct a function NB classifier
+#
+# pima_fn(): classify one observation with the fitted Naive Bayes parameters.
+#
+# Arguments:
+#   TestVec  - one observation: a one-row data frame or a vector whose leading
+#              elements are the predictors, in the column order of
+#              ProbList$MeanMat. Trailing extra elements (for example a label
+#              column) are ignored.
+#   ProbList - list with MeanMat and SDMat (one row per class, one column per
+#              predictor) and MargProb (one marginal probability per class).
+#              An optional Categories element supplies the class labels; when
+#              it is absent the global Diabetes_categ is used instead.
+#
+# Returns a list with
+#   AssignedVec       - one score per class: the marginal probability times the
+#                       product of the normal densities of the predictors
+#   AssignedCondition - label of the class with the highest score
+#
+# Stops with an error when TestVec holds fewer values than the model has
+# predictors.
+pima_fn <- function(TestVec, ProbList) {
+
+  # Bring in the fitted parameters - mean, st dev, and marg prob
+  MeanMat <- ProbList$MeanMat
+  SDMat <- ProbList$SDMat
+  MargProb <- ProbList$MargProb
+
+  # The number of predictors comes from the model, not from the length of the
+  # input: deriving it from length(TestVec) - 1 silently dropped the last
+  # predictor whenever the caller passed predictors only.
+  n_features <- ncol(MeanMat)
+  if (length(TestVec) < n_features) {
+    stop("TestVec must contain at least ", n_features, " predictor values",
+         call. = FALSE)
+  }
+
+  # Prefer labels carried by the model; fall back to the script-level vector
+  # so parameter lists without a Categories element keep working.
+  classes <- ProbList[["Categories"]]
+  if (is.null(classes)) {
+    classes <- Diabetes_categ
+  }
+
+  # Flatten the observation (a one-row data frame is a list of columns) into a
+  # plain numeric vector of predictor values.
+  x <- as.numeric(unlist(TestVec[seq_len(n_features)], use.names = FALSE))
+
+  # Score of class k: marginal probability times the product of the normal
+  # density of every predictor under the class-k mean and st dev.
+  Probs <- vapply(
+    seq_along(MargProb),
+    function(k) MargProb[k] * prod(dnorm(x, MeanMat[k, ], SDMat[k, ])),
+    numeric(1)
+  )
+
+  ind <- which.max(Probs)  # Index of the class with the highest score
+
+  list(AssignedVec = Probs, AssignedCondition = classes[ind])
+}
+
+# Run the NB classifier on every observation of the test sample
+
+for (i in seq_len(nrow(TestData))) {
+
+  # Test row i without its last column
+  TestVec <- TestData[i, 1:(ncol(TestData) - 1)]
+  result <- pima_fn(TestVec, ProbList)
+  # Row i of AssignedMat: the values of AssignedVec followed by the assigned condition
+  AssignedMat[i, ] <- c(as.numeric(result$AssignedVec), result$AssignedCondition)
+}
+
+# Put the recorded condition next to the assigned one
+CheckMat <- setNames(
+  data.frame(cbind(TestData$diabetes, AssignedMat[, 3])),
+  c("Actual", "Assigned")
+)
+is_match <- CheckMat$Actual == CheckMat$Assigned
+Pct_Accuracy <- sum(is_match) / nrow(TestData)  # Share of matching rows, as a fraction (not multiplied by 100)
+
+print("Classifier Percent Accuracy")  # Label for the value printed next
+print(Pct_Accuracy)
+
+# Apply the classifier to the first two rows of a separate example file
+Example <- read.csv(file = "Example_Diabetes.csv", header = TRUE)
+Ex1 <- pima_fn(Example[1, ], ProbList)  # Result for example row 1
+Ex2 <- pima_fn(Example[2, ], ProbList)  # Result for example row 2
+