"# Develop algorithm to predict whether a person has diabetes given health factors\n",
"\n",
"## Using Naive Bayes Classifier"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"#Prepare a clean R environment in work space.\n",
"rm(list=ls()) "
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"#Import our csv file data\n",
"data=read.csv(\"pima.csv\",header=TRUE) "
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"#Construct a training data set\n",
"TrainingPct=0.8 #Percent of data to train model on\n",
"TrainingSample=floor(TrainingPct*dim(data)[1]) #Number of observations to train the model on , #dim()[n] = Retrieve or set the n dimension of an object\n",
"TestSample=dim(data)[1]-TrainingSample #Number of observations to test the model on"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"TrainingData=data[1:TrainingSample,] #Get the training data\n",
"Diabetes_categ=unique(TrainingData$diabetes) #Categorize diabetes by taking unique elements of the column diabetes(which is 0 and 1 in this case)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"MeanMat=matrix(0,length(Diabetes_categ),dim(TrainingData)[2]-1) #Initialize matrix for mean values in training sample\n",
"SDMat=matrix(0,length(Diabetes_categ),dim(TrainingData)[2]-1) #Initialize matrix for standard deviations(st dev) in training sample\n",
"MargProb=rep(0,length(Diabetes_categ)) #Initialize vector for marginal probabilities\n"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"for (i in 1:length(Diabetes_categ)){ #Using for loop, loop through whether or not the person has diabetes\n",
"Data_categ=subset(TrainingData,TrainingData$diabetes==Diabetes_categ[i]) #Subset training sample based on whether or not the person has diabetes\n",
" for (j in 1:(dim(Data_categ)[2]-1)){ #Using for loop, loop through obtain mean, st dev, and marginal probability\n",