Upload New File

ff61d903 · Sanghoon · 0d436897 · ff61d903
Commit ff61d903 authored May 04, 2022 by Sanghoon
Show whitespace changes
Inline Side-by-side

Showing with 212 additions and 0 deletions

Pima_R_code_notebook.ipynb ...lassification/Diabetes Example/Pima_R_code_notebook.ipynb +212 -0

No files found.
--- a/Crash Course on Naive Bayes Classification/Diabetes Example/Pima_R_code_notebook.ipynb
+++ b/Crash Course on Naive Bayes Classification/Diabetes Example/Pima_R_code_notebook.ipynb
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Develop an algorithm to predict whether a person has diabetes given health factors\n",
+ "\n",
+ "## Using Naive Bayes Classifier"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Start from an empty workspace: remove every object that ls() reports.\n",
+ "rm(list = ls())"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Read pima.csv from the working directory; header = TRUE takes column names from the first row\n",
+ "data <- read.csv(\"pima.csv\", header = TRUE)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Sizes of the training and test partitions\n",
+ "TrainingPct <- 0.8  # Fraction of the rows used to train the model\n",
+ "TrainingSample <- floor(TrainingPct * nrow(data))  # Number of training rows, rounded down\n",
+ "TestSample <- nrow(data) - TrainingSample  # All remaining rows are used for testing"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# The first TrainingSample rows, in file order, form the training set\n",
+ "TrainingData <- data[1:TrainingSample, ]\n",
+ "# Distinct values of the diabetes column seen in the training set (expected to be 0 and 1 here)\n",
+ "Diabetes_categ <- unique(TrainingData$diabetes)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Pre-allocate the parameter tables: one row per class, one column per feature.\n",
+ "# The feature count is taken as the number of columns minus one (assumes a single label column).\n",
+ "MeanMat <- matrix(0, nrow = length(Diabetes_categ), ncol = ncol(TrainingData) - 1)  # Per-class feature means\n",
+ "SDMat <- matrix(0, nrow = length(Diabetes_categ), ncol = ncol(TrainingData) - 1)  # Per-class feature standard deviations\n",
+ "MargProb <- numeric(length(Diabetes_categ))  # Marginal (prior) probability of each class\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Estimate the per-class mean and standard deviation of every feature, plus the class priors\n",
+ "for (i in seq_along(Diabetes_categ)) {\n",
+ "  # Training rows whose diabetes value equals the i-th class\n",
+ "  Data_categ <- subset(TrainingData, TrainingData$diabetes == Diabetes_categ[i])\n",
+ "  # Class prior: share of training rows in this class (needed once per class, not once per feature)\n",
+ "  MargProb[i] <- nrow(Data_categ) / nrow(TrainingData)\n",
+ "  # Columns 1 .. (ncol - 1) are treated as features; seq_len() yields an empty loop if there are none\n",
+ "  for (j in seq_len(ncol(Data_categ) - 1)) {\n",
+ "    MeanMat[i, j] <- mean(Data_categ[, j])  # Mean of feature j within the class\n",
+ "    SDMat[i, j] <- sd(Data_categ[, j])  # Standard deviation of feature j within the class\n",
+ "  }\n",
+ "}"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Bundle the fitted parameters (means, standard deviations, class priors) into a single list\n",
+ "ProbList <- list(MeanMat = MeanMat, SDMat = SDMat, MargProb = MargProb)\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Every row after the training sample is held out as the test set\n",
+ "TestData <- data[(TrainingSample + 1):nrow(data), ]\n",
+ "TestVec <- TestData[1, ]  # First test row; reassigned for each row in the classification loop\n",
+ "# One row per test observation with three columns, filled in by the classification loop\n",
+ "AssignedMat <- matrix(0, nrow = nrow(TestData), ncol = 3)\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Gaussian Naive Bayes classifier for a single observation.\n",
+ "#\n",
+ "# TestVec:  one observation (a one-row data frame or a vector). Its leading entries are the\n",
+ "#           feature values, in the same column order as the columns of ProbList$MeanMat.\n",
+ "#           Entries beyond the number of fitted features (e.g. a trailing label) are ignored.\n",
+ "# ProbList: list holding MeanMat and SDMat (one row per class, one column per feature) and\n",
+ "#           MargProb (one prior probability per class).\n",
+ "#\n",
+ "# Returns a list with\n",
+ "#   AssignedVec:       the unnormalised score of each class (prior times the product of the\n",
+ "#                      Normal densities of the features), and\n",
+ "#   AssignedCondition: the label of the highest-scoring class, looked up in the global\n",
+ "#                      Diabetes_categ (which must be ordered like the rows of MeanMat).\n",
+ "pima_fn <- function(TestVec, ProbList) {\n",
+ "  MeanMat <- ProbList$MeanMat\n",
+ "  SDMat <- ProbList$SDMat\n",
+ "  MargProb <- ProbList$MargProb\n",
+ "\n",
+ "  # Every fitted feature enters the score; the prior is multiplied in separately so it can\n",
+ "  # never displace a feature when TestVec carries no label column.\n",
+ "  n_features <- ncol(MeanMat)\n",
+ "  if (length(TestVec) < n_features) {\n",
+ "    stop(\"TestVec must supply at least \", n_features, \" feature values\", call. = FALSE)\n",
+ "  }\n",
+ "  features <- as.numeric(unlist(TestVec[seq_len(n_features)], use.names = FALSE))\n",
+ "\n",
+ "  # Score of class k = prior of k * product over features of dnorm(value, class mean, class sd)\n",
+ "  Probs <- vapply(seq_along(MargProb), function(k) {\n",
+ "    MargProb[k] * prod(dnorm(features, MeanMat[k, ], SDMat[k, ]))\n",
+ "  }, numeric(1))\n",
+ "\n",
+ "  ind <- which.max(Probs)  # Index of the highest-scoring class\n",
+ "  # Scores and label are combined with c() first so the returned elements keep the same\n",
+ "  # type they had when they were sliced out of one vector.\n",
+ "  AssignedVec <- c(Probs, Diabetes_categ[ind])\n",
+ "  n_classes <- length(Probs)\n",
+ "  list(AssignedVec = AssignedVec[seq_len(n_classes)], AssignedCondition = AssignedVec[n_classes + 1])\n",
+ "}"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Apply the NB classifier to every test observation\n",
+ "for (i in seq_len(nrow(TestData))) {  # seq_len() gives an empty loop when there are no test rows\n",
+ "  # Values of test row i with the last column left out\n",
+ "  TestVec <- TestData[i, seq_len(ncol(TestData) - 1)]\n",
+ "  result <- pima_fn(TestVec, ProbList)\n",
+ "  # Store the class scores followed by the assigned label\n",
+ "  AssignedMat[i, ] <- c(as.numeric(result$AssignedVec), result$AssignedCondition)\n",
+ "}\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Put each true label next to the label stored in the third column of AssignedMat\n",
+ "CheckMat <- data.frame(cbind(TestData$diabetes, AssignedMat[, 3]))\n",
+ "colnames(CheckMat) <- c(\"Actual\", \"Assigned\")\n",
+ "# Share of test rows whose assigned label matches the true one (a proportion, not scaled to 0-100)\n",
+ "Pct_Accuracy <- sum(CheckMat$Actual == CheckMat$Assigned) / nrow(TestData)\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[1] \"Classifier Percent Accuracy\"\n",
+ "[1] 0.721519\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Report the accuracy; the number printed is a proportion of the test rows, not a 0-100 percentage\n",
+ "print(\"Classifier Percent Accuracy\")\n",
+ "print(Pct_Accuracy)\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Run the classifier on the first two rows of a separate example file\n",
+ "Example <- read.csv(file = \"Example_Diabetes.csv\", header = TRUE)\n",
+ "Ex1 <- pima_fn(Example[1, ], ProbList)\n",
+ "Ex2 <- pima_fn(Example[2, ], ProbList)\n"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "R",
+ "language": "R",
+ "name": "ir"
+ },
+ "language_info": {
+ "codemirror_mode": "r",
+ "file_extension": ".r",
+ "mimetype": "text/x-r-source",
+ "name": "R",
+ "pygments_lexer": "r",
+ "version": "3.4.1"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}