Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
tutorials
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
6
Issues
6
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Data Science Dojo
tutorials
Commits
137277c4
Commit
137277c4
authored
2 years ago
by
Sanghoon
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
Upload New File
parent
bffca245
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
88 additions
and
0 deletions
+88
-0
pima_code.R
...n Naive Bayes Classification/Diabetes Example/pima_code.R
+88
-0
No files found.
Crash Course on Naive Bayes Classification/Diabetes Example/pima_code.R
0 → 100644
View file @
137277c4
# Develop an algorithm to predict whether a person has diabetes given health
# factors. The model is a Gaussian naive Bayes classifier fitted by hand:
# per-class feature means, standard deviations and class frequencies.

# Prepare a clean R environment in the workspace.
# NOTE(review): rm(list = ls()) deletes every object in the caller's
# workspace. Kept to preserve the script's behavior; prefer running the
# script in a fresh R session instead.
rm(list = ls())

# Use setwd() to make the folder containing this script the working
# directory, so the CSV files below can be read by bare file name. The path
# comes from the RStudio source editor (rstudioapi).
# NOTE(review): the later read.csv() calls rely on this working directory.
setwd(dirname(rstudioapi::getSourceEditorContext()$path))

# Import our csv file data.
data <- read.csv("pima.csv", header = TRUE)

# Construct a training data set ----
TrainingPct <- 0.8 # Fraction of the data to train the model on
# Number of observations to train the model on
TrainingSample <- floor(TrainingPct * nrow(data))
# Number of observations to test the model on
TestSample <- nrow(data) - TrainingSample
# The first TrainingSample rows form the training data. seq_len() keeps the
# selection empty when TrainingSample is 0 (1:0 would select row 1).
TrainingData <- data[seq_len(TrainingSample), ]

# Class labels: the unique values of the `diabetes` column.
Diabetes_categ <- unique(TrainingData$diabetes)

# One row per class, one column per feature. Features are taken to be columns
# 1..(ncol - 1), i.e. every column except the last one.
MeanMat <- matrix(0, length(Diabetes_categ), ncol(TrainingData) - 1)
SDMat <- matrix(0, length(Diabetes_categ), ncol(TrainingData) - 1)
# Marginal (prior) probability of each class.
MargProb <- rep(0, length(Diabetes_categ))

for (i in seq_along(Diabetes_categ)) {
  # Subset the training sample to the observations of class i.
  Data_categ <- subset(
    TrainingData,
    TrainingData$diabetes == Diabetes_categ[i]
  )

  # The class frequency does not depend on the feature, so compute it once
  # per class rather than once per feature.
  MargProb[i] <- nrow(Data_categ) / nrow(TrainingData)

  # Per-feature mean and standard deviation within the class.
  for (j in seq_len(ncol(Data_categ) - 1)) {
    MeanMat[i, j] <- mean(Data_categ[, j])
    SDMat[i, j] <- sd(Data_categ[, j])
  }
}

# Store the fitted parameters (mean, sd, marginal probability) in a list.
ProbList <- list(MeanMat = MeanMat, SDMat = SDMat, MargProb = MargProb)

# Construct a test sample ----
# Every row after the training rows. Built from seq_len() so that an empty
# test set yields zero rows instead of the rows picked by (n + 1):n.
TestData <- data[TrainingSample + seq_len(TestSample), ]
TestVec <- TestData[1, ]
# One row per test observation: three columns, which the evaluation loop
# fills with two class scores followed by the assigned label.
AssignedMat <- matrix(0, nrow(TestData), 3)
#' Classify one observation with the fitted Gaussian naive Bayes model
#'
#' For every class the score is the class's marginal probability multiplied by
#' the normal density of each feature value under that class's mean and
#' standard deviation. The class with the highest score is assigned.
#'
#' @param TestVec The observation to classify: a numeric vector, list or
#'   one-row data frame. It may hold either exactly the feature values (one
#'   per column of `ProbList$MeanMat`) or the feature values followed by one
#'   trailing element (e.g. the class-label column), which is ignored.
#' @param ProbList A list with elements `MeanMat` and `SDMat` (classes in
#'   rows, features in columns) and `MargProb` (one prior per class).
#' @param categories Class labels, in the same order as the rows of
#'   `ProbList$MeanMat`. Defaults to the script-level `Diabetes_categ`, which
#'   is what the function previously read implicitly.
#' @return A list with `AssignedVec`, the numeric (unnormalised) score of each
#'   class, and `AssignedCondition`, the label of the highest-scoring class.
pima_fn <- function(TestVec, ProbList, categories = Diabetes_categ) {
  # Bring in the fitted parameters as separate matrices and vectors.
  MeanMat <- ProbList$MeanMat
  SDMat <- ProbList$SDMat
  MargProb <- ProbList$MargProb

  n_features <- ncol(MeanMat)
  n_classes <- length(MargProb)

  # Accept the features alone or the features plus one trailing element.
  # Previously the last element was always treated as a placeholder, so a
  # features-only input silently lost its last feature.
  if (!(length(TestVec) %in% c(n_features, n_features + 1L))) {
    stop(
      "TestVec must have ", n_features, " feature values (optionally ",
      "followed by one trailing element), not ", length(TestVec), ".",
      call. = FALSE
    )
  }

  # [[ ]] extracts a single value from vectors, lists and one-row data frames.
  feature_vals <- vapply(
    seq_len(n_features),
    function(j) as.numeric(TestVec[[j]]),
    numeric(1)
  )

  # Row k holds the normal density of every feature under class k, with the
  # class's marginal probability in the final column.
  ProbTestMat <- matrix(0, n_classes, n_features + 1L)
  for (k in seq_len(n_classes)) {
    ProbTestMat[k, seq_len(n_features)] <-
      dnorm(feature_vals, MeanMat[k, ], SDMat[k, ])
  }
  ProbTestMat[, n_features + 1L] <- MargProb

  # Product across each row gives the score of each class. This is computed
  # once, after the matrix is complete, instead of on every loop iteration.
  Probs <- apply(ProbTestMat, 1, prod)
  ind <- which.max(Probs) # Index of the highest-scoring class

  # Return scores and label separately so the scores stay numeric and the
  # result is not limited to exactly two classes.
  list(AssignedVec = Probs, AssignedCondition = categories[ind])
}
# Run the NB classifier on every observation of the test sample ----
# seq_len() makes the loop a no-op for an empty test set; 1:0 would iterate
# over c(1, 0) and fail.
for (i in seq_len(nrow(TestData))) {
  # Pass every column except the last one to the classifier.
  TestVec <- TestData[i, seq_len(ncol(TestData) - 1)]
  result <- pima_fn(TestVec, ProbList)
  # Row i: the class scores followed by the assigned label.
  AssignedMat[i, ] <- c(
    as.numeric(result$AssignedVec),
    result$AssignedCondition
  )
}

# Compare the actual labels with the assigned labels (third column).
CheckMat <- data.frame(cbind(TestData$diabetes, AssignedMat[, 3]))
colnames(CheckMat) <- c("Actual", "Assigned")

# Share of test observations classified correctly. This is a fraction
# between 0 and 1 (it is not multiplied by 100).
Pct_Accuracy <- sum(CheckMat$Actual == CheckMat$Assigned) / nrow(TestData)
print("Classifier Percent Accuracy")
print(Pct_Accuracy)

# Execute the classifier on a sample data set to predict the likelihood of
# diabetes for the first two rows of the example file.
Example <- read.csv(file = "Example_Diabetes.csv", header = TRUE)
Ex1 <- pima_fn(Example[1, ], ProbList)
Ex2 <- pima_fn(Example[2, ], ProbList)
This diff is collapsed.
Click to expand it.
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment