Upload New File

b2293e72 · Sanghoon · c97d2992 · b2293e72
Commit b2293e72 authored May 04, 2022 by Sanghoon
Hide whitespace changes
Inline Side-by-side

Showing with 241 additions and 0 deletions

VotingModel_R_code_notebook.ipynb ...fication/Voting Example/VotingModel_R_code_notebook.ipynb +241 -0

No files found.
--- a/Crash Course on Naive Bayes Classification/Voting Example/VotingModel_R_code_notebook.ipynb
+++ b/Crash Course on Naive Bayes Classification/Voting Example/VotingModel_R_code_notebook.ipynb
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Can these models be used to predict how lawmakers may vote?\n",
+    "\n",
+    "## Using Naive Bayes Classifier"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#Clear the workspace so the notebook starts from a clean R environment.\n",
+    "rm(list=ls())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#Import our csv file data\n",
+    "data=read.csv(\"VotingData.csv\",header=TRUE) #Load data\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#Construct training data set\n",
+    "TrainingPct=0.6 #Fraction of the data (0.6 = 60%) to train the model on\n",
+    "TrainingSample=floor(TrainingPct*dim(data)[1])  #Number of observations to train the model on\n",
+    "TestSample=dim(data)[1]-TrainingSample #Number of observations to test the model on\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "TrainingData=data[1:TrainingSample,]  #Get the training data\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#Find probabilities associated with democrat voting\n",
+    "DemData=subset(TrainingData,TrainingData$Party==\"democrat\")\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#Store all probabilities in a matrix (2 rows: democrat, then republican; one column per vote plus a final column for the marginal probability)\n",
+    "ProbMat=matrix(0,2,(dim(DemData)[2]-3+1+1))\n",
+    "\n",
+    "m=2 #Equivalent sample size for Laplacian correction\n",
+    "p=1/2  #Prior probability for Laplacian correction\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "for (j in 3:dim(DemData)[2])\n",
+    "{\n",
+    "  ProbMat[1,j-2]=(sum(DemData[,j]==\"y\")+m*p)/(dim(DemData)[1]+m)\n",
+    "  \n",
+    "}\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#Find Probabilities Associated with Republican Voting\n",
+    "GOPData=subset(TrainingData,TrainingData$Party==\"republican\")\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "for (j in 3:dim(GOPData)[2])\n",
+    "{\n",
+    "  ProbMat[2,j-2]=(sum(GOPData[,j]==\"y\")+m*p)/(dim(GOPData)[1]+m)\n",
+    "  \n",
+    "}\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#Tag on marginal probabilities\n",
+    "FinalInd=dim(ProbMat)[2]\n",
+    "ProbMat[1:2,FinalInd]=c(sum(TrainingData$Party==\"democrat\")/dim(TrainingData)[1],sum(TrainingData$Party==\"republican\")/dim(TrainingData)[1])\n",
+    "colnames(ProbMat)=c(names(data)[3:dim(data)[2]],\"MargProb\")\n",
+    "rownames(ProbMat)=unique(TrainingData$Party)\n",
+    "\n",
+    "TestData=data[(TrainingSample+1):dim(data)[1],]\n",
+    "AssignedMat=matrix(0,dim(TestData)[1],3)\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "\n",
+    "#Use the NB classifier on test Data\n",
+    "VotingModel_fn<-function(TestVec,ProbMat){\n",
+    "  \n",
+    "  \n",
+    "  ProbTestMat=matrix(0,2,dim(ProbMat)[2])\n",
+    "  #TestVec is the vote record of the member of interest\n",
+    "  \n",
+    "  for (j in 1:length(TestVec)){\n",
+    "    for (k in 1:2){\n",
+    "      #Use the stored probability for a yes vote, or its complement otherwise (if/else)\n",
+    "      \n",
+    "      if (TestVec[j]==\"y\"){\n",
+    "        ProbTestMat[k,j]=ProbMat[k,j]\n",
+    "      }  else  {\n",
+    "        ProbTestMat[k,j]=1-ProbMat[k,j]\n",
+    "      }\n",
+    "    }\n",
+    "  }\n",
+    "  \n",
+    "  ProbTestMat[1:2,(length(TestVec)+1)]=ProbMat[1:2,(length(TestVec)+1)]\n",
+    "  Probs=apply(ProbTestMat,1,prod)  #Compute product of probabilities for the candidate being of either party\n",
+    "  ind=which.max(Probs)  #Find which probability is higher\n",
+    "  AssignedVec=c(Probs,unique(TrainingData$Party)[ind])  #Probability of being a democrat, being a republican, and which one is assigned\n",
+    "  \n",
+    "  \n",
+    "  return(list(AssignedVec=as.numeric(AssignedVec[1:2]),AssignedParty=AssignedVec[3]))  #Elements returned as a list.\n",
+    "}\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "for (i in 1:dim(TestData)[1]){\n",
+    "  for (j in 3:dim(TestData)[2]){\n",
+    "  \n",
+    "  TestVec=TestData[i,3:dim(TestData)[2]]\n",
+    "  result<-VotingModel_fn(TestVec,ProbMat)\n",
+    "  AssignedMat[i,]=c(as.numeric(result$AssignedVec),result$AssignedParty)\n",
+    "  }\n",
+    "}\n",
+    " "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "CheckMat=data.frame(cbind(TestData$Party,AssignedMat[,3]))\n",
+    "colnames(CheckMat)=c(\"Actual\",\"Assigned\")\n",
+    "Pct_Accuracy=sum(CheckMat$Actual==CheckMat$Assigned)/dim(TestData)[1]  #computes the accuracy as a fraction (0 to 1) of correctly classified test members\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "print(\"Classifier Percent Accuracy\") #Label for the accuracy printed below (the value is a fraction between 0 and 1).\n",
+    "print(Pct_Accuracy)\n",
+    "\n",
+    "Example=read.csv(\"ArbitraryMember.csv\")\n",
+    "result<-VotingModel_fn(Example,ProbMat)\n",
+    "print(result)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#Run the classifier on a sample member's voting record to predict party affiliation\n",
+    "Example=read.csv(\"ArbitraryMember.csv\")  #load data\n",
+    "\n",
+    "result<-VotingModel_fn(Example,ProbMat)\n",
+    "print(result)\n"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "R",
+   "language": "R",
+   "name": "ir"
+  },
+  "language_info": {
+   "codemirror_mode": "r",
+   "file_extension": ".r",
+   "mimetype": "text/x-r-source",
+   "name": "R",
+   "pygments_lexer": "r",
+   "version": "3.4.1"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}