names(new.theta) <- names(features)

# Learning rate (step size) used by the gradient descent loop below.
alpha <- 0.01

# Utility function that calculates the prediction for an observation
# given the current state of the hypothesis function.
#h.theta <- function(theta, observation) {
#  return(sum(theta * observation))
#   prediction <- 0.0
#   for(i in 1:length(theta)) {
#     prediction <- prediction + (theta[i] * observation[i])
#   }
#   return(prediction)
#}
# for(k in 1:2){
#   m <- ncol(features)
#
#   for(j in 1:m) {
#     n <- nrow(features)
#     summation <- 0.0
#
#     for(i in 1:n) {
#       # prediction <- h.theta(theta, features[i,])
#       prediction <- sum(theta * features[i,])
#       residual <- prediction - y[i]
#       update.value <- residual * features[i, j]
#       summation <- summation + update.value
#     }
#
#     new.theta[j] <- theta[j] + (alpha * summation)
#   }
#
#   theta <- new.theta
#
#   # print(theta)
# }
#
# print(theta)

# Number of full passes over the data.
iterations <- 250
X <- features

# Batch gradient descent for least-squares linear regression.
# The prediction for every observation is the dot product of its feature
# row with the WHOLE coefficient vector theta (not a single theta[j]), and
# all coefficients are updated simultaneously from the same residuals.
design <- as.matrix(X)
for (k in seq_len(iterations)) {
  residuals <- drop(design %*% theta) - y
  # Mean gradient of the squared-error cost with respect to each coefficient.
  gradient <- drop(crossprod(design, residuals)) / nrow(design)
  new.theta <- theta - alpha * gradient
  theta <- new.theta
}

lm.model

# Simulate 1000 observations of y = x + 3 + standard normal noise.
set.seed(1234)
x <- runif(1000, -5, 5)
y <- x + rnorm(1000) + 3
intercept <- rep(1, length(x))

# Least-squares fit from lm(), printed for inspection.
lm.model <- lm(y ~ x)
summary(lm.model)

# Feature table: a constant intercept column plus x.
features <- data.frame(intercept = intercept, x = x)

# Start every coefficient at zero, named after its feature column.
theta <- rep(0, ncol(features))
names(theta) <- names(features)
theta

new.theta <- rep(0, ncol(features))
names(new.theta) <- names(features)

alpha <- 0.025

# Utility function that calculates the prediction for an observation
# given the current state of the hypothesis function.
#h.theta <- function(theta, observation) {
#  return(sum(theta * observation))
#   prediction <- 0.0
#   for(i in 1:length(theta)) {
#     prediction <- prediction + (theta[i] * observation[i])
#   }
#   return(prediction)
#}
# for(k in 1:2){
#   m <- ncol(features)
#
#   for(j in 1:m) {
#     n <- nrow(features)
#     summation <- 0.0
#
#     for(i in 1:n) {
#       # prediction <- h.theta(theta, features[i,])
#       prediction <- sum(theta * features[i,])
#       residual <- prediction - y[i]
#       update.value <- residual * features[i, j]
#       summation <- summation + update.value
#     }
#
#     new.theta[j] <- theta[j] + (alpha * summation)
#   }
#
#   theta <- new.theta
#
#   # print(theta)
# }
#
# print(theta)

# Number of full passes over the data.
iterations <- 250
X <- features

# Batch gradient descent for least-squares linear regression.
# The prediction for every observation is the dot product of its feature
# row with the WHOLE coefficient vector theta (not a single theta[j]), and
# all coefficients are updated simultaneously from the same residuals.
design <- as.matrix(X)
for (k in seq_len(iterations)) {
  residuals <- drop(design %*% theta) - y
  # Mean gradient of the squared-error cost with respect to each coefficient.
  gradient <- drop(crossprod(design, residuals)) / nrow(design)
  new.theta <- theta - alpha * gradient
  theta <- new.theta
}

# Simulate 1000 observations of y = x + 3 + standard normal noise.
set.seed(1234)
x <- runif(1000, -5, 5)
y <- x + rnorm(1000) + 3
intercept <- rep(1, length(x))

# Least-squares fit from lm(), printed for inspection.
lm.model <- lm(y ~ x)
summary(lm.model)

# Feature table: a constant intercept column plus x.
features <- data.frame(intercept = intercept, x = x)

# Start every coefficient at zero, named after its feature column.
theta <- rep(0, ncol(features))
names(theta) <- names(features)
theta

new.theta <- rep(0, ncol(features))
names(new.theta) <- names(features)

# Learning rate (step size) for the next gradient descent run.
alpha <- 0.05

# Utility function that calculates the prediction for an observation
# given the current state of the hypothesis function.
#h.theta <- function(theta, observation) {
#  return(sum(theta * observation))
#   prediction <- 0.0
#   for(i in 1:length(theta)) {
#     prediction <- prediction + (theta[i] * observation[i])
#   }
#   return(prediction)
#}
# for(k in 1:2){
#   m <- ncol(features)
#
#   for(j in 1:m) {
#     n <- nrow(features)
#     summation <- 0.0
#
#     for(i in 1:n) {
#       # prediction <- h.theta(theta, features[i,])
#       prediction <- sum(theta * features[i,])
#       residual <- prediction - y[i]
#       update.value <- residual * features[i, j]
#       summation <- summation + update.value
#     }
#
#     new.theta[j] <- theta[j] + (alpha * summation)
#   }
#
#   theta <- new.theta
#
#   # print(theta)
# }
#
# print(theta)

# Number of full passes over the data.
iterations <- 250
X <- features

# Batch gradient descent for least-squares linear regression.
# The prediction for every observation is the dot product of its feature
# row with the WHOLE coefficient vector theta (not a single theta[j]), and
# all coefficients are updated simultaneously from the same residuals.
design <- as.matrix(X)
for (k in seq_len(iterations)) {
  residuals <- drop(design %*% theta) - y
  # Mean gradient of the squared-error cost with respect to each coefficient.
  gradient <- drop(crossprod(design, residuals)) / nrow(design)
  new.theta <- theta - alpha * gradient
  theta <- new.theta
}

# Simulate 1000 observations of y = x + 3 + standard normal noise.
set.seed(1234)
x <- runif(1000, -5, 5)
y <- x + rnorm(1000) + 3
intercept <- rep(1, length(x))

# Least-squares fit from lm(), printed for inspection.
lm.model <- lm(y ~ x)
summary(lm.model)

# Feature table: a constant intercept column plus x.
features <- data.frame(intercept = intercept, x = x)

# Start every coefficient at zero, named after its feature column.
theta <- rep(0, ncol(features))
names(theta) <- names(features)
theta

new.theta <- rep(0, ncol(features))
names(new.theta) <- names(features)

# Learning rate (step size) for the next gradient descent run.
alpha <- 0.05

# Utility function that calculates the prediction for an observation
# given the current state of the hypothesis function.
#h.theta <- function(theta, observation) {
#  return(sum(theta * observation))
#   prediction <- 0.0
#   for(i in 1:length(theta)) {
#     prediction <- prediction + (theta[i] * observation[i])
#   }
#   return(prediction)
#}
# for(k in 1:2){
#   m <- ncol(features)
#
#   for(j in 1:m) {
#     n <- nrow(features)
#     summation <- 0.0
#
#     for(i in 1:n) {
#       # prediction <- h.theta(theta, features[i,])
#       prediction <- sum(theta * features[i,])
#       residual <- prediction - y[i]
#       update.value <- residual * features[i, j]
#       summation <- summation + update.value
#     }
#
#     new.theta[j] <- theta[j] + (alpha * summation)
#   }
#
#   theta <- new.theta
#
#   # print(theta)
# }
#
# print(theta)

# Number of full passes over the data.
iterations <- 300
X <- features

# Batch gradient descent for least-squares linear regression.
# The prediction for every observation is the dot product of its feature
# row with the WHOLE coefficient vector theta (not a single theta[j]), and
# all coefficients are updated simultaneously from the same residuals.
design <- as.matrix(X)
for (k in seq_len(iterations)) {
  residuals <- drop(design %*% theta) - y
  # Mean gradient of the squared-error cost with respect to each coefficient.
  gradient <- drop(crossprod(design, residuals)) / nrow(design)
  new.theta <- theta - alpha * gradient
  theta <- new.theta
}

# Simulate 1000 observations of y = x + 3 + standard normal noise.
set.seed(1234)
x <- runif(1000, -5, 5)
y <- x + rnorm(1000) + 3
intercept <- rep(1, length(x))

# Least-squares fit from lm(), printed for inspection.
lm.model <- lm(y ~ x)
summary(lm.model)

# Feature table: a constant intercept column plus x.
features <- data.frame(intercept = intercept, x = x)

# Start every coefficient at zero, named after its feature column.
theta <- rep(0, ncol(features))
names(theta) <- names(features)
theta

new.theta <- rep(0, ncol(features))
names(new.theta) <- names(features)

# Learning rate (step size) for the next gradient descent run.
alpha <- 0.05

# Utility function that calculates the prediction for an observation
# given the current state of the hypothesis function.
#h.theta <- function(theta, observation) {
#  return(sum(theta * observation))
#   prediction <- 0.0
#   for(i in 1:length(theta)) {
#     prediction <- prediction + (theta[i] * observation[i])
#   }
#   return(prediction)
#}
# for(k in 1:2){
#   m <- ncol(features)
#
#   for(j in 1:m) {
#     n <- nrow(features)
#     summation <- 0.0
#
#     for(i in 1:n) {
#       # prediction <- h.theta(theta, features[i,])
#       prediction <- sum(theta * features[i,])
#       residual <- prediction - y[i]
#       update.value <- residual * features[i, j]
#       summation <- summation + update.value
#     }
#
#     new.theta[j] <- theta[j] + (alpha * summation)
#   }
#
#   theta <- new.theta
#
#   # print(theta)
# }
#
# print(theta)

# Number of full passes over the data.
iterations <- 500
X <- features

# Batch gradient descent for least-squares linear regression.
# The prediction for every observation is the dot product of its feature
# row with the WHOLE coefficient vector theta (not a single theta[j]), and
# all coefficients are updated simultaneously from the same residuals.
design <- as.matrix(X)
for (k in seq_len(iterations)) {
  residuals <- drop(design %*% theta) - y
  # Mean gradient of the squared-error cost with respect to each coefficient.
  gradient <- drop(crossprod(design, residuals)) / nrow(design)
  new.theta <- theta - alpha * gradient
  theta <- new.theta
}

# Load the built-in iris data and install modelling packages.
data(iris)
install.packages(c("lmtest", "mgcv", "nlme"))

# Pairwise plot matrix of iris via GGally.
data("iris")
library(GGally)
ggpairs(iris)

# Remove and reinstall GGally and related packages, then retry the plot.
# NOTE(review): these calls modify the installed package library; they look
# like an interactive troubleshooting sequence -- confirm before re-running.
remove.packages("tibble")
library(GGally)
remove.packages("GGally")
remove.packages("plotly")
install.packages("GGally")
library(GGally)
install.packages("tibble")
library(GGally)
data("iris")
ggpairs(iris)

# Run the gradient descent example script under the debugger.
# NOTE(review): debugSource() is presumably the RStudio-provided helper --
# confirm it is available outside RStudio before relying on it.
debugSource('~/Dropbox/AmsterdamBootcamp/GradientDescentExample.R', echo=TRUE)
debugSource('~/Dropbox/AmsterdamBootcamp/GradientDescentExample.R', echo=TRUE)

#
# Copyright 2017 Dave Langer
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
#
# This R source code file corresponds to video 10 of the YouTube series
# "R Programming for Excel Users" located at the following URL:
#
#


#===========================================================================
# Numeric Vectors
#

# Build an integer vector holding the values 1 through 10
my_vector <- seq_len(10)
my_vector

# Look at its class, structure and summary statistics
class(my_vector)
str(my_vector)
summary(my_vector)

# Element-wise addition: every value goes up by 1
my_vector_plus1 <- 1 + my_vector
my_vector_plus1

# Element-wise division: halve every value
half_my_vector <- my_vector / 2
half_my_vector

# Doubling the halves restores the original values
whole_my_vector <- 2 * half_my_vector
whole_my_vector

# Square each value by multiplying the vector with itself
my_vector_squared1 <- my_vector * my_vector
my_vector_squared1

# Square each value with the exponent operator
my_vector_squared2 <- my_vector^2
my_vector_squared2

# Element-wise square root
sqrt_my_vector <- sqrt(my_vector)
sqrt_my_vector

# Functions that collapse the whole vector into a single number
sum(my_vector)
mean(my_vector)
sd(my_vector)


#===========================================================================
# Logical Vectors
#

# One TRUE/FALSE flag per element: is the value above 3.5?
larger_than_3.5 <- 3.5 < my_vector
larger_than_3.5

# Look at the class, structure and summary of the flags
class(larger_than_3.5)
str(larger_than_3.5)
summary(larger_than_3.5)

# Keep only the flagged values, using the stored logical vector
my_vector2 <- my_vector[larger_than_3.5]
my_vector2

# Same selection with the comparison written inline
my_vector3 <- my_vector[3.5 < my_vector]
my_vector3

# Append more values to the end of the vector
my_bigger_vector <- c(my_vector, 11:15, 16, 17, 18, 19, 20)
my_bigger_vector

# How big is it now?
length(my_bigger_vector)
# dim() is NULL for a plain vector; only length() reports its size.
dim(my_bigger_vector)


#===========================================================================
# String Vectors
#

# Create a vector of strings
force_users <- c("Yoda", "Darth Vader", "Obi Wan", "Mace Windu",
                 "Darth Maul", "Luke Skywalker", "Darth Sidious")

# Inspect vector more closely
class(force_users)
str(force_users)
summary(force_users)

# Add 1 to string vector.
# Arithmetic is not defined for character vectors, so this raises an error.
# try() reports the error message but lets the rest of the script run.
try(force_users + 1)

# Add another force user.
# "+" does not concatenate strings either: this also raises an error and
# force_users is left unchanged (still 7 names). Appending is done with c(),
# as shown next.
try(force_users <- force_users + "Kylo Ren")

# Add more force users
more_force_users <- c(force_users, "Qui-Gon Jinn", "Darth Tyranus")
more_force_users

# How big is the vector?
length(more_force_users)

# How long is each string in the vector?
name_lengths <- nchar(more_force_users)
name_lengths


#===========================================================================
# Missing Values
#

# Build a vector with missing values
birthplaces <- c(NA, "Tatooine", "Stewjon", "Haruun Kal", "Dathomir",
                 "Polis Massa", "Naboo", "Coruscant", "Serenno")
birthplaces

# Inspect closer
class(birthplaces)
str(birthplaces)
summary(birthplaces)

# Vectorized operation
nchar(birthplaces)
nchar("")

# Logical operations: keep only the entries that are not missing
birthplaces[!is.na(birthplaces)]
#===========================================================================
# Factor Vectors
#

# Create factor (categorical) vector
affiliation <- as.factor(c("Jedi", "Sith", "Rogue"))
affiliation

# Inspect
class(affiliation)
str(affiliation)
summary(affiliation)
levels(affiliation)

# Explore representations: the underlying integer codes vs. the labels
as.numeric(affiliation)
as.character(affiliation)


#===========================================================================
# Data Frames
#

# Combine the vectors into one table, one row per force user.
# seq_along() yields the row ids and stays correct for an empty vector.
star_wars <- data.frame(id = seq_along(more_force_users),
                        more_force_users,
                        birthplaces = as.factor(birthplaces),
                        affiliation = c("Jedi", "Sith", "Jedi", "Jedi",
                                        "Sith", "Jedi", "Sith", "Jedi",
                                        "Sith"),
                        stringsAsFactors = FALSE)

# Inspect
View(star_wars)
head(star_wars)
summary(star_wars)
str(star_wars)

# Set up factors
star_wars$affiliation <- as.factor(star_wars$affiliation)

# Reinspect
str(star_wars)

# Additional slicing syntax
star_wars$more_force_users[3]
star_wars$more_force_users[star_wars$affiliation == "Sith"]

# Load-up some built in data
data(iris)
data(mtcars)

# Get help on built-in data
?mtcars

# Understand the shape of a data frame
nrow(mtcars)
ncol(mtcars)
dim(mtcars)

# Understand the metadata of a data frame
names(mtcars)
names(mtcars)[3]
colnames(mtcars)
colnames(mtcars)[3:5]
rownames(mtcars)
rownames(mtcars)[c(3, 4, 5)]

# Cool RStudio feature - spreadsheet view of a data frame
View(mtcars)

# See a few rows at the top and bottom of a data frame
head(mtcars)
tail(mtcars)

# All-up view of a data frame
summary(mtcars)

# Understand the data type of a data frame
class(mtcars)
str(mtcars)

# NOTE(review): setwd() changes the working directory for the whole session
# and ties the script to one machine's folder layout; kept because the
# relative path "spam.csv" below depends on it.
setwd("~/Dropbox/DataScienceDojo/IntroToTextAnalyticsWithR")

# Read the raw data, keeping text columns as character vectors.
spam.raw <- read.csv("spam.csv", stringsAsFactors = FALSE,
                     fileEncoding = "UTF-16")
View(spam.raw)

# Clean up the data frame and view our handiwork.
spam.raw <- spam.raw[, 1:2]
names(spam.raw) <- c("Label", "Text")
View(spam.raw)

# Check data to see if there are missing values: count the rows that
# contain at least one NA.
sum(!complete.cases(spam.raw))

# Convert our class label into a factor.
spam.raw[["Label"]] <- as.factor(spam.raw[["Label"]])

# Exploration comes first.
# Start with the distribution of the class labels (i.e., ham vs. spam),
# shown as proportions of all messages.
prop.table(table(spam.raw[["Label"]]))

# Then get a feel for how long the SMS messages are: store the character
# count of each message as a new feature and summarise it.
spam.raw[["TextLength"]] <- nchar(spam.raw[["Text"]])
summary(spam.raw[["TextLength"]])

# Histogram of message lengths in ggplot2, with bars split by ham/spam.
library(ggplot2)

ggplot(data = spam.raw, mapping = aes(x = TextLength, fill = Label)) +
  geom_histogram(binwidth = 5) +
  theme_bw() +
  labs(title = "Distribution of Text Lengths with Class Labels",
       x = "Length of Text",
       y = "Text Count")