Commit 15a2fbed by Arham Akheel

Migrating meetup, datasets, web_scraping_r,…

Migrating meetup, datasets, web_scraping_r, IntroDataVisualizationWithRAndGgplot2 to tutorials repository
parent 22f44079
Col1
the
and
you
for
that
have
but
just
with
get
not
day
was
now
this
can
work
all
out
are
http
today
your
too
time
what
got
thank
back
want
from
one
know
will
see
feel
com
think
about
don
realli
had
how
some
there
night
amp
make
watch
need
new
still
they
come
home
when
look
here
off
more
much
quot
twitter
morn
last
tomorrow
then
has
been
wait
sleep
again
her
onli
week
tri
whi
tonight
would
she
thing
way
did
say
follow
veri
bit
though
take
gonna
them
over
should
yeah
bed
even
start
tweet
could
school
hour
peopl
show
twitpic
didn
guy
hey
after
him
next
weekend
play
down
final
let
cant
use
yes
were
who
soon
never
dont
life
girl
littl
everyon
year
rain
wanna
movi
first
find
where
call
done
sure
head
our
keep
ani
than
alway
his
leav
lot
talk
alreadi
won
man
readi
someth
made
anoth
live
read
eat
becaus
yet
yay
phone
ever
hous
went
song
befor
sound
thought
mayb
summer
someon
tell
give
guess
babi
check
mean
other
end
game
into
hear
listen
later
doesn
noth
while
actual
happen
same
pic
stuff
birthday
mom
saw
weather
car
two
doe
put
stay
yesterday
world
those
run
also
might
until
gotta
meet
said
around
post
exam
monday
friday
seem
sinc
sunday
job
must
mani
updat
myself
found
haven
video
gone
such
famili
book
most
www
aww
month
their
boy
shop
move
least
dinner
total
woke
may
anyth
lunch
studi
pictur
hair
isn
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
# Azure ML "Execute R Script" module: cleans the tweet text column of the
# first input port and emits it alongside the sentiment label.
#
# Please determine the required text preprocessing steps using the following flags
replace_special_chars <- TRUE
remove_duplicate_chars <- TRUE
replace_numbers <- TRUE
convert_to_lower_case <- TRUE
remove_default_stopWords <- TRUE
remove_given_stopWords <- TRUE
stem_words <- TRUE
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #

# Map 1-based optional input ports to variables
dataset1 <- maml.mapInputPort(1) # class: data.frame

# Get the text column from the input data set
text_column <- dataset1[["tweet_text"]]
#label_column <- dataset1[["label_column"]]

# The second input port (custom stop-word list) is optional. tryCatch() now
# returns the list directly and falls back to NULL when the port cannot be
# read; previously the error handler assigned NULL to a variable local to the
# handler function, which had no effect on the script-level variable.
stopword_list <- tryCatch({
  dataset2 <- maml.mapInputPort(2) # class: data.frame
  # The stop words are taken from the first column of the second data set
  dataset2[[1]]
}, warning = function(war) {
  print(paste("WARNING: ", war))
  NULL
}, error = function(err) {
  print(paste("ERROR: ", err))
  NULL
})

# Load the R script from the Zip port in ./src/
source("src/text.preprocessing.R")

text_column <- preprocessText(text_column,
                              replace_special_chars,
                              remove_duplicate_chars,
                              replace_numbers,
                              convert_to_lower_case,
                              remove_default_stopWords,
                              remove_given_stopWords,
                              stem_words,
                              stopword_list)

# NOTE(review): "Sentinment" (sic) becomes the output column name; it is kept
# as-is because downstream modules presumably select the column by this name.
Sentinment <- dataset1[["sentiment_label"]]
data.set <- data.frame(
  Sentinment,
  text_column,
  stringsAsFactors = FALSE
)

# Select data.frame to be sent to the output Dataset port
maml.mapOutputPort("data.set")
\ No newline at end of file
<?xml version="1.0" encoding="utf-8"?>
<configuration>
<startup>
<supportedRuntime version="v4.0" sku=".NETFramework,Version=v4.5"/>
</startup>
<system.serviceModel>
<extensions>
<!-- In this extension section we are introducing all known service bus extensions. User can remove the ones they don't need. -->
<behaviorExtensions>
<add name="connectionStatusBehavior"
type="Microsoft.ServiceBus.Configuration.ConnectionStatusElement, Microsoft.ServiceBus, Culture=neutral, PublicKeyToken=31bf3856ad364e35"/>
<add name="transportClientEndpointBehavior"
type="Microsoft.ServiceBus.Configuration.TransportClientEndpointBehaviorElement, Microsoft.ServiceBus, Culture=neutral, PublicKeyToken=31bf3856ad364e35"/>
<add name="serviceRegistrySettings"
type="Microsoft.ServiceBus.Configuration.ServiceRegistrySettingsElement, Microsoft.ServiceBus, Culture=neutral, PublicKeyToken=31bf3856ad364e35"/>
</behaviorExtensions>
<bindingElementExtensions>
<add name="netMessagingTransport"
type="Microsoft.ServiceBus.Messaging.Configuration.NetMessagingTransportExtensionElement, Microsoft.ServiceBus, Culture=neutral, PublicKeyToken=31bf3856ad364e35"/>
<add name="tcpRelayTransport"
type="Microsoft.ServiceBus.Configuration.TcpRelayTransportElement, Microsoft.ServiceBus, Culture=neutral, PublicKeyToken=31bf3856ad364e35"/>
<add name="httpRelayTransport"
type="Microsoft.ServiceBus.Configuration.HttpRelayTransportElement, Microsoft.ServiceBus, Culture=neutral, PublicKeyToken=31bf3856ad364e35"/>
<add name="httpsRelayTransport"
type="Microsoft.ServiceBus.Configuration.HttpsRelayTransportElement, Microsoft.ServiceBus, Culture=neutral, PublicKeyToken=31bf3856ad364e35"/>
<add name="onewayRelayTransport"
type="Microsoft.ServiceBus.Configuration.RelayedOnewayTransportElement, Microsoft.ServiceBus, Culture=neutral, PublicKeyToken=31bf3856ad364e35"/>
</bindingElementExtensions>
<bindingExtensions>
<add name="basicHttpRelayBinding"
type="Microsoft.ServiceBus.Configuration.BasicHttpRelayBindingCollectionElement, Microsoft.ServiceBus, Culture=neutral, PublicKeyToken=31bf3856ad364e35"/>
<add name="webHttpRelayBinding"
type="Microsoft.ServiceBus.Configuration.WebHttpRelayBindingCollectionElement, Microsoft.ServiceBus, Culture=neutral, PublicKeyToken=31bf3856ad364e35"/>
<add name="ws2007HttpRelayBinding"
type="Microsoft.ServiceBus.Configuration.WS2007HttpRelayBindingCollectionElement, Microsoft.ServiceBus, Culture=neutral, PublicKeyToken=31bf3856ad364e35"/>
<add name="netTcpRelayBinding"
type="Microsoft.ServiceBus.Configuration.NetTcpRelayBindingCollectionElement, Microsoft.ServiceBus, Culture=neutral, PublicKeyToken=31bf3856ad364e35"/>
<add name="netOnewayRelayBinding"
type="Microsoft.ServiceBus.Configuration.NetOnewayRelayBindingCollectionElement, Microsoft.ServiceBus, Culture=neutral, PublicKeyToken=31bf3856ad364e35"/>
<add name="netEventRelayBinding"
type="Microsoft.ServiceBus.Configuration.NetEventRelayBindingCollectionElement, Microsoft.ServiceBus, Culture=neutral, PublicKeyToken=31bf3856ad364e35"/>
<add name="netMessagingBinding"
type="Microsoft.ServiceBus.Messaging.Configuration.NetMessagingBindingCollectionElement, Microsoft.ServiceBus, Culture=neutral, PublicKeyToken=31bf3856ad364e35"/>
</bindingExtensions>
</extensions>
</system.serviceModel>
<appSettings>
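<!-- Service Bus specific app settings for messaging connections -->
<!-- NOTE(review): the connection string and storage account key below are committed in plain text. Confirm whether they are live; if so, rotate them and supply the values from a secure store rather than source control. -->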
<add key="Microsoft.ServiceBus.ConnectionString"
value="Endpoint=sb://tolltest.servicebus.windows.net/;SharedAccessKeyName=RootManageSharedAccessKey;SharedAccessKey=V93mgRhRp0d1FkslcsyjOZNLjo5iSZ730wJuWbZIbS8="/>
<add key="storageAccountName"
value="dojodemo"/>
<add key="storageAccountKey"
value="QPALUJTeuleyZLwLQ45uT5gLIe6KcrKtpO4VpDsRs/8blwphpkySk7FQwHO4lbgp633uNEG5UFePj/p+6bDmnw=="/>
</appSettings>
</configuration>
\ No newline at end of file
<?xml version="1.0" encoding="utf-8"?>
<configuration>
<startup>
<supportedRuntime version="v4.0" sku=".NETFramework,Version=v4.5"/>
</startup>
<system.serviceModel>
<extensions>
<!-- In this extension section we are introducing all known service bus extensions. User can remove the ones they don't need. -->
<behaviorExtensions>
<add name="connectionStatusBehavior"
type="Microsoft.ServiceBus.Configuration.ConnectionStatusElement, Microsoft.ServiceBus, Culture=neutral, PublicKeyToken=31bf3856ad364e35"/>
<add name="transportClientEndpointBehavior"
type="Microsoft.ServiceBus.Configuration.TransportClientEndpointBehaviorElement, Microsoft.ServiceBus, Culture=neutral, PublicKeyToken=31bf3856ad364e35"/>
<add name="serviceRegistrySettings"
type="Microsoft.ServiceBus.Configuration.ServiceRegistrySettingsElement, Microsoft.ServiceBus, Culture=neutral, PublicKeyToken=31bf3856ad364e35"/>
</behaviorExtensions>
<bindingElementExtensions>
<add name="netMessagingTransport"
type="Microsoft.ServiceBus.Messaging.Configuration.NetMessagingTransportExtensionElement, Microsoft.ServiceBus, Culture=neutral, PublicKeyToken=31bf3856ad364e35"/>
<add name="tcpRelayTransport"
type="Microsoft.ServiceBus.Configuration.TcpRelayTransportElement, Microsoft.ServiceBus, Culture=neutral, PublicKeyToken=31bf3856ad364e35"/>
<add name="httpRelayTransport"
type="Microsoft.ServiceBus.Configuration.HttpRelayTransportElement, Microsoft.ServiceBus, Culture=neutral, PublicKeyToken=31bf3856ad364e35"/>
<add name="httpsRelayTransport"
type="Microsoft.ServiceBus.Configuration.HttpsRelayTransportElement, Microsoft.ServiceBus, Culture=neutral, PublicKeyToken=31bf3856ad364e35"/>
<add name="onewayRelayTransport"
type="Microsoft.ServiceBus.Configuration.RelayedOnewayTransportElement, Microsoft.ServiceBus, Culture=neutral, PublicKeyToken=31bf3856ad364e35"/>
</bindingElementExtensions>
<bindingExtensions>
<add name="basicHttpRelayBinding"
type="Microsoft.ServiceBus.Configuration.BasicHttpRelayBindingCollectionElement, Microsoft.ServiceBus, Culture=neutral, PublicKeyToken=31bf3856ad364e35"/>
<add name="webHttpRelayBinding"
type="Microsoft.ServiceBus.Configuration.WebHttpRelayBindingCollectionElement, Microsoft.ServiceBus, Culture=neutral, PublicKeyToken=31bf3856ad364e35"/>
<add name="ws2007HttpRelayBinding"
type="Microsoft.ServiceBus.Configuration.WS2007HttpRelayBindingCollectionElement, Microsoft.ServiceBus, Culture=neutral, PublicKeyToken=31bf3856ad364e35"/>
<add name="netTcpRelayBinding"
type="Microsoft.ServiceBus.Configuration.NetTcpRelayBindingCollectionElement, Microsoft.ServiceBus, Culture=neutral, PublicKeyToken=31bf3856ad364e35"/>
<add name="netOnewayRelayBinding"
type="Microsoft.ServiceBus.Configuration.NetOnewayRelayBindingCollectionElement, Microsoft.ServiceBus, Culture=neutral, PublicKeyToken=31bf3856ad364e35"/>
<add name="netEventRelayBinding"
type="Microsoft.ServiceBus.Configuration.NetEventRelayBindingCollectionElement, Microsoft.ServiceBus, Culture=neutral, PublicKeyToken=31bf3856ad364e35"/>
<add name="netMessagingBinding"
type="Microsoft.ServiceBus.Messaging.Configuration.NetMessagingBindingCollectionElement, Microsoft.ServiceBus, Culture=neutral, PublicKeyToken=31bf3856ad364e35"/>
</bindingExtensions>
</extensions>
</system.serviceModel>
<appSettings>
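<!-- Service Bus specific app settings for messaging connections -->
<!-- NOTE(review): the connection string and storage account key below are committed in plain text. Confirm whether they are live; if so, rotate them and supply the values from a secure store rather than source control. -->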
<add key="Microsoft.ServiceBus.ConnectionString"
value="Endpoint=sb://tolltest.servicebus.windows.net/;SharedAccessKeyName=RootManageSharedAccessKey;SharedAccessKey=V93mgRhRp0d1FkslcsyjOZNLjo5iSZ730wJuWbZIbS8="/>
<add key="storageAccountName"
value="dojoeventhubs"/>
<add key="storageAccountKey"
value="lrrS7WkjginKovVFS9E3J8JmYJRnEj6bsz7hGymEqwfqmbt31h5GmQwE9+SiVSC3NPQZ+FhYLtkbTkJxOBbTrg=="/>
</appSettings>
</configuration>
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<assembly xmlns="urn:schemas-microsoft-com:asm.v1" manifestVersion="1.0">
<assemblyIdentity version="1.0.0.0" name="MyApplication.app"/>
<trustInfo xmlns="urn:schemas-microsoft-com:asm.v2">
<security>
<requestedPrivileges xmlns="urn:schemas-microsoft-com:asm.v3">
<requestedExecutionLevel level="asInvoker" uiAccess="false"/>
</requestedPrivileges>
</security>
</trustInfo>
</assembly>
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
<?xml version="1.0" encoding="utf-8"?>
<doc>
<assembly>
<name>Microsoft.ServiceBus.Messaging.EventProcessorHost</name>
</assembly>
<members>
<member name="T:Microsoft.ServiceBus.Messaging.EventProcessorHost">
<summary>Represents a host for processing Event Hubs event data.</summary>
</member>
<member name="M:Microsoft.ServiceBus.Messaging.EventProcessorHost.#ctor(System.String,System.String,System.String,System.String,System.String)">
<summary>Initializes a new instance of the <see cref="T:Microsoft.ServiceBus.Messaging.EventProcessorHost" /> class.</summary>
<param name="hostName">The name of the <see cref="T:Microsoft.ServiceBus.Messaging.EventProcessorHost" /> instance. This name must be unique for each instance of the host.</param>
<param name="eventHubPath">The path to the Event Hub from which to start receiving event data.</param>
<param name="consumerGroupName">The name of the Event Hubs consumer group from which to start receiving event data.</param>
<param name="eventHubConnectionString">The connection string for the Event Hub.</param>
<param name="storageConnectionString">The connection string for the Azure Blob storage account to use for partition distribution.</param>
</member>
<member name="M:Microsoft.ServiceBus.Messaging.EventProcessorHost.#ctor(System.String,System.String,System.String,System.String,System.String,System.String)">
<summary>Initializes a new instance of the <see cref="T:Microsoft.ServiceBus.Messaging.EventProcessorHost" /> class.</summary>
<param name="hostName">The name of the <see cref="T:Microsoft.ServiceBus.Messaging.EventProcessorHost" /> instance. This name must be unique for each instance of the host.</param>
<param name="eventHubPath">The path to the Event Hub from which to start receiving event data.</param>
<param name="consumerGroupName">The name of the Event Hubs consumer group from which to start receiving event data.</param>
<param name="eventHubConnectionString">The connection string for the Event Hub.</param>
<param name="storageConnectionString">The connection string for the Azure Blob storage account to use for partition distribution.</param>
<param name="leaseContainerName">The name of the Azure Blob container in which all lease blobs are created. If this parameter is not supplied, then the Event Hubs path is used as the name of the Azure Blob container.</param>
</member>
<member name="P:Microsoft.ServiceBus.Messaging.EventProcessorHost.HostName">
<summary>Gets the host name, which is a unique name for the <see cref="T:Microsoft.ServiceBus.Messaging.EventProcessorHost" /> instance.</summary>
<returns>The host name.</returns>
</member>
<member name="P:Microsoft.ServiceBus.Messaging.EventProcessorHost.PartitionManagerOptions">
<summary>Gets or sets the <see cref="T:Microsoft.ServiceBus.Messaging.PartitionManagerOptions" /> instance used by the <see cref="T:Microsoft.ServiceBus.Messaging.EventProcessorHost" /> object.</summary>
<returns>The <see cref="T:Microsoft.ServiceBus.Messaging.PartitionManagerOptions" /> instance.</returns>
</member>
<member name="M:Microsoft.ServiceBus.Messaging.EventProcessorHost.RegisterEventProcessorAsync``1">
<summary>Asynchronously registers the <see cref="T:Microsoft.ServiceBus.Messaging.IEventProcessor" /> interface implementation with the host using the <see cref="T:Microsoft.ServiceBus.Messaging.DefaultEventProcessorFactory`1" /> factory. This method also starts the host and enables it to start participating in the partition distribution process.</summary>
<returns>A task indicating that the <see cref="T:Microsoft.ServiceBus.Messaging.EventProcessorHost" /> instance has started.</returns>
<typeparam name="T">Implementation of your application-specific <see cref="T:Microsoft.ServiceBus.Messaging.IEventProcessor" />.</typeparam>
</member>
<member name="M:Microsoft.ServiceBus.Messaging.EventProcessorHost.RegisterEventProcessorAsync``1(Microsoft.ServiceBus.Messaging.EventProcessorOptions)">
<summary>Asynchronously registers the <see cref="T:Microsoft.ServiceBus.Messaging.IEventProcessor" /> interface implementation with the host using the <see cref="T:Microsoft.ServiceBus.Messaging.DefaultEventProcessorFactory`1" /> factory. This method also starts the host and enables it to start participating in the partition distribution process.</summary>
<returns>A task indicating that the <see cref="T:Microsoft.ServiceBus.Messaging.EventProcessorHost" /> instance has started.</returns>
<param name="processorOptions">An <see cref="T:Microsoft.ServiceBus.Messaging.EventProcessorOptions" /> object that controls various aspects of the event pump created when ownership is acquired for a given Event Hubs partition.</param>
<typeparam name="T">Implementation of your application-specific <see cref="T:Microsoft.ServiceBus.Messaging.IEventProcessor" />.</typeparam>
</member>
<member name="M:Microsoft.ServiceBus.Messaging.EventProcessorHost.RegisterEventProcessorFactoryAsync(Microsoft.ServiceBus.Messaging.IEventProcessorFactory)">
<summary>Asynchronously registers the event processor factory.</summary>
<returns>The task representing the asynchronous operation.</returns>
<param name="factory">The factory to register.</param>
</member>
<member name="M:Microsoft.ServiceBus.Messaging.EventProcessorHost.RegisterEventProcessorFactoryAsync(Microsoft.ServiceBus.Messaging.IEventProcessorFactory,Microsoft.ServiceBus.Messaging.EventProcessorOptions)">
<summary>Asynchronously registers the event processor factory.</summary>
<returns>Returns <see cref="T:System.Threading.Tasks.Task" />.</returns>
<param name="factory">The factory to register.</param>
<param name="processorOptions">An <see cref="T:Microsoft.ServiceBus.Messaging.EventProcessorOptions" /> object that controls various aspects of the event pump created when ownership is acquired for a given Event Hubs partition.</param>
</member>
<member name="M:Microsoft.ServiceBus.Messaging.EventProcessorHost.UnregisterEventProcessorAsync">
<summary>Asynchronously shuts down the <see cref="T:Microsoft.ServiceBus.Messaging.EventProcessorHost" /> instance. This method maintains the leases on all partitions currently held, and enables each <see cref="T:Microsoft.ServiceBus.Messaging.IEventProcessor" /> instance to shut down cleanly by invoking the <see cref="M:Microsoft.ServiceBus.Messaging.IEventProcessor.CloseAsync(Microsoft.ServiceBus.Messaging.PartitionContext,Microsoft.ServiceBus.Messaging.CloseReason)" /> method with a <see cref="F:Microsoft.ServiceBus.Messaging.CloseReason.Shutdown" /> object.</summary>
<returns>A task that indicates the <see cref="T:Microsoft.ServiceBus.Messaging.EventProcessorHost" /> instance has stopped.</returns>
</member>
<member name="T:Microsoft.ServiceBus.Messaging.PartitionManagerOptions">
<summary>Represents the options that control various aspects of partition distribution that occur within the <see cref="T:Microsoft.ServiceBus.Messaging.EventProcessorHost" /> instance.</summary>
</member>
<member name="M:Microsoft.ServiceBus.Messaging.PartitionManagerOptions.#ctor">
<summary>Initializes a new instance of the <see cref="T:Microsoft.ServiceBus.Messaging.PartitionManagerOptions" /> class.</summary>
</member>
<member name="P:Microsoft.ServiceBus.Messaging.PartitionManagerOptions.AcquireInterval">
<summary>Gets or sets the interval at which the <see cref="T:Microsoft.ServiceBus.Messaging.EventProcessorHost" /> instance begins a task to determine whether partitions are distributed evenly among known host instances.</summary>
<returns>The acquire interval of the partition.</returns>
</member>
<member name="P:Microsoft.ServiceBus.Messaging.PartitionManagerOptions.DefaultOptions">
<summary>Creates an instance of <see cref="P:Microsoft.ServiceBus.Messaging.EventProcessorHost.PartitionManagerOptions" /> with the following default values:<see cref="P:Microsoft.ServiceBus.Messaging.PartitionManagerOptions.RenewInterval" />: 10 seconds.<see cref="P:Microsoft.ServiceBus.Messaging.PartitionManagerOptions.AcquireInterval" />: 10 seconds.<see cref="P:Microsoft.ServiceBus.Messaging.PartitionManagerOptions.LeaseInterval" />: 30 seconds. </summary>
<returns>The default partition manager options.</returns>
</member>
<member name="P:Microsoft.ServiceBus.Messaging.PartitionManagerOptions.LeaseInterval">
<summary>Gets or sets the interval at which the lease is created on an Azure Blob representing an Event Hubs partition. If the lease is not renewed within this interval, it expires, and ownership of the partition passes to another <see cref="T:Microsoft.ServiceBus.Messaging.EventProcessorHost" /> instance.</summary>
<returns>Returns <see cref="T:System.TimeSpan" />.</returns>
</member>
<member name="P:Microsoft.ServiceBus.Messaging.PartitionManagerOptions.MaxReceiveClients"></member>
<member name="P:Microsoft.ServiceBus.Messaging.PartitionManagerOptions.RenewInterval">
<summary>Gets or sets the renewal interval for all leases for partitions currently held by the <see cref="T:Microsoft.ServiceBus.Messaging.EventProcessorHost" /> instance.</summary>
<returns>The interval to renew the partition.</returns>
</member>
</members>
</doc>
\ No newline at end of file
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
# Building a Real-time Sentiment Pipeline for Live Tweets using Python, R, & Azure
## Requirements
* Twitter Account + Twitter App setup (https://apps.twitter.com/)
* Anaconda 3.5 or Python 3.5 Installed
* Azure subscription or free trial account
* [30 day free trial](https://azure.microsoft.com/en-us/pricing/free-trial/)
* Azure Machine Learning Studio workspace
* Text Editor, I'll be using Sublime Text 3
* Github.com account (to receive code)
* PowerBI.com account (for Dashboard portion)
* .NET up to date + windows (for testing portion)
## Cloning the Repo for Code & Materials
```
git clone https://www.github.com/datasciencedojo/meetup.git
```
Folder: Building a Real-time Sentiment Pipeline for Live Tweets using Python, R, & Azure
## The Predictive Model
### Supervised Twitter Dataset
* Azure ML Reader Module:
* Data source: Azure Blob Storage
* Authentication type: PublicOrSAS
* URI: http://azuremlsampleexperiments.blob.core.windows.net/datasets/Sentiment140.tenPercent.sample.tweets.tsv
* File format: TSV
* URI has header row: Checked
* Import and save dataset
### Preprocessing & Cleaning
* Azure ML Metadata Editor: Cast categorical sentiment_label
* Azure ML Group Categorical Values: Casting '0' as Negative, '4' as positive
### Text Processing
* Filtering using R
* Removing stop words (Stop words list)
* Removing special characters
* Replace numbers
* Globally conform to lower case
* Stemming and lemmatization
[Example of Cleansing Stop Words](http://demos.datasciencedojo.com/demo/stopwords/)
* Create a term frequency matrix for English words
* Azure ML's [Feature Hashing Module](https://msdn.microsoft.com/library/azure/c9a82660-2d9c-411d-8122-4d9e0b3ce92a)
* Drop the tweet_text column, since it is no longer needed
* Azure ML's Project Columns module
* Feature Selection & Filtering
* Pick only the most X relevant columns/words to train on.
* Using Azure ML's [Filter based Selection](https://msdn.microsoft.com/library/azure/818b356b-045c-412b-aa12-94a1d2dad90f) module, set to Pearson's correlation to select the top 5000 most correlated columns
* Normalize the Term Frequency Matrix
* Text processing best practice, but does not matter too much for Tweets
* Normalize Data Module: Min/Max for all numeric columns
### Algorithm Selection
* [Algorithm Cheat Sheet](https://azure.microsoft.com/en-us/documentation/articles/machine-learning-algorithm-cheat-sheet/)
* [Beginner's Guide to Choosing Algorithms](https://azure.microsoft.com/en-us/documentation/articles/machine-learning-algorithm-choice/)
* [Azure ML's Support Vector Machines](https://msdn.microsoft.com/en-us/library/azure/dn905835.aspx)
* [Support Vector Machines in General](https://en.wikipedia.org/wiki/Support_vector_machine)
### Model Building
* Train the model
* Score the trained model against a validation set
* Evaluate the performance, maximizing accuracy in this case
### Twitter App
* [Creating a Twitter Account](https://www.hashtags.org/platforms/twitter/how-to-create-a-twitter-account/)
* [Creating a Twitter App](http://www.ning.com/help/?p=4955)
* Get your [Twitter app's](https://apps.twitter.com/) OAuth keys and tokens.
### Twitter API with Python
* [Twitter API for all languages](https://dev.twitter.com/overview/api/twitter-libraries)
* [Tweepy Python Package](https://github.com/tweepy/tweepy)
* [Streaming with Tweepy](http://tweepy.readthedocs.org/en/v3.2.0/streaming_how_to.html?highlight=stream)
### Azure Event Hub
* Create a Service Bus Namespace
* Create an Azure Event Hub
* Create a send key (to push data to)
* Create a manage key (stream processor)
* Create a listen key (to subscribe to)
* [Pushing to Azure Event Hub](http://azure-sdk-for-python.readthedocs.org/en/latest/servicebus.html)
* [Viewing inside of an Azure Event Hub](https://azure.microsoft.com/en-us/documentation/articles/event-hubs-csharp-ephcs-getstarted/)
Deploy the Model
Hook up Stream Processors
\ No newline at end of file
import tweepy
# import json

# Twitter app OAuth credentials -- fill these in before running.
consumer_token = ''
consumer_secret = ''
key = ''
secret = ''

# Authenticate against the Twitter API.
auth = tweepy.OAuthHandler(consumer_token, consumer_secret)
auth.set_access_token(key, secret)
api = tweepy.API(auth)
api.verify_credentials()


class MyStreamListener(tweepy.StreamListener):
    """Stream listener that prints what it receives to stdout."""

    def on_status(self, status):
        # Print only the text of a parsed status.
        print(status.text)

    def on_data(self, twitter_data):
        # Print the raw payload handed over by the stream.
        # NOTE(review): in tweepy 3.x the base-class on_data is presumably what
        # dispatches to on_status, so with this override on_status may never
        # be called -- confirm which of the two handlers is intended.
        print(twitter_data)
        # tweetJSON = json.loads(twitter_data)
        # print(tweetJSON['text'].encode("utf-8"))


# Reuse the single listener instance instead of constructing a second one.
myStreamListener = MyStreamListener()
myStream = tweepy.Stream(auth=api.auth, listener=myStreamListener)

# The original passed `async=False`, which only restated the default and is a
# SyntaxError on Python >= 3.7 where `async` is a reserved keyword. Omitting it
# keeps the blocking (synchronous) behaviour on every tweepy 3.x release.
myStream.sample(languages=['en'])
# Intro to Business Data Analysis with Excel
GitHub Repository for the 03/08/2017 Meetup titled "[Business Data Analysis with Excel](https://www.meetup.com/data-science-dojo/events/236198327/)".
These materials make extensive use of the examples documented in the book "[Making Sense of Data](https://www.amazon.com/Making-Sense-Data-Donald-Wheeler/dp/0945320728/)" by Donald J. Wheeler. This book is highly recommended to all Data/Business Analysts interested in expanding the rigor of their analyses.
This source diff could not be displayed because it is too large. You can view the blob instead.
# datasets
A public repo of datasets
This source diff could not be displayed because it is too large. You can view the blob instead.
---
title: "Work and fun in Data Science Dojo"
author: your name
date:
output:
pdf_document:
toc: true
---
[linked phrase](http://datasciencedojo.com/)
# My story of Titanic tragedy
## Obtain the data
<!-- You may want to load data here -->
## Overview of the data
<!-- You may want to do the preliminary exploration of the data, using str(), summary(), head(), class(), etc. -->
<!-- Also write down your feelings of the data -->
## Modification of the original data
<!-- You can revise the data you got. -->
<!-- For example: if you feel the feature Survived would be better as a factor, you can do something like: titanic$Survived = factor(titanic$Survived, labels=c("died", "survived")) -->
## First plot of Titanic data
<!-- Make your first plot of Titanic data, and write down what you see from the plot. -->
<!-- Feel free to revise the headers to make this storybook nicer. -->
## Second plot of Titanic data
<!-- Make the 2nd, 3rd, 4th plots from here. Doesn't need to be a lot, but try to make every single one telling. -->
## Your summary of the Titanic data (story of Titanic tragedy)
* First...
* Second...
* Third...
* Fourth...
# Another course in Data Science Dojo
<!-- Keep adding your note, code and thoughts during the bootcamp! -->
# Another course in Data Science Dojo
# Important contacts in DSD bootcamp
* Raja Iqbal (Instructor)
[email protected]
* Jasmine Wilkerson (Instructor)
[email protected]
* Phuc Duong (Instructor)
[email protected]
* Yuhui Zhang (Instructor)
[email protected]
* Lisa Nicholson
[email protected]
This source diff could not be displayed because it is too large. You can view the blob instead.
#=======================================================================================
#
# File: CustomerQuery.R
# Author: Dave Langer
# Description: This code illustrates querying a SQL Server database via the RODBC
# package for the "Introduction to R Visualization with Power BI " Meetup
# dated 03/15/2017. More details on the Meetup are available at:
#
# https://www.meetup.com/Data-Science-Dojo-Toronto/events/237952698/
#
# The code in this file leverages data from Microsoft's Wide World
# Importers sample database available at:
#
# https://github.com/Microsoft/sql-server-samples/releases/tag/wide-world-importers-v1.0
#
# NOTE - This file is provided "As-Is" and no warranty regarding its contents is
# offered or implied. USE AT YOUR OWN RISK!
#
#=======================================================================================
# Uncomment and run these lines of code to install required packages
#install.packages("RODBC")
library(RODBC)

# Open connection using Windows ODBC DSN
dbhandle <- odbcConnect("RConnection")

# odbcConnect() reports failure by returning -1 (with a warning) instead of
# raising an error, so verify that a real connection handle came back.
if (!inherits(dbhandle, "RODBC")) {
  stop("Could not open ODBC connection for DSN 'RConnection'", call. = FALSE)
}

# Query database for a denormalized view of [Fact][Sale] data.
# The `finally` clause closes the DB connection even if the query raises.
dataset <- tryCatch(
  sqlQuery(dbhandle,
"SELECT [C].[CustomerID]
,[C].[CustomerName]
,[C].[BuyingGroupID]
,[C].[DeliveryMethodID]
,[C].[DeliveryCityID]
,[C].[DeliveryAddressLine1]
,[C].[DeliveryAddressLine2]
,[CITY].[CityName]
,[P].[StateProvinceCode]
,[C].[DeliveryPostalCode]
,[CC].[CustomerCategoryName]
,[BG].[BuyingGroupName]
,[O].[OrderID]
,[O].[OrderDate]
,[OL].[OrderLineID]
,[OL].[Quantity]
,[OL].[UnitPrice]
,[OL].[Quantity] * [OL].[UnitPrice] AS [LineTotal]
,[SC].[SupplierCategoryName]
FROM [WideWorldImporters].[Sales].[Customers] C
INNER JOIN [WideWorldImporters].[Sales].[CustomerCategories] CC ON ([C].[CustomerCategoryID] = [CC].[CustomerCategoryID])
LEFT OUTER JOIN [WideWorldImporters].[Sales].[BuyingGroups] BG ON ([C].[BuyingGroupID] = [BG].[BuyingGroupID])
INNER JOIN [WideWorldImporters].[Sales].[Orders] O ON ([C].[CustomerID] = [O].[CustomerID])
INNER JOIN [WideWorldImporters].[Sales].[OrderLines] OL ON ([O].[OrderID] = [OL].[OrderID])
INNER JOIN [WideWorldImporters].[Warehouse].[StockItems] SI ON ([OL].[StockItemID] = [SI].[StockItemID])
INNER JOIN [WideWorldImporters].[Purchasing].[Suppliers] S ON ([SI].[SupplierID] = [S].[SupplierID])
INNER JOIN [WideWorldImporters].[Purchasing].[SupplierCategories] SC ON ([S].[SupplierCategoryID] = [SC].[SupplierCategoryID])
INNER JOIN [WideWorldImporters].[Application].[Cities] CITY ON ([C].[DeliveryCityID] = [CITY].[CityID])
INNER JOIN [WideWorldImporters].[Application].[StateProvinces] P ON ([CITY].[StateProvinceID] = [P].[StateProvinceID])",
           stringsAsFactors = FALSE),
  finally = odbcClose(dbhandle)
)

# On failure sqlQuery() returns its error messages (not a data frame); refuse
# to persist that as if it were the query result.
if (!is.data.frame(dataset)) {
  stop("Query against WideWorldImporters failed: ",
       paste(dataset, collapse = "; "),
       call. = FALSE)
}

# Save off data frame in .RData binary format
save(dataset, file = "CustomerData.RData")
#=======================================================================================
#
# File: CustomerVisualizations.R
# Author: Dave Langer
# Description: This code illustrates R visualizations used in the "Introduction to R
# Visualization with Power BI " Meetup dated 03/15/2017. More details on
# the Meetup are available at:
#
# https://www.meetup.com/Data-Science-Dojo-Toronto/events/237952698/
#
# The code in this file leverages data from Microsoft's Wide World
# Importers sample Data Warehouse available at:
#
# https://github.com/Microsoft/sql-server-samples/releases/tag/wide-world-importers-v1.0
#
# NOTE - This file is provided "As-Is" and no warranty regarding its contents is
# offered or implied. USE AT YOUR OWN RISK!
#
#=======================================================================================
# Uncomment and run these lines of code to install required packages
#install.packages("dplyr")
#install.packages("lubridate")
#install.packages("ggplot2")
#install.packages("scales")
#install.packages("qcc")
# NOTE - Change your working directory as needed
load("CustomerData.RData")

# Derive the Year and Month columns from OrderDate so the data frame has the
# same shape as the one Power BI hands to the R visual
library(dplyr)
library(lubridate)

dataset <- mutate(dataset,
                  Year = year(OrderDate),
                  Month = month(OrderDate, label = TRUE))
#=============================================================================
#
# Visualization #1 - Aggregated dynamic bar charts by Customer Category
#
#=============================================================================
library(dplyr)
library(ggplot2)
library(scales)

# Get total revenue by Buying Group, Supplier Category and Customer Category.
# summarize() only peels off the last grouping level, so ungroup() explicitly
# to get a plain data frame.
customer.categories <- dataset %>%
  group_by(BuyingGroupName, SupplierCategoryName, CustomerCategoryName) %>%
  summarize(TotalRevenue = sum(LineTotal)) %>%
  ungroup()

# Aggregate data across all supplier categories and label those rows so they
# show up as their own facet column
all.suppliers <- dataset %>%
  group_by(BuyingGroupName, CustomerCategoryName) %>%
  summarize(TotalRevenue = sum(LineTotal)) %>%
  ungroup() %>%
  mutate(SupplierCategoryName = "All Suppliers")

# Add aggregated data. bind_rows() matches columns by name; the previous
# rbind() on still-grouped results carried over the grouping metadata of the
# first operand, which does not describe the appended rows.
customer.categories <- bind_rows(customer.categories, all.suppliers)

# Format visualization title string dynamically
title.str.1 <- paste("Total Revenue for",
                     dataset$Year[1],
                     "by Buying Group and Supplier/Customer Categories for",
                     nrow(dataset),
                     "Rows of Data",
                     sep = " ")

# Plot: horizontal bars of revenue per customer category, one facet per
# (buying group, supplier category) combination
ggplot(customer.categories, aes(x = CustomerCategoryName, y = TotalRevenue, fill = BuyingGroupName)) +
  theme_bw() +
  coord_flip() +
  facet_grid(BuyingGroupName ~ SupplierCategoryName) +
  geom_bar(stat = "identity") +
  scale_y_continuous(labels = comma) +
  theme(text = element_text(size = 18),
        axis.text.x = element_text(size = 12, angle = 90, hjust = 1)) +
  labs(x = "Customer Category",
       y = "Total Revenue",
       fill = "Buying Group",
       title = title.str.1)
#=============================================================================
#
# Visualization #2 - Aggregated Process Behavior Charts
#
#=============================================================================
# Artificial filter for the example: keep rows without a buying group from
# 2013 and 2014 only
dataset <- filter(dataset,
                  is.na(BuyingGroupName),
                  Year == 2013 | Year == 2014)

# Power BI code starts here
library(dplyr)
library(qcc)

# First and last year present in the (filtered) data
Year1 <- min(dataset$Year)
Year2 <- max(dataset$Year)

# Monthly revenue totals for the two years, ordered chronologically
totals <- dataset %>%
  filter(Year == Year1 | Year == Year2) %>%
  mutate(Month = substr(Month, 1, 3)) %>%
  mutate(MonthNum = match(Month, month.abb)) %>%
  group_by(Year, MonthNum, Month) %>%
  summarize(TotalRevenue = sum(LineTotal)) %>%
  mutate(Label = paste(Month, Year, sep = "-")) %>%
  arrange(Year, MonthNum)

# Dummy variables that give the chart readable labels.
# The positional split takes rows 1-12 and 13-24 of the ordered totals, i.e.
# it relies on each year contributing exactly twelve monthly rows.
Revenue.Group.1 <- totals[["TotalRevenue"]][1:12]
Revenue.Group.2 <- totals[["TotalRevenue"]][13:24]

# Chart title built from the years plus the first row's category and group
title.str <- paste0("Process Behavior Chart - ", Year1, " and ", Year2, " ",
                    dataset$CustomerCategoryName[1], " Total Revenue for Buying Group '",
                    dataset$BuyingGroupName[1], "'")