Commit 78d82c70 by Arham Noman

Adding files for webinar

parents
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Web scraping with BeautifulSoup"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Import required libraries"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"from bs4 import BeautifulSoup as soup\n",
"from urllib.request import urlopen as uReq\n",
"import pandas as pd"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Create an object and turn an HTML page into a soup object"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Fetching the raw HTML page"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"query = \"playstation+5\" # Put your search term here\n",
"my_url = 'YOUR_EBAY_URL'.format(query) # Paste the url for an Ebay search from your browser and replace the query part with a {}\n",
"my_url"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"uClient = uReq(my_url)\n",
"page_html = uClient.read()\n",
"uClient.close()\n",
"\n",
"page_soup = soup(page_html, 'html.parser')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Find all the relevant classes and extract data"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"divs = page_soup.find_all('div', {'class': 's-item__details clearfix'})  # Every listing's details live in one of these divs (find_all is the modern bs4 name for findAll)\n",
"prices = []\n",
"\n",
"for listing in divs:\n",
"    price_tag = listing.find('span', {'class': \"s-item__price\"})\n",
"    if price_tag is None:  # Some result tiles have no price span; skip them instead of crashing on .text\n",
"        continue\n",
"    price = price_tag.text[1:]          # Drop the leading currency symbol\n",
"    price = price.replace(',', '')      # Drop thousands separators\n",
"    prices.append(price)\n",
"\n",
"# errors='coerce' turns unparseable entries (e.g. \"99.99 to 129.99\" ranges) into NaN\n",
"prices = pd.to_numeric(prices, errors='coerce')\n",
"prices"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Use the scraped data to get insights into the market for PS5 consoles"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import seaborn as sns\n",
"sns.set()\n",
"sns.boxplot(data=prices)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"interpreter": {
"hash": "8fe041c8bf3f6c634cfbe99daa1d0835869ac67f33d4cde2619e21380e56c003"
},
"kernelspec": {
"display_name": "Python 3.9.7 64-bit",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.0"
},
"orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 2
}
import datetime
import logging
import azure.functions as func
from bs4 import BeautifulSoup as soup
from urllib.request import urlopen as uReq
import pandas as pd
my_url = "YOUR_EBAY_LINK" # Change this to your ebay search url
def main(mytimer: func.TimerRequest) -> None:
    """Timer-triggered Azure Function: scrape eBay listing prices and save them to CSV.

    Fetches the search-results page at the module-level ``my_url``, extracts the
    price of every listing, coerces them to numbers, and writes them to a
    timestamp-named CSV file in the function's working directory.

    Args:
        mytimer: Azure Functions timer binding (only triggers the run; unused).
    """
    # Timezone-aware UTC timestamp; datetime.utcnow() is deprecated since Python 3.12.
    utc_timestamp = datetime.datetime.now(datetime.timezone.utc).isoformat()

    # Fetch the raw HTML, always releasing the connection even if read() fails.
    uClient = uReq(my_url)
    try:
        page_html = uClient.read()
    finally:
        uClient.close()
    page_soup = soup(page_html, 'html.parser')

    # Each listing's details live in an 's-item__details clearfix' div.
    # find_all is the modern bs4 spelling of the deprecated findAll.
    divs = page_soup.find_all('div', {'class': 's-item__details clearfix'})

    # Extract the price text from each listing, skipping tiles without one
    # (find() returns None there, which would crash on .text).
    prices = []
    for listing in divs:
        price_tag = listing.find('span', {'class': "s-item__price"})
        if price_tag is None:
            continue
        price = price_tag.text[1:]      # drop the leading currency symbol
        price = price.replace(',', '')  # drop thousands separators
        prices.append(price)

    # errors='coerce' turns unparseable entries (e.g. "99.99 to 129.99" price
    # ranges) into NaN instead of raising ValueError and aborting the run —
    # consistent with the notebook version of this pipeline.
    prices = pd.DataFrame(pd.to_numeric(prices, errors='coerce'))

    # Build a filesystem-safe filename from the timestamp (keep only
    # alphanumerics and "._- ").
    name = "PS5" + str(utc_timestamp) + ".csv"
    name = "".join(x for x in name if (x.isalnum() or x in "._- "))
    prices.to_csv(name)

    logging.info('Python timer trigger function ran at %s', utc_timestamp)
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment