Commit 1e2b6b6d by Arham Akheel

New Folder or Web Scraping Python code to tutoprials

parent 15a2fbed
from bs4 import BeautifulSoup as soup # HTML data structure
from urllib.request import urlopen as uReq # Web client
# URl to web scrap from.
# in this example we web scrap graphics cards from Newegg.com
page_url = "http://www.newegg.com/Product/ProductList.aspx?Submit=ENE&N=-1&IsNodeId=1&Description=GTX&bop=And&Page=1&PageSize=36&order=BESTMATCH"
# opens the connection and downloads html page from url
uClient = uReq(page_url)
# parses html into a soup data structure to traverse html
# as if it were a json data type.
page_soup = soup(uClient.read(), "html.parser")
uClient.close()
# finds each product from the store page
containers = page_soup.findAll("div", {"class": "item-container"})
# name the output file to write to local disk
out_filename = "graphics_cards.csv"
# header of csv file to be written
headers = "brand,product_name,shipping \n"
# opens file, and writes headers
f = open(out_filename, "w")
f.write(headers)
# loops over each product and grabs attributes about
# each product
for container in containers:
# Finds all link tags "a" from within the first div.
make_rating_sp = container.div.select("a")
# Grabs the title from the image title attribute
# Then does proper casing using .title()
brand = make_rating_sp[0].img["title"].title()
# Grabs the text within the second "(a)" tag from within
# the list of queries.
product_name = container.div.select("a")[2].text
# Grabs the product shipping information by searching
# all lists with the class "price-ship".
# Then cleans the text of white space with strip()
# Cleans the strip of "Shipping $" if it exists to just get number
shipping = container.findAll("li", {"class": "price-ship"})[0].text.strip().replace("$", "").replace(" Shipping", "")
# prints the dataset to console
print("brand: " + brand + "\n")
print("product_name: " + product_name + "\n")
print("shipping: " + shipping + "\n")
# writes the dataset to file
f.write(brand + ", " + product_name.replace(",", "|") + ", " + shipping + "\n")
f.close() # Close the file
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment