From 1e2b6b6d031740fdb4938c3cba8ebe4c4b17c1d1 Mon Sep 17 00:00:00 2001
From: arhamakheel
Date: Fri, 16 Mar 2018 12:49:09 -0700
Subject: [PATCH] New folder for Web Scraping Python code to tutorials

---
 Web Scraping with Python and BeautifulSoup/Web Scraping with Python and Beautiful Soup.py | 56 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 56 insertions(+)
 create mode 100644 Web Scraping with Python and BeautifulSoup/Web Scraping with Python and Beautiful Soup.py

diff --git a/Web Scraping with Python and BeautifulSoup/Web Scraping with Python and Beautiful Soup.py b/Web Scraping with Python and BeautifulSoup/Web Scraping with Python and Beautiful Soup.py
new file mode 100644
index 0000000..4c551b8
--- /dev/null
+++ b/Web Scraping with Python and BeautifulSoup/Web Scraping with Python and Beautiful Soup.py
@@ -0,0 +1,56 @@
+from bs4 import BeautifulSoup as soup  # HTML parser
+from urllib.request import urlopen as uReq  # Web client
+
+# URL to scrape from.
+# In this example we scrape graphics cards from Newegg.com.
+page_url = "http://www.newegg.com/Product/ProductList.aspx?Submit=ENE&N=-1&IsNodeId=1&Description=GTX&bop=And&Page=1&PageSize=36&order=BESTMATCH"
+
+# Opens the connection and downloads the HTML page from the URL.
+uClient = uReq(page_url)
+
+# Parses the HTML into a BeautifulSoup object so it can be
+# traversed and queried like a tree.
+page_soup = soup(uClient.read(), "html.parser")
+uClient.close()
+
+# Finds each product container on the store page.
+containers = page_soup.find_all("div", {"class": "item-container"})
+
+# Name of the output file to write to local disk.
+out_filename = "graphics_cards.csv"
+# Header row of the CSV file to be written.
+headers = "brand,product_name,shipping\n"
+
+# Opens the file and writes the header row.
+f = open(out_filename, "w")
+f.write(headers)
+
+# Loops over each product container and grabs attributes
+# about each product.
+for container in containers:
+    # Finds all link tags "a" within the first div.
+    make_rating_sp = container.div.select("a")
+
+    # Grabs the brand from the image's title attribute,
+    # then applies proper casing with .title().
+    brand = make_rating_sp[0].img["title"].title()
+
+    # Grabs the text of the third "a" tag (index 2), which
+    # holds the product name.
+    product_name = make_rating_sp[2].text
+
+    # Grabs the shipping information by searching for list items
+    # with the class "price-ship", strips surrounding whitespace,
+    # then removes "$" and " Shipping" so only the price (or
+    # "Free") remains.
+    shipping = container.find_all("li", {"class": "price-ship"})[0].text.strip().replace("$", "").replace(" Shipping", "")
+
+    # Prints the record to the console.
+    print("brand: " + brand + "\n")
+    print("product_name: " + product_name + "\n")
+    print("shipping: " + shipping + "\n")
+
+    # Writes the record to the CSV file (commas in the name become "|").
+    f.write(brand + ", " + product_name.replace(",", "|") + ", " + shipping + "\n")
+
+f.close()  # Close the file
\ No newline at end of file
--
libgit2 0.26.0