Usman Shahid / tutorials · Commits

Commit 1e2b6b6d, authored Mar 16, 2018 by Arham Akheel (parent 15a2fbed)

    New folder for Web Scraping Python code in tutorials

Showing 1 changed file with 57 additions and 0 deletions.

Web Scraping with Python and BeautifulSoup/Web Scraping with Python and Beautiful Soup.py (new file, mode 100644, +57 -0)

The new file downloads the Newegg GTX graphics-card search results page, parses it with BeautifulSoup, and writes the brand, product name, and shipping cost of each listing to graphics_cards.csv:
from bs4 import BeautifulSoup as soup  # HTML data structure
from urllib.request import urlopen as uReq  # Web client

# URL to web scrape from.
# In this example we scrape graphics cards from Newegg.com.
page_url = "http://www.newegg.com/Product/ProductList.aspx?Submit=ENE&N=-1&IsNodeId=1&Description=GTX&bop=And&Page=1&PageSize=36&order=BESTMATCH"

# Opens the connection and downloads the HTML page from the URL.
uClient = uReq(page_url)

# Parses the HTML into a soup data structure to traverse it
# as if it were a JSON data type.
page_soup = soup(uClient.read(), "html.parser")
uClient.close()

# Finds each product on the store page.
containers = page_soup.findAll("div", {"class": "item-container"})

# Name of the output file to write to local disk.
out_filename = "graphics_cards.csv"
# Header of the CSV file to be written.
headers = "brand,product_name,shipping\n"

# Opens the file and writes the headers.
f = open(out_filename, "w")
f.write(headers)

# Loops over each product and grabs attributes about each one.
for container in containers:
    # Finds all link tags "a" from within the first div.
    make_rating_sp = container.div.select("a")

    # Grabs the brand from the image's title attribute,
    # then applies proper casing using .title().
    brand = make_rating_sp[0].img["title"].title()

    # Grabs the text of the third "a" tag (index 2) from within
    # the list of matches.
    product_name = container.div.select("a")[2].text

    # Grabs the product shipping information by searching for all "li"
    # elements with the class "price-ship", then cleans the text of
    # whitespace with strip() and removes "$" and " Shipping" if present
    # to keep just the number.
    shipping = container.findAll("li", {"class": "price-ship"})[0].text.strip().replace("$", "").replace(" Shipping", "")

    # Prints the dataset to the console.
    print("brand: " + brand + "\n")
    print("product_name: " + product_name + "\n")
    print("shipping: " + shipping + "\n")

    # Writes the dataset to the file, replacing commas in the product
    # name so they don't break the CSV columns.
    f.write(brand + ", " + product_name.replace(",", "|") + ", " + shipping + "\n")

# Close the file.
f.close()
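
The script guards against stray commas by hand, swapping them for "|" before each row is written. A minimal sketch of the same write step using Python's standard csv module, which quotes such fields automatically, could look like the following; the sample row values are purely illustrative stand-ins for the brand, product_name, and shipping strings the loop above extracts:

import csv

# Illustrative stand-ins for values the loop above would extract with
# BeautifulSoup; these are not real scraped results.
rows = [
    ("Msi", "MSI GeForce GTX 1070, GAMING X 8G", "4.99"),
]

# newline="" is the csv module's recommended way to open the output file.
with open("graphics_cards.csv", "w", newline="") as out_file:
    writer = csv.writer(out_file)
    writer.writerow(["brand", "product_name", "shipping"])  # header row
    for brand, product_name, shipping in rows:
        # csv.writer quotes any field containing a comma, so no manual
        # replace(",", "|") is needed.
        writer.writerow([brand, product_name, shipping])

The with block also closes the file automatically, so the explicit f.close() at the end of the script would not be needed in this variant.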