"""Scrape graphics-card listings from a Newegg search page into a CSV file.

For every product container found on the page, the brand, product name and
shipping text are printed to the console and appended as one row to
``graphics_cards.csv``.
"""
from urllib.request import urlopen as uReq  # Web client

from bs4 import BeautifulSoup as soup  # HTML data structure

# URL to scrape from.
# In this example we scrape graphics cards from Newegg.com.
page_url = "http://www.newegg.com/Product/ProductList.aspx?Submit=ENE&N=-1&IsNodeId=1&Description=GTX&bop=And&Page=1&PageSize=36&order=BESTMATCH"

# Open the connection, download the HTML page and parse it into a soup
# data structure that can be traversed like a tree. The ``with`` block
# guarantees the connection is closed even if reading or parsing fails.
with uReq(page_url) as uClient:
    page_soup = soup(uClient.read(), "html.parser")

# Find each product on the store page.
containers = page_soup.findAll("div", {"class": "item-container"})

# Name of the output file written to local disk.
out_filename = "graphics_cards.csv"

# Header row of the CSV file.
headers = "brand,product_name,shipping \n"

# Open the file and write the header. An explicit encoding keeps product
# names with non-ASCII characters from failing on platforms whose default
# encoding cannot represent them; ``with`` closes the file on any exit path.
with open(out_filename, "w", encoding="utf-8") as f:
    f.write(headers)

    # Loop over each product and grab its attributes.
    for container in containers:
        # Find all link tags ("a") within the first div of the container.
        first_div = container.div
        if first_div is None:
            # Not a regular product listing; nothing to extract.
            continue
        make_rating_sp = first_div.select("a")
        if len(make_rating_sp) < 3:
            # The product-title link (third "a" tag) is missing, so the
            # row could not be identified; skip it instead of crashing.
            continue

        # Grab the brand from the image's title attribute and normalise
        # its casing with .title(). Some listings carry no brand image,
        # in which case the brand is left empty.
        brand_img = make_rating_sp[0].img
        if brand_img is not None:
            brand = brand_img.get("title", "").title()
        else:
            brand = ""

        # Grab the text of the third "a" tag (index 2) in the list.
        product_name = make_rating_sp[2].text

        # Grab the shipping information from the first list item with the
        # class "price-ship", strip surrounding white space, and remove
        # "$" and " Shipping" (if present) to leave just the amount.
        ship_items = container.findAll("li", {"class": "price-ship"})
        if ship_items:
            shipping = (
                ship_items[0].text.strip().replace("$", "").replace(" Shipping", "")
            )
        else:
            shipping = ""

        # Print the record to the console.
        print("brand: " + brand + "\n")
        print("product_name: " + product_name + "\n")
        print("shipping: " + shipping + "\n")

        # Write the record to the file. Commas inside the product name
        # are replaced with "|" so they do not split the CSV column.
        f.write(brand + ", " + product_name.replace(",", "|") + ", " + shipping + "\n")