Commit bbcc789a by Jonathan Kelly

Create switchup.ipynb

{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"source": [
"# $ python -m pip install beautifulsoup4"
],
"outputs": [],
"metadata": {}
},
{
"cell_type": "markdown",
"source": [
"#### Creating dataframe for review data"
],
"metadata": {}
},
{
"cell_type": "code",
"execution_count": 2,
"source": [
"import requests\r\n",
"from bs4 import BeautifulSoup\r\n",
"\r\n",
"\r\n",
"URL = \"https://www.switchup.org/bootcamps/data-science-dojo?page=1\"\r\n",
"page = requests.get(URL)\r\n",
"soup = BeautifulSoup(page.content, \"html.parser\")"
],
"outputs": [],
"metadata": {}
},
{
"cell_type": "markdown",
"source": [
"#### Get number of pages for reviews by iterating through all review pages and building of list of review page urls."
],
"metadata": {}
},
{
"cell_type": "code",
"execution_count": null,
"source": [
"n=1\r\n",
"while True:\r\n",
" response = requests.get(URL + str(n), allow_redirects=False)\r\n",
" if response.status_code == 200:\r\n",
" n+=1\r\n",
" else:\r\n",
" break\r\n",
"\r\n",
"main_page_url_list = []\r\n",
"pages_to_scrape = n\r\n",
"i = 1\r\n",
"main_page_url_list = [URL + str(i) for i in range(i,n)]\r\n",
"print(main_page_url_list) # check"
],
"outputs": [],
"metadata": {}
},
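{
"cell_type": "markdown",
"source": [
"A possible variant of the probe above (a sketch added here, not part of the original notebook): the same count can be done with a shared `requests.Session`, an explicit timeout, and a cap on the number of pages probed. It assumes `base_url` is the review-page URL prefix ending in `?page=`."
],
"metadata": {}
},
{
"cell_type": "code",
"execution_count": null,
"source": [
"# Hedged sketch: count review pages with a shared session, a timeout, and a page cap.\r\n",
"# Assumes base_url ends with ?page= (an assumption, matching the cell above).\r\n",
"import requests\r\n",
"\r\n",
"def count_review_pages(base_url, max_pages=50):\r\n",
"    with requests.Session() as session:\r\n",
"        for page in range(1, max_pages + 1):\r\n",
"            resp = session.get(base_url + str(page), allow_redirects=False, timeout=10)\r\n",
"            if resp.status_code != 200:\r\n",
"                return page - 1\r\n",
"    return max_pages\r\n",
"\r\n",
"# Example usage (hypothetical): count_review_pages(base_url)"
],
"outputs": [],
"metadata": {}
},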
{
"cell_type": "code",
"execution_count": 15,
"source": [
"# for x, _ in enumerate(range(1,n)):\r\n",
"# url = main_page_url_list[]\r\n",
"page = requests.get('https://www.switchup.org/bootcamps/data-science-dojo?page=2')\r\n",
"soup = BeautifulSoup(page.content, \"html.parser\")\r\n",
"pretty_soup = soup.prettify()\r\n",
"with open(\"Review page \" + str(2) + \".txt\", \"w\", encoding=\"utf-8\") as f:\r\n",
" f.write(str(pretty_soup))"
],
"outputs": [],
"metadata": {}
},
{
"cell_type": "code",
"execution_count": 16,
"source": [
"results = soup.find(id=\"reviews\")\r\n",
"job_elements = results.find_all(\"div\", class_=\"section--white border-grey\")\r\n",
"\r\n",
"with open(\"switchup_reviews2.txt\", \"w+\") as j:\r\n",
" j.write(str(job_elements))\r\n",
" j.close"
],
"outputs": [],
"metadata": {}
},
{
"cell_type": "markdown",
"source": [
"#### Variables to be gathered"
],
"metadata": {}
},
{
"cell_type": "code",
"execution_count": 8,
"source": [
"name_and_job = soup.find_all(\"li\", class_=\"name\")\r\n",
"review_created_date = soup.find_all(\"div\", class_=\"created-at\")\r\n",
"review_text_trunc = soup.find_all(\"span\", class_=\"truncatable\")\r\n",
"ovr_cirr_job_rating = soup.find_all(\"div\", class_=\"rating-icons__filled\")"
],
"outputs": [],
"metadata": {}
},
{
"cell_type": "markdown",
"source": [
"#### Pulling Names and Job titles"
],
"metadata": {}
},
{
"cell_type": "code",
"execution_count": 9,
"source": [
"# Pull name and job from the list\r\n",
"nameJob = name_and_job[2]\r\n",
"double_dict1 = {}\r\n",
"name = []\r\n",
"job = []\r\n",
"for nameJob in name_and_job:\r\n",
" try:\r\n",
" names = nameJob.find('span').text[0:-1].strip()\r\n",
" name.append(names)\r\n",
" except:\r\n",
" name = \"N/A\"\r\n",
" try:\r\n",
" jobs = nameJob.contents[3].text.strip()\r\n",
" jobs = jobs[2:].strip()\r\n",
" job.append(jobs)\r\n",
" except:\r\n",
" jobs = \"N/A\"\r\n",
" job.append(jobs)\r\n",
"\r\n",
" # double_dict1[name] = job \r\n",
"print (name)\r\n",
"print (len(name))\r\n",
"print (job)\r\n",
"print (len(job))"
],
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"['Jacqueline', 'Jonathan', 'Kevin Lee', 'Jeffrey Bierman', 'Sarah', 'Anonymous', 'Anonymous', 'Sumit Hore', 'Ryan Eaton', 'Anonymous']\n",
"10\n",
"['Marketing Consultant', 'Data Science Intern', 'N/A', 'Graduate Student in Chemical Biology', 'Product Manager', 'Evaluator', 'N/A', 'Phd student in Knowledge Management', 'Researcher', 'N/A']\n",
"10\n"
]
}
],
"metadata": {}
},
{
"cell_type": "markdown",
"source": [
"#### Create a list of dates"
],
"metadata": {}
},
{
"cell_type": "code",
"execution_count": 10,
"source": [
"datelist = [x.text for x in review_created_date]\r\n",
"print(datelist)\r\n",
"print(len(datelist))\r\n"
],
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"['7/14/2021', '7/13/2021', '5/12/2021', '4/22/2021', '2/19/2021', '2/16/2021', '2/16/2021', '12/9/2020', '12/5/2020', '12/3/2020']\n",
"10\n"
]
}
],
"metadata": {}
},
{
"cell_type": "markdown",
"source": [
"#### Gather Reviews"
],
"metadata": {}
},
{
"cell_type": "code",
"execution_count": 11,
"source": [
"review_list = [x.text.replace('Read More','') for x in review_text_trunc[1:]]\r\n",
"review_list = [x.replace('Read Less','') for x in review_list]\r\n",
"review_list = [x.replace('...','') for x in review_list]\r\n",
"print(review_list)\r\n",
"print(len(review_list))"
],
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"[\"I really enjoyed the experience overall. The instructors were thorough and attentive in providing support for all learners whether new or experienced. I feel confident that I can build on the knowledge and skills I've gained to become a data science practitioner. \", 'I was making a career change and unsure about what course/bootcamp to take. After doing research, I decided Data Science Dojo was the best fit for me. It was such an awesome experience and I have learned a TON! The instructors were knowledgeable and were able to keep things easy to understand while still providing a challenging environment to grow. They never balked at anyone or made them feel small and I was never afraid to ask a question. Office hours were a great time to get 1-on-1 tutoring too. I feel prepared and ready for the next step in my career. I would definitely recommend.', 'This bootcamp is for people who want to get their feet wet in the data science field. The instructors were amazing, the platform was amazing, and the resources they had were amazing. The way the instructors broke down the core data science concept was very easy to understand.', \"Want a solid overview of common Python fundamentals for data analysis? Want down-to-earth instructors that are actually interested in their work, and you? Want to know how to evaluate data, clean it, visualize it, polish it, and feed it into a pipeline for machine learning? Want all of this in a five-day course? Look no further. I didn't know lists from dictionaries before this, and now I have a lot of essential tools at my disposal to do bioinformatic analysis for my graduate research. Not only that, but all of the course material is available to me for an additional six months after the course ended, so I consider more than five days of instruction. Plus, the instructors make sure they are available to help you and set you up for success. To add, I really liked the style of this course, and I think it is so pragmatic for learning programming. I love that our live sessions were recorded so that we can go back and see exactly what we saw before. I think my favorite aspect were the instructors, because they were very kind, knowledgeable, and made everything less intimidating. I love that the instructors were all around the world too! Such a cool way to showcase the wonders of working online and utilizing computers to learn about computers. I will hopefully be in contact with Data Science Dojo again in the future! Thank You!\", \"The online data science bootcamp offered by Data Science Dojo was a really valuable investment. Over a course of approximately 4 months, we had weekly 3-hour-long live classes with optional homework to deepen what we had learnt. From building and evaluating Machine Learning Models, to parameter tuning to text analytics and linear regression (and much more) - many relevant topics were covered and explained with such patience and good examples that even someone like me from a more non-technical/non-mathematical background was able to understand and follow along. The instructors were empathetic and extremely knowledgable and welcomed students' questions at any time throughout the course. The homework was appropriate and voluntary - the suggested reading material was interesting and contributed to a better understanding of certain topics.\", 'The boot camp provided me an overview of all key machine learning tools in a practical way that I could apply them in my work, and grow my expertise. I would highly recommend it. 
It allows you to think about the use case for the approaches, and when and where to most effectively use them.', 'This course has something for everyone! I especially was happy to learn more about R and Azure. If you are looking for help with text analytics, predictive modeling, logistic regression, they teach on these topics and more. The instructors are there to answer questions and to go in-depth during class and office hours.', 'I did a lot of research on data science boot camps before I finally choose the Data Science Dojo. I joined the boot camp with a business and economics background to enhance my understanding of data science and more specifically on predictive modelling and machine learning.The classes were intensive, interactive, full of practical examples, exercises, and labs. The tutors and mentors were extremely competent, friendly, and available to provide one-to-one help to solve the class assignments and projects. This boot camp also allows the participants to earn 7 credit points from the University of New Mexico, USA.After attending the boot camp, I strongly believe that it can be a great start for anyone who loves data and to build their career in data science.', 'I came to the bootcamp with a background in scientific data analysis hoping to broaden my understanding of predictive models and machine learning. Data Science Dojo surpassed my expectations through combination of in-person, discussion-oriented classes and practical, hands-on exercises. Modules on feature engineering and cross-validation techniques I found especially useful. From day one it was obvious that emphasis would be on algorithms and rationale underlying the predictive models covered over intricacies of the particular functions/libraries used to generate them. Though I had no trouble figuring out how to code to train and test models after working through the well-documented detailed exercises. All-in-all, Data Science Dojo is a strong choice for the student wanting to understand machine learning fundamentals.', \"I'm really converted. I never thought a data science bootcamp course could bring such great value to me but here I am converted let ring you know the DoJo bootcamp is the best data science course I recommend strongly. The teaching is excellent and class discussion is really interesting. The school also arranged the course time fitting my timezone which was amazing. If anyone wishes a strong foundation and basis of data science, I strongly recommend this course. It gives you confidence and makes you feel you grasped the important concepts. For me, it gets five stars.\"]\n",
"10\n"
]
}
],
"metadata": {}
},
{
"cell_type": "code",
"execution_count": 12,
"source": [
"#NOTE First 4 positions in list are for DSD - 0 = overall, 1 = overall (average), 2 = Curriculum, 3 = job support ||\r\n",
"#NOTE Currently the last 7 positions are for ads - but that may change - basically anything not divisible by 20 is either DSD overall or an ad. \r\n",
"#NOTE Switchup only let's people give whole stars. Divide by 20 to get number of stars.\r\n",
"overall_rating = []\r\n",
"curriculum_rating = []\r\n",
"job_support_rating = []\r\n",
"star_rating_list = [x['style'] for x in ovr_cirr_job_rating]\r\n",
"star_rating_list = [x.replace('width:','') for x in star_rating_list]\r\n",
"star_rating_list = [x[0:3] for x in star_rating_list]\r\n",
"star_rating_list = [x.replace('.','') for x in star_rating_list]\r\n",
"star_rating_list = [x for x in star_rating_list if x in {'100', '80', '60', '40', '20', '00'}]\r\n",
"star_rating_list = [star_rating_list[i:i+3] for i in range(0, len(star_rating_list), 3)]\r\n",
"print(star_rating_list)\r\n",
"print(len(star_rating_list))\r\n",
"\r\n",
"for starrating in star_rating_list:\r\n",
" overall_rating.append(starrating[0])\r\n",
" curriculum_rating.append(starrating[1])\r\n",
" job_support_rating.append(starrating[2])\r\n",
"for i in job_support_rating:\r\n",
" if i == '00':\r\n",
" job_support_rating[job_support_rating.index(i)] = 'NA'\r\n",
"print(overall_rating)\r\n",
"print(curriculum_rating)\r\n",
"print(job_support_rating)\r\n",
" \r\n"
],
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"[['100', '100', '00'], ['100', '100', '100'], ['100', '100', '00'], ['100', '100', '100'], ['100', '100', '00'], ['100', '100', '100'], ['100', '100', '100'], ['100', '100', '100'], ['80', '100', '80'], ['100', '100', '00']]\n",
"10\n",
"['100', '100', '100', '100', '100', '100', '100', '100', '80', '100']\n",
"['100', '100', '100', '100', '100', '100', '100', '100', '100', '100']\n",
"['NA', '100', 'NA', '100', 'NA', '100', '100', '100', '80', 'NA']\n"
]
}
],
"metadata": {}
},
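{
"cell_type": "markdown",
"source": [
"The ratings above are kept as width-percentage strings ('100', '80', ..., 'NA'). A minimal sketch (added here, not part of the original notebook) of converting them to whole-star counts by dividing the width by 20, with 'NA' passed through unchanged:"
],
"metadata": {}
},
{
"cell_type": "code",
"execution_count": null,
"source": [
"# Hedged sketch: map width-percentage strings to star counts (100 -> 5, 80 -> 4, ...), keeping 'NA' as-is.\r\n",
"def to_stars(width_pct):\r\n",
"    # Switchup widths come in steps of 20, so one star corresponds to 20% width\r\n",
"    return 'NA' if width_pct == 'NA' else int(width_pct) // 20\r\n",
"\r\n",
"overall_stars = [to_stars(x) for x in overall_rating]\r\n",
"curriculum_stars = [to_stars(x) for x in curriculum_rating]\r\n",
"job_support_stars = [to_stars(x) for x in job_support_rating]\r\n",
"print(overall_stars)\r\n",
"print(curriculum_stars)\r\n",
"print(job_support_stars)"
],
"outputs": [],
"metadata": {}
},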
{
"cell_type": "markdown",
"source": [
"Add each list"
],
"metadata": {}
},
{
"cell_type": "code",
"execution_count": 13,
"source": [
"import pandas as pd\r\n",
"\r\n",
"switchup_reviews = pd.DataFrame([name, job, datelist, review_list,overall_rating,curriculum_rating,job_support_rating])\r\n",
"switchup_reviews = switchup_reviews.transpose()\r\n",
"switchup_reviews.columns=['Name','Job','Date of Review', 'Review','Overall Rating','Curriculum Rating','Job Support Rating']\r\n",
"switchup_reviews.head()\r\n"
],
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Name</th>\n",
" <th>Job</th>\n",
" <th>Date of Review</th>\n",
" <th>Review</th>\n",
" <th>Overall Rating</th>\n",
" <th>Curriculum Rating</th>\n",
" <th>Job Support Rating</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>Jacqueline</td>\n",
" <td>Marketing Consultant</td>\n",
" <td>7/14/2021</td>\n",
" <td>I really enjoyed the experience overall. The i...</td>\n",
" <td>100</td>\n",
" <td>100</td>\n",
" <td>NA</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>Jonathan</td>\n",
" <td>Data Science Intern</td>\n",
" <td>7/13/2021</td>\n",
" <td>I was making a career change and unsure about ...</td>\n",
" <td>100</td>\n",
" <td>100</td>\n",
" <td>100</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>Kevin Lee</td>\n",
" <td>N/A</td>\n",
" <td>5/12/2021</td>\n",
" <td>This bootcamp is for people who want to get th...</td>\n",
" <td>100</td>\n",
" <td>100</td>\n",
" <td>NA</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>Jeffrey Bierman</td>\n",
" <td>Graduate Student in Chemical Biology</td>\n",
" <td>4/22/2021</td>\n",
" <td>Want a solid overview of common Python fundame...</td>\n",
" <td>100</td>\n",
" <td>100</td>\n",
" <td>100</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>Sarah</td>\n",
" <td>Product Manager</td>\n",
" <td>2/19/2021</td>\n",
" <td>The online data science bootcamp offered by Da...</td>\n",
" <td>100</td>\n",
" <td>100</td>\n",
" <td>NA</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Name Job Date of Review \\\n",
"0 Jacqueline Marketing Consultant 7/14/2021 \n",
"1 Jonathan Data Science Intern 7/13/2021 \n",
"2 Kevin Lee N/A 5/12/2021 \n",
"3 Jeffrey Bierman Graduate Student in Chemical Biology 4/22/2021 \n",
"4 Sarah Product Manager 2/19/2021 \n",
"\n",
" Review Overall Rating \\\n",
"0 I really enjoyed the experience overall. The i... 100 \n",
"1 I was making a career change and unsure about ... 100 \n",
"2 This bootcamp is for people who want to get th... 100 \n",
"3 Want a solid overview of common Python fundame... 100 \n",
"4 The online data science bootcamp offered by Da... 100 \n",
"\n",
" Curriculum Rating Job Support Rating \n",
"0 100 NA \n",
"1 100 100 \n",
"2 100 NA \n",
"3 100 100 \n",
"4 100 NA "
]
},
"metadata": {},
"execution_count": 13
}
],
"metadata": {}
},
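{
"cell_type": "markdown",
"source": [
"An equivalent way to build the same table (a sketch added here, not in the original notebook) is to pass a column dict straight to `pd.DataFrame`, which avoids the transpose and keeps each column name next to its data:"
],
"metadata": {}
},
{
"cell_type": "code",
"execution_count": null,
"source": [
"# Hedged sketch: construct the reviews table from a column dict instead of transposing a row-wise frame.\r\n",
"import pandas as pd\r\n",
"\r\n",
"switchup_reviews_alt = pd.DataFrame({\r\n",
"    'Name': name,\r\n",
"    'Job': job,\r\n",
"    'Date of Review': datelist,\r\n",
"    'Review': review_list,\r\n",
"    'Overall Rating': overall_rating,\r\n",
"    'Curriculum Rating': curriculum_rating,\r\n",
"    'Job Support Rating': job_support_rating,\r\n",
"})\r\n",
"switchup_reviews_alt.head()"
],
"outputs": [],
"metadata": {}
},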
{
"cell_type": "markdown",
"source": [
"#### Saving"
],
"metadata": {}
},
{
"cell_type": "code",
"execution_count": 14,
"source": [
"switchup_reviews.to_csv('switchup_reviews.csv', index=False)"
],
"outputs": [],
"metadata": {}
},
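{
"cell_type": "markdown",
"source": [
"A quick read-back check (a sketch added here, not part of the original notebook) to confirm the CSV round-trips with the expected shape and columns:"
],
"metadata": {}
},
{
"cell_type": "code",
"execution_count": null,
"source": [
"# Hedged sketch: reload the saved CSV and confirm the shape and columns survived the round trip.\r\n",
"import pandas as pd\r\n",
"\r\n",
"reloaded = pd.read_csv('switchup_reviews.csv')\r\n",
"print(reloaded.shape)\r\n",
"print(list(reloaded.columns))\r\n",
"reloaded.head()"
],
"outputs": [],
"metadata": {}
},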
{
"cell_type": "markdown",
"source": [
"# Extra code from other methods"
],
"metadata": {}
},
{
"cell_type": "code",
"execution_count": null,
"source": [
"with open(\"Review page 1.txt\", \"w+\") as f:\r\n",
" for job_element in job_elements:\r\n",
" name_and_job = job_element.find(\"li\", class_=\"name\")\r\n",
" review_created_date = job_element.find(\"div\", class_=\"created-at\")\r\n",
" review_text_trunc = job_element.find(\"span\", class_=\"truncatable\")\r\n",
" review_text_more = job_element.find(\"span\", class_=\"read-more\")\r\n",
" ovr_cirr_job_rating = job_element.find(\"div\", class_=\"rating-icons__filled\")\r\n",
" line_item = list(\r\n",
" [\r\n",
" name_and_job,\r\n",
" review_created_date,\r\n",
" review_text_trunc,\r\n",
" review_text_more,\r\n",
" ovr_cirr_job_rating,\r\n",
" ]\r\n",
" )\r\n",
" f.write(str(line_item))\r\n",
" f.write(\"\\n\"*2)\r\n",
"\r\n",
"j.close"
],
"outputs": [],
"metadata": {}
},
{
"cell_type": "markdown",
"source": [
"#### Extra code"
],
"metadata": {}
},
{
"cell_type": "code",
"execution_count": null,
"source": [
"import pandas as pd\r\n",
"from html.parser import HTMLParser\r\n",
"\r\n",
"# Create table parser and extract table data\r\n",
"class Parser(HTMLParser):\r\n",
" def __init__(self):\r\n",
" HTMLParser.__init__(self)\r\n",
" self.in_td = False\r\n",
" \r\n",
" def handle_starttag(self, tag, attrs):\r\n",
" if tag in ['span', 'p', 'div']:\r\n",
" self.in_td = True\r\n",
" \r\n",
" def handle_data(self, data):\r\n",
" if self.in_td:\r\n",
" if data == '\\xa0':\r\n",
" list.append('') # This is a placeholder for the draft round will fill in later\r\n",
" else:\r\n",
" list.append(data)\r\n",
" \r\n",
" def handle_endtag(self, tag):\r\n",
" self.in_td = False\r\n",
"\r\n",
"list = []\r\n",
"data = open('Review page 2.txt', 'r')\r\n",
"data = data.read()\r\n",
"p = Parser()\r\n",
"p.feed(str(data))\r\n",
"\r\n",
"print(list)\r\n",
"\r\n",
"reviewDetails = [list[x:x+6] for x in range(0, len(list), 6)]\r\n",
"print(reviewDetails[0])\r\n",
"print(reviewDetails[1])\r\n",
"print(reviewDetails[2])\r\n",
"print(reviewDetails[3])\r\n"
],
"outputs": [],
"metadata": {}
}
],
"metadata": {
"orig_nbformat": 4,
"language_info": {
"name": "python",
"version": "3.9.2",
"mimetype": "text/x-python",
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"pygments_lexer": "ipython3",
"nbconvert_exporter": "python",
"file_extension": ".py"
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3.9.2 64-bit"
},
"interpreter": {
"hash": "63fd5069d213b44bf678585dea6b12cceca9941eaf7f819626cde1f2670de90d"
}
},
"nbformat": 4,
"nbformat_minor": 2
}