Add new file

0a9fa80e · Muhammad Sabih Ur · 0a9fa80e
Commit 0a9fa80e authored Apr 03, 2023 by Muhammad Sabih Ur
Hide whitespace changes
Inline Side-by-side

Showing with 55 additions and 0 deletions

.gitlab-ci.yml .gitlab-ci.yml +55 -0

No files found.
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
+import requests
+from bs4 import BeautifulSoup
+import pandas as pd
# Browser-like request headers sent with every Stack Overflow page fetch.
header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36',
    'Accept-Language': 'en-US,en;q=0.5',
    # 'br' is deliberately not advertised: requests can only decode Brotli
    # responses when an optional brotli package is installed; without it the
    # response text would be undecodable compressed bytes.
    'Accept-Encoding': 'gzip, deflate',
    'Connection': 'keep-alive',
}

# Accumulates one dict per scraped question across all getQuestions() calls.
data = []
+
+
def _parse_question(item):
    """Build one question record from a post-summary element.

    Returns a dict with 'title', 'description', 'date' and 'link' keys, or
    None when the element has no title link to identify the question.
    A missing excerpt or timestamp is recorded as an empty string.
    """
    link_tag = item.find('a', {'class': 's-link'})
    if link_tag is None or not link_tag.get('href'):
        return None

    excerpt_tag = item.find('div', {'class': 's-post-summary--content-excerpt'})
    time_tag = item.find('span', {'class': 'relativetime'})

    return {
        'title': link_tag.text.strip(),
        'description': excerpt_tag.text.strip() if excerpt_tag is not None else '',
        'date': time_tag.get('title', '') if time_tag is not None else '',
        # Strip the href's leading slash so the joined URL has a single '/'.
        'link': 'https://stackoverflow.com/' + link_tag['href'].lstrip('/'),
    }


def getQuestions(tag, pgno):
    """Fetch one listing page for a tag and append its questions to ``data``.

    Args:
        tag: Stack Overflow tag name inserted into the listing URL.
        pgno: Page number of the "newest" listing to fetch (50 per page).

    Returns:
        None. Parsed questions are appended to the module-level ``data`` list.
        On a network failure or an HTTP error status the error is printed and
        nothing is appended.
    """
    url = f'https://stackoverflow.com/questions/tagged/{tag}?tab=newest&page={pgno}&pagesize=50'
    try:
        # A timeout keeps a stalled connection from hanging the whole run.
        r = requests.get(url, headers=header, timeout=30)
        # Treat 4xx/5xx (e.g. rate limiting) as failures rather than parsing
        # an error page as if it were a question listing.
        r.raise_for_status()
    except requests.RequestException as e:
        print(f"An error occurred: {e}")
        # Without a response there is nothing to parse.
        return

    soup = BeautifulSoup(r.text, 'html.parser')
    for item in soup.find_all('div', {'class': 's-post-summary'}):
        question = _parse_question(item)
        if question is not None:
            data.append(question)
    return
+
+
# Slice of the "python" tag listing scraped in this run: pages 102..200
# (start inclusive, stop exclusive). An earlier note recorded 42473 total
# pages for the tag; this run covers only part of them.
FIRST_PAGE = 102
STOP_PAGE = 201

# NOTE(review): pages are requested back-to-back with no delay between them;
# consider throttling if the site starts rejecting requests.
for x in range(FIRST_PAGE, STOP_PAGE):
    getQuestions('python', x)


# One row per scraped question; columns come from the record dict keys.
df = pd.DataFrame(data)
print(df.head())
# Raw string: the backslashes are Windows path separators, not escape
# sequences ('\S' and '\s' are invalid escapes in a normal string literal).
df.to_csv(r'F:\StacksOverflow\stacks3.csv', index=False)
print("Done")