Commit 5c0c3f65 by Muhammad Sabih Ur

Update stackoverflow.gitlab-ci.yml

parent fc82150e
...@@ -8,48 +8,46 @@ header = { ...@@ -8,48 +8,46 @@ header = {
'Connection': 'keep-alive', 'Connection': 'keep-alive',
} }
# Module-level accumulator: getQuestions appends one dict per scraped
# question to this list, and the script later builds a DataFrame from it.
data = []
data =[]
def getQuestions(tag, pgno):
    """Scrape one listing page of Stack Overflow questions into ``data``.

    Fetches page ``pgno`` of the "frequent" tab for ``tag`` (50 questions per
    page), looks up every ``s-post-summary`` block in the returned HTML and
    appends one dict per question to the module-level ``data`` list.

    Args:
        tag: Stack Overflow tag name, inserted verbatim into the URL.
        pgno: Page number of the listing to fetch.

    Returns:
        None. Results are appended to ``data``. If the request or parsing
        fails, the error is printed and nothing is appended for this page.
    """
    url = f'https://stackoverflow.com/questions/tagged/{tag}?tab=frequent&page={pgno}&pagesize=50'
    print(url)
    try:
        # A timeout keeps one stalled connection from hanging the whole crawl.
        r = requests.get(url, headers=header, timeout=30)
        # Surface HTTP errors (e.g. rate limiting) instead of silently
        # parsing an error page that contains no questions.
        r.raise_for_status()
        soup = BeautifulSoup(r.text, 'html.parser')
    except Exception as e:
        print(f"An error occurred: {e}")
        # Without a parsed page there is nothing to extract. Falling through
        # here would hit an unbound ``soup`` and raise NameError.
        return

    for item in soup.find_all('div', {'class': 's-post-summary'}):
        title_link = item.find('a', {'class': 's-link'})
        if title_link is None:
            # No title anchor means no title and no link; skip this summary
            # rather than crash on ``None.text``.
            continue
        excerpt = item.find('div', {'class': 's-post-summary--content-excerpt'})
        timestamp = item.find('span', {'class': 'relativetime'})
        stats = item.find_all('span', {'class': 's-post-summary--stats-item-number'})

        data.append({
            'title': title_link.text.strip(),
            'description': excerpt.text.strip() if excerpt is not None else '',
            'date': timestamp.get('title', '') if timestamp is not None else '',
            # Strip a leading slash so a root-relative href does not produce
            # "https://stackoverflow.com//questions/...".
            'link': 'https://stackoverflow.com/' + title_link.get('href', '').lstrip('/'),
            # The stats spans are read positionally: index 0 is stored as
            # votes and index 2 as views. Missing entries become ''.
            'votes': stats[0].text if len(stats) > 0 else '',
            'views': stats[2].text if len(stats) > 2 else '',
        })
    return
# The "python" tag has 42473 listing pages in total; this run scrapes
# pages 1 through 3859.
for x in range(1, 3860):
    getQuestions('python', x)

df = pd.DataFrame(data)
print(df.head())
# Raw string: the Windows path contains "\S" and "\s", which are invalid
# escape sequences in a normal string literal. The runtime value is unchanged.
df.to_csv(r'F:\StacksOverflow\ss_2.csv')
print("Done")
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment