Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
S
SoloWebsite
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Muhammad Sabih Ur
SoloWebsite
Commits
472181d5
Commit
472181d5
authored
Apr 29, 2023
by
Muhammad Sabih Ur
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
Delete .gitlab-ci.yml
parent
dcfbe9ec
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
0 additions
and
104 deletions
+0
-104
.gitlab-ci.yml
.gitlab-ci.yml
+0
-104
No files found.
.gitlab-ci.yml
deleted
100644 → 0
View file @
dcfbe9ec
import requests
from bs4 import BeautifulSoup
import pandas as pd
# Load the spreadsheet and gather every URL from its "source_url" and
# "target_url" columns into one flat list (sources first, then targets).
# The path is a raw string so the backslashes are never read as escape
# sequences ("\M" and "\m" are invalid escapes in a normal string literal).
df = pd.read_excel(r"E:\MySoloWebsite\moon.xlsx", header=0)
links = [*df['source_url'].tolist(), *df['target_url'].tolist()]
def scrape_page(url, timeout=30):
    """Download *url* and extract its title, markup, images, links and metadata.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``
            argument, so an unresponsive server cannot block forever.

    Returns:
        A dict with the keys "Title", "URL", "Content", "Images", "Links",
        "Metadata", "Date and time", "Comments", "Social media metrics" and
        "Structured data". If the request raises a
        ``requests.exceptions.RequestException``, the same keys are returned
        with "URL" set to the requested *url* and every other value ``None``.
    """
    try:
        response = requests.get(url, timeout=timeout)
        html = response.content
    except requests.exceptions.RequestException:
        # Same keys as the success result so every row has identical columns.
        failed = dict.fromkeys(
            (
                "Title",
                "URL",
                "Content",
                "Images",
                "Links",
                "Metadata",
                "Date and time",
                "Comments",
                "Social media metrics",
                "Structured data",
            )
        )
        failed["URL"] = url
        return failed

    soup = BeautifulSoup(html, "html.parser")
    title = soup.title.string if soup.title else None

    # .get() instead of indexing: <img> without src and <a> without href are
    # common and must not abort the whole page with a KeyError.
    images = [
        {"src": img.get("src", ""), "alt": img.get("alt", "")}
        for img in soup.find_all("img")
    ]
    page_links = [
        {"href": a.get("href", ""), "text": a.text}
        for a in soup.find_all("a")
    ]

    # Index <meta> tags by "name" when present, otherwise by "property".
    metadata = {}
    for meta in soup.find_all("meta"):
        if meta.has_attr("name"):
            metadata[meta["name"]] = meta.get("content", "")
        elif meta.has_attr("property"):
            metadata[meta["property"]] = meta.get("content", "")

    # Look the tag up once and reuse it.
    modified = soup.find("meta", {"property": "article:modified_time"})
    date = modified.get("content", "") if modified else None

    # NOTE(review): this matches <comment> elements, not HTML <!-- --> comment
    # nodes -- confirm which one is intended.
    comments = [comment.text for comment in soup.find_all("comment")]

    # Keep the text of the last <span> mentioning each keyword; "like" takes
    # precedence over "share", which takes precedence over "comment".
    social_media_metrics = {}
    for span in soup.find_all("span"):
        lowered = span.text.lower()
        if "like" in lowered:
            social_media_metrics["likes"] = span.text
        elif "share" in lowered:
            social_media_metrics["shares"] = span.text
        elif "comment" in lowered:
            social_media_metrics["comments"] = span.text

    # script.string is None for empty or multi-child <script> tags; skip those
    # instead of failing on None.strip().
    structured_data = [
        script.string.strip()
        for script in soup.find_all("script", {"type": "application/ld+json"})
        if script.string is not None
    ]

    return {
        "Title": title,
        "URL": response.url,  # final URL after any redirects
        "Content": str(soup),
        "Images": images,
        "Links": page_links,
        "Metadata": metadata,
        "Date and time": date,
        "Comments": comments,
        "Social media metrics": social_media_metrics,
        "Structured data": structured_data,
    }
# Scrape every collected link. A failure on one page is reported and skipped
# so the remaining links are still processed.
scraped_data = []
for i, link in enumerate(links):
    try:
        data = scrape_page(link)
    except Exception as e:
        print(f"Error scraping {link}: {e}")
    else:
        print(i, link)
        scraped_data.append(data)

# One row per successfully scraped page.
df = pd.DataFrame(scraped_data)
try:
    # Raw string: "\M" and "\m" are invalid escape sequences in a normal
    # string literal.
    df.to_excel(r"E:\MySoloWebsite\main.xlsx", index=False)
except Exception as e:
    print(f"Error saving scraped data to Excel file: {e}")
else:
    print("Scraped data saved to Excel file")
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment