Commit dcfbe9ec by Muhammad Sabih Ur

Excel File

parent a952a30d
Pipeline #9 failed
import requests
from bs4 import BeautifulSoup
import pandas as pd
# Load the seed URLs from the workbook. NOTE: raw string is required here —
# "E:\M..." in a plain literal relies on "\M"/"\m" being invalid escapes,
# which is a DeprecationWarning today and a SyntaxWarning/Error in newer Pythons.
df = pd.read_excel(r"E:\MySoloWebsite\moon.xlsx", header=0)

# Every URL from both columns, in order: sources first, then targets.
links = df['source_url'].tolist() + df['target_url'].tolist()
def scrape_page(url):
    """Fetch *url* and extract page features into a flat dict.

    Returns a dict with keys: Title, URL, Content, Images, Links, Metadata,
    "Date and time", Comments, "Social media metrics", "Structured data".
    On a network failure every value except URL is None.
    """
    try:
        # Timeout prevents the whole crawl from hanging on one dead host.
        response = requests.get(url, timeout=30)
        soup = BeautifulSoup(response.content, "html.parser")
        title = soup.title.string if soup.title else None
        url = response.url  # follow redirects: record the final URL
        content = str(soup)

        # .get() instead of ["src"]/["href"]: attribute-less tags otherwise
        # raise an uncaught KeyError (RequestException does not cover it).
        images = [
            {"src": img.get("src", ""), "alt": img.get("alt", "")}
            for img in soup.find_all("img")
        ]
        links = [
            {"href": a.get("href", ""), "text": a.text}
            for a in soup.find_all("a")
        ]

        metadata = {}
        for meta in soup.find_all("meta"):
            if meta.has_attr("name"):
                metadata[meta["name"]] = meta.get("content", "")
            elif meta.has_attr("property"):
                metadata[meta["property"]] = meta.get("content", "")

        # Hoisted: original called find() twice for the same tag.
        modified_meta = soup.find("meta", {"property": "article:modified_time"})
        date = modified_meta.get("content", "") if modified_meta else None

        # NOTE(review): <comment> is not a standard HTML element — this only
        # matches sites that literally emit such tags; confirm intent.
        comments = [c.text for c in soup.find_all("comment")]

        social_media_metrics = {}
        for span in soup.find_all("span"):
            text = span.text.lower()
            if "like" in text:
                social_media_metrics["likes"] = span.text
            elif "share" in text:
                social_media_metrics["shares"] = span.text
            elif "comment" in text:
                social_media_metrics["comments"] = span.text

        # Guard against empty <script> tags: .string is None there, and
        # None.strip() raised an uncaught AttributeError in the original.
        structured_data = [
            script.string.strip()
            for script in soup.find_all("script", {"type": "application/ld+json"})
            if script.string
        ]

        data = {
            "Title": title,
            "URL": url,
            "Content": content,
            "Images": images,
            "Links": links,
            "Metadata": metadata,
            "Date and time": date,
            "Comments": comments,
            "Social media metrics": social_media_metrics,
            "Structured data": structured_data,
        }
    except requests.exceptions.RequestException:
        # Fixed: original used "Text" here vs "Content" on the success path,
        # which produced two half-empty columns in the final DataFrame.
        data = {
            "Title": None,
            "URL": url,
            "Content": None,
            "Images": None,
            "Links": None,
            "Metadata": None,
            "Date and time": None,
            "Comments": None,
            "Social media metrics": None,
            "Structured data": None,
        }
    return data
# Crawl every collected URL; a failure on one page is reported and skipped
# so the rest of the crawl continues.
scraped_data = []
for index, page_url in enumerate(links):
    try:
        page_record = scrape_page(page_url)
    except Exception as exc:
        print(f"Error scraping {page_url}: {exc}")
    else:
        print(index, page_url)
        scraped_data.append(page_record)
# Persist the crawl results. Raw string fixes the invalid "\M"/"\m" escapes
# of the original literal (DeprecationWarning, SyntaxWarning in 3.12+).
df = pd.DataFrame(scraped_data)
try:
    df.to_excel(r"E:\MySoloWebsite\main.xlsx", index=False)
    print("Scraped data saved to Excel file")
except Exception as e:
    print(f"Error saving scraped data to Excel file: {e}")
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment