Excel File

dcfbe9ec · Muhammad Sabih Ur · a952a30d · dcfbe9ec
Commit dcfbe9ec authored Apr 29, 2023 by Muhammad Sabih Ur
Hide whitespace changes
Inline Side-by-side

Showing with 104 additions and 0 deletions

main_excel.py main_excel.py +104 -0

No files found.
--- a/main_excel.py
+++ b/main_excel.py
+import requests
+from bs4 import BeautifulSoup
+
+import pandas as pd
# Input workbook listing the pages to scrape. A raw string keeps the Windows
# backslashes literal; in a normal string "\M" and "\m" are invalid escape
# sequences that newer Python versions warn about.
df = pd.read_excel(r"E:\MySoloWebsite\moon.xlsx", header=0)

# Work queue: every value of the 'source_url' column first, followed by every
# value of the 'target_url' column, in sheet order.
links = []
links.extend(df['source_url'].tolist())
links.extend(df['target_url'].tolist())
+
+
def scrape_page(url, timeout=30):
    """Download *url* and return one flat record describing the page.

    The response body is parsed with BeautifulSoup's ``html.parser``. The
    HTTP status code is not checked, so error pages are parsed like any
    other response.

    Args:
        url: Address to fetch.
        timeout: Seconds handed to ``requests.get`` so a stalled server
            cannot block the caller forever. A timeout surfaces as a
            ``RequestException`` and therefore yields the failure record.

    Returns:
        A dict with the keys ``Title``, ``URL``, ``Content``, ``Images``,
        ``Links``, ``Metadata``, ``Date and time``, ``Comments``,
        ``Social media metrics`` and ``Structured data``. If the request
        itself fails, ``URL`` holds the requested address and every other
        value is ``None``.
    """
    # Only the network call can raise RequestException, so only it is guarded.
    try:
        response = requests.get(url, timeout=timeout)
    except requests.exceptions.RequestException:
        # Same keys as the success record so both kinds of row line up in
        # one table (the page markup lives under "Content" in both).
        return {
            "Title": None,
            "URL": url,
            "Content": None,
            "Images": None,
            "Links": None,
            "Metadata": None,
            "Date and time": None,
            "Comments": None,
            "Social media metrics": None,
            "Structured data": None,
        }

    soup = BeautifulSoup(response.content, "html.parser")

    title = soup.title.string if soup.title else None

    # <img> tags without a src and <a> tags without an href are skipped
    # instead of raising KeyError and losing the whole page.
    images = [
        {"src": img["src"], "alt": img.get("alt", "")}
        for img in soup.find_all("img", src=True)
    ]
    page_links = [
        {"href": a["href"], "text": a.text}
        for a in soup.find_all("a", href=True)
    ]

    # Key each <meta> by its "name" attribute, falling back to "property".
    metadata = {}
    for meta in soup.find_all("meta"):
        for attr in ("name", "property"):
            if meta.has_attr(attr):
                metadata[meta[attr]] = meta.get("content", "")
                break

    modified = soup.find("meta", {"property": "article:modified_time"})
    date = modified.get("content", "") if modified else None

    # NOTE(review): this matches elements literally named <comment>, not
    # HTML "<!-- -->" comments -- confirm which one was intended.
    comments = [comment.text for comment in soup.find_all("comment")]

    # Keyword match on <span> text; a later matching span overwrites an
    # earlier one stored under the same key.
    social_media_metrics = {}
    for span in soup.find_all("span"):
        text = span.text
        lowered = text.lower()
        if "like" in lowered:
            social_media_metrics["likes"] = text
        elif "share" in lowered:
            social_media_metrics["shares"] = text
        elif "comment" in lowered:
            social_media_metrics["comments"] = text

    # .string is None for an empty <script>, so those are skipped rather
    # than crashing on .strip().
    structured_data = [
        script.string.strip()
        for script in soup.find_all("script", {"type": "application/ld+json"})
        if script.string
    ]

    return {
        "Title": title,
        # Final address as reported by the response object.
        "URL": response.url,
        "Content": str(soup),
        "Images": images,
        "Links": page_links,
        "Metadata": metadata,
        "Date and time": date,
        "Comments": comments,
        "Social media metrics": social_media_metrics,
        "Structured data": structured_data,
    }
+
+
# One record per successfully processed URL, in queue order.
scraped_data = []


for i, link in enumerate(links):
    # Best effort: a failure on one page is reported and the run continues.
    try:
        data = scrape_page(link)
        print(i, link)
        scraped_data.append(data)
    except Exception as e:
        print(f"Error scraping {link}: {e}")


df = pd.DataFrame(scraped_data)

try:
    # Raw string keeps the Windows backslashes literal; in a normal string
    # "\M" and "\m" are invalid escape sequences that newer Python versions
    # warn about.
    df.to_excel(r"E:\MySoloWebsite\main.xlsx", index=False)
    print("Scraped data saved to Excel file")
except Exception as e:
    print(f"Error saving scraped data to Excel file: {e}")