Delete .gitlab-ci.yml

472181d5 · Muhammad Sabih Ur · dcfbe9ec · dcfbe9ec
Commit 472181d5 authored Apr 29, 2023 by Muhammad Sabih Ur
Hide whitespace changes
Inline Side-by-side

Showing with 0 additions and 104 deletions

.gitlab-ci.yml .gitlab-ci.yml +0 -104

No files found.
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
-import requests
-from bs4 import BeautifulSoup
-
-import pandas as pd
-# Load the spreadsheet of URL pairs; its first row holds the column names.
-# The path is a raw string because "\M" and "\m" are not valid escape
-# sequences: the plain literal only works by accident and triggers an
-# invalid-escape warning on current Python versions.
-df = pd.read_excel(r"E:\MySoloWebsite\moon.xlsx", header=0)
-
-
-# Every URL to scrape: all source URLs first, followed by all target URLs.
-links = df['source_url'].tolist() + df['target_url'].tolist()
-
-
-def scrape_page(url, timeout=30):
-    """Download *url* and collect its page features into one flat dict.
-
-    Args:
-        url: Address of the page to fetch.
-        timeout: Seconds to wait for the server before giving up; passed
-            straight to ``requests.get``. Without it a single stalled
-            server would block the whole run indefinitely.
-
-    Returns:
-        A dict with the keys "Title", "URL", "Content", "Images", "Links",
-        "Metadata", "Date and time", "Comments", "Social media metrics"
-        and "Structured data". If the request fails, "URL" holds the
-        requested address and every other value is None.
-    """
-    try:
-        response = requests.get(url, timeout=timeout)
-        html = response.content
-    except requests.exceptions.RequestException:
-        # Same keys as the success record so every row of the resulting
-        # table lines up under the same columns (the failure record used to
-        # say "Text" where the success record says "Content").
-        return {
-            "Title": None,
-            "URL": url,
-            "Content": None,
-            "Images": None,
-            "Links": None,
-            "Metadata": None,
-            "Date and time": None,
-            "Comments": None,
-            "Social media metrics": None,
-            "Structured data": None,
-        }
-
-    soup = BeautifulSoup(html, "html.parser")
-
-    title = soup.title.string if soup.title else None
-
-    # src=True / href=True skip tags that lack the attribute; indexing such
-    # a tag raises KeyError, which previously discarded the whole page.
-    images = [
-        {"src": img["src"], "alt": img.get("alt", "")}
-        for img in soup.find_all("img", src=True)
-    ]
-    page_links = [
-        {"href": anchor["href"], "text": anchor.text}
-        for anchor in soup.find_all("a", href=True)
-    ]
-
-    # <meta name=...> takes precedence over <meta property=...>; a later
-    # tag with the same key overwrites an earlier one.
-    metadata = {}
-    for meta in soup.find_all("meta"):
-        if meta.has_attr("name"):
-            metadata[meta["name"]] = meta.get("content", "")
-        elif meta.has_attr("property"):
-            metadata[meta["property"]] = meta.get("content", "")
-
-    modified = soup.find("meta", {"property": "article:modified_time"})
-    date = modified.get("content", "") if modified else None
-
-    # NOTE(review): this matches <comment> elements, not HTML <!-- -->
-    # comments -- confirm which of the two was intended.
-    comments = [node.text for node in soup.find_all("comment")]
-
-    # Keyword match on the text of every <span>; the last matching span
-    # wins for each metric, and "like" is checked before "share" before
-    # "comment".
-    social_media_metrics = {}
-    for span in soup.find_all("span"):
-        text = span.text
-        lowered = text.lower()
-        if "like" in lowered:
-            social_media_metrics["likes"] = text
-        elif "share" in lowered:
-            social_media_metrics["shares"] = text
-        elif "comment" in lowered:
-            social_media_metrics["comments"] = text
-
-    # .string is None for an empty <script> (or one with several children),
-    # so those are skipped instead of failing on None.strip().
-    structured_data = [
-        script.string.strip()
-        for script in soup.find_all("script", {"type": "application/ld+json"})
-        if script.string is not None
-    ]
-
-    return {
-        "Title": title,
-        # Final address after any redirects, not necessarily the one asked for.
-        "URL": response.url,
-        "Content": str(soup),
-        "Images": images,
-        "Links": page_links,
-        "Metadata": metadata,
-        "Date and time": date,
-        "Comments": comments,
-        "Social media metrics": social_media_metrics,
-        "Structured data": structured_data,
-    }
-
-
-# One result dict per scraped URL, in the order the URLs were visited.
-scraped_data = []
-
-
-for i, link in enumerate(links):
-    try:
-        data = scrape_page(link)
-        print(i, link)
-        scraped_data.append(data)
-    except Exception as e:
-        # Best-effort run: report the failing URL and carry on with the rest.
-        print(f"Error scraping {link}: {e}")
-
-
-df = pd.DataFrame(scraped_data)
-
-try:
-    # Raw string: "\M" and "\m" are not valid escape sequences, so the plain
-    # literal triggers an invalid-escape warning on current Python versions.
-    df.to_excel(r"E:\MySoloWebsite\main.xlsx", index=False)
-    print("Scraped data saved to Excel file")
-except Exception as e:
-    print(f"Error saving scraped data to Excel file: {e}")