Diffstat (limited to 'chadscrapper-seo.py')
-rw-r--r-- | chadscrapper-seo.py | 89
1 file changed, 89 insertions, 0 deletions
diff --git a/chadscrapper-seo.py b/chadscrapper-seo.py
new file mode 100644
index 0000000..7786eab
--- /dev/null
+++ b/chadscrapper-seo.py
@@ -0,0 +1,89 @@
+import aiohttp
+import asyncio
+import pandas as pd
+from bs4 import BeautifulSoup
+from urllib.parse import urljoin, urlparse
+
+HEADERS = {"User-Agent": "Mozilla/5.0"}
+
+async def fetch_url(session, url):
+    """Fetches HTML content asynchronously."""
+    try:
+        async with session.get(url, headers=HEADERS, timeout=aiohttp.ClientTimeout(total=10)) as response:
+            response.raise_for_status()
+            return await response.text()
+    except Exception as e:
+        print(f"Failed to fetch {url}: {e}")
+        return None
+
+def parse_html(url, html):
+    """Parses HTML and extracts SEO data."""
+    soup = BeautifulSoup(html, "html.parser")
+
+    title = soup.title.string.strip() if soup.title and soup.title.string else "N/A"
+
+    def get_meta_content(name):
+        # Open Graph tags use property= rather than name=, so check both attributes.
+        tag = soup.find("meta", attrs={"name": name}) or soup.find("meta", attrs={"property": name})
+        return tag["content"].strip() if tag and tag.has_attr("content") else "N/A"
+
+    meta_desc = get_meta_content("description")
+    meta_keywords = get_meta_content("keywords")
+
+    # Extract Open Graph & Twitter Card data
+    og_title = get_meta_content("og:title")
+    og_desc = get_meta_content("og:description")
+    twitter_title = get_meta_content("twitter:title")
+    twitter_desc = get_meta_content("twitter:description")
+
+    # Extract canonical URL, falling back to the fetched URL
+    canonical_tag = soup.find("link", rel="canonical")
+    canonical_url = canonical_tag["href"].strip() if canonical_tag else url
+
+    # Extract headings H1-H6
+    headings = {f"H{i}": [h.get_text(strip=True) for h in soup.find_all(f"h{i}")] for i in range(1, 7)}
+
+    # Classify links as internal or external by comparing netlocs
+    internal_links, external_links = set(), set()
+    for link in soup.find_all("a", href=True):
+        href = link["href"].strip()
+        full_url = urljoin(url, href)
+        (internal_links if urlparse(full_url).netloc == urlparse(url).netloc else external_links).add(full_url)
+
+    return {
+        "URL": url,
+        "Canonical URL": canonical_url,
+        "Title": title,
+        "Meta Description": meta_desc,
+        "Meta Keywords": meta_keywords,
+        "OG Title": og_title,
+        "OG Description": og_desc,
+        "Twitter Title": twitter_title,
+        "Twitter Description": twitter_desc,
+        **headings,
+        "Internal Links": list(internal_links),
+        "External Links": list(external_links),
+    }
+
+async def fetch_seo_data(urls):
+    """Fetches SEO data for multiple URLs asynchronously."""
+    async with aiohttp.ClientSession() as session:
+        tasks = [fetch_url(session, url) for url in urls]
+        responses = await asyncio.gather(*tasks)
+
+    return [parse_html(url, html) for url, html in zip(urls, responses) if html]
+
+def save_to_csv(data, filename="seo_data.csv"):
+    """Saves the extracted data to a CSV file."""
+    if not data:
+        print("No data to save.")
+        return
+
+    df = pd.DataFrame(data)
+    df.to_csv(filename, index=False, encoding="utf-8")
+    print(f"Data saved to {filename}")
+
+if __name__ == "__main__":
+    urls = ["https://example.com", "https://anotherwebsite.com"]  # Add URLs here
+    seo_results = asyncio.run(fetch_seo_data(urls))
+    save_to_csv(seo_results)
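Note that fetch_seo_data fires every request at once, which can overwhelm a single host when the URL list grows. A minimal sketch of one common refinement, reusing the script's own fetch_url and parse_html; the name fetch_seo_data_limited and the cap of 5 are illustrative, not part of the commit:

async def fetch_seo_data_limited(urls, max_concurrent=5):
    """Like fetch_seo_data, but caps the number of in-flight requests."""
    semaphore = asyncio.Semaphore(max_concurrent)

    async def bounded_fetch(session, url):
        # At most max_concurrent coroutines may be past this point at once.
        async with semaphore:
            return await fetch_url(session, url)

    async with aiohttp.ClientSession() as session:
        responses = await asyncio.gather(*(bounded_fetch(session, u) for u in urls))

    return [parse_html(url, html) for url, html in zip(urls, responses) if html]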
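For a quick offline check of the parser, parse_html can also be run on a static snippet with no network involved; the sample markup below is illustrative:

sample = (
    '<html><head><title>Example</title>'
    '<meta name="description" content="A demo page">'
    '<meta property="og:title" content="Example OG">'
    '<link rel="canonical" href="https://example.com/">'
    '</head><body><h1>Hello</h1><a href="/about">About</a></body></html>'
)
row = parse_html("https://example.com", sample)
print(row["Title"])           # Example
print(row["OG Title"])        # Example OG (found via the property= lookup)
print(row["Internal Links"])  # ['https://example.com/about']

The script's third-party dependencies are aiohttp, pandas, and beautifulsoup4 (pip install aiohttp pandas beautifulsoup4).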