path: root/chadscrapper-seo.py
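"""Asynchronous SEO scraper.

Fetches each URL concurrently with aiohttp, extracts the title, meta
description/keywords, Open Graph and Twitter Card tags, canonical URL,
headings, and internal/external links with BeautifulSoup, and saves the
results to a CSV file with pandas.
"""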
import aiohttp
import asyncio
import pandas as pd
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
from tqdm.asyncio import tqdm

HEADERS = {"User-Agent": "Mozilla/5.0"}

async def fetch_url(session, url):
    """Fetches HTML content asynchronously."""
    try:
        async with session.get(url, headers=HEADERS, timeout=aiohttp.ClientTimeout(total=10)) as response:
            response.raise_for_status()
            return await response.text()
    except Exception as e:
        print(f"Failed to fetch {url}: {e}")
        return None

def parse_html(url, html):
    """Parses HTML and extracts SEO data."""
    soup = BeautifulSoup(html, "html.parser")

    title = soup.title.string.strip() if soup.title and soup.title.string else "N/A"

    def get_meta_content(name):
        # Standard meta tags use the "name" attribute; Open Graph tags use "property".
        tag = soup.find("meta", attrs={"name": name}) or soup.find("meta", attrs={"property": name})
        return tag["content"].strip() if tag and tag.get("content") else "N/A"

    meta_desc = get_meta_content("description")
    meta_keywords = get_meta_content("keywords")

    # Extract Open Graph & Twitter Card data
    og_title = get_meta_content("og:title")
    og_desc = get_meta_content("og:description")
    twitter_title = get_meta_content("twitter:title")
    twitter_desc = get_meta_content("twitter:description")

    # Extract canonical URL (fall back to the page URL if it is missing)
    canonical_tag = soup.find("link", rel="canonical")
    canonical_url = canonical_tag["href"].strip() if canonical_tag and canonical_tag.get("href") else url

    # Extract Headings
    headings = {f"H{i}": [h.get_text(strip=True) for h in soup.find_all(f"h{i}")] for i in range(1, 7)}

    # Extract Links
    internal_links, external_links = set(), set()
    for link in soup.find_all("a", href=True):
        href = link["href"].strip()
        full_url = urljoin(url, href)
        (internal_links if urlparse(full_url).netloc == urlparse(url).netloc else external_links).add(full_url)

    return {
        "URL": url,
        "Canonical URL": canonical_url,
        "Title": title,
        "Meta Description": meta_desc,
        "Meta Keywords": meta_keywords,
        "OG Title": og_title,
        "OG Description": og_desc,
        "Twitter Title": twitter_title,
        "Twitter Description": twitter_desc,
        **headings,
        "Internal Links": list(internal_links),
        "External Links": list(external_links),
    }

async def fetch_seo_data(urls):
    """Fetches SEO data for multiple URLs asynchronously."""
    async with aiohttp.ClientSession() as session:
        tasks = [fetch_url(session, url) for url in urls]
        responses = await tqdm.gather(*tasks, desc="Fetching URLs")

    return [parse_html(url, html) for url, html in zip(urls, responses) if html]

def save_to_csv(data, filename="seo_data.csv"):
    """Saves the extracted data to a CSV file."""
    if not data:
        print("No data to save.")
        return

    df = pd.DataFrame(data)
    df.to_csv(filename, index=False, encoding="utf-8")
    print(f"Data saved to {filename}")

if __name__ == "__main__":
    urls = ["https://example.com", "https://anotherwebsite.com"]  # Add URLs here
    seo_results = asyncio.run(fetch_seo_data(urls))
    save_to_csv(seo_results)