path: root/chadcrawler.py
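"""A small recursive web crawler.

Fetches a page, records its URL and title to a CSV file, then follows the
absolute links found on the page up to a fixed depth.
"""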
import requests
from bs4 import BeautifulSoup
import csv
import time

visited_urls = set()                 # URLs already crawled in this run
csv_filename = "crawled_data.csv"    # Output file for (url, title) rows

def crawl(url, depth=2):
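    """Fetch `url`, record it to the CSV, and recurse into its absolute links.

    Stops when `depth` reaches 0, when the URL has already been visited, or
    when the request fails.
    """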
    if depth == 0 or url in visited_urls:
        return
    try:
        response = requests.get(url, timeout=5)
        response.raise_for_status()
    except requests.RequestException:
        return

    visited_urls.add(url)
    soup = BeautifulSoup(response.text, "html.parser")

    # Extract the page title and all absolute (http/https) links
    title = (soup.title.get_text(strip=True) if soup.title else "") or "No Title"
    links = [a['href'] for a in soup.find_all('a', href=True) if a['href'].startswith("http")]

    # Save to CSV
    save_to_csv([url, title])

    # Recursively crawl the discovered links, pausing between requests
    for link in links:
        time.sleep(1)  # Delay to avoid overloading the servers
        crawl(link, depth - 1)

def save_to_csv(data):
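    """Append one row (here: [url, title]) to the output CSV file.

    The file is opened in append mode, so repeated runs add to any existing data.
    """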
    with open(csv_filename, "a", newline="", encoding="utf-8") as file:
        writer = csv.writer(file)
        writer.writerow(data)

if __name__ == "__main__":
    # Start crawling from the seed URL
    start_url = "https://example.com"
    crawl(start_url)
    print("Crawling finished. Data saved in", csv_filename)