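"""Simple recursive web crawler.

Starting from a seed URL, it records each page's URL and title to a CSV
file and follows absolute links up to a fixed depth, pausing between
requests.
"""
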
import requests
from bs4 import BeautifulSoup
import csv
import time
visited_urls = set()
csv_filename = "crawled_data.csv"


def crawl(url, depth=2):
    # Stop when the depth budget is exhausted or the page was already visited
    if depth == 0 or url in visited_urls:
        return
    try:
        response = requests.get(url, timeout=5)
        response.raise_for_status()
    except requests.RequestException:
        return
    visited_urls.add(url)
    soup = BeautifulSoup(response.text, "html.parser")
    # Extract the page title and all absolute links
    title = soup.title.string if soup.title else "No Title"
    links = [a['href'] for a in soup.find_all('a', href=True) if a['href'].startswith("http")]
    # Save this page's URL and title to CSV
    save_to_csv([url, title])
    # Crawl the extracted links
    for link in links:
        crawl(link, depth - 1)
        time.sleep(1)  # Delay to avoid overloading the server


def save_to_csv(data):
    with open(csv_filename, "a", newline="", encoding="utf-8") as file:
        writer = csv.writer(file)
        writer.writerow(data)


# Start crawling
if __name__ == "__main__":
    start_url = "https://example.com"
    crawl(start_url)
    print("Crawling finished. Data saved in", csv_filename)