Ever stumbled upon this situation: you're trying to crawl a small site for an audit, but you can't, because of rate limiting, IP blacklisting, or various other reasons?
It's annoying, right?
I recently faced this exact challenge, and that's when I thought: maybe I could build a Python script that runs from Visual Studio Code and saves the crawl to a CSV.
What information can we extract? (A sample row is shown right after this list.)
- URL
- Canonical URL
- Title
- Description
- Crawl Depth
- Word Count
- Robots Meta Tag
- HTTP Status Code
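For reference, each crawled page becomes one row in the output CSV. The row below is purely illustrative, with made-up values:
url,status_code,meta_title,meta_description,canonical_url,robots_meta,crawl_depth,word_count
https://www.example.com/blog/post,200,Example Post Title,A short meta description,https://www.example.com/blog/post,"index, follow",2,843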
This is useful when your crawl is blocked and the tech team responds that, given current priorities, it will be addressed in a few days, but you know you can't wait that long.
Folder Structure You Need
|-crawl.py
|-requirements.txt
requirements.txt will contain the following:
requests
beautifulsoup4
Here is the Python script (crawl.py) to run from VS Code:
import argparse
import csv
import time
import random
from collections import deque
from urllib.parse import urlparse, urljoin

import requests
from bs4 import BeautifulSoup

# A few user agents to rotate through
USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7)",
    "Mozilla/5.0 (X11; Linux x86_64)",
]
def fetch(url):
    """Try each user agent in turn; return the first response we get."""
    for ua in USER_AGENTS:
        try:
            r = requests.get(url, headers={"User-Agent": ua}, timeout=10)
            return r
        except requests.RequestException:
            # Request failed with this user agent; try the next one
            continue
    return None
def parse_page(resp, depth):
    """Extract the SEO fields we care about from a single response."""
    soup = BeautifulSoup(resp.text, "html.parser")
    title = soup.title.string.strip() if soup.title and soup.title.string else ""
    desc = soup.find("meta", {"name": "description"})
    description = desc["content"].strip() if desc and desc.get("content") else ""
    can = soup.find("link", rel="canonical")
    canonical = can["href"].strip() if can and can.get("href") else ""
    robots = soup.find("meta", {"name": "robots"})
    robots_meta = robots["content"].strip() if robots and robots.get("content") else ""
    text = soup.get_text(" ").strip()
    word_count = len(text.split())
    return [
        resp.url, resp.status_code,
        title, description,
        canonical, robots_meta,
        depth, word_count,
    ]
def crawl(start_url, output_csv):
    domain = urlparse(start_url).netloc
    seen = set()
    queue = deque([(start_url, 0)])
    with open(output_csv, "w", newline="", encoding="utf-8") as f:
        w = csv.writer(f)
        w.writerow([
            "url", "status_code", "meta_title", "meta_description",
            "canonical_url", "robots_meta", "crawl_depth", "word_count"
        ])
        while queue:
            url, depth = queue.popleft()
            if url in seen:
                continue
            seen.add(url)
            # Skip any URL with this path if you want to ignore a subfolder, e.g. "/subfolder"
            if "/subfolder" in urlparse(url).path.lower():
                continue
            resp = fetch(url)
            if not resp:
                print("Failed:", url)
                continue
            w.writerow(parse_page(resp, depth))
            print(f"Crawled ({depth}): {url}")
            # Queue up same-domain links found on this page
            soup = BeautifulSoup(resp.text, "html.parser")
            for a in soup.find_all("a", href=True):
                nxt = urljoin(url, a["href"]).split("#")[0]  # resolve relative links, drop fragments
                if urlparse(nxt).netloc == domain and nxt not in seen:
                    queue.append((nxt, depth + 1))
            # Small random delay between requests to stay polite and avoid rate limiting
            time.sleep(random.uniform(0.5, 1.5))
    print("Done! Results in", output_csv)
if __name__ == "__main__":
    p = argparse.ArgumentParser()
    p.add_argument("--domain", required=True,
                   help="e.g. https://www.decodedigitalmarket.com/")
    p.add_argument("--output", default="crawl_results.csv")
    args = p.parse_args()
    crawl(args.domain, args.output)
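One note on the script: the "/subfolder" check inside the while loop is where you exclude any section of the site you don't want crawled. If you need to skip more than one section, that single check can be swapped for a small list; here is a sketch with made-up folder names:

# Hypothetical folders to exclude from the crawl
EXCLUDE_PATHS = ["/tag/", "/author/"]

# Drop this in place of the single "/subfolder" check inside the while loop
if any(p in urlparse(url).path.lower() for p in EXCLUDE_PATHS):
    continue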
Terminal Installations You Need
python -m venv .venv
Set-ExecutionPolicy -Scope Process -ExecutionPolicy RemoteSigned
.\.venv\Scripts\Activate.ps1
pip install -r requirements.txt
python crawl.py --domain https://www.yourdomain.com --output results1s.csv
This will get the crawl running, and you can monitor it in the terminal: the script prints a Crawled (depth): URL line for each page it fetches and Done! Results in results1s.csv when it finishes.
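Once it finishes, the CSV is ready for the audit itself. As a quick example, here is a minimal sketch (standard library only, assuming the output file is results1s.csv as in the command above) that prints every crawled URL missing a meta description:

import csv

# List pages that came back without a meta description
with open("results1s.csv", newline="", encoding="utf-8") as f:
    for row in csv.DictReader(f):
        if not row["meta_description"]:
            print(row["url"])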


Kunjal Chawhan, founder of Decode Digital Market, a Digital Marketer by profession and a Digital Marketing Niche Blogger by passion, here to share my knowledge.