Building a Basic Website Crawler using Python & BeautifulSoup

Ever stumbled upon a situation where you're trying to crawl a small site for an audit, but you can't due to rate limiting, IP blacklisting, or various other reasons?

It is annoying, right?

I recently faced this challenge, and that's when I thought I could build a Python script that runs in Visual Studio Code & saves the crawl to a CSV.

What information are we able to extract?

  1. URL
  2. Status Code
  3. Canonical URL
  4. Title
  5. Description
  6. Crawl Depth
  7. Word Count
  8. Robots Meta Tag

This is useful when you're blocked from crawling & the tech team responds that, given current priorities, it will be addressed in a few days, but you know you can't wait that long.

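Once the crawl below has run, you can slice the output CSV however you like. As one quick example, here is a minimal sketch (assuming the column headers the script writes and its default output file crawl_results.csv) that prints every URL missing a meta description:

import csv

# Assumes the header row written by crawl.py:
# url, status_code, meta_title, meta_description, canonical_url,
# robots_meta, crawl_depth, word_count
with open("crawl_results.csv", newline="", encoding="utf-8") as f:
    for row in csv.DictReader(f):
        if not row["meta_description"]:
            print(row["url"])
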
Folder Structure You Need

|-crawl.py
|-requirements.txt

requirements.txt will contain the following

requests
beautifulsoup4

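If you want the install to be reproducible, you can optionally add version specifiers; the exact pins are up to you, and the ones below are only an illustration:

requests>=2.28
beautifulsoup4>=4.11
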
Here is the VS Code Python Script

import argparse
import csv
import time
import random
from collections import deque
from urllib.parse import urlparse, urljoin

import requests
from bs4 import BeautifulSoup

# A few user-agents to rotate
USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7)",
    "Mozilla/5.0 (X11; Linux x86_64)",
]

def fetch(url):
    # Try each user-agent in turn; move on to the next one only if the request fails
    for ua in USER_AGENTS:
        try:
            r = requests.get(url, headers={"User-Agent": ua}, timeout=10)
            return r
        except requests.RequestException:
            continue
    return None

def parse_page(resp, depth):
    soup = BeautifulSoup(resp.text, "html.parser")
    title = soup.title.string.strip() if soup.title and soup.title.string else ""
    desc = soup.find("meta", {"name":"description"})
    description = desc["content"].strip() if desc and desc.get("content") else ""
    can = soup.find("link", rel="canonical")
    canonical = can["href"].strip() if can and can.get("href") else ""
    robots = soup.find("meta", {"name":"robots"})
    robots_meta = robots["content"].strip() if robots and robots.get("content") else ""
    text = soup.get_text(" ").strip()
    word_count = len(text.split())
    return [
        resp.url, resp.status_code,
        title, description,
        canonical, robots_meta,
        depth, word_count
    ]

def crawl(start_url, output_csv):
    domain = urlparse(start_url).netloc
    seen = set()
    queue = deque([(start_url, 0)])

    with open(output_csv, "w", newline="", encoding="utf-8") as f:
        w = csv.writer(f)
        w.writerow([
            "url","status_code","meta_title","meta_description",
            "canonical_url","robots_meta","crawl_depth","word_count"
        ])

        while queue:
            url, depth = queue.popleft()
            if url in seen: continue
            seen.add(url)

            # Skip any URL whose path contains a section you want to exclude, e.g. "/subfolder"
            if "/subfolder" in urlparse(url).path.lower():
                continue

            resp = fetch(url)
            if not resp:
                print("Failed:", url)
                continue

            w.writerow(parse_page(resp, depth))
            print(f"Crawled ({depth}): {url}")

            soup = BeautifulSoup(resp.text, "html.parser")
            for a in soup.find_all("a", href=True):
                # Resolve relative links against the page they appear on, not the start URL
                nxt = urljoin(resp.url, a["href"])
                if urlparse(nxt).netloc == domain and nxt not in seen:
                    queue.append((nxt, depth+1))

            time.sleep(random.uniform(0.5,1.5))

    print("Done! Results in", output_csv)

if __name__ == "__main__":
    p = argparse.ArgumentParser()
    p.add_argument("--domain", required=True,
                   help="e.g. https://www.decodedigitalmarket.com/")
    p.add_argument("--output", default="crawl_results.csv")
    args = p.parse_args()
    crawl(args.domain, args.output)

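The "/subfolder" check inside crawl() is just a placeholder for whatever part of the site you want to leave out of the audit. If you need to skip several sections, one way to do it (an illustrative sketch, not part of the script above) is a small helper with a tuple of excluded path prefixes:

from urllib.parse import urlparse

# Hypothetical list of path prefixes the crawler should skip (adjust to your own site)
EXCLUDED_PATHS = ("/subfolder", "/tag", "/search")

def is_excluded(url):
    # True if the URL's path starts with any of the excluded prefixes
    path = urlparse(url).path.lower()
    return any(path.startswith(prefix) for prefix in EXCLUDED_PATHS)

Inside crawl(), the single "/subfolder" check would then become if is_excluded(url): continue.
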
Terminal Installations You Need

Create a virtual environment:

python -m venv .venv

Allow the venv activation script to run in this PowerShell session (Windows):

Set-ExecutionPolicy -Scope Process -ExecutionPolicy RemoteSigned

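The ExecutionPolicy change above only exists so that the venv's activation script is allowed to run. The activation step itself isn't in the original list, but assuming Windows PowerShell it would be:

.\.venv\Scripts\Activate.ps1

(On macOS/Linux the equivalent is source .venv/bin/activate.)
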
Install the dependencies:

pip install -r requirements.txt

Run the crawler:

python crawl.py --domain https://www.yourdomain.com --output results1s.csv

This will get the crawl running, which you can monitor in the VS Code terminal.

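Going by the print statements in the script, the terminal output should look roughly like this (the URLs and depths are placeholders):

Crawled (0): https://www.yourdomain.com/
Crawled (1): https://www.yourdomain.com/about/
Crawled (1): https://www.yourdomain.com/contact/
...
Done! Results in results1s.csv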
