I built a Python Streamlit app that groups pages into clusters using page-type regex patterns, so that you can visualize internal link flow among content groups.

Let's see what the outcome looks like.

		
																										
		## **How does it work?**

I will share the Python script as well, but first let's break down how it works from the UI side.

You provide a page regex, assign a cluster name to it, and then generate the Sankey visualization.

Let's see that view of Streamlit

		
																										
		I did this exercise with a Screaming Frog crawl of the Gymshark website.

Now here is the full Python Script that you can use

		
			
				
					#!/usr/bin/env python3
"""
Regex-Based Internal Linking Sankey Visualizer (Streamlit App)
Upload an internal linking CSV, define URL-contains rules to cluster URLs,
and visualize link flow between clusters as an interactive Sankey diagram.
Download the Sankey as a self-contained HTML file.
Usage:
  streamlit run "sitemap_based-inlinks-sankey - Copy.py"
"""
import re
import csv
import json
from collections import defaultdict
from io import StringIO
import streamlit as st
# ─────────────────────────────────────────────────────────────────────
# SESSION STATE DEFAULTS
# ─────────────────────────────────────────────────────────────────────
# Label assigned to URLs that match none of the user-defined rules.
UNCLUSTERED_LABEL = "Unclustered"

# Seed each session-state key only when it is absent, so values already
# stored in the session are never overwritten.
if "rules" not in st.session_state:
    # Start with a single blank rule row.
    st.session_state["rules"] = [{"pattern": "", "cluster": ""}]
if "sankey_html" not in st.session_state:
    # No diagram has been generated yet.
    st.session_state["sankey_html"] = None
# ─────────────────────────────────────────────────────────────────────
# URL CLASSIFICATION
# ─────────────────────────────────────────────────────────────────────
def compile_rules(rules: list[dict]) -> list[tuple[re.Pattern, str]]:
    """Turn rule dicts into ``(compiled_regex, cluster_name)`` tuples.

    Each rule is read through its ``"pattern"`` and ``"cluster"`` keys, both
    stripped of surrounding whitespace. Rules where either value ends up
    empty are ignored. Patterns are compiled case-insensitively; a pattern
    that fails to compile produces a Streamlit warning and is skipped.
    The order of the input rules is preserved.
    """
    result: list[tuple[re.Pattern, str]] = []
    for rule in rules:
        expr = rule.get("pattern", "").strip()
        label = rule.get("cluster", "").strip()
        if expr and label:
            try:
                regex = re.compile(expr, re.IGNORECASE)
            except re.error:
                st.warning(f"Invalid regex pattern: `{expr}` — skipped.")
            else:
                result.append((regex, label))
    return result
def classify_url(url: str, compiled_rules: list[tuple[re.Pattern, str]]) -> str:
    """Map a URL to the cluster of the first rule whose regex is found in it.

    Rules are tried in order using ``search`` (a match anywhere in the URL
    counts). When no rule matches, ``UNCLUSTERED_LABEL`` is returned.
    """
    matching_labels = (
        label for pattern, label in compiled_rules if pattern.search(url)
    )
    # The generator is lazy, so evaluation stops at the first matching rule.
    for label in matching_labels:
        return label
    return UNCLUSTERED_LABEL
def normalize_url(raw: str) -> str:
    """Return *raw* trimmed, without its ``#fragment`` and trailing slashes.

    Falsy input (``None`` or an empty string) yields ``""``.

    The fragment is removed *before* trailing slashes are stripped. Doing it
    in the opposite order leaves the slash in place for inputs such as
    ``https://a.com/page/#top``, so the same page would normalize to two
    different strings depending on whether a fragment was present.
    """
    if not raw:
        return ""
    # Everything from the first "#" onward is the fragment; drop it.
    without_fragment = raw.strip().partition("#")[0]
    return without_fragment.rstrip("/")
# ─────────────────────────────────────────────────────────────────────
# CSV PROCESSING
# ─────────────────────────────────────────────────────────────────────
def process_csv(
    file_content: str,
    source_col: str,
    target_col: str,
    compiled_rules: list[tuple[re.Pattern, str]],
    exclude_unclustered: bool,
    delimiter: str = ",",
) -> tuple[dict[tuple[str, str], int], dict]:
    """Aggregate cluster-to-cluster link counts from CSV text.

    Every row is read with ``csv.DictReader``. The values under
    ``source_col`` and ``target_col`` are stripped and classified with
    ``classify_url``; rows where either value is missing or blank are
    counted as skipped.

    Returns a ``(flow, stats)`` pair:

    * ``flow`` maps ``(source_cluster, target_cluster)`` to the number of
      rows linking them.
    * ``stats`` holds ``total_rows``, ``skipped_empty``,
      ``skipped_unclustered``, ``counted`` and ``cluster_url_counts``. The
      last one counts how often each cluster appeared as a link endpoint and
      is updated before the unclustered filter, so it includes rows that
      were later dropped by ``exclude_unclustered``.
    """
    link_counts: dict[tuple[str, str], int] = {}
    endpoint_counts: dict[str, int] = {}
    rows_seen = 0
    rows_blank = 0
    rows_dropped = 0
    rows_kept = 0

    for record in csv.DictReader(StringIO(file_content), delimiter=delimiter):
        rows_seen += 1

        # `or ""` covers cells that DictReader fills with None.
        source_url = (record.get(source_col, "") or "").strip()
        target_url = (record.get(target_col, "") or "").strip()
        if not (source_url and target_url):
            rows_blank += 1
            continue

        pair = (
            classify_url(source_url, compiled_rules),
            classify_url(target_url, compiled_rules),
        )
        for label in pair:
            endpoint_counts[label] = endpoint_counts.get(label, 0) + 1

        if exclude_unclustered and UNCLUSTERED_LABEL in pair:
            rows_dropped += 1
            continue

        link_counts[pair] = link_counts.get(pair, 0) + 1
        rows_kept += 1

    stats = {
        "total_rows": rows_seen,
        "skipped_empty": rows_blank,
        "skipped_unclustered": rows_dropped,
        "counted": rows_kept,
        "cluster_url_counts": endpoint_counts,
    }
    return link_counts, stats
# ─────────────────────────────────────────────────────────────────────
# SANKEY DATA PREPARATION
# ─────────────────────────────────────────────────────────────────────
def prepare_sankey_data(
    flow: dict[tuple[str, str], int], min_links: int = 0
) -> dict:
    node_set: set[str] = set()
    for (src, tgt), count in flow.items():
        if count