# I built a Python Streamlit app that visualizes internal-link clusters by
# page-type regex, so that you can see how links flow between content groups.
# Let's see what the outcome looks like.
#
# ## **How does it work?**
#
# I will share the Python script as well, but first let's break down how it
# works from the UI side. You provide a page regex, assign a cluster name,
# and then generate the Sankey visualization.
#
# Let's see that view in Streamlit.
#
# I did this exercise with a Screaming Frog crawl of the Gymshark website.
#
# Now here is the full Python script that you can use:

#!/usr/bin/env python3
"""
Regex-Based Internal Linking Sankey Visualizer (Streamlit App)

Upload an internal linking CSV, define URL-contains rules to cluster URLs,
and visualize link flow between clusters as an interactive Sankey diagram.
Download the Sankey as a self-contained HTML file.

Usage:
    streamlit run "sitemap_based-inlinks-sankey - Copy.py"
"""

import csv
import json
import re
from collections import defaultdict
from io import StringIO

import streamlit as st

# ─────────────────────────────────────────────────────────────────────
# SESSION STATE DEFAULTS
# ─────────────────────────────────────────────────────────────────────

# Seed each key only when it is absent so values already stored in the
# session are never overwritten.
if "rules" not in st.session_state:
    st.session_state.rules = [
        {"pattern": "", "cluster": ""},
    ]
if "sankey_html" not in st.session_state:
    st.session_state.sankey_html = None

# Cluster name given to any URL that matches none of the rules.
UNCLUSTERED_LABEL = "Unclustered"


# ─────────────────────────────────────────────────────────────────────
# URL CLASSIFICATION
# ─────────────────────────────────────────────────────────────────────

def compile_rules(rules: list[dict]) -> list[tuple[re.Pattern, str]]:
    """Compile user-defined rules into (regex, cluster_name) pairs.

    Args:
        rules: Dicts with a "pattern" (regular expression) and a "cluster"
            (label) entry. Either value may be missing, ``None`` or blank.

    Returns:
        The compiled, case-insensitive patterns paired with their cluster
        names, in the same order as ``rules``. Rules with a blank pattern
        or cluster are dropped silently; rules whose pattern is not a valid
        regex are dropped after showing a Streamlit warning.
    """
    compiled: list[tuple[re.Pattern, str]] = []
    for rule in rules:
        # `or ""` also covers keys that are present but hold None, which
        # would otherwise raise AttributeError on .strip().
        pattern = (rule.get("pattern") or "").strip()
        cluster = (rule.get("cluster") or "").strip()
        if not pattern or not cluster:
            continue
        try:
            regex = re.compile(pattern, re.IGNORECASE)
        except re.error:
            st.warning(f"Invalid regex pattern: `{pattern}` — skipped.")
        else:
            compiled.append((regex, cluster))
    return compiled


def classify_url(url: str, compiled_rules: list[tuple[re.Pattern, str]]) -> str:
    """Return the cluster name for a URL based on the first matching rule.

    Rules are tried in order with ``regex.search`` (match anywhere in the
    URL). ``UNCLUSTERED_LABEL`` is returned when no rule matches.
    """
    for regex, cluster in compiled_rules:
        if regex.search(url):
            return cluster
    return UNCLUSTERED_LABEL


def normalize_url(raw: str) -> str:
    """Return ``raw`` trimmed, without its ``#fragment`` or trailing slashes.

    Empty or ``None`` input yields an empty string.
    """
    if not raw:
        return ""
    # Drop the fragment first: stripping "/" before it would leave the
    # slash in place for URLs such as "https://example.com/page/#top".
    without_fragment = raw.strip().partition("#")[0]
    return without_fragment.rstrip("/")


# ─────────────────────────────────────────────────────────────────────
# CSV PROCESSING
# ─────────────────────────────────────────────────────────────────────

def process_csv(
    file_content: str,
    source_col: str,
    target_col: str,
    compiled_rules: list[tuple[re.Pattern, str]],
    exclude_unclustered: bool,
    delimiter: str = ",",
) -> tuple[dict[tuple[str, str], int], dict]:
    """Parse the CSV text and aggregate link counts between clusters.

    Args:
        file_content: Full CSV text, header row included.
        source_col: Header name of the column holding the linking URL.
        target_col: Header name of the column holding the linked-to URL.
        compiled_rules: Output of ``compile_rules``.
        exclude_unclustered: When true, rows where either end falls into
            ``UNCLUSTERED_LABEL`` are left out of the flow.
        delimiter: Field separator passed to ``csv.DictReader``.

    Returns:
        ``(flow, stats)`` where ``flow`` maps ``(source_cluster,
        target_cluster)`` to the number of counted rows and ``stats`` has
        the keys ``total_rows``, ``skipped_empty``, ``skipped_unclustered``,
        ``counted`` and ``cluster_url_counts``.

        ``cluster_url_counts`` tallies link endpoints per cluster (one for
        the source and one for the target of every non-empty row), not
        distinct URLs, and it is updated before the unclustered filter is
        applied.

        A row whose source or target cell is blank -- which includes every
        row when a column name is absent from the header -- is counted in
        ``skipped_empty``.
    """
    reader = csv.DictReader(StringIO(file_content), delimiter=delimiter)

    flow: dict[tuple[str, str], int] = defaultdict(int)
    cluster_url_counts: dict[str, int] = defaultdict(int)
    total_rows = 0
    skipped_empty = 0
    skipped_unclustered = 0
    counted = 0

    for row in reader:
        total_rows += 1
        # DictReader fills missing trailing fields with None, hence `or ""`.
        raw_src = (row.get(source_col, "") or "").strip()
        raw_tgt = (row.get(target_col, "") or "").strip()
        if not raw_src or not raw_tgt:
            skipped_empty += 1
            continue

        src_cluster = classify_url(raw_src, compiled_rules)
        tgt_cluster = classify_url(raw_tgt, compiled_rules)
        cluster_url_counts[src_cluster] += 1
        cluster_url_counts[tgt_cluster] += 1

        if exclude_unclustered and UNCLUSTERED_LABEL in (src_cluster, tgt_cluster):
            skipped_unclustered += 1
            continue

        flow[(src_cluster, tgt_cluster)] += 1
        counted += 1

    stats = {
        "total_rows": total_rows,
        "skipped_empty": skipped_empty,
        "skipped_unclustered": skipped_unclustered,
        "counted": counted,
        "cluster_url_counts": dict(cluster_url_counts),
    }
    return dict(flow), stats


# ─────────────────────────────────────────────────────────────────────
# SANKEY DATA PREPARATION
───────────────────────────────────────────────────────────────────── def prepare_sankey_data( flow: dict[tuple[str, str], int], min_links: int = 0 ) -> dict: node_set: set[str] = set() for (src, tgt), count in flow.items(): if count