From c079b9fb0ef49d3a05e923a209e70eb8ba25c68e Mon Sep 17 00:00:00 2001 From: svij Date: Fri, 17 Apr 2026 20:25:15 +0000 Subject: [PATCH 01/20] feat(analytics): scaffold data_analyzer package structure Co-Authored-By: shubhamvij --- gigl/analytics/data_analyzer/__init__.py | 6 ++++++ gigl/analytics/data_analyzer/report/__init__.py | 6 ++++++ tests/test_assets/analytics/__init__.py | 0 tests/unit/analytics/__init__.py | 0 tests/unit/analytics/data_analyzer/__init__.py | 0 tests/unit/analytics/data_analyzer/report/__init__.py | 0 6 files changed, 12 insertions(+) create mode 100644 gigl/analytics/data_analyzer/__init__.py create mode 100644 gigl/analytics/data_analyzer/report/__init__.py create mode 100644 tests/test_assets/analytics/__init__.py create mode 100644 tests/unit/analytics/__init__.py create mode 100644 tests/unit/analytics/data_analyzer/__init__.py create mode 100644 tests/unit/analytics/data_analyzer/report/__init__.py diff --git a/gigl/analytics/data_analyzer/__init__.py b/gigl/analytics/data_analyzer/__init__.py new file mode 100644 index 000000000..0b681ff11 --- /dev/null +++ b/gigl/analytics/data_analyzer/__init__.py @@ -0,0 +1,6 @@ +""" +BQ Data Analyzer for pre-training graph data analysis. + +Produces a single HTML report covering data quality, feature distributions, +and graph structure metrics from BigQuery node/edge tables. +""" diff --git a/gigl/analytics/data_analyzer/report/__init__.py b/gigl/analytics/data_analyzer/report/__init__.py new file mode 100644 index 000000000..8cde20291 --- /dev/null +++ b/gigl/analytics/data_analyzer/report/__init__.py @@ -0,0 +1,6 @@ +""" +HTML report generation for the BQ Data Analyzer. + +AI-owned assets (*.ai.html, *.ai.js, *.ai.css) are defined by SPEC.md +in this directory and can be regenerated from that spec. 
+""" diff --git a/tests/test_assets/analytics/__init__.py b/tests/test_assets/analytics/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tests/unit/analytics/__init__.py b/tests/unit/analytics/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tests/unit/analytics/data_analyzer/__init__.py b/tests/unit/analytics/data_analyzer/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tests/unit/analytics/data_analyzer/report/__init__.py b/tests/unit/analytics/data_analyzer/report/__init__.py new file mode 100644 index 000000000..e69de29bb From 398849371fe2351dd29e6586041a8b1a0b0d40cc Mon Sep 17 00:00:00 2001 From: svij Date: Fri, 17 Apr 2026 20:44:24 +0000 Subject: [PATCH 02/20] feat(analytics): add DataAnalyzerConfig with YAML loading and tests Co-Authored-By: shubhamvij --- gigl/analytics/data_analyzer/config.py | 75 +++++++++++++++++++ .../analytics/sample_analyzer_config.yaml | 16 ++++ .../analytics/data_analyzer/config_test.py | 64 ++++++++++++++++ 3 files changed, 155 insertions(+) create mode 100644 gigl/analytics/data_analyzer/config.py create mode 100644 tests/test_assets/analytics/sample_analyzer_config.yaml create mode 100644 tests/unit/analytics/data_analyzer/config_test.py diff --git a/gigl/analytics/data_analyzer/config.py b/gigl/analytics/data_analyzer/config.py new file mode 100644 index 000000000..0ea9c721e --- /dev/null +++ b/gigl/analytics/data_analyzer/config.py @@ -0,0 +1,75 @@ +from dataclasses import dataclass, field +from typing import Optional + +from omegaconf import MISSING, OmegaConf + +from gigl.common.logger import Logger + +logger = Logger() + + +@dataclass +class NodeTableSpec: + """Specification for a node table in BigQuery.""" + + bq_table: str = MISSING + node_type: str = MISSING + id_column: str = MISSING + feature_columns: list[str] = MISSING + label_column: Optional[str] = None + + +@dataclass +class EdgeTableSpec: + """Specification for an edge table in BigQuery.""" + + bq_table: str = MISSING + edge_type: str = MISSING + src_id_column: str = MISSING + dst_id_column: str = MISSING + feature_columns: list[str] = field(default_factory=list) + timestamp_column: Optional[str] = None + + +@dataclass +class DataAnalyzerConfig: + """Configuration for the BQ Data Analyzer. + + Parsed from YAML via OmegaConf. + + Example: + >>> config = load_analyzer_config("gs://bucket/config.yaml") + >>> config.node_tables[0].bq_table + 'project.dataset.user_nodes' + """ + + node_tables: list[NodeTableSpec] = MISSING + edge_tables: list[EdgeTableSpec] = MISSING + output_gcs_path: str = MISSING + fan_out: Optional[list[int]] = None + compute_reciprocity: bool = False + compute_homophily: bool = False + compute_connected_components: bool = False + compute_clustering: bool = False + + +def load_analyzer_config(config_path: str) -> DataAnalyzerConfig: + """Load and validate a DataAnalyzerConfig from a YAML file. + + Args: + config_path: Local file path or GCS URI to the YAML config. + + Returns: + Validated DataAnalyzerConfig instance. + + Raises: + omegaconf.errors.MissingMandatoryValue: If required fields are missing. 
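+
+    Example (a sketch; assumes a local copy of
+    tests/test_assets/analytics/sample_analyzer_config.yaml):
+        >>> config = load_analyzer_config("sample_analyzer_config.yaml")
+        >>> config.node_tables[0].node_type
+        'user'
+        >>> config.fan_out
+        [15, 10, 5]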
+ """ + raw = OmegaConf.load(config_path) + merged = OmegaConf.merge(OmegaConf.structured(DataAnalyzerConfig), raw) + config: DataAnalyzerConfig = OmegaConf.to_object(merged) # type: ignore + logger.info( + f"Loaded analyzer config with {len(config.node_tables)} node tables " + f"and {len(config.edge_tables)} edge tables" + ) + return config diff --git a/tests/test_assets/analytics/sample_analyzer_config.yaml b/tests/test_assets/analytics/sample_analyzer_config.yaml new file mode 100644 index 000000000..acd3fb7b9 --- /dev/null +++ b/tests/test_assets/analytics/sample_analyzer_config.yaml @@ -0,0 +1,16 @@ +node_tables: + - bq_table: "test_project.test_dataset.user_nodes" + node_type: "user" + id_column: "user_id" + feature_columns: ["age", "country"] + label_column: "label" + +edge_tables: + - bq_table: "test_project.test_dataset.user_edges" + edge_type: "follows" + src_id_column: "src_user_id" + dst_id_column: "dst_user_id" + feature_columns: ["weight"] + +output_gcs_path: "gs://test-bucket/analysis_output/" +fan_out: [15, 10, 5] diff --git a/tests/unit/analytics/data_analyzer/config_test.py b/tests/unit/analytics/data_analyzer/config_test.py new file mode 100644 index 000000000..866cdd64a --- /dev/null +++ b/tests/unit/analytics/data_analyzer/config_test.py @@ -0,0 +1,64 @@ +from pathlib import Path + +from omegaconf import OmegaConf + +from gigl.analytics.data_analyzer.config import ( + DataAnalyzerConfig, + EdgeTableSpec, + NodeTableSpec, + load_analyzer_config, +) +from tests.test_assets.test_case import TestCase + +SAMPLE_CONFIG_PATH = ( + Path(__file__).parents[3] / "test_assets" / "analytics" / "sample_analyzer_config.yaml" +) + + +class DataAnalyzerConfigTest(TestCase): + def test_load_valid_config(self) -> None: + config = load_analyzer_config(str(SAMPLE_CONFIG_PATH)) + self.assertIsInstance(config, DataAnalyzerConfig) + self.assertEqual(len(config.node_tables), 1) + self.assertEqual(len(config.edge_tables), 1) + self.assertEqual(config.node_tables[0].node_type, "user") + self.assertEqual(config.node_tables[0].label_column, "label") + self.assertEqual(config.edge_tables[0].edge_type, "follows") + self.assertEqual(config.output_gcs_path, "gs://test-bucket/analysis_output/") + self.assertEqual(config.fan_out, [15, 10, 5]) + + def test_optional_fields_default_to_none_or_false(self) -> None: + yaml_str = """ + node_tables: + - bq_table: "p.d.t" + node_type: "user" + id_column: "uid" + feature_columns: ["f1"] + edge_tables: + - bq_table: "p.d.e" + edge_type: "follows" + src_id_column: "src" + dst_id_column: "dst" + output_gcs_path: "gs://bucket/out/" + """ + raw = OmegaConf.create(yaml_str) + merged = OmegaConf.merge(OmegaConf.structured(DataAnalyzerConfig), raw) + config = OmegaConf.to_object(merged) + self.assertIsNone(config.node_tables[0].label_column) + self.assertIsNone(config.edge_tables[0].timestamp_column) + self.assertIsNone(config.fan_out) + self.assertFalse(config.compute_reciprocity) + self.assertFalse(config.compute_homophily) + + def test_missing_required_field_raises(self) -> None: + yaml_str = """ + node_tables: + - bq_table: "p.d.t" + node_type: "user" + edge_tables: [] + output_gcs_path: "gs://bucket/out/" + """ + raw = OmegaConf.create(yaml_str) + with self.assertRaises(Exception): + merged = OmegaConf.merge(OmegaConf.structured(DataAnalyzerConfig), raw) + OmegaConf.to_object(merged) From cf69b383b34cd2b866c15abe8396cc15b45ec83a Mon Sep 17 00:00:00 2001 From: svij Date: Fri, 17 Apr 2026 21:20:25 +0000 Subject: [PATCH 03/20] fix(analytics): remove unused imports in 
config_test.py Co-Authored-By: shubhamvij --- tests/unit/analytics/data_analyzer/config_test.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/unit/analytics/data_analyzer/config_test.py b/tests/unit/analytics/data_analyzer/config_test.py index 866cdd64a..a095a11c8 100644 --- a/tests/unit/analytics/data_analyzer/config_test.py +++ b/tests/unit/analytics/data_analyzer/config_test.py @@ -4,8 +4,6 @@ from gigl.analytics.data_analyzer.config import ( DataAnalyzerConfig, - EdgeTableSpec, - NodeTableSpec, load_analyzer_config, ) from tests.test_assets.test_case import TestCase From 8abae4a8bec6bece8ba2ea76a3506c7a641664a2 Mon Sep 17 00:00:00 2001 From: svij Date: Fri, 17 Apr 2026 21:21:08 +0000 Subject: [PATCH 04/20] feat(analytics): add result type dataclasses (DegreeStats, GraphAnalysisResult, FeatureProfileResult) Co-Authored-By: shubhamvij --- gigl/analytics/data_analyzer/types.py | 70 +++++++++++++++++++++++++++ 1 file changed, 70 insertions(+) create mode 100644 gigl/analytics/data_analyzer/types.py diff --git a/gigl/analytics/data_analyzer/types.py b/gigl/analytics/data_analyzer/types.py new file mode 100644 index 000000000..01d5b43eb --- /dev/null +++ b/gigl/analytics/data_analyzer/types.py @@ -0,0 +1,70 @@ +from dataclasses import dataclass, field + + +@dataclass +class DegreeStats: + """Degree distribution statistics for one edge type and direction. + + Computed from APPROX_QUANTILES(degree, 100) in BigQuery. + """ + + min: int + max: int + mean: float + median: int + p90: int + p99: int + p999: int + percentiles: list[int] + buckets: dict[str, int] # "0-1": count, "2-10": count, etc. + + +@dataclass +class GraphAnalysisResult: + """Complete result of graph structure analysis across all tiers. + + Tier 1 fields are always populated. Tier 3/4 fields may be empty + dicts if the corresponding checks were not applicable or not enabled. 
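+
+    Example (illustrative; degree_stats keys are written as
+    "{edge_type}_{direction}" by the analyzer added later in this series):
+        >>> result = GraphAnalysisResult(node_counts={"user": 1_000_000})
+        >>> result.degree_stats.get("follows_out") is None
+        True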
+ """ + + # Tier 1: hard fails + duplicate_node_counts: dict[str, int] = field(default_factory=dict) + dangling_edge_counts: dict[str, int] = field(default_factory=dict) + referential_integrity_violations: dict[str, int] = field(default_factory=dict) + + # Tier 2: core metrics + node_counts: dict[str, int] = field(default_factory=dict) + edge_counts: dict[str, int] = field(default_factory=dict) + null_rates: dict[str, dict[str, float]] = field(default_factory=dict) + duplicate_edge_counts: dict[str, int] = field(default_factory=dict) + self_loop_counts: dict[str, int] = field(default_factory=dict) + isolated_node_counts: dict[str, int] = field(default_factory=dict) + degree_stats: dict[str, DegreeStats] = field(default_factory=dict) + top_hubs: dict[str, list[tuple[str, int]]] = field(default_factory=dict) + super_hub_int16_clamp_count: dict[str, int] = field(default_factory=dict) + cold_start_node_counts: dict[str, int] = field(default_factory=dict) + feature_memory_bytes: dict[str, int] = field(default_factory=dict) + neighbor_explosion_estimate: dict[str, int] = field(default_factory=dict) + + # Tier 3: label and heterogeneous + class_imbalance: dict[str, dict[str, int]] = field(default_factory=dict) + label_coverage: dict[str, float] = field(default_factory=dict) + edge_type_distribution: dict[str, int] = field(default_factory=dict) + edge_type_node_coverage: dict[str, dict[str, int]] = field(default_factory=dict) + + # Tier 4: opt-in + reciprocity: dict[str, float] = field(default_factory=dict) + power_law_exponent: dict[str, float] = field(default_factory=dict) + + +@dataclass +class FeatureProfileResult: + """Result of TFDV feature profiling across all tables. + + Contains GCS paths to generated artifacts. + """ + + facets_html_paths: dict[str, str] = field(default_factory=dict) + stats_paths: dict[str, str] = field(default_factory=dict) + schema_paths: dict[str, str] = field(default_factory=dict) + anomalies: dict[str, list[str]] = field(default_factory=dict) From f1c7f52ea1a2d600abf369c5033c830c999b44b7 Mon Sep 17 00:00:00 2001 From: svij Date: Fri, 17 Apr 2026 22:03:12 +0000 Subject: [PATCH 05/20] feat(analytics): add 18 SQL query templates for graph structure analysis Co-Authored-By: shubhamvij --- gigl/analytics/data_analyzer/queries.py | 186 ++++++++++++++++++ .../analytics/data_analyzer/queries_test.py | 103 ++++++++++ 2 files changed, 289 insertions(+) create mode 100644 gigl/analytics/data_analyzer/queries.py create mode 100644 tests/unit/analytics/data_analyzer/queries_test.py diff --git a/gigl/analytics/data_analyzer/queries.py b/gigl/analytics/data_analyzer/queries.py new file mode 100644 index 000000000..19b476a21 --- /dev/null +++ b/gigl/analytics/data_analyzer/queries.py @@ -0,0 +1,186 @@ +"""SQL query templates for graph structure analysis. + +Each constant is a format-string template parameterized with table names +and column names. Pattern matches gigl/src/data_preprocessor/lib/enumerate/queries.py. 
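+
+Example (a sketch of rendering one template before submitting it to BQ; the
+table name is illustrative):
+    >>> NODE_COUNT_QUERY.format(table="my_project.my_dataset.user_nodes").strip()
+    'SELECT COUNT(*) AS node_count FROM `my_project.my_dataset.user_nodes`'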
+""" + +import torch + +INT16_MAX = int(torch.iinfo(torch.int16).max) # 32767 + +# --- Tier 1: Hard fails --- + +DANGLING_EDGES_QUERY = """ +SELECT COUNT(*) AS dangling_count +FROM `{table}` +WHERE {src_id_column} IS NULL OR {dst_id_column} IS NULL +""" + +EDGE_REFERENTIAL_INTEGRITY_QUERY = """ +SELECT + COUNTIF(src_node.{node_id_column} IS NULL) AS missing_src_count, + COUNTIF(dst_node.{node_id_column} IS NULL) AS missing_dst_count +FROM `{edge_table}` AS e +LEFT JOIN `{node_table}` AS src_node + ON e.{src_id_column} = src_node.{node_id_column} +LEFT JOIN `{node_table}` AS dst_node + ON e.{dst_id_column} = dst_node.{node_id_column} +""" + +DUPLICATE_NODE_COUNT_QUERY = """ +SELECT COUNT(*) AS duplicate_count FROM ( + SELECT {id_column} + FROM `{table}` + GROUP BY {id_column} + HAVING COUNT(*) > 1 +) +""" + +# --- Tier 2: Core metrics --- + +NODE_COUNT_QUERY = """ +SELECT COUNT(*) AS node_count FROM `{table}` +""" + +EDGE_COUNT_QUERY = """ +SELECT COUNT(*) AS edge_count FROM `{table}` +""" + +DUPLICATE_EDGE_COUNT_QUERY = """ +SELECT COUNT(*) AS duplicate_count FROM ( + SELECT {src_id_column}, {dst_id_column} + FROM `{table}` + GROUP BY {src_id_column}, {dst_id_column} + HAVING COUNT(*) > 1 +) +""" + +SELF_LOOP_COUNT_QUERY = """ +SELECT COUNT(*) AS self_loop_count +FROM `{table}` +WHERE {src_id_column} = {dst_id_column} +""" + +ISOLATED_NODE_COUNT_QUERY = """ +SELECT COUNT(*) AS isolated_count FROM ( + SELECT n.{node_id_column} + FROM `{node_table}` AS n + LEFT JOIN `{edge_table}` AS e_src + ON n.{node_id_column} = e_src.{src_id_column} + LEFT JOIN `{edge_table}` AS e_dst + ON n.{node_id_column} = e_dst.{dst_id_column} + WHERE e_src.{src_id_column} IS NULL + AND e_dst.{dst_id_column} IS NULL +) +""" + +DEGREE_DISTRIBUTION_QUERY = """ +SELECT + MIN(degree) AS min_degree, + MAX(degree) AS max_degree, + AVG(degree) AS avg_degree, + APPROX_QUANTILES(degree, 100) AS percentiles +FROM ( + SELECT {id_column}, COUNT(*) AS degree + FROM `{table}` + GROUP BY {id_column} +) +""" + +DEGREE_BUCKET_QUERY = """ +SELECT + COUNTIF(degree BETWEEN 0 AND 1) AS bucket_0_1, + COUNTIF(degree BETWEEN 2 AND 10) AS bucket_2_10, + COUNTIF(degree BETWEEN 11 AND 100) AS bucket_11_100, + COUNTIF(degree BETWEEN 101 AND 1000) AS bucket_101_1k, + COUNTIF(degree BETWEEN 1001 AND 10000) AS bucket_1k_10k, + COUNTIF(degree > 10000) AS bucket_10k_plus +FROM ( + SELECT {id_column}, COUNT(*) AS degree + FROM `{table}` + GROUP BY {id_column} +) +""" + +TOP_K_HUBS_QUERY = """ +SELECT {id_column} AS node_id, COUNT(*) AS degree +FROM `{table}` +GROUP BY {id_column} +ORDER BY degree DESC +LIMIT {k} +""" + +SUPER_HUB_INT16_CLAMP_QUERY = f""" +SELECT COUNT(*) AS super_hub_count FROM ( + SELECT {{id_column}}, COUNT(*) AS degree + FROM `{{table}}` + GROUP BY {{id_column}} + HAVING COUNT(*) > {INT16_MAX} +) +""" + +COLD_START_NODE_COUNT_QUERY = """ +SELECT COUNT(*) AS cold_start_count FROM ( + SELECT n.{node_id_column}, COALESCE(e.degree, 0) AS degree + FROM `{node_table}` AS n + LEFT JOIN ( + SELECT {src_id_column} AS nid, COUNT(*) AS degree + FROM `{edge_table}` + GROUP BY {src_id_column} + ) AS e ON n.{node_id_column} = e.nid + WHERE COALESCE(e.degree, 0) <= 1 +) +""" + +# --- Tier 3: Label and heterogeneous --- + +CLASS_IMBALANCE_QUERY = """ +SELECT {label_column} AS label, COUNT(*) AS count +FROM `{table}` +WHERE {label_column} IS NOT NULL +GROUP BY {label_column} +ORDER BY count DESC +""" + +LABEL_COVERAGE_QUERY = """ +SELECT + COUNT(*) AS total, + COUNTIF({label_column} IS NOT NULL) AS labeled, + SAFE_DIVIDE(COUNTIF({label_column} IS 
NOT NULL), COUNT(*)) AS coverage +FROM `{table}` +""" + +EDGE_TYPE_DISTRIBUTION_QUERY = """ +SELECT COUNT(*) AS edge_count FROM `{table}` +""" + +EDGE_TYPE_NODE_COVERAGE_QUERY = """ +SELECT + APPROX_COUNT_DISTINCT({src_id_column}) AS distinct_src_count, + APPROX_COUNT_DISTINCT({dst_id_column}) AS distinct_dst_count +FROM `{table}` +""" + + +def build_null_rates_query(table: str, columns: list[str]) -> str: + """Build a batched NULL rates query for multiple columns. + + One query, one table scan, one COUNTIF per column. + + Args: + table: Fully qualified BQ table name. + columns: List of column names to check. + + Returns: + SQL query string. + """ + countif_clauses = ",\n ".join( + f"SAFE_DIVIDE(COUNTIF({col} IS NULL), COUNT(*)) AS {col}_null_rate" + for col in columns + ) + return f""" +SELECT + COUNT(*) AS total_rows, + {countif_clauses} +FROM `{table}` +""" diff --git a/tests/unit/analytics/data_analyzer/queries_test.py b/tests/unit/analytics/data_analyzer/queries_test.py new file mode 100644 index 000000000..9063cd4e4 --- /dev/null +++ b/tests/unit/analytics/data_analyzer/queries_test.py @@ -0,0 +1,103 @@ +from gigl.analytics.data_analyzer.queries import ( + DANGLING_EDGES_QUERY, + DEGREE_BUCKET_QUERY, + DEGREE_DISTRIBUTION_QUERY, + DUPLICATE_NODE_COUNT_QUERY, + EDGE_REFERENTIAL_INTEGRITY_QUERY, + NODE_COUNT_QUERY, + SUPER_HUB_INT16_CLAMP_QUERY, + TOP_K_HUBS_QUERY, + build_null_rates_query, +) +from tests.test_assets.test_case import TestCase + +NODE_TABLE = "project.dataset.user_nodes" +EDGE_TABLE = "project.dataset.user_edges" + + +class NodeCountQueryTest(TestCase): + def test_contains_table_name(self) -> None: + sql = NODE_COUNT_QUERY.format(table=NODE_TABLE) + self.assertIn(f"`{NODE_TABLE}`", sql) + self.assertIn("COUNT(*)", sql) + + +class DanglingEdgesQueryTest(TestCase): + def test_contains_null_checks(self) -> None: + sql = DANGLING_EDGES_QUERY.format( + table=EDGE_TABLE, src_id_column="src_uid", dst_id_column="dst_uid" + ) + self.assertIn("src_uid IS NULL", sql) + self.assertIn("dst_uid IS NULL", sql) + self.assertIn(f"`{EDGE_TABLE}`", sql) + + +class EdgeReferentialIntegrityQueryTest(TestCase): + def test_contains_left_join(self) -> None: + sql = EDGE_REFERENTIAL_INTEGRITY_QUERY.format( + edge_table=EDGE_TABLE, + node_table=NODE_TABLE, + src_id_column="src_uid", + dst_id_column="dst_uid", + node_id_column="user_id", + ) + self.assertIn("LEFT JOIN", sql) + self.assertIn(f"`{NODE_TABLE}`", sql) + self.assertIn(f"`{EDGE_TABLE}`", sql) + self.assertIn("IS NULL", sql) + + +class DuplicateNodeCountQueryTest(TestCase): + def test_contains_group_by_having(self) -> None: + sql = DUPLICATE_NODE_COUNT_QUERY.format(table=NODE_TABLE, id_column="user_id") + self.assertIn("GROUP BY", sql) + self.assertIn("HAVING", sql) + self.assertIn("user_id", sql) + + +class DegreeDistributionQueryTest(TestCase): + def test_contains_approx_quantiles(self) -> None: + sql = DEGREE_DISTRIBUTION_QUERY.format( + table=EDGE_TABLE, id_column="src_uid" + ) + self.assertIn("APPROX_QUANTILES", sql) + self.assertIn("src_uid", sql) + + +class DegreeBucketQueryTest(TestCase): + def test_contains_countif_buckets(self) -> None: + sql = DEGREE_BUCKET_QUERY.format( + table=EDGE_TABLE, id_column="src_uid" + ) + self.assertIn("COUNTIF", sql) + self.assertIn("src_uid", sql) + + +class NullRatesQueryTest(TestCase): + def test_batches_multiple_columns(self) -> None: + sql = build_null_rates_query( + table=NODE_TABLE, columns=["age", "country", "embedding"] + ) + self.assertIn(f"`{NODE_TABLE}`", sql) + 
self.assertEqual(sql.count("COUNTIF"), 3) + self.assertIn("age", sql) + self.assertIn("country", sql) + self.assertIn("embedding", sql) + + +class SuperHubInt16ClampQueryTest(TestCase): + def test_contains_32767_threshold(self) -> None: + sql = SUPER_HUB_INT16_CLAMP_QUERY.format( + table=EDGE_TABLE, id_column="src_uid" + ) + self.assertIn("32767", sql) + + +class TopKHubsQueryTest(TestCase): + def test_contains_limit(self) -> None: + sql = TOP_K_HUBS_QUERY.format( + table=EDGE_TABLE, id_column="src_uid", k=20 + ) + self.assertIn("LIMIT 20", sql) + self.assertIn("ORDER BY", sql) + self.assertIn("DESC", sql) From 21255d0510ba04cb90f3ab872a1ce58e16c7a308 Mon Sep 17 00:00:00 2001 From: svij Date: Fri, 17 Apr 2026 22:11:50 +0000 Subject: [PATCH 06/20] feat(analytics): add GraphStructureAnalyzer with 4-tier BQ validation Implements the orchestration layer for BQ-based graph data quality checks: - Tier 1 hard-fails (dangling edges, referential integrity, duplicate nodes) raise DataQualityError carrying a partially populated result. - Tier 2 core metrics (counts, degree stats, top-K hubs, INT16 clamp, NULL rates) plus Python-side feature memory and neighbor-explosion estimates. - Tier 3 label/heterogeneous checks auto-enabled by config (label_column presence; multiple edge tables). - Tier 4 opt-in placeholders (power-law exponent from degree stats). Co-Authored-By: shubhamvij --- .../data_analyzer/graph_structure_analyzer.py | 507 ++++++++++++++++++ .../graph_structure_analyzer_test.py | 268 +++++++++ 2 files changed, 775 insertions(+) create mode 100644 gigl/analytics/data_analyzer/graph_structure_analyzer.py create mode 100644 tests/unit/analytics/data_analyzer/graph_structure_analyzer_test.py diff --git a/gigl/analytics/data_analyzer/graph_structure_analyzer.py b/gigl/analytics/data_analyzer/graph_structure_analyzer.py new file mode 100644 index 000000000..a69e3bdb1 --- /dev/null +++ b/gigl/analytics/data_analyzer/graph_structure_analyzer.py @@ -0,0 +1,507 @@ +"""GraphStructureAnalyzer: 4-tier BigQuery-based graph data quality checks. + +Tier 1 (hard fails) + dangling edges, referential integrity, duplicate nodes. Any violation + raises DataQualityError with a partially populated GraphAnalysisResult. + +Tier 2 (core metrics) + node/edge counts, degree distribution, top-K hubs, INT16 clamp hazards, + isolated/cold-start nodes, duplicate edges, self-loops, NULL rates, and + two Python-side computations (feature memory budget, neighbor explosion). + +Tier 3 (label and heterogeneous) + class imbalance and label coverage (auto-enabled when node_tables have a + label_column); edge-type distribution and per-edge-type node coverage + (auto-enabled when more than one edge table is declared). + +Tier 4 (opt-in) + reciprocity, power-law exponent estimate. Gated by config flags. 
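+
+Example (a minimal sketch of the intended call pattern; the config path is
+illustrative):
+    >>> from gigl.analytics.data_analyzer.config import load_analyzer_config
+    >>> config = load_analyzer_config("analyzer_config.yaml")
+    >>> try:
+    ...     result = GraphStructureAnalyzer().analyze(config)
+    ... except DataQualityError as e:
+    ...     result = e.partial_result  # see which Tier 1 checks failed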
+""" + +import math +from typing import Optional + +from gigl.analytics.data_analyzer.config import ( + DataAnalyzerConfig, + EdgeTableSpec, + NodeTableSpec, +) +from gigl.analytics.data_analyzer.queries import ( + CLASS_IMBALANCE_QUERY, + COLD_START_NODE_COUNT_QUERY, + DANGLING_EDGES_QUERY, + DEGREE_BUCKET_QUERY, + DEGREE_DISTRIBUTION_QUERY, + DUPLICATE_EDGE_COUNT_QUERY, + DUPLICATE_NODE_COUNT_QUERY, + EDGE_COUNT_QUERY, + EDGE_REFERENTIAL_INTEGRITY_QUERY, + EDGE_TYPE_DISTRIBUTION_QUERY, + EDGE_TYPE_NODE_COVERAGE_QUERY, + ISOLATED_NODE_COUNT_QUERY, + LABEL_COVERAGE_QUERY, + NODE_COUNT_QUERY, + SELF_LOOP_COUNT_QUERY, + SUPER_HUB_INT16_CLAMP_QUERY, + TOP_K_HUBS_QUERY, + build_null_rates_query, +) +from gigl.analytics.data_analyzer.types import DegreeStats, GraphAnalysisResult +from gigl.common.logger import Logger +from gigl.src.common.utils.bq import BqUtils + +logger = Logger() + +# Default assumption for feature memory budget: float64 per feature column. +_BYTES_PER_FEATURE = 8 +_TOP_K_HUBS = 20 +_PARALLEL_BQ_WORKERS = 10 + + +class DataQualityError(Exception): + """Raised when Tier 1 hard-fail checks detect data quality violations. + + Carries a partially populated GraphAnalysisResult so callers can inspect + which specific checks failed without re-running the analyzer. + """ + + def __init__(self, message: str, partial_result: GraphAnalysisResult) -> None: + super().__init__(message) + self.partial_result = partial_result + + +class GraphStructureAnalyzer: + """Runs BigQuery SQL checks across 4 tiers against the tables declared in a config. + + Example: + >>> config = load_analyzer_config("gs://bucket/config.yaml") + >>> analyzer = GraphStructureAnalyzer() + >>> result = analyzer.analyze(config) + >>> result.node_counts["user"] + 1000000 + + Tier 1 is blocking: a violation raises DataQualityError before Tiers 2-4 run. + Tiers 2-4 are aggregated best-effort into a single GraphAnalysisResult. + """ + + def __init__(self, bq_project: Optional[str] = None) -> None: + self._bq_utils = BqUtils(project=bq_project) + + def analyze(self, config: DataAnalyzerConfig) -> GraphAnalysisResult: + """Run all applicable tiers and return aggregated results. + + Args: + config: Data analyzer configuration declaring node and edge tables + plus any opt-in expensive checks (reciprocity, etc.). + + Returns: + GraphAnalysisResult with tier 1-4 fields populated per config. + + Raises: + DataQualityError: If tier 1 checks find any violations. The + exception carries a partial result with the specific counts. + """ + result = GraphAnalysisResult() + logger.info("Starting graph structure analysis (Tier 1: hard fails)") + self._run_tier1(config, result) + + logger.info("Tier 1 passed. Running Tier 2 (core metrics)") + self._run_tier2(config, result) + + logger.info("Running Tier 3 (label / heterogeneous)") + self._run_tier3(config, result) + + logger.info("Running Tier 4 (opt-in)") + self._run_tier4(config, result) + return result + + # ------------------------------------------------------------------ # + # Tier 1: hard fails # + # ------------------------------------------------------------------ # + + def _run_tier1( + self, config: DataAnalyzerConfig, result: GraphAnalysisResult + ) -> None: + """Run all tier 1 checks; raise DataQualityError on any violation.""" + violations: list[str] = [] + + # Duplicate nodes (per node table). 
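+        # A "duplicate" is an id_column value appearing in more than one row
+        # of the node table (GROUP BY id HAVING COUNT(*) > 1).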
+        for node_table in config.node_tables:
+            query = DUPLICATE_NODE_COUNT_QUERY.format(
+                table=node_table.bq_table, id_column=node_table.id_column
+            )
+            count = self._query_scalar(query, "duplicate_count")
+            result.duplicate_node_counts[node_table.node_type] = count
+            if count > 0:
+                violations.append(
+                    f"node_type={node_table.node_type} has {count} duplicate IDs"
+                )
+
+        # Dangling edges and referential integrity (per edge table).
+        for edge_table in config.edge_tables:
+            dangling_query = DANGLING_EDGES_QUERY.format(
+                table=edge_table.bq_table,
+                src_id_column=edge_table.src_id_column,
+                dst_id_column=edge_table.dst_id_column,
+            )
+            dangling = self._query_scalar(dangling_query, "dangling_count")
+            result.dangling_edge_counts[edge_table.edge_type] = dangling
+            if dangling > 0:
+                violations.append(
+                    f"edge_type={edge_table.edge_type} has {dangling} dangling edges"
+                )
+
+            # Referential integrity: join against the first node table (heterogeneous
+            # graphs with per-edge-type node types would refine this per edge table;
+            # for now we pair each edge table with config.node_tables[0]).
+            if config.node_tables:
+                node_table = config.node_tables[0]
+                ref_query = EDGE_REFERENTIAL_INTEGRITY_QUERY.format(
+                    edge_table=edge_table.bq_table,
+                    node_table=node_table.bq_table,
+                    src_id_column=edge_table.src_id_column,
+                    dst_id_column=edge_table.dst_id_column,
+                    node_id_column=node_table.id_column,
+                )
+                rows = list(self._bq_utils.run_query(query=ref_query, labels={}))
+                missing_src = rows[0]["missing_src_count"] if rows else 0
+                missing_dst = rows[0]["missing_dst_count"] if rows else 0
+                total_missing = int(missing_src) + int(missing_dst)
+                result.referential_integrity_violations[
+                    edge_table.edge_type
+                ] = total_missing
+                if total_missing > 0:
+                    violations.append(
+                        f"edge_type={edge_table.edge_type} has {total_missing} "
+                        "referential integrity violations"
+                    )
+
+        if violations:
+            msg = "Tier 1 data quality violations detected:\n - " + "\n - ".join(
+                violations
+            )
+            logger.error(msg)
+            raise DataQualityError(msg, partial_result=result)
+
+    # ------------------------------------------------------------------ #
+    # Tier 2: core metrics                                               #
+    # ------------------------------------------------------------------ #
+
+    def _run_tier2(
+        self, config: DataAnalyzerConfig, result: GraphAnalysisResult
+    ) -> None:
+        """Collect core structural metrics (BQ jobs run sequentially for now)."""
+        # Node-level metrics (counts + null rates).
+        for node_table in config.node_tables:
+            self._tier2_node_metrics(node_table, result)
+
+        # Edge-level metrics. Each edge table is paired with the first node
+        # table for isolated/cold-start joins; per-edge-type node pairing for
+        # heterogeneous graphs remains a TODO.
+        primary_node_table = config.node_tables[0] if config.node_tables else None
+        for edge_table in config.edge_tables:
+            self._tier2_edge_metrics(edge_table, primary_node_table, result)
+
+        # Python-side computations.
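+        # Worked sketch with illustrative numbers: 1_000_000 nodes with 128
+        # float64 features -> 1_000_000 * 128 * 8 bytes ~= 1.02 GB; with
+        # fan_out=[15, 10] and mean out-degree 5, the neighbor explosion
+        # estimate is 15 * 10 * 5 = 750 sampled nodes per seed.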
+ self._compute_feature_memory_budget(config, result) + self._compute_neighbor_explosion_estimate(config, result) + + def _tier2_node_metrics( + self, node_table: NodeTableSpec, result: GraphAnalysisResult + ) -> None: + node_count = self._query_scalar( + NODE_COUNT_QUERY.format(table=node_table.bq_table), "node_count" + ) + result.node_counts[node_table.node_type] = node_count + + columns_to_check: list[str] = [node_table.id_column] + columns_to_check.extend(node_table.feature_columns) + if node_table.label_column: + columns_to_check.append(node_table.label_column) + + null_query = build_null_rates_query( + table=node_table.bq_table, columns=columns_to_check + ) + rows = list(self._bq_utils.run_query(query=null_query, labels={})) + if rows: + row = rows[0] + rates: dict[str, float] = {} + for col in columns_to_check: + key = f"{col}_null_rate" + rate = row[key] + rates[col] = float(rate) if rate is not None else 0.0 + result.null_rates[node_table.node_type] = rates + + def _tier2_edge_metrics( + self, + edge_table: EdgeTableSpec, + node_table: Optional[NodeTableSpec], + result: GraphAnalysisResult, + ) -> None: + edge_type = edge_table.edge_type + + # Scalar counts. + result.edge_counts[edge_type] = self._query_scalar( + EDGE_COUNT_QUERY.format(table=edge_table.bq_table), "edge_count" + ) + result.duplicate_edge_counts[edge_type] = self._query_scalar( + DUPLICATE_EDGE_COUNT_QUERY.format( + table=edge_table.bq_table, + src_id_column=edge_table.src_id_column, + dst_id_column=edge_table.dst_id_column, + ), + "duplicate_count", + ) + result.self_loop_counts[edge_type] = self._query_scalar( + SELF_LOOP_COUNT_QUERY.format( + table=edge_table.bq_table, + src_id_column=edge_table.src_id_column, + dst_id_column=edge_table.dst_id_column, + ), + "self_loop_count", + ) + + # Super-hub INT16 clamp check (indexed by src). + result.super_hub_int16_clamp_count[edge_type] = self._query_scalar( + SUPER_HUB_INT16_CLAMP_QUERY.format( + table=edge_table.bq_table, id_column=edge_table.src_id_column + ), + "super_hub_count", + ) + + # Isolated and cold-start require a node table join. + if node_table is not None: + result.isolated_node_counts[edge_type] = self._query_scalar( + ISOLATED_NODE_COUNT_QUERY.format( + node_table=node_table.bq_table, + edge_table=edge_table.bq_table, + node_id_column=node_table.id_column, + src_id_column=edge_table.src_id_column, + dst_id_column=edge_table.dst_id_column, + ), + "isolated_count", + ) + result.cold_start_node_counts[edge_type] = self._query_scalar( + COLD_START_NODE_COUNT_QUERY.format( + node_table=node_table.bq_table, + edge_table=edge_table.bq_table, + node_id_column=node_table.id_column, + src_id_column=edge_table.src_id_column, + ), + "cold_start_count", + ) + + # Top-K hubs (by src). + top_hub_rows = list( + self._bq_utils.run_query( + query=TOP_K_HUBS_QUERY.format( + table=edge_table.bq_table, + id_column=edge_table.src_id_column, + k=_TOP_K_HUBS, + ), + labels={}, + ) + ) + result.top_hubs[edge_type] = [ + (str(row["node_id"]), int(row["degree"])) for row in top_hub_rows + ] + + # Degree statistics: distribution + buckets, in + out directions. 
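+        # "out" groups edges by src_id_column (out-degree of each source node);
+        # "in" groups by dst_id_column (in-degree of each destination node).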
+ for direction, id_column in ( + ("out", edge_table.src_id_column), + ("in", edge_table.dst_id_column), + ): + result.degree_stats[f"{edge_type}_{direction}"] = self._build_degree_stats( + table=edge_table.bq_table, id_column=id_column + ) + + def _build_degree_stats(self, table: str, id_column: str) -> DegreeStats: + """Run degree distribution + bucket queries and pack into DegreeStats.""" + dist_rows = list( + self._bq_utils.run_query( + query=DEGREE_DISTRIBUTION_QUERY.format( + table=table, id_column=id_column + ), + labels={}, + ) + ) + bucket_rows = list( + self._bq_utils.run_query( + query=DEGREE_BUCKET_QUERY.format(table=table, id_column=id_column), + labels={}, + ) + ) + dist_row = dist_rows[0] + bucket_row = bucket_rows[0] + + percentiles_raw = list(dist_row["percentiles"]) + percentiles = [int(p) if p is not None else 0 for p in percentiles_raw] + # APPROX_QUANTILES(degree, 100) returns 101 values: index 0..100. + median = percentiles[50] if len(percentiles) > 50 else 0 + p90 = percentiles[90] if len(percentiles) > 90 else percentiles[-1] + p99 = percentiles[99] if len(percentiles) > 99 else percentiles[-1] + # We only have 100-bucket quantiles, so p999 ~= p99 as best-effort. + p999 = p99 + + buckets: dict[str, int] = { + "0-1": int(bucket_row["bucket_0_1"]), + "2-10": int(bucket_row["bucket_2_10"]), + "11-100": int(bucket_row["bucket_11_100"]), + "101-1k": int(bucket_row["bucket_101_1k"]), + "1k-10k": int(bucket_row["bucket_1k_10k"]), + "10k+": int(bucket_row["bucket_10k_plus"]), + } + + return DegreeStats( + min=int(dist_row["min_degree"] or 0), + max=int(dist_row["max_degree"] or 0), + mean=float(dist_row["avg_degree"] or 0.0), + median=median, + p90=p90, + p99=p99, + p999=p999, + percentiles=percentiles, + buckets=buckets, + ) + + # ------------------------------------------------------------------ # + # Tier 3: label and heterogeneous # + # ------------------------------------------------------------------ # + + def _run_tier3( + self, config: DataAnalyzerConfig, result: GraphAnalysisResult + ) -> None: + # Label-related checks per node table with a label column. + for node_table in config.node_tables: + if not node_table.label_column: + continue + class_rows = list( + self._bq_utils.run_query( + query=CLASS_IMBALANCE_QUERY.format( + table=node_table.bq_table, + label_column=node_table.label_column, + ), + labels={}, + ) + ) + result.class_imbalance[node_table.node_type] = { + str(row["label"]): int(row["count"]) for row in class_rows + } + + coverage_rows = list( + self._bq_utils.run_query( + query=LABEL_COVERAGE_QUERY.format( + table=node_table.bq_table, + label_column=node_table.label_column, + ), + labels={}, + ) + ) + if coverage_rows: + coverage = coverage_rows[0]["coverage"] + result.label_coverage[node_table.node_type] = ( + float(coverage) if coverage is not None else 0.0 + ) + + # Heterogeneous distribution only if more than one edge type. + if len(config.edge_tables) > 1: + for edge_table in config.edge_tables: + edge_type = edge_table.edge_type + # Edge-type distribution is effectively the edge count; reuse. 
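+                # Reusing the Tier 2 count avoids a second COUNT(*) scan of
+                # the same table.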
+ if edge_type in result.edge_counts: + result.edge_type_distribution[edge_type] = result.edge_counts[ + edge_type + ] + else: + result.edge_type_distribution[edge_type] = self._query_scalar( + EDGE_TYPE_DISTRIBUTION_QUERY.format(table=edge_table.bq_table), + "edge_count", + ) + coverage_rows = list( + self._bq_utils.run_query( + query=EDGE_TYPE_NODE_COVERAGE_QUERY.format( + table=edge_table.bq_table, + src_id_column=edge_table.src_id_column, + dst_id_column=edge_table.dst_id_column, + ), + labels={}, + ) + ) + if coverage_rows: + row = coverage_rows[0] + result.edge_type_node_coverage[edge_type] = { + "distinct_src_count": int(row["distinct_src_count"] or 0), + "distinct_dst_count": int(row["distinct_dst_count"] or 0), + } + + # ------------------------------------------------------------------ # + # Tier 4: opt-in # + # ------------------------------------------------------------------ # + + def _run_tier4( + self, config: DataAnalyzerConfig, result: GraphAnalysisResult + ) -> None: + """Populate opt-in metrics gated by config flags. + + Power-law exponent is always cheap (derived from existing degree stats) + and is computed whenever degree stats are available. Reciprocity, + homophily, connected components and clustering require dedicated + queries not yet defined; they remain empty unless the corresponding + flag is enabled AND a query is implemented. + """ + # Power-law exponent: approximate from degree stats using a simple + # heuristic: alpha ~= 1 + log(max) / log(median) for median > 1. + for degree_key, stats in result.degree_stats.items(): + if stats.median > 1 and stats.max > stats.median: + exponent = 1.0 + math.log(stats.max) / math.log(stats.median) + result.power_law_exponent[degree_key] = exponent + + if config.compute_reciprocity: + # Query not yet defined; log and skip. + logger.warning( + "compute_reciprocity=True but reciprocity query is not implemented; " + "skipping Tier 4 reciprocity." 
+ ) + + # ------------------------------------------------------------------ # + # Python-only computations # + # ------------------------------------------------------------------ # + + def _compute_feature_memory_budget( + self, config: DataAnalyzerConfig, result: GraphAnalysisResult + ) -> None: + """Estimate per-node-type memory footprint of features (float64 assumed).""" + for node_table in config.node_tables: + node_count = result.node_counts.get(node_table.node_type, 0) + num_features = len(node_table.feature_columns) + result.feature_memory_bytes[node_table.node_type] = ( + node_count * num_features * _BYTES_PER_FEATURE + ) + + def _compute_neighbor_explosion_estimate( + self, config: DataAnalyzerConfig, result: GraphAnalysisResult + ) -> None: + """Multiply fan-out factors and scale by out-degree mean per edge type.""" + if not config.fan_out: + return + fan_out_product = 1 + for hop in config.fan_out: + fan_out_product *= int(hop) + for edge_table in config.edge_tables: + out_stats = result.degree_stats.get(f"{edge_table.edge_type}_out") + if out_stats is None: + continue + estimate = int(fan_out_product * max(out_stats.mean, 1.0)) + result.neighbor_explosion_estimate[edge_table.edge_type] = estimate + + # ------------------------------------------------------------------ # + # Helpers # + # ------------------------------------------------------------------ # + + def _query_scalar(self, query: str, column: str) -> int: + """Run a single-row, single-column query and return the scalar as int.""" + rows = list(self._bq_utils.run_query(query=query, labels={})) + if not rows: + return 0 + value = rows[0][column] + return int(value) if value is not None else 0 diff --git a/tests/unit/analytics/data_analyzer/graph_structure_analyzer_test.py b/tests/unit/analytics/data_analyzer/graph_structure_analyzer_test.py new file mode 100644 index 000000000..cd185738f --- /dev/null +++ b/tests/unit/analytics/data_analyzer/graph_structure_analyzer_test.py @@ -0,0 +1,268 @@ +"""Unit tests for GraphStructureAnalyzer. + +All BQ calls are mocked via patching BqUtils. The goal is to exercise the +orchestration logic (tier ordering, gating, result population) without hitting +a real BigQuery backend. 
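+
+The routing pattern, in miniature (a sketch; individual tests layer per-query
+overrides on top of this default):
+
+    mock_bq.run_query.side_effect = lambda query, labels=None: (
+        _mock_row_iterator(_default_rows_for_query(query))
+    )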
+""" + +from typing import Any, Optional +from unittest.mock import MagicMock, patch + +from gigl.analytics.data_analyzer.config import ( + DataAnalyzerConfig, + EdgeTableSpec, + NodeTableSpec, +) +from gigl.analytics.data_analyzer.graph_structure_analyzer import ( + DataQualityError, + GraphStructureAnalyzer, +) +from tests.test_assets.test_case import TestCase + + +def _make_config( + label_column: Optional[str] = None, + compute_reciprocity: bool = False, + extra_edge: bool = False, +) -> DataAnalyzerConfig: + edge_tables = [ + EdgeTableSpec( + bq_table="p.d.edges", + edge_type="follows", + src_id_column="src", + dst_id_column="dst", + ) + ] + if extra_edge: + edge_tables.append( + EdgeTableSpec( + bq_table="p.d.edges2", + edge_type="likes", + src_id_column="src", + dst_id_column="dst", + ) + ) + return DataAnalyzerConfig( + node_tables=[ + NodeTableSpec( + bq_table="p.d.nodes", + node_type="user", + id_column="uid", + feature_columns=["f1", "f2"], + label_column=label_column, + ) + ], + edge_tables=edge_tables, + output_gcs_path="gs://bucket/out/", + fan_out=[15, 10], + compute_reciprocity=compute_reciprocity, + ) + + +def _mock_row(data: dict[str, Any]) -> MagicMock: + """Mock a BigQuery Row supporting both key and attribute access.""" + row = MagicMock() + keys = list(data.keys()) + values = list(data.values()) + row.__getitem__ = lambda self, key: ( + data[key] if isinstance(key, str) else values[key] + ) + row.keys = lambda: keys + row.values = lambda: values + for k, v in data.items(): + setattr(row, k, v) + return row + + +def _mock_row_iterator(rows: list[dict[str, Any]]) -> MagicMock: + """Mock a RowIterator yielding the given row dicts.""" + mock = MagicMock() + mock.__iter__ = lambda self: iter([_mock_row(r) for r in rows]) + return mock + + +def _default_row_for_query(query: str) -> dict[str, Any]: + """Return a reasonable 'zero violation, small graph' row for any query.""" + q = query.lower() + if "dangling_count" in q: + return {"dangling_count": 0} + if "missing_src_count" in q: + return {"missing_src_count": 0, "missing_dst_count": 0} + if "duplicate_count" in q: + return {"duplicate_count": 0} + if "node_count" in q and "distinct_src_count" not in q: + return {"node_count": 1000} + if "edge_count" in q: + return {"edge_count": 5000} + if "self_loop_count" in q: + return {"self_loop_count": 0} + if "isolated_count" in q: + return {"isolated_count": 0} + if "min_degree" in q or "approx_quantiles" in q: + return { + "min_degree": 0, + "max_degree": 100, + "avg_degree": 5.0, + "percentiles": list(range(101)), + } + if "bucket_0_1" in q: + return { + "bucket_0_1": 10, + "bucket_2_10": 900, + "bucket_11_100": 80, + "bucket_101_1k": 10, + "bucket_1k_10k": 0, + "bucket_10k_plus": 0, + } + if "super_hub_count" in q: + return {"super_hub_count": 0} + if "cold_start_count" in q: + return {"cold_start_count": 50} + if "null_rate" in q: + # Include any plausible column name ending in _null_rate with zero default. 
+ return { + "total_rows": 1000, + "f1_null_rate": 0.0, + "f2_null_rate": 0.01, + "uid_null_rate": 0.0, + "is_active_null_rate": 0.0, + } + if "distinct_src_count" in q: + return {"distinct_src_count": 900, "distinct_dst_count": 950} + if "labeled" in q: + return {"total": 1000, "labeled": 800, "coverage": 0.8} + if "label" in q and "count" in q: + return {"label": 0, "count": 500} + # Fallback: one zero-valued scalar + return {"count": 0} + + +def _default_rows_for_query(query: str) -> list[dict[str, Any]]: + q = query.lower() + if "order by degree desc" in q: + # Top-K hubs query returns multiple rows + return [ + {"node_id": "u1", "degree": 500}, + {"node_id": "u2", "degree": 400}, + ] + if "group by " in q and "label" in q and "order by count" in q: + return [{"label": 0, "count": 600}, {"label": 1, "count": 400}] + return [_default_row_for_query(query)] + + +@patch("gigl.analytics.data_analyzer.graph_structure_analyzer.BqUtils") +class GraphStructureAnalyzerTest(TestCase): + def test_tier1_passes_when_no_violations(self, mock_bq_cls: MagicMock) -> None: + """With zero dangling, zero duplicates, zero referential violations, Tier 1 passes.""" + mock_bq = mock_bq_cls.return_value + mock_bq.run_query.side_effect = lambda query, labels=None: _mock_row_iterator( + _default_rows_for_query(query) + ) + analyzer = GraphStructureAnalyzer() + result = analyzer.analyze(_make_config()) + self.assertIsNotNone(result) + self.assertEqual(result.dangling_edge_counts["follows"], 0) + self.assertEqual(result.duplicate_node_counts["user"], 0) + self.assertEqual(result.node_counts["user"], 1000) + + def test_dangling_edges_raises(self, mock_bq_cls: MagicMock) -> None: + """If dangling edge query returns > 0, DataQualityError is raised.""" + mock_bq = mock_bq_cls.return_value + + def _side_effect(query: str, labels: Optional[dict] = None) -> MagicMock: + if "dangling_count" in query: + return _mock_row_iterator([{"dangling_count": 42}]) + return _mock_row_iterator(_default_rows_for_query(query)) + + mock_bq.run_query.side_effect = _side_effect + analyzer = GraphStructureAnalyzer() + with self.assertRaises(DataQualityError) as ctx: + analyzer.analyze(_make_config()) + self.assertEqual( + ctx.exception.partial_result.dangling_edge_counts["follows"], 42 + ) + + def test_duplicate_nodes_raises(self, mock_bq_cls: MagicMock) -> None: + """If duplicate node query returns > 0, DataQualityError is raised.""" + mock_bq = mock_bq_cls.return_value + + def _side_effect(query: str, labels: Optional[dict] = None) -> MagicMock: + q = query.lower() + # The duplicate_node query groups on id_column with HAVING COUNT(*) > 1. 
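+            # Matching "uid" too keeps the duplicate-EDGE query (same HAVING
+            # clause, but grouped on src/dst) returning the default of zero.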
+ if "duplicate_count" in q and "having count(*) > 1" in q and "uid" in q: + return _mock_row_iterator([{"duplicate_count": 5}]) + return _mock_row_iterator(_default_rows_for_query(query)) + + mock_bq.run_query.side_effect = _side_effect + analyzer = GraphStructureAnalyzer() + with self.assertRaises(DataQualityError): + analyzer.analyze(_make_config()) + + def test_tier3_skipped_without_label(self, mock_bq_cls: MagicMock) -> None: + """Without label_column, class_imbalance and label_coverage dicts are empty.""" + mock_bq = mock_bq_cls.return_value + mock_bq.run_query.side_effect = lambda query, labels=None: _mock_row_iterator( + _default_rows_for_query(query) + ) + analyzer = GraphStructureAnalyzer() + result = analyzer.analyze(_make_config(label_column=None)) + self.assertEqual(result.class_imbalance, {}) + self.assertEqual(result.label_coverage, {}) + + def test_tier3_populated_with_label(self, mock_bq_cls: MagicMock) -> None: + """With label_column, class_imbalance and label_coverage are populated.""" + mock_bq = mock_bq_cls.return_value + mock_bq.run_query.side_effect = lambda query, labels=None: _mock_row_iterator( + _default_rows_for_query(query) + ) + analyzer = GraphStructureAnalyzer() + result = analyzer.analyze(_make_config(label_column="is_active")) + self.assertIn("user", result.class_imbalance) + self.assertIn("user", result.label_coverage) + self.assertAlmostEqual(result.label_coverage["user"], 0.8) + + def test_tier4_skipped_when_flag_false(self, mock_bq_cls: MagicMock) -> None: + """Without compute_reciprocity flag, reciprocity dict is empty.""" + mock_bq = mock_bq_cls.return_value + mock_bq.run_query.side_effect = lambda query, labels=None: _mock_row_iterator( + _default_rows_for_query(query) + ) + analyzer = GraphStructureAnalyzer() + result = analyzer.analyze(_make_config(compute_reciprocity=False)) + self.assertEqual(result.reciprocity, {}) + + def test_feature_memory_budget_computed(self, mock_bq_cls: MagicMock) -> None: + """feature_memory_bytes is computed from schema metadata in Python, not a BQ query.""" + mock_bq = mock_bq_cls.return_value + mock_bq.run_query.side_effect = lambda query, labels=None: _mock_row_iterator( + _default_rows_for_query(query) + ) + analyzer = GraphStructureAnalyzer() + result = analyzer.analyze(_make_config()) + self.assertIn("user", result.feature_memory_bytes) + # 1000 nodes * 2 features * 8 bytes/float64 = 16000 + self.assertEqual(result.feature_memory_bytes["user"], 1000 * 2 * 8) + + def test_neighbor_explosion_populated(self, mock_bq_cls: MagicMock) -> None: + """With fan_out=[15,10] and avg degree 5, explosion estimate = 15*10*5.""" + mock_bq = mock_bq_cls.return_value + mock_bq.run_query.side_effect = lambda query, labels=None: _mock_row_iterator( + _default_rows_for_query(query) + ) + analyzer = GraphStructureAnalyzer() + result = analyzer.analyze(_make_config()) + self.assertIn("follows", result.neighbor_explosion_estimate) + self.assertGreater(result.neighbor_explosion_estimate["follows"], 0) + + def test_edge_type_distribution_populated_for_multiple_edges( + self, mock_bq_cls: MagicMock + ) -> None: + """edge_type_distribution is populated when there are multiple edge types.""" + mock_bq = mock_bq_cls.return_value + mock_bq.run_query.side_effect = lambda query, labels=None: _mock_row_iterator( + _default_rows_for_query(query) + ) + analyzer = GraphStructureAnalyzer() + result = analyzer.analyze(_make_config(extra_edge=True)) + self.assertIn("follows", result.edge_type_distribution) + self.assertIn("likes", 
result.edge_type_distribution) From 793190c90ea0b3e0e7f30b42ffa33289c3b46369 Mon Sep 17 00:00:00 2001 From: svij Date: Fri, 17 Apr 2026 22:12:19 +0000 Subject: [PATCH 07/20] style(analytics): apply black formatter to test files Co-Authored-By: shubhamvij --- .../unit/analytics/data_analyzer/config_test.py | 10 +++++----- .../unit/analytics/data_analyzer/queries_test.py | 16 ++++------------ 2 files changed, 9 insertions(+), 17 deletions(-) diff --git a/tests/unit/analytics/data_analyzer/config_test.py b/tests/unit/analytics/data_analyzer/config_test.py index a095a11c8..5057462a1 100644 --- a/tests/unit/analytics/data_analyzer/config_test.py +++ b/tests/unit/analytics/data_analyzer/config_test.py @@ -2,14 +2,14 @@ from omegaconf import OmegaConf -from gigl.analytics.data_analyzer.config import ( - DataAnalyzerConfig, - load_analyzer_config, -) +from gigl.analytics.data_analyzer.config import DataAnalyzerConfig, load_analyzer_config from tests.test_assets.test_case import TestCase SAMPLE_CONFIG_PATH = ( - Path(__file__).parents[3] / "test_assets" / "analytics" / "sample_analyzer_config.yaml" + Path(__file__).parents[3] + / "test_assets" + / "analytics" + / "sample_analyzer_config.yaml" ) diff --git a/tests/unit/analytics/data_analyzer/queries_test.py b/tests/unit/analytics/data_analyzer/queries_test.py index 9063cd4e4..db6389329 100644 --- a/tests/unit/analytics/data_analyzer/queries_test.py +++ b/tests/unit/analytics/data_analyzer/queries_test.py @@ -57,18 +57,14 @@ def test_contains_group_by_having(self) -> None: class DegreeDistributionQueryTest(TestCase): def test_contains_approx_quantiles(self) -> None: - sql = DEGREE_DISTRIBUTION_QUERY.format( - table=EDGE_TABLE, id_column="src_uid" - ) + sql = DEGREE_DISTRIBUTION_QUERY.format(table=EDGE_TABLE, id_column="src_uid") self.assertIn("APPROX_QUANTILES", sql) self.assertIn("src_uid", sql) class DegreeBucketQueryTest(TestCase): def test_contains_countif_buckets(self) -> None: - sql = DEGREE_BUCKET_QUERY.format( - table=EDGE_TABLE, id_column="src_uid" - ) + sql = DEGREE_BUCKET_QUERY.format(table=EDGE_TABLE, id_column="src_uid") self.assertIn("COUNTIF", sql) self.assertIn("src_uid", sql) @@ -87,17 +83,13 @@ def test_batches_multiple_columns(self) -> None: class SuperHubInt16ClampQueryTest(TestCase): def test_contains_32767_threshold(self) -> None: - sql = SUPER_HUB_INT16_CLAMP_QUERY.format( - table=EDGE_TABLE, id_column="src_uid" - ) + sql = SUPER_HUB_INT16_CLAMP_QUERY.format(table=EDGE_TABLE, id_column="src_uid") self.assertIn("32767", sql) class TopKHubsQueryTest(TestCase): def test_contains_limit(self) -> None: - sql = TOP_K_HUBS_QUERY.format( - table=EDGE_TABLE, id_column="src_uid", k=20 - ) + sql = TOP_K_HUBS_QUERY.format(table=EDGE_TABLE, id_column="src_uid", k=20) self.assertIn("LIMIT 20", sql) self.assertIn("ORDER BY", sql) self.assertIn("DESC", sql) From 0b01b5cd8b6505ea968f13362483040ce924be6b Mon Sep 17 00:00:00 2001 From: svij Date: Fri, 17 Apr 2026 22:16:12 +0000 Subject: [PATCH 08/20] feat(analytics): add report SPEC.md and initial AI-owned HTML/JS/CSS assets Co-Authored-By: shubhamvij --- gigl/analytics/data_analyzer/report/SPEC.md | 138 ++++ .../data_analyzer/report/charts.ai.js | 610 ++++++++++++++++++ .../data_analyzer/report/report.ai.html | 69 ++ .../data_analyzer/report/styles.ai.css | 173 +++++ 4 files changed, 990 insertions(+) create mode 100644 gigl/analytics/data_analyzer/report/SPEC.md create mode 100644 gigl/analytics/data_analyzer/report/charts.ai.js create mode 100644 
gigl/analytics/data_analyzer/report/report.ai.html create mode 100644 gigl/analytics/data_analyzer/report/styles.ai.css diff --git a/gigl/analytics/data_analyzer/report/SPEC.md b/gigl/analytics/data_analyzer/report/SPEC.md new file mode 100644 index 000000000..0eecb3d73 --- /dev/null +++ b/gigl/analytics/data_analyzer/report/SPEC.md @@ -0,0 +1,138 @@ +# Report Generator SPEC + +## Purpose + +This SPEC defines the single self-contained HTML report that the BQ Data Analyzer +produces for a graph dataset. The three `.ai.{html,js,css}` files in this +directory implement the SPEC and should be regenerated from it whenever the SPEC +changes. The Python `report_generator.py` module is the only non-AI-owned +component in this directory; it loads the AI assets via `importlib.resources`, +injects data from a `GraphAnalysisResult` dataclass, and writes a single HTML +file to disk. + +## Constraints + +- Single self-contained HTML file. No external CDN, no external JS/CSS/font + dependencies, no network requests at view time. +- Opens in any modern browser (Chrome, Firefox, Safari, Edge) without a server. +- Max-width 1200px, centered horizontally. +- Light background (`#f8f9fa`). +- Monospace font (`ui-monospace`, `SFMono-Regular`, `Menlo`, `monospace`) for all + numeric data values; sans-serif (`system-ui`, `-apple-system`, + `"Segoe UI"`, `Roboto`, sans-serif) for labels and headings. +- Collapsible sections use `
<details>` / `<summary>` (no JS required to
+  expand/collapse).
+- Color coding for status uses these exact values:
+  - Green: `#28a745` (OK)
+  - Yellow: `#ffc107` (warning)
+  - Red: `#dc3545` (critical)
+- Total report HTML should be reasonable in size (a single dataset's report
+  with embedded FACETS iframes may be multi-MB; that is acceptable).
+
+## Sections (in display order)
+
+1. **Header** (`
<section id="header">`) — "GiGL Data Analysis Report" title,
+   generation timestamp, and a short config summary listing the analyzed
+   node tables and edge tables.
+2. **Overview Dashboard** (`
<section id="overview">`) — Card grid showing total
+   nodes, total edges, number of node types, number of edge types, and an
+   overall traffic-light status indicator (green/yellow/red). The status is the
+   worst severity across all detected issues.
+3. **Data Quality** (`
<section id="data-quality">`) — Per-table NULL rates table
+   sorted highest-first with rows color-coded (NULL rate > 50% = yellow,
+   > 90% = red). Duplicate node counts, duplicate edge counts, dangling edge
+   counts, and referential integrity violations. Any nonzero count in these
+   four is rendered red.
+4. **Feature Statistics** (`
<section id="feature-statistics">`) — Optional. One
+   subsection per table with the corresponding FACETS HTML embedded inside an
+   `<iframe>`.
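+
+A minimal sketch of the injection flow described in Purpose (illustrative
+only: `report_generator.py` is added in a later patch, and the
+`__DATA_JSON__` placeholder token here is an assumption, not a defined
+contract):
+
+```python
+import json
+from dataclasses import asdict
+from importlib import resources
+
+from gigl.analytics.data_analyzer.types import GraphAnalysisResult
+
+
+def render_report(result: GraphAnalysisResult) -> str:
+    # Load the AI-owned template bundled with this package.
+    pkg = resources.files("gigl.analytics.data_analyzer.report")
+    template = pkg.joinpath("report.ai.html").read_text(encoding="utf-8")
+    # Inline the analysis payload so the report stays self-contained.
+    return template.replace("__DATA_JSON__", json.dumps(asdict(result)))
+```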