diff --git a/Makefile b/Makefile
index 93ab75ffc..dab742500 100644
--- a/Makefile
+++ b/Makefile
@@ -270,6 +270,14 @@ run_hom_cora_sup_gs_e2e_test:
 		--test_spec_uri="tests/e2e_tests/e2e_tests.yaml" \
 		--test_names="hom_cora_sup_gs_test"
 
+run_hom_cora_sup_gs_ppr_e2e_test: compiled_pipeline_path:=${GIGL_E2E_TEST_COMPILED_PIPELINE_PATH}
+run_hom_cora_sup_gs_ppr_e2e_test: compile_gigl_kubeflow_pipeline
+run_hom_cora_sup_gs_ppr_e2e_test:
+	uv run python tests/e2e_tests/e2e_test.py \
+		--compiled_pipeline_path=$(compiled_pipeline_path) \
+		--test_spec_uri="tests/e2e_tests/e2e_tests.yaml" \
+		--test_names="hom_cora_sup_gs_ppr_test"
+
 run_het_dblp_sup_gs_e2e_test: compiled_pipeline_path:=${GIGL_E2E_TEST_COMPILED_PIPELINE_PATH}
 run_het_dblp_sup_gs_e2e_test: compile_gigl_kubeflow_pipeline
 run_het_dblp_sup_gs_e2e_test:
diff --git a/examples/link_prediction/graph_store/configs/e2e_hom_cora_sup_gs_ppr_task_config.yaml b/examples/link_prediction/graph_store/configs/e2e_hom_cora_sup_gs_ppr_task_config.yaml
new file mode 100644
index 000000000..46c508819
--- /dev/null
+++ b/examples/link_prediction/graph_store/configs/e2e_hom_cora_sup_gs_ppr_task_config.yaml
@@ -0,0 +1,75 @@
+# This config runs homogeneous CORA supervised training and inference in Graph Store mode
+# with PPR sampling. It intentionally reuses the standard graph-store training/inference
+# entrypoints, changing only the sampler args and keeping the loop short for E2E coverage.
+graphMetadata:
+  edgeTypes:
+  - dstNodeType: paper
+    relation: cites
+    srcNodeType: paper
+  nodeTypes:
+  - paper
+datasetConfig:
+  dataPreprocessorConfig:
+    dataPreprocessorConfigClsPath: gigl.src.mocking.mocking_assets.passthrough_preprocessor_config_for_mocked_assets.PassthroughPreprocessorConfigForMockedAssets
+    dataPreprocessorArgs:
+      mocked_dataset_name: 'cora_homogeneous_node_anchor_edge_features_user_defined_labels'
+trainerConfig:
+  trainerArgs:
+    log_every_n_batch: "1"
+    num_neighbors: "[10, 10]"
+    sampler_type: "ppr"
+    ppr_alpha: "0.5"
+    ppr_eps: "0.0001"
+    ppr_max_nodes: "20"
+    ppr_neighbors_per_hop: "100"
+    ppr_max_fetch_iterations: "2"
+    sampling_workers_per_process: "2"
+    main_batch_size: "8"
+    random_batch_size: "8"
+    num_max_train_batches: "4"
+    num_val_batches: "4"
+    val_every_n_batch: "1"
+  command: python -m examples.link_prediction.graph_store.homogeneous_training
+  graphStoreStorageConfig:
+    command: python -m examples.link_prediction.graph_store.storage_main
+    storageArgs:
+      sample_edge_direction: "in"
+      splitter_cls_path: "gigl.utils.data_splitters.DistNodeAnchorLinkSplitter"
+      splitter_kwargs: >-
+        {
+          "sampling_direction": "in",
+          "should_convert_labels_to_edges": True,
+          "num_val": 0.25,
+          "num_test": 0.25
+        }
+      num_server_sessions: "1"
+inferencerConfig:
+  inferencerArgs:
+    log_every_n_batch: "1"
+    num_neighbors: "[10, 10]"
+    sampler_type: "ppr"
+    ppr_alpha: "0.5"
+    ppr_eps: "0.0001"
+    ppr_max_nodes: "20"
+    ppr_neighbors_per_hop: "100"
+    ppr_max_fetch_iterations: "2"
+    sampling_workers_per_inference_process: "2"
+  inferenceBatchSize: 256
+  command: python -m examples.link_prediction.graph_store.homogeneous_inference
+  graphStoreStorageConfig:
+    command: python -m examples.link_prediction.graph_store.storage_main
+    storageArgs:
+      sample_edge_direction: "in"
+      num_server_sessions: "1"
+sharedConfig:
+  shouldSkipInference: false
+  shouldSkipModelEvaluation: true
+taskMetadata:
+  nodeAnchorBasedLinkPredictionTaskMetadata:
+    supervisionEdgeTypes:
+    - dstNodeType: paper
+      relation: cites
+      srcNodeType: paper
+featureFlags:
+  should_run_glt_backend: 'True'
+  data_preprocessor_num_shards: '2'
diff --git a/examples/link_prediction/graph_store/homogeneous_inference.py b/examples/link_prediction/graph_store/homogeneous_inference.py
index 34bc2672e..5faa84b72 100644
--- a/examples/link_prediction/graph_store/homogeneous_inference.py
+++ b/examples/link_prediction/graph_store/homogeneous_inference.py
@@ -87,7 +87,7 @@
 import sys
 import time
 from dataclasses import dataclass
-from typing import Union
+from typing import Optional, Union
 
 import torch
 import torch.multiprocessing as mp
@@ -101,6 +101,7 @@
 from gigl.common.utils.gcs import GcsUtils
 from gigl.distributed.graph_store.compute import init_compute_process
 from gigl.distributed.graph_store.remote_dist_dataset import RemoteDistDataset
+from gigl.distributed.sampler_options import SamplerOptions
 from gigl.distributed.utils import get_graph_store_info
 from gigl.env.distributed import GraphStoreInfo
 from gigl.nn import LinkPredictionGNN
@@ -110,16 +111,10 @@
 from gigl.src.common.utils.bq import BqUtils
 from gigl.src.common.utils.model import load_state_dict_from_uri
 from gigl.src.inference.lib.assets import InferenceAssets
-from gigl.utils.sampling import parse_fanout
+from gigl.utils.sampling import parse_fanout, parse_sampler_options
 
 logger = Logger()
 
-# Default number of inference processes per machine incase one isnt provided in inference args
-# i.e. `local_world_size` is not provided, and we can't infer automatically.
-# If there are GPUs attached to the machine, we automatically infer to setting
-# LOCAL_WORLD_SIZE == # of gpus on the machine.
-DEFAULT_CPU_BASED_LOCAL_WORLD_SIZE = 4
-
 
 @dataclass(frozen=True)
 class InferenceProcessArgs:
@@ -143,6 +138,7 @@ class InferenceProcessArgs:
         inference_batch_size (int): Batch size to use for inference.
         num_neighbors (Union[list[int], dict[EdgeType, list[int]]]): Fanout for subgraph sampling,
             where the ith item corresponds to the number of items to sample for the ith hop.
+        sampler_options (Optional[SamplerOptions]): Sampler variant. None uses k-hop sampling.
         sampling_workers_per_inference_process (int): Number of sampling workers per inference
             process.
         sampling_worker_shared_channel_size (str): Shared-memory buffer size (bytes) allocated for
@@ -169,6 +165,7 @@ class InferenceProcessArgs:
     # Inference configuration
     inference_batch_size: int
     num_neighbors: Union[list[int], dict[EdgeType, list[int]]]
+    sampler_options: Optional[SamplerOptions]
     sampling_workers_per_inference_process: int
     sampling_worker_shared_channel_size: str
     log_every_n_batch: int
@@ -242,6 +239,7 @@ def _inference_process(
         # For large-scale settings, consider setting this field to 30-60 seconds to ensure dataloaders
         # don't compete for memory during initialization, causing OOM
         process_start_gap_seconds=0,
+        sampler_options=args.sampler_options,
     )
     # Initialize a LinkPredictionGNN model and load parameters from
     # the saved model.
@@ -455,25 +453,23 @@ def _run_example_inference(
     if arg_local_world_size is not None:
         local_world_size = int(arg_local_world_size)
         logger.info(f"Using local_world_size from inferencer_args: {local_world_size}")
-        if torch.cuda.is_available() and local_world_size != torch.cuda.device_count():
-            logger.warning(
-                f"local_world_size {local_world_size} does not match the number of GPUs {torch.cuda.device_count()}. "
-                "This may lead to unexpected failures with NCCL communication incase GPUs are being used for "
-                + "training/inference. Consider setting local_world_size to the number of GPUs."
-            )
     else:
-        if torch.cuda.is_available() and torch.cuda.device_count() > 0:
-            # If GPUs are available, we set the local_world_size to the number of GPUs
-            local_world_size = torch.cuda.device_count()
-            logger.info(
-                f"Detected {local_world_size} GPUs. Thus, setting local_world_size to {local_world_size}"
-            )
-        else:
-            # If no GPUs are available, we set the local_world_size to the number of inference processes per machine
-            logger.info(
-                f"No GPUs detected. Thus, setting local_world_size to `{DEFAULT_CPU_BASED_LOCAL_WORLD_SIZE}`"
-            )
-            local_world_size = DEFAULT_CPU_BASED_LOCAL_WORLD_SIZE
+        local_world_size = cluster_info.num_processes_per_compute
+        logger.info(
+            f"Using local_world_size from cluster_info.num_processes_per_compute: {local_world_size}"
+        )
+    if local_world_size != cluster_info.num_processes_per_compute:
+        raise ValueError(
+            f"Graph Store local_world_size={local_world_size} must match "
+            f"cluster_info.num_processes_per_compute="
+            f"{cluster_info.num_processes_per_compute}"
+        )
+    if torch.cuda.is_available() and local_world_size != torch.cuda.device_count():
+        logger.warning(
+            f"local_world_size {local_world_size} does not match the number of GPUs {torch.cuda.device_count()}. "
+            "This may lead to unexpected failures with NCCL communication incase GPUs are being used for "
+            + "training/inference. Consider setting local_world_size to the number of GPUs."
+        )
 
     if cluster_info.compute_node_rank == 0:
         gcs_utils = GcsUtils()
@@ -494,6 +490,7 @@ def _run_example_inference(
     # Parses the fanout as a string. For the homogeneous case, the fanouts should be specified
     # as a string of a list of integers, such as "[10, 10]".
     num_neighbors = parse_fanout(inferencer_args.get("num_neighbors", "[10, 10]"))
+    sampler_options = parse_sampler_options(inferencer_args)
 
     # While the ideal value for `sampling_workers_per_inference_process` has been identified to be
     # between `2` and `4`, this may need some tuning depending on the pipeline. We default this
@@ -516,6 +513,14 @@ def _run_example_inference(
 
     log_every_n_batch = int(inferencer_args.get("log_every_n_batch", "50"))
 
+    logger.info(
+        f"Got inference args local_world_size={local_world_size}, "
+        f"num_neighbors={num_neighbors}, sampler_options={sampler_options}, "
+        f"sampling_workers_per_inference_process={sampling_workers_per_inference_process}, "
+        f"sampling_worker_shared_channel_size={sampling_worker_shared_channel_size}, "
+        f"log_every_n_batch={log_every_n_batch}"
+    )
+
     # When using mp.spawn with `nprocs`, the first argument is implicitly set to be the process number on the current machine.
     inference_args = InferenceProcessArgs(
         local_world_size=local_world_size,
@@ -528,6 +533,7 @@ def _run_example_inference(
         edge_feature_dim=edge_feature_dim,
         inference_batch_size=inference_batch_size,
         num_neighbors=num_neighbors,
+        sampler_options=sampler_options,
         sampling_workers_per_inference_process=sampling_workers_per_inference_process,
         sampling_worker_shared_channel_size=sampling_worker_shared_channel_size,
         log_every_n_batch=log_every_n_batch,
diff --git a/examples/link_prediction/graph_store/homogeneous_training.py b/examples/link_prediction/graph_store/homogeneous_training.py
index 04340f99a..c7ae356cc 100644
--- a/examples/link_prediction/graph_store/homogeneous_training.py
+++ b/examples/link_prediction/graph_store/homogeneous_training.py
@@ -143,6 +143,7 @@
     shutdown_compute_process,
 )
 from gigl.distributed.graph_store.remote_dist_dataset import RemoteDistDataset
+from gigl.distributed.sampler_options import SamplerOptions
 from gigl.distributed.utils import get_available_device, get_graph_store_info
 from gigl.env.distributed import GraphStoreInfo
 from gigl.nn import LinkPredictionGNN, RetrievalLoss
@@ -158,7 +159,7 @@
 from gigl.src.common.types.pb_wrappers.gbml_config import GbmlConfigPbWrapper
 from gigl.src.common.utils.model import load_state_dict_from_uri, save_state_dict
 from gigl.utils.iterator import InfiniteIterator
-from gigl.utils.sampling import parse_fanout
+from gigl.utils.sampling import parse_fanout, parse_sampler_options
 
 logger = Logger()
 
@@ -191,6 +192,7 @@ def _setup_dataloaders(
     split: Literal["train", "val", "test"],
     cluster_info: GraphStoreInfo,
     num_neighbors: list[int] | dict[EdgeType, list[int]],
+    sampler_options: Optional[SamplerOptions],
     sampling_workers_per_process: int,
     main_batch_size: int,
     random_batch_size: int,
@@ -205,6 +207,7 @@ def _setup_dataloaders(
         split (Literal["train", "val", "test"]): The current split which we are loading data for.
         cluster_info (GraphStoreInfo): Cluster topology info for graph store mode.
         num_neighbors: Fanout for subgraph sampling.
+        sampler_options (Optional[SamplerOptions]): Sampler variant. None uses k-hop sampling.
         sampling_workers_per_process (int): Number of sampling workers per training/testing process.
         main_batch_size (int): Batch size for main dataloader with query and labeled nodes.
         random_batch_size (int): Batch size for random negative dataloader.
@@ -240,6 +243,7 @@ def _setup_dataloaders(
         channel_size=sampling_worker_shared_channel_size,
         process_start_gap_seconds=process_start_gap_seconds,
         shuffle=shuffle,
+        sampler_options=sampler_options,
     )
 
     logger.info(f"---Rank {rank} finished setting up main loader for split={split}")
@@ -266,6 +270,7 @@ def _setup_dataloaders(
         channel_size=sampling_worker_shared_channel_size,
         process_start_gap_seconds=process_start_gap_seconds,
         shuffle=shuffle,
+        sampler_options=sampler_options,
     )
 
     logger.info(
@@ -375,6 +380,7 @@ class TrainingProcessArgs:
         sampling_workers_per_process (int): Number of sampling workers per training/testing process.
         sampling_worker_shared_channel_size (str): Shared-memory buffer size for the channel during sampling.
         process_start_gap_seconds (int): Time to sleep between dataloader initializations.
+        sampler_options (Optional[SamplerOptions]): Sampler variant. None uses k-hop sampling.
         main_batch_size (int): Batch size for main dataloader.
         random_batch_size (int): Batch size for random negative dataloader.
         learning_rate (float): Learning rate for the optimizer.
@@ -400,6 +406,7 @@ class TrainingProcessArgs:
 
     # Sampling config
     num_neighbors: list[int] | dict[EdgeType, list[int]]
+    sampler_options: Optional[SamplerOptions]
     sampling_workers_per_process: int
     sampling_worker_shared_channel_size: str
     process_start_gap_seconds: int
@@ -463,6 +470,7 @@ def _training_process(
             split="train",
             cluster_info=args.cluster_info,
             num_neighbors=args.num_neighbors,
+            sampler_options=args.sampler_options,
             sampling_workers_per_process=args.sampling_workers_per_process,
             main_batch_size=args.main_batch_size,
             random_batch_size=args.random_batch_size,
@@ -481,6 +489,7 @@ def _training_process(
             split="val",
             cluster_info=args.cluster_info,
             num_neighbors=args.num_neighbors,
+            sampler_options=args.sampler_options,
             sampling_workers_per_process=args.sampling_workers_per_process,
             main_batch_size=args.main_batch_size,
             random_batch_size=args.random_batch_size,
@@ -637,6 +646,7 @@ def _training_process(
         split="test",
         cluster_info=args.cluster_info,
         num_neighbors=args.num_neighbors,
+        sampler_options=args.sampler_options,
         sampling_workers_per_process=args.sampling_workers_per_process,
         main_batch_size=args.main_batch_size,
         random_batch_size=args.random_batch_size,
@@ -837,13 +847,17 @@ def _run_example_training(
     # Training Hyperparameters
     trainer_args = dict(gbml_config_pb_wrapper.trainer_config.trainer_args)
 
-    if torch.cuda.is_available():
-        default_local_world_size = torch.cuda.device_count()
-    else:
-        default_local_world_size = 2
     local_world_size = int(
-        trainer_args.get("local_world_size", str(default_local_world_size))
+        trainer_args.get(
+            "local_world_size", str(cluster_info.num_processes_per_compute)
+        )
     )
+    if local_world_size != cluster_info.num_processes_per_compute:
+        raise ValueError(
+            f"Graph Store local_world_size={local_world_size} must match "
+            f"cluster_info.num_processes_per_compute="
+            f"{cluster_info.num_processes_per_compute}"
+        )
 
     if torch.cuda.is_available():
         if local_world_size > torch.cuda.device_count():
@@ -853,6 +867,7 @@ def _run_example_training(
 
     fanout = trainer_args.get("num_neighbors", "[10, 10]")
     num_neighbors = parse_fanout(fanout)
+    sampler_options = parse_sampler_options(trainer_args)
 
     sampling_workers_per_process: int = int(
         trainer_args.get("sampling_workers_per_process", "4")
@@ -880,6 +895,7 @@ def _run_example_training(
     logger.info(
         f"Got training args local_world_size={local_world_size}, \
         num_neighbors={num_neighbors}, \
+        sampler_options={sampler_options}, \
         sampling_workers_per_process={sampling_workers_per_process}, \
         main_batch_size={main_batch_size}, \
         random_batch_size={random_batch_size}, \
@@ -931,6 +947,7 @@ def _run_example_training(
         node_feature_dim=node_feature_dim,
         edge_feature_dim=edge_feature_dim,
         num_neighbors=num_neighbors,
+        sampler_options=sampler_options,
         sampling_workers_per_process=sampling_workers_per_process,
         sampling_worker_shared_channel_size=sampling_worker_shared_channel_size,
         process_start_gap_seconds=process_start_gap_seconds,
diff --git a/gigl/distributed/base_dist_loader.py b/gigl/distributed/base_dist_loader.py
index d993d83ca..b8f6f2c87 100644
--- a/gigl/distributed/base_dist_loader.py
+++ b/gigl/distributed/base_dist_loader.py
@@ -466,16 +466,6 @@ def create_mp_producer(
         channel = BaseDistLoader.create_colocated_channel(worker_options)
         if isinstance(sampler_options, PPRSamplerOptions):
             degree_tensors = dataset.degree_tensor
-            if isinstance(degree_tensors, dict):
-                logger.info(
-                    f"Pre-computed degree tensors for PPR sampling across "
-                    f"{len(degree_tensors)} edge types."
-                )
-            else:
-                logger.info(
-                    f"Pre-computed degree tensor for PPR sampling with "
-                    f"{degree_tensors.size(0)} nodes."
-                )
         else:
             degree_tensors = None
         return DistSamplingProducer(
diff --git a/gigl/distributed/base_sampler.py b/gigl/distributed/base_sampler.py
index 986ba5d58..e8e6f9e77 100644
--- a/gigl/distributed/base_sampler.py
+++ b/gigl/distributed/base_sampler.py
@@ -1,3 +1,4 @@
+import logging
 from collections import defaultdict
 from dataclasses import dataclass
 from typing import Optional, Union
@@ -213,11 +214,15 @@ async def _send_adapter(
         Copied from ``graphlearn_torch.distributed.DistNeighborSampler._send_adapter``
         (GLT 0.2.4) with the single change of ``_colloate_fn`` → ``_collate_fn``.
         """
-        sampler_output = await async_func(*args, **kwargs)
-        res = await self._collate_fn(sampler_output)
-        if self.channel is None:
-            return res
-        self.channel.send(res)
+        try:
+            sampler_output = await async_func(*args, **kwargs)
+            res = await self._collate_fn(sampler_output)
+            if self.channel is None:
+                return res
+            self.channel.send(res)
+        except Exception:
+            logging.exception("sampler task failed")
+            raise
         return None
 
     async def _collate_fn(
diff --git a/gigl/distributed/dist_dataset.py b/gigl/distributed/dist_dataset.py
index 6e124b34f..31c97410a 100644
--- a/gigl/distributed/dist_dataset.py
+++ b/gigl/distributed/dist_dataset.py
@@ -80,9 +80,7 @@ def __init__(
         edge_feature_info: Optional[
             Union[FeatureInfo, dict[EdgeType, FeatureInfo]]
         ] = None,
-        degree_tensor: Optional[
-            Union[torch.Tensor, dict[EdgeType, torch.Tensor]]
-        ] = None,
+        degree_tensor: Optional[dict[NodeType, torch.Tensor]] = None,
         max_labels_per_anchor_node: Optional[int] = None,
         edge_weights: Optional[
             Union[torch.Tensor, dict[EdgeType, torch.Tensor]]
@@ -111,7 +109,7 @@ def __init__(
                 Note this will be None in the homogeneous case if the data has no node features, or will only contain node types with node features in the heterogeneous case.
             edge_feature_info: Optional[Union[FeatureInfo, dict[EdgeType, FeatureInfo]]]: Dimension of edge features and its data type, will be a dict if heterogeneous.
                 Note this will be None in the homogeneous case if the data has no edge features, or will only contain edge types with edge features in the heterogeneous case.
-            degree_tensor: Optional[Union[torch.Tensor, dict[EdgeType, torch.Tensor]]]: Pre-computed degree tensor. Lazily computed on first access via the degree_tensor property.
+            degree_tensor: Optional[dict[NodeType, torch.Tensor]]: Optional pre-computed total-degree tensors keyed by node type. If not provided, they are lazily computed on first access via the degree_tensor property.
             max_labels_per_anchor_node (Optional[int]): Optional cap for how many
                 labels to materialize per anchor node for ABLP label fetching.
             edge_weights: Per-edge sampling weights for this rank's partition.
@@ -151,9 +149,7 @@ def __init__(
         self._node_feature_info = node_feature_info
         self._edge_feature_info = edge_feature_info
 
-        self._degree_tensor: Optional[
-            Union[torch.Tensor, dict[EdgeType, torch.Tensor]]
-        ] = degree_tensor
+        self._degree_tensor: Optional[dict[NodeType, torch.Tensor]] = degree_tensor
         self._max_labels_per_anchor_node = max_labels_per_anchor_node
         self._edge_weights: Optional[
             Union[torch.Tensor, dict[EdgeType, torch.Tensor]]
@@ -315,13 +311,15 @@ def edge_feature_info(
     @property
     def degree_tensor(
         self,
-    ) -> Union[torch.Tensor, dict[EdgeType, torch.Tensor]]:
+    ) -> dict[NodeType, torch.Tensor]:
         """
-        Lazily compute and return the degree tensor for the graph.
+        Lazily compute and return the total degree tensor per node type.
 
         On first access, computes node degrees from the graph partition and uses
-        all-reduce to aggregate across all machines. Requires torch.distributed
-        to be initialized.
+        all-reduce to aggregate across all machines. Degrees are summed across
+        all incident edge types per anchor node type before the all-reduce, so
+        the per-edge-type tensor is never stored. Requires torch.distributed to
+        be initialized.
 
         Over-counting correction (for processes sharing the same data on the same
         machine) is handled automatically by detecting the distributed topology.
@@ -329,9 +327,9 @@ def degree_tensor(
         The result is cached for subsequent accesses.
 
         Returns:
-            Union[torch.Tensor, dict[EdgeType, torch.Tensor]]: The aggregated degree tensor.
-                - For homogeneous graphs: A tensor of shape [num_nodes].
-                - For heterogeneous graphs: A dict mapping EdgeType to degree tensors.
+            dict[NodeType, torch.Tensor]: Total degree tensors keyed by node type.
+                For homogeneous graphs the single entry uses
+                ``DEFAULT_HOMOGENEOUS_NODE_TYPE`` as its key.
 
         Raises:
             RuntimeError: If torch.distributed is not initialized.
@@ -341,7 +339,9 @@ def degree_tensor(
             if self.graph is None:
                 raise ValueError("Dataset graph is None. Cannot compute degrees.")
 
-            self._degree_tensor = compute_and_broadcast_degree_tensor(self.graph)
+            self._degree_tensor = compute_and_broadcast_degree_tensor(
+                self.graph, self._edge_dir
+            )
         return self._degree_tensor
 
     @property
@@ -943,7 +943,7 @@ def share_ipc(
         Optional[Union[int, dict[NodeType, int]]],
         Optional[Union[FeatureInfo, dict[NodeType, FeatureInfo]]],
         Optional[Union[FeatureInfo, dict[EdgeType, FeatureInfo]]],
-        Optional[Union[torch.Tensor, dict[EdgeType, torch.Tensor]]],
+        Optional[dict[NodeType, torch.Tensor]],
         Optional[int],
         Optional[Union[torch.Tensor, dict[EdgeType, torch.Tensor]]],
     ]:
@@ -967,7 +967,7 @@ def share_ipc(
             Optional[Union[int, dict[NodeType, int]]]: Number of test nodes on the current machine. Will be a dict if heterogeneous.
             Optional[Union[FeatureInfo, dict[NodeType, FeatureInfo]]]: Node feature dim and its data type, will be a dict if heterogeneous
             Optional[Union[FeatureInfo, dict[EdgeType, FeatureInfo]]]: Edge feature dim and its data type, will be a dict if heterogeneous
-            Optional[Union[torch.Tensor, dict[EdgeType, torch.Tensor]]]: Degree tensors, will be a dict if heterogeneous
+            Optional[dict[NodeType, torch.Tensor]]: Degree tensors keyed by node type
             Optional[int]: Optional per-anchor label cap for ABLP label fetching
             Optional[Union[torch.Tensor, dict[EdgeType, torch.Tensor]]]: Per-edge sampling weights for this rank's partition
         """
@@ -1256,7 +1256,7 @@ def _rebuild_distributed_dataset(
         Optional[
             Union[FeatureInfo, dict[EdgeType, FeatureInfo]]
         ],  # Edge feature dim and its data type
-        Optional[Union[torch.Tensor, dict[EdgeType, torch.Tensor]]],  # Degree tensors
+        Optional[dict[NodeType, torch.Tensor]],  # Degree tensors
         Optional[int],  # Optional per-anchor label cap for ABLP label fetching
         Optional[Union[torch.Tensor, dict[EdgeType, torch.Tensor]]],  # edge_weights
     ],
diff --git a/gigl/distributed/dist_ppr_sampler.py b/gigl/distributed/dist_ppr_sampler.py
index 402e381c1..80049f305 100644
--- a/gigl/distributed/dist_ppr_sampler.py
+++ b/gigl/distributed/dist_ppr_sampler.py
@@ -17,7 +17,7 @@
 from graphlearn_torch.utils import merge_dict
 
 from gigl.distributed.base_sampler import BaseDistNeighborSampler
-from gigl.types.graph import is_label_edge_type
+from gigl.types.graph import DEFAULT_HOMOGENEOUS_NODE_TYPE, is_label_edge_type
 
 # Trailing "." is an intentional separator.  These constants are used both to
 # write metadata keys (f"{KEY}{repr(edge_type)}" → e.g. "ppr_edge_index.('user', 'to', 'story')")
@@ -26,14 +26,14 @@
 PPR_EDGE_INDEX_METADATA_KEY = "ppr_edge_index."
 PPR_WEIGHT_METADATA_KEY = "ppr_weight."
 
-# Sentinel type names for homogeneous graphs.  The PPR algorithm uses
-# dict[NodeType, ...] internally for both homo and hetero graphs; these
-# sentinels let the homogeneous path reuse the same dict-based code.
-_PPR_HOMOGENEOUS_NODE_TYPE = "ppr_homogeneous_node_type"
+# Sentinel edge type for homogeneous graphs.  The PPR algorithm uses
+# dict[NodeType, ...] internally for both homo and hetero graphs; keying
+# the homogeneous path by DEFAULT_HOMOGENEOUS_NODE_TYPE (with this edge
+# type built from it) lets it reuse the same dict-based code.
 _PPR_HOMOGENEOUS_EDGE_TYPE = (
-    _PPR_HOMOGENEOUS_NODE_TYPE,
+    DEFAULT_HOMOGENEOUS_NODE_TYPE,
     "to",
-    _PPR_HOMOGENEOUS_NODE_TYPE,
+    DEFAULT_HOMOGENEOUS_NODE_TYPE,
 )
 
 
@@ -74,10 +74,10 @@ class DistPPRNeighborSampler(BaseDistNeighborSampler):
              but require more computation. Typical values: 1e-4 to 1e-6.
         max_ppr_nodes: Maximum number of nodes to return per seed based on PPR scores.
         num_neighbors_per_hop: Maximum number of neighbors to fetch per hop.
-        total_degree_dtype: Dtype for precomputed total-degree tensors. Defaults
-            to ``torch.int32``. Use a larger dtype if nodes have exceptionally high
-            aggregate degrees.
-        degree_tensors: Pre-computed degree tensors from the dataset.
+        degree_tensors: Pre-computed total-degree tensors (int32), keyed by NodeType.
+            Must be pre-computed by the caller through
+            ``DistDataset.degree_tensor`` so that workers share a single
+            allocation rather than recomputing per-worker.
     """
 
     def __init__(
@@ -87,8 +87,7 @@ def __init__(
         eps: float = 1e-4,
         max_ppr_nodes: int = 50,
         num_neighbors_per_hop: int = 100_000,
-        total_degree_dtype: torch.dtype = torch.int32,
-        degree_tensors: Union[torch.Tensor, dict[EdgeType, torch.Tensor]],
+        degree_tensors: dict[NodeType, torch.Tensor],
         max_fetch_iterations: Optional[int] = None,
         **kwargs,
     ):
@@ -125,23 +124,16 @@ def __init__(
 
                 self._node_type_to_edge_types[anchor_type].append(etype)
         else:
-            self._node_type_to_edge_types[_PPR_HOMOGENEOUS_NODE_TYPE] = [
+            self._node_type_to_edge_types[DEFAULT_HOMOGENEOUS_NODE_TYPE] = [
                 _PPR_HOMOGENEOUS_EDGE_TYPE
             ]
             self._is_homogeneous = True
 
-        # Precompute total degree per node type: the sum of degrees across all
-        # edge types traversable from that node type.  This is a graph-level
-        # property used on every PPR iteration, so computing it once at init
-        # avoids per-node summation and cache lookups in the hot loop.
-        # TODO (mkolodner-sc): This trades memory for throughput — we
-        # materialize a tensor per node type to avoid recomputing total degree
-        # on every neighbor during sampling.  Computing it here (rather than in
-        # the dataset) also keeps the door open for edge-specific degree
-        # strategies.  If memory becomes a bottleneck, revisit this.
-        self._node_type_to_total_degree: dict[NodeType, torch.Tensor] = (
-            self._build_total_degree_tensors(degree_tensors, total_degree_dtype)
-        )
+        # Total-degree tensors keyed by NodeType, pre-computed by the caller.
+        # Callers compute DistDataset.degree_tensor once in the parent process
+        # and place the result in shared memory so all worker processes map the
+        # same allocation.
+        self._node_type_to_total_degree: dict[NodeType, torch.Tensor] = degree_tensors
 
         # Build integer ID mappings for the C++ forward-push kernel.  String
         # NodeType / EdgeType keys are only used at the Python boundary
@@ -191,58 +183,6 @@ def __init__(
             for nt in all_node_types
         ]
 
-    def _build_total_degree_tensors(
-        self,
-        degree_tensors: Union[torch.Tensor, dict[EdgeType, torch.Tensor]],
-        dtype: torch.dtype,
-    ) -> dict[NodeType, torch.Tensor]:
-        """Build total-degree tensors by summing per-edge-type degrees for each node type.
-
-        For homogeneous graphs, the total degree is just the single degree tensor.
-        For heterogeneous graphs, it sums degree tensors across all edge types
-        traversable from each node type, padding shorter tensors with zeros.
-
-        Args:
-            degree_tensors: Per-edge-type degree tensors from the dataset.
-            dtype: Dtype for the output tensors.
-
-        Returns:
-            Dict mapping node type to a 1-D tensor of total degrees.
-        """
-        result: dict[NodeType, torch.Tensor] = {}
-
-        if self._is_homogeneous:
-            assert isinstance(degree_tensors, torch.Tensor)
-            # Single edge type: degree values fit directly in the target dtype.
-            result[_PPR_HOMOGENEOUS_NODE_TYPE] = degree_tensors.to(dtype)
-        else:
-            assert isinstance(degree_tensors, dict)
-            dtype_max = torch.iinfo(dtype).max
-            for node_type, edge_types in self._node_type_to_edge_types.items():
-                max_len = 0
-                for et in edge_types:
-                    if et not in degree_tensors:
-                        raise ValueError(
-                            f"Edge type {et} not found in degree tensors. "
-                            f"Available: {list(degree_tensors.keys())}"
-                        )
-                    max_len = max(max_len, len(degree_tensors[et]))
-
-                # Each degree tensor is indexed by node ID (derived from CSR
-                # indptr), so index i in every edge type's tensor refers to
-                # the same node.  Element-wise summation gives the total degree
-                # per node across all edge types.  Shorter tensors are padded
-                # implicitly (only the first len(et_degrees) entries are added).
-                # Sum in int64: aggregate degrees are bounded by partition size
-                # and fit comfortably within int64 range in practice.
-                summed = torch.zeros(max_len, dtype=torch.int64)
-                for et in edge_types:
-                    et_degrees = degree_tensors[et]
-                    summed[: len(et_degrees)] += et_degrees.to(torch.int64)
-                result[node_type] = summed.clamp(max=dtype_max).to(dtype)
-
-        return result
-
     def _get_destination_type(self, edge_type: EdgeType) -> NodeType:
         """Get the node type at the destination end of an edge type."""
         return edge_type[0] if self.edge_dir == "in" else edge_type[-1]
@@ -294,8 +234,15 @@ async def _batch_fetch_neighbors(
                 self._sample_one_hop(
                     srcs=nodes_by_etype_id[eid].to(device),
                     num_nbr=self._num_neighbors_per_hop,
-                    # _sample_one_hop expects None for homogeneous graphs, not the PPR sentinel.
-                    etype=None if etype == _PPR_HOMOGENEOUS_EDGE_TYPE else etype,
+                    # _sample_one_hop expects None only for true homogeneous graphs.
+                    # Labeled homogeneous ABLP graphs are hetero-backed because label
+                    # edges are represented as separate edge types, so they still need
+                    # the explicit default edge type here.
+                    etype=(
+                        None
+                        if self._is_homogeneous and etype == _PPR_HOMOGENEOUS_EDGE_TYPE
+                        else etype
+                    ),
                 )
             )
         outputs: list[NeighborOutput] = await asyncio.gather(*sample_tasks)
@@ -362,7 +309,7 @@ async def _compute_ppr_scores(
             valid_counts      = tensor([1,  3,   2,   0])
         """
         if seed_node_type is None:
-            seed_node_type = _PPR_HOMOGENEOUS_NODE_TYPE
+            seed_node_type = DEFAULT_HOMOGENEOUS_NODE_TYPE
         device = seed_nodes.device
 
         ppr_state = PPRForwardPush(
@@ -422,12 +369,12 @@ async def _compute_ppr_scores(
         if self._is_homogeneous:
             assert (
                 len(ntype_to_flat_ids) == 1
-                and _PPR_HOMOGENEOUS_NODE_TYPE in ntype_to_flat_ids
+                and DEFAULT_HOMOGENEOUS_NODE_TYPE in ntype_to_flat_ids
             )
             return (
-                ntype_to_flat_ids[_PPR_HOMOGENEOUS_NODE_TYPE],
-                ntype_to_flat_weights[_PPR_HOMOGENEOUS_NODE_TYPE],
-                ntype_to_valid_counts[_PPR_HOMOGENEOUS_NODE_TYPE],
+                ntype_to_flat_ids[DEFAULT_HOMOGENEOUS_NODE_TYPE],
+                ntype_to_flat_weights[DEFAULT_HOMOGENEOUS_NODE_TYPE],
+                ntype_to_valid_counts[DEFAULT_HOMOGENEOUS_NODE_TYPE],
             )
         else:
             return (
@@ -636,17 +583,32 @@ async def _sample_from_nodes(
             )
 
         else:
-            assert isinstance(nodes_to_sample, torch.Tensor)
+            if isinstance(nodes_to_sample, torch.Tensor):
+                homogeneous_nodes_to_sample = nodes_to_sample
+            elif isinstance(nodes_to_sample, dict):
+                node_types = set(nodes_to_sample.keys())
+                if node_types != {DEFAULT_HOMOGENEOUS_NODE_TYPE}:
+                    raise ValueError(
+                        f"Expected only {DEFAULT_HOMOGENEOUS_NODE_TYPE} for homogeneous PPR sampling, "
+                        f"received node types: {node_types}"
+                    )
+                homogeneous_nodes_to_sample = nodes_to_sample[
+                    DEFAULT_HOMOGENEOUS_NODE_TYPE
+                ]
+            else:
+                raise TypeError(
+                    f"Expected Tensor or node-type mapping for homogeneous PPR sampling, got {type(nodes_to_sample)}"
+                )
 
             # Register seeds; local indices 0..N-1 are assigned internally.
             # srcs holds their global IDs (same values as nodes_to_sample).
-            srcs = inducer.init_node(nodes_to_sample)
+            srcs = inducer.init_node(homogeneous_nodes_to_sample)
 
             (
                 homo_flat_ids,
                 homo_flat_weights,
                 homo_valid_counts,
-            ) = await self._compute_ppr_scores(nodes_to_sample, None)
+            ) = await self._compute_ppr_scores(homogeneous_nodes_to_sample, None)
             assert isinstance(homo_flat_ids, torch.Tensor)
             assert isinstance(homo_flat_weights, torch.Tensor)
             assert isinstance(homo_valid_counts, torch.Tensor)
diff --git a/gigl/distributed/dist_sampling_producer.py b/gigl/distributed/dist_sampling_producer.py
index 3a51715e2..15d29a48c 100644
--- a/gigl/distributed/dist_sampling_producer.py
+++ b/gigl/distributed/dist_sampling_producer.py
@@ -30,7 +30,7 @@
     SamplingConfig,
     SamplingType,
 )
-from graphlearn_torch.typing import EdgeType
+from graphlearn_torch.typing import NodeType
 from graphlearn_torch.utils import seed_everything
 from torch._C import _set_worker_signal_handlers
 from torch.utils.data.dataloader import DataLoader
@@ -55,7 +55,7 @@ def _sampling_worker_loop(
     sampling_completed_worker_count,  # mp.Value
     mp_barrier: Barrier,
     sampler_options: SamplerOptions,
-    degree_tensors: Optional[Union[torch.Tensor, dict[EdgeType, torch.Tensor]]],
+    degree_tensors: Optional[dict[NodeType, torch.Tensor]],
 ):
     dist_sampler = None
     try:
@@ -180,9 +180,7 @@ def __init__(
         worker_options: MpDistSamplingWorkerOptions,
         channel: ChannelBase,
         sampler_options: SamplerOptions,
-        degree_tensors: Optional[
-            Union[torch.Tensor, dict[EdgeType, torch.Tensor]]
-        ] = None,
+        degree_tensors: Optional[dict[NodeType, torch.Tensor]] = None,
     ):
         super().__init__(data, sampler_input, sampling_config, worker_options, channel)
         self._sampler_options = sampler_options
diff --git a/gigl/distributed/graph_store/shared_dist_sampling_producer.py b/gigl/distributed/graph_store/shared_dist_sampling_producer.py
index 0f7461196..c6564a39d 100644
--- a/gigl/distributed/graph_store/shared_dist_sampling_producer.py
+++ b/gigl/distributed/graph_store/shared_dist_sampling_producer.py
@@ -93,7 +93,7 @@
     SamplingConfig,
     SamplingType,
 )
-from graphlearn_torch.typing import EdgeType
+from graphlearn_torch.typing import NodeType
 from torch._C import _set_worker_signal_handlers
 
 from gigl.common.logger import Logger
@@ -103,6 +103,7 @@
     SamplerRuntime,
     create_dist_sampler,
 )
+from gigl.utils.share_memory import share_memory
 
 logger = Logger()
 
@@ -338,7 +339,7 @@ def _shared_sampling_worker_loop(
     event_queue: mp.Queue,
     mp_barrier: Barrier,
     sampler_options: SamplerOptions,
-    degree_tensors: Optional[Union[torch.Tensor, dict[EdgeType, torch.Tensor]]],
+    degree_tensors: Optional[dict[NodeType, torch.Tensor]],
 ) -> None:
     """Run one shared graph-store worker that schedules many input channels.
 
@@ -363,8 +364,8 @@ def _shared_sampling_worker_loop(
         sampler_options: GiGL sampler configuration (e.g. ``PPRSamplerOptions``
             for PPR-based sampling).
         degree_tensors: Pre-computed degree tensors for PPR sampling, or
-            ``None`` for non-PPR samplers.  Materialized once in the parent
-            process by ``_prepare_degree_tensors`` and shared across workers.
+            ``None`` for non-PPR samplers.  Materialized once in the parent via
+            ``DistDataset.degree_tensor`` and shared across workers.
 
     Algorithm:
         1. Initialize RPC, sampler infrastructure, and signal the parent via barrier.
@@ -835,7 +836,7 @@ def __init__(
         worker_options: RemoteDistSamplingWorkerOptions,
         sampling_config: SamplingConfig,
         sampler_options: SamplerOptions,
-        degree_tensors: Optional[Union[torch.Tensor, dict[EdgeType, torch.Tensor]]],
+        degree_tensors: Optional[dict[NodeType, torch.Tensor]],
     ) -> None:
         """Initialize the shared sampling backend.
 
@@ -871,7 +872,10 @@ def __init__(
         self._completed_workers: defaultdict[tuple[int, int], set[int]] = defaultdict(
             set
         )
-        self._degree_tensors = degree_tensors
+        # Move degree tensors to shared memory so all spawned workers map the
+        # same allocation instead of each pickling a private copy.
+        self._degree_tensors: Optional[dict[NodeType, torch.Tensor]] = degree_tensors
+        share_memory(self._degree_tensors)
 
     def init_backend(self) -> None:
         """Initialize worker processes once for this backend.
diff --git a/gigl/distributed/sampler_options.py b/gigl/distributed/sampler_options.py
index fccd7a3ba..08cd27352 100644
--- a/gigl/distributed/sampler_options.py
+++ b/gigl/distributed/sampler_options.py
@@ -10,7 +10,6 @@
 from dataclasses import dataclass
 from typing import Optional, Union
 
-import torch
 from graphlearn_torch.typing import EdgeType
 
 from gigl.common.logger import Logger
@@ -58,9 +57,6 @@ class PPRSamplerOptions:
             hub nodes receive diminishing residual per neighbor, so capping the fetch
             has little effect on PPR accuracy while keeping per-hop RPC cost bounded.
             Set large to approximate fetching all neighbors.
-        total_degree_dtype: Dtype for precomputed total-degree tensors. Defaults
-            to ``torch.int32``, which supports total degrees up to ~2 billion.
-            Use a larger dtype if nodes have exceptionally high aggregate degrees.
         max_fetch_iterations: Maximum number of iterations that issue RPC neighbor
             fetches. After this many fetch iterations, subsequent iterations push
             residuals using only already-cached neighbor lists (no new RPCs).
@@ -73,7 +69,6 @@ class PPRSamplerOptions:
     eps: float = 1e-4
     max_ppr_nodes: int = 50
     num_neighbors_per_hop: int = 1_000
-    total_degree_dtype: torch.dtype = torch.int32
     max_fetch_iterations: Optional[int] = None
 
 
diff --git a/gigl/distributed/utils/degree.py b/gigl/distributed/utils/degree.py
index 7374f53ed..d33ec74f0 100644
--- a/gigl/distributed/utils/degree.py
+++ b/gigl/distributed/utils/degree.py
@@ -5,8 +5,9 @@
 and aggregate them across distributed machines. Degrees are computed from the
 CSR (Compressed Sparse Row) topology stored in GraphLearn-Torch Graph objects.
 
-Note: Degree tensors are not moved to shared memory and may be duplicated across
-processes on the same machine.
+Degrees are accumulated per anchor node type (summing across all edge types
+incident to that node type) before the distributed all-reduce, so callers
+receive ``dict[NodeType, torch.Tensor]`` directly with no further conversion.
 
 Requirements
 ============
@@ -27,24 +28,28 @@
 
 import torch
 from graphlearn_torch.data import Graph
+from graphlearn_torch.typing import NodeType
 from torch_geometric.typing import EdgeType
 
 from gigl.common.logger import Logger
 from gigl.distributed.utils.device import get_device_from_process_group
 from gigl.distributed.utils.networking import get_internal_ip_from_all_ranks
-from gigl.types.graph import is_label_edge_type
+from gigl.types.graph import DEFAULT_HOMOGENEOUS_NODE_TYPE, is_label_edge_type
 
 logger = Logger()
 
 
 def compute_and_broadcast_degree_tensor(
     graph: Union[Graph, dict[EdgeType, Graph]],
-) -> Union[torch.Tensor, dict[EdgeType, torch.Tensor]]:
-    """
-    Compute node degrees from a graph and aggregate across all machines.
+    edge_dir: str,
+) -> dict[NodeType, torch.Tensor]:
+    """Compute node degrees from a graph and aggregate across all machines.
 
-    Computes degrees from the CSR row pointers (indptr) and performs all-reduce
-    to aggregate across ranks.
+    For each non-label edge type, degrees are derived from the CSR row pointers
+    (indptr).  For heterogeneous graphs, degrees are summed across all edge types
+    incident to each anchor node type **locally** before the all-reduce, so the
+    per-edge-type tensor is only a transient intermediate and is never stored,
+    returned, or transmitted over RPC.
 
     Over-counting correction (for processes sharing the same data) is handled
     automatically by detecting the distributed topology.
@@ -52,13 +57,16 @@ def compute_and_broadcast_degree_tensor(
     Args:
         graph: A Graph (homogeneous) or dict[EdgeType, Graph] (heterogeneous).
             For heterogeneous graphs, label edge types are automatically excluded
-            from the computation — they are supervision edges and should not
-            contribute to node degree for graph traversal algorithms like PPR.
+            — they are supervision edges and should not contribute to node degree
+            for graph traversal algorithms like PPR.
+        edge_dir: Sampling direction — ``"in"`` or ``"out"``.  Determines which
+            end of each edge is the anchor node type for degree accumulation.
 
     Returns:
-        Union[torch.Tensor, dict[EdgeType, torch.Tensor]]: The aggregated degree tensors.
-            - For homogeneous graphs: A tensor of shape [num_nodes].
-            - For heterogeneous graphs: A dict mapping non-label EdgeType to degree tensors.
+        dict[NodeType, torch.Tensor]: Aggregated degree tensors keyed by node
+            type.  For homogeneous graphs the single entry uses
+            ``DEFAULT_HOMOGENEOUS_NODE_TYPE`` as its key.  Values are int32
+            tensors of shape ``[num_nodes_of_that_type]``.
 
     Raises:
         RuntimeError: If torch.distributed is not initialized.
@@ -69,52 +77,50 @@ def compute_and_broadcast_degree_tensor(
             "compute_and_broadcast_degree_tensor requires torch.distributed to be initialized."
         )
 
-    # Compute local degrees from graph topology
+    local_dict: dict[NodeType, torch.Tensor] = {}
+
     if isinstance(graph, Graph):
         topo = graph.topo
         if topo is None or topo.indptr is None:
             raise ValueError("Topology/indptr not available for graph.")
-        local_degrees: Union[torch.Tensor, dict[EdgeType, torch.Tensor]] = (
-            _compute_degrees_from_indptr(topo.indptr)
+        local_dict[DEFAULT_HOMOGENEOUS_NODE_TYPE] = _compute_degrees_from_indptr(
+            topo.indptr
         )
     else:
-        local_dict: dict[EdgeType, torch.Tensor] = {}
         for edge_type, edge_graph in graph.items():
-            # Label edge types are supervision edges and should not contribute
-            # to node degree for graph traversal algorithms like PPR.
             if is_label_edge_type(edge_type):
                 continue
+            anchor_type: NodeType = edge_type[-1] if edge_dir == "in" else edge_type[0]
             topo = edge_graph.topo
             if topo is None or topo.indptr is None:
                 logger.warning(
                     f"Topology/indptr not available for edge type {edge_type}, using empty tensor."
                 )
-                local_dict[edge_type] = torch.empty(0, dtype=torch.int16)
+                degrees = torch.empty(0, dtype=torch.int32)
             else:
-                local_dict[edge_type] = _compute_degrees_from_indptr(topo.indptr)
-        local_degrees = local_dict
+                degrees = _compute_degrees_from_indptr(topo.indptr)
+
+            if anchor_type in local_dict:
+                existing = local_dict[anchor_type]
+                max_len = max(len(existing), len(degrees))
+                summed = _pad_to_size(existing, max_len).to(torch.int64)
+                summed[: len(degrees)] += degrees.to(torch.int64)
+                local_dict[anchor_type] = summed.to(torch.int32)
+            else:
+                local_dict[anchor_type] = degrees
 
-    # All-reduce across ranks (over-counting correction handled internally)
-    result = _all_reduce_degrees(local_degrees)
+    result = _all_reduce_degrees(local_dict)
 
-    # Log results
-    if isinstance(result, torch.Tensor):
-        if result.numel() > 0:
+    for node_type, degrees in result.items():
+        if degrees.numel() > 0:
             logger.info(
-                f"{result.size(0)} nodes, max={result.max().item()}, min={result.min().item()}"
+                f"{node_type}: {degrees.size(0)} nodes, "
+                f"max={degrees.max().item()}, min={degrees.min().item()}"
             )
         else:
-            logger.info("Graph contained 0 nodes when computing degrees")
-    else:
-        for edge_type, degrees in result.items():
-            if degrees.numel() > 0:
-                logger.info(
-                    f"{edge_type}: {degrees.size(0)} nodes, max={degrees.max().item()}, min={degrees.min().item()}"
-                )
-            else:
-                logger.info(
-                    f"Graph contained 0 nodes for edge type {edge_type} when computing degrees"
-                )
+            logger.info(
+                f"Graph contained 0 nodes for node type {node_type} when computing degrees"
+            )
 
     return result
 
@@ -131,33 +137,25 @@ def _pad_to_size(tensor: torch.Tensor, target_size: int) -> torch.Tensor:
     return torch.cat([tensor, padding])
 
 
-def _clamp_to_int16(tensor: torch.Tensor) -> torch.Tensor:
-    """Clamp tensor values to int16 max and convert dtype."""
-    max_int16 = torch.iinfo(torch.int16).max
-    return tensor.clamp(max=max_int16).to(torch.int16)
-
-
 def _compute_degrees_from_indptr(indptr: torch.Tensor) -> torch.Tensor:
     """Compute degrees from CSR row pointers: degree[i] = indptr[i+1] - indptr[i]."""
-    return (indptr[1:] - indptr[:-1]).to(torch.int16)
+    return torch.sub(indptr[1:], indptr[:-1]).to(dtype=torch.int32)
 
 
 def _all_reduce_degrees(
-    local_degrees: Union[torch.Tensor, dict[EdgeType, torch.Tensor]],
-) -> Union[torch.Tensor, dict[EdgeType, torch.Tensor]]:
-    """All-reduce degree tensors across ranks, handling both homogeneous and heterogeneous cases.
+    local_degrees: dict[NodeType, torch.Tensor],
+) -> dict[NodeType, torch.Tensor]:
+    """All-reduce degree tensors across ranks.
 
-    For heterogeneous graphs, iterates over the edge types in local_degrees. All partitions
-    are expected to have entries for all edge types (even if some have empty tensors).
-
-    Moves tensors to GPU for the all-reduce if using NCCL backend (which requires CUDA),
-    otherwise keeps tensors on CPU (for Gloo backend).
+    Moves tensors to GPU for the all-reduce if using NCCL backend (which
+    requires CUDA), otherwise keeps tensors on CPU (for Gloo backend).
 
     Over-counting correction:
-        In distributed training, multiple processes on the same machine often share the
-        same graph partition data (via shared memory). When we all-reduce degrees, each
-        process contributes its "local" degrees - but if 4 processes on one machine all
-        read the same partition, that partition's degrees get summed 4 times instead of 1.
+        In distributed training, multiple processes on the same machine often
+        share the same graph partition data (via shared memory). When we
+        all-reduce degrees, each process contributes its "local" degrees — but
+        if 4 processes on one machine all read the same partition, that
+        partition's degrees get summed 4 times instead of 1.
 
         Example: Machine A has 2 processes sharing partition with degrees [3, 5, 2].
                  Machine B has 2 processes sharing partition with degrees [1, 4, 6].
@@ -168,16 +166,16 @@ def _all_reduce_degrees(
                  With correction: divide by local_world_size (2 per machine)
                                   = [4, 9, 8]  (correct: [3+1, 5+4, 2+6])
 
-        This function detects how many processes share the same machine by comparing
-        IP addresses, then divides by that count to correct the over-counting.
+        This function detects how many processes share the same machine by
+        comparing IP addresses, then divides by that count to correct the
+        over-counting.
 
     Args:
-        local_degrees: Either a single tensor (homogeneous) or dict mapping EdgeType
-            to tensors (heterogeneous). For heterogeneous graphs, all partitions must
-            have entries for all edge types.
+        local_degrees: Dict mapping NodeType to local degree tensors.
+            All partitions must have entries for all node types.
 
     Returns:
-        Aggregated degree tensors in the same format as input.
+        Aggregated degree tensors keyed by NodeType.
 
     Raises:
         RuntimeError: If torch.distributed is not initialized.
@@ -187,38 +185,25 @@ def _all_reduce_degrees(
             "_all_reduce_degrees requires torch.distributed to be initialized."
         )
 
-    # Compute local_world_size: number of processes on the same machine sharing data
     all_ips = get_internal_ip_from_all_ranks()
     my_rank = torch.distributed.get_rank()
     my_ip = all_ips[my_rank]
     local_world_size = Counter(all_ips)[my_ip]
 
-    # NCCL backend requires CUDA tensors; Gloo works with CPU
     device = get_device_from_process_group()
 
     def reduce_tensor(tensor: torch.Tensor) -> torch.Tensor:
         """All-reduce a single tensor with size sync and over-counting correction."""
-        # Synchronize max size across all ranks
         local_size = torch.tensor([tensor.size(0)], dtype=torch.long, device=device)
         torch.distributed.all_reduce(local_size, op=torch.distributed.ReduceOp.MAX)
         max_size = int(local_size.item())
 
-        # Pad, convert to int64 (all_reduce doesn't support int16), move to device
         padded = _pad_to_size(tensor, max_size).to(torch.int64).to(device)
         torch.distributed.all_reduce(padded, op=torch.distributed.ReduceOp.SUM)
 
-        # Correct for over-counting, move back to CPU, and clamp to int16
-        # TODO (mkolodner-sc): Potentially want to paramaterize this in the future if we want degrees higher than the int16 max.
-        return _clamp_to_int16((padded // local_world_size).cpu())
-
-    # Homogeneous case
-    if isinstance(local_degrees, torch.Tensor):
-        return reduce_tensor(local_degrees)
-
-    # Heterogeneous case: all-reduce each edge type
-    # Sort edge types for deterministic ordering across ranks
-    result: dict[EdgeType, torch.Tensor] = {}
-    for edge_type in sorted(local_degrees.keys()):
-        result[edge_type] = reduce_tensor(local_degrees[edge_type])
+        return (padded // local_world_size).to(torch.int32).cpu()
 
+    result: dict[NodeType, torch.Tensor] = {}
+    for node_type in sorted(local_degrees.keys()):
+        result[node_type] = reduce_tensor(local_degrees[node_type])
     return result
diff --git a/gigl/distributed/utils/dist_sampler.py b/gigl/distributed/utils/dist_sampler.py
index 0333f4138..db5dba1af 100644
--- a/gigl/distributed/utils/dist_sampler.py
+++ b/gigl/distributed/utils/dist_sampler.py
@@ -10,7 +10,7 @@
     RemoteDistSamplingWorkerOptions,
 )
 from graphlearn_torch.sampler import EdgeSamplerInput, NodeSamplerInput, SamplingConfig
-from graphlearn_torch.typing import EdgeType
+from graphlearn_torch.typing import NodeType
 
 from gigl.distributed.dist_neighbor_sampler import DistNeighborSampler
 from gigl.distributed.dist_ppr_sampler import DistPPRNeighborSampler
@@ -35,7 +35,7 @@ def create_dist_sampler(
     worker_options: Union[MpDistSamplingWorkerOptions, RemoteDistSamplingWorkerOptions],
     channel: ChannelBase,
     sampler_options: SamplerOptions,
-    degree_tensors: Optional[Union[torch.Tensor, dict[EdgeType, torch.Tensor]]],
+    degree_tensors: Optional[dict[NodeType, torch.Tensor]],
     current_device: torch.device,
 ) -> SamplerRuntime:
     """Create a GiGL sampler runtime for one channel on one worker.
@@ -84,7 +84,6 @@ def create_dist_sampler(
             max_ppr_nodes=sampler_options.max_ppr_nodes,
             max_fetch_iterations=sampler_options.max_fetch_iterations,
             num_neighbors_per_hop=sampler_options.num_neighbors_per_hop,
-            total_degree_dtype=sampler_options.total_degree_dtype,
             degree_tensors=degree_tensors,
         )
     else:
diff --git a/gigl/distributed/utils/neighborloader.py b/gigl/distributed/utils/neighborloader.py
index b91b411e3..570fca93b 100644
--- a/gigl/distributed/utils/neighborloader.py
+++ b/gigl/distributed/utils/neighborloader.py
@@ -357,6 +357,18 @@ def attach_ppr_outputs(
         f"PPR edge index and weight edge types must match, "
         f"got {set(ppr_edge_indices.keys())} vs {set(ppr_weights.keys())}"
     )
+    if isinstance(data, Data):
+        if len(ppr_edge_indices) > 1:
+            raise ValueError(
+                "Expected at most one PPR edge type for homogeneous Data output, "
+                f"got {set(ppr_edge_indices.keys())}"
+            )
+        if ppr_edge_indices:
+            edge_type = next(iter(ppr_edge_indices))
+            data.edge_index = ppr_edge_indices[edge_type]
+            data.edge_attr = ppr_weights[edge_type]
+        return
+
     for edge_type, edge_index in ppr_edge_indices.items():
         data[edge_type].edge_index = edge_index
         data[edge_type].edge_attr = ppr_weights[edge_type]
diff --git a/gigl/utils/sampling.py b/gigl/utils/sampling.py
index 5d0ed6a44..e2c6996e5 100644
--- a/gigl/utils/sampling.py
+++ b/gigl/utils/sampling.py
@@ -1,10 +1,12 @@
 import ast
+from collections.abc import Mapping
 from dataclasses import dataclass
 from typing import Any, Optional, Union
 
 import torch
 
 from gigl.common.logger import Logger
+from gigl.distributed.sampler_options import PPRSamplerOptions, SamplerOptions
 from gigl.src.common.types.graph_data import EdgeType, NodeType
 
 logger = Logger()
@@ -88,6 +90,45 @@ def parse_fanout(fanout_str: str) -> Union[list[int], dict[EdgeType, list[int]]]
         )
 
 
+def _parse_optional_int(value: Optional[str]) -> Optional[int]:
+    """Return ``int(value)``, or ``None`` for ``None``, blank, "none" or "null"."""
+    # Null-like spellings are matched case-insensitively after trimming whitespace.
+    null_like_spellings = {"", "none", "null"}
+    if value is None or value.strip().lower() in null_like_spellings:
+        return None
+    return int(value)
+
+
+def parse_sampler_options(args: Mapping[str, str]) -> Optional[SamplerOptions]:
+    """Parse sampler options from string args; k-hop sampler types yield ``None``."""
+    # A blank ``sampler_type`` is treated the same as the default, "khop".
+    raw_type = args.get("sampler_type", "khop")
+    sampler_type = raw_type.strip().lower().replace("-", "_") or "khop"
+    if sampler_type in {"khop", "k_hop", "neighbor", "neighbor_sampler"}:
+        return None
+    if sampler_type != "ppr":
+        # Anything other than a k-hop alias or "ppr" is rejected.
+        raise ValueError(
+            f"Unsupported sampler_type={sampler_type}. Expected one of: khop, ppr."
+        )
+
+    # Short keys win; the longer spellings are read only when those are absent.
+    max_nodes = args.get("ppr_max_nodes")
+    if max_nodes is None:
+        max_nodes = args.get("ppr_max_ppr_nodes", "50")
+    per_hop = args.get("ppr_neighbors_per_hop")
+    if per_hop is None:
+        per_hop = args.get("ppr_num_neighbors_per_hop", "1000")
+
+    return PPRSamplerOptions(
+        alpha=float(args.get("ppr_alpha", "0.5")),
+        eps=float(args.get("ppr_eps", "0.0001")),
+        max_ppr_nodes=int(max_nodes),
+        num_neighbors_per_hop=int(per_hop),
+        max_fetch_iterations=_parse_optional_int(args.get("ppr_max_fetch_iterations")),
+    )
+
+
 @dataclass(frozen=True)
 class ABLPInputNodes:
     """Represents ABLP (Anchor Based Link Prediction) input for a single storage server.
diff --git a/tests/e2e_tests/e2e_tests.yaml b/tests/e2e_tests/e2e_tests.yaml
index 61fc4f311..6d09d8213 100644
--- a/tests/e2e_tests/e2e_tests.yaml
+++ b/tests/e2e_tests/e2e_tests.yaml
@@ -22,6 +22,9 @@ tests:
   hom_cora_sup_gs_test:
     task_config_uri: "examples/link_prediction/graph_store/configs/e2e_hom_cora_sup_gs_task_config.yaml"
     resource_config_uri: "${oc.env:GIGL_TEST_IN_MEMORY_DEFAULT_GRAPH_STORE_RESOURCE_CONFIG,deployment/configs/e2e_glt_gs_resource_config.yaml}"
+  hom_cora_sup_gs_ppr_test:
+    task_config_uri: "examples/link_prediction/graph_store/configs/e2e_hom_cora_sup_gs_ppr_task_config.yaml"
+    resource_config_uri: "${oc.env:GIGL_TEST_IN_MEMORY_DEFAULT_GRAPH_STORE_RESOURCE_CONFIG,deployment/configs/e2e_glt_gs_resource_config.yaml}"
   het_dblp_sup_gs_test:
     task_config_uri: "examples/link_prediction/graph_store/configs/e2e_het_dblp_sup_gs_task_config.yaml"
     resource_config_uri: "${oc.env:GIGL_TEST_IN_MEMORY_DEFAULT_GRAPH_STORE_RESOURCE_CONFIG,deployment/configs/e2e_glt_gs_resource_config.yaml}"
diff --git a/tests/unit/distributed/dist_ppr_sampler_test.py b/tests/unit/distributed/dist_ppr_sampler_test.py
index 400ce1107..4837f5ef4 100644
--- a/tests/unit/distributed/dist_ppr_sampler_test.py
+++ b/tests/unit/distributed/dist_ppr_sampler_test.py
@@ -28,7 +28,7 @@
 
 import heapq
 from collections import defaultdict
-from typing import Literal
+from typing import Literal, TypeGuard
 
 import networkx as nx
 import torch
@@ -41,6 +41,10 @@
 from gigl.distributed.dist_ablp_neighborloader import DistABLPLoader
 from gigl.distributed.distributed_neighborloader import DistNeighborLoader
 from gigl.distributed.sampler_options import PPRSamplerOptions
+from gigl.types.graph import (
+    DEFAULT_HOMOGENEOUS_EDGE_TYPE,
+    DEFAULT_HOMOGENEOUS_NODE_TYPE,
+)
 from tests.test_assets.distributed.test_dataset import (
     STORY,
     STORY_TO_USER,
@@ -91,6 +95,14 @@
 _TEST_MAX_PPR_NODES = 5
 
 
+def _is_node_type_to_tensor_map(value: object) -> TypeGuard[dict[str, torch.Tensor]]:
+    """Return whether ``value`` is a dict whose values are all tensors."""
+    if not isinstance(value, dict):
+        return False
+    # Keys are not inspected; only the values are checked.
+    return all(isinstance(entry, torch.Tensor) for entry in value.values())
+
+
 # ---------------------------------------------------------------------------
 # Reference PPR implementations (NetworkX-based)
 # ---------------------------------------------------------------------------
@@ -504,12 +516,15 @@ def _run_ppr_ablp_loader_correctness_check(
     )
 
     train_node_ids = dataset.train_node_ids
-    assert isinstance(train_node_ids, dict)
+    if not _is_node_type_to_tensor_map(train_node_ids):
+        raise TypeError(
+            f"Expected train_node_ids to be a dictionary, got {type(train_node_ids)}"
+        )
 
     loader = DistABLPLoader(
         dataset=dataset,
         num_neighbors=[],  # Unused by PPR sampler; required by interface
-        input_nodes=(USER, train_node_ids[USER]),  # ty: ignore[invalid-argument-type] TODO(ty-torch-keyed-access): fix ty false positives for torch-backed keyed container access.
+        input_nodes=(USER, train_node_ids[USER]),
         supervision_edge_type=USER_TO_STORY,
         sampler_options=PPRSamplerOptions(
             alpha=alpha,
@@ -589,6 +604,55 @@ def _run_ppr_ablp_loader_correctness_check(
     shutdown_rpc()
 
 
+def _run_ppr_labeled_homogeneous_ablp_loader_check(_: int) -> None:
+    """Verify PPR works for labeled homogeneous DistABLPLoader inputs."""
+    create_test_process_group()
+    # Build the ABLP dataset using the default homogeneous node and edge types.
+    dataset = create_heterogeneous_dataset_for_ablp(
+        positive_labels={0: [1], 1: [2], 2: [0]},
+        negative_labels={0: [2], 1: [0], 2: [1]},
+        train_node_ids=[0, 1],
+        val_node_ids=[2],
+        test_node_ids=[],
+        edge_indices={DEFAULT_HOMOGENEOUS_EDGE_TYPE: _TEST_EDGE_INDEX},
+        src_node_type=DEFAULT_HOMOGENEOUS_NODE_TYPE,
+        dst_node_type=DEFAULT_HOMOGENEOUS_NODE_TYPE,
+        supervision_edge_type=DEFAULT_HOMOGENEOUS_EDGE_TYPE,
+        edge_dir="out",
+    )
+    # Narrow train_node_ids to a node-type -> tensor mapping before indexing it.
+    train_node_ids = dataset.train_node_ids
+    if not _is_node_type_to_tensor_map(train_node_ids):
+        raise TypeError(
+            f"Expected train_node_ids to be a dictionary, got {type(train_node_ids)}"
+        )
+    # Seeds are passed as the tensor stored under the default homogeneous node type.
+    loader = DistABLPLoader(
+        dataset=dataset,
+        num_neighbors=[],
+        input_nodes=train_node_ids[DEFAULT_HOMOGENEOUS_NODE_TYPE],
+        sampler_options=PPRSamplerOptions(
+            alpha=_TEST_ALPHA,
+            eps=_TEST_EPS,
+            max_ppr_nodes=_TEST_MAX_PPR_NODES,
+        ),
+        pin_memory_device=torch.device("cpu"),
+        batch_size=1,
+    )
+    # Inspect one batch: homogeneous Data with PPR edges/weights and ABLP labels.
+    datum = next(iter(loader))
+    assert isinstance(datum, Data)
+    assert hasattr(datum, "edge_index"), "Missing PPR edge_index on Data"
+    assert hasattr(datum, "edge_attr"), "Missing PPR edge_attr on Data"
+    assert hasattr(datum, "y_positive"), "Missing y_positive on Data"
+    assert hasattr(datum, "y_negative"), "Missing y_negative on Data"
+    assert datum.edge_index.dim() == 2
+    assert datum.edge_index.size(0) == 2
+    assert datum.edge_index.size(1) == datum.edge_attr.numel()
+
+    shutdown_rpc()
+
+
 # ---------------------------------------------------------------------------
 # Bug regression runners
 # ---------------------------------------------------------------------------
@@ -758,6 +822,10 @@ def test_ppr_sampler_ablp_ignores_label_edges_for_anchor_ppr(self) -> None:
         """Verify ABLP label edges are excluded from anchor-seed PPR walks."""
         mp.spawn(fn=_run_ppr_ablp_label_edges_do_not_affect_anchor_ppr, args=())
 
+    def test_ppr_sampler_homogeneous_ablp(self) -> None:
+        """Run the homogeneous DistABLPLoader PPR check in a spawned process."""
+        mp.spawn(fn=_run_ppr_labeled_homogeneous_ablp_loader_check, args=())
+
 
 if __name__ == "__main__":
     absltest.main()
diff --git a/tests/unit/distributed/utils/degree_test.py b/tests/unit/distributed/utils/degree_test.py
index f61210f5e..5bd84651e 100644
--- a/tests/unit/distributed/utils/degree_test.py
+++ b/tests/unit/distributed/utils/degree_test.py
@@ -1,3 +1,5 @@
+from typing import Literal
+
 import torch
 import torch.distributed as dist
 import torch.multiprocessing as mp
@@ -5,11 +7,12 @@
 from parameterized import param, parameterized
 
 from gigl.distributed.utils.degree import (
-    _clamp_to_int16,
     _compute_degrees_from_indptr,
     _pad_to_size,
     compute_and_broadcast_degree_tensor,
 )
+from gigl.src.common.types.graph_data import EdgeType, NodeType
+from gigl.types.graph import DEFAULT_HOMOGENEOUS_NODE_TYPE
 from tests.test_assets.distributed.test_dataset import (
     DEFAULT_HETEROGENEOUS_EDGE_INDICES,
     DEFAULT_HOMOGENEOUS_EDGE_INDEX,
@@ -25,16 +28,58 @@
 
 
 def _compute_expected_degrees_from_edge_index(
-    edge_index: torch.Tensor, num_nodes: int
+    edge_index: torch.Tensor, num_nodes: int, node_axis: int = 0
 ) -> torch.Tensor:
-    """Compute expected out-degrees from COO edge index."""
-    src_nodes = edge_index[0]
-    degrees = torch.zeros(num_nodes, dtype=torch.int16)
-    for src in src_nodes:
-        degrees[src] += 1
+    """Count occurrences of each node id in row ``node_axis`` of a COO edge index."""
+    nodes = edge_index[node_axis]  # callers in this file pass 0 or 1
+    degrees = torch.zeros(num_nodes, dtype=torch.int32)
+    for node in nodes:
+        degrees[node] += 1  # one increment per edge endpoint on this row
     return degrees
 
 
+def _get_anchor_node_type(
+    edge_type: EdgeType, edge_dir: Literal["in", "out"]
+) -> NodeType:
+    """Select the dst node type when edge_dir is "in", else the src node type."""
+    return edge_type.src_node_type if edge_dir != "in" else edge_type.dst_node_type
+
+
+def _compute_expected_total_degrees_by_node_type(
+    edge_indices: dict[EdgeType, torch.Tensor],
+    edge_dir: Literal["in", "out"],
+) -> dict[NodeType, torch.Tensor]:
+    """Sum expected per-edge-type degrees into one tensor per anchor node type.
+
+    Degrees are counted along row 1 of each edge index for ``edge_dir="in"`` and
+    along row 0 otherwise. Edge types sharing an anchor node type are added
+    element-wise, with the shorter tensor treated as zero past its end.
+    """
+    axis = 1 if edge_dir == "in" else 0
+    totals: dict[NodeType, torch.Tensor] = {}
+    for edge_type, edge_index in edge_indices.items():
+        anchor = _get_anchor_node_type(edge_type, edge_dir)
+        # Size by the largest id seen on this axis; no edges means no nodes.
+        has_edges = edge_index.shape[1] > 0
+        size = int(edge_index[axis].max().item() + 1) if has_edges else 0
+        current = _compute_expected_degrees_from_edge_index(
+            edge_index=edge_index, num_nodes=size, node_axis=axis
+        )
+        previous = totals.get(anchor)
+        if previous is None:
+            totals[anchor] = current
+            continue
+
+        # Add in int64 over the longer length, then narrow back to int32.
+        length = max(previous.numel(), current.numel())
+        merged = torch.zeros(length, dtype=torch.int64)
+        merged[: previous.numel()] += previous.to(torch.int64)
+        merged[: current.numel()] += current.to(torch.int64)
+        totals[anchor] = merged.to(torch.int32)
+
+    return totals
+
+
 class TestDegreeComputation(TestCase):
     """Tests for degree computation with torch.distributed initialized.
 
@@ -60,12 +105,12 @@ def test_homogeneous_graph(self):
 
         dataset = create_homogeneous_dataset(edge_index=edge_index)
         assert dataset.graph is not None
-        result = compute_and_broadcast_degree_tensor(dataset.graph)
+        result = compute_and_broadcast_degree_tensor(dataset.graph, dataset.edge_dir)
 
-        assert isinstance(result, torch.Tensor)
+        self.assertEqual(set(result.keys()), {DEFAULT_HOMOGENEOUS_NODE_TYPE})
         expected = _compute_expected_degrees_from_edge_index(edge_index, num_nodes)
-        self.assertEqual(result.shape[0], num_nodes)
-        self.assert_tensor_equality(result, expected)
+        self.assertEqual(result[DEFAULT_HOMOGENEOUS_NODE_TYPE].shape[0], num_nodes)
+        self.assert_tensor_equality(result[DEFAULT_HOMOGENEOUS_NODE_TYPE], expected)
 
     def test_heterogeneous_graph(self):
         """Test degree computation for a heterogeneous graph."""
@@ -73,15 +118,16 @@ def test_heterogeneous_graph(self):
         dataset = create_heterogeneous_dataset(edge_indices=edge_indices)
 
         assert dataset.graph is not None
-        result = compute_and_broadcast_degree_tensor(dataset.graph)
+        result = compute_and_broadcast_degree_tensor(dataset.graph, dataset.edge_dir)
 
-        assert isinstance(result, dict)
-        self.assertEqual(set(result.keys()), set(edge_indices.keys()))
+        expected = _compute_expected_total_degrees_by_node_type(
+            edge_indices=edge_indices,
+            edge_dir=dataset.edge_dir,
+        )
+        self.assertEqual(set(result.keys()), set(expected.keys()))
 
-        for edge_type, edge_index in edge_indices.items():
-            num_nodes = int(edge_index[0].max().item() + 1)
-            expected = _compute_expected_degrees_from_edge_index(edge_index, num_nodes)
-            self.assert_tensor_equality(result[edge_type], expected)  # ty: ignore[invalid-argument-type] TODO(ty-torch-keyed-access): fix ty false positives for torch-backed keyed container access.
+        for node_type, expected_degrees in expected.items():
+            self.assert_tensor_equality(result[node_type], expected_degrees)
 
     def test_heterogeneous_graph_with_missing_topology(self):
         """Test that edge types with missing topology get empty tensors.
@@ -105,24 +151,37 @@ def test_heterogeneous_graph_with_missing_topology(self):
         # Save the original topology for computing expected degrees
         original_graph = dataset.graph[edge_type_with_topo]
         assert original_graph.topo is not None
-        expected_degrees = _compute_expected_degrees_from_edge_index(
-            edge_indices[edge_type_with_topo],
-            int(edge_indices[edge_type_with_topo][0].max().item() + 1),
+        expected_degrees = _compute_expected_total_degrees_by_node_type(
+            edge_indices={edge_type_with_topo: edge_indices[edge_type_with_topo]},
+            edge_dir=dataset.edge_dir,
         )
 
         # Manually set one graph's topology to None to test the edge case
         dataset.graph[edge_type_without_topo].topo = None
 
-        result = compute_and_broadcast_degree_tensor(dataset.graph)
+        result = compute_and_broadcast_degree_tensor(dataset.graph, dataset.edge_dir)
 
-        assert isinstance(result, dict)
-        self.assertEqual(set(result.keys()), set(edge_types))
+        expected_node_types = {
+            _get_anchor_node_type(edge_type, dataset.edge_dir)
+            for edge_type in edge_types
+        }
+        self.assertEqual(set(result.keys()), expected_node_types)
 
         # Edge type with topology should have computed degrees
-        self.assert_tensor_equality(result[edge_type_with_topo], expected_degrees)  # ty: ignore[invalid-argument-type] TODO(ty-torch-keyed-access): fix ty false positives for torch-backed keyed container access.
+        node_type_with_topo = _get_anchor_node_type(
+            edge_type=edge_type_with_topo,
+            edge_dir=dataset.edge_dir,
+        )
+        self.assert_tensor_equality(
+            result[node_type_with_topo], expected_degrees[node_type_with_topo]
+        )
 
         # Edge type without topology should have empty tensor
-        self.assertEqual(result[edge_type_without_topo].numel(), 0)  # ty: ignore[invalid-argument-type] TODO(ty-torch-keyed-access): fix ty false positives for torch-backed keyed container access.
+        node_type_without_topo = _get_anchor_node_type(
+            edge_type=edge_type_without_topo,
+            edge_dir=dataset.edge_dir,
+        )
+        self.assertEqual(result[node_type_without_topo].numel(), 0)
 
 
 def _run_local_world_size_correction_homogeneous(
@@ -130,7 +189,7 @@ def _run_local_world_size_correction_homogeneous(
     world_size: int,
     init_method: str,
     edge_index: torch.Tensor,
-    expected_degrees: torch.Tensor,
+    expected_degrees: dict[NodeType, torch.Tensor],
 ) -> None:
     """Worker function for multi-process local_world_size correction test (homogeneous)."""
     dist.init_process_group(
@@ -142,10 +201,11 @@ def _run_local_world_size_correction_homogeneous(
     try:
         dataset = create_homogeneous_dataset(edge_index=edge_index)
         assert dataset.graph is not None
-        result = compute_and_broadcast_degree_tensor(dataset.graph)
+        result = compute_and_broadcast_degree_tensor(dataset.graph, dataset.edge_dir)
 
-        assert isinstance(result, torch.Tensor)
-        assert_tensor_equality(result, expected_degrees)
+        assert set(result.keys()) == set(expected_degrees.keys())
+        for node_type, expected in expected_degrees.items():
+            assert_tensor_equality(result[node_type], expected)
     finally:
         dist.destroy_process_group()
 
@@ -154,8 +214,8 @@ def _run_local_world_size_correction_heterogeneous(
     rank: int,
     world_size: int,
     init_method: str,
-    edge_indices: dict,
-    expected_degrees: dict,
+    edge_indices: dict[EdgeType, torch.Tensor],
+    expected_degrees: dict[NodeType, torch.Tensor],
 ) -> None:
     """Worker function for multi-process local_world_size correction test (heterogeneous)."""
     dist.init_process_group(
@@ -167,12 +227,11 @@ def _run_local_world_size_correction_heterogeneous(
     try:
         dataset = create_heterogeneous_dataset(edge_indices=edge_indices)
         assert dataset.graph is not None
-        result = compute_and_broadcast_degree_tensor(dataset.graph)
+        result = compute_and_broadcast_degree_tensor(dataset.graph, dataset.edge_dir)
 
-        assert isinstance(result, dict)
         assert set(result.keys()) == set(expected_degrees.keys())
-        for edge_type, expected in expected_degrees.items():
-            assert_tensor_equality(result[edge_type], expected)
+        for node_type, expected in expected_degrees.items():
+            assert_tensor_equality(result[node_type], expected)
     finally:
         dist.destroy_process_group()
 
@@ -191,7 +250,9 @@ def test_local_world_size_correction_homogeneous(self):
         num_nodes = int(edge_index.max().item() + 1)
 
         raw_degrees = _compute_expected_degrees_from_edge_index(edge_index, num_nodes)
-        expected_degrees = raw_degrees  # After correction: (2*raw) / 2 = raw
+        expected_degrees = {
+            DEFAULT_HOMOGENEOUS_NODE_TYPE: raw_degrees
+        }  # After correction: (2*raw) / 2 = raw
 
         init_method = get_process_group_init_method()
         mp.spawn(
@@ -204,13 +265,10 @@ def test_local_world_size_correction_heterogeneous(self):
         """Test over-counting correction for heterogeneous graphs with 2 processes."""
         edge_indices = DEFAULT_HETEROGENEOUS_EDGE_INDICES
 
-        expected_degrees = {}
-        for edge_type, edge_index in edge_indices.items():
-            num_nodes = int(edge_index[0].max().item() + 1)
-            raw_degrees = _compute_expected_degrees_from_edge_index(
-                edge_index, num_nodes
-            )
-            expected_degrees[edge_type] = raw_degrees
+        expected_degrees = _compute_expected_total_degrees_by_node_type(
+            edge_indices=edge_indices,
+            edge_dir="out",
+        )
 
         init_method = get_process_group_init_method()
         mp.spawn(
@@ -242,9 +300,9 @@ def test_degree_tensor_homogeneous(self):
         dataset = create_homogeneous_dataset(edge_index=edge_index)
         result = dataset.degree_tensor
 
-        assert isinstance(result, torch.Tensor)
+        self.assertEqual(set(result.keys()), {DEFAULT_HOMOGENEOUS_NODE_TYPE})
         expected = _compute_expected_degrees_from_edge_index(edge_index, num_nodes)
-        self.assert_tensor_equality(result, expected)
+        self.assert_tensor_equality(result[DEFAULT_HOMOGENEOUS_NODE_TYPE], expected)
 
     def test_degree_tensor_caches_result(self):
         """Test that degree_tensor property caches the result."""
@@ -262,13 +320,14 @@ def test_degree_tensor_heterogeneous(self):
 
         result = dataset.degree_tensor
 
-        assert isinstance(result, dict)
-        self.assertEqual(set(result.keys()), set(edge_indices.keys()))
+        expected = _compute_expected_total_degrees_by_node_type(
+            edge_indices=edge_indices,
+            edge_dir=dataset.edge_dir,
+        )
+        self.assertEqual(set(result.keys()), set(expected.keys()))
 
-        for edge_type, edge_index in edge_indices.items():
-            num_nodes = int(edge_index[0].max().item() + 1)
-            expected = _compute_expected_degrees_from_edge_index(edge_index, num_nodes)
-            self.assert_tensor_equality(result[edge_type], expected)  # ty: ignore[invalid-argument-type] TODO(ty-torch-keyed-access): fix ty false positives for torch-backed keyed container access.
+        for node_type, expected_degrees in expected.items():
+            self.assert_tensor_equality(result[node_type], expected_degrees)
 
 
 class TestHelperFunctions(TestCase):
@@ -304,7 +363,7 @@ def test_pad_to_size(self, _, tensor, target_size, expected):
     def test_compute_degrees_from_indptr(self):
         """Test _compute_degrees_from_indptr helper function."""
         indptr = torch.tensor([0, 3, 5, 10, 12], dtype=torch.int64)
-        expected = torch.tensor([3, 2, 5, 2], dtype=torch.int16)
+        expected = torch.tensor([3, 2, 5, 2], dtype=torch.int32)
         result = _compute_degrees_from_indptr(indptr)
         self.assert_tensor_equality(result, expected)
 
@@ -312,7 +371,7 @@ def test_compute_degrees_from_indptr_all_zeros(self):
         """Test _compute_degrees_from_indptr with all-zero indptr (no edges)."""
         # All-zero indptr means no outgoing edges for any node
         indptr = torch.tensor([0, 0, 0, 0, 0], dtype=torch.int64)
-        expected = torch.tensor([0, 0, 0, 0], dtype=torch.int16)
+        expected = torch.tensor([0, 0, 0, 0], dtype=torch.int32)
         result = _compute_degrees_from_indptr(indptr)
         self.assert_tensor_equality(result, expected)
 
@@ -320,19 +379,11 @@ def test_compute_degrees_from_indptr_empty(self):
         """Test _compute_degrees_from_indptr with empty indptr (no nodes)."""
         # indptr of [0] means 0 nodes
         indptr = torch.empty(0, dtype=torch.int64)
-        expected = torch.empty(0, dtype=torch.int16)
+        expected = torch.empty(0, dtype=torch.int32)
         result = _compute_degrees_from_indptr(indptr)
         self.assert_tensor_equality(result, expected)
         self.assertEqual(result.numel(), 0)
 
-    def test_clamp_to_int16(self):
-        """Test _clamp_to_int16 helper function."""
-        max_int16 = torch.iinfo(torch.int16).max
-        tensor = torch.tensor([1, max_int16 + 100, 5], dtype=torch.int64)
-        expected = torch.tensor([1, max_int16, 5], dtype=torch.int16)
-        result = _clamp_to_int16(tensor)
-        self.assert_tensor_equality(result, expected)
-
 
 if __name__ == "__main__":
     absltest.main()