Skip to content

API Reference

Complete API documentation for all Hyperbench modules.

Core Modules

Data Module

hyperbench.data.dataset

DatasetNames

Bases: Enum

Enumeration of available datasets.

Source code in hyperbench/data/dataset.py
class DatasetNames(Enum):
    """
    Enumeration of available datasets.

    HIFConverter.load_from_hif accepts a dataset only when its name is one of
    the member names listed here.
    """

    # NOTE(review): the string values look like opaque ordinals; nothing in the
    # visible code reads them (only the member names are checked) — confirm.
    ALGEBRA = "1"
    EMAIL_ENRON = "2"
    ARXIV = "3"
    DBLP = "4"
    THREADSMATHSX = "5"

HIFConverter

A utility class to load hypergraphs from HIF format.

Source code in hyperbench/data/dataset.py
class HIFConverter:
    """
    A utility class to load hypergraphs from HIF format.
    """

    @staticmethod
    def load_from_hif(dataset_name: str | None, file_id: str | None) -> HIFHypergraph:
        """
        Load a dataset as a HIFHypergraph.

        The dataset is read from the bundled ``datasets/<name>.json.zst``
        archive next to this module when that file exists; otherwise it is
        downloaded from Google Drive using ``file_id``. In both cases the JSON
        is staged in a temporary file, which is removed before returning.

        Args:
            dataset_name: Member name of DatasetNames identifying the dataset.
            file_id: Google Drive file ID used when no bundled archive exists.

        Returns:
            The hypergraph built from the validated HIF document.

        Raises:
            ValueError: If either argument is None, if the dataset name is
                unknown, or if the document is not HIF-compliant.
        """
        if dataset_name is None or file_id is None:
            raise ValueError(
                f"Dataset name (provided: {dataset_name}) and file ID (provided: {file_id}) must be provided."
            )
        if dataset_name not in DatasetNames.__members__:
            raise ValueError(f"Dataset '{dataset_name}' not found.")

        dataset_name_lower = dataset_name.lower()
        current_dir = os.path.dirname(os.path.abspath(__file__))
        zst_filename = os.path.join(
            current_dir, "datasets", f"{dataset_name_lower}.json.zst"
        )

        # Path of the staged JSON file; stays None until the file exists so the
        # cleanup below knows whether there is anything to delete.
        output: str | None = None
        try:
            if os.path.exists(zst_filename):
                dctx = zstd.ZstdDecompressor()
                with (
                    open(zst_filename, "rb") as input_f,
                    tempfile.NamedTemporaryFile(
                        mode="wb", suffix=".json", delete=False
                    ) as tmp_file,
                ):
                    # Record the name first so a failed decompression is still cleaned up.
                    output = tmp_file.name
                    dctx.copy_stream(input_f, tmp_file)
            else:
                url = f"https://drive.google.com/uc?id={file_id}"

                with tempfile.NamedTemporaryFile(
                    mode="w+", suffix=".json", delete=False
                ) as tmp_file:
                    output = tmp_file.name
                # The handle is closed before gdown writes to the path: an open
                # handle blocks replacing the file on some platforms.
                gdown.download(url=url, output=output, quiet=False, fuzzy=True)

            # JSON documents are UTF-8; do not depend on the locale encoding.
            with open(output, "r", encoding="utf-8") as f:
                hiftext = json.load(f)
            if not validate_hif_json(output):
                raise ValueError(f"Dataset '{dataset_name}' is not HIF-compliant.")

            return HIFHypergraph.from_hif(hiftext)
        finally:
            # The temp file is created with delete=False, so it must be removed
            # explicitly on both the success and the failure path.
            if output is not None:
                try:
                    os.remove(output)
                except OSError:
                    # Best-effort cleanup: never mask the real result or error.
                    pass

Dataset

Bases: Dataset

Base Dataset class for hypergraph datasets, extending PyTorch's Dataset. Attributes: GDRIVE_FILE_ID (str): Google Drive file ID for the dataset. DATASET_NAME (str): Name of the dataset. hypergraph (HIFHypergraph): Loaded hypergraph instance. Methods: download(): Downloads and loads the hypergraph from HIF. process(): Processes the hypergraph into HData format.

Source code in hyperbench/data/dataset.py
class Dataset(TorchDataset):
    """
    Base Dataset class for hypergraph datasets, extending PyTorch's Dataset.
    Attributes:
        GDRIVE_FILE_ID (str): Google Drive file ID for the dataset.
        DATASET_NAME (str): Name of the dataset.
        hypergraph (HIFHypergraph): Loaded hypergraph instance.
    Methods:
        download(): Downloads and loads the hypergraph from HIF.
        process(): Processes the hypergraph into HData format.
    """

    GDRIVE_FILE_ID = None
    DATASET_NAME = None

    def __init__(self) -> None:
        """
        Load the hypergraph and convert it to its tensor representation.
        """
        # process() reads self.hypergraph, so the hypergraph must be assigned first.
        self.hypergraph: HIFHypergraph = self.download()
        self.hdata: HData = self.process()

    def __len__(self) -> int:
        """Return the number of node records in the loaded hypergraph."""
        return len(self.hypergraph.nodes)

    def __getitem__(self, index: int | List[int]) -> HData:
        """
        Return the sub-hypergraph made of the requested node(s).

        The result contains the features of the requested nodes, every
        hyperedge touching at least one of them (restricted to the requested
        nodes), and the matching edge attributes when the dataset has any.
        Node and edge IDs in the returned edge_index are renumbered from 0.

        Args:
            index: A single node ID or a list of node IDs.

        Returns:
            HData describing the sampled sub-hypergraph.
        """
        requested = self.__get_node_ids_to_sample(index)
        self.__validate_node_ids(requested)

        incidences, node_ids, edge_ids = self.__sample_edge_index(requested)
        relabelled = self.__new_edge_index(incidences, node_ids, edge_ids)

        source = self.hdata
        # Edge attributes are only sliced when they exist and an edge was sampled.
        has_edge_attr = source.edge_attr is not None and len(edge_ids) > 0
        edge_features = source.edge_attr[edge_ids] if has_edge_attr else None

        return HData(
            x=source.x[node_ids],
            edge_index=relabelled,
            edge_attr=edge_features,
            num_nodes=len(node_ids),
            num_edges=len(edge_ids),
        )

    def download(self) -> HIFHypergraph:
        """
        Load the hypergraph from HIF format using HIFConverter class.
        """
        if hasattr(self, "hypergraph") and self.hypergraph is not None:
            return self.hypergraph
        hypergraph = HIFConverter.load_from_hif(self.DATASET_NAME, self.GDRIVE_FILE_ID)
        return hypergraph

    def process(self) -> HData:
        """
        Process the loaded hypergraph into HData format, mapping HIF structure to tensors.

        Node features come from the numeric attributes found under each node's
        "attrs" (a single zero column when there are none). Incidences become a
        [2, num_incidences] edge_index with 0-based IDs. Edge features are built
        like node features, but only when at least one edge record has "attrs".

        Returns:
            HData: Processed hypergraph data.

        Raises:
            ValueError: If the hypergraph has no incidences.
        """
        num_nodes = len(self.hypergraph.nodes)
        num_edges = len(self.hypergraph.edges)

        # x: shape [num_nodes, num_node_features]
        # collect all attribute keys to have tensors of same size
        node_attr_keys = self.__collect_attr_keys(
            [node.get("attrs", {}) for node in self.hypergraph.nodes]
        )

        if node_attr_keys:
            x = torch.stack(
                [
                    self.transform_node_attrs(
                        node.get("attrs", {}), attr_keys=node_attr_keys
                    )
                    for node in self.hypergraph.nodes
                ]
            )
        else:
            # Fallback to zeros if no numeric attributes
            x = torch.zeros((num_nodes, 1), dtype=torch.float)

        # remap node and edge IDs to 0-based contiguous IDs, numbered in order of
        # first appearance in the incidence list; the dicts give O(1) "already seen" checks
        # NOTE(review): rows of x (and of edge_attr below) follow the order of
        # hypergraph.nodes (hypergraph.edges), not this first-appearance order —
        # confirm both orders agree, otherwise features are attached to the wrong IDs.
        node_set = {}
        edge_set = {}
        node_ids = []
        edge_ids = []

        for inc in self.hypergraph.incidences:
            # an incidence without a "node" or "edge" key silently falls back to ID 0
            node = inc.get("node", 0)
            edge = inc.get("edge", 0)

            if node not in node_set:
                node_set[node] = len(node_set)
            if edge not in edge_set:
                edge_set[edge] = len(edge_set)

            node_ids.append(node_set[node])
            edge_ids.append(edge_set[edge])

        if len(node_ids) < 1:
            raise ValueError("Hypergraph has no incidences.")

        # edge_index: shape [2, E] where E is number of incidences
        edge_index = torch.tensor([node_ids, edge_ids], dtype=torch.long)

        # edge-attr: shape [num_edges, num_edge_attributes]
        # stays None unless at least one edge record carries an "attrs" entry
        edge_attr = None
        if self.hypergraph.edges and any(
            "attrs" in edge for edge in self.hypergraph.edges
        ):
            # collect all attribute keys to have tensors of same size
            edge_attr_keys = self.__collect_attr_keys(
                [edge.get("attrs", {}) for edge in self.hypergraph.edges]
            )

            edge_attr = torch.stack(
                [
                    self.transform_edge_attrs(
                        edge.get("attrs", {}), attr_keys=edge_attr_keys
                    )
                    for edge in self.hypergraph.edges
                ]
            )

            # Disabled: flatten to 1D if only one attribute (PyTorch Geometric standard)
            # if edge_attr.shape[1] == 1:
            #     edge_attr = edge_attr.squeeze(1)

        return HData(x, edge_index, edge_attr, num_nodes, num_edges)

    def transform_node_attrs(
        self, attrs: Dict[str, Any], attr_keys: List[str] | None = None
    ) -> Tensor:
        """
        Encode one node's attributes as a tensor by delegating to transform_attrs.

        Args:
            attrs: Attribute dictionary of a single node.
            attr_keys: Optional ordered keys to encode; forwarded unchanged.

        Returns:
            Tensor produced by transform_attrs.
        """
        # NOTE(review): presumably kept separate so node encoding can be overridden — confirm.
        return self.transform_attrs(attrs, attr_keys)

    def transform_edge_attrs(
        self, attrs: Dict[str, Any], attr_keys: List[str] | None = None
    ) -> Tensor:
        """
        Encode one hyperedge's attributes as a tensor by delegating to transform_attrs.

        Args:
            attrs: Attribute dictionary of a single hyperedge.
            attr_keys: Optional ordered keys to encode; forwarded unchanged.

        Returns:
            Tensor produced by transform_attrs.
        """
        # NOTE(review): presumably kept separate so edge encoding can be overridden — confirm.
        return self.transform_attrs(attrs, attr_keys)

    def transform_attrs(
        self, attrs: Dict[str, Any], attr_keys: List[str] | None = None
    ) -> Tensor:
        """
        Extract and encode numeric node attributes to tensor.
        Non-numeric attributes are discarded. Missing attributes are filled with 0.0.

        Args:
            attrs: Dictionary of node attributes
            attr_keys: Optional list of attribute keys to encode. If provided, ensures
                      consistent ordering and fill missing with 0.0.

        Returns:
            Tensor of numeric attribute values
        """
        numeric_attrs = {
            key: value
            for key, value in attrs.items()
            if isinstance(value, (int, float)) and not isinstance(value, bool)
        }

        if attr_keys is not None:
            values = [float(numeric_attrs.get(key, 0.0)) for key in attr_keys]
            return torch.tensor(values, dtype=torch.float)

        if not numeric_attrs:
            return torch.tensor([], dtype=torch.float)

        values = [float(value) for value in numeric_attrs.values()]
        return torch.tensor(values, dtype=torch.float)

    def __collect_attr_keys(self, attr_keys: List[Dict[str, Any]]) -> List[str]:
        """
        Collect unique numeric attribute keys from a list of attribute dictionaries.
        Args:
            attrs_list: List of attribute dictionaries.
        Returns:
            List of unique numeric attribute keys.
        """
        unique_keys = []
        for attrs in attr_keys:
            for key, value in attrs.items():
                if key not in unique_keys and isinstance(value, (int, float)):
                    unique_keys.append(key)

        return unique_keys

    def __get_node_ids_to_sample(self, id: int | List[int]) -> List[int]:
        if isinstance(id, int):
            return [id]

        if isinstance(id, list):
            if len(id) < 1:
                raise ValueError("Index list cannot be empty.")
            elif len(id) > self.__len__():
                raise ValueError(
                    "Index list length cannot exceed number of nodes in the hypergraph."
                )
            return list(set(id))

    def __validate_node_ids(self, node_ids: List[int]) -> None:
        for id in node_ids:
            if id < 0 or id >= self.__len__():
                raise IndexError(
                    f"Node ID {id} is out of bounds (0, {self.__len__() - 1})."
                )

    def __sample_edge_index(
        self,
        sampled_node_ids_list: List[int],
    ) -> Tuple[Tensor, Tensor, Tensor]:
        edge_index = self.hdata.edge_index
        node_ids = edge_index[0]
        edge_ids = edge_index[1]

        sampled_node_ids = torch.tensor(sampled_node_ids_list)

        # Find incidences where the node is in our sampled node set
        # Example: edge_index[0] = [0, 0, 1, 2, 3, 4], sampled_node_ids = [0, 3]
        #          -> node_incidence_mask = [True, True, False, False, True, False]
        node_incidence_mask = torch.isin(node_ids, sampled_node_ids)

        # Get unique hyperedges that have at least one sampled node
        # Example: edge_index[1] = [0, 0, 0, 1, 2, 2], node_incidence_mask = [True, True, False, False, True, False]
        #          -> sampled_edge_ids = [0, 2] as they connect to sampled nodes
        sampled_edge_ids = edge_ids[node_incidence_mask].unique()

        # Find all incidences for sampled nodes belonging to relevant hyperedges
        # Example: edge_index[1] = [0, 0, 0, 1, 2, 2], sampled_edge_ids = [0, 2]
        #          -> edge_incidence_mask = [True, True, True, False, True, True]
        edge_incidence_mask = torch.isin(edge_ids, sampled_edge_ids)

        # Incidence is kept if node is sampled AND hyperedge is relevant
        incidence_mask = node_incidence_mask & edge_incidence_mask

        # Keep only the incidences that match our mask
        # Example: edge_index = [[0, 0, 1, 2, 3, 4],
        #                        [0, 0, 0, 1, 2, 2]],
        #          incidence_mask = [True, True, False, False, True, False]
        #          -> sampled_edge_index = [[0, 0, 3],
        #                                   [0, 0, 2]]
        sampled_edge_index = edge_index[:, incidence_mask]

        return sampled_edge_index, sampled_node_ids, sampled_edge_ids

    def __new_edge_index(
        self,
        sampled_edge_index: Tensor,
        sampled_node_ids: Tensor,
        sampled_edge_ids: Tensor,
    ) -> Tensor:
        """
        Create new edge_index with 0-based node and edge IDs.
        Args:
            sampled_edge_index: Original edge_index tensor with sampled incidences.
            sampled_node_ids: List of sampled original node IDs.
            sampled_edge_ids: List of sampled original edge IDs.
        Returns:
            New edge_index tensor with 0-based node and edge IDs.
        """
        # Example: sampled_edge_index = [[1, 1, 3],
        #                                [0, 2, 2]]
        #          sampled_node_ids = [1, 3],
        #          sampled_edge_ids = [0, 2]
        #          -> new_node_ids = [0, 0, 1], new_edge_ids = [0, 1, 1]
        new_node_ids = self.__to_0based_ids(
            sampled_edge_index[0], sampled_node_ids, self.hdata.num_nodes
        )
        new_edge_ids = self.__to_0based_ids(
            sampled_edge_index[1], sampled_edge_ids, self.hdata.num_edges
        )

        # Example: new_node_ids = [0, 1], new_edge_ids = [0, 1]
        #          -> new_edge_index = [[0, 1],
        #                               [0, 1]]
        new_edge_index = torch.stack([new_node_ids, new_edge_ids], dim=0)
        return new_edge_index

    def __to_0based_ids(
        self,
        original_ids: Tensor,
        ids_to_keep: Tensor,
        n: int,
    ) -> Tensor:
        """
        Map original IDs to 0-based ids.
        Example:
            original_ids: [1, 3, 3, 7]
            ids_to_keep: [3, 7]
            n = 8                            # total number of elements (nodes or edges) in the original hypergraph
            Returned 0-based IDs: [0, 0, 1]  # the size is sum of occurrences of ids_to_keep in original_ids
        Args:
            original_ids: Tensor of original IDs.
            n: Total number of original IDs.
            ids_to_keep: List of selected original IDs to be mapped to 0-based.
        Returns:
            Tensor of 0-based ids.
        """
        id_to_0based_id = torch.zeros(n, dtype=torch.long)
        n_ids_to_keep = len(ids_to_keep)
        id_to_0based_id[ids_to_keep] = torch.arange(n_ids_to_keep)
        return id_to_0based_id[original_ids]

download()

Load the hypergraph from HIF format using HIFConverter class.

Source code in hyperbench/data/dataset.py
def download(self) -> HIFHypergraph:
    """
    Return the hypergraph, loading it through HIFConverter on first use.

    An already loaded, non-None ``hypergraph`` attribute is returned as is.
    """
    cached = getattr(self, "hypergraph", None)
    if cached is not None:
        return cached
    return HIFConverter.load_from_hif(self.DATASET_NAME, self.GDRIVE_FILE_ID)

process()

Process the loaded hypergraph into HData format, mapping HIF structure to tensors. Returns: HData: Processed hypergraph data.

Source code in hyperbench/data/dataset.py
def process(self) -> HData:
    """
    Process the loaded hypergraph into HData format, mapping HIF structure to tensors.

    Node features come from the numeric attributes found under each node's
    "attrs" (a single zero column when there are none). Incidences become a
    [2, num_incidences] edge_index with 0-based IDs. Edge features are built
    like node features, but only when at least one edge record has "attrs".

    Returns:
        HData: Processed hypergraph data.

    Raises:
        ValueError: If the hypergraph has no incidences.
    """
    num_nodes = len(self.hypergraph.nodes)
    num_edges = len(self.hypergraph.edges)

    # x: shape [num_nodes, num_node_features]
    # collect all attribute keys to have tensors of same size
    node_attr_keys = self.__collect_attr_keys(
        [node.get("attrs", {}) for node in self.hypergraph.nodes]
    )

    if node_attr_keys:
        x = torch.stack(
            [
                self.transform_node_attrs(
                    node.get("attrs", {}), attr_keys=node_attr_keys
                )
                for node in self.hypergraph.nodes
            ]
        )
    else:
        # Fallback to zeros if no numeric attributes
        x = torch.zeros((num_nodes, 1), dtype=torch.float)

    # remap node and edge IDs to 0-based contiguous IDs, numbered in order of
    # first appearance in the incidence list; the dicts give O(1) "already seen" checks
    # NOTE(review): rows of x (and of edge_attr below) follow the order of
    # hypergraph.nodes (hypergraph.edges), not this first-appearance order —
    # confirm both orders agree, otherwise features are attached to the wrong IDs.
    node_set = {}
    edge_set = {}
    node_ids = []
    edge_ids = []

    for inc in self.hypergraph.incidences:
        # an incidence without a "node" or "edge" key silently falls back to ID 0
        node = inc.get("node", 0)
        edge = inc.get("edge", 0)

        if node not in node_set:
            node_set[node] = len(node_set)
        if edge not in edge_set:
            edge_set[edge] = len(edge_set)

        node_ids.append(node_set[node])
        edge_ids.append(edge_set[edge])

    if len(node_ids) < 1:
        raise ValueError("Hypergraph has no incidences.")

    # edge_index: shape [2, E] where E is number of incidences
    edge_index = torch.tensor([node_ids, edge_ids], dtype=torch.long)

    # edge-attr: shape [num_edges, num_edge_attributes]
    # stays None unless at least one edge record carries an "attrs" entry
    edge_attr = None
    if self.hypergraph.edges and any(
        "attrs" in edge for edge in self.hypergraph.edges
    ):
        # collect all attribute keys to have tensors of same size
        edge_attr_keys = self.__collect_attr_keys(
            [edge.get("attrs", {}) for edge in self.hypergraph.edges]
        )

        edge_attr = torch.stack(
            [
                self.transform_edge_attrs(
                    edge.get("attrs", {}), attr_keys=edge_attr_keys
                )
                for edge in self.hypergraph.edges
            ]
        )

        # Disabled: flatten to 1D if only one attribute (PyTorch Geometric standard)
        # if edge_attr.shape[1] == 1:
        #     edge_attr = edge_attr.squeeze(1)

    return HData(x, edge_index, edge_attr, num_nodes, num_edges)

transform_attrs(attrs, attr_keys=None)

Extract and encode numeric node attributes to tensor. Non-numeric attributes are discarded. Missing attributes are filled with 0.0.

Args: attrs: Dictionary of node attributes attr_keys: Optional list of attribute keys to encode. If provided, ensures consistent ordering and fill missing with 0.0.

Returns: Tensor of numeric attribute values

Source code in hyperbench/data/dataset.py
def transform_attrs(
    self, attrs: Dict[str, Any], attr_keys: List[str] | None = None
) -> Tensor:
    """
    Encode the numeric entries of an attribute dictionary as a float tensor.

    Only int and float values count as numeric; booleans and every other
    type are ignored.

    Args:
        attrs: Attribute dictionary of a node or hyperedge.
        attr_keys: Optional ordered list of keys to encode. When given, the
            tensor has one entry per key, in that order, with 0.0 for keys
            that are missing or not numeric. When omitted, all numeric
            values are encoded in dictionary order.

    Returns:
        1-D float tensor of the encoded values (empty when there are none).
    """
    usable = {}
    for name, raw in attrs.items():
        # bool is a subclass of int, so it has to be ruled out explicitly.
        if isinstance(raw, bool) or not isinstance(raw, (int, float)):
            continue
        usable[name] = raw

    if attr_keys is None:
        encoded = [float(raw) for raw in usable.values()]
    else:
        encoded = [float(usable.get(name, 0.0)) for name in attr_keys]

    return torch.tensor(encoded, dtype=torch.float)

__collect_attr_keys(attr_keys)

Collect unique numeric attribute keys from a list of attribute dictionaries. Args: attr_keys: List of attribute dictionaries. Returns: List of unique numeric attribute keys.

Source code in hyperbench/data/dataset.py
def __collect_attr_keys(self, attr_keys: List[Dict[str, Any]]) -> List[str]:
    """
    Collect unique numeric attribute keys from a list of attribute dictionaries.

    A key is kept when at least one dictionary maps it to an int or float.
    Booleans do not count as numeric, matching transform_attrs, which
    discards them; otherwise a bool-only key would yield an all-zero column.

    Args:
        attr_keys: List of attribute dictionaries.
    Returns:
        List of unique numeric attribute keys, in order of first appearance.
    """
    # A dict keeps first-seen order and gives O(1) duplicate checks.
    seen: Dict[str, None] = {}
    for attrs in attr_keys:
        for key, value in attrs.items():
            # bool is a subclass of int, so it has to be ruled out explicitly.
            if isinstance(value, bool) or not isinstance(value, (int, float)):
                continue
            seen.setdefault(key, None)

    return list(seen)

__new_edge_index(sampled_edge_index, sampled_node_ids, sampled_edge_ids)

Create new edge_index with 0-based node and edge IDs. Args: sampled_edge_index: Original edge_index tensor with sampled incidences. sampled_node_ids: List of sampled original node IDs. sampled_edge_ids: List of sampled original edge IDs. Returns: New edge_index tensor with 0-based node and edge IDs.

Source code in hyperbench/data/dataset.py
def __new_edge_index(
    self,
    sampled_edge_index: Tensor,
    sampled_node_ids: Tensor,
    sampled_edge_ids: Tensor,
) -> Tensor:
    """
    Renumber a sampled edge_index so node and edge IDs start at 0.

    Args:
        sampled_edge_index: Sampled incidences, still using original IDs.
        sampled_node_ids: Original IDs of the sampled nodes.
        sampled_edge_ids: Original IDs of the sampled hyperedges.
    Returns:
        edge_index tensor of the same shape with 0-based node and edge IDs.
    """
    full = self.hdata

    # Example: sampled_edge_index = [[1, 1, 3],
    #                                [0, 2, 2]]
    #          sampled_node_ids = [1, 3],
    #          sampled_edge_ids = [0, 2]
    #          -> node row = [0, 0, 1], edge row = [0, 1, 1]
    node_row = self.__to_0based_ids(
        sampled_edge_index[0], sampled_node_ids, full.num_nodes
    )
    edge_row = self.__to_0based_ids(
        sampled_edge_index[1], sampled_edge_ids, full.num_edges
    )

    # Continuing the example: -> [[0, 0, 1],
    #                             [0, 1, 1]]
    return torch.stack([node_row, edge_row], dim=0)

__to_0based_ids(original_ids, ids_to_keep, n)

Map original IDs to 0-based ids. Example: original_ids: [3, 3, 7] ids_to_keep: [3, 7] n = 8 # total number of elements (nodes or edges) in the original hypergraph Returned 0-based IDs: [0, 0, 1] # one entry per element of original_ids; an ID that is not in ids_to_keep maps to 0 Args: original_ids: Tensor of original IDs. n: Total number of original IDs. ids_to_keep: Tensor of selected original IDs to be mapped to 0-based. Returns: Tensor of 0-based ids.

Source code in hyperbench/data/dataset.py
def __to_0based_ids(
    self,
    original_ids: Tensor,
    ids_to_keep: Tensor,
    n: int,
) -> Tensor:
    """
    Map original IDs to 0-based ids.

    Each ID in ids_to_keep is assigned its position in ids_to_keep, and
    every entry of original_ids is replaced by the ID assigned to it.
    Example:
        original_ids: [3, 3, 7]
        ids_to_keep: [3, 7]
        n = 8                            # total number of elements (nodes or edges) in the original hypergraph
        Returned 0-based IDs: [0, 0, 1]  # one entry per element of original_ids
    An original ID that is not in ids_to_keep maps to 0.
    Args:
        original_ids: Tensor of original IDs.
        ids_to_keep: Tensor of selected original IDs to be mapped to 0-based.
        n: Total number of original IDs.
    Returns:
        Tensor of 0-based ids, same shape as original_ids.
    """
    # Lookup table indexed by original ID; slots of unselected IDs stay 0.
    lookup = torch.zeros(n, dtype=torch.long)
    lookup[ids_to_keep] = torch.arange(len(ids_to_keep))
    return lookup[original_ids]

Types Module

HData

hyperbench.types.hdata

HData

Container for hypergraph data.

Attributes: x (Tensor): Node feature matrix of shape [num_nodes, num_features]. edge_index (Tensor): Hyperedge connectivity in COO format of shape [2, num_incidences], where edge_index[0] contains node IDs and edge_index[1] contains hyperedge IDs. edge_attr (Tensor, optional): Hyperedge feature matrix of shape [num_edges, num_edge_features]. Features associated with each hyperedge (e.g., weights, timestamps, types). num_nodes (int, optional): Number of nodes in the hypergraph. If None, inferred as x.size(0). num_edges (int, optional): Number of hyperedges in the hypergraph. If None, inferred as edge_index[1].max().item() + 1.

Example: >>> x = torch.randn(10, 16) # 10 nodes with 16 features each >>> edge_index = torch.tensor([[0, 0, 1, 1, 1], # node IDs ... [0, 1, 2, 3, 4]]) # hyperedge IDs >>> data = HData(x, edge_index=edge_index)

Source code in hyperbench/types/hdata.py
class HData:
    """Container for hypergraph data.

    Attributes:
        x (Tensor): Node feature matrix of shape [num_nodes, num_features].
        edge_index (Tensor): Hyperedge connectivity in COO format
            of shape [2, num_incidences], where edge_index[0] contains
            node IDs and edge_index[1] contains hyperedge IDs.
        edge_attr (Tensor, optional): Hyperedge feature matrix of shape
            [num_edges, num_edge_features]. Features associated with each
            hyperedge (e.g., weights, timestamps, types).
        num_nodes (int, optional): Number of nodes in the hypergraph.
            If None, inferred as x.size(0).
        num_edges (int, optional): Number of hyperedges in the hypergraph.
            If None, inferred as edge_index[1].max().item() + 1, or 0 when
            edge_index has no incidences.

    Example:
        >>> x = torch.randn(10, 16)  # 10 nodes with 16 features each
        >>> edge_index = torch.tensor([[0, 0, 1, 1, 1],  # node IDs
        ...                            [0, 1, 2, 3, 4]]) # hyperedge IDs
        >>> data = HData(x, edge_index=edge_index)
    """

    def __init__(
        self,
        x: Tensor,
        edge_index: Tensor,
        edge_attr: Tensor | None = None,
        num_nodes: int | None = None,
        num_edges: int | None = None,
    ):
        self.x: Tensor = x
        self.edge_index: Tensor = edge_index
        self.edge_attr: Tensor | None = edge_attr

        self.num_nodes: int = num_nodes if num_nodes is not None else x.size(0)

        if num_edges is None:
            # Infer the count only when the caller did not supply it: max()
            # followed by item() is a full reduction over the incidence list.
            if edge_index.size(1) > 0:
                num_edges = int(edge_index[1].max().item()) + 1
            else:
                num_edges = 0
        self.num_edges: int = num_edges

Hypergraph

hyperbench.types.hypergraph

HIFHypergraph

A hypergraph data structure that supports directed/undirected hyperedges with incidence-based representation.

Source code in hyperbench/types/hypergraph.py
class HIFHypergraph:
    """
    Incidence-based hypergraph container mirroring the sections of a HIF
    document: network type, metadata, incidences, nodes and edges.
    """

    def __init__(
        self,
        network_type: Optional[Literal["asc", "directed", "undirected"]] = None,
        metadata: Optional[Dict[str, Any]] = None,
        incidences: Optional[List[Dict[str, Any]]] = None,
        nodes: Optional[List[Dict[str, Any]]] = None,
        edges: Optional[List[Dict[str, Any]]] = None,
    ):
        """
        Store the given sections; any section passed as None becomes a fresh
        empty container. Provided containers are stored as is, not copied.
        """
        if metadata is None:
            metadata = {}
        if incidences is None:
            incidences = []
        if nodes is None:
            nodes = []
        if edges is None:
            edges = []

        self.network_type = network_type
        self.metadata = metadata
        self.incidences = incidences
        self.nodes = nodes
        self.edges = edges

    @classmethod
    def from_hif(cls, data: Dict[str, Any]) -> "HIFHypergraph":
        """
        Build a hypergraph from a parsed HIF (Hypergraph Interchange Format) document.

        Args:
            data: Dictionary with keys: network-type, metadata, incidences, nodes, edges.
                The spelling "network_type" is accepted as a fallback, and
                missing sections default to empty containers.

        Returns:
            Hypergraph instance
        """
        return cls(
            network_type=data.get("network-type") or data.get("network_type"),
            metadata=data.get("metadata", {}),
            incidences=data.get("incidences", []),
            nodes=data.get("nodes", []),
            edges=data.get("edges", []),
        )

    @property
    def num_nodes(self) -> int:
        """Return the number of nodes in the hypergraph."""
        return len(self.nodes)

    @property
    def num_edges(self) -> int:
        """Return the number of edges in the hypergraph."""
        return len(self.edges)

num_nodes property

Return the number of nodes in the hypergraph.

num_edges property

Return the number of edges in the hypergraph.

from_hif(data) classmethod

Create a Hypergraph from a HIF (Hypergraph Interchange Format).

Args: data: Dictionary with keys: network-type, metadata, incidences, nodes, edges

Returns: Hypergraph instance

Source code in hyperbench/types/hypergraph.py
@classmethod
def from_hif(cls, data: Dict[str, Any]) -> "HIFHypergraph":
    """
    Build a hypergraph from a parsed HIF (Hypergraph Interchange Format) document.

    Args:
        data: Dictionary with keys: network-type, metadata, incidences, nodes, edges.
            The spelling "network_type" is accepted as a fallback, and
            missing sections default to empty containers.

    Returns:
        Hypergraph instance
    """
    return cls(
        network_type=data.get("network-type") or data.get("network_type"),
        metadata=data.get("metadata", {}),
        incidences=data.get("incidences", []),
        nodes=data.get("nodes", []),
        edges=data.get("edges", []),
    )

Utils Module

Data Utils

hyperbench.utils.data_utils

HIF Utils

hyperbench.utils.hif_utils

Data Loader

hyperbench.data.loader

DataLoader

Bases: DataLoader

Source code in hyperbench/data/loader.py
class DataLoader(TorchDataLoader):
    """Data loader that merges each batch of HData samples via :meth:`collate`.

    Node features are concatenated and hyperedge indices are re-numbered so
    that the samples of a batch do not collide in the batched representation.
    """

    def __init__(
        self, dataset: Dataset, batch_size: int = 1, shuffle: bool = False, **kwargs
    ) -> None:
        """Forward the arguments to the parent loader with ``collate`` as ``collate_fn``.

        Args:
            dataset: Dataset the samples are drawn from.
            batch_size: Forwarded as the parent's ``batch_size`` argument.
            shuffle: Forwarded as the parent's ``shuffle`` argument.
            **kwargs: Extra keyword arguments forwarded unchanged to the parent
                constructor. ``collate_fn`` is already supplied here, so passing
                it again raises ``TypeError``.
        """
        super().__init__(
            dataset=dataset,
            batch_size=batch_size,
            shuffle=shuffle,
            collate_fn=self.collate,
            **kwargs,
        )

    def collate(self, batch: List[HData]) -> HData:
        """Collates a list of HData objects into a single batched HData object.

        This function combines multiple separate hypergraph samples into a single
        batched representation suitable for mini-batch training. It handles:
        - Concatenating node features from all samples
        - Concatenating and offsetting hyperedge indices from all samples
        - Concatenating edge attributes from all samples, if present

        Example:
            Given batch = [HData_0, HData_1].

            For node features:
                HData_0.x.shape = (3, 64) # 3 nodes with 64 features
                HData_1.x.shape = (2, 64) # 2 nodes with 64 features

                The output will be HData with:
                    x.shape = (5, 64) # All 5 nodes concatenated

            For edge index:
                HData_0 (3 nodes, 2 hyperedges):
                    edge_index = [[0, 1, 1, 2], # Nodes 0, 1, 1, 2
                                  [0, 0, 1, 1]] # Hyperedge 0 contains {0,1}, Hyperedge 1 contains {1,2}

                HData_1 (2 nodes, 1 hyperedge):
                    edge_index = [[0, 1], # Nodes 0, 1
                                  [0, 0]] # Hyperedge 0 contains {0,1}

                Batched result:
                    edge_index = [[0, 1, 1, 2, 3, 4], # Node indices: original then offset by 3, so 0->3, 1->4
                                  [0, 0, 1, 1, 2, 2]] # Hyperedge IDs: original then offset by 2, so 0->2, 0->2
                                   ^^^^^^^^^^  ^^^^
                                   Sample 0    Sample 1 (nodes +3, edges +2)

        Args:
            batch: List of HData objects to collate.

        Returns:
            HData: A single HData object containing the batched data.
        """
        node_features, total_nodes = self.__batch_node_features(batch)
        edge_index, edge_attr, total_edges = self.__batch_edges(batch)

        batched_data = HData(
            x=node_features,
            edge_index=edge_index,
            edge_attr=edge_attr,
            num_nodes=total_nodes,
            num_edges=total_edges,
        )

        return batched_data

    def __batch_node_features(self, batch: List[HData]) -> Tuple[Tensor, int]:
        """Concatenates node features from all samples in the batch.

        Example:
            With shape being (num_nodes_in_sample, num_features).

            If batch contains 3 samples with node features:
                Sample 0: x = [[1, 2], [3, 4]]           , shape: (2, 2)
                Sample 1: x = [[5, 6]]                   , shape: (1, 2)
                Sample 2: x = [[7, 8], [9, 10], [11, 12]], shape: (3, 2)

            Result:
                x: [[1, 2], [3, 4], [5, 6], [7, 8], [9, 10], [11, 12]]
                shape: (6, 2), where 6 = 2 + 1 + 3 total nodes.

        Args:
            batch: List of HData objects.

        Returns:
            Tuple containing:
                - batched_node_features: Concatenated node features with shape (total_nodes, num_features)
                - total_nodes: Size of the first dimension of the concatenated tensor
        """
        per_sample_node_features = [data.x for data in batch]

        # Concatenate along dim 0 (the node dimension) into a single tensor
        batched_node_features = torch.cat(per_sample_node_features, dim=0)
        total_nodes = batched_node_features.size(0)

        return batched_node_features, total_nodes

    def __batch_edges(self, batch: List[HData]) -> Tuple[Tensor, Tensor | None, int]:
        """Batches hyperedge indices and attributes, adjusting indices for concatenated nodes.
        Hyperedge indices must be offset so they point to the correct nodes in the batched node tensor.

        Example:
            Sample 0 (3 nodes, 2 hyperedges):
                edge_index = [[0, 1, 1, 2], # Nodes 0, 1, 1, 2
                              [0, 0, 1, 1]] # Hyperedge 0 contains {0,1}, Hyperedge 1 contains {1,2}
                node_offset = 0
                edge_offset = 0

            Sample 1 (2 nodes, 1 hyperedge):
                edge_index = [[0, 1], # Nodes 0, 1
                              [0, 0]] # Hyperedge 0 contains {0,1}
                node_offset = 3 # Previous samples have 3 nodes total
                edge_offset = 2 # Previous samples have 2 hyperedges total
            Result:
                edge_index = [[0, 1, 1, 2, 3, 4], # Node indices: original then offset by 3, so 0->3, 1->4
                              [0, 0, 1, 1, 2, 2]] # Hyperedge IDs: original then offset by 2, so 0->2, 0->2
                               ^^^^^^^^^^  ^^^^
                               Sample 0    Sample 1 (nodes +3, edges +2)

        Args:
            batch: List of HData objects.

        Returns:
            Tuple containing:
                - batched_edge_index: Concatenated and offset hyperedge indices
                - batched_edge_attr: Concatenated hyperedge attributes, or None if no sample has edge attributes
                - total_edges: Total number of hyperedges across all batched samples, i.e. the sum of the
                  per-sample hyperedge counts that were used as offsets
        """
        edge_indexes = []
        edge_attrs = []
        node_offset = 0
        edge_offset = 0

        for data in batch:
            # Offset nodes and hyperedge IDs (indices) on a copy, so the sample itself is not modified
            offset_edge_index = data.edge_index.clone()
            offset_edge_index[0] += node_offset
            offset_edge_index[1] += edge_offset
            edge_indexes.append(offset_edge_index)

            # NOTE: samples whose edge_attr is None are skipped, so the rows of the batched
            # attributes line up with hyperedge IDs only when all (or none) of the samples carry attributes
            if data.edge_attr is not None:
                edge_attrs.append(data.edge_attr)

            # Offset for the next sample: prefer the declared hyperedge count and only fall back to
            # (max hyperedge ID + 1) when it is missing; an empty edge_index contributes no hyperedges
            if data.num_edges is not None:
                edge_offset += data.num_edges
            elif data.edge_index.size(1) > 0:
                edge_offset += int(data.edge_index[1].max().item()) + 1

            # Offset for the next sample: x has shape (num_nodes, num_features), so x.size(0) is the number of nodes
            node_offset += (
                data.num_nodes if data.num_nodes is not None else data.x.size(0)
            )

        # Concatenate all edge_index tensors along the incidence dimension, so that we get a shape of (2, total_incidences)
        batched_edge_index = torch.cat(edge_indexes, dim=1)

        # Use the accumulated per-sample counts rather than the largest ID present in edge_index:
        # hyperedges declared through num_edges but without any incidence must still be counted,
        # otherwise the total disagrees with the offsets applied above
        total_edges = int(edge_offset)

        batched_edge_attr = None
        if len(edge_attrs) > 0:
            # Concatenate hyperedge attributes along dimension 0 (the hyperedge dimension)
            # edge_attr typically has shape (num_edges, num_edge_features)
            # Result shape: (total_edges, num_edge_features)
            batched_edge_attr = torch.cat(edge_attrs, dim=0)

        return batched_edge_index, batched_edge_attr, total_edges

collate(batch)

Collates a list of HData objects into a single batched HData object.

This function combines multiple separate hypergraph samples into a single batched representation suitable for mini-batch training. It handles:

- Concatenating node features from all samples
- Concatenating and offsetting hyperedge indices from all samples
- Concatenating edge attributes from all samples, if present

Example: Given batch = [HData_0, HData_1].

For node features:
    HData_0.x.shape = (3, 64) # 3 nodes with 64 features
    HData_1.x.shape = (2, 64) # 2 nodes with 64 features

    The output will be HData with:
        x.shape = (5, 64) # All 5 nodes concatenated

For edge index:
    HData_0 (3 nodes, 2 hyperedges):
        edge_index = [[0, 1, 1, 2], # Nodes 0, 1, 1, 2
                      [0, 0, 1, 1]] # Hyperedge 0 contains {0,1}, Hyperedge 1 contains {1,2}

    HData_1 (2 nodes, 1 hyperedge):
        edge_index = [[0, 1], # Nodes 0, 1
                      [0, 0]] # Hyperedge 0 contains {0,1}

    Batched result:
        edge_index = [[0, 1, 1, 2, 3, 4], # Node indices: original then offset by 3, so 0->3, 1->4
                      [0, 0, 1, 1, 2, 2]] # Hyperedge IDs: original then offset by 2, so 0->2, 0->2
                       ^^^^^^^^^^  ^^^^
                       Sample 0    Sample 1 (nodes +3, edges +2)

Args: batch: List of HData objects to collate.

Returns: HData: A single HData object containing the batched data.

Source code in hyperbench/data/loader.py
def collate(self, batch: List[HData]) -> HData:
    """Merge a list of HData samples into one batched HData object.

    The samples are combined into a single representation suitable for
    mini-batch training:
    - node features of all samples are concatenated
    - hyperedge indices of all samples are offset and concatenated
    - edge attributes of all samples are concatenated, if present

    Example:
        Given batch = [HData_0, HData_1].

        For node features:
            HData_0.x.shape = (3, 64) # 3 nodes with 64 features
            HData_1.x.shape = (2, 64) # 2 nodes with 64 features

            The output will be HData with:
                x.shape = (5, 64) # All 5 nodes concatenated

        For edge index:
            HData_0 (3 nodes, 2 hyperedges):
                edge_index = [[0, 1, 1, 2], # Nodes 0, 1, 1, 2
                              [0, 0, 1, 1]] # Hyperedge 0 contains {0,1}, Hyperedge 1 contains {1,2}

            HData_1 (2 nodes, 1 hyperedge):
                edge_index = [[0, 1], # Nodes 0, 1
                              [0, 0]] # Hyperedge 0 contains {0,1}

            Batched result:
                edge_index = [[0, 1, 1, 2, 3, 4], # Node indices: original then offset by 3, so 0->3, 1->4
                              [0, 0, 1, 1, 2, 2]] # Hyperedge IDs: original then offset by 2, so 0->2, 0->2
                               ^^^^^^^^^^  ^^^^
                               Sample 0    Sample 1 (nodes +3, edges +2)

    Args:
        batch: List of HData objects to collate.

    Returns:
        HData: A single HData object containing the batched data.
    """
    # Node features are batched first, then the hyperedge structure
    x, num_nodes = self.__batch_node_features(batch)
    edge_index, edge_attr, num_edges = self.__batch_edges(batch)

    return HData(
        x=x,
        edge_index=edge_index,
        edge_attr=edge_attr,
        num_nodes=num_nodes,
        num_edges=num_edges,
    )

__batch_node_features(batch)

Concatenates node features from all samples in the batch.

Example: With shape being (num_nodes_in_sample, num_features).

If batch contains 3 samples with node features:
    Sample 0: x = [[1, 2], [3, 4]]           , shape: (2, 2)
    Sample 1: x = [[5, 6]]                   , shape: (1, 2)
    Sample 2: x = [[7, 8], [9, 10], [11, 12]], shape: (3, 2)

Result:
    x: [[1, 2], [3, 4], [5, 6], [7, 8], [9, 10], [11, 12]]
    shape: (6, 2), where 6 = 2 + 1 + 3 total nodes.

Args: batch: List of HData objects.

Returns: Tuple[Tensor, int]: Concatenated node features with shape (total_nodes, num_features), and the total number of nodes.

Source code in hyperbench/data/loader.py
def __batch_node_features(self, batch: List[HData]) -> Tuple[Tensor, int]:
    """Concatenate the node features of every sample in the batch.

    Example:
        With shape being (num_nodes_in_sample, num_features).

        If batch contains 3 samples with node features:
            Sample 0: x = [[1, 2], [3, 4]]           , shape: (2, 2)
            Sample 1: x = [[5, 6]]                   , shape: (1, 2)
            Sample 2: x = [[7, 8], [9, 10], [11, 12]], shape: (3, 2)

        Result:
            x: [[1, 2], [3, 4], [5, 6], [7, 8], [9, 10], [11, 12]]
            shape: (6, 2), where 6 = 2 + 1 + 3 total nodes.

    Args:
        batch: List of HData objects.

    Returns:
        Tuple containing:
            - the concatenated node features with shape (total_nodes, num_features)
            - total_nodes, the size of the first dimension of that tensor
    """
    # Join along dim 0 (the node dimension); the feature dimension is untouched
    merged = torch.cat([sample.x for sample in batch], dim=0)
    return merged, merged.size(0)

__batch_edges(batch)

Batches hyperedge indices and attributes, adjusting indices for concatenated nodes. Hyperedge indices must be offset so they point to the correct nodes in the batched node tensor.

Example:

Sample 0 (3 nodes, 2 hyperedges):
    edge_index = [[0, 1, 1, 2], # Nodes 0, 1, 1, 2
                  [0, 0, 1, 1]] # Hyperedge 0 contains {0,1}, Hyperedge 1 contains {1,2}
    node_offset = 0
    edge_offset = 0

Sample 1 (2 nodes, 1 hyperedge):
    edge_index = [[0, 1], # Nodes 0, 1
                  [0, 0]] # Hyperedge 0 contains {0,1}
    node_offset = 3 # Previous samples have 3 nodes total
    edge_offset = 2 # Previous samples have 2 hyperedges total
Result:
    edge_index = [[0, 1, 1, 2, 3, 4], # Node indices: original then offset by 3, so 0->3, 1->4
                  [0, 0, 1, 1, 2, 2]] # Hyperedge IDs: original then offset by 2, so 0->2, 0->2
                   ^^^^^^^^^^  ^^^^
                   Sample 0    Sample 1 (nodes +3, edges +2)

Args: batch: List of HData objects.

Returns: Tuple containing:

- batched_edge_index: Concatenated and offset hyperedge indices
- batched_edge_attr: Concatenated hyperedge attributes, or None if no sample has edge attributes
- total_edges: Total number of hyperedges across all batched samples

Source code in hyperbench/data/loader.py
def __batch_edges(self, batch: List[HData]) -> Tuple[Tensor, Tensor | None, int]:
    """Batches hyperedge indices and attributes, adjusting indices for concatenated nodes.
    Hyperedge indices must be offset so they point to the correct nodes in the batched node tensor.

    Example:
        Sample 0 (3 nodes, 2 hyperedges):
            edge_index = [[0, 1, 1, 2], # Nodes 0, 1, 1, 2
                          [0, 0, 1, 1]] # Hyperedge 0 contains {0,1}, Hyperedge 1 contains {1,2}
            node_offset = 0
            edge_offset = 0

        Sample 1 (2 nodes, 1 hyperedge):
            edge_index = [[0, 1], # Nodes 0, 1
                          [0, 0]] # Hyperedge 0 contains {0,1}
            node_offset = 3 # Previous samples have 3 nodes total
            edge_offset = 2 # Previous samples have 2 hyperedges total
        Result:
            edge_index = [[0, 1, 1, 2, 3, 4], # Node indices: original then offset by 3, so 0->3, 1->4
                          [0, 0, 1, 1, 2, 2]] # Hyperedge IDs: original then offset by 2, so 0->2, 0->2
                           ^^^^^^^^^^  ^^^^
                           Sample 0    Sample 1 (nodes +3, edges +2)

    Args:
        batch: List of HData objects.

    Returns:
        Tuple containing:
            - batched_edge_index: Concatenated and offset hyperedge indices
            - batched_edge_attr: Concatenated hyperedge attributes, or None if no sample has edge attributes
            - total_edges: Total number of hyperedges across all batched samples, i.e. the sum of the
              per-sample hyperedge counts that were used as offsets
    """
    edge_indexes = []
    edge_attrs = []
    node_offset = 0
    edge_offset = 0

    for data in batch:
        # Offset nodes and hyperedge IDs (indices) on a copy, so the sample itself is not modified
        offset_edge_index = data.edge_index.clone()
        offset_edge_index[0] += node_offset
        offset_edge_index[1] += edge_offset
        edge_indexes.append(offset_edge_index)

        # NOTE: samples whose edge_attr is None are skipped, so the rows of the batched
        # attributes line up with hyperedge IDs only when all (or none) of the samples carry attributes
        if data.edge_attr is not None:
            edge_attrs.append(data.edge_attr)

        # Offset for the next sample: prefer the declared hyperedge count and only fall back to
        # (max hyperedge ID + 1) when it is missing; an empty edge_index contributes no hyperedges
        if data.num_edges is not None:
            edge_offset += data.num_edges
        elif data.edge_index.size(1) > 0:
            edge_offset += int(data.edge_index[1].max().item()) + 1

        # Offset for the next sample: x has shape (num_nodes, num_features), so x.size(0) is the number of nodes
        node_offset += (
            data.num_nodes if data.num_nodes is not None else data.x.size(0)
        )

    # Concatenate all edge_index tensors along the incidence dimension, so that we get a shape of (2, total_incidences)
    batched_edge_index = torch.cat(edge_indexes, dim=1)

    # Use the accumulated per-sample counts rather than the largest ID present in edge_index:
    # hyperedges declared through num_edges but without any incidence must still be counted,
    # otherwise the total disagrees with the offsets applied above
    total_edges = int(edge_offset)

    batched_edge_attr = None
    if len(edge_attrs) > 0:
        # Concatenate hyperedge attributes along dimension 0 (the hyperedge dimension)
        # edge_attr typically has shape (num_edges, num_edge_features)
        # Result shape: (total_edges, num_edge_features)
        batched_edge_attr = torch.cat(edge_attrs, dim=0)

    return batched_edge_index, batched_edge_attr, total_edges