Skip to content

API Reference

Complete API documentation for all Hyperbench modules.

Data Module

hyperbench.data.dataset

DatasetNames

Bases: Enum

Enumeration of available datasets.

Source code in hyperbench/data/dataset.py
class DatasetNames(Enum):
    """
    Enumeration of available datasets.
    """

    # Each member's value is the dataset's file basename: HIFConverter resolves
    # it to "<value>.json.zst", either on disk under datasets/ or in the remote
    # GitHub datasets repository. Values are case-sensitive and must match the
    # published filenames exactly (note the mixed-case entries below).
    ALGEBRA = "algebra"
    AMAZON = "amazon"
    CONTACT_HIGH_SCHOOL = "contact-high-school"
    CONTACT_PRIMARY_SCHOOL = "contact-primary-school"
    CORA = "cora"
    COURSERA = "coursera"
    DBLP = "dblp"
    EMAIL_ENRON = "email-Enron"
    EMAIL_W3C = "email-W3C"
    GEOMETRY = "geometry"
    GOT = "got"
    IMDB = "imdb"
    MUSIC_BLUES_REVIEWS = "music-blues-reviews"
    NBA = "nba"
    NDC_CLASSES = "NDC-classes"
    NDC_SUBSTANCES = "NDC-substances"
    PATENT = "patent"
    PUBMED = "pubmed"
    RESTAURANT_REVIEWS = "restaurant-reviews"
    THREADS_ASK_UBUNTU = "threads-ask-ubuntu"
    THREADS_MATH_SX = "threads-math-sx"
    TWITTER = "twitter"
    VEGAS_BARS_REVIEWS = "vegas-bars-reviews"

HIFConverter

A utility class to load hypergraphs from HIF format.

Source code in hyperbench/data/dataset.py
class HIFConverter:
    """A utility class to load hypergraphs from HIF format."""

    # Seconds to wait for the GitHub download before giving up; without a
    # timeout, requests.get can block forever on an unresponsive server.
    DOWNLOAD_TIMEOUT_SECONDS = 60

    @staticmethod
    def load_from_hif(dataset_name: Optional[str], save_on_disk: bool = False) -> HIFHypergraph:
        """
        Load a hypergraph from a zstd-compressed HIF JSON file.

        The file is looked up in the local ``datasets/`` directory next to this
        module; on a cache miss it is downloaded from the GitHub datasets
        repository. Temporary files created during download/decompression are
        always removed (previously they leaked via ``delete=False``).

        Args:
            dataset_name: A member name of :class:`DatasetNames` (e.g. ``"CORA"``).
            save_on_disk: If ``True``, persist the downloaded ``.json.zst`` file
                under ``datasets/`` for reuse; otherwise keep it only temporarily.

        Returns:
            The loaded :class:`HIFHypergraph`.

        Raises:
            ValueError: If ``dataset_name`` is ``None`` or unknown, if the
                download fails, or if the file is not HIF-compliant.
        """
        if dataset_name is None:
            raise ValueError(f"Dataset name (provided: {dataset_name}) must be provided.")
        if dataset_name not in DatasetNames.__members__:
            raise ValueError(f"Dataset '{dataset_name}' not found.")

        dataset_name = DatasetNames[dataset_name].value
        current_dir = os.path.dirname(os.path.abspath(__file__))
        zst_filename = os.path.join(current_dir, "datasets", f"{dataset_name}.json.zst")

        # Temporary files created along the way; removed in the finally block.
        temp_paths: List[str] = []
        try:
            if not os.path.exists(zst_filename):
                github_dataset_repo = f"https://github.com/hypernetwork-research-group/datasets/blob/main/{dataset_name}.json.zst?raw=true"

                response = requests.get(
                    github_dataset_repo, timeout=HIFConverter.DOWNLOAD_TIMEOUT_SECONDS
                )
                if response.status_code != 200:
                    raise ValueError(
                        f"Failed to download dataset '{dataset_name}' from GitHub. Status code: {response.status_code}"
                    )

                if save_on_disk:
                    os.makedirs(os.path.join(current_dir, "datasets"), exist_ok=True)
                    with open(zst_filename, "wb") as f:
                        f.write(response.content)
                else:
                    # Create temporary file for downloaded zst content
                    with tempfile.NamedTemporaryFile(
                        mode="wb", suffix=".json.zst", delete=False
                    ) as tmp_zst_file:
                        tmp_zst_file.write(response.content)
                        zst_filename = tmp_zst_file.name
                    temp_paths.append(zst_filename)

            # Decompress the downloaded zst file into a temporary JSON file
            dctx = zstd.ZstdDecompressor()
            with (
                open(zst_filename, "rb") as input_f,
                tempfile.NamedTemporaryFile(mode="wb", suffix=".json", delete=False) as tmp_file,
            ):
                dctx.copy_stream(input_f, tmp_file)
                output = tmp_file.name
            temp_paths.append(output)

            with open(output, "r") as f:
                hiftext = json.load(f)
            if not validate_hif_json(output):
                raise ValueError(f"Dataset '{dataset_name}' is not HIF-compliant.")
        finally:
            # Clean up every temporary artifact, even on failure; the persisted
            # datasets/ copy (save_on_disk=True) is intentionally not listed here.
            for path in temp_paths:
                try:
                    os.remove(path)
                except OSError:
                    pass

        hypergraph = HIFHypergraph.from_hif(hiftext)
        return hypergraph

Dataset

Bases: Dataset

A dataset class for loading and processing hypergraph data.

Attributes:

Name Type Description
DATASET_NAME

Class variable indicating the name of the dataset to load.

hypergraph

The loaded hypergraph in HIF format. Can be None if initialized from an HData object.

hdata

The processed hypergraph data in HData format.

sampling_strategy

The strategy used for sampling sub-hypergraphs (e.g., by node IDs or hyperedge IDs). If not provided, defaults to SamplingStrategy.HYPEREDGE.

Source code in hyperbench/data/dataset.py
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
class Dataset(TorchDataset):
    """
    A dataset class for loading and processing hypergraph data.

    Attributes:
        DATASET_NAME: Class variable indicating the name of the dataset to load.
        hypergraph: The loaded hypergraph in HIF format. Can be ``None`` if initialized from an HData object.
        hdata: The processed hypergraph data in HData format.
        sampling_strategy: The strategy used for sampling sub-hypergraphs (e.g., by node IDs or hyperedge IDs).
            If not provided, defaults to ``SamplingStrategy.HYPEREDGE``.
    """

    DATASET_NAME = None

    def __init__(
        self,
        hdata: Optional[HData] = None,
        sampling_strategy: SamplingStrategy = SamplingStrategy.HYPEREDGE,
        prepare: bool = True,
    ) -> None:
        """
        Initialize the Dataset.

        Args:
            hdata: Optional HData object to initialize the dataset with.
                If provided, the dataset will be initialized with this data instead of loading and processing from HIF. Must be provided if prepare is set to ``False``.
            sampling_strategy: The sampling strategy to use for the dataset. If not provided, defaults to ``SamplingStrategy.HYPEREDGE``.
            prepare: Whether to load and process the original dataset from HIF format.
                If set to ``False``, the dataset will be initialized with the provided hdata instead. Defaults to ``True``.
        """
        self.__is_prepared = prepare
        self.__sampler = create_sampler_from_strategy(sampling_strategy)
        self.sampling_strategy = sampling_strategy

        if self.__is_prepared:
            # prepare=True: fetch the raw HIF hypergraph and convert it to HData.
            self.hypergraph = self.download()
            self.hdata = self.process()
        else:
            if hdata is None:
                raise ValueError("hdata must be provided when prepare is set to False.")

            # prepare=False: wrap an already-built HData; no download/processing.
            self.hypergraph = HIFHypergraph.empty()
            self.hdata = hdata

    def __len__(self) -> int:
        """Return the number of samplable items according to the sampling strategy."""
        return self.__sampler.len(self.hdata)

    def __getitem__(self, index: int | List[int]) -> HData:
        """
        Sample a sub-hypergraph based on the sampling strategy and return it as HData.
        If:
        - Sampling by node IDs, the sub-hypergraph will contain all hyperedges incident to the sampled nodes and all nodes incident to those hyperedges.
        - Sampling by hyperedge IDs, the sub-hypergraph will contain all nodes incident to the sampled hyperedges.

        Args:
            index: An integer or a list of integers representing node or hyperedge IDs to sample, depending on the sampling strategy.

        Returns:
            An HData instance containing the sampled sub-hypergraph.

        Raises:
            ValueError: If the provided index is invalid (e.g., empty list or list length exceeds number of nodes/hyperedges).
            IndexError: If any node/hyperedge ID is out of bounds.
        """
        return self.__sampler.sample(index, self.hdata)

    @classmethod
    def from_hdata(
        cls,
        hdata: HData,
        sampling_strategy: SamplingStrategy = SamplingStrategy.HYPEREDGE,
    ) -> "Dataset":
        """
        Create a :class:`Dataset` instance from an :class:`HData` object.

        Args:
            hdata: :class:`HData` object containing the hypergraph data.
            sampling_strategy: The sampling strategy to use for the dataset. If not provided, defaults to ``SamplingStrategy.HYPEREDGE``.

        Returns:
            The :class:`Dataset` instance with the provided :class:`HData`.
        """
        return cls(hdata=hdata, sampling_strategy=sampling_strategy, prepare=False)

    def download(self) -> HIFHypergraph:
        """
        Load the hypergraph from HIF format using HIFConverter class.
        """
        if not self.__is_prepared:
            raise ValueError("download can only be called for the original dataset (prepare=True).")

        # Reuse an already-loaded hypergraph instead of fetching again.
        if hasattr(self, "hypergraph") and self.hypergraph is not None:
            return self.hypergraph

        return HIFConverter.load_from_hif(self.DATASET_NAME, save_on_disk=True)

    def process(self) -> HData:
        """
        Process the loaded hypergraph into :class:`HData` format, mapping HIF structure to tensors.

        Returns:
            The processed hypergraph data.
        """
        if not self.__is_prepared:
            raise ValueError("process can only be called for the original dataset.")

        num_nodes = len(self.hypergraph.nodes)
        x = self.__process_x(num_nodes)

        # Remap node IDs to 0-based contiguous IDs (using indices) matching the x tensor order
        node_id_to_idx = {node.get("node"): idx for idx, node in enumerate(self.hypergraph.nodes)}
        # Initialize edge_set only with edges that have incidences, so that
        # we avoid inflating edge count due to isolated nodes/missing incidences
        hyperedge_id_to_idx: Dict[Any, int] = {}

        node_ids: List[int] = []
        hyperedge_ids: List[int] = []
        nodes_with_incidences = set()
        for incidence in self.hypergraph.incidences:
            node_id = incidence.get("node", 0)
            hyperedge_id = incidence.get("edge", 0)

            if hyperedge_id not in hyperedge_id_to_idx:
                # Hyperedges start from 0 and are assigned IDs in the order they are first encountered in incidences
                hyperedge_id_to_idx[hyperedge_id] = len(hyperedge_id_to_idx)

            node_ids.append(node_id_to_idx[node_id])
            hyperedge_ids.append(hyperedge_id_to_idx[hyperedge_id])
            nodes_with_incidences.add(node_id_to_idx[node_id])

        # Handle isolated nodes by assigning them to a new unique hyperedge (self-loop)
        for node_idx in range(num_nodes):
            if node_idx not in nodes_with_incidences:
                new_hyperedge_id = len(hyperedge_id_to_idx)
                # Unique dummy key to reserve the index in hyperedge_set
                hyperedge_id_to_idx[f"__self_loop_{node_idx}__"] = new_hyperedge_id
                node_ids.append(node_idx)
                hyperedge_ids.append(new_hyperedge_id)

        num_hyperedges = len(hyperedge_id_to_idx)
        hyperedge_attr = self.__process_hyperedge_attr(hyperedge_id_to_idx, num_hyperedges)

        # Shape [2, num_incidences]: row 0 holds node indices, row 1 hyperedge indices.
        hyperedge_index = torch.tensor([node_ids, hyperedge_ids], dtype=torch.long)

        return HData(x, hyperedge_index, hyperedge_attr, num_nodes, num_hyperedges)

    def enrich_node_features(
        self,
        enricher: NodeFeatureEnricher,
        enrichment_mode: Optional[EnrichmentMode] = None,
    ) -> None:
        """
        Enrich node features using the provided node feature enricher.

        Args:
            enricher: An instance of NodeFeatureEnricher to generate structural node features from hypergraph topology.
            enrichment_mode: How to combine generated features with existing ``hdata.x``.
                ``concatenate`` appends new features as additional columns.
                ``replace`` substitutes ``hdata.x`` entirely.
        """
        # Delegates to the underlying HData object.
        self.hdata.enrich_node_features(enricher, enrichment_mode)

    def split(
        self,
        ratios: List[float],
        shuffle: Optional[bool] = False,
        seed: Optional[int] = None,
    ) -> List["Dataset"]:
        """
        Split the dataset by hyperedges into partitions with contiguous 0-based IDs.

        Boundaries are computed using cumulative floor to prevent early splits from
        over-consuming edges. The last split absorbs any rounding remainder.

        Examples:
            With ``num_hyperedges = 3`` and ``ratios = [0.5, 0.25, 0.25]``:

            >>> cumulative_ratios = [0.5, 0.75, 1.0]

            Boundaries:

            - ``i=0`` -> ``end = int(0.5 * 3) = 1`` -> slice ``[0:1]`` -> 1 edge
            - ``i=1`` -> ``end = int(0.75 * 3) = 2`` -> slice ``[1:2]`` -> 1 edge
            - ``i=2`` -> ``end = 3`` (clamped) -> slice ``[2:3]`` -> 1 edge

        Args:
            ratios: List of floats summing to ``1.0``, e.g., ``[0.8, 0.1, 0.1]``.
            shuffle: Whether to shuffle hyperedges before splitting. Defaults to ``False`` for deterministic splits.
            seed: Optional random seed for reproducibility. Ignored if shuffle is set to ``False``.

        Returns:
            List of Dataset objects, one per split, each with contiguous IDs.
        """
        # Allow small imprecision in sum of ratios, but raise error if it's significant
        # Example: ratios = [0.8, 0.1, 0.1] -> sum = 1.0 (valid)
        #          ratios = [0.8, 0.1, 0.05] -> sum = 0.95 (invalid, raises ValueError)
        #          ratios = [0.8, 0.1, 0.1, 0.0000001] -> sum = 1.0000001 (valid, allows small imprecision)
        if abs(sum(ratios) - 1.0) > 1e-6:
            raise ValueError(f"Split ratios must sum to 1.0, got {sum(ratios)}.")

        device = self.hdata.device
        num_hyperedges = self.hdata.num_hyperedges
        hyperedge_ids_permutation = self.__get_hyperedge_ids_permutation(
            num_hyperedges, shuffle, seed
        )

        # Compute cumulative ratio boundaries to avoid independent rounding errors.
        # Independent rounding (e.g., round(0.5*3)=2, round(0.25*3)=1, round(0.25*3)=1 -> total=4)
        # can over-allocate edges to early splits and starve later ones.
        # Cumulative floor boundaries guarantee monotonically increasing cut points.
        # Example: ratios = [0.5, 0.25, 0.25], num_hyperedges = 3
        #          cumulative_ratios = [0.5, 0.75, 1.0]
        cumulative_ratios = []
        cumsum = 0.0
        for ratio in ratios:
            cumsum += ratio
            cumulative_ratios.append(cumsum)

        split_datasets = []
        start = 0
        for i in range(len(ratios)):
            if i == len(ratios) - 1:
                # Last split gets everything remaining, absorbing any rounding remainder
                # Example: start = 2, end = 3 -> permutation[2:3] = [2] (1 edge)
                end = num_hyperedges
            else:
                # Floor of cumulative boundary ensures early splits don't over-consume
                # Example: i=0 -> int(0.5 * 3) = int(1.5) = 1, end = 1
                #          i=1 -> int(0.75 * 3) = int(2.25) = 2, end = 2
                end = int(cumulative_ratios[i] * num_hyperedges)

            # Example: i=0 -> permutation[0:1] = [0] (1 edge)
            #          i=1 -> permutation[1:2] = [1] (1 edge)
            #          i=2 -> permutation[2:3] = [2] (1 edge)
            split_hyperedge_ids = hyperedge_ids_permutation[start:end]
            split_hdata = HData.split(self.hdata, split_hyperedge_ids).to(device=device)
            split_dataset = self.__class__(
                hdata=split_hdata,
                sampling_strategy=self.sampling_strategy,
                prepare=False,
            )
            split_datasets.append(split_dataset)

            start = end

        return split_datasets

    def to(self, device: torch.device) -> "Dataset":
        """
        Move the dataset's HData to the specified device.

        Args:
            device: The target device (e.g., ``torch.device('cuda')`` or ``torch.device('cpu')``).

        Returns:
            The Dataset instance moved to the specified device.
        """
        self.hdata = self.hdata.to(device)
        return self

    def transform_node_attrs(
        self,
        attrs: Dict[str, Any],
        attr_keys: Optional[List[str]] = None,
    ) -> Tensor:
        """
        Encode a node's attribute dictionary into a tensor.
        Delegates to :meth:`transform_attrs`.
        """
        return self.transform_attrs(attrs, attr_keys)

    def transform_hyperedge_attrs(
        self,
        attrs: Dict[str, Any],
        attr_keys: Optional[List[str]] = None,
    ) -> Tensor:
        """
        Encode a hyperedge's attribute dictionary into a tensor.
        Delegates to :meth:`transform_attrs`.
        """
        return self.transform_attrs(attrs, attr_keys)

    def transform_attrs(
        self,
        attrs: Dict[str, Any],
        attr_keys: Optional[List[str]] = None,
    ) -> Tensor:
        """
        Extract and encode numeric attributes to tensor.
        Non-numeric attributes are discarded. Missing attributes are filled with ``0.0``.

        Args:
            attrs: Dictionary of attributes
            attr_keys: Optional list of attribute keys to encode. If provided, ensures consistent ordering and fill missing with ``0.0``.

        Returns:
            Tensor of numeric attribute values
        """
        # bool is a subclass of int in Python, so it must be excluded explicitly.
        numeric_attrs = {
            key: value
            for key, value in attrs.items()
            if isinstance(value, (int, float)) and not isinstance(value, bool)
        }

        if attr_keys is not None:
            values = [float(numeric_attrs.get(key, 0.0)) for key in attr_keys]
            return torch.tensor(values, dtype=torch.float)

        if not numeric_attrs:
            return torch.tensor([], dtype=torch.float)

        values = [float(value) for value in numeric_attrs.values()]
        return torch.tensor(values, dtype=torch.float)

    def __collect_attr_keys(self, attr_keys: List[Dict[str, Any]]) -> List[str]:
        """
        Collect unique numeric attribute keys from a list of attribute dictionaries.

        Booleans are excluded to stay consistent with :meth:`transform_attrs`,
        which discards them (``bool`` is a subclass of ``int``); previously
        bool-only keys were collected and produced permanently-zero columns.

        Args:
            attr_keys: List of attribute dictionaries.

        Returns:
            List of unique numeric attribute keys, in first-seen order.
        """
        unique_keys: List[str] = []
        seen = set()  # O(1) membership; unique_keys preserves insertion order
        for attrs in attr_keys:
            for key, value in attrs.items():
                if (
                    key not in seen
                    and isinstance(value, (int, float))
                    and not isinstance(value, bool)
                ):
                    seen.add(key)
                    unique_keys.append(key)

        return unique_keys

    def __get_hyperedge_ids_permutation(
        self,
        num_hyperedges: int,
        shuffle: Optional[bool],
        seed: Optional[int],
    ) -> Tensor:
        device = self.hdata.device

        # Shuffle hyperedge IDs if shuffle is requested, otherwise keep original order for deterministic splits
        if shuffle:
            generator = torch.Generator(device=device)
            if seed is not None:
                generator.manual_seed(seed)

            random_hyperedge_ids_permutation = torch.randperm(
                n=num_hyperedges,
                generator=generator,
                device=device,
            )
            return random_hyperedge_ids_permutation

        ranged_hyperedge_ids_permutation = torch.arange(num_hyperedges, device=device)
        return ranged_hyperedge_ids_permutation

    def __process_hyperedge_attr(
        self,
        hyperedge_id_to_idx: Dict[Any, int],
        num_hyperedges: int,
    ) -> Optional[Tensor]:
        # hyperedge-attr: shape [num_hyperedges, num_hyperedge_attributes]
        hyperedge_attr = None
        has_hyperedges = self.hypergraph.edges is not None and len(self.hypergraph.edges) > 0
        has_any_hyperedge_attrs = has_hyperedges and any(
            "attrs" in edge for edge in self.hypergraph.edges
        )

        if has_any_hyperedge_attrs:
            hyperedge_id_to_attrs: Dict[Any, Dict[str, Any]] = {
                e.get("edge"): e.get("attrs", {}) for e in self.hypergraph.edges
            }

            hyperedge_attr_keys = self.__collect_attr_keys(list(hyperedge_id_to_attrs.values()))

            # Build attributes in exact order of hyperedge_set indices (0 to num_hyperedges - 1)
            hyperedge_idx_to_id = {idx: id for id, idx in hyperedge_id_to_idx.items()}

            attrs = []
            for hyperedge_idx in range(num_hyperedges):
                hyperedge_id = hyperedge_idx_to_id[hyperedge_idx]

                transformed_attrs = self.transform_hyperedge_attrs(
                    # If it's a real hyperedge, get its attrs; if self-loop, get empty dict
                    attrs=hyperedge_id_to_attrs.get(hyperedge_id, {}),
                    attr_keys=hyperedge_attr_keys,
                )
                attrs.append(transformed_attrs)

            hyperedge_attr = torch.stack(attrs)

        return hyperedge_attr

    def __process_x(self, num_nodes: int) -> Tensor:
        # Collect all attribute keys to have tensors of same size
        node_attr_keys = self.__collect_attr_keys(
            [node.get("attrs", {}) for node in self.hypergraph.nodes]
        )

        if node_attr_keys:
            x = torch.stack(
                [
                    self.transform_node_attrs(node.get("attrs", {}), attr_keys=node_attr_keys)
                    for node in self.hypergraph.nodes
                ]
            )
        else:
            # Fallback to ones if no node features, 1 is better as it can help during
            # training (e.g., avoid zero multiplication), especially in first epochs
            x = torch.ones((num_nodes, 1), dtype=torch.float)

        return x  # shape [num_nodes, num_node_features]

    def stats(self) -> Dict[str, Any]:
        """
        Compute statistics for the dataset.
        This method currently delegates to the underlying HData's stats method.
        The fields returned in the dictionary include:
        - ``shape_x``: The shape of the node feature matrix ``x``.
        - ``shape_hyperedge_attr``: The shape of the hyperedge attribute matrix, or ``None`` if hyperedge attributes are not present.
        - ``num_nodes``: The number of nodes in the hypergraph.
        - ``num_hyperedges``: The number of hyperedges in the hypergraph.
        - ``avg_degree_node_raw``: The average degree of nodes, calculated as the mean number of hyperedges each node belongs to.
        - ``avg_degree_node``: The floored node average degree.
        - ``avg_degree_hyperedge_raw``: The average size of hyperedges, calculated as the mean number of nodes each hyperedge contains.
        - ``avg_degree_hyperedge``: The floored hyperedge average size.
        - ``node_degree_max``: The maximum degree of any node in the hypergraph.
        - ``hyperedge_degree_max``: The maximum size of any hyperedge in the hypergraph.
        - ``node_degree_median``: The median degree of nodes in the hypergraph.
        - ``hyperedge_degree_median``: The median size of hyperedges in the hypergraph.
        - ``distribution_node_degree``: A list where the value at index ``i`` represents the count of nodes with degree ``i``.
        - ``distribution_hyperedge_size``: A list where the value at index ``i`` represents the count of hyperedges with size ``i``.
        - ``distribution_node_degree_hist``: A dictionary where the keys are node degrees and the values are the count of nodes with that degree.
        - ``distribution_hyperedge_size_hist``: A dictionary where the keys are hyperedge sizes and the values are the count of hyperedges with that size.

        Returns:
            A dictionary containing various statistics about the hypergraph.
        """

        return self.hdata.stats()

__init__(hdata=None, sampling_strategy=SamplingStrategy.HYPEREDGE, prepare=True)

Initialize the Dataset.

Parameters:

Name Type Description Default
hdata Optional[HData]

Optional HData object to initialize the dataset with. If provided, the dataset will be initialized with this data instead of loading and processing from HIF. Must be provided if prepare is set to False.

None
sampling_strategy SamplingStrategy

The sampling strategy to use for the dataset. If not provided, defaults to SamplingStrategy.HYPEREDGE.

HYPEREDGE
prepare bool

Whether to load and process the original dataset from HIF format. If set to False, the dataset will be initialized with the provided hdata instead. Defaults to True.

True
Source code in hyperbench/data/dataset.py
def __init__(
    self,
    hdata: Optional[HData] = None,
    sampling_strategy: SamplingStrategy = SamplingStrategy.HYPEREDGE,
    prepare: bool = True,
) -> None:
    """
    Initialize the Dataset.

    Args:
        hdata: Optional HData object to initialize the dataset with.
            If provided, the dataset will be initialized with this data instead of loading and processing from HIF. Must be provided if prepare is set to ``False``.
        sampling_strategy: The sampling strategy to use for the dataset. If not provided, defaults to ``SamplingStrategy.HYPEREDGE``.
        prepare: Whether to load and process the original dataset from HIF format.
            If set to ``False``, the dataset will be initialized with the provided hdata instead. Defaults to ``True``.
    """
    self.__is_prepared = prepare
    self.__sampler = create_sampler_from_strategy(sampling_strategy)
    self.sampling_strategy = sampling_strategy

    if self.__is_prepared:
        # prepare=True: fetch the raw HIF hypergraph and convert it to HData.
        self.hypergraph = self.download()
        self.hdata = self.process()
    else:
        if hdata is None:
            raise ValueError("hdata must be provided when prepare is set to False.")

        # prepare=False: wrap the caller-supplied HData; no download/processing.
        self.hypergraph = HIFHypergraph.empty()
        self.hdata = hdata

__getitem__(index)

Sample a sub-hypergraph based on the sampling strategy and return it as HData. If: - Sampling by node IDs, the sub-hypergraph will contain all hyperedges incident to the sampled nodes and all nodes incident to those hyperedges. - Sampling by hyperedge IDs, the sub-hypergraph will contain all nodes incident to the sampled hyperedges.

Parameters:

Name Type Description Default
index int | List[int]

An integer or a list of integers representing node or hyperedge IDs to sample, depending on the sampling strategy.

required

Returns:

Type Description
HData

An HData instance containing the sampled sub-hypergraph.

Raises:

Type Description
ValueError

If the provided index is invalid (e.g., empty list or list length exceeds number of nodes/hyperedges).

IndexError

If any node/hyperedge ID is out of bounds.

Source code in hyperbench/data/dataset.py
def __getitem__(self, index: int | List[int]) -> HData:
    """
    Sample a sub-hypergraph based on the sampling strategy and return it as HData.
    If:
    - Sampling by node IDs, the sub-hypergraph will contain all hyperedges incident to the sampled nodes and all nodes incident to those hyperedges.
    - Sampling by hyperedge IDs, the sub-hypergraph will contain all nodes incident to the sampled hyperedges.

    Args:
        index: An integer or a list of integers representing node or hyperedge IDs to sample, depending on the sampling strategy.

    Returns:
        An HData instance containing the sampled sub-hypergraph.

    Raises:
        ValueError: If the provided index is invalid (e.g., empty list or list length exceeds number of nodes/hyperedges).
        IndexError: If any node/hyperedge ID is out of bounds.
    """
    # Delegate to the sampler built from the sampling strategy at construction.
    return self.__sampler.sample(index, self.hdata)

from_hdata(hdata, sampling_strategy=SamplingStrategy.HYPEREDGE) classmethod

Create a Dataset instance from an HData object.

Parameters:

Name Type Description Default
hdata HData

:class:HData object containing the hypergraph data.

required
sampling_strategy SamplingStrategy

The sampling strategy to use for the dataset. If not provided, defaults to SamplingStrategy.HYPEREDGE.

HYPEREDGE

Returns:

Name Type Description
The Dataset

The Dataset instance with the provided HData.

Source code in hyperbench/data/dataset.py
@classmethod
def from_hdata(
    cls,
    hdata: HData,
    sampling_strategy: SamplingStrategy = SamplingStrategy.HYPEREDGE,
) -> "Dataset":
    """
    Create a :class:`Dataset` instance from an :class:`HData` object.

    Args:
        hdata: :class:`HData` object containing the hypergraph data.
        sampling_strategy: The sampling strategy to use for the dataset. If not provided, defaults to ``SamplingStrategy.HYPEREDGE``.

    Returns:
        The :class:`Dataset` instance with the provided :class:`HData`.
    """
    # prepare=False bypasses download()/process() and uses the given hdata as-is.
    return cls(hdata=hdata, sampling_strategy=sampling_strategy, prepare=False)

download()

Load the hypergraph from HIF format using HIFConverter class.

Source code in hyperbench/data/dataset.py
def download(self) -> HIFHypergraph:
    """
    Load the hypergraph from HIF format using HIFConverter class.
    """
    if not self.__is_prepared:
        raise ValueError("download can only be called for the original dataset (prepare=True).")

    # Reuse an already-loaded hypergraph instead of fetching again.
    if hasattr(self, "hypergraph") and self.hypergraph is not None:
        return self.hypergraph

    # save_on_disk=True persists the downloaded file for subsequent runs.
    return HIFConverter.load_from_hif(self.DATASET_NAME, save_on_disk=True)

process()

Process the loaded hypergraph into HData format, mapping HIF structure to tensors.

Returns:

Type Description
HData

The processed hypergraph data.

Source code in hyperbench/data/dataset.py
def process(self) -> HData:
    """
    Process the loaded hypergraph into :class:`HData` format, mapping HIF structure to tensors.

    Returns:
        The processed hypergraph data.
    """
    if not self.__is_prepared:
        raise ValueError("process can only be called for the original dataset.")

    num_nodes = len(self.hypergraph.nodes)
    x = self.__process_x(num_nodes)

    # Remap node IDs to 0-based contiguous IDs (using indices) matching the x tensor order
    node_id_to_idx = {node.get("node"): idx for idx, node in enumerate(self.hypergraph.nodes)}
    # Initialize edge_set only with edges that have incidences, so that
    # we avoid inflating edge count due to isolated nodes/missing incidences
    hyperedge_id_to_idx: Dict[Any, int] = {}

    node_ids: List[int] = []
    hyperedge_ids: List[int] = []
    nodes_with_incidences = set()
    for incidence in self.hypergraph.incidences:
        node_id = incidence.get("node", 0)
        hyperedge_id = incidence.get("edge", 0)

        if hyperedge_id not in hyperedge_id_to_idx:
            # Hyperedges start from 0 and are assigned IDs in the order they are first encountered in incidences
            hyperedge_id_to_idx[hyperedge_id] = len(hyperedge_id_to_idx)

        node_ids.append(node_id_to_idx[node_id])
        hyperedge_ids.append(hyperedge_id_to_idx[hyperedge_id])
        nodes_with_incidences.add(node_id_to_idx[node_id])

    # Handle isolated nodes by assigning them to a new unique hyperedge (self-loop)
    for node_idx in range(num_nodes):
        if node_idx not in nodes_with_incidences:
            new_hyperedge_id = len(hyperedge_id_to_idx)
            # Unique dummy key to reserve the index in hyperedge_set
            hyperedge_id_to_idx[f"__self_loop_{node_idx}__"] = new_hyperedge_id
            node_ids.append(node_idx)
            hyperedge_ids.append(new_hyperedge_id)

    num_hyperedges = len(hyperedge_id_to_idx)
    hyperedge_attr = self.__process_hyperedge_attr(hyperedge_id_to_idx, num_hyperedges)

    # Shape [2, num_incidences]: row 0 holds node indices, row 1 hyperedge indices.
    hyperedge_index = torch.tensor([node_ids, hyperedge_ids], dtype=torch.long)

    return HData(x, hyperedge_index, hyperedge_attr, num_nodes, num_hyperedges)

enrich_node_features(enricher, enrichment_mode=None)

Enrich node features using the provided node feature enricher.

Parameters:

Name Type Description Default
enricher NodeFeatureEnricher

An instance of NodeFeatureEnricher to generate structural node features from hypergraph topology.

required
enrichment_mode Optional[EnrichmentMode]

How to combine generated features with existing hdata.x. concatenate appends new features as additional columns. replace substitutes hdata.x entirely.

None
Source code in hyperbench/data/dataset.py
def enrich_node_features(
    self,
    enricher: NodeFeatureEnricher,
    enrichment_mode: Optional[EnrichmentMode] = None,
) -> None:
    """
    Augment node features via the supplied enricher.

    Delegates directly to the underlying ``hdata`` object.

    Args:
        enricher: A NodeFeatureEnricher that derives structural node features from the hypergraph topology.
        enrichment_mode: Strategy for merging generated features with the existing ``hdata.x``:
            ``concatenate`` adds them as extra columns, while ``replace`` discards
            the current features entirely.
    """
    self.hdata.enrich_node_features(enricher, enrichment_mode)

split(ratios, shuffle=False, seed=None)

Split the dataset by hyperedges into partitions with contiguous 0-based IDs.

Boundaries are computed using cumulative floor to prevent early splits from over-consuming edges. The last split absorbs any rounding remainder.

Examples:

With num_hyperedges = 3 and ratios = [0.5, 0.25, 0.25]:

>>> cumulative_ratios = [0.5, 0.75, 1.0]

Boundaries:

  • i=0 -> end = int(0.5 * 3) = 1 -> slice [0:1] -> 1 edge
  • i=1 -> end = int(0.75 * 3) = 2 -> slice [1:2] -> 1 edge
  • i=2 -> end = 3 (clamped) -> slice [2:3] -> 1 edge

Parameters:

Name Type Description Default
ratios List[float]

List of floats summing to 1.0, e.g., [0.8, 0.1, 0.1].

required
shuffle Optional[bool]

Whether to shuffle hyperedges before splitting. Defaults to False for deterministic splits.

False
seed Optional[int]

Optional random seed for reproducibility. Ignored if shuffle is set to False.

None

Returns:

Type Description
List[Dataset]

List of Dataset objects, one per split, each with contiguous IDs.

Source code in hyperbench/data/dataset.py
def split(
    self,
    ratios: List[float],
    shuffle: Optional[bool] = False,
    seed: Optional[int] = None,
) -> List["Dataset"]:
    """
    Split the dataset by hyperedges into partitions with contiguous 0-based IDs.

    Cut points are derived from cumulative floored boundaries so that early
    partitions never over-consume hyperedges; the final partition absorbs
    whatever rounding remainder is left.

    Examples:
        With ``num_hyperedges = 3`` and ``ratios = [0.5, 0.25, 0.25]``:

        >>> cumulative_ratios = [0.5, 0.75, 1.0]

        Boundaries:

        - ``i=0`` -> ``end = int(0.5 * 3) = 1`` -> slice ``[0:1]`` -> 1 edge
        - ``i=1`` -> ``end = int(0.75 * 3) = 2`` -> slice ``[1:2]`` -> 1 edge
        - ``i=2`` -> ``end = 3`` (clamped) -> slice ``[2:3]`` -> 1 edge

    Args:
        ratios: List of floats summing to ``1.0``, e.g., ``[0.8, 0.1, 0.1]``.
        shuffle: Whether to shuffle hyperedges before splitting. Defaults to ``False`` for deterministic splits.
        seed: Optional random seed for reproducibility. Ignored if shuffle is set to ``False``.

    Returns:
        List of Dataset objects, one per split, each with contiguous IDs.
    """
    # Tolerate tiny floating-point imprecision in the ratio sum, but reject
    # anything clearly off (e.g. [0.8, 0.1, 0.05] -> 0.95 raises).
    ratio_total = sum(ratios)
    if abs(ratio_total - 1.0) > 1e-6:
        raise ValueError(f"Split ratios must sum to 1.0, got {ratio_total}.")

    device = self.hdata.device
    num_hyperedges = self.hdata.num_hyperedges
    hyperedge_ids_permutation = self.__get_hyperedge_ids_permutation(
        num_hyperedges, shuffle, seed
    )

    # Rounding each ratio independently can over-allocate (e.g. round(0.5*3)=2,
    # round(0.25*3)=1 twice -> 4 > 3 edges). Flooring the *cumulative* ratio
    # instead yields monotonically increasing cut points; the last cut is
    # pinned to num_hyperedges so no edge is ever dropped.
    cut_points = []
    running_total = 0.0
    for ratio in ratios[:-1]:
        running_total += ratio
        cut_points.append(int(running_total * num_hyperedges))
    cut_points.append(num_hyperedges)

    partitions: List["Dataset"] = []
    previous_cut = 0
    for cut in cut_points:
        # Slice of the (possibly shuffled) permutation belonging to this split.
        selected_ids = hyperedge_ids_permutation[previous_cut:cut]
        partition_hdata = HData.split(self.hdata, selected_ids).to(device=device)
        partitions.append(
            self.__class__(
                hdata=partition_hdata,
                sampling_strategy=self.sampling_strategy,
                prepare=False,
            )
        )
        previous_cut = cut

    return partitions

to(device)

Move the dataset's HData to the specified device.

Parameters:

Name Type Description Default
device device

The target device (e.g., torch.device('cuda') or torch.device('cpu')).

required

Returns:

Type Description
Dataset

The Dataset instance moved to the specified device.

Source code in hyperbench/data/dataset.py
def to(self, device: torch.device) -> "Dataset":
    """
    Relocate the dataset's HData to ``device`` and return this dataset.

    Args:
        device: Target device, e.g. ``torch.device('cuda')`` or ``torch.device('cpu')``.

    Returns:
        This Dataset instance, now holding its data on ``device``.
    """
    moved_hdata = self.hdata.to(device)
    self.hdata = moved_hdata
    return self

transform_attrs(attrs, attr_keys=None)

Extract and encode numeric attributes to tensor. Non-numeric attributes are discarded. Missing attributes are filled with 0.0.

Parameters:

Name Type Description Default
attrs Dict[str, Any]

Dictionary of attributes

required
attr_keys Optional[List[str]]

Optional list of attribute keys to encode. If provided, ensures consistent ordering and fill missing with 0.0.

None

Returns:

Type Description
Tensor

Tensor of numeric attribute values

Source code in hyperbench/data/dataset.py
def transform_attrs(
    self,
    attrs: Dict[str, Any],
    attr_keys: Optional[List[str]] = None,
) -> Tensor:
    """
    Encode the numeric entries of ``attrs`` as a 1-D float tensor.

    Non-numeric values (including booleans) are discarded. When ``attr_keys``
    is supplied, the output follows that key order and any absent key
    contributes ``0.0``.

    Args:
        attrs: Dictionary of attributes
        attr_keys: Optional list of attribute keys to encode. If provided, ensures consistent ordering and fill missing with ``0.0``.

    Returns:
        Tensor of numeric attribute values
    """
    def _is_numeric(value: Any) -> bool:
        # bool is a subclass of int, so it must be rejected explicitly.
        return isinstance(value, (int, float)) and not isinstance(value, bool)

    numeric = {name: value for name, value in attrs.items() if _is_numeric(value)}

    if attr_keys is not None:
        ordered = [float(numeric.get(name, 0.0)) for name in attr_keys]
        return torch.tensor(ordered, dtype=torch.float)

    if not numeric:
        return torch.tensor([], dtype=torch.float)

    return torch.tensor([float(value) for value in numeric.values()], dtype=torch.float)

__collect_attr_keys(attr_keys)

Collect unique numeric attribute keys from a list of attribute dictionaries.

Parameters:

Name Type Description Default
attr_keys List[Dict[str, Any]]

List of attribute dictionaries.

required

Returns:

Type Description
List[str]

List of unique numeric attribute keys.

Source code in hyperbench/data/dataset.py
def __collect_attr_keys(self, attr_keys: List[Dict[str, Any]]) -> List[str]:
    """
    Collect unique numeric attribute keys from a list of attribute dictionaries.

    Keys are returned in first-encounter order. Booleans are excluded even
    though ``bool`` subclasses ``int``, keeping this helper consistent with
    ``transform_attrs``, which also discards boolean values; previously a
    bool-valued key would be collected here only to always be filled with
    ``0.0`` downstream.

    Args:
        attr_keys: List of attribute dictionaries.

    Returns:
        List of unique numeric attribute keys, in order of first appearance.
    """
    unique_keys: List[str] = []
    seen = set()  # O(1) membership check instead of rescanning unique_keys
    for attrs in attr_keys:
        for key, value in attrs.items():
            if (
                key not in seen
                and isinstance(value, (int, float))
                and not isinstance(value, bool)
            ):
                seen.add(key)
                unique_keys.append(key)

    return unique_keys

stats()

Compute statistics for the dataset. This method currently delegates to the underlying HData's stats method. The fields returned in the dictionary include: - shape_x: The shape of the node feature matrix x. - shape_hyperedge_attr: The shape of the hyperedge attribute matrix, or None if hyperedge attributes are not present. - num_nodes: The number of nodes in the hypergraph. - num_hyperedges: The number of hyperedges in the hypergraph. - avg_degree_node_raw: The average degree of nodes, calculated as the mean number of hyperedges each node belongs to. - avg_degree_node: The floored node average degree. - avg_degree_hyperedge_raw: The average size of hyperedges, calculated as the mean number of nodes each hyperedge contains. - avg_degree_hyperedge: The floored hyperedge average size. - node_degree_max: The maximum degree of any node in the hypergraph. - hyperedge_degree_max: The maximum size of any hyperedge in the hypergraph. - node_degree_median: The median degree of nodes in the hypergraph. - hyperedge_degree_median: The median size of hyperedges in the hypergraph. - distribution_node_degree: A list where the value at index i represents the count of nodes with degree i. - distribution_hyperedge_size: A list where the value at index i represents the count of hyperedges with size i. - distribution_node_degree_hist: A dictionary where the keys are node degrees and the values are the count of nodes with that degree. - distribution_hyperedge_size_hist: A dictionary where the keys are hyperedge sizes and the values are the count of hyperedges with that size.

Returns:

Type Description
Dict[str, Any]

A dictionary containing various statistics about the hypergraph.

Source code in hyperbench/data/dataset.py
def stats(self) -> Dict[str, Any]:
    """
    Return descriptive statistics for the dataset's hypergraph.

    Currently a pure delegation to the underlying HData's stats method.
    The returned dictionary contains:

    - ``shape_x``: Shape of the node feature matrix ``x``.
    - ``shape_hyperedge_attr``: Shape of the hyperedge attribute matrix, or ``None`` when absent.
    - ``num_nodes`` / ``num_hyperedges``: Counts of nodes and hyperedges.
    - ``avg_degree_node_raw`` / ``avg_degree_node``: Mean node degree (raw, then floored).
    - ``avg_degree_hyperedge_raw`` / ``avg_degree_hyperedge``: Mean hyperedge size (raw, then floored).
    - ``node_degree_max`` / ``hyperedge_degree_max``: Maximum node degree and hyperedge size.
    - ``node_degree_median`` / ``hyperedge_degree_median``: Median node degree and hyperedge size.
    - ``distribution_node_degree``: List whose entry ``i`` counts nodes of degree ``i``.
    - ``distribution_hyperedge_size``: List whose entry ``i`` counts hyperedges of size ``i``.
    - ``distribution_node_degree_hist``: Mapping from node degree to node count.
    - ``distribution_hyperedge_size_hist``: Mapping from hyperedge size to hyperedge count.

    Returns:
        A dictionary containing various statistics about the hypergraph.
    """
    return self.hdata.stats()

hyperbench.data.loader

DataLoader

Bases: DataLoader

Source code in hyperbench/data/loader.py
class DataLoader(TorchDataLoader):
    """
    Torch ``DataLoader`` specialized for batching :class:`HData` samples.

    Caches the source dataset's full ``hdata`` so that node features, labels
    and hyperedge attributes can be gathered by global ID for every collated
    batch.
    """

    def __init__(
        self,
        dataset: Dataset,
        batch_size: int = 1,
        shuffle: Optional[bool] = False,
        **kwargs,
    ) -> None:
        """
        Args:
            dataset: Source dataset providing the cached ``hdata``.
            batch_size: Number of samples per batch. Defaults to ``1``.
            shuffle: Whether to reshuffle the data at every epoch. Defaults to ``False``.
            **kwargs: Extra keyword arguments forwarded to ``torch.utils.data.DataLoader``.
        """
        super().__init__(
            dataset=dataset,
            batch_size=batch_size,
            shuffle=shuffle,
            collate_fn=self.collate,
            **kwargs,
        )

        # Kept so collate() can index features/labels of the *full* dataset
        # by the hyperedge/node IDs that survive duplicate removal.
        self.__cached_dataset_hdata = dataset.hdata

    def collate(self, batch: List[HData]) -> HData:
        """
        Collates a list of :class:`HData` objects into a single batched :class:`HData` object.

        This function combines multiple separate samples into a single batched representation suitable for mini-batch training.
        It handles:
        - Concatenating node features from all samples.
        - Concatenating and offsetting hyperedges from all samples.
        - Concatenating hyperedge attributes from all samples, if present.

        Examples:
            Given ``batch = [HData_0, HData_1]``:

            For node features:

            >>> HData_0.x.shape  # (3, 64) — 3 nodes with 64 features
            >>> HData_1.x.shape  # (2, 64) — 2 nodes with 64 features
            >>> x.shape  # (5, 64) — all 5 nodes concatenated

            For hyperedge index:

            - ``HData_0`` (3 nodes, 2 hyperedges):

            >>> hyperedge_index = [[0, 1, 1, 2],  # Nodes 0, 1, 1, 2
            ...                    [0, 0, 1, 1]]  # Hyperedge 0 contains {0,1}, Hyperedge 1 contains {1,2}

            - ``HData_1`` (2 nodes, 1 hyperedge):

            >>> hyperedge_index = [[0, 1],  # Nodes 0, 1
            ...                    [0, 0]]  # Hyperedge 0 contains {0,1}

            Batched result:

            >>> hyperedge_index = [[0, 1, 1, 2, 3, 4],  # Node indices: original then offset by 3
            ...                    [0, 0, 1, 1, 2, 2]]  # Hyperedge IDs: original then offset by 2

        Args:
            batch: List of :class:`HData` objects to collate.

        Returns:
            A single :class:`HData` object containing the collated data.
        """
        collated_hyperedge_index = torch.cat([data.hyperedge_index for data in batch], dim=1)
        hyperedge_index_wrapper = HyperedgeIndex(collated_hyperedge_index).remove_duplicate_edges()

        hyperedge_ids = hyperedge_index_wrapper.hyperedge_ids
        collated_x = self.__cached_dataset_hdata.x[hyperedge_index_wrapper.node_ids]
        collated_y = self.__cached_dataset_hdata.y[hyperedge_ids]

        # "hyeredge" typo fixed: previously collated_hyeredge_attr
        collated_hyperedge_attr = None
        if self.__cached_dataset_hdata.hyperedge_attr is not None:
            collated_hyperedge_attr = self.__cached_dataset_hdata.hyperedge_attr[hyperedge_ids]

        collated_hyperedge_index = hyperedge_index_wrapper.to_0based().item

        collated_hdata = HData(
            x=collated_x,
            hyperedge_index=collated_hyperedge_index,
            hyperedge_attr=collated_hyperedge_attr,
            num_nodes=hyperedge_index_wrapper.num_nodes,
            num_hyperedges=hyperedge_index_wrapper.num_hyperedges,
            y=collated_y,
        )

        return collated_hdata.to(batch[0].device)

collate(batch)

Collates a list of :class:`HData` objects into a single batched :class:`HData` object.

This function combines multiple separate samples into a single batched representation suitable for mini-batch training. It handles: - Concatenating node features from all samples. - Concatenating and offsetting hyperedges from all samples. - Concatenating hyperedge attributes from all samples, if present.

Examples:

Given batch = [HData_0, HData_1]:

For node features:

>>> HData_0.x.shape  # (3, 64) — 3 nodes with 64 features
>>> HData_1.x.shape  # (2, 64) — 2 nodes with 64 features
>>> x.shape  # (5, 64) — all 5 nodes concatenated

For hyperedge index:

  • HData_0 (3 nodes, 2 hyperedges):
>>> hyperedge_index = [[0, 1, 1, 2],  # Nodes 0, 1, 1, 2
...                    [0, 0, 1, 1]]  # Hyperedge 0 contains {0,1}, Hyperedge 1 contains {1,2}
  • HData_1 (2 nodes, 1 hyperedge):
>>> hyperedge_index = [[0, 1],  # Nodes 0, 1
...                    [0, 0]]  # Hyperedge 0 contains {0,1}

Batched result:

>>> hyperedge_index = [[0, 1, 1, 2, 3, 4],  # Node indices: original then offset by 3
...                    [0, 0, 1, 1, 2, 2]]  # Hyperedge IDs: original then offset by 2

Parameters:

Name Type Description Default
batch List[HData]

List of :class:`HData` objects to collate.

required

Returns:

Type Description
HData

A single :class:HData object containing the collated data.

Source code in hyperbench/data/loader.py
def collate(self, batch: List[HData]) -> HData:
    """
    Merge a list of :class:`HData` samples into one batched :class:`HData`.

    Hyperedge indices from all samples are concatenated, duplicate edges are
    removed, and node features, labels and (when present) hyperedge attributes
    are gathered from the cached full-dataset ``hdata`` using the surviving
    IDs. The resulting hyperedge index is re-based to contiguous 0-based IDs.

    Examples:
        Given ``batch = [HData_0, HData_1]`` with 3 and 2 nodes:

        >>> hyperedge_index = [[0, 1, 1, 2, 3, 4],  # node indices: original then offset by 3
        ...                    [0, 0, 1, 1, 2, 2]]  # hyperedge IDs: original then offset by 2

    Args:
        batch: The :class:`HData` samples to merge.

    Returns:
        A single batched :class:`HData` placed on the same device as ``batch[0]``.
    """
    merged_index = torch.cat([sample.hyperedge_index for sample in batch], dim=1)
    index_wrapper = HyperedgeIndex(merged_index).remove_duplicate_edges()

    edge_ids = index_wrapper.hyperedge_ids
    cached = self.__cached_dataset_hdata

    batched_x = cached.x[index_wrapper.node_ids]
    batched_y = cached.y[edge_ids]

    if cached.hyperedge_attr is not None:
        batched_attr = cached.hyperedge_attr[edge_ids]
    else:
        batched_attr = None

    batched_hdata = HData(
        x=batched_x,
        hyperedge_index=index_wrapper.to_0based().item,
        hyperedge_attr=batched_attr,
        num_nodes=index_wrapper.num_nodes,
        num_hyperedges=index_wrapper.num_hyperedges,
        y=batched_y,
    )

    return batched_hdata.to(batch[0].device)

Train Module

Trainer

hyperbench.train.trainer

MultiModelTrainer

A trainer class to handle training multiple models with individual trainers.

Parameters:

Name Type Description Default
model_configs List[ModelConfig]

A list of ModelConfig objects, each containing a model and its associated trainer (if any).

required
experiment_name Optional[str]

Name for this experiment run's log directory. When None (default), auto-increments as experiment_0, experiment_1, etc. under the log root directory. Only used when logger is not provided.

None
accelerator str | Accelerator

Supports passing different accelerator types ("cpu", "gpu", "tpu", "hpu", "mps", "auto") as well as custom accelerator instances.

'auto'
devices list[int] | str | int

The devices to use. Can be set to a positive number (int or str), a sequence of device indices (list or str), the value -1 to indicate all available devices should be used, or "auto" for automatic selection based on the chosen accelerator. Defaults to "auto".

'auto'
strategy str | Strategy

Supports different training strategies with aliases as well as custom strategies. Defaults to "auto".

'auto'
num_nodes int

Number of GPU nodes for distributed training. Defaults to 1.

1
precision Optional[Any]

Double precision (64, '64' or '64-true'), full precision (32, '32' or '32-true'), 16bit mixed precision (16, '16', '16-mixed') or bfloat16 mixed precision ('bf16', 'bf16-mixed'). Can be used on CPU, GPU, TPUs, or HPUs. Defaults to '32-true'.

None
max_epochs Optional[int]

Stop training once this number of epochs is reached. Disabled by default (None). If both max_epochs and max_steps are not specified, defaults to max_epochs = 1000. To enable infinite training, set max_epochs = -1.

None
min_epochs Optional[int]

Force training for at least these many epochs. Disabled by default (None).

None
max_steps int

Stop training after this number of steps. Disabled by default (-1). If max_steps = -1 and max_epochs = None, will default to max_epochs = 1000. To enable infinite training, set max_epochs to -1.

-1
min_steps Optional[int]

Force training for at least these number of steps. Disabled by default (None).

None
check_val_every_n_epoch Optional[int]

Perform a validation loop after every N training epochs. If None, validation will be done solely based on the number of training batches, requiring val_check_interval to be an integer value. When used together with a time-based val_check_interval and check_val_every_n_epoch > 1, validation is aligned to epoch multiples: if the interval elapses before the next multiple-N epoch, validation runs at the start of that epoch (after the first batch) and the timer resets; if it elapses during a multiple-N epoch, validation runs after the current batch. For None or 1 cases, the time-based behavior of val_check_interval applies without additional alignment. Defaults to 1.

1
logger Optional[Logger | Iterable[Logger] | bool]

Logger (or iterable collection of loggers) for experiment tracking. A True value uses the default TensorBoardLogger if it is installed, otherwise CSVLogger. False will disable logging. If multiple loggers are provided, local files (checkpoints, profiler traces, etc.) are saved in the log_dir of the first logger. Defaults to True.

None
default_root_dir Optional[str | Path]

Default path for logs and weights when no logger/ckpt_callback passed. Defaults to os.getcwd(). Can be remote file paths such as s3://mybucket/path or 'hdfs://path/'

None
enable_autolog_hparams bool

Whether to log hyperparameters at the start of a run. Defaults to True.

True
log_every_n_steps Optional[int]

How often to log within steps. Defaults to 50.

None
profiler Optional[Profiler | str]

To profile individual steps during training and assist in identifying bottlenecks. Defaults to None.

None
fast_dev_run int | bool

Runs n if set to n (int) else 1 if set to True batch(es) of train, val and test to find any bugs (ie: a sort of unit test). Defaults to False.

False
enable_checkpointing bool

If True, enable checkpointing. It will configure a default ModelCheckpoint callback if there is no user-defined ModelCheckpoint in :paramref:~hyperbench.train.MultiModelTrainer.callbacks. Defaults to True.

True
enable_progress_bar bool

Whether to enable the progress bar by default. Defaults to True.

True
enable_model_summary Optional[bool]

Whether to enable model summarization by default. Defaults to True.

None
callbacks Optional[List[Callback] | Callback]

Add a callback or list of callbacks. Defaults to None.

None
auto_start_tensorboard bool

When True and tensorboard is installed, automatically starts a TensorBoard server pointing at the experiment log directory. Using this option requires that TensorBoard is installed in the environment and moves control of the TensorBoard server lifecycle to the trainer, which will automatically terminate the server when the trainer is finalized (e.g., at the end of a with block or when the object is garbage collected). Enable auto_wait to keep the server alive after training completes so you can inspect results before the trainer is finalized. Defaults to False.

False
tensorboard_port int

Port for the auto-launched TensorBoard server. Defaults to 6006.

6006
auto_wait bool

When True and a TensorBoard server is running, automatically calls :meth:wait inside finalize before terminating the server, so the user can inspect results before the process is stopped. Defaults to False.

False
Source code in hyperbench/train/trainer.py
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
class MultiModelTrainer:
    """
    A trainer class to handle training multiple models with individual trainers.

    Args:
        model_configs: A list of ModelConfig objects, each containing a model and its associated trainer (if any).

        experiment_name: Name for this experiment run's log directory. When ``None`` (default),
            auto-increments as ``experiment_0``, ``experiment_1``, etc. under the log root directory.
            Only used when ``logger`` is not provided.

        accelerator: Supports passing different accelerator types ("cpu", "gpu", "tpu", "hpu", "mps", "auto")
            as well as custom accelerator instances.

        devices: The devices to use. Can be set to a positive number (int or str), a sequence of device indices
            (list or str), the value ``-1`` to indicate all available devices should be used, or ``"auto"`` for
            automatic selection based on the chosen accelerator. Defaults to ``"auto"``.

        strategy: Supports different training strategies with aliases as well as custom strategies.
            Defaults to ``"auto"``.

        num_nodes: Number of GPU nodes for distributed training.
            Defaults to ``1``.

        precision: Double precision (64, '64' or '64-true'), full precision (32, '32' or '32-true'),
            16bit mixed precision (16, '16', '16-mixed') or bfloat16 mixed precision ('bf16', 'bf16-mixed').
            Can be used on CPU, GPU, TPUs, or HPUs.
            Defaults to ``'32-true'``.

        max_epochs: Stop training once this number of epochs is reached. Disabled by default (None).
            If both max_epochs and max_steps are not specified, defaults to ``max_epochs = 1000``.
            To enable infinite training, set ``max_epochs = -1``.

        min_epochs: Force training for at least these many epochs. Disabled by default (None).

        max_steps: Stop training after this number of steps. Disabled by default (-1). If ``max_steps = -1``
            and ``max_epochs = None``, will default to ``max_epochs = 1000``. To enable infinite training, set
            ``max_epochs`` to ``-1``.

        min_steps: Force training for at least these number of steps. Disabled by default (``None``).

        check_val_every_n_epoch: Perform a validation loop after every `N` training epochs. If ``None``,
            validation will be done solely based on the number of training batches, requiring ``val_check_interval``
            to be an integer value. When used together with a time-based ``val_check_interval`` and
            ``check_val_every_n_epoch`` > 1, validation is aligned to epoch multiples: if the interval elapses
            before the next multiple-N epoch, validation runs at the start of that epoch (after the first batch)
            and the timer resets; if it elapses during a multiple-N epoch, validation runs after the current batch.
            For ``None`` or ``1`` cases, the time-based behavior of ``val_check_interval`` applies without
            additional alignment.
            Defaults to ``1``.

        logger: Logger (or iterable collection of loggers) for experiment tracking. A ``True`` value uses
            the default ``TensorBoardLogger`` if it is installed, otherwise ``CSVLogger``.
            ``False`` will disable logging. If multiple loggers are provided, local files
            (checkpoints, profiler traces, etc.) are saved in the ``log_dir`` of the first logger.
            When ``None`` (default), a per-model ``CSVLogger`` (plus a ``TensorBoardLogger`` when
            tensorboard is installed) is created automatically under the experiment log directory.

        default_root_dir: Default path for logs and weights when no logger/ckpt_callback passed.
            Defaults to ``os.getcwd()``.
            Can be remote file paths such as `s3://mybucket/path` or 'hdfs://path/'

        enable_autolog_hparams: Whether to log hyperparameters at the start of a run.
            Defaults to ``True``.

        log_every_n_steps: How often to log within steps.
            Defaults to ``50``.

        profiler: To profile individual steps during training and assist in identifying bottlenecks.
            Defaults to ``None``.

        fast_dev_run: Runs n if set to ``n`` (int) else 1 if set to ``True`` batch(es)
            of train, val and test to find any bugs (ie: a sort of unit test).
            Defaults to ``False``.

        enable_checkpointing: If ``True``, enable checkpointing.
            It will configure a default ModelCheckpoint callback if there is no user-defined ModelCheckpoint in
            :paramref:`~hyperbench.train.MultiModelTrainer.callbacks`.
            Defaults to ``True``.

        enable_progress_bar: Whether to enable the progress bar by default.
            Defaults to ``True``.

        enable_model_summary: Whether to enable model summarization by default.
            Defaults to ``True``.

        callbacks: Add a callback or list of callbacks.
            Defaults to ``None``.

        auto_start_tensorboard: When ``True`` and tensorboard is installed, automatically starts
            a TensorBoard server pointing at the experiment log directory.
            Using this option requires that TensorBoard is installed in the environment and moves control
            of the TensorBoard server lifecycle to the trainer, which will automatically terminate the server
            when the trainer is finalized (e.g., at the end of a `with` block or when the object is garbage collected).
            Enable `auto_wait` to keep the server alive after training completes so you can inspect results before the trainer is finalized.
            Defaults to ``False``.

        tensorboard_port: Port for the auto-launched TensorBoard server.
            Defaults to ``6006``.

        auto_wait: When ``True`` and a TensorBoard server is running, automatically calls
            :meth:`wait` inside `finalize` before terminating the server, so the user
            can inspect results before the process is stopped.
            Defaults to ``False``.
    """

    DEFAULT_BASE_LOG_DIR = "hyperbench_logs"
    EXPERIMENT_NAME_PREFIX = "experiment"
    VERSION_NAME_PREFIX = "version"

    def __init__(
        self,
        model_configs: List[ModelConfig],
        experiment_name: Optional[str] = None,
        # args to pass to each Trainer
        accelerator: str | Accelerator = "auto",
        devices: list[int] | str | int = "auto",
        strategy: str | Strategy = "auto",
        num_nodes: int = 1,
        precision: Optional[
            Any  # Any as Lightning accepts multiple types (int, str, Literal, etc.)
        ] = None,
        max_epochs: Optional[int] = None,
        min_epochs: Optional[int] = None,
        max_steps: int = -1,
        min_steps: Optional[int] = None,
        check_val_every_n_epoch: Optional[int] = 1,
        logger: Optional[Logger | Iterable[Logger] | bool] = None,
        default_root_dir: Optional[str | Path] = None,
        enable_autolog_hparams: bool = True,
        log_every_n_steps: Optional[int] = None,
        profiler: Optional[Profiler | str] = None,
        fast_dev_run: int | bool = False,
        enable_checkpointing: bool = True,
        enable_progress_bar: bool = True,
        enable_model_summary: Optional[bool] = None,
        callbacks: Optional[List[Callback] | Callback] = None,
        auto_start_tensorboard: bool = False,
        tensorboard_port: int = 6006,
        auto_wait: bool = False,
        **kwargs,
    ) -> None:
        self.model_configs = model_configs
        self.log_dir = self.__setup_logdir(default_root_dir, experiment_name)

        self.auto_start_tensorboard = auto_start_tensorboard
        self.auto_wait = auto_wait
        self.tensorboard_port = tensorboard_port
        self.__tensorboard_process: Optional[subprocess.Popen] = None

        for model_config in model_configs:
            if model_config.trainer is None:
                model_logger = self.__setup_logger(model_config, logger)

                model_config.trainer = L.Trainer(
                    accelerator=accelerator,
                    devices=devices,
                    strategy=strategy,
                    num_nodes=num_nodes,
                    precision=precision,
                    max_epochs=max_epochs,
                    min_epochs=min_epochs,
                    max_steps=max_steps,
                    min_steps=min_steps,
                    check_val_every_n_epoch=check_val_every_n_epoch,
                    logger=model_logger,
                    default_root_dir=default_root_dir,
                    enable_autolog_hparams=enable_autolog_hparams,
                    log_every_n_steps=log_every_n_steps,
                    profiler=profiler,
                    fast_dev_run=fast_dev_run,
                    enable_checkpointing=enable_checkpointing,
                    enable_progress_bar=enable_progress_bar,
                    enable_model_summary=enable_model_summary,
                    # deepcopy so each Trainer gets its own stateful callback instances
                    callbacks=copy.deepcopy(callbacks),
                    **kwargs,
                )

        print(f"Initialized trainer(models: {len(model_configs)}, log_dir: {self.log_dir})")
        self.__auto_start_tensorboard_if_enabled()

    def __enter__(self) -> "MultiModelTrainer":
        return self

    def __exit__(self, exc_type, exc_val, exc_tb) -> None:
        self.finalize()

    def __del__(self) -> None:
        # Best-effort cleanup: never raise from a destructor (e.g. during interpreter shutdown).
        try:
            self.finalize()
        except Exception:
            pass

    @property
    def models(self) -> List[L.LightningModule]:
        """All managed models, in configuration order."""
        return [config.model for config in self.model_configs]

    def model(self, name: str, version: str = "default") -> Optional[L.LightningModule]:
        """Return the model matching ``name`` and ``version``, or ``None`` if not found."""
        for config in self.model_configs:
            if config.name == name and config.version == version:
                return config.model
        return None

    def fit_all(
        self,
        train_dataloader: Optional[DataLoader] = None,
        val_dataloader: Optional[DataLoader] = None,
        datamodule: Optional[L.LightningDataModule] = None,
        ckpt_path: Optional[CkptStrategy] = None,
        verbose: bool = True,
    ) -> None:
        """
        Fit every trainable model with its own trainer.

        Raises:
            ValueError: If there are no models, or a trainable model has no trainer.
        """
        if len(self.model_configs) < 1:
            raise ValueError("No models to fit.")

        for i, config in enumerate(self.model_configs):
            if not config.is_trainable:
                if verbose:
                    print(
                        f"Skipping training for model {config.full_model_name()} [{i + 1}/{len(self.model_configs)} models] (is_trainable=False)"
                    )
                continue

            if config.trainer is None:
                raise ValueError(f"Trainer not defined for model {config.full_model_name()}.")

            if verbose:
                print(
                    f"Fit model {config.full_model_name()} [{i + 1}/{len(self.model_configs)} models]"
                )

            config.trainer.fit(
                model=config.model,
                train_dataloaders=train_dataloader,
                val_dataloaders=val_dataloader,
                datamodule=datamodule,
                ckpt_path=ckpt_path,
            )

    def test_all(
        self,
        dataloader: Optional[DataLoader] = None,
        datamodule: Optional[L.LightningDataModule] = None,
        ckpt_path: Optional[CkptStrategy] = None,
        verbose: bool = True,
        verbose_loop: bool = True,
    ) -> Mapping[str, TestResult]:
        """
        Test every model with its own trainer and collect results keyed by full model name.

        Raises:
            ValueError: If there are no models, or a model has no trainer.
        """
        if len(self.model_configs) < 1:
            raise ValueError("No models to test.")

        test_results: Dict[str, TestResult] = {}

        for i, config in enumerate(self.model_configs):
            if config.trainer is None:
                raise ValueError(f"Trainer not defined for model {config.full_model_name()}.")

            if verbose:
                print(
                    f"Test model {config.full_model_name()} [{i + 1}/{len(self.model_configs)} models]"
                )

            trainer_test_results: List[TestResult] = config.trainer.test(
                model=config.model,
                dataloaders=dataloader,
                datamodule=datamodule,
                ckpt_path=ckpt_path,
                verbose=verbose_loop,
            )

            # In Lightning, test() returns a list of dicts, one per dataloader, but we use a single dataloader
            test_results[config.full_model_name()] = (
                trainer_test_results[0] if len(trainer_test_results) > 0 else {}
            )

        return test_results

    def __auto_start_tensorboard_if_enabled(self) -> None:
        """Launch a TensorBoard subprocess when requested and the package is installed."""
        if self.auto_start_tensorboard:
            if self.__is_tensorboard_available():
                self.__tensorboard_process = self.__start_tensorboard_process()
            else:
                warnings.warn(
                    "TensorBoard is not available. "
                    "Install it with `pip install hyperbench[tensorboard]` or `pip install tensorboard` "
                    "to enable auto-start.",
                    category=UserWarning,
                    stacklevel=2,
                )

    def finalize(self) -> None:
        """Optionally wait for the user, then terminate the managed TensorBoard server (idempotent)."""
        if self.auto_wait:
            self.wait()
        if self.__tensorboard_process is not None:
            self.__tensorboard_process.terminate()
            self.__tensorboard_process = None

    def wait(self) -> None:
        """
        Wait until the user presses Enter, keeping process alive.
        If no process is running, this method does nothing.
        """
        # For now, we only use this for waiting on TensorBoard, but this can be extended
        # to support waiting for other processes or conditions as needed
        if self.__tensorboard_process is None:
            return

        print(f"TensorBoard is running at http://localhost:{self.tensorboard_port}")

        try:
            input("Press Enter to stop...")
        except (KeyboardInterrupt, EOFError):
            pass

    def __is_tensorboard_available(self) -> bool:
        """Check for the tensorboard package without importing it."""
        return importlib.util.find_spec("tensorboard") is not None

    def __start_tensorboard_process(self) -> Optional[subprocess.Popen]:
        """Start `tensorboard` as a subprocess; return the process, or ``None`` on failure."""
        try:
            process = subprocess.Popen(
                ["tensorboard", "--logdir", self.log_dir, "--port", str(self.tensorboard_port)],
                stdout=subprocess.DEVNULL,
                stderr=subprocess.DEVNULL,
            )
            print(
                f"TensorBoard started at http://localhost:{self.tensorboard_port} (logdir={self.log_dir})"
            )
            return process
        except Exception as e:
            warnings.warn(
                f"Proceeding without starting TensorBoard as it failed: {e}",
                category=UserWarning,
                stacklevel=2,
            )
            return None

    def __next_experiment_name(self, save_dir: Path) -> Path:
        """Return the next auto-incremented ``experiment_<n>`` directory name under ``save_dir``."""
        prefix = MultiModelTrainer.EXPERIMENT_NAME_PREFIX
        if not save_dir.exists():
            return Path(f"{prefix}_0")

        existing_numbers: List[int] = []
        for child in save_dir.iterdir():
            if not (child.is_dir() and child.name.startswith(prefix)):
                continue
            parts = child.name.split("_")
            # Only numeric suffixes participate in auto-increment; this guards against
            # directories named exactly "experiment" (IndexError) or "experiment_final".
            if len(parts) > 1 and parts[1].isdigit():
                existing_numbers.append(int(parts[1]))

        # default=-1 handles the case where prefixed directories exist but none is
        # numbered (previously max() raised ValueError on an empty sequence).
        return Path(f"{prefix}_{max(existing_numbers, default=-1) + 1}")

    def __setup_logdir(
        self,
        default_root_dir: Optional[str | Path],
        experiment_name: Optional[str],
    ) -> Path:
        """Compute the experiment log directory from the root dir and (optional) experiment name."""
        base_dir = (
            Path(MultiModelTrainer.DEFAULT_BASE_LOG_DIR)
            if default_root_dir is None
            else Path(default_root_dir)
        )
        next_experiment_name = (
            self.__next_experiment_name(base_dir)
            if experiment_name is None
            else Path(experiment_name)
        )
        return base_dir / next_experiment_name

    def __setup_logger(
        self,
        model_config: ModelConfig,
        logger: Optional[Logger | Iterable[Logger] | bool],
    ) -> Optional[Logger | Iterable[Logger] | bool]:
        """Return the user-provided logger, or build per-model CSV (+TensorBoard) loggers."""
        if logger is not None:
            return logger

        loggers: List[Logger] = [
            CSVLogger(
                save_dir=self.log_dir,
                name=model_config.name,
                version=f"{MultiModelTrainer.VERSION_NAME_PREFIX}_{model_config.version}",
            ),
        ]

        if self.__is_tensorboard_available():
            # Imported lazily so the trainer works without tensorboard installed.
            from lightning.pytorch.loggers import TensorBoardLogger

            loggers.append(
                TensorBoardLogger(
                    save_dir=self.log_dir,
                    name=model_config.name,
                    version=f"{MultiModelTrainer.VERSION_NAME_PREFIX}_{model_config.version}",
                ),
            )

        return loggers

wait()

Wait until the user presses Enter, keeping the process alive. If no process is running, this method does nothing.

Source code in hyperbench/train/trainer.py
def wait(self) -> None:
    """
    Block until the user hits Enter, keeping the running process alive.
    A no-op when no process is currently running.
    """
    # Only the TensorBoard subprocess is guarded today; extend here if other
    # long-lived processes ever need the same keep-alive behavior.
    process = self.__tensorboard_process
    if process is None:
        return

    print(f"TensorBoard is running at http://localhost:{self.tensorboard_port}")

    try:
        input("Press Enter to stop...")
    except (EOFError, KeyboardInterrupt):
        pass

Negative Sampler

hyperbench.train.negative_sampler

NegativeSampler

Bases: ABC

Abstract base class for negative samplers.

Parameters:

Name Type Description Default
return_0based_negatives bool
  • If True, the negative samples returned by the sample method will have 0-based node and hyperedge IDs.
  • If False, the negative samples will retain the original global node and hyperedge IDs from the input data.
False
Source code in hyperbench/train/negative_sampler.py
class NegativeSampler(ABC):
    """
    Abstract base class for negative samplers.

    Args:
        return_0based_negatives:
            - If ``True``, negatives produced by ``sample`` carry remapped 0-based node and hyperedge IDs.
            - If ``False``, negatives keep the original global node and hyperedge IDs from the input data.
    """

    def __init__(self, return_0based_negatives: bool = False):
        super().__init__()
        self.return_0based_negatives: bool = return_0based_negatives

    @abstractmethod
    def sample(self, data: HData) -> HData:
        """
        Produce negative samples for the given data.

        Args:
            data: Input data object describing the graph or hypergraph.

        Returns:
            A new :class:`HData` object holding the negative samples.

        Raises:
            NotImplementedError: Always, unless overridden by a subclass.
        """
        raise NotImplementedError("Subclasses must implement this method.")

    def _new_negative_hyperedge_index(
        self,
        sampled_hyperedge_indexes: List[Tensor],
        negative_node_ids: Tensor,
        negative_hyperedge_ids: Tensor,
    ) -> Tensor:
        """
        Merge the per-sample hyperedge index tensors into one index, optionally remapped to 0-based IDs.

        Args:
            sampled_hyperedge_indexes: One hyperedge index tensor per negative sample.
            negative_node_ids: Tensor of negative node IDs.
            negative_hyperedge_ids: Tensor of negative hyperedge IDs.

        Returns:
            The merged hyperedge index tensor; remapped to 0-based node and hyperedge IDs
            when ``self.return_0based_negatives`` is ``True``, otherwise with the original
            global IDs from the input data.
        """
        merged = torch.cat(sampled_hyperedge_indexes, dim=1)
        if self.return_0based_negatives:
            rebased = HyperedgeIndex(merged).to_0based(
                node_ids_to_rebase=negative_node_ids,
                hyperedge_ids_to_rebase=negative_hyperedge_ids,
            )
            return rebased.item
        return merged

    def _new_hyperedge_attr(
        self,
        sampled_hyperedge_attrs: List[Tensor],
        hyperedge_attr: Optional[Tensor] = None,
    ) -> Optional[Tensor]:
        """
        Stack the per-sample hyperedge attribute tensors into a single attribute matrix.

        Args:
            sampled_hyperedge_attrs: One hyperedge attribute tensor per negative sample.
            hyperedge_attr: Original hyperedge attributes from the input data.

        Returns:
            The stacked attribute tensor, or ``None`` when the input data carried no
            hyperedge attributes or no samples were provided.
        """
        if hyperedge_attr is not None and len(sampled_hyperedge_attrs) >= 1:
            return torch.stack(sampled_hyperedge_attrs, dim=0)
        return None

    def _new_x(self, x: Tensor, negative_node_ids: Tensor) -> Tuple[Tensor, int]:
        """
        Select the node features belonging to the negative samples.

        Args:
            x: Original node feature matrix from the input data.
            negative_node_ids: Tensor of negative node IDs.

        Returns:
            A pair of (node feature matrix for the negatives, number of negative nodes).
        """
        selected = x[negative_node_ids]
        return selected, len(negative_node_ids)

sample(data) abstractmethod

Abstract method for negative sampling.

Parameters:

Name Type Description Default
data HData

The input data object containing graph or hypergraph information.

required

Returns:

Type Description
HData

The negative samples as a new :class:HData object.

Raises:

Type Description
NotImplementedError

If the method is not implemented in a subclass.

Source code in hyperbench/train/negative_sampler.py
@abstractmethod
def sample(self, data: HData) -> HData:
    """
    Produce negative samples for the given data.

    Args:
        data: Input data object describing the graph or hypergraph.

    Returns:
        A new :class:`HData` object holding the negative samples.

    Raises:
        NotImplementedError: Always, unless overridden by a subclass.
    """
    raise NotImplementedError("Subclasses must implement this method.")

RandomNegativeSampler

Bases: NegativeSampler

A random negative sampler. Negatives generated with return_0based_negatives = False aren't usable standalone as they have global node and hyperedge IDs. They must be concatenated with the original :class:HData object that is provided as input to the sample method, as it contains the global node and hyperedge IDs and features that can be indexed with the negative samples' IDs.

Parameters:

Name Type Description Default
num_negative_samples int

Number of negative hyperedges to generate.

required
num_nodes_per_sample int

Number of nodes per negative hyperedge.

required
return_0based_negatives bool
  • If True, the negative samples returned by the sample method will have 0-based node and hyperedge IDs.
  • If False, the negative samples will retain the original global node and hyperedge IDs from the input data.
False

Raises:

Type Description
ValueError

If either argument is not positive.

Source code in hyperbench/train/negative_sampler.py
class RandomNegativeSampler(NegativeSampler):
    """
    A random negative sampler. Negatives generated with ``return_0based_negatives = False`` aren't usable standalone
    as they have global node and hyperedge IDs. They must be concatenated with the original :class:`HData` object
    that is provided as input to the ``sample`` method, as it contains the global node and hyperedge IDs and features
    that can be indexed with the negative samples' IDs.

    Args:
        num_negative_samples: Number of negative hyperedges to generate.
        num_nodes_per_sample: Number of nodes per negative hyperedge.
        return_0based_negatives:
            - If ``True``, the negative samples returned by the ``sample`` method will have 0-based node and hyperedge IDs.
            - If ``False``, the negative samples will retain the original global node and hyperedge IDs from the input data.

    Raises:
        ValueError: If either argument is not positive.
    """

    def __init__(
        self,
        num_negative_samples: int,
        num_nodes_per_sample: int,
        return_0based_negatives: bool = False,
    ):
        if num_negative_samples <= 0:
            raise ValueError(f"num_negative_samples must be positive, got {num_negative_samples}.")
        if num_nodes_per_sample <= 0:
            raise ValueError(f"num_nodes_per_sample must be positive, got {num_nodes_per_sample}.")

        super().__init__(return_0based_negatives=return_0based_negatives)
        self.num_negative_samples = num_negative_samples
        self.num_nodes_per_sample = num_nodes_per_sample

    def sample(self, data: HData) -> HData:
        """
        Generate negative hyperedges by randomly sampling unique node IDs.
        Node IDs are sampled from the same node space as the input data, and the new negative hyperedge IDs
        start from the original number of hyperedges in the input data to avoid ID conflicts.
        The resulting negative samples are returned as a new :class:`HData` object with remapped 0-based node and hyperedge IDs, if ``self.return_0based_negatives == True``.
        Otherwise, the negative samples retain their original global node and hyperedge IDs from the input data.

        Examples:
            With ``self.return_0based_negatives = True``:

            >>> num_negative_samples = 2
            >>> num_nodes_per_sample = 3
            >>> negative_hyperedge_index = [[0, 0, 1, 2, 3, 4],
            ...                             [0, 1, 1, 0, 1, 0]]

            The negative hyperedge 0 connects nodes 0, 2, 3.
            The second negative hyperedge 1 connects nodes 0, 1, 4.

            >>> negative_x = data.x[[0, 1, 2, 3, 4]]
            >>> negative_hyperedge_attr = random_attributes_for_2_negative_hyperedges

            With ``self.return_0based_negatives = False``:

            >>> num_negative_samples = 2
            >>> num_nodes_per_sample = 3
            >>> negative_hyperedge_index = [[100, 120, 300, 450, 500, 501],
            ...                             [3, 3, 3, 4, 4, 4]]

            Since node IDs are not remapped, the original feature matrix can be used directly.

            >>> negative_x = data.x

        Args:
            data: The input data object containing node and hyperedge information.

        Returns:
            A new :class:`HData` instance containing the negative samples.

        Raises:
            ValueError: If ``num_nodes_per_sample`` is greater than the number of available nodes.
        """
        if self.num_nodes_per_sample > data.num_nodes:
            raise ValueError(
                f"Asked to create samples with {self.num_nodes_per_sample} nodes, but only {data.num_nodes} nodes are available."
            )

        device = data.device

        negative_node_ids: Set[int] = set()
        sampled_hyperedge_indexes: List[Tensor] = []
        sampled_hyperedge_attrs: List[Tensor] = []

        # Uniform weights over all nodes are loop-invariant: build once and reuse
        # (torch.multinomial does not modify its input).
        # Each node id gets equal probability of being selected by setting all weights to 1.
        equal_probabilities = torch.ones(data.num_nodes, device=device)

        new_hyperedge_id_offset = data.num_hyperedges
        for new_hyperedge_id in range(self.num_negative_samples):
            # Sample with multinomial without replacement to ensure unique node ids
            # Example: num_nodes_per_sample=3, max_node_id=5
            #          -> possible output: [2, 0, 4]
            sampled_node_ids = torch.multinomial(
                input=equal_probabilities,
                num_samples=self.num_nodes_per_sample,
                replacement=False,
            )

            # Example: sampled_node_ids = [2, 0, 4], new_hyperedge_id=0, new_hyperedge_id_offset=3
            #          -> hyperedge_index = [[2, 0, 4],
            #                                [3, 3, 3]]  # this is sampled_hyperedge_id_tensor
            sampled_hyperedge_id_tensor = torch.full(
                (self.num_nodes_per_sample,),
                new_hyperedge_id + new_hyperedge_id_offset,
                device=device,
            )
            sampled_hyperedge_index = torch.stack(
                [sampled_node_ids, sampled_hyperedge_id_tensor], dim=0
            )
            sampled_hyperedge_indexes.append(sampled_hyperedge_index)

            # Example: nodes = [0, 1, 2],
            #          sampled_node_ids_0 = [0, 1], sampled_node_ids_1 = [1, 2],
            #          -> negative_node_ids = {0, 1, 2}
            negative_node_ids.update(sampled_node_ids.tolist())

            if data.hyperedge_attr is not None:
                random_hyperedge_attr = torch.randn_like(data.hyperedge_attr[0], device=device)
                sampled_hyperedge_attrs.append(random_hyperedge_attr)

        # sorted() makes the row order of new_x deterministic (set iteration order is an
        # implementation detail) and matches the ascending-id example in the docstring.
        negative_node_ids_tensor = torch.tensor(sorted(negative_node_ids), device=device)
        new_x, num_negative_nodes = self._new_x(data.x, negative_node_ids_tensor)

        # Example: new_hyperedge_id_offset = 3 (if data.num_edges was 3)
        #          num_negative_samples = 2
        #          -> num_hyperedges_including_negatives = 5
        num_hyperedges_including_negatives = new_hyperedge_id_offset + self.num_negative_samples
        negative_hyperedge_ids = torch.arange(
            new_hyperedge_id_offset,
            num_hyperedges_including_negatives,
            device=device,
        )

        negative_hyperedge_index = self._new_negative_hyperedge_index(
            sampled_hyperedge_indexes,
            negative_node_ids_tensor,
            negative_hyperedge_ids,
        )

        return HData(
            x=new_x,
            hyperedge_index=negative_hyperedge_index,
            hyperedge_attr=self._new_hyperedge_attr(sampled_hyperedge_attrs, data.hyperedge_attr),
            num_nodes=num_negative_nodes,
            num_hyperedges=self.num_negative_samples,
        ).with_y_zeros()

sample(data)

Generate negative hyperedges by randomly sampling unique node IDs. Node IDs are sampled from the same node space as the input data, and the new negative hyperedge IDs start from the original number of hyperedges in the input data to avoid ID conflicts. The resulting negative samples are returned as a new :class:HData object with remapped 0-based node and hyperedge IDs, if self.return_0based_negatives == True. Otherwise, the negative samples retain their original global node and hyperedge IDs from the input data.

Examples:

With self.return_0based_negatives = True:

>>> num_negative_samples = 2
>>> num_nodes_per_sample = 3
>>> negative_hyperedge_index = [[0, 0, 1, 2, 3, 4],
...                             [0, 1, 1, 0, 1, 0]]

The negative hyperedge 0 connects nodes 0, 2, 3. The second negative hyperedge 1 connects nodes 0, 1, 4.

>>> negative_x = data.x[[0, 1, 2, 3, 4]]
>>> negative_hyperedge_attr = random_attributes_for_2_negative_hyperedges

With self.return_0based_negatives = False:

>>> num_negative_samples = 2
>>> num_nodes_per_sample = 3
>>> negative_hyperedge_index = [[100, 120, 300, 450, 500, 501],
...                             [3, 3, 3, 4, 4, 4]]

Since node IDs are not remapped, the original feature matrix can be used directly.

>>> negative_x = data.x

Parameters:

Name Type Description Default
data HData

The input data object containing node and hyperedge information.

required

Returns:

Type Description
HData

A new :class:HData instance containing the negative samples.

Raises:

Type Description
ValueError

If num_nodes_per_sample is greater than the number of available nodes.

Source code in hyperbench/train/negative_sampler.py
def sample(self, data: HData) -> HData:
    """
    Generate negative hyperedges by randomly sampling unique node IDs.
    Node IDs are sampled from the same node space as the input data, and the new negative hyperedge IDs
    start from the original number of hyperedges in the input data to avoid ID conflicts.
    The resulting negative samples are returned as a new :class:`HData` object with remapped 0-based node and hyperedge IDs, if ``self.return_0based_negatives == True``.
    Otherwise, the negative samples retain their original global node and hyperedge IDs from the input data.

    Examples:
        With ``self.return_0based_negatives = True``:

        >>> num_negative_samples = 2
        >>> num_nodes_per_sample = 3
        >>> negative_hyperedge_index = [[0, 0, 1, 2, 3, 4],
        ...                             [0, 1, 1, 0, 1, 0]]

        The negative hyperedge 0 connects nodes 0, 2, 4.
        The second negative hyperedge 1 connects nodes 0, 1, 3.

        >>> negative_x = data.x[[0, 1, 2, 3, 4]]
        >>> negative_hyperedge_attr = random_attributes_for_2_negative_hyperedges

        With ``self.return_0based_negatives = False``:

        >>> num_negative_samples = 2
        >>> num_nodes_per_sample = 3
        >>> negative_hyperedge_index = [[100, 120, 300, 450, 500, 501],
        ...                             [3, 3, 3, 4, 4, 4]]

        Since node IDs are not remapped, the original feature matrix can be used directly.

        >>> negative_x = data.x

    Args:
        data: The input data object containing node and hyperedge information.

    Returns:
        A new :class:`HData` instance containing the negative samples.

    Raises:
        ValueError: If ``num_nodes_per_sample`` is greater than the number of available nodes.
    """
    if self.num_nodes_per_sample > data.num_nodes:
        raise ValueError(
            f"Asked to create samples with {self.num_nodes_per_sample} nodes, but only {data.num_nodes} nodes are available."
        )

    device = data.device

    negative_node_ids: Set[int] = set()
    sampled_hyperedge_indexes: List[Tensor] = []
    sampled_hyperedge_attrs: List[Tensor] = []

    # The sampling distribution is uniform and identical for every negative sample:
    # assign each node id equal probability of being selected by setting all weights to 1.
    # Built once here (loop-invariant) instead of once per sample.
    equal_probabilities = torch.ones(data.num_nodes, device=device)

    new_hyperedge_id_offset = data.num_hyperedges
    for new_hyperedge_id in range(self.num_negative_samples):
        # Sample with multinomial without replacement to ensure unique node ids.
        # Example: num_nodes_per_sample=3, max_node_id=5
        #          -> possible output: [2, 0, 4]
        sampled_node_ids = torch.multinomial(
            input=equal_probabilities,
            num_samples=self.num_nodes_per_sample,
            replacement=False,
        )

        # Example: sampled_node_ids = [2, 0, 4], new_hyperedge_id=0, new_hyperedge_id_offset=3
        #          -> hyperedge_index = [[2, 0, 4],
        #                                [3, 3, 3]]  # this is sampled_hyperedge_id_tensor
        sampled_hyperedge_id_tensor = torch.full(
            (self.num_nodes_per_sample,),
            new_hyperedge_id + new_hyperedge_id_offset,
            device=device,
        )
        sampled_hyperedge_index = torch.stack(
            [sampled_node_ids, sampled_hyperedge_id_tensor], dim=0
        )
        sampled_hyperedge_indexes.append(sampled_hyperedge_index)

        # Example: nodes = [0, 1, 2],
        #          sampled_node_ids_0 = [0, 1], sampled_node_ids_1 = [1, 2],
        #          -> negative_node_ids = {0, 1, 2}
        negative_node_ids.update(sampled_node_ids.tolist())

        if data.hyperedge_attr is not None:
            random_hyperedge_attr = torch.randn_like(data.hyperedge_attr[0], device=device)
            sampled_hyperedge_attrs.append(random_hyperedge_attr)

    # Sort for a deterministic node ordering (set iteration order is not guaranteed).
    # The same tensor drives both _new_x and the index remapping below, so the
    # resulting HData stays internally consistent regardless of ordering.
    negative_node_ids_tensor = torch.tensor(sorted(negative_node_ids), device=device)
    new_x, num_negative_nodes = self._new_x(data.x, negative_node_ids_tensor)

    # Example: new_hyperedge_id_offset = 3 (if data.num_edges was 3)
    #          num_negative_samples = 2
    #          -> num_hyperedges_including_negatives = 5
    num_hyperedges_including_negatives = new_hyperedge_id_offset + self.num_negative_samples
    negative_hyperedge_ids = torch.arange(
        new_hyperedge_id_offset,
        num_hyperedges_including_negatives,
        device=device,
    )

    negative_hyperedge_index = self._new_negative_hyperedge_index(
        sampled_hyperedge_indexes,
        negative_node_ids_tensor,
        negative_hyperedge_ids,
    )

    return HData(
        x=new_x,
        hyperedge_index=negative_hyperedge_index,
        hyperedge_attr=self._new_hyperedge_attr(sampled_hyperedge_attrs, data.hyperedge_attr),
        num_nodes=num_negative_nodes,
        num_hyperedges=self.num_negative_samples,
    ).with_y_zeros()

Types Module

HData

hyperbench.types.hdata

HData

Container for hypergraph data.

Examples:

>>> x = torch.randn(10, 16)  # 10 nodes with 16 features each
>>> hyperedge_index = torch.tensor([[0, 0, 1, 1, 1],  # node IDs
...                                 [0, 1, 2, 3, 4]]) # hyperedge IDs
>>> data = HData(x, hyperedge_index=hyperedge_index)

Parameters:

Name Type Description Default
x Tensor

Node feature matrix of shape [num_nodes, num_features].

required
hyperedge_index Tensor

Hyperedge connectivity in COO format of shape [2, num_incidences], where hyperedge_index[0] contains node IDs and hyperedge_index[1] contains hyperedge IDs.

required
hyperedge_attr Optional[Tensor]

Hyperedge feature matrix of shape [num_hyperedges, num_hyperedge_features]. Features associated with each hyperedge (e.g., weights, timestamps, types).

None
num_nodes Optional[int]

Number of nodes in the hypergraph. If None, inferred as x.size(0).

None
num_hyperedges Optional[int]

Number of hyperedges in the hypergraph. If None, inferred as the number of unique hyperedge IDs in hyperedge_index[1].

None
y Optional[Tensor]

Labels for hyperedges, of shape [num_hyperedges]. Used for supervised learning tasks. For unsupervised tasks, this can be ignored. Default is a tensor of ones, indicating all hyperedges are positive examples.

None
Source code in hyperbench/types/hdata.py
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
class HData:
    """
    Container for hypergraph data.

    Examples:
        >>> x = torch.randn(10, 16)  # 10 nodes with 16 features each
        >>> hyperedge_index = torch.tensor([[0, 0, 1, 1, 1],  # node IDs
        ...                                 [0, 1, 2, 3, 4]]) # hyperedge IDs
        >>> data = HData(x, hyperedge_index=hyperedge_index)

    Args:
        x: Node feature matrix of shape ``[num_nodes, num_features]``.
        hyperedge_index: Hyperedge connectivity in COO format of shape ``[2, num_incidences]``,
            where ``hyperedge_index[0]`` contains node IDs and ``hyperedge_index[1]`` contains hyperedge IDs.
        hyperedge_attr: Hyperedge feature matrix of shape ``[num_hyperedges, num_hyperedge_features]``.
            Features associated with each hyperedge (e.g., weights, timestamps, types).
        num_nodes: Number of nodes in the hypergraph.
            If ``None``, inferred as ``x.size(0)``.
        num_hyperedges: Number of hyperedges in the hypergraph.
            If ``None``, inferred as the number of unique hyperedge IDs in ``hyperedge_index[1]``.
        y: Labels for hyperedges, of shape ``[num_hyperedges]``.
            Used for supervised learning tasks. For unsupervised tasks, this can be ignored.
            Default is a tensor of ones, indicating all hyperedges are positive examples.
    """

    def __init__(
        self,
        x: Tensor,
        hyperedge_index: Tensor,
        hyperedge_attr: Optional[Tensor] = None,
        num_nodes: Optional[int] = None,
        num_hyperedges: Optional[int] = None,
        y: Optional[Tensor] = None,
    ):
        self.x: Tensor = x

        self.hyperedge_index: Tensor = hyperedge_index

        self.hyperedge_attr: Optional[Tensor] = hyperedge_attr

        hyperedge_index_wrapper = HyperedgeIndex(hyperedge_index)

        self.num_nodes: int = (
            num_nodes
            if num_nodes is not None
            # There should never be isolated nodes when HData is created by Dataset
            # as each isolated node gets its own self-loop hyperedge
            else hyperedge_index_wrapper.num_nodes_if_isolated_exist(num_nodes=x.size(0))
        )

        self.num_hyperedges: int = (
            num_hyperedges if num_hyperedges is not None else hyperedge_index_wrapper.num_hyperedges
        )

        # Default labels: all ones, i.e. every hyperedge is treated as a positive example.
        self.y: Tensor = (
            y
            if y is not None
            else torch.ones((self.num_hyperedges,), dtype=torch.float, device=self.x.device)
        )

        # Raises ValueError if tensors live on different devices.
        self.device: torch.device = self.get_device_if_all_consistent()

    def __repr__(self) -> str:
        """Return a multi-line summary of shapes, counts, and device for debugging."""
        return (
            f"{self.__class__.__name__}(\n"
            f"    num_nodes={self.num_nodes},\n"
            f"    num_hyperedges={self.num_hyperedges},\n"
            f"    x_shape={self.x.shape},\n"
            f"    hyperedge_index_shape={self.hyperedge_index.shape},\n"
            f"    hyperedge_attr_shape={self.hyperedge_attr.shape if self.hyperedge_attr is not None else None},\n"
            f"    y_shape={self.y.shape if self.y is not None else None}\n"
            f"    device={self.device}\n"
            f")"
        )

    @classmethod
    def cat_same_node_space(cls, hdatas: Sequence["HData"], x: Optional[Tensor] = None) -> "HData":
        """
        Concatenate :class:`HData` instances that share the same node space, meaning nodes with the same ID in different instances are the same node.
        This is useful when combining positive and negative hyperedges that reference the same set of nodes.

        Notes:
            - ``x`` is derived from the instance with the largest number of nodes, if not provided explicitly. If there are conflicting features for the same node ID across instances, the features from the instance with the largest number of nodes will be used.
            - ``hyperedge_index`` is the concatenation of all input hyperedge indices.
            - ``hyperedge_attr`` is the concatenation of all input hyperedge attributes, if present. If some instances have hyperedge attributes and others do not, the resulting ``hyperedge_attr`` will be set to ``None``.
            - ``y`` is the concatenation of all input labels.

        Examples:
            >>> x = torch.randn(5, 8)
            >>> pos = HData(x, torch.tensor([[0, 1, 2, 3, 4], [0, 0, 1, 2, 2]]))
            >>> neg = HData(x, torch.tensor([[0, 2], [3, 3]]))
            >>> new = HData.cat_same_node_space([pos, neg])
            >>> new.num_nodes  # 5 — nodes [0, 1, 2, 3, 4]
            >>> new.num_hyperedges  # 4 — hyperedges [0, 1, 2, 3]

        Args:
            hdatas: One or more :class:`HData` instances sharing the same node space.
            x: Optional node feature matrix to use for the resulting :class:`HData`.
                If ``None``, the node features from the instance with the largest number of nodes will be used.

        Returns:
            A new :class:`HData` with shared nodes and concatenated hyperedges.

        Raises:
            ValueError: If ``hdatas`` is empty, or if hyperedge IDs overlap across instances.
        """
        if len(hdatas) < 1:
            raise ValueError("At least one instance is required.")

        # Hyperedge IDs must be globally unique across inputs so the concatenated
        # index does not silently merge distinct hyperedges.
        joint_hyperedge_ids = torch.cat([hdata.hyperedge_index[1].unique() for hdata in hdatas])
        unique_joint_hyperedge_ids = joint_hyperedge_ids.unique()
        if unique_joint_hyperedge_ids.size(0) != joint_hyperedge_ids.size(0):
            raise ValueError(
                "Overlapping hyperedge IDs found across instances. Ensure each instance uses distinct hyperedge IDs."
            )

        new_x = x if x is not None else max(hdatas, key=lambda hdata: hdata.num_nodes).x
        new_y = torch.cat([hdata.y for hdata in hdatas], dim=0)
        new_hyperedge_index = torch.cat([hdata.hyperedge_index for hdata in hdatas], dim=1)

        # Only concatenate hyperedge attributes when every input provides them;
        # otherwise the result has no hyperedge attributes at all.
        hyperedge_attrs = []
        have_all_hyperedge_attr = all(hdata.hyperedge_attr is not None for hdata in hdatas)
        for hdata in hdatas:
            if have_all_hyperedge_attr and hdata.hyperedge_attr is not None:
                hyperedge_attrs.append(hdata.hyperedge_attr)
        new_hyperedge_attr = torch.cat(hyperedge_attrs, dim=0) if len(hyperedge_attrs) > 0 else None

        return cls(
            x=new_x,
            hyperedge_index=new_hyperedge_index,
            hyperedge_attr=new_hyperedge_attr,
            num_nodes=new_x.size(0),
            num_hyperedges=new_y.size(0),
            y=new_y,
        )

    @classmethod
    def empty(cls) -> "HData":
        """Return an empty :class:`HData` with no nodes, hyperedges, or attributes."""
        return cls(
            x=empty_nodefeatures(),
            hyperedge_index=empty_hyperedgeindex(),
            hyperedge_attr=None,
            num_nodes=0,
            num_hyperedges=0,
            y=None,
        )

    @classmethod
    def from_hyperedge_index(cls, hyperedge_index: Tensor) -> "HData":
        """
        Build an :class:`HData` from a given hyperedge index, with empty node features and hyperedge attributes.

        - Node features are initialized as an empty tensor of shape ``[0, 0]``.
        - Hyperedge attributes are set to ``None``.
        - The number of nodes and hyperedges are inferred from the hyperedge index.

        Examples:
            >>> hyperedge_index = [[0, 0, 1, 2, 3, 4],
            ...                    [0, 0, 0, 1, 2, 2]]
            >>> num_nodes = 5
            >>> num_hyperedges = 3
            >>> x = []  # Empty node features with shape [0, 0]
            >>> hyperedge_attr = None

        Args:
            hyperedge_index: Tensor of shape ``[2, num_incidences]`` representing the hypergraph connectivity.

        Returns:
            An :class:`HData` instance with the given hyperedge index and default values for other attributes.
        """
        return cls(
            x=empty_nodefeatures(),
            hyperedge_index=hyperedge_index,
            hyperedge_attr=None,
            y=None,
        )

    @classmethod
    def split(cls, hdata: "HData", split_hyperedge_ids: Tensor) -> "HData":
        """
        Build an :class:`HData` for a single split from the given hyperedge IDs.

        Examples:
            >>> hyperedge_index = [[0, 0, 1, 2, 3, 4],
            ...                    [0, 0, 0, 1, 2, 2]]
            >>> split_hyperedge_ids = [0, 2]
            >>> new_hyperedge_index = [[0, 0, 1, 2, 3],  # nodes 0 -> 0, 1 -> 1, 3 -> 2, 4 -> 3 (remapped to 0-based)
            ...                        [0, 0, 0, 1, 1]]  # hyperedges 0 -> 0, 2 -> 1 (remapped to 0-based)
            >>> new_x = [x[0], x[1], x[3], x[4]]
            >>> new_hyperedge_attr = [hyperedge_attr[0], hyperedge_attr[2]]

        Args:
            hdata: The original :class:`HData` containing the full hypergraph.
            split_hyperedge_ids: Tensor of hyperedge IDs to include in this split.

        Returns:
            The splitted instance with remapped node and hyperedge IDs.
        """
        # Mask to keep only incidences belonging to selected hyperedges
        # Example: hyperedge_index = [[0, 0, 1, 2, 3, 4],
        #                             [0, 0, 0, 1, 2, 2]]
        #          split_hyperedge_ids = [0, 2]
        #          -> mask = [True, True, True, False, True, True]
        keep_mask = torch.isin(hdata.hyperedge_index[1], split_hyperedge_ids)

        # Example: hyperedge_index = [[0, 0, 1, 3, 4],
        #                             [0, 0, 0, 2, 2]]
        #          incidence [2, 1] is missing as 1 is not in split_hyperedge_ids = [0, 2]
        split_hyperedge_index = hdata.hyperedge_index[:, keep_mask]

        # Example: split_hyperedge_index = [[0, 0, 1, 3, 4],
        #                                   [0, 0, 0, 2, 2]]
        #          -> split_unique_node_ids = [0, 1, 3, 4]
        #          -> split_unique_hyperedge_ids = [0, 2]
        split_unique_node_ids = split_hyperedge_index[0].unique()
        split_unique_hyperedge_ids = split_hyperedge_index[1].unique()

        split_hyperedge_index_wrapper = HyperedgeIndex(split_hyperedge_index).to_0based(
            node_ids_to_rebase=split_unique_node_ids,
            hyperedge_ids_to_rebase=split_unique_hyperedge_ids,
        )

        new_x = hdata.x[split_unique_node_ids]
        new_y = hdata.y[split_unique_hyperedge_ids]

        # Subset hyperedge_attr if present
        new_hyperedge_attr = None
        if hdata.hyperedge_attr is not None:
            new_hyperedge_attr = hdata.hyperedge_attr[split_unique_hyperedge_ids]

        return cls(
            x=new_x,
            hyperedge_index=split_hyperedge_index_wrapper.item,
            hyperedge_attr=new_hyperedge_attr,
            num_nodes=len(split_unique_node_ids),
            num_hyperedges=len(split_unique_hyperedge_ids),
            y=new_y,
        )

    def enrich_node_features(
        self,
        enricher: NodeFeatureEnricher,
        enrichment_mode: Optional[EnrichmentMode] = None,
    ) -> None:
        """
        Enrich node features using the provided node feature enricher.

        Args:
            enricher: An instance of NodeFeatureEnricher to generate structural node features from hypergraph topology.
            enrichment_mode: How to combine generated features with existing ``hdata.x``.
                ``concatenate`` appends new features as additional columns.
                ``replace`` substitutes ``hdata.x`` entirely.
        """
        enriched_features = enricher.enrich(self.hyperedge_index)

        # Any mode other than "concatenate" (including None) replaces x entirely.
        match enrichment_mode:
            case "concatenate":
                self.x = torch.cat([self.x, enriched_features], dim=1)
            case _:
                self.x = enriched_features

    def get_device_if_all_consistent(self) -> torch.device:
        """
        Check that all tensors are on the same device and return that device.

        ``x``, ``hyperedge_index``, and ``y`` always exist, so the device set is
        never empty in practice; the CPU fallback in the return is defensive only.

        Returns:
            The common device of all tensors (CPU if, defensively, no device was collected).

        Raises:
            ValueError: If tensors are on different devices.
        """
        devices = {self.x.device, self.hyperedge_index.device, self.y.device}
        if self.hyperedge_attr is not None:
            devices.add(self.hyperedge_attr.device)
        if len(devices) > 1:
            raise ValueError(f"Inconsistent device placement: {devices}")

        return devices.pop() if len(devices) == 1 else torch.device("cpu")

    def shuffle(self, seed: Optional[int] = None) -> "HData":
        """
        Return a new :class:`HData` instance with hyperedge IDs randomly reassigned.

        Each hyperedge keeps its original set of nodes, but is assigned a new ID via a random permutation.
        ``y`` and ``hyperedge_attr`` are reordered to match, so that ``y[new_id]`` still corresponds to the correct hyperedge.
        Same for ``hyperedge_attr[new_id]`` if hyperedge attributes are present.

        Examples:
            >>> hyperedge_index = torch.tensor([[0, 1, 2, 3], [0, 0, 1, 1]])
            >>> y  = torch.tensor([1, 0])
            >>> hdata = HData(x, hyperedge_index=hyperedge_index, y=y)
            >>> shuffled_hdata = hdata.shuffle(seed=42)
            >>> shuffled_hdata.hyperedge_index  # hyperedges may be reassigned
            ... # e.g.,
            ...     [[0, 1, 2, 3],
            ...      [1, 1, 0, 0]]
            >>> shuffled_hdata.y  # labels are permuted to match new hyperedge IDs, e.g., [0, 1]

        Args:
            seed: Optional random seed for reproducibility. If ``None``, the shuffle will be non-deterministic.

        Returns:
            A new :class:`HData` instance with hyperedge IDs, ``y``, and ``hyperedge_attr`` permuted.
        """
        generator = torch.Generator(device=self.device)
        if seed is not None:
            generator.manual_seed(seed)

        permutation = torch.randperm(self.num_hyperedges, generator=generator, device=self.device)

        # permutation[new_id] = old_id, so y[permutation] puts old labels into new slots
        # inverse_permutation[old_id] = new_id, used to remap hyperedge IDs in incidences
        # Example: permutation = [1, 2, 0] means new_id 0 gets old_id 1, new_id 1 gets old_id 2, new_id 2 gets old_id 0
        #          -> inverse_permutation = [2, 0, 1] means old_id 0 gets new_id 2, old_id 1 gets new_id 0, old_id 2 gets new_id 1
        inverse_permutation = torch.empty_like(permutation)
        inverse_permutation[permutation] = torch.arange(self.num_hyperedges, device=self.device)

        new_hyperedge_index = self.hyperedge_index.clone()

        # Example: hyperedge_index = [[0, 1, 2, 3, 4],
        #                             [0, 0, 1, 1, 2]],
        #          inverse_permutation = [2, 0, 1] (new_id 0 -> old_id 2, new_id 1 -> old_id 0, new_id 2 -> old_id 1)
        #          -> new_hyperedge_index = [[0, 1, 2, 3, 4],
        #                                    [2, 2, 0, 0, 1]]
        old_hyperedge_ids = self.hyperedge_index[1]
        new_hyperedge_index[1] = inverse_permutation[old_hyperedge_ids]

        # Example: hyperedge_attr = [attr_0, attr_1, attr_2], permutation = [1, 2, 0]
        #          -> new_hyperedge_attr = [attr_1  (attr of old_id 1), attr_2 (attr of old_id 2), attr_0 (attr of old_id 0)]
        new_hyperedge_attr = (
            self.hyperedge_attr[permutation] if self.hyperedge_attr is not None else None
        )

        # Example: y = [1, 1, 0], permutation = [1, 2, 0]
        #          -> new_y = [y[1], y[2], y[0]] = [1, 0, 1]
        new_y = self.y[permutation]

        return HData(
            x=self.x,
            hyperedge_index=new_hyperedge_index,
            hyperedge_attr=new_hyperedge_attr,
            num_nodes=self.num_nodes,
            num_hyperedges=self.num_hyperedges,
            y=new_y,
        )

    def to(self, device: torch.device | str, non_blocking: bool = False) -> "HData":
        """
        Move all tensors to the specified device.

        Args:
            device: The target device (e.g., 'cpu', 'cuda:0').
            non_blocking: If ``True`` and the source and destination devices are both CUDA, the copy will be non-blocking.

        Returns:
            The :class:`HData` instance with all tensors moved to the specified device.
        """
        self.x = self.x.to(device=device, non_blocking=non_blocking)
        self.hyperedge_index = self.hyperedge_index.to(device=device, non_blocking=non_blocking)
        self.y = self.y.to(device=device, non_blocking=non_blocking)

        if self.hyperedge_attr is not None:
            self.hyperedge_attr = self.hyperedge_attr.to(device=device, non_blocking=non_blocking)

        self.device = device if isinstance(device, torch.device) else torch.device(device)
        return self

    def with_y_to(self, value: float) -> "HData":
        """
        Return a copy of this instance with a y attribute set to the given value.

        Args:
            value: The value to set for all entries in the y attribute.

        Returns:
            A new :class:`HData` instance with the same attributes except for y, which is set to a tensor of the given value.
        """
        return HData(
            x=self.x,
            hyperedge_index=self.hyperedge_index,
            hyperedge_attr=self.hyperedge_attr,
            num_nodes=self.num_nodes,
            num_hyperedges=self.num_hyperedges,
            y=torch.full((self.num_hyperedges,), value, dtype=torch.float, device=self.device),
        )

    def with_y_ones(self) -> "HData":
        """Return a copy of this instance with a y attribute of all ones."""
        return self.with_y_to(1.0)

    def with_y_zeros(self) -> "HData":
        """Return a copy of this instance with a y attribute of all zeros."""
        return self.with_y_to(0.0)

    def stats(self) -> Dict[str, Any]:
        """
        Compute statistics for the hypergraph data.
        The fields returned in the dictionary include:
        - ``shape_x``: The shape of the node feature matrix ``x``.
        - ``shape_hyperedge_attr``: The shape of the hyperedge attribute matrix, or ``None`` if hyperedge attributes are not present.
        - ``num_nodes``: The number of nodes in the hypergraph.
        - ``num_hyperedges``: The number of hyperedges in the hypergraph.
        - ``avg_degree_node_raw``: The average degree of nodes, calculated as the mean number of hyperedges each node belongs to.
        - ``avg_degree_node``: The floored node average degree.
        - ``avg_degree_hyperedge_raw``: The average size of hyperedges, calculated as the mean number of nodes each hyperedge contains.
        - ``avg_degree_hyperedge``: The floored hyperedge average size.
        - ``node_degree_max``: The maximum degree of any node in the hypergraph.
        - ``hyperedge_degree_max``: The maximum size of any hyperedge in the hypergraph.
        - ``node_degree_median``: The median degree of nodes in the hypergraph.
        - ``hyperedge_degree_median``: The median size of hyperedges in the hypergraph.
        - ``distribution_node_degree``: A list where the value at index ``i`` represents the count of nodes with degree ``i``.
        - ``distribution_hyperedge_size``: A list where the value at index ``i`` represents the count of hyperedges with size ``i``.
        - ``distribution_node_degree_hist``: A dictionary where the keys are node degrees and the values are the count of nodes with that degree.
        - ``distribution_hyperedge_size_hist``: A dictionary where the keys are hyperedge sizes and the values are the count of hyperedges with that size.

        Returns:
            A dictionary containing various statistics about the hypergraph.
        """

        node_ids = self.hyperedge_index[0]
        hyperedge_ids = self.hyperedge_index[1]

        # Degree of each node = number of hyperedges it belongs to
        # Size of each hyperedge = number of nodes it contains
        if node_ids.numel() > 0:
            distribution_node_degree = torch.bincount(node_ids, minlength=self.num_nodes).float()
            distribution_hyperedge_size = torch.bincount(
                hyperedge_ids, minlength=self.num_hyperedges
            ).float()
        else:
            distribution_node_degree = torch.zeros(self.num_nodes, dtype=torch.float)
            distribution_hyperedge_size = torch.zeros(self.num_hyperedges, dtype=torch.float)

        num_nodes = self.num_nodes
        num_hyperedges = self.num_hyperedges

        # NOTE(review): this guard only checks the node-degree tensor. If num_nodes > 0
        # but there are no incidences and num_hyperedges == 0, distribution_hyperedge_size
        # is empty while this branch is taken, so .mean() yields nan and int(nan) raises —
        # confirm whether that state is reachable in practice.
        if distribution_node_degree.numel() > 0:
            avg_degree_node_raw = distribution_node_degree.mean().item()
            avg_degree_node = int(avg_degree_node_raw)
            avg_degree_hyperedge_raw = distribution_hyperedge_size.mean().item()
            avg_degree_hyperedge = int(avg_degree_hyperedge_raw)
            node_degree_max = int(distribution_node_degree.max().item())
            hyperedge_degree_max = int(distribution_hyperedge_size.max().item())
            node_degree_median = int(distribution_node_degree.median().item())
            hyperedge_degree_median = int(distribution_hyperedge_size.median().item())
        else:
            avg_degree_node_raw = 0
            avg_degree_node = 0
            avg_degree_hyperedge_raw = 0
            avg_degree_hyperedge = 0
            node_degree_max = 0
            hyperedge_degree_max = 0
            node_degree_median = 0
            hyperedge_degree_median = 0

        # Histograms: index i holds count of nodes/hyperedges with degree/size i
        distribution_node_degree_hist = torch.bincount(distribution_node_degree.long())
        distribution_hyperedge_size_hist = torch.bincount(distribution_hyperedge_size.long())

        # Drop zero-count entries so the histograms only list degrees/sizes that occur.
        distribution_node_degree_hist = {
            i: int(count.item())
            for i, count in enumerate(distribution_node_degree_hist)
            if count.item() > 0
        }
        distribution_hyperedge_size_hist = {
            i: int(count.item())
            for i, count in enumerate(distribution_hyperedge_size_hist)
            if count.item() > 0
        }

        return {
            "shape_x": self.x.shape,
            "shape_hyperedge_attr": self.hyperedge_attr.shape
            if self.hyperedge_attr is not None
            else None,
            "num_nodes": num_nodes,
            "num_hyperedges": num_hyperedges,
            "avg_degree_node_raw": avg_degree_node_raw,
            "avg_degree_node": avg_degree_node,
            "avg_degree_hyperedge_raw": avg_degree_hyperedge_raw,
            "avg_degree_hyperedge": avg_degree_hyperedge,
            "node_degree_max": node_degree_max,
            "hyperedge_degree_max": hyperedge_degree_max,
            "node_degree_median": node_degree_median,
            "hyperedge_degree_median": hyperedge_degree_median,
            "distribution_node_degree": distribution_node_degree.int().tolist(),
            "distribution_hyperedge_size": distribution_hyperedge_size.int().tolist(),
            "distribution_node_degree_hist": distribution_node_degree_hist,
            "distribution_hyperedge_size_hist": distribution_hyperedge_size_hist,
        }

cat_same_node_space(hdatas, x=None) classmethod

Concatenate :class:HData instances that share the same node space, meaning nodes with the same ID in different instances are the same node. This is useful when combining positive and negative hyperedges that reference the same set of nodes.

Notes
  • x is derived from the instance with the largest number of nodes, if not provided explicitly. If there are conflicting features for the same node ID across instances, the features from the instance with the largest number of nodes will be used.
  • hyperedge_index is the concatenation of all input hyperedge indices.
  • hyperedge_attr is the concatenation of all input hyperedge attributes, if present. If some instances have hyperedge attributes and others do not, the resulting hyperedge_attr will be set to None.
  • y is the concatenation of all input labels.

Examples:

>>> x = torch.randn(5, 8)
>>> pos = HData(x, torch.tensor([[0, 1, 2, 3, 4], [0, 0, 1, 2, 2]]))
>>> neg = HData(x, torch.tensor([[0, 2], [3, 3]]))
>>> new = HData.cat_same_node_space([pos, neg])
>>> new.num_nodes  # 5 — nodes [0, 1, 2, 3, 4]
>>> new.num_hyperedges  # 4 — hyperedges [0, 1, 2, 3]

Parameters:

Name Type Description Default
hdatas Sequence[HData]

One or more :class:HData instances sharing the same node space.

required
x Optional[Tensor]

Optional node feature matrix to use for the resulting :class:HData. If None, the node features from the instance with the largest number of nodes will be used.

None

Returns:

Type Description
HData

A new :class:HData with shared nodes and concatenated hyperedges.

Raises:

Type Description
ValueError

If no instances are provided, or if overlapping hyperedge IDs are found across instances.

Source code in hyperbench/types/hdata.py
@classmethod
def cat_same_node_space(cls, hdatas: Sequence["HData"], x: Optional[Tensor] = None) -> "HData":
    """
    Merge :class:`HData` instances defined over a common node space.

    Nodes with the same ID in different instances are treated as the same
    node — the typical situation when combining positive and negative
    hyperedges that reference one shared node set.

    Notes:
        - When ``x`` is not given, node features come from the instance with
          the largest number of nodes; its features win on any conflicting
          node ID.
        - ``hyperedge_index`` and ``y`` are the concatenation of all inputs.
        - ``hyperedge_attr`` is concatenated only when every input provides
          it; otherwise the result has ``hyperedge_attr=None``.

    Examples:
        >>> x = torch.randn(5, 8)
        >>> pos = HData(x, torch.tensor([[0, 1, 2, 3, 4], [0, 0, 1, 2, 2]]))
        >>> neg = HData(x, torch.tensor([[0, 2], [3, 3]]))
        >>> new = HData.cat_same_node_space([pos, neg])
        >>> new.num_nodes  # 5 — nodes [0, 1, 2, 3, 4]
        >>> new.num_hyperedges  # 4 — hyperedges [0, 1, 2, 3]

    Args:
        hdatas: One or more :class:`HData` instances sharing the same node space.
        x: Optional node feature matrix for the result. Defaults to the
            features of the instance with the largest number of nodes.

    Returns:
        A new :class:`HData` with shared nodes and concatenated hyperedges.

    Raises:
        ValueError: If no instance is given, or if hyperedge IDs overlap
            across instances.
    """
    if not hdatas:
        raise ValueError("At least one instance is required.")

    # Hyperedge IDs must be globally unique across inputs: a repeated ID
    # would silently merge two distinct hyperedges.
    all_hyperedge_ids = torch.cat([hdata.hyperedge_index[1].unique() for hdata in hdatas])
    if all_hyperedge_ids.unique().size(0) != all_hyperedge_ids.size(0):
        raise ValueError(
            "Overlapping hyperedge IDs found across instances. Ensure each instance uses distinct hyperedge IDs."
        )

    if x is None:
        x = max(hdatas, key=lambda hdata: hdata.num_nodes).x

    merged_y = torch.cat([hdata.y for hdata in hdatas], dim=0)
    merged_index = torch.cat([hdata.hyperedge_index for hdata in hdatas], dim=1)

    # Keep hyperedge attributes only when every instance carries them.
    if all(hdata.hyperedge_attr is not None for hdata in hdatas):
        merged_attr = torch.cat([hdata.hyperedge_attr for hdata in hdatas], dim=0)
    else:
        merged_attr = None

    return cls(
        x=x,
        hyperedge_index=merged_index,
        hyperedge_attr=merged_attr,
        num_nodes=x.size(0),
        num_hyperedges=merged_y.size(0),
        y=merged_y,
    )

from_hyperedge_index(hyperedge_index) classmethod

Build an :class:HData from a given hyperedge index, with empty node features and hyperedge attributes.

  • Node features are initialized as an empty tensor of shape [0, 0].
  • Hyperedge attributes are set to None.
  • The number of nodes and hyperedges are inferred from the hyperedge index.

Examples:

>>> hyperedge_index = [[0, 0, 1, 2, 3, 4],
...                    [0, 0, 0, 1, 2, 2]]
>>> num_nodes = 5
>>> num_hyperedges = 3
>>> x = []  # Empty node features with shape [0, 0]
>>> hyperedge_attr = None

Parameters:

Name Type Description Default
hyperedge_index Tensor

Tensor of shape [2, num_incidences] representing the hypergraph connectivity.

required

Returns:

Name Type Description
An HData

class:HData instance with the given hyperedge index and default values for other attributes.

Source code in hyperbench/types/hdata.py
@classmethod
def from_hyperedge_index(cls, hyperedge_index: Tensor) -> "HData":
    """
    Create an :class:`HData` carrying only connectivity information.

    The returned instance has:
        - empty node features (shape ``[0, 0]``),
        - no hyperedge attributes (``None``),
        - no labels (``y=None``);
    the number of nodes and hyperedges is inferred from the hyperedge index.

    Examples:
        >>> hyperedge_index = [[0, 0, 1, 2, 3, 4],
        ...                    [0, 0, 0, 1, 2, 2]]
        >>> num_nodes = 5
        >>> num_hyperedges = 3
        >>> x = []  # Empty node features with shape [0, 0]
        >>> hyperedge_attr = None

    Args:
        hyperedge_index: Tensor of shape ``[2, num_incidences]`` describing
            the hypergraph connectivity.

    Returns:
        An :class:`HData` instance wrapping ``hyperedge_index`` with default
        values for every other attribute.
    """
    return cls(
        hyperedge_index=hyperedge_index,
        x=empty_nodefeatures(),
        hyperedge_attr=None,
        y=None,
    )

split(hdata, split_hyperedge_ids) classmethod

Build an :class:HData for a single split from the given hyperedge IDs.

Examples:

>>> hyperedge_index = [[0, 0, 1, 2, 3, 4],
...                    [0, 0, 0, 1, 2, 2]]
>>> split_hyperedge_ids = [0, 2]
>>> new_hyperedge_index = [[0, 0, 1, 2, 3],  # nodes 0 -> 0, 1 -> 1, 3 -> 2, 4 -> 3 (remapped to 0-based)
...                        [0, 0, 0, 1, 1]]  # hyperedges 0 -> 0, 2 -> 1 (remapped to 0-based)
>>> new_x = [x[0], x[1], x[3], x[4]]
>>> new_hyperedge_attr = [hyperedge_attr[0], hyperedge_attr[2]]

Parameters:

Name Type Description Default
hdata HData

The original :class:HData containing the full hypergraph.

required
split_hyperedge_ids Tensor

Tensor of hyperedge IDs to include in this split.

required

Returns:

Type Description
HData

The splitted instance with remapped node and hyperedge IDs.

Source code in hyperbench/types/hdata.py
@classmethod
def split(cls, hdata: "HData", split_hyperedge_ids: Tensor) -> "HData":
    """
    Extract the sub-hypergraph induced by the given hyperedge IDs.

    Selected hyperedges keep their node sets; node and hyperedge IDs are
    then remapped to a contiguous 0-based range.

    Examples:
        >>> hyperedge_index = [[0, 0, 1, 2, 3, 4],
        ...                    [0, 0, 0, 1, 2, 2]]
        >>> split_hyperedge_ids = [0, 2]
        >>> new_hyperedge_index = [[0, 0, 1, 2, 3],  # nodes 0 -> 0, 1 -> 1, 3 -> 2, 4 -> 3 (remapped to 0-based)
        ...                        [0, 0, 0, 1, 1]]  # hyperedges 0 -> 0, 2 -> 1 (remapped to 0-based)
        >>> new_x = [x[0], x[1], x[3], x[4]]
        >>> new_hyperedge_attr = [hyperedge_attr[0], hyperedge_attr[2]]

    Args:
        hdata: The original :class:`HData` containing the full hypergraph.
        split_hyperedge_ids: Tensor of hyperedge IDs to include in this split.

    Returns:
        The splitted instance with remapped node and hyperedge IDs.
    """
    # Keep only the incidences whose hyperedge belongs to the split, e.g.
    # split_hyperedge_ids = [0, 2] over the docstring example drops the
    # single incidence of hyperedge 1.
    selected = torch.isin(hdata.hyperedge_index[1], split_hyperedge_ids)
    sub_index = hdata.hyperedge_index[:, selected]

    # IDs that survive the selection; both sets are rebased to 0.. below.
    kept_node_ids = sub_index[0].unique()
    kept_hyperedge_ids = sub_index[1].unique()

    rebased = HyperedgeIndex(sub_index).to_0based(
        node_ids_to_rebase=kept_node_ids,
        hyperedge_ids_to_rebase=kept_hyperedge_ids,
    )

    # Subset the per-hyperedge attributes to the surviving hyperedges.
    sub_attr = None
    if hdata.hyperedge_attr is not None:
        sub_attr = hdata.hyperedge_attr[kept_hyperedge_ids]

    return cls(
        x=hdata.x[kept_node_ids],
        hyperedge_index=rebased.item,
        hyperedge_attr=sub_attr,
        num_nodes=len(kept_node_ids),
        num_hyperedges=len(kept_hyperedge_ids),
        y=hdata.y[kept_hyperedge_ids],
    )

enrich_node_features(enricher, enrichment_mode=None)

Enrich node features using the provided node feature enricher.

Parameters:

Name Type Description Default
enricher NodeFeatureEnricher

An instance of NodeFeatureEnricher to generate structural node features from hypergraph topology.

required
enrichment_mode Optional[EnrichmentMode]

How to combine generated features with existing hdata.x. concatenate appends new features as additional columns. replace substitutes hdata.x entirely.

None
Source code in hyperbench/types/hdata.py
def enrich_node_features(
    self,
    enricher: NodeFeatureEnricher,
    enrichment_mode: Optional[EnrichmentMode] = None,
) -> None:
    """
    Augment ``self.x`` with structural features derived from the topology.

    Args:
        enricher: A NodeFeatureEnricher that produces node features from
            ``self.hyperedge_index``.
        enrichment_mode: ``concatenate`` appends the generated features as
            extra columns of ``self.x``; any other value (including the
            default ``None``) replaces ``self.x`` entirely.
    """
    generated = enricher.enrich(self.hyperedge_index)

    if enrichment_mode == "concatenate":
        self.x = torch.cat([self.x, generated], dim=1)
    else:
        self.x = generated

get_device_if_all_consistent()

Check that all tensors are on the same device and return that device. If the tensors are on different devices, a ValueError is raised; if no device can be determined, CPU is returned.

Returns:

Type Description
device

The common device if all tensors are on the same device, otherwise CPU.

Raises:

Type Description
ValueError

If tensors are on different devices.

Source code in hyperbench/types/hdata.py
def get_device_if_all_consistent(self) -> torch.device:
    """
    Return the device shared by all tensors of this instance.

    Returns:
        The common device of ``x``, ``hyperedge_index``, ``y`` and — when
        present — ``hyperedge_attr``; CPU as a fallback when no device was
        collected.

    Raises:
        ValueError: If the tensors live on more than one device.
    """
    tensors = [self.x, self.hyperedge_index, self.y]
    if self.hyperedge_attr is not None:
        tensors.append(self.hyperedge_attr)

    devices = {tensor.device for tensor in tensors}
    if len(devices) > 1:
        raise ValueError(f"Inconsistent device placement: {devices}")

    if len(devices) == 1:
        return devices.pop()
    return torch.device("cpu")

shuffle(seed=None)

Return a new :class:HData instance with hyperedge IDs randomly reassigned.

Each hyperedge keeps its original set of nodes, but is assigned a new ID via a random permutation. y and hyperedge_attr are reordered to match, so that y[new_id] still corresponds to the correct hyperedge. Same for hyperedge_attr[new_id] if hyperedge attributes are present.

Examples:

>>> hyperedge_index = torch.tensor([[0, 1, 2, 3], [0, 0, 1, 1]])
>>> y  = torch.tensor([1, 0])
>>> hdata = HData(x, hyperedge_index=hyperedge_index, y=y)
>>> shuffled_hdata = hdata.shuffle(seed=42)
>>> shuffled_hdata.hyperedge_index  # hyperedges may be reassigned
... # e.g.,
...     [[0, 1, 2, 3],
...      [1, 1, 0, 0]]
>>> shuffled_hdata.y  # labels are permuted to match new hyperedge IDs, e.g., [0, 1]

Parameters:

Name Type Description Default
seed Optional[int]

Optional random seed for reproducibility. If None, the shuffle will be non-deterministic.

None

Returns:

Type Description
HData

A new :class:HData instance with hyperedge IDs, y, and hyperedge_attr permuted.

Source code in hyperbench/types/hdata.py
def shuffle(self, seed: Optional[int] = None) -> "HData":
    """
    Return a copy of this instance with hyperedge IDs randomly permuted.

    Every hyperedge keeps its node set but receives a new ID, and ``y``
    (plus ``hyperedge_attr`` when present) is reordered so that the entry
    at ``new_id`` still describes the same hyperedge.

    Examples:
        >>> hyperedge_index = torch.tensor([[0, 1, 2, 3], [0, 0, 1, 1]])
        >>> y  = torch.tensor([1, 0])
        >>> hdata = HData(x, hyperedge_index=hyperedge_index, y=y)
        >>> shuffled_hdata = hdata.shuffle(seed=42)
        >>> shuffled_hdata.hyperedge_index  # hyperedges may be reassigned
        ... # e.g.,
        ...     [[0, 1, 2, 3],
        ...      [1, 1, 0, 0]]
        >>> shuffled_hdata.y  # labels are permuted to match new hyperedge IDs, e.g., [0, 1]

    Args:
        seed: Optional random seed for reproducibility. If ``None``, the
            shuffle will be non-deterministic.

    Returns:
        A new :class:`HData` with permuted hyperedge IDs, ``y`` and
        ``hyperedge_attr``.
    """
    rng = torch.Generator(device=self.device)
    if seed is not None:
        rng.manual_seed(seed)

    # permutation[new_id] = old_id -> indexing with it gathers old
    # per-hyperedge data into the new ordering.
    permutation = torch.randperm(self.num_hyperedges, generator=rng, device=self.device)

    # inverse[old_id] = new_id -> used to rewrite hyperedge IDs inside the
    # incidences. E.g. permutation [1, 2, 0] yields inverse [2, 0, 1].
    inverse = torch.empty_like(permutation)
    inverse[permutation] = torch.arange(self.num_hyperedges, device=self.device)

    # Rewrite the hyperedge row of the incidence list; node row is untouched.
    shuffled_index = self.hyperedge_index.clone()
    shuffled_index[1] = inverse[self.hyperedge_index[1]]

    # Gather attributes / labels into the new hyperedge order.
    shuffled_attr = None
    if self.hyperedge_attr is not None:
        shuffled_attr = self.hyperedge_attr[permutation]

    return HData(
        x=self.x,
        hyperedge_index=shuffled_index,
        hyperedge_attr=shuffled_attr,
        num_nodes=self.num_nodes,
        num_hyperedges=self.num_hyperedges,
        y=self.y[permutation],
    )

to(device, non_blocking=False)

Move all tensors to the specified device.

Parameters:

Name Type Description Default
device device | str

The target device (e.g., 'cpu', 'cuda:0').

required
non_blocking bool

If True and the source and destination devices are both CUDA, the copy will be non-blocking.

False

Returns:

Name Type Description
The HData

class:HData instance with all tensors moved to the specified device.

Source code in hyperbench/types/hdata.py
def to(self, device: torch.device | str, non_blocking: bool = False) -> "HData":
    """
    Move every tensor of this instance to ``device`` (in place).

    Args:
        device: The target device (e.g., 'cpu', 'cuda:0').
        non_blocking: Forwarded to ``Tensor.to``; when source and target are
            both CUDA, the copy may then be asynchronous.

    Returns:
        This :class:`HData` instance, enabling call chaining.
    """
    for name in ("x", "hyperedge_index", "y"):
        moved = getattr(self, name).to(device=device, non_blocking=non_blocking)
        setattr(self, name, moved)

    if self.hyperedge_attr is not None:
        self.hyperedge_attr = self.hyperedge_attr.to(device=device, non_blocking=non_blocking)

    # Normalize a string spec into a torch.device before storing it.
    self.device = device if isinstance(device, torch.device) else torch.device(device)
    return self

with_y_to(value)

Return a copy of this instance with a y attribute set to the given value.

Parameters:

Name Type Description Default
value float

The value to set for all entries in the y attribute.

required

Returns:

Type Description
HData

A new :class:HData instance with the same attributes except for y, which is set to a tensor of the given value.

Source code in hyperbench/types/hdata.py
def with_y_to(self, value: float) -> "HData":
    """
    Build a copy of this instance whose ``y`` is filled with ``value``.

    Args:
        value: The label assigned to every hyperedge of the copy.

    Returns:
        A new :class:`HData` identical to this one except for ``y``, which is
        a float tensor of shape ``[num_hyperedges]`` filled with ``value``.
    """
    filled_y = torch.full(
        (self.num_hyperedges,), value, dtype=torch.float, device=self.device
    )
    return HData(
        x=self.x,
        hyperedge_index=self.hyperedge_index,
        hyperedge_attr=self.hyperedge_attr,
        num_nodes=self.num_nodes,
        num_hyperedges=self.num_hyperedges,
        y=filled_y,
    )

with_y_ones()

Return a copy of this instance with a y attribute of all ones.

Source code in hyperbench/types/hdata.py
def with_y_ones(self) -> "HData":
    """Shortcut for :meth:`with_y_to` setting every label to ``1.0``."""
    return self.with_y_to(1.0)

with_y_zeros()

Return a copy of this instance with a y attribute of all zeros.

Source code in hyperbench/types/hdata.py
def with_y_zeros(self) -> "HData":
    """Shortcut for :meth:`with_y_to` setting every label to ``0.0``."""
    return self.with_y_to(0.0)

stats()

Compute statistics for the hypergraph data. The fields returned in the dictionary include: - shape_x: The shape of the node feature matrix x. - shape_hyperedge_attr: The shape of the hyperedge attribute matrix, or None if hyperedge attributes are not present. - num_nodes: The number of nodes in the hypergraph. - num_hyperedges: The number of hyperedges in the hypergraph. - avg_degree_node_raw: The average degree of nodes, calculated as the mean number of hyperedges each node belongs to. - avg_degree_node: The floored node average degree. - avg_degree_hyperedge_raw: The average size of hyperedges, calculated as the mean number of nodes each hyperedge contains. - avg_degree_hyperedge: The floored hyperedge average size. - node_degree_max: The maximum degree of any node in the hypergraph. - hyperedge_degree_max: The maximum size of any hyperedge in the hypergraph. - node_degree_median: The median degree of nodes in the hypergraph. - hyperedge_degree_median: The median size of hyperedges in the hypergraph. - distribution_node_degree: A list where the value at index i is the degree of node i. - distribution_hyperedge_size: A list where the value at index i is the size of hyperedge i. - distribution_node_degree_hist: A dictionary where the keys are node degrees and the values are the count of nodes with that degree. - distribution_hyperedge_size_hist: A dictionary where the keys are hyperedge sizes and the values are the count of hyperedges with that size.

Returns:

Type Description
Dict[str, Any]

A dictionary containing various statistics about the hypergraph.

Source code in hyperbench/types/hdata.py
def stats(self) -> Dict[str, Any]:
    """
    Compute statistics for the hypergraph data.
    The fields returned in the dictionary include:
    - ``shape_x``: The shape of the node feature matrix ``x``.
    - ``shape_hyperedge_attr``: The shape of the hyperedge attribute matrix, or ``None`` if hyperedge attributes are not present.
    - ``num_nodes``: The number of nodes in the hypergraph.
    - ``num_hyperedges``: The number of hyperedges in the hypergraph.
    - ``avg_degree_node_raw``: The average degree of nodes, calculated as the mean number of hyperedges each node belongs to.
    - ``avg_degree_node``: The floored node average degree.
    - ``avg_degree_hyperedge_raw``: The average size of hyperedges, calculated as the mean number of nodes each hyperedge contains.
    - ``avg_degree_hyperedge``: The floored hyperedge average size.
    - ``node_degree_max``: The maximum degree of any node in the hypergraph.
    - ``hyperedge_degree_max``: The maximum size of any hyperedge in the hypergraph.
    - ``node_degree_median``: The median degree of nodes in the hypergraph.
    - ``hyperedge_degree_median``: The median size of hyperedges in the hypergraph.
    - ``distribution_node_degree``: A list where the value at index ``i`` is the degree of node ``i``.
    - ``distribution_hyperedge_size``: A list where the value at index ``i`` is the size of hyperedge ``i``.
    - ``distribution_node_degree_hist``: A dictionary where the keys are node degrees and the values are the count of nodes with that degree.
    - ``distribution_hyperedge_size_hist``: A dictionary where the keys are hyperedge sizes and the values are the count of hyperedges with that size.

    Returns:
        A dictionary containing various statistics about the hypergraph.
    """

    node_ids = self.hyperedge_index[0]
    hyperedge_ids = self.hyperedge_index[1]

    # Degree of node i = number of incidences mentioning node i;
    # size of hyperedge j = number of incidences mentioning hyperedge j.
    if node_ids.numel() > 0:
        distribution_node_degree = torch.bincount(node_ids, minlength=self.num_nodes).float()
        distribution_hyperedge_size = torch.bincount(
            hyperedge_ids, minlength=self.num_hyperedges
        ).float()
    else:
        distribution_node_degree = torch.zeros(self.num_nodes, dtype=torch.float)
        distribution_hyperedge_size = torch.zeros(self.num_hyperedges, dtype=torch.float)

    def _summary(distribution):
        """Return (raw mean, floored mean, max, median) of a distribution; zeros when it is empty."""
        # Guarding each distribution separately avoids calling .mean()/.max()
        # on an empty tensor (nan / RuntimeError) when only one of
        # num_nodes / num_hyperedges is zero.
        if distribution.numel() == 0:
            return 0, 0, 0, 0
        raw_mean = distribution.mean().item()
        return (
            raw_mean,
            int(raw_mean),
            int(distribution.max().item()),
            int(distribution.median().item()),
        )

    (
        avg_degree_node_raw,
        avg_degree_node,
        node_degree_max,
        node_degree_median,
    ) = _summary(distribution_node_degree)
    (
        avg_degree_hyperedge_raw,
        avg_degree_hyperedge,
        hyperedge_degree_max,
        hyperedge_degree_median,
    ) = _summary(distribution_hyperedge_size)

    # Histograms: key d holds the count of nodes/hyperedges with degree/size d.
    distribution_node_degree_hist = {
        i: int(count.item())
        for i, count in enumerate(torch.bincount(distribution_node_degree.long()))
        if count.item() > 0
    }
    distribution_hyperedge_size_hist = {
        i: int(count.item())
        for i, count in enumerate(torch.bincount(distribution_hyperedge_size.long()))
        if count.item() > 0
    }

    return {
        "shape_x": self.x.shape,
        "shape_hyperedge_attr": self.hyperedge_attr.shape
        if self.hyperedge_attr is not None
        else None,
        "num_nodes": self.num_nodes,
        "num_hyperedges": self.num_hyperedges,
        "avg_degree_node_raw": avg_degree_node_raw,
        "avg_degree_node": avg_degree_node,
        "avg_degree_hyperedge_raw": avg_degree_hyperedge_raw,
        "avg_degree_hyperedge": avg_degree_hyperedge,
        "node_degree_max": node_degree_max,
        "hyperedge_degree_max": hyperedge_degree_max,
        "node_degree_median": node_degree_median,
        "hyperedge_degree_median": hyperedge_degree_median,
        "distribution_node_degree": distribution_node_degree.int().tolist(),
        "distribution_hyperedge_size": distribution_hyperedge_size.int().tolist(),
        "distribution_node_degree_hist": distribution_node_degree_hist,
        "distribution_hyperedge_size_hist": distribution_hyperedge_size_hist,
    }

Hypergraph

hyperbench.types.hypergraph

HIFHypergraph

A hypergraph data structure that supports directed/undirected hyperedges with incidence-based representation.

Parameters:

Name Type Description Default
network_type Optional[Literal['asc', 'directed', 'undirected']]

The type of hypergraph, which can be "asc" (or "directed") for directed hyperedges, or "undirected" for undirected hyperedges.

None
metadata Optional[Dict[str, Any]]

Optional dictionary of metadata about the hypergraph.

None
incidences Optional[List[Dict[str, Any]]]

A list of incidences, where each incidence is a dictionary with keys "node" and "edge" representing the relationship between a node and a hyperedge.

None
nodes Optional[List[Dict[str, Any]]]

A list of node dictionaries, where each dictionary contains information about a node (e.g., id, features).

None
edges Optional[List[Dict[str, Any]]]

A list of edge dictionaries, where each dictionary contains information about a hyperedge (e.g., id, features).

None
Source code in hyperbench/types/hypergraph.py
class HIFHypergraph:
    """
    A hypergraph data structure that supports directed/undirected hyperedges
    with incidence-based representation.

    Args:
        network_type: The type of hypergraph, which can be "asc" (or "directed") for directed hyperedges, or "undirected" for undirected hyperedges.
        metadata: Optional dictionary of metadata about the hypergraph.
        incidences: A list of incidences, where each incidence is a dictionary with keys "node" and "edge" representing the relationship between a node and a hyperedge.
        nodes: A list of node dictionaries, where each dictionary contains information about a node (e.g., id, features).
        edges: A list of edge dictionaries, where each dictionary contains information about a hyperedge (e.g., id, features).
    """

    def __init__(
        self,
        network_type: Optional[Literal["asc", "directed", "undirected"]] = None,
        metadata: Optional[Dict[str, Any]] = None,
        incidences: Optional[List[Dict[str, Any]]] = None,
        nodes: Optional[List[Dict[str, Any]]] = None,
        edges: Optional[List[Dict[str, Any]]] = None,
    ):
        # None defaults are normalized to fresh containers so instances never
        # share mutable state.
        self.network_type = network_type
        self.metadata = metadata if metadata is not None else {}
        self.incidences = incidences if incidences is not None else []
        self.nodes = nodes if nodes is not None else []
        self.edges = edges if edges is not None else []

    @classmethod
    def empty(cls) -> "HIFHypergraph":
        """Return an undirected hypergraph with no nodes, edges, or incidences."""
        return cls(
            network_type="undirected",
            nodes=[],
            edges=[],
            incidences=[],
            metadata=None,
        )

    @classmethod
    def from_hif(cls, data: Dict[str, Any]) -> "HIFHypergraph":
        """
        Create a Hypergraph from a HIF (Hypergraph Interchange Format).

        Args:
            data: Dictionary with keys: network-type, metadata, incidences, nodes, edges

        Returns:
            Hypergraph instance
        """
        # HIF spells the key "network-type"; tolerate the underscore variant too.
        network_type = data.get("network-type") or data.get("network_type")
        metadata = data.get("metadata", {})
        incidences = data.get("incidences", [])
        nodes = data.get("nodes", [])
        edges = data.get("edges", [])

        return cls(
            network_type=network_type,
            metadata=metadata,
            incidences=incidences,
            nodes=nodes,
            edges=edges,
        )

    @property
    def num_nodes(self) -> int:
        """Return the number of nodes in the hypergraph."""
        return len(self.nodes)

    @property
    def num_edges(self) -> int:
        """Return the number of edges in the hypergraph."""
        return len(self.edges)

    def stats(self) -> Dict[str, Any]:
        """
        Compute statistics for the HIFhypergraph.
        The fields returned in the dictionary include:
        - ``num_nodes``: The number of nodes in the hypergraph.
        - ``num_hyperedges``: The number of hyperedges in the hypergraph.
        - ``avg_degree_node_raw``: The average degree of nodes, calculated as the mean number of hyperedges each node belongs to.
        - ``avg_degree_node``: The floored node average degree.
        - ``avg_degree_hyperedge_raw``: The average size of hyperedges, calculated as the mean number of nodes each hyperedge contains.
        - ``avg_degree_hyperedge``: The floored hyperedge average size.
        - ``node_degree_max``: The maximum degree of any node in the hypergraph.
        - ``hyperedge_degree_max``: The maximum size of any hyperedge in the hypergraph.
        - ``node_degree_median``: The median degree of nodes in the hypergraph.
        - ``hyperedge_degree_median``: The median size of hyperedges in the hypergraph.
        - ``distribution_node_degree``: A sorted list of node degrees, one entry per node that appears in at least one incidence.
        - ``distribution_hyperedge_size``: A sorted list of hyperedge sizes, one entry per hyperedge that appears in at least one incidence.
        - ``distribution_node_degree_hist``: A dictionary where the keys are node degrees and the values are the count of nodes with that degree.
        - ``distribution_hyperedge_size_hist``: A dictionary where the keys are hyperedge sizes and the values are the count of hyperedges with that size.

        Returns:
            A dictionary containing various statistics about the hypergraph.
        """

        # Count incidences per node / per hyperedge.
        node_degree: Dict[Any, int] = {}
        hyperedge_size: Dict[Any, int] = {}

        for incidence in self.incidences:
            node_id = incidence.get("node")
            edge_id = incidence.get("edge")
            node_degree[node_id] = node_degree.get(node_id, 0) + 1
            hyperedge_size[edge_id] = hyperedge_size.get(edge_id, 0) + 1

        num_nodes = len(self.nodes)
        num_hyperedges = len(self.edges)
        total_incidences = len(self.incidences)

        distribution_node_degree: List[int] = sorted(node_degree.values())
        distribution_hyperedge_size: List[int] = sorted(hyperedge_size.values())

        # Averages are taken over ALL declared nodes/edges (zero-degree ones
        # included); medians below are over incident ones only.
        # NOTE(review): that asymmetry mirrors the historical behavior — confirm it is intended.
        avg_degree_node_raw = total_incidences / num_nodes if num_nodes else 0
        avg_degree_node = int(avg_degree_node_raw)
        avg_degree_hyperedge_raw = total_incidences / num_hyperedges if num_hyperedges else 0
        avg_degree_hyperedge = int(avg_degree_hyperedge_raw)

        node_degree_max = max(distribution_node_degree) if distribution_node_degree else 0
        hyperedge_degree_max = (
            max(distribution_hyperedge_size) if distribution_hyperedge_size else 0
        )

        def _median(sorted_values: List[int]):
            """Median of an already-sorted list; 0 when the list is empty."""
            n = len(sorted_values)
            if not n:
                return 0
            mid = n // 2
            if n % 2:
                return sorted_values[mid]
            return (sorted_values[mid - 1] + sorted_values[mid]) / 2

        node_degree_median = _median(distribution_node_degree)
        hyperedge_degree_median = _median(distribution_hyperedge_size)

        # Histograms: key d -> count of nodes/hyperedges with degree/size d.
        distribution_node_degree_hist: Dict[int, int] = {}
        for d in distribution_node_degree:
            distribution_node_degree_hist[d] = distribution_node_degree_hist.get(d, 0) + 1

        distribution_hyperedge_size_hist: Dict[int, int] = {}
        for s in distribution_hyperedge_size:
            distribution_hyperedge_size_hist[s] = distribution_hyperedge_size_hist.get(s, 0) + 1

        return {
            "num_nodes": num_nodes,
            "num_hyperedges": num_hyperedges,
            "avg_degree_node_raw": avg_degree_node_raw,
            "avg_degree_node": avg_degree_node,
            "avg_degree_hyperedge_raw": avg_degree_hyperedge_raw,
            "avg_degree_hyperedge": avg_degree_hyperedge,
            "node_degree_max": node_degree_max,
            "hyperedge_degree_max": hyperedge_degree_max,
            "node_degree_median": node_degree_median,
            "hyperedge_degree_median": hyperedge_degree_median,
            "distribution_node_degree": distribution_node_degree,
            "distribution_hyperedge_size": distribution_hyperedge_size,
            "distribution_node_degree_hist": distribution_node_degree_hist,
            "distribution_hyperedge_size_hist": distribution_hyperedge_size_hist,
        }

num_nodes property

Return the number of nodes in the hypergraph.

num_edges property

Return the number of edges in the hypergraph.

from_hif(data) classmethod

Create a Hypergraph from a HIF (Hypergraph Interchange Format).

Parameters:

Name Type Description Default
data Dict[str, Any]

Dictionary with keys: network-type, metadata, incidences, nodes, edges

required

Returns:

Type Description
HIFHypergraph

Hypergraph instance

Source code in hyperbench/types/hypergraph.py
@classmethod
def from_hif(cls, data: Dict[str, Any]) -> "HIFHypergraph":
    """
    Build a Hypergraph from a HIF (Hypergraph Interchange Format) dictionary.

    Args:
        data: Dictionary with keys: network-type, metadata, incidences, nodes, edges

    Returns:
        Hypergraph instance
    """
    # HIF uses the hyphenated key; fall back to a snake_case spelling.
    kind = data.get("network-type") or data.get("network_type")

    return cls(
        network_type=kind,
        metadata=data.get("metadata", {}),
        incidences=data.get("incidences", []),
        nodes=data.get("nodes", []),
        edges=data.get("edges", []),
    )

stats()

Compute statistics for the HIF hypergraph. The fields returned in the dictionary include: - num_nodes: The number of nodes in the hypergraph. - num_hyperedges: The number of hyperedges in the hypergraph. - avg_degree_node_raw: The average degree of nodes, calculated as the mean number of hyperedges each node belongs to. - avg_degree_node: The floored node average degree. - avg_degree_hyperedge_raw: The average size of hyperedges, calculated as the mean number of nodes each hyperedge contains. - avg_degree_hyperedge: The floored hyperedge average size. - node_degree_max: The maximum degree of any node in the hypergraph. - hyperedge_degree_max: The maximum size of any hyperedge in the hypergraph. - node_degree_median: The median degree of nodes in the hypergraph. - hyperedge_degree_median: The median size of hyperedges in the hypergraph. - distribution_node_degree: A sorted list of the degrees of all nodes. - distribution_hyperedge_size: A sorted list of the sizes of all hyperedges. - distribution_node_degree_hist: A dictionary where the keys are node degrees and the values are the count of nodes with that degree. - distribution_hyperedge_size_hist: A dictionary where the keys are hyperedge sizes and the values are the count of hyperedges with that size.

Returns:

Type Description
Dict[str, Any]

A dictionary containing various statistics about the hypergraph.

Source code in hyperbench/types/hypergraph.py
def stats(self) -> Dict[str, Any]:
    """
    Compute statistics for the HIF hypergraph.
    The fields returned in the dictionary include:
    - ``num_nodes``: The number of nodes in the hypergraph.
    - ``num_hyperedges``: The number of hyperedges in the hypergraph.
    - ``avg_degree_node_raw``: The average degree of nodes, calculated as the mean number of hyperedges each node belongs to.
    - ``avg_degree_node``: The floored node average degree.
    - ``avg_degree_hyperedge_raw``: The average size of hyperedges, calculated as the mean number of nodes each hyperedge contains.
    - ``avg_degree_hyperedge``: The floored hyperedge average size.
    - ``node_degree_max``: The maximum degree of any node in the hypergraph.
    - ``hyperedge_degree_max``: The maximum size of any hyperedge in the hypergraph.
    - ``node_degree_median``: The median degree of nodes in the hypergraph.
    - ``hyperedge_degree_median``: The median size of hyperedges in the hypergraph.
    - ``distribution_node_degree``: A sorted list of the degrees of all nodes.
    - ``distribution_hyperedge_size``: A sorted list of the sizes of all hyperedges.
    - ``distribution_node_degree_hist``: A dictionary where the keys are node degrees and the values are the count of nodes with that degree.
    - ``distribution_hyperedge_size_hist``: A dictionary where the keys are hyperedge sizes and the values are the count of hyperedges with that size.

    Returns:
        A dictionary containing various statistics about the hypergraph.
    """
    # Stdlib Counter/median replace the hand-rolled tallies and the
    # index-arithmetic median; results are identical (the median of an
    # even-length list is the mean of the two middle values, as before).
    from collections import Counter
    from statistics import median

    node_degree: Dict[Any, int] = Counter()
    hyperedge_size: Dict[Any, int] = Counter()

    for incidence in self.incidences:
        node_degree[incidence.get("node")] += 1
        hyperedge_size[incidence.get("edge")] += 1

    # Counts come from the declared node/edge lists (not the incidences),
    # so isolated nodes and empty hyperedges are included.
    num_nodes = len(self.nodes)
    num_hyperedges = len(self.edges)
    total_incidences = len(self.incidences)

    distribution_node_degree: List[int] = sorted(node_degree.values())
    distribution_hyperedge_size: List[int] = sorted(hyperedge_size.values())

    avg_degree_node_raw = total_incidences / num_nodes if num_nodes else 0
    avg_degree_node = int(avg_degree_node_raw)
    avg_degree_hyperedge_raw = total_incidences / num_hyperedges if num_hyperedges else 0
    avg_degree_hyperedge = int(avg_degree_hyperedge_raw)

    node_degree_max = max(distribution_node_degree, default=0)
    hyperedge_degree_max = max(distribution_hyperedge_size, default=0)

    node_degree_median = median(distribution_node_degree) if distribution_node_degree else 0
    hyperedge_degree_median = (
        median(distribution_hyperedge_size) if distribution_hyperedge_size else 0
    )

    return {
        "num_nodes": num_nodes,
        "num_hyperedges": num_hyperedges,
        "avg_degree_node_raw": avg_degree_node_raw,
        "avg_degree_node": avg_degree_node,
        "avg_degree_hyperedge_raw": avg_degree_hyperedge_raw,
        "avg_degree_hyperedge": avg_degree_hyperedge,
        "node_degree_max": node_degree_max,
        "hyperedge_degree_max": hyperedge_degree_max,
        "node_degree_median": node_degree_median,
        "hyperedge_degree_median": hyperedge_degree_median,
        "distribution_node_degree": distribution_node_degree,
        "distribution_hyperedge_size": distribution_hyperedge_size,
        "distribution_node_degree_hist": dict(Counter(distribution_node_degree)),
        "distribution_hyperedge_size_hist": dict(Counter(distribution_hyperedge_size)),
    }

Hypergraph

A simple hypergraph data structure using edge list representation.

Parameters:

Name Type Description Default
hyperedges List[List[int]]

A list of hyperedges, where each hyperedge is represented as a list of node IDs.

required
Source code in hyperbench/types/hypergraph.py
class Hypergraph:
    """
    A simple hypergraph data structure using edge list representation.

    Args:
        hyperedges: A list of hyperedges, where each hyperedge is represented as a list of node IDs.
    """

    def __init__(self, hyperedges: List[List[int]]):
        self.hyperedges = hyperedges

    @property
    def num_nodes(self) -> int:
        """Return the number of nodes in the hypergraph."""
        nodes = set()
        for edge in self.hyperedges:
            nodes.update(edge)
        return len(nodes)

    @property
    def num_hyperedges(self) -> int:
        """Return the number of hyperedges in the hypergraph."""
        return len(self.hyperedges)

    def neighbors_of(self, node: int) -> Neighborhood:
        """
        Return the set of nodes that share at least one hyperedge with node.

        A node u is a neighbor of v if there exists a hyperedge e such that
        both u and v are in e. The node itself is excluded from the result.

        Args:
            node: The node ID to find neighbors for.

        Returns:
            A set of neighbor node IDs (excluding the node itself).
        """
        neighbors: Neighborhood = set()
        for hyperedge in self.hyperedges:
            if node in hyperedge:
                neighbors.update(hyperedge)

        neighbors.discard(node)
        return neighbors

    def neighbors_of_all(self) -> Dict[int, Neighborhood]:
        """
        Build a mapping from every node to its neighbors.

        Computes all neighborhoods in a single pass over the hyperedges
        (each hyperedge contributes its members to every member's
        neighborhood), instead of re-scanning the full edge list once per
        node, which would be O(|V| * |E|).

        Returns:
            A dictionary mapping each node ID to its set of neighbors.
        """
        node_to_neighbors: Dict[int, Neighborhood] = {}
        for hyperedge in self.hyperedges:
            members = set(hyperedge)
            for node in members:
                node_to_neighbors.setdefault(node, set()).update(members)

        # A node's neighborhood excludes the node itself.
        for node, neighbors in node_to_neighbors.items():
            neighbors.discard(node)

        return node_to_neighbors

    def stats(self) -> Dict[str, Any]:
        """Return basic statistics about the hypergraph."""
        node_degree: Dict[int, int] = {}
        distribution_hyperedge_size: List[int] = []
        total_incidences = 0

        for hyperedge in self.hyperedges:
            size = len(hyperedge)
            distribution_hyperedge_size.append(size)
            total_incidences += size
            for node in hyperedge:
                node_degree[node] = node_degree.get(node, 0) + 1

        num_nodes = len(node_degree)
        num_hyperedges = len(self.hyperedges)
        distribution_node_degree: List[int] = sorted(node_degree.values())

        avg_degree_hyperedge = total_incidences / num_hyperedges if num_hyperedges else 0
        total_incidences_nodes = sum(distribution_node_degree)
        avg_degree_node = total_incidences_nodes / num_nodes if num_nodes else 0

        hyperedge_degree_max = (
            max(distribution_hyperedge_size) if distribution_hyperedge_size else 0
        )
        node_degree_max = max(distribution_node_degree) if distribution_node_degree else 0

        sorted_hyperedge_sizes = sorted(distribution_hyperedge_size)
        n_e = len(sorted_hyperedge_sizes)
        hyperedge_degree_median = (
            (
                sorted_hyperedge_sizes[n_e // 2]
                if n_e % 2
                else (sorted_hyperedge_sizes[n_e // 2 - 1] + sorted_hyperedge_sizes[n_e // 2]) / 2
            )
            if n_e
            else 0
        )

        n_n = len(distribution_node_degree)
        node_degree_median = (
            (
                distribution_node_degree[n_n // 2]
                if n_n % 2
                else (distribution_node_degree[n_n // 2 - 1] + distribution_node_degree[n_n // 2])
                / 2
            )
            if n_n
            else 0
        )

        distribution_hyperedge_size_hist: Dict[int, int] = {}
        for s in distribution_hyperedge_size:
            distribution_hyperedge_size_hist[s] = distribution_hyperedge_size_hist.get(s, 0) + 1

        distribution_node_degree_hist: Dict[int, int] = {}
        for d in distribution_node_degree:
            distribution_node_degree_hist[d] = distribution_node_degree_hist.get(d, 0) + 1

        return {
            "num_nodes": num_nodes,
            "num_hyperedges": num_hyperedges,
            "avg_degree_node": avg_degree_node,
            "avg_degree_hyperedge": avg_degree_hyperedge,
            "node_degree_max": node_degree_max,
            "hyperedge_degree_max": hyperedge_degree_max,
            "node_degree_median": node_degree_median,
            "hyperedge_degree_median": hyperedge_degree_median,
            "distribution_node_degree": distribution_node_degree,
            "distribution_hyperedge_size": distribution_hyperedge_size,
            "distribution_node_degree_hist": distribution_node_degree_hist,
            "distribution_hyperedge_size_hist": distribution_hyperedge_size_hist,
        }

    @classmethod
    def from_hyperedge_index(cls, hyperedge_index: Tensor) -> "Hypergraph":
        """
        Create a Hypergraph from a hyperedge index representation.

        Args:
            hyperedge_index: Tensor of shape (2, |E|) representing hyperedges, where each column is (node, hyperedge).

        Returns:
            Hypergraph instance
        """
        if hyperedge_index.size(1) < 1:
            return cls(hyperedges=[])

        unique_hyperedge_ids = hyperedge_index[1].unique()
        hyperedges = [
            hyperedge_index[0, hyperedge_index[1] == hyperedge_id].tolist()
            for hyperedge_id in unique_hyperedge_ids
        ]

        return cls(hyperedges=hyperedges)

num_nodes property

Return the number of nodes in the hypergraph.

num_hyperedges property

Return the number of hyperedges in the hypergraph.

neighbors_of(node)

Return the set of nodes that share at least one hyperedge with node.

A node u is a neighbor of v if there exists a hyperedge e such that both u and v are in e. The node itself is excluded from the result.

Parameters:

Name Type Description Default
node int

The node ID to find neighbors for.

required

Returns:

Type Description
Neighborhood

A set of neighbor node IDs (excluding the node itself).

Source code in hyperbench/types/hypergraph.py
def neighbors_of(self, node: int) -> Neighborhood:
    """
    Return the set of nodes that share at least one hyperedge with node.

    A node u is a neighbor of v if there exists a hyperedge e such that
    both u and v are in e. The node itself is excluded from the result.

    Args:
        node: The node ID to find neighbors for.

    Returns:
        A set of neighbor node IDs (excluding the node itself).
    """
    # Union every hyperedge that contains ``node``, then drop the node itself.
    incident_edges = (edge for edge in self.hyperedges if node in edge)
    result: Neighborhood = set().union(*incident_edges)
    result.discard(node)
    return result

neighbors_of_all()

Build a mapping from every node to its neighbors.

This precomputes neighbors_of for all nodes at once, which is more efficient when scoring many candidate hyperedges.

Returns:

Type Description
Dict[int, Neighborhood]

A dictionary mapping each node ID to its set of neighbors.

Source code in hyperbench/types/hypergraph.py
def neighbors_of_all(self) -> Dict[int, Neighborhood]:
    """
    Build a mapping from every node to its neighbors.

    Computes all neighborhoods in a single pass over the hyperedges (each
    hyperedge contributes its members to every member's neighborhood),
    instead of calling ``neighbors_of`` once per node — that formulation
    re-scans the entire edge list for every node and is O(|V| * |E|).

    Returns:
        A dictionary mapping each node ID to its set of neighbors.
    """
    node_to_neighbors: Dict[int, Neighborhood] = {}
    for hyperedge in self.hyperedges:
        members = set(hyperedge)
        for node in members:
            node_to_neighbors.setdefault(node, set()).update(members)

    # A node's neighborhood excludes the node itself.
    for node, neighbors in node_to_neighbors.items():
        neighbors.discard(node)

    return node_to_neighbors

stats()

Return basic statistics about the hypergraph.

Source code in hyperbench/types/hypergraph.py
def stats(self) -> Dict[str, Any]:
    """
    Return basic statistics about the hypergraph.

    Degrees are tallied from the edge list itself, so ``num_nodes`` only
    counts nodes that appear in at least one hyperedge.
    ``distribution_hyperedge_size`` is in edge order;
    ``distribution_node_degree`` is sorted ascending.
    """
    # Stdlib Counter/median replace the hand-rolled tallies and the
    # index-arithmetic median; results are identical (the median of an
    # even-length list is the mean of the two middle values, as before).
    from collections import Counter
    from statistics import median

    node_degree: Dict[int, int] = Counter()
    distribution_hyperedge_size: List[int] = []
    total_incidences = 0

    for hyperedge in self.hyperedges:
        size = len(hyperedge)
        distribution_hyperedge_size.append(size)
        total_incidences += size
        node_degree.update(hyperedge)  # one count per occurrence, as before

    num_nodes = len(node_degree)
    num_hyperedges = len(self.hyperedges)
    distribution_node_degree: List[int] = sorted(node_degree.values())

    avg_degree_hyperedge = total_incidences / num_hyperedges if num_hyperedges else 0
    avg_degree_node = sum(distribution_node_degree) / num_nodes if num_nodes else 0

    hyperedge_degree_max = max(distribution_hyperedge_size, default=0)
    node_degree_max = max(distribution_node_degree, default=0)

    hyperedge_degree_median = (
        median(distribution_hyperedge_size) if distribution_hyperedge_size else 0
    )
    node_degree_median = median(distribution_node_degree) if distribution_node_degree else 0

    return {
        "num_nodes": num_nodes,
        "num_hyperedges": num_hyperedges,
        "avg_degree_node": avg_degree_node,
        "avg_degree_hyperedge": avg_degree_hyperedge,
        "node_degree_max": node_degree_max,
        "hyperedge_degree_max": hyperedge_degree_max,
        "node_degree_median": node_degree_median,
        "hyperedge_degree_median": hyperedge_degree_median,
        "distribution_node_degree": distribution_node_degree,
        "distribution_hyperedge_size": distribution_hyperedge_size,
        "distribution_node_degree_hist": dict(Counter(distribution_node_degree)),
        "distribution_hyperedge_size_hist": dict(Counter(distribution_hyperedge_size)),
    }

from_hyperedge_index(hyperedge_index) classmethod

Create a Hypergraph from a hyperedge index representation.

Parameters:

Name Type Description Default
hyperedge_index Tensor

Tensor of shape (2, |E|) representing hyperedges, where each column is (node, hyperedge).

required

Returns:

Type Description
Hypergraph

Hypergraph instance

Source code in hyperbench/types/hypergraph.py
@classmethod
def from_hyperedge_index(cls, hyperedge_index: Tensor) -> "Hypergraph":
    """
    Create a Hypergraph from a hyperedge index representation.

    Args:
        hyperedge_index: Tensor of shape (2, |E|) representing hyperedges, where each column is (node, hyperedge).

    Returns:
        Hypergraph instance
    """
    if hyperedge_index.size(1) < 1:
        return cls(hyperedges=[])

    # Group member nodes by hyperedge ID, preserving column order within
    # each hyperedge, and emit the groups in ascending hyperedge-ID order.
    members_by_edge = {}
    for node_id, edge_id in zip(hyperedge_index[0].tolist(), hyperedge_index[1].tolist()):
        members_by_edge.setdefault(edge_id, []).append(node_id)

    return cls(hyperedges=[members_by_edge[edge_id] for edge_id in sorted(members_by_edge)])

HyperedgeIndex

A wrapper for hyperedge index representation. Hyperedge index is a tensor of shape (2, |E|) that encodes the relationships between nodes and hyperedges. Each column in the tensor represents an incidence between a node and a hyperedge, with the first row containing node indices and the second row containing corresponding hyperedge indices.

Examples:

>>> hyperedge_index = [[0, 1, 2, 0],
...                    [0, 0, 0, 1]]

This represents two hyperedges: - Hyperedge 0 connects nodes 0, 1, and 2. - Hyperedge 1 connects node 0.

The number of nodes in this hypergraph is 3 (nodes 0, 1, and 2). The number of hyperedges is 2 (hyperedges 0 and 1).

Parameters:

Name Type Description Default
hyperedge_index Tensor

A tensor of shape (2, |E|) representing hyperedges, where each column is (node, hyperedge).

required
Source code in hyperbench/types/hypergraph.py
class HyperedgeIndex:
    """
    A wrapper for hyperedge index representation.
    Hyperedge index is a tensor of shape (2, |E|) that encodes the relationships between nodes and hyperedges.
    Each column in the tensor represents an incidence between a node and a hyperedge, with the first row containing node indices
    and the second row containing corresponding hyperedge indices.

    Examples:
        >>> hyperedge_index = [[0, 1, 2, 0],
        ...                    [0, 0, 0, 1]]

        This represents two hyperedges:
            - Hyperedge 0 connects nodes 0, 1, and 2.
            - Hyperedge 1 connects node 0.

        The number of nodes in this hypergraph is 3 (nodes 0, 1, and 2).
        The number of hyperedges is 2 (hyperedges 0 and 1).

    Args:
        hyperedge_index: A tensor of shape ``(2, |E|)`` representing hyperedges, where each column is (node, hyperedge).
    """

    def __init__(self, hyperedge_index: Tensor):
        # Name-mangled private storage; mutated in place only by
        # remove_duplicate_edges() and to_0based().
        self.__hyperedge_index = hyperedge_index

    @property
    def all_node_ids(self) -> Tensor:
        """Return the tensor of all node IDs in the hyperedge index."""
        return self.__hyperedge_index[0]

    @property
    def all_hyperedge_ids(self) -> Tensor:
        """Return the tensor of all hyperedge IDs in the hyperedge index."""
        return self.__hyperedge_index[1]

    @property
    def item(self) -> Tensor:
        """Return the hyperedge index tensor."""
        return self.__hyperedge_index

    @property
    def node_ids(self) -> Tensor:
        """Return the sorted unique node IDs from the hyperedge index."""
        return self.__hyperedge_index[0].unique(sorted=True)

    @property
    def hyperedge_ids(self) -> Tensor:
        """Return the sorted unique hyperedge IDs from the hyperedge index."""
        return self.__hyperedge_index[1].unique(sorted=True)

    @property
    def num_hyperedges(self) -> int:
        """Return the number of hyperedges in the hypergraph."""
        if self.num_incidences < 1:
            return 0

        hyperedges = self.__hyperedge_index[1]
        return len(hyperedges.unique())

    @property
    def num_nodes(self) -> int:
        """Return the number of nodes in the hypergraph."""
        if self.num_incidences < 1:
            return 0

        nodes = self.__hyperedge_index[0]
        return len(nodes.unique())

    @property
    def num_incidences(self) -> int:
        """Return the number of incidences in the hypergraph, which is the number of columns in the hyperedge index."""
        return self.__hyperedge_index.size(1)

    def nodes_in(self, hyperedge_id: int) -> List[int]:
        """Return the list of node IDs that belong to the given hyperedge."""
        return self.__hyperedge_index[0, self.__hyperedge_index[1] == hyperedge_id].tolist()

    def num_nodes_if_isolated_exist(self, num_nodes: int) -> int:
        """
        Return the number of nodes in the hypergraph, accounting for isolated nodes that may not appear in the hyperedge index.

        Args:
            num_nodes: The total number of nodes in the hypergraph, including isolated nodes.

        Returns:
            The number of nodes in the hypergraph, which is the maximum of the number of unique nodes in the hyperedge index and the provided ``num_nodes``.
        """
        return max(self.num_nodes, num_nodes)

    def reduce_to_edge_index_on_clique_expansion(
        self,
        remove_selfloops: bool = True,
    ) -> Tensor:
        """
        Construct a graph from a hypergraph via clique expansion using ``H @ H^T``, where ``H`` is the incidence matrix of the hypergraph.
        In clique expansion, each hyperedge is replaced by a clique connecting all its member nodes.

        For each hyperedge, all pairs of member nodes become edges in the resulting graph.
        This is computed efficiently using the incidence matrix: ``A = H @ H^T``, where ``H`` is
        the sparse incidence matrix of shape ``[num_nodes, num_hyperedges]`` and ``A`` is the adjacency matrix of the clique-expanded graph.

        Args:
            remove_selfloops: Whether to remove self-loops from the diagonal of ``H @ H^T``. Defaults to ``True``.

        Returns:
            The edge index of the clique-expanded graph. Size ``(2, |E'|)``.
        """
        # NOTE(review): the sparse size below uses num_nodes/num_hyperedges,
        # which count *unique* IDs — this assumes node and hyperedge IDs are
        # already 0-based and contiguous (see to_0based). Confirm callers
        # rebase first; an out-of-range ID would break the construction.
        # Build sparse incidence matrix of shape [num_nodes, num_hyperedges]
        values = torch.ones(
            size=(self.num_incidences,),
            dtype=torch.float,
            device=self.__hyperedge_index.device,
        )
        incidence_matrix = torch.sparse_coo_tensor(
            indices=torch.stack([self.all_node_ids, self.all_hyperedge_ids], dim=0),
            values=values,
            size=(self.num_nodes, self.num_hyperedges),
        ).coalesce()

        # A = H @ H^T gives adjacency with self-loops on diagonal
        # Example: For hyperedge_index = [[0, 1, 2, 0],
        #                                 [0, 0, 0, 1]]
        #                         hyperedges 0  1
        #          -> incidence_matrix H = [[1, 1], node 0
        #                                   [1, 0], node 1
        #                                   [1, 0]] node 2
        #               nodes 0  1  2
        #          -> H^T = [[1, 1, 1], hyperedge 0
        #                    [1, 0, 0]] hyperedge 1
        #                       nodes 0  1  2
        #          -> A = H @ H^T = [[2, 1, 1], node 0
        #                            [1, 1, 1], node 1
        #                            [1, 1, 1]] node 2
        #                                         nodes 0  1  2
        #          -> A (after removing self-loops) = [[0, 1, 1], node 0
        #                                              [1, 0, 1], node 1
        #                                              [1, 1, 0]] node 2
        adj_matrix = torch.sparse.mm(incidence_matrix, incidence_matrix.t()).coalesce()

        # Extract edge_index, make undirected, and deduplicate
        edge_index = EdgeIndex(adj_matrix.indices())
        if remove_selfloops:
            edge_index.remove_selfloops()
        return edge_index.to_undirected().item

    def reduce_to_edge_index_on_random_direction(
        self,
        x: Tensor,
        with_mediators: bool = False,
        remove_selfloops: bool = True,
    ) -> Tensor:
        """
        Construct a graph from a hypergraph with methods proposed in `HyperGCN: A New Method of Training Graph Convolutional Networks on Hypergraphs <https://arxiv.org/pdf/1809.02589.pdf>`_ paper.
        Reference implementation: `source <https://deephypergraph.readthedocs.io/en/latest/_modules/dhg/structure/graphs/graph.html#Graph.from_hypergraph_hypergcn>`_.

        Args:
            x: Node feature matrix. Size ``(|V|, C)``.
            with_mediators: Whether to use mediator to transform the hyperedges to edges in the graph. Defaults to ``False``.
            remove_selfloops: Whether to remove self-loops. Defaults to ``True``.

        Returns:
            The edge index. Size ``(2, |E'|)``.

        Raises:
            ValueError: If any hyperedge contains fewer than 2 nodes.
        """
        device = x.device

        hypergraph = Hypergraph.from_hyperedge_index(self.__hyperedge_index)
        hypergraph_edges: List[List[int]] = hypergraph.hyperedges
        graph_edges: List[List[int]] = []

        # Random direction (feature_dim, 1) for projecting nodes in each hyperedge
        # Geometrically, we are choosing a random line through the origin in ℝᵈ, where ᵈ = feature_dim
        random_direction = torch.rand((x.shape[1], 1), device=device)

        for edge in hypergraph_edges:
            num_nodes_in_edge = len(edge)
            if num_nodes_in_edge < 2:
                raise ValueError("The number of vertices in an hyperedge must be >= 2.")

            # projections (num_nodes_in_edge,) contains a scalar value for each node in the hyperedge,
            # indicating its projection on the random vector 'random_direction'.
            # NOTE(review): x[edge] indexes rows by node ID, so node IDs must be
            # valid row indices into x — confirm they are 0-based (see to_0based).
            # Key idea: If two points are very far apart in ℝᵈ, there is a high probability
            # that a random projection will still separate them
            projections = torch.matmul(x[edge], random_direction).squeeze()

            # Positions (within ``edge``) of the two nodes that are farthest
            # apart along 'random_direction'
            node_max_proj_idx = torch.argmax(projections)
            node_min_proj_idx = torch.argmin(projections)

            if not with_mediators:  # Just connect the two farthest nodes
                graph_edges.append([edge[node_min_proj_idx], edge[node_max_proj_idx]])
                continue

            # With mediators: connect every other node in the hyperedge to
            # both extreme nodes
            for node_idx in range(num_nodes_in_edge):
                if node_idx != node_max_proj_idx and node_idx != node_min_proj_idx:
                    graph_edges.append([edge[node_min_proj_idx], edge[node_idx]])
                    graph_edges.append([edge[node_max_proj_idx], edge[node_idx]])

        graph = Graph(edges=graph_edges)
        if remove_selfloops:
            graph.remove_selfloops()

        return graph.to_edge_index().to(device)

    def remove_duplicate_edges(self) -> "HyperedgeIndex":
        """Remove duplicate edges from the hyperedge index. Keeps the tensor contiguous in memory."""
        # Example: hyperedge_index = [[0, 1, 2, 2, 0, 3, 2],
        #                             [3, 4, 4, 3, 4, 3, 3]], shape (2, 7)
        #          -> after torch.unique(..., dim=1):
        #             hyperedge_index = [[0, 1, 2, 2, 0, 3],
        #                                [3, 4, 4, 3, 4, 3]], shape (2, |E'| = 6)
        # Note: we need to call contiguous() after torch.unique() to ensure
        # the resulting tensor is contiguous in memory, which is important for efficient indexing
        # and further operations (e.g., searchsorted)
        self.__hyperedge_index = torch.unique(self.__hyperedge_index, dim=1).contiguous()
        return self

    def to_0based(
        self,
        node_ids_to_rebase: Optional[Tensor] = None,
        hyperedge_ids_to_rebase: Optional[Tensor] = None,
    ) -> "HyperedgeIndex":
        """
        Convert hyperedge index to the 0-based format by rebasing node IDs to the range ``[0, num_nodes-1]`` and hyperedge IDs ``[0, num_hyperedges-1]``.

        Args:
            node_ids_to_rebase: Tensor of shape ``(num_nodes,)`` containing the original node IDs that need to be rebased to 0-based format.
                If ``None``, all node IDs in the hyperedge index will be rebased to 0-based format based on their unique sorted order.
            hyperedge_ids_to_rebase: Tensor of shape ``(num_hyperedges,)`` containing the original hyperedge IDs that need to be rebased to 0-based format.
                If ``None``, all hyperedge IDs in the hyperedge index will be rebased to 0-based format based on their unique sorted order.

        Returns:
            This :class:`HyperedgeIndex` instance, with the stored hyperedge index rebased in place (returned for chaining).
        """
        # Mutates the stored tensor in place and returns self for chaining.
        # Example: hyperedge_index after sorting: [[0, 0, 1, 2, 3, 4],
        #                                          [3, 4, 4, 3, 4, 3]]
        #          node_ids_to_rebase = [0, 1, 2, 3, 4]
        #          -> hyperedge_index after remapping: [[0, 0, 1, 2, 3, 4],
        #                                               [3, 4, 4, 3, 4, 3]]
        self.__hyperedge_index[0] = to_0based_ids(self.all_node_ids, node_ids_to_rebase)

        # Example: hyperedge_index after remapping nodes: [[0, 0, 1, 2, 3, 4],
        #                                                  [3, 4, 4, 3, 4, 3]]
        #          hyperedge_ids_to_rebase = [3, 4]
        #          -> hyperedge_index after remapping hyperedges: [[0, 0, 1, 2, 3, 4],
        #                                                          [0, 0, 1, 0, 1, 0]]
        self.__hyperedge_index[1] = to_0based_ids(self.all_hyperedge_ids, hyperedge_ids_to_rebase)

        return self

all_node_ids property

Return the tensor of all node IDs in the hyperedge index.

all_hyperedge_ids property

Return the tensor of all hyperedge IDs in the hyperedge index.

item property

Return the hyperedge index tensor.

node_ids property

Return the sorted unique node IDs from the hyperedge index.

hyperedge_ids property

Return the sorted unique hyperedge IDs from the hyperedge index.

num_hyperedges property

Return the number of hyperedges in the hypergraph.

num_nodes property

Return the number of nodes in the hypergraph.

num_incidences property

Return the number of incidences in the hypergraph, which is the number of columns in the hyperedge index.

nodes_in(hyperedge_id)

Return the list of node IDs that belong to the given hyperedge.

Source code in hyperbench/types/hypergraph.py
def nodes_in(self, hyperedge_id: int) -> List[int]:
    """Return the IDs of all nodes incident to the hyperedge ``hyperedge_id``."""
    # Row 1 of the index holds hyperedge IDs; select matching columns,
    # then read the node IDs from row 0.
    membership_mask = self.__hyperedge_index[1] == hyperedge_id
    return self.__hyperedge_index[0][membership_mask].tolist()

num_nodes_if_isolated_exist(num_nodes)

Return the number of nodes in the hypergraph, accounting for isolated nodes that may not appear in the hyperedge index.

Parameters:

Name Type Description Default
num_nodes int

The total number of nodes in the hypergraph, including isolated nodes.

required

Returns:

Type Description
int

The number of nodes in the hypergraph, which is the maximum of the number of unique nodes in the hyperedge index and the provided num_nodes.

Source code in hyperbench/types/hypergraph.py
def num_nodes_if_isolated_exist(self, num_nodes: int) -> int:
    """
    Return the node count, taking isolated nodes into account.

    Isolated nodes never appear in the hyperedge index, so the caller may know of
    more nodes than the index does; the larger of the two counts wins.

    Args:
        num_nodes: The total number of nodes in the hypergraph, including isolated nodes.

    Returns:
        The larger of the number of unique nodes in the hyperedge index and ``num_nodes``.
    """
    observed = self.num_nodes
    return observed if observed >= num_nodes else num_nodes

reduce_to_edge_index_on_clique_expansion(remove_selfloops=True)

Construct a graph from a hypergraph via clique expansion using H @ H^T, where H is the incidence matrix of the hypergraph. In clique expansion, each hyperedge is replaced by a clique connecting all its member nodes.

For each hyperedge, all pairs of member nodes become edges in the resulting graph. This is computed efficiently using the incidence matrix: A = H @ H^T, where H is the sparse incidence matrix of shape [num_nodes, num_hyperedges] and A is the adjacency matrix of the clique-expanded graph.

Parameters:

Name Type Description Default
remove_selfloops bool

Whether to remove self-loops from the diagonal of H @ H^T. Defaults to True.

True

Returns:

Type Description
Tensor

The edge index of the clique-expanded graph. Size (2, |E'|).

Source code in hyperbench/types/hypergraph.py
def reduce_to_edge_index_on_clique_expansion(
    self,
    remove_selfloops: bool = True,
) -> Tensor:
    """
    Reduce the hypergraph to a plain graph by clique expansion.

    Each hyperedge is replaced by a clique connecting all of its member nodes.
    The expansion is computed algebraically as ``A = H @ H^T``, where ``H`` is the
    sparse incidence matrix of shape ``[num_nodes, num_hyperedges]`` and ``A`` is
    the adjacency matrix of the clique-expanded graph.

    Args:
        remove_selfloops: Whether to drop the diagonal of ``H @ H^T`` (self-loops). Defaults to ``True``.

    Returns:
        The edge index of the clique-expanded graph. Size ``(2, |E'|)``.
    """
    # Sparse incidence matrix H: one 1.0 entry per (node, hyperedge) incidence.
    incidence_positions = torch.stack([self.all_node_ids, self.all_hyperedge_ids], dim=0)
    ones = torch.ones(
        size=(self.num_incidences,),
        dtype=torch.float,
        device=self.__hyperedge_index.device,
    )
    incidence = torch.sparse_coo_tensor(
        indices=incidence_positions,
        values=ones,
        size=(self.num_nodes, self.num_hyperedges),
    ).coalesce()

    # A = H @ H^T: entry (i, j) counts hyperedges shared by nodes i and j,
    # so the diagonal carries node degrees (self-loops).
    # Example: hyperedge_index = [[0, 1, 2, 0],
    #                             [0, 0, 0, 1]]
    #          -> H = [[1, 1],          -> A = H @ H^T = [[2, 1, 1],
    #                  [1, 0],                            [1, 1, 1],
    #                  [1, 0]]                            [1, 1, 1]]
    #          -> after removing self-loops, A's diagonal becomes 0.
    adjacency = torch.sparse.mm(incidence, incidence.t()).coalesce()

    # Keep only the non-zero structure, optionally strip the diagonal,
    # then symmetrize and deduplicate.
    edge_index = EdgeIndex(adjacency.indices())
    if remove_selfloops:
        edge_index.remove_selfloops()
    return edge_index.to_undirected().item

reduce_to_edge_index_on_random_direction(x, with_mediators=False, remove_selfloops=True)

Construct a graph from a hypergraph using the methods proposed in the paper HyperGCN: A New Method of Training Graph Convolutional Networks on Hypergraphs <https://arxiv.org/pdf/1809.02589.pdf>. Reference implementation: source <https://deephypergraph.readthedocs.io/en/latest/_modules/dhg/structure/graphs/graph.html#Graph.from_hypergraph_hypergcn>.

Parameters:

Name Type Description Default
x Tensor

Node feature matrix. Size (|V|, C).

required
with_mediators bool

Whether to use mediator to transform the hyperedges to edges in the graph. Defaults to False.

False
remove_selfloops bool

Whether to remove self-loops. Defaults to True.

True

Returns:

Type Description
Tensor

The edge index. Size (2, |E'|).

Raises:

Type Description
ValueError

If any hyperedge contains fewer than 2 nodes.

Source code in hyperbench/types/hypergraph.py
def reduce_to_edge_index_on_random_direction(
    self,
    x: Tensor,
    with_mediators: bool = False,
    remove_selfloops: bool = True,
) -> Tensor:
    """
    Reduce the hypergraph to a graph following `HyperGCN: A New Method of Training Graph Convolutional Networks on Hypergraphs <https://arxiv.org/pdf/1809.02589.pdf>`_.
    Reference implementation: `source <https://deephypergraph.readthedocs.io/en/latest/_modules/dhg/structure/graphs/graph.html#Graph.from_hypergraph_hypergcn>`_.

    Args:
        x: Node feature matrix. Size ``(|V|, C)``.
        with_mediators: Whether to use mediator to transform the hyperedges to edges in the graph. Defaults to ``False``.
        remove_selfloops: Whether to remove self-loops. Defaults to ``True``.

    Returns:
        The edge index. Size ``(2, |E'|)``.

    Raises:
        ValueError: If any hyperedge contains fewer than 2 nodes.
    """
    device = x.device

    hypergraph = Hypergraph.from_hyperedge_index(self.__hyperedge_index)
    edges_of_graph: List[List[int]] = []

    # A single random direction of shape (feature_dim, 1): geometrically, a random
    # line through the origin in R^d. Nodes that are far apart in R^d remain
    # separated under a random projection with high probability.
    direction = torch.rand((x.shape[1], 1), device=device)

    for hyperedge in hypergraph.hyperedges:
        size = len(hyperedge)
        if size < 2:
            raise ValueError("The number of vertices in an hyperedge must be >= 2.")

        # One scalar per member node: its projection onto 'direction'.
        scores = torch.matmul(x[hyperedge], direction).squeeze()

        # Positions (within the hyperedge) of the two most separated nodes
        # along 'direction'.
        hi = torch.argmax(scores)
        lo = torch.argmin(scores)

        # Without mediators, only the two extreme nodes are connected.
        if not with_mediators:
            edges_of_graph.append([hyperedge[lo], hyperedge[hi]])
            continue

        # With mediators, every other member bridges both extremes.
        for idx in range(size):
            if idx != hi and idx != lo:
                edges_of_graph.append([hyperedge[lo], hyperedge[idx]])
                edges_of_graph.append([hyperedge[hi], hyperedge[idx]])

    graph = Graph(edges=edges_of_graph)
    if remove_selfloops:
        graph.remove_selfloops()

    return graph.to_edge_index().to(device)

remove_duplicate_edges()

Remove duplicate edges from the hyperedge index. Keeps the tensor contiguous in memory.

Source code in hyperbench/types/hypergraph.py
def remove_duplicate_edges(self) -> "HyperedgeIndex":
    """
    Drop repeated (node, hyperedge) columns from the hyperedge index.

    ``torch.unique(..., dim=1)`` keeps one copy of every distinct column, e.g.
    [[0, 1, 2, 2, 0, 3, 2],        [[...6 unique columns...],
     [3, 4, 4, 3, 4, 3, 3]]   ->    [...                  ]].
    ``contiguous()`` is applied afterwards because ``torch.unique`` may return a
    non-contiguous tensor, and later operations (indexing, ``searchsorted``)
    depend on a contiguous memory layout.
    """
    deduplicated = torch.unique(self.__hyperedge_index, dim=1)
    self.__hyperedge_index = deduplicated.contiguous()
    return self

to_0based(node_ids_to_rebase=None, hyperedge_ids_to_rebase=None)

Convert hyperedge index to the 0-based format by rebasing node IDs to the range [0, num_nodes-1] and hyperedge IDs [0, num_hyperedges-1].

Parameters:

Name Type Description Default
node_ids_to_rebase Optional[Tensor]

Tensor of shape (num_nodes,) containing the original node IDs that need to be rebased to 0-based format. If None, all node IDs in the hyperedge index will be rebased to 0-based format based on their unique sorted order.

None
hyperedge_ids_to_rebase Optional[Tensor]

Tensor of shape (num_hyperedges,) containing the original hyperedge IDs that need to be rebased to 0-based format. If None, all hyperedge IDs in the hyperedge index will be rebased to 0-based format based on their unique sorted order.

None

Returns:

Type Description
HyperedgeIndex

This :class:HyperedgeIndex instance with the hyperedge index converted in place to 0-based format.

Source code in hyperbench/types/hypergraph.py
def to_0based(
    self,
    node_ids_to_rebase: Optional[Tensor] = None,
    hyperedge_ids_to_rebase: Optional[Tensor] = None,
) -> "HyperedgeIndex":
    """
    Rebase node IDs onto ``[0, num_nodes-1]`` and hyperedge IDs onto ``[0, num_hyperedges-1]``.

    Args:
        node_ids_to_rebase: Tensor of shape ``(num_nodes,)`` with the original node IDs to remap.
            When ``None``, every node ID in the index is remapped by its sorted unique order.
        hyperedge_ids_to_rebase: Tensor of shape ``(num_hyperedges,)`` with the original hyperedge IDs to remap.
            When ``None``, every hyperedge ID in the index is remapped by its sorted unique order.

    Returns:
        This :class:`HyperedgeIndex` instance, with both rows rewritten to 0-based IDs in place.
    """
    # Row 0 holds node IDs. Example with node_ids_to_rebase = [0, 1, 2, 3, 4]:
    #   [[0, 0, 1, 2, 3, 4],        [[0, 0, 1, 2, 3, 4],
    #    [3, 4, 4, 3, 4, 3]]   ->    [3, 4, 4, 3, 4, 3]]   (already 0-based)
    self.__hyperedge_index[0] = to_0based_ids(self.all_node_ids, node_ids_to_rebase)

    # Row 1 holds hyperedge IDs. Example with hyperedge_ids_to_rebase = [3, 4]:
    #   [[0, 0, 1, 2, 3, 4],        [[0, 0, 1, 2, 3, 4],
    #    [3, 4, 4, 3, 4, 3]]   ->    [0, 1, 1, 0, 1, 0]]   (3 -> 0, 4 -> 1)
    self.__hyperedge_index[1] = to_0based_ids(self.all_hyperedge_ids, hyperedge_ids_to_rebase)

    return self

Model

hyperbench.types.model

ModelConfig

A class representing the configuration of a model for training.

Parameters:

Name Type Description Default
name str

The name of the model.

required
version str

The version of the model.

'default'
model LightningModule

a LightningModule instance.

required
is_trainable bool

Whether the model is trainable.

True
trainer Optional[Trainer]

a Trainer instance.

None
Source code in hyperbench/types/model.py
class ModelConfig:
    """
    Configuration bundle describing a model for training.

    Args:
        name: The name of the model.
        version: The version of the model.
        model: a LightningModule instance.
        is_trainable: Whether the model is trainable.
        trainer: a Trainer instance.
    """

    def __init__(
        self,
        name: str,
        model: L.LightningModule,
        version: str = "default",
        is_trainable: bool = True,
        trainer: Optional[L.Trainer] = None,
    ) -> None:
        # Plain attribute storage; no validation is performed here.
        self.name = name
        self.model = model
        self.version = version
        self.is_trainable = is_trainable
        self.trainer = trainer

    def full_model_name(self) -> str:
        """Return the ``name:version`` identifier of this configuration."""
        return ":".join((self.name, self.version))

Utils Module

Data Utils

hyperbench.utils.data_utils

to_0based_ids(original_ids, ids_to_rebase=None)

Remap IDs to contiguous 0-based indices.

If ids_to_rebase is provided, only IDs present in it are kept and remapped. If ids_to_rebase is not provided, all unique IDs in original_ids are remapped.

Examples:

>>> to_0based_ids(torch.tensor([1, 3, 3, 7]), torch.tensor([3, 7]))
... -> tensor([0, 0, 1])  # 1 is excluded, 3 -> 0, 7 -> 1
>>> to_0based_ids(torch.tensor([5, 3, 5, 8]))
... -> tensor([1, 0, 1, 2])  # 3 -> 0, 5 -> 1, 8 -> 2

Parameters:

Name Type Description Default
original_ids Tensor

Tensor of original IDs.

required
ids_to_rebase Optional[Tensor]

Optional tensor of IDs to keep and remap. If None, all unique IDs are used.

None

Returns:

Type Description
Tensor

Tensor of 0-based IDs.

Source code in hyperbench/utils/data_utils.py
def to_0based_ids(original_ids: Tensor, ids_to_rebase: Optional[Tensor] = None) -> Tensor:
    """
    Map IDs onto contiguous 0-based indices.

    When ``ids_to_rebase`` is given, only the IDs it contains survive; any other
    ID in ``original_ids`` is dropped before remapping. Otherwise, all unique IDs
    of ``original_ids`` are remapped by their sorted order.

    Examples:
        >>> to_0based_ids(torch.tensor([1, 3, 3, 7]), torch.tensor([3, 7]))
        ... -> tensor([0, 0, 1])  # 1 is excluded, 3 -> 0, 7 -> 1

        >>> to_0based_ids(torch.tensor([5, 3, 5, 8]))
        ... -> tensor([1, 0, 1, 2])  # 3 -> 0, 5 -> 1, 8 -> 2

    Args:
        original_ids: Tensor of original IDs.
        ids_to_rebase: Optional tensor of IDs to keep and remap. If None, all unique IDs are used.

    Returns:
        Tensor of 0-based IDs.
    """
    if ids_to_rebase is not None:
        # Keep only whitelisted IDs, then rank each survivor within the sorted
        # unique whitelist via binary search.
        survivors = original_ids[torch.isin(original_ids, ids_to_rebase)]
        return torch.searchsorted(ids_to_rebase.unique(sorted=True), survivors)

    # No whitelist: rank every ID within the sorted unique set of all IDs.
    return torch.searchsorted(original_ids.unique(sorted=True), original_ids)

HIF Utils

hyperbench.utils.hif_utils

validate_hif_json(filename)

Validate a JSON file against the HIF (Hypergraph Interchange Format) schema.

Parameters:

Name Type Description Default
filename str

Path to the JSON file to validate.

required

Returns:

Type Description
bool

True if the file is valid HIF, False otherwise.

Source code in hyperbench/utils/hif_utils.py
def validate_hif_json(filename: str) -> bool:
    """
    Validate a JSON file against the HIF (Hypergraph Interchange Format) schema.

    The schema is fetched from the HIF-standard repository; if the request fails,
    a local copy of the schema is used as a fallback.

    Args:
        filename: Path to the JSON file to validate.

    Returns:
        ``True`` if the file is valid HIF, ``False`` otherwise.
    """
    url = "https://raw.githubusercontent.com/HIF-org/HIF-standard/main/schemas/hif_schema.json"
    try:
        schema = requests.get(url, timeout=10).json()
    except requests.RequestException:
        # requests.Timeout is a subclass of RequestException, so a single clause
        # covers timeouts too (the original listed both redundantly).
        # NOTE(review): this fallback path resolves against the current working
        # directory, not this module's location — confirm it is intended.
        with open("../schema/hif_schema.json", "r") as f:
            schema = json.load(f)
    validator = fastjsonschema.compile(schema)
    # Use a context manager so the input file is closed deterministically
    # (the original passed an unclosed open() handle to json.load).
    with open(filename, "r") as f:
        hif_document = json.load(f)
    try:
        validator(hif_document)
        return True
    except Exception:
        # fastjsonschema raises JsonSchemaException on invalid documents; any
        # other unexpected validator error is likewise reported as invalid.
        return False