Utils¶

`hyperbench.utils` ¶

`to_0based_ids(original_ids, ids_to_rebase=None)` ¶

Remap IDs to contiguous 0-based indices.

If ids_to_rebase is provided, only IDs present in it are kept and remapped. If ids_to_rebase is not provided, all unique IDs in original_ids are remapped.

Examples:

>>> to_0based_ids(torch.tensor([1, 3, 3, 7]), torch.tensor([3, 7]))
... -> tensor([0, 0, 1])  # 1 is excluded, 3 -> 0, 7 -> 1

>>> to_0based_ids(torch.tensor([5, 3, 5, 8]))
... -> tensor([1, 0, 1, 2])  # 3 -> 0, 5 -> 1, 8 -> 2

Parameters:

Name	Type	Description	Default
`original_ids`	`Tensor`	Tensor of original IDs.	required
`ids_to_rebase`	`Tensor \| None`	Optional tensor of IDs to keep and remap. If None, all unique IDs are used.	`None`

Returns:

Type	Description
`Tensor`	Tensor of 0-based IDs.

Source code in hyperbench/utils/data_utils.py

def to_0based_ids(original_ids: Tensor, ids_to_rebase: Tensor | None = None) -> Tensor:
    """
    Remap IDs to contiguous 0-based indices.

    If ``ids_to_rebase`` is provided, only IDs present in it are kept and remapped.
    If ``ids_to_rebase`` is not provided, all unique IDs in ``original_ids`` are remapped.

    Examples:
        >>> to_0based_ids(torch.tensor([1, 3, 3, 7]), torch.tensor([3, 7]))
        ... -> tensor([0, 0, 1])  # 1 is excluded, 3 -> 0, 7 -> 1

        >>> to_0based_ids(torch.tensor([5, 3, 5, 8]))
        ... -> tensor([1, 0, 1, 2])  # 3 -> 0, 5 -> 1, 8 -> 2

    Args:
        original_ids: Tensor of original IDs.
        ids_to_rebase: Optional tensor of IDs to keep and remap. If None, all unique IDs are used.

    Returns:
        Tensor of 0-based IDs.
    """
    if ids_to_rebase is None:
        sorted_unique_original_ids = original_ids.unique(sorted=True)
        return torch.searchsorted(sorted_unique_original_ids, original_ids)

    keep_mask = torch.isin(original_ids, ids_to_rebase)
    ids_to_keep = original_ids[keep_mask]
    sorted_unique_ids_to_rebase = ids_to_rebase.unique(sorted=True)
    return torch.searchsorted(sorted_unique_ids_to_rebase, ids_to_keep)

`validate_hif_json(filename)` ¶

Validate a JSON file against the HIF (Hypergraph Interchange Format) schema.

Parameters:

Name	Type	Description	Default
`filename`	`str`	Path to the JSON file to validate.	required

Returns:

Type	Description
`bool`	`True` if the file is valid HIF, `False` otherwise.

Source code in hyperbench/utils/hif_utils.py

def validate_hif_json(filename: str) -> bool:
    """
    Validate a JSON file against the HIF (Hypergraph Interchange Format) schema.

    The schema is downloaded from the HIF-standard repository at the commit pinned by
    ``HIF_SCHEMA_COMMIT_SHA``. If the download fails (connection error, timeout, HTTP
    error status, or a response body that is not JSON), the copy bundled in
    ``hyperbench.utils.schema`` is used instead.

    Args:
        filename: Path to the JSON file to validate.

    Returns:
        ``True`` if the file is valid HIF, ``False`` otherwise.

    Raises:
        OSError: If ``filename`` cannot be opened.
        json.JSONDecodeError: If ``filename`` does not contain valid JSON.
    """
    url = f"https://raw.githubusercontent.com/HIF-org/HIF-standard/{HIF_SCHEMA_COMMIT_SHA}/schemas/hif_schema.json"
    try:
        response = requests.get(url, timeout=10)
        # Without this check the body of a 4xx/5xx response would be parsed and
        # compiled as if it were the schema.
        response.raise_for_status()
        schema = response.json()
    except (requests.RequestException, ValueError):
        # ``requests.Timeout`` is a ``RequestException``; ``ValueError`` covers JSON decoding
        # errors on ``requests`` versions whose decode error is not a ``RequestException``.
        bundled_schema = resources.files("hyperbench.utils.schema").joinpath("hif_schema.json")
        with bundled_schema.open("r", encoding="utf-8") as f:
            schema = json.load(f)
    validator = fastjsonschema.compile(schema)

    # JSON is UTF-8 by specification; do not depend on the platform default encoding.
    with open(filename, encoding="utf-8") as f:
        hiftext = json.load(f)

    try:
        validator(hiftext)
    except Exception:
        # Any failure raised by the compiled validator is reported as "not valid".
        return False
    return True

`maxmin_scatter(src, index, dim, dim_size=None)` ¶

Performs a scatter reduction that computes the channel-wise range (max - min) for each index group.

Parameters:

Name	Type	Description	Default
`src`	`Tensor`	The source tensor containing the values to scatter.	required
`index`	`Tensor`	The indices of elements to scatter.	required
`dim`	`int`	The axis along which to index.	required
`dim_size`	`int \| None`	The size of the output tensor along the scatter dimension. If not provided, it will be inferred from the maximum index value.	`None`

Returns:

Type	Description
`Tensor`	A tensor containing the max-min values for each index group.

Source code in hyperbench/utils/nn_utils.py

def maxmin_scatter(
    src: Tensor,
    index: Tensor,
    dim: int,
    dim_size: int | None = None,
) -> Tensor:
    """
    Compute the per-group range (max - min) of ``src`` with two scatter reductions.

    ``scatter`` is called twice with identical arguments, once with ``reduce="max"``
    and once with ``reduce="min"``, and the two results are subtracted element-wise.

    Args:
        src: The source tensor containing the values to scatter.
        index: The indices of elements to scatter.
        dim: The axis along which to index.
        dim_size: The size of the output tensor along the scatter dimension.
            If not provided, it will be inferred from the maximum index value.

    Returns:
        A tensor containing the max-min values for each index group.
    """

    def _reduce(mode: str) -> Tensor:
        # Both reductions share every argument except the reduction mode.
        return scatter(src=src, index=index, dim=dim, dim_size=dim_size, reduce=mode)

    group_max = _reduce("max")
    group_min = _reduce("min")
    return group_max - group_min

`sparse_dropout(sparse_tensor, dropout_prob, fill_value=0.0)` ¶

Dropout function for sparse matrix.

Returns a new sparse matrix with the same shape as the input sparse matrix, but with some elements dropped out.

Parameters:

Name	Type	Description	Default
`sparse_tensor`	`Tensor`	The sparse matrix with format `torch.sparse_coo_tensor`.	required
`dropout_prob`	`float`	Probability of an element to be dropped.	required
`fill_value`	`float`	The fill value for dropped elements. Defaults to `0.0`.	`0.0`

Returns:

Type	Description
`Tensor`	A new sparse matrix with the same shape as the input sparse matrix, but with some elements dropped out.

Source code in hyperbench/utils/sparse_utils.py

def sparse_dropout(
    sparse_tensor: Tensor,
    dropout_prob: float,
    fill_value: float = 0.0,
) -> Tensor:
    """Dropout function for sparse matrix.

    Returns a new sparse matrix with the same shape and the same stored indices as the
    (coalesced) input, in which every stored value is independently replaced by
    ``fill_value`` with probability ``dropout_prob``. Kept values are passed through
    unchanged; they are not rescaled.

    Args:
        sparse_tensor: The sparse matrix with format ``torch.sparse_coo_tensor``.
        dropout_prob: Probability of an element to be dropped.
        fill_value: The fill value for dropped elements. Defaults to ``0.0``.

    Returns:
        A new sparse matrix with the same shape as the input sparse matrix, but with some
        elements dropped out. If ``dropout_prob`` is ``0``, the coalesced input is returned as is.

    Raises:
        ValueError: If ``dropout_prob`` is outside the range ``[0, 1]`` or is NaN.
    """
    # Validate before doing any tensor work. The negated chained comparison also
    # rejects NaN, which would slip through 'dropout_prob > 1 or dropout_prob < 0'.
    if not 0 <= dropout_prob <= 1:
        raise ValueError("Dropout probability must be in the range [0, 1]")

    device = sparse_tensor.device

    # Sparse tensors may have unsorted indices or duplicate entries
    # 'coalesce()' will sum duplicates and sort indices to have a consistent format for dropout
    sparse_tensor = sparse_tensor.coalesce()

    # Nothing to drop, return the (coalesced) sparse tensor
    if dropout_prob == 0:
        return sparse_tensor

    values = sparse_tensor.values()
    indices = sparse_tensor.indices()

    keep_prob = 1 - dropout_prob

    # Generate a boolean mask matching the shape of values for elements to keep
    # 'torch.bernoulli()' samples 1 with probability keep_prob and 0 with probability dropout_prob
    # Example: values = [0.5, 1.2, 3.4], keep_prob = 0.8
    #          -> keep_mask might be [True, False, True], meaning we keep the 1st and 3rd elements, drop the 2nd
    keep_mask = torch.bernoulli(torch.full_like(values, keep_prob)).bool()

    # Overwrite the dropped positions instead of multiplying by the mask:
    #   - 'inf * 0' and 'nan * 0' are nan, so multiplication would leak nan into dropped entries
    #   - 'masked_fill' writes fill_value in the dtype of values, without a float32 round-trip
    # Example: values = [0.5, 1.2, 3.4], keep_mask = [True, False, True], fill_value = 9.9
    #          -> new_values = [0.5, 9.9, 3.4]
    new_values = values.masked_fill(~keep_mask, fill_value)

    # Reuse the original indices and shape to preserve sparsity but change values
    dropout_sparse_tensor = torch.sparse_coo_tensor(
        indices=indices,
        values=new_values,
        size=sparse_tensor.size(),
        dtype=sparse_tensor.dtype,
        device=device,
    )

    return dropout_sparse_tensor

`decompress_zst(zst_path)` ¶

Decompresses a .zst file and returns the path to the decompressed JSON file.

Parameters:

Name	Type	Description	Default
`zst_path`	`str`	The path to the .zst file to decompress.	required

Returns:

Type	Description
`str`	The path to the decompressed JSON file.

Source code in hyperbench/utils/file_utils.py

def decompress_zst(zst_path: str) -> str:
    """
    Decompresses a .zst file and returns the path to the decompressed JSON file.

    The decompressed bytes are streamed into a new named temporary file with a
    ``.json`` suffix. That file is created with ``delete=False``, so it is not
    removed automatically: the caller is responsible for deleting it when done.
    If decompression fails, the partially written temporary file is removed
    before the error is re-raised.

    Args:
        zst_path: The path to the .zst file to decompress.

    Returns:
        The path to the decompressed JSON file.

    Raises:
        OSError: If ``zst_path`` cannot be opened.
    """
    import os  # local import: only needed for cleanup on failure

    dctx = zstd.ZstdDecompressor()
    with open(zst_path, "rb") as input_f:
        tmp_file = tempfile.NamedTemporaryFile(mode="wb", suffix=".json", delete=False)
        try:
            # Close the temporary file before any cleanup so the unlink also works
            # on platforms that refuse to delete open files.
            with tmp_file:
                dctx.copy_stream(input_f, tmp_file)
        except BaseException:
            # 'delete=False' means nobody else will clean this up; do not leave a
            # truncated file behind when the stream copy fails.
            os.unlink(tmp_file.name)
            raise
    return tmp_file.name

`compress_to_zst(json_path)` ¶

Compresses a JSON file to .zst format and returns the compressed bytes.

Parameters:

Name	Type	Description	Default
`json_path`	`str`	The path to the JSON file to compress.	required

Returns:

Type	Description
`bytes`	The compressed content as bytes.

Source code in hyperbench/utils/file_utils.py

def compress_to_zst(json_path: str) -> bytes:
    """
    Read a JSON file and return its content compressed in .zst format.

    The whole file is read into memory as bytes and compressed in a single call.

    Args:
        json_path: The path to the JSON file to compress.

    Returns:
        The compressed content as bytes.
    """
    compressor = zstd.ZstdCompressor()
    with open(json_path, "rb") as source:
        raw_bytes = source.read()
    return compressor.compress(raw_bytes)

`write_to_disk(dataset_name, content, output_dir=None)` ¶

Writes the compressed content to disk in the specified output directory or a default location.

Parameters:

Name	Type	Description	Default
`dataset_name`	`str`	The name of the dataset.	required
`content`	`bytes`	The compressed content as bytes.	required
`output_dir`	`str \| None`	The directory to write the file to. If None, a default location is used.	`None`

Source code in hyperbench/utils/file_utils.py

def write_to_disk(dataset_name: str, content: bytes, output_dir: str | None = None) -> None:
    """
    Writes the compressed content to disk in the specified output directory or a default location.
    Args:
        dataset_name: The name of the dataset.
        content: The compressed content as bytes.
        output_dir: The directory to write the file to. If None, a default location is used.
    """
    if output_dir is not None:
        zst_filename = os.path.join(output_dir, f"{dataset_name}.json.zst")
    else:
        current_dir = os.path.dirname(os.path.abspath(__file__))
        output_dir = os.path.join(current_dir, "..", "data", "datasets")
        zst_filename = os.path.join(output_dir, f"{dataset_name}.json.zst")

    os.makedirs(output_dir, exist_ok=True)

    with open(zst_filename, "wb") as f:
        f.write(content)

Utils¶

hyperbench.utils ¶

to_0based_ids(original_ids, ids_to_rebase=None) ¶

validate_hif_json(filename) ¶

maxmin_scatter(src, index, dim, dim_size=None) ¶

sparse_dropout(sparse_tensor, dropout_prob, fill_value=0.0) ¶

decompress_zst(zst_path) ¶

compress_to_zst(json_path) ¶

write_to_disk(dataset_name, content, output_dir=None) ¶

`hyperbench.utils` ¶

`to_0based_ids(original_ids, ids_to_rebase=None)` ¶

`validate_hif_json(filename)` ¶

`maxmin_scatter(src, index, dim, dim_size=None)` ¶

`sparse_dropout(sparse_tensor, dropout_prob, fill_value=0.0)` ¶

`decompress_zst(zst_path)` ¶

`compress_to_zst(json_path)` ¶

`write_to_disk(dataset_name, content, output_dir=None)` ¶