Skip to content

Utils

hyperbench.utils

to_0based_ids(original_ids, ids_to_rebase=None)

Remap IDs to contiguous 0-based indices.

If ids_to_rebase is provided, only IDs present in it are kept and remapped. If ids_to_rebase is not provided, all unique IDs in original_ids are remapped.

Examples:

>>> to_0based_ids(torch.tensor([1, 3, 3, 7]), torch.tensor([3, 7]))
... -> tensor([0, 0, 1])  # 1 is excluded, 3 -> 0, 7 -> 1
>>> to_0based_ids(torch.tensor([5, 3, 5, 8]))
... -> tensor([1, 0, 1, 2])  # 3 -> 0, 5 -> 1, 8 -> 2

Parameters:

Name Type Description Default
original_ids Tensor

Tensor of original IDs.

required
ids_to_rebase Tensor | None

Optional tensor of IDs to keep and remap. If None, all unique IDs are used.

None

Returns:

Type Description
Tensor

Tensor of 0-based IDs.

Source code in hyperbench/utils/data_utils.py
def to_0based_ids(original_ids: Tensor, ids_to_rebase: Tensor | None = None) -> Tensor:
    """
    Remap IDs to contiguous 0-based indices.

    If ``ids_to_rebase`` is provided, only IDs present in it are kept and remapped.
    If ``ids_to_rebase`` is not provided, all unique IDs in ``original_ids`` are remapped.

    Examples:
        >>> to_0based_ids(torch.tensor([1, 3, 3, 7]), torch.tensor([3, 7]))
        ... -> tensor([0, 0, 1])  # 1 is excluded, 3 -> 0, 7 -> 1

        >>> to_0based_ids(torch.tensor([5, 3, 5, 8]))
        ... -> tensor([1, 0, 1, 2])  # 3 -> 0, 5 -> 1, 8 -> 2

    Args:
        original_ids: Tensor of original IDs.
        ids_to_rebase: Optional tensor of IDs to keep and remap. If None, all unique IDs are used.

    Returns:
        Tensor of 0-based IDs.
    """
    if ids_to_rebase is None:
        sorted_unique_original_ids = original_ids.unique(sorted=True)
        return torch.searchsorted(sorted_unique_original_ids, original_ids)

    keep_mask = torch.isin(original_ids, ids_to_rebase)
    ids_to_keep = original_ids[keep_mask]
    sorted_unique_ids_to_rebase = ids_to_rebase.unique(sorted=True)
    return torch.searchsorted(sorted_unique_ids_to_rebase, ids_to_keep)

validate_hif_json(filename)

Validate a JSON file against the HIF (Hypergraph Interchange Format) schema.

Parameters:

Name Type Description Default
filename str

Path to the JSON file to validate.

required

Returns:

Type Description
bool

True if the file is valid HIF, False otherwise.

Source code in hyperbench/utils/hif_utils.py
def validate_hif_json(filename: str) -> bool:
    """
    Validate a JSON file against the HIF (Hypergraph Interchange Format) schema.

    The schema is downloaded from the HIF-standard repository at the commit pinned
    by ``HIF_SCHEMA_COMMIT_SHA``. If the download fails (network error, timeout,
    HTTP error status, or a response body that is not JSON), the copy of
    ``hif_schema.json`` bundled in ``hyperbench.utils.schema`` is used instead.

    Args:
        filename: Path to the JSON file to validate.

    Returns:
        ``True`` if the file is valid HIF, ``False`` otherwise.
    """
    url = f"https://raw.githubusercontent.com/HIF-org/HIF-standard/{HIF_SCHEMA_COMMIT_SHA}/schemas/hif_schema.json"
    try:
        response = requests.get(url, timeout=10)
        # Without this check a 4xx/5xx reply carrying a JSON body would be used as the schema
        response.raise_for_status()
        schema = response.json()
    except (requests.RequestException, ValueError):
        # ValueError covers JSON decoding failures on requests versions whose
        # JSONDecodeError is not a RequestException subclass
        schema_file = resources.files("hyperbench.utils.schema").joinpath("hif_schema.json")
        with schema_file.open("r", encoding="utf-8") as f:
            schema = json.load(f)
    validator = fastjsonschema.compile(schema)

    # JSON is UTF-8 by specification; do not depend on the platform default encoding
    with open(filename, encoding="utf-8") as f:
        hiftext = json.load(f)

    # The file handle is already closed here; only schema violations map to False
    try:
        validator(hiftext)
    except fastjsonschema.JsonSchemaException:
        return False
    return True

maxmin_scatter(src, index, dim, dim_size=None)

Performs a scatter reduction that computes the channel-wise range (max - min) for each index group.

Parameters:

Name Type Description Default
src Tensor

The source tensor containing the values to scatter.

required
index Tensor

The indices of elements to scatter.

required
dim int

The axis along which to index.

required
dim_size int | None

The size of the output tensor along the scatter dimension. If not provided, it will be inferred from the maximum index value.

None

Returns:

Type Description
Tensor

A tensor containing the max-min values for each index group.

Source code in hyperbench/utils/nn_utils.py
def maxmin_scatter(
    src: Tensor,
    index: Tensor,
    dim: int,
    dim_size: int | None = None,
) -> Tensor:
    """
    Compute the channel-wise range (max - min) of each index group via two scatter reductions.

    Args:
        src: The source tensor containing the values to scatter.
        index: The indices of elements to scatter.
        dim: The axis along which to index.
        dim_size: The size of the output tensor along the scatter dimension.
            If not provided, it will be inferred from the maximum index value.

    Returns:
        A tensor containing the max-min values for each index group.
    """
    # The two reductions share every argument except the reduction mode;
    # "max" is evaluated first, then "min"
    group_max, group_min = (
        scatter(src=src, index=index, dim=dim, dim_size=dim_size, reduce=mode)
        for mode in ("max", "min")
    )
    return group_max - group_min

sparse_dropout(sparse_tensor, dropout_prob, fill_value=0.0)

Dropout function for sparse matrix.

Returns a new sparse matrix with the same shape as the input sparse matrix, but with some elements dropped out.

Parameters:

Name Type Description Default
sparse_tensor Tensor

The sparse matrix with format torch.sparse_coo_tensor.

required
dropout_prob float

Probability of an element to be dropped.

required
fill_value float

The fill value for dropped elements. Defaults to 0.0.

0.0

Returns:

Type Description
Tensor

A new sparse matrix with the same shape as the input sparse matrix, but with some elements dropped out.

Source code in hyperbench/utils/sparse_utils.py
def sparse_dropout(
    sparse_tensor: Tensor,
    dropout_prob: float,
    fill_value: float = 0.0,
) -> Tensor:
    """Dropout function for sparse matrix.

    Returns a new sparse matrix with the same shape as the input sparse matrix,
    but with some elements dropped out. Only the stored (non-zero) entries take
    part in the dropout; the sparsity pattern itself is preserved. Kept values
    are returned unchanged (they are not rescaled by ``1 / (1 - dropout_prob)``).

    Args:
        sparse_tensor: The sparse matrix with format ``torch.sparse_coo_tensor``.
        dropout_prob: Probability of an element to be dropped.
        fill_value: The fill value for dropped elements. Defaults to ``0.0``.

    Returns:
        A new sparse matrix with the same shape as the input sparse matrix, but with some elements dropped out.
        When ``dropout_prob`` is ``0`` the coalesced input is returned as is.

    Raises:
        ValueError: If ``dropout_prob`` is NaN or outside the range ``[0, 1]``.
    """
    # Validate before touching the tensor so bad input fails fast.
    # The chained comparison is False for NaN, so NaN is rejected as well.
    if not 0 <= dropout_prob <= 1:
        raise ValueError("Dropout probability must be in the range [0, 1]")

    # Sparse tensors may have unsorted indices or duplicate entries
    # 'coalesce()' will sum duplicates and sort indices to have a consistent format for dropout
    sparse_tensor = sparse_tensor.coalesce()

    # Nothing to drop, return the (coalesced) sparse tensor
    if dropout_prob == 0:
        return sparse_tensor

    values = sparse_tensor.values()
    indices = sparse_tensor.indices()

    keep_prob = 1 - dropout_prob

    # Sample one keep/drop decision per stored value
    # 'torch.bernoulli()' samples 1 with probability keep_prob and 0 with probability dropout_prob
    # Example: values = [0.5, 1.2, 3.4], keep_prob = 0.8
    #          -> keep_mask might be [True, False, True]: keep the 1st and 3rd elements, drop the 2nd
    keep_mask = torch.bernoulli(torch.full_like(values, keep_prob)).bool()

    # Select instead of multiplying by the mask: 'inf * 0' and 'nan * 0' are NaN,
    # so a multiplicative mask would leak NaN into entries that were meant to be dropped
    # Example: values = [0.5, 1.2, 3.4], keep_mask = [True, False, True], fill_value = 9.9
    #          -> new_values = [0.5, 9.9, 3.4]
    new_values = torch.where(keep_mask, values, torch.full_like(values, fill_value))

    # Reuse the original indices and shape to preserve sparsity but change values
    return torch.sparse_coo_tensor(
        indices=indices,
        values=new_values,
        size=sparse_tensor.size(),
        dtype=sparse_tensor.dtype,
        device=sparse_tensor.device,
    )

decompress_zst(zst_path)

Decompresses a .zst file and returns the path to the decompressed JSON file.

Parameters:

zst_path (str): The path to the .zst file to decompress.

Returns:

str: The path to the decompressed JSON file.

Source code in hyperbench/utils/file_utils.py
def decompress_zst(zst_path: str) -> str:
    """
    Decompresses a .zst file and returns the path to the decompressed JSON file.

    The output is written to a new temporary file with a ``.json`` suffix that is
    NOT deleted automatically; the caller is responsible for removing it. If
    decompression fails, the partially written temporary file is removed before
    the error is re-raised.

    Args:
        zst_path: The path to the .zst file to decompress.

    Returns:
        The path to the decompressed JSON file.
    """
    import os  # local import: only needed to clean up the temp file on failure

    dctx = zstd.ZstdDecompressor()
    # Open the input first so a missing input file never creates a temp file
    with open(zst_path, "rb") as input_f:
        tmp_file = tempfile.NamedTemporaryFile(mode="wb", suffix=".json", delete=False)
        try:
            with tmp_file:
                dctx.copy_stream(input_f, tmp_file)
        except BaseException:
            # 'delete=False' means nobody else will remove a half-written file;
            # it is closed at this point, so it can be unlinked on every platform
            try:
                os.unlink(tmp_file.name)
            except OSError:
                pass  # best-effort cleanup, the original error matters more
            raise
    return tmp_file.name

compress_to_zst(json_path)

Compresses a JSON file to .zst format and returns the compressed bytes.

Parameters:

Name Type Description Default
json_path str

The path to the JSON file to compress.

required

Returns:

Type Description
bytes

The compressed content as bytes.

Source code in hyperbench/utils/file_utils.py
def compress_to_zst(json_path: str) -> bytes:
    """
    Read a JSON file and return its content compressed in .zst format.

    The whole file is read into memory and compressed in one call.

    Args:
        json_path: The path to the JSON file to compress.

    Returns:
        The compressed content as bytes.
    """
    compressor = zstd.ZstdCompressor()
    with open(json_path, "rb") as json_file:
        raw_bytes = json_file.read()
    return compressor.compress(raw_bytes)

write_to_disk(dataset_name, content, output_dir=None)

Writes the compressed content to disk in the specified output directory or a default location.

Parameters:

dataset_name (str): The name of the dataset.

content (bytes): The compressed content as bytes.

output_dir (str | None): The directory to write the file to. If None, a default location is used.

Source code in hyperbench/utils/file_utils.py
def write_to_disk(dataset_name: str, content: bytes, output_dir: str | None = None) -> None:
    """
    Writes the compressed content to disk in the specified output directory or a default location.
    Args:
        dataset_name: The name of the dataset.
        content: The compressed content as bytes.
        output_dir: The directory to write the file to. If None, a default location is used.
    """
    if output_dir is not None:
        zst_filename = os.path.join(output_dir, f"{dataset_name}.json.zst")
    else:
        current_dir = os.path.dirname(os.path.abspath(__file__))
        output_dir = os.path.join(current_dir, "..", "data", "datasets")
        zst_filename = os.path.join(output_dir, f"{dataset_name}.json.zst")

    os.makedirs(output_dir, exist_ok=True)

    with open(zst_filename, "wb") as f:
        f.write(content)