Source code for inseq.data.data_utils

from copy import deepcopy
from dataclasses import dataclass, fields
from typing import Any, TypeVar

import numpy as np
import torch
import treescope as ts
from jaxtyping import Int

from ..utils import isnotebook, pretty_dict

# Type variable bound to ``TensorWrapper`` (forward reference by name), used to annotate
# methods so that they are typed as returning the same wrapper subclass they are called on.
TensorClass = TypeVar("TensorClass", bound="TensorWrapper")


@dataclass
class TensorWrapper:
    """Wrapper for tensors and lists of tensors to allow for easy access to their attributes.

    Every public operation is applied field by field. The private static helpers
    (``_getitem``, ``_slice_batch``, ``_select_active``, ``_to``, ``_detach``, ``_numpy``,
    ``_torch``, ``_eq``) implement the per-attribute behavior and recurse into dictionaries
    and nested ``TensorWrapper`` instances. Attribute types that a helper does not handle
    are returned unchanged.
    """

    @staticmethod
    def _getitem(attr, subscript):
        """Index ``attr`` with ``subscript`` along its second dimension (first if 1-D).

        Tensors with two or more dimensions are indexed as ``attr[:, subscript, ...]``,
        1-D tensors as ``attr[subscript]``. Lists of lists are indexed per inner list.
        """
        if isinstance(attr, torch.Tensor):
            if attr.ndim == 0:
                # A 0-dim tensor has no dimension to index; return it unchanged instead of
                # falling through and implicitly returning None.
                return attr
            if attr.ndim == 1:
                return attr[subscript]
            return attr[:, subscript, ...]
        elif isinstance(attr, TensorWrapper):
            return attr[subscript]
        elif isinstance(attr, list) and attr and isinstance(attr[0], list):
            # The emptiness guard avoids an IndexError on ``attr[0]`` for empty lists,
            # which are returned unchanged by the final branch.
            return [seq[subscript] for seq in attr]
        elif isinstance(attr, dict):
            return {key: TensorWrapper._getitem(val, subscript) for key, val in attr.items()}
        else:
            return attr

    @staticmethod
    def _slice_batch(attr, subscript):
        """Index ``attr`` with ``subscript`` along its first dimension."""
        if isinstance(attr, torch.Tensor):
            if attr.ndim == 0:
                # Nothing to slice on a 0-dim tensor; avoid implicitly returning None.
                return attr
            if attr.ndim == 1:
                return attr[subscript]
            return attr[subscript, ...]
        elif isinstance(attr, TensorWrapper | list):
            return attr[subscript]
        elif isinstance(attr, dict):
            return {key: TensorWrapper._slice_batch(val, subscript) for key, val in attr.items()}
        else:
            return attr

    @staticmethod
    def _select_active(attr, mask):
        """Keep only the entries of ``attr`` whose position is flagged in ``mask``.

        Tensors with at most one dimension are returned unchanged. For larger tensors the
        mask is cast to bool, given trailing singleton dimensions up to the rank of
        ``attr``, and the selected values are reshaped to ``(-1, *attr.shape[1:])``.
        """
        if isinstance(attr, torch.Tensor):
            if attr.ndim <= 1:
                return attr
            curr_mask = mask.clone()
            if curr_mask.dtype != torch.bool:
                curr_mask = curr_mask.bool()
            while curr_mask.ndim < attr.ndim:
                curr_mask = curr_mask.unsqueeze(-1)
            orig_shape = attr.shape[1:]
            return attr.masked_select(curr_mask).reshape(-1, *orig_shape)
        elif isinstance(attr, TensorWrapper):
            return attr.select_active(mask)
        elif isinstance(attr, list):
            # Convert the mask once rather than once per element. It is flattened first so
            # that a (batch_size, 1) mask yields scalar flags: the nested rows produced by
            # ``tolist()`` on a 2-D mask (e.g. ``[False]``) are non-empty lists and would
            # always evaluate as truthy.
            flags = mask.reshape(-1).tolist()
            return [val for i, val in enumerate(attr) if flags[i]]
        elif isinstance(attr, dict):
            return {key: TensorWrapper._select_active(val, mask) for key, val in attr.items()}
        else:
            return attr

    @staticmethod
    def _to(attr, device: str):
        """Move tensors and wrappers (also inside dictionaries) to ``device``."""
        if isinstance(attr, torch.Tensor | TensorWrapper):
            return attr.to(device)
        elif isinstance(attr, dict):
            return {key: TensorWrapper._to(val, device) for key, val in attr.items()}
        else:
            return attr

    @staticmethod
    def _detach(attr):
        """Detach tensors and wrappers (also inside dictionaries)."""
        if isinstance(attr, torch.Tensor | TensorWrapper):
            return attr.detach()
        elif isinstance(attr, dict):
            return {key: TensorWrapper._detach(val) for key, val in attr.items()}
        else:
            return attr

    @staticmethod
    def _numpy(attr):
        """Convert tensors to C-contiguous NumPy arrays; wrappers are converted via ``numpy()``."""
        if isinstance(attr, torch.Tensor | TensorWrapper):
            np_array = attr.numpy()
            # ``TensorWrapper.numpy()`` returns the wrapper itself, so only real arrays
            # are made contiguous.
            if isinstance(np_array, np.ndarray):
                return np.ascontiguousarray(np_array, dtype=np_array.dtype)
            return np_array
        elif isinstance(attr, dict):
            return {key: TensorWrapper._numpy(val) for key, val in attr.items()}
        else:
            return attr

    @staticmethod
    def _torch(attr):
        """Convert NumPy arrays to tensors; wrappers are converted via ``torch()``."""
        if isinstance(attr, np.ndarray):
            return torch.tensor(attr)
        elif isinstance(attr, TensorWrapper):
            return attr.torch()
        elif isinstance(attr, dict):
            return {key: TensorWrapper._torch(val) for key, val in attr.items()}
        else:
            return attr

    @staticmethod
    def _eq(self_attr: TensorClass, other_attr: TensorClass) -> bool:
        """Compare two attribute values, returning False if the comparison itself fails.

        Tensors are compared with ``torch.allclose`` (``equal_nan=True``, ``atol=1e-5``).
        Dictionaries are compared value by value over the keys of ``self_attr``. Anything
        else is compared with ``==``.
        """
        try:
            if isinstance(self_attr, torch.Tensor):
                return torch.allclose(self_attr, other_attr, equal_nan=True, atol=1e-5)
            elif isinstance(self_attr, dict):
                return all(TensorWrapper._eq(self_attr[k], other_attr[k]) for k in self_attr.keys())
            else:
                return self_attr == other_attr
        except Exception:
            # Values that cannot be compared (mismatched types or shapes, missing keys, ...)
            # count as different. ``Exception`` rather than a bare ``except`` so that
            # KeyboardInterrupt and SystemExit still propagate.
            return False

    def __getitem__(self: TensorClass, subscript) -> TensorClass:
        """By default, idiomatic slicing is used for the sequence dimension across batches.
        For batching use `slice_batch` instead.
        """
        return self.__class__(
            **{field.name: self._getitem(getattr(self, field.name), subscript) for field in fields(self.__class__)}
        )

    def slice_batch(self: TensorClass, subscript) -> TensorClass:
        """Return a new instance with every field indexed by ``subscript`` on its first dimension."""
        return self.__class__(
            **{field.name: self._slice_batch(getattr(self, field.name), subscript) for field in fields(self.__class__)}
        )

    def select_active(self: TensorClass, mask: Int[torch.Tensor, "batch_size 1"]) -> TensorClass:
        """Return a new instance keeping, for every field, only the entries flagged in ``mask``."""
        return self.__class__(
            **{field.name: self._select_active(getattr(self, field.name), mask) for field in fields(self.__class__)}
        )

    def to(self: TensorClass, device: str) -> TensorClass:
        """Move every field to ``device`` in place and return ``self``.

        When ``device`` is ``"cpu"`` and CUDA is available, the CUDA cache is emptied afterwards.
        """
        for field in fields(self.__class__):
            attr = getattr(self, field.name)
            setattr(self, field.name, self._to(attr, device))
        if device == "cpu" and torch.cuda.is_available():
            torch.cuda.empty_cache()
        return self

    def detach(self: TensorClass) -> TensorClass:
        """Detach every field in place and return ``self``."""
        for field in fields(self.__class__):
            attr = getattr(self, field.name)
            setattr(self, field.name, self._detach(attr))
        return self

    def numpy(self: TensorClass) -> TensorClass:
        """Convert every field to NumPy in place and return ``self``."""
        for field in fields(self.__class__):
            attr = getattr(self, field.name)
            setattr(self, field.name, self._numpy(attr))
        return self

    def torch(self: TensorClass) -> TensorClass:
        """Convert every attribute listed by ``to_dict()`` to torch in place and return ``self``."""
        for field, val in self.to_dict().items():
            setattr(self, field, self._torch(val))
        return self

    def clone(self: TensorClass) -> TensorClass:
        """Return a new instance whose fields are copies of this instance's fields.

        Tensors and wrappers are copied with ``clone()``, other non-None values with
        ``deepcopy``, and None fields stay None.
        """
        out_params = {}
        for field in fields(self.__class__):
            attr = getattr(self, field.name)
            if isinstance(attr, torch.Tensor | TensorWrapper):
                out_params[field.name] = attr.clone()
            elif attr is not None:
                out_params[field.name] = deepcopy(attr)
            else:
                out_params[field.name] = None
        return self.__class__(**out_params)

    def clone_empty(self: TensorClass) -> TensorClass:
        """Return a new instance built only from the non-None attributes whose name starts with ``_``."""
        out_params = {k: v for k, v in self.__dict__.items() if k.startswith("_") and v is not None}
        return self.__class__(**out_params)

    def to_dict(self: TensorClass) -> dict[str, Any]:
        """Return the instance attributes whose name does not start with ``_``."""
        return {k: v for k, v in self.__dict__.items() if not k.startswith("_")}

    def __str__(self):
        return f"{self.__class__.__name__}({pretty_dict(self.to_dict())})"

    def __repr__(self):
        # In a notebook the object is rendered through treescope and the textual
        # representation is left empty.
        if isnotebook():
            ts.display(self)
            return ""
        return self.__str__()

    def __eq__(self, other):
        """Return True when every attribute of ``self`` matches the same attribute of ``other``.

        Objects lacking one of the attributes (e.g. ``None``) compare as different
        instead of raising ``AttributeError``.
        """
        for name, val in self.__dict__.items():
            if not hasattr(other, name):
                return False
            if not self._eq(val, getattr(other, name)):
                return False
        return True

    def __json_encode__(self):
        """Return a dictionary of a detached, CPU, NumPy-converted copy of this instance."""
        return self.clone().detach().to("cpu").numpy().to_dict()

    def __json_decode__(self, **attrs):
        """Restore the instance state from ``attrs`` and run ``__post_init__``."""
        # Does not contemplate the usage of __slots__
        self.__dict__ = attrs
        self.__post_init__()

    def __post_init__(self):
        pass