"""
Dummy matrix.
This module provides a fixed dummy matrix for use in linear
mixed-effects models. The matrix is constructed
from categorical input at initialisation and has no
trainable parameters.
Classes:
DummyMatrix:
A fixed dummy matrix constructed from categorical data.
"""
import warnings
from itertools import product
import torch
import pandas as pd
from torch_openreml.covariance.matrix import Matrix
[docs]
class DummyMatrix(Matrix):
    r"""
    Fixed dummy matrix constructed from categorical input.

    .. math::

        \symbf{V} = \symbf{X}

    where :math:`\symbf{X}` is constructed from ``*args`` at initialisation
    and remains fixed thereafter. This matrix has no trainable parameters,
    so :meth:`grad` always returns ``(None, [])``.
    """

    def __init__(
        self,
        *args,
        levels=None,
        lex_order=True,
        drop_first=False,
        drop_empty_cols=False,
        dtype=None,
        device=None,
    ):
        """
        Initialize a fixed dummy matrix from categorical input.

        Every combination of levels across the inputs becomes one column,
        named by joining the labels of the combination with ``"⋈"``. Row
        ``i`` holds a single ``1`` in the column matching the ``i``-th
        elements of the inputs and ``0`` elsewhere.

        Args:
            *args (list, tuple, or pandas.Series): Input data. One or many
                equally long sequences of strings for categorical data.
            levels (list or tuple, optional): Levels of each input, one
                sequence of strings per input. Defaults to the sorted unique
                elements of each input.
            lex_order (bool, optional): If ``True``, the result columns are
                lexically ordered. If ``False``, they follow the order of
                ``levels`` with the last input varying fastest. Defaults to
                ``True``.
            drop_first (bool, optional): Whether to drop the first column.
                Defaults to ``False``.
            drop_empty_cols (bool, optional): Whether to drop columns that
                contain no ``1`` (applied after ``drop_first``). Defaults to
                ``False``.
            dtype (torch.dtype, optional): Desired dtype of the matrix.
                Defaults to ``torch.get_default_dtype()``.
            device (torch.device, optional): Desired device of the matrix.
                Defaults to ``torch.get_default_device()``.

        Raises:
            TypeError: If no input is given, or if any input is not a list,
                a tuple or a :class:`pandas.Series`.
            ValueError: If the inputs differ in length, or if ``levels`` does
                not contain exactly one entry per input.

        Warns:
            RuntimeWarning: For every row whose combination of labels is not
                among the columns; such a row is left all-zero.

        Example:
            .. jupyter-execute::

                from torch_openreml.covariance import DummyMatrix

                rep = ["rep1", "rep2", "rep2"]
                block = ["block1", "block2", "block1"]
                mat = DummyMatrix(rep, block)
                print(mat())
                print(mat.colnames)

            .. jupyter-execute::

                mat = DummyMatrix(rep, block, drop_first=True)
                print(mat())
                print(mat.colnames)

            .. jupyter-execute::

                mat = DummyMatrix(rep, block, levels=[["rep1", "rep2", "rep3"], ["block1", "block2"]])
                print(mat())
                print(mat.colnames)

            .. jupyter-execute::

                mat = DummyMatrix(rep, block, levels=[["rep3", "rep1"], ["block1", "block2"]], lex_order=False)
                print(mat())
                print(mat.colnames)

            .. jupyter-execute::

                mat = DummyMatrix(rep, block, levels=[["rep2", "rep1"], ["block1", "block2"]], lex_order=False, drop_empty_cols=True)
                print(mat())
                print(mat.colnames)
        """
        # Compare against None explicitly: a falsy but valid value such as
        # the device ordinal ``0`` must not be replaced by the default.
        if dtype is None:
            dtype = torch.get_default_dtype()
        if device is None:
            device = torch.get_default_device()

        # --- Validate and normalise the inputs ---------------------------
        if not args:
            raise TypeError("At least one input argument is required!")
        for i, arg in enumerate(args):
            if not isinstance(arg, (list, tuple, pd.Series)):
                raise TypeError(f"Argument {i} must be a list, a tuple or a pandas.Series!")
        factors = [arg.to_list() if isinstance(arg, pd.Series) else arg for arg in args]

        n = len(factors[0])
        for i, factor in enumerate(factors):
            if len(factor) != n:
                raise ValueError(f"Argument {i} must have the same number of elements as other arguments!")

        if not levels:
            levels = [sorted(set(factor)) for factor in factors]
        elif len(levels) != len(factors):
            # With a mismatched count no row could ever match a column, so
            # fail loudly instead of returning an all-zero matrix.
            raise ValueError(
                f"levels must contain one entry per argument: "
                f"expected {len(factors)}, got {len(levels)}!"
            )

        # --- Enumerate the columns ----------------------------------------
        combos = list(product(*levels))
        if lex_order:
            combos.sort()
        colnames = ["⋈".join(combo) for combo in combos]

        # Look rows up by label tuple rather than by joined name, so labels
        # that happen to contain the separator cannot collide. A combination
        # listed more than once (duplicated levels) maps to all its columns.
        columns_of = {}
        for j, combo in enumerate(combos):
            columns_of.setdefault(combo, []).append(j)

        # --- Locate the non-zero cells ------------------------------------
        hit_rows, hit_cols = [], []
        for i, row in enumerate(zip(*factors)):
            cols = columns_of.get(row)
            if cols is None:
                # stacklevel=2 attributes the warning to the caller's line.
                warnings.warn(f"Unknown combination {row} dropped!", RuntimeWarning, stacklevel=2)
                continue
            hit_rows.extend([i] * len(cols))
            hit_cols.extend(cols)

        # --- Select surviving columns (drop_first, then drop_empty_cols) --
        kept = range(1 if drop_first else 0, len(colnames))
        if drop_empty_cols:
            occupied = set(hit_cols)
            kept = [j for j in kept if j in occupied]
        # Original column index -> position in the final matrix.
        position = {j: k for k, j in enumerate(kept)}

        # --- Build the tensor with a single vectorised scatter ------------
        matrix = torch.zeros((n, len(position)), dtype=dtype, device=device)
        cells = [(i, position[j]) for i, j in zip(hit_rows, hit_cols) if j in position]
        if cells:
            row_idx, col_idx = (
                torch.tensor(idx, dtype=torch.long, device=matrix.device)
                for idx in zip(*cells)
            )
            matrix[row_idx, col_idx] = 1

        self._colnames = [colnames[j] for j in kept]
        self._matrix = matrix
        # Second argument is an empty mapping — presumably the trainable
        # parameters of the base class, of which this matrix has none.
        super().__init__((self._matrix.shape[0], self._matrix.shape[1]), {})

    def __call__(self, *args, **kwargs):
        """
        Return the fixed dummy matrix.

        Args:
            *args: Ignored.
            **kwargs: Ignored.

        Returns:
            torch.Tensor: The matrix built at initialisation.
        """
        return self._matrix

    @property
    def colnames(self):
        """list: Column names of the matrix."""
        return self._colnames

    @property
    def repr_dict(self):
        """dict: The matrix shape under the key ``"shape"``."""
        # ``_shape`` is not assigned in this class; presumably it is set by
        # the base-class initialiser from the shape passed to it.
        return {"shape": self._shape}