行特征索引

`RowFeatureIndex`

维护行与其特征之间的映射。

这是一个不规则数据集，其中每行的特征数量和维度可能不同。

属性

名称	类型	描述
`_cumulative_sum_index`	`ndarray`	划分哪些条目对应于给定行的指针。例如，若数组为 [-1, 200, 201]，则第 0 至 199 行对应 `_feature_arr[0]`，第 200 行对应 `_feature_arr[1]`。
`_feature_arr`	`list[dict[str, ndarray]]`	每个数据集的特征字典列表
`_num_genes_per_row`	`list[int]`	跟踪每个数据集的特征长度（基因数量）的列表。
`_labels`	`list[str]`	标签列表
`_version`		数据集的版本

源代码位于 bionemo/scdl/index/row_feature_index.py

class RowFeatureIndex:
    """Maintains a mapping between a row and its features.

    This is a ragged dataset, where the number and dimension of features
    can be different at every row.

    Attributes:
        _cumulative_sum_index: Pointer that delineates which entries
            correspond to a given row. For example, if the array is
            [-1, 200, 201], rows 0 to 199 correspond to _feature_arr[0] and
            row 200 corresponds to _feature_arr[1]. An index without any
            rows holds only the leading -1.
        _feature_arr: list of feature dictionaries for each dataset
        _num_genes_per_row: list that tracks the feature length (number of
            genes) for each dataset. Extracting this information repeatedly
            from self._feature_arr would be cumbersome, which is why it is
            kept as a separate attribute.
        _labels: list of labels, one per entry of _feature_arr (may be None)
        _version: The version of the dataset
    """

    def __init__(self) -> None:
        """Instantiates an empty index."""
        self._cumulative_sum_index: np.ndarray = np.array([-1])
        self._feature_arr: list[dict[str, np.ndarray]] = []
        self._num_genes_per_row: list[int] = []
        self._version = importlib.metadata.version("bionemo.scdl")
        self._labels: list[Optional[str]] = []

    def _get_dataset_id(self, row) -> int:
        """Gets the dataset id for a specified row index.

        Args:
            row (int): The index of the row.

        Returns:
            An int representing the dataset id the row belongs to.
        """
        # Count the boundaries that are <= row. For a non-negative row the
        # leading -1 is always counted, so subtracting one yields the position
        # of the block in _feature_arr that contains the row.
        return int(np.count_nonzero(self._cumulative_sum_index <= row)) - 1

    def version(self) -> str:
        """Returns a version number.

        (following <major>.<minor>.<point> convention).
        """
        return self._version

    def __len__(self) -> int:
        """Returns the number of feature dictionaries stored in the index.

        This is the length of _feature_arr, not the number of rows; use
        number_of_rows() for the latter.
        """
        return len(self._feature_arr)

    def append_features(
        self, n_obs: int, features: dict[str, np.ndarray], num_genes: int, label: Optional[str] = None
    ) -> None:
        """Updates the index with the given features.

        The dict is inserted into the feature array by adding a new span to
        the row lookup index. Additionally, the number of genes for the newly
        added block is recorded.

        Args:
            n_obs (int): The number of rows these features apply to.
            features (dict): Corresponding features.
            num_genes (int): the length of the features for each feature key in features (i.e., number of genes)
            label (str): Label for the features.

        Raises:
            TypeError: If features is a pandas DataFrame instead of a dict.
        """
        if isinstance(features, pd.DataFrame):
            raise TypeError("Expected a dictionary, but received a Pandas DataFrame.")
        # The leading -1 of an empty index is clamped to 0 rows.
        csum = max(self._cumulative_sum_index[-1], 0)

        # If the new feature array is identical to the last one, it is not appended. Instead, the last array accounts
        # for the additional n_obs also (num_genes and label of the existing block are kept).
        if len(self._feature_arr) > 0 and are_dicts_equal(self._feature_arr[-1], features):
            self._cumulative_sum_index[-1] = csum + n_obs
        else:
            self._cumulative_sum_index = np.append(self._cumulative_sum_index, csum + n_obs)
            self._feature_arr.append(features)
            self._num_genes_per_row.append(num_genes)
            self._labels.append(label)

    def lookup(self, row: int, select_features: Optional[list[str]] = None) -> Tuple[list[np.ndarray], str]:
        """Find the features at a given row.

        The row must be non-negative. _cumulative_sum_index contains pointers
        to which rows correspond to given dictionaries. To obtain a specific
        row, we determine where it is located in _cumulative_sum_index and
        then look up that dictionary in _feature_arr.

        Args:
            row (int): The row in the feature index.
            select_features (list[str]): a list of features to select

        Returns:
            list[np.ndarray]: list of np arrays with the feature values in that row of the specified features
            str: optional label for the row

        Raises:
            IndexError: The input row is negative, or it is not smaller than
                the number of rows in the index. It is also raised if there
                are no entries in the index yet.
            ValueError: A name in select_features is not a key of the feature
                dictionary for the row.
        """
        if row < 0:
            raise IndexError(f"Row index {row} is not valid. It must be non-negative.")
        if len(self._cumulative_sum_index) < 2:
            raise IndexError("There are no features to lookup.")

        # Valid rows are 0 .. total - 1, so a row equal to the total is already out of range.
        if row >= self._cumulative_sum_index[-1]:
            raise IndexError(
                f"Row index {row} is larger than number of rows in FeatureIndex ({self._cumulative_sum_index[-1]})."
            )
        d_id = self._get_dataset_id(row)

        # Retrieve the features for the identified value.
        features_dict = self._feature_arr[d_id]

        # If specific features are to be selected, filter the features.
        if select_features is not None:
            features = []
            for feature in select_features:
                if feature not in features_dict:
                    raise ValueError(f"Provided feature column {feature} in select_features not present in dataset.")
                features.append(features_dict[feature])
        else:
            features = list(features_dict.values())

        # Return the features for the identified range.
        return features, self._labels[d_id]

    def number_vars_at_row(self, row: int) -> int:
        """Return number of variables in a given row.

        Args:
            row (int): The row in the feature index.

        Returns:
            The length of the features at the row
        """
        return self._num_genes_per_row[self._get_dataset_id(row)]

    def column_dims(self) -> list[int]:
        """Return the feature length recorded for every feature dictionary.

        Returns:
            The internal list (not a copy) holding one feature length per
            entry of _feature_arr.
        """
        return self._num_genes_per_row

    def number_of_values(self) -> list[int]:
        """Get the number of values held by each block of rows.

        For each block, the number of rows in the block is multiplied by the
        number of genes recorded for it.

        Returns:
            A list with one entry per block of rows, or [0] if the index has
            no feature dictionaries.
        """
        if len(self._feature_arr) == 0:
            return [0]
        offsets = self._cumulative_sum_index
        # Clamp the leading -1 so the first block starts at row 0.
        rows_per_block = offsets[1:] - np.maximum(offsets[:-1], 0)
        return [rows_per_block[i] * self._num_genes_per_row[i] for i in range(len(rows_per_block))]

    def number_of_rows(self) -> int:
        """The number of rows in the index.

        Returns:
            An integer corresponding to the number of rows in the index
        """
        return int(max(self._cumulative_sum_index[-1], 0))

    def concat(self, other_row_index: RowFeatureIndex, fail_on_empty_index: bool = True) -> RowFeatureIndex:
        """Concatenates the other FeatureIndex to this one.

        Returns the new, updated index. Warning: modifies this index in-place.

        Args:
            other_row_index: another RowFeatureIndex
            fail_on_empty_index: A boolean flag that sets whether to raise an
                error if an empty row index is passed in.

        Returns:
            self, the RowFeatureIndex after the concatenations.

        Raises:
            TypeError: if other_row_index is not a RowFeatureIndex
            ValueError: if an empty RowFeatureIndex is passed and the function
                is set to fail in this case.
        """
        if not isinstance(other_row_index, self.__class__):
            raise TypeError("Error: trying to concatenate something that's not a RowFeatureIndex.")

        if fail_on_empty_index and not len(other_row_index._feature_arr) > 0:
            raise ValueError("Error: Cannot append empty FeatureIndex.")

        # append_features expects the number of rows in each block, whereas the
        # other index stores cumulative end offsets: take consecutive
        # differences (with the leading -1 clamped to 0) to recover the sizes.
        block_sizes = np.diff(np.maximum(other_row_index._cumulative_sum_index, 0))
        # Snapshot everything before mutating self, so that concatenating an
        # index with itself iterates over its original contents.
        blocks = list(
            zip(
                block_sizes,
                other_row_index._feature_arr,
                other_row_index._num_genes_per_row,
                other_row_index._labels,
            )
        )
        for n_obs, feats, num_genes, label in blocks:
            self.append_features(int(n_obs), feats, num_genes, label)

        return self

    def save(self, datapath: str) -> None:
        """Saves the RowFeatureIndex to a given path.

        Each feature dictionary is written to its own parquet file; the
        cumulative sum index, labels and version are written as .npy files.

        Args:
            datapath: path to save the index
        """
        Path(datapath).mkdir(parents=True, exist_ok=True)
        # Zero-pad the file index so that sorting file names restores the block order.
        num_digits = len(str(len(self._feature_arr)))
        for index, feature_dict in enumerate(self._feature_arr):
            table = pa.table({column: pa.array(values) for column, values in feature_dict.items()})
            dataframe_str_index = f"{index:0{num_digits}d}"
            pq.write_table(table, f"{datapath}/dataframe_{dataframe_str_index}.parquet")

        np.save(Path(datapath) / "cumulative_sum_index.npy", self._cumulative_sum_index)
        np.save(Path(datapath) / "labels.npy", self._labels)
        np.save(Path(datapath) / "version.npy", np.array(self._version))

    @staticmethod
    def load(datapath: str) -> RowFeatureIndex:
        """Loads the data from datapath.

        Args:
            datapath: the path to load from

        Returns:
            An instance of RowFeatureIndex
        """
        new_row_feat_index = RowFeatureIndex()
        # Parquet files are read in sorted file-name order.
        parquet_data_paths = sorted(Path(datapath).rglob("*.parquet"))
        data_tables = [pq.read_table(parquet_path) for parquet_path in parquet_data_paths]
        new_row_feat_index._feature_arr = [
            {column: table[column].to_numpy() for column in table.column_names} for table in data_tables
        ]
        # The feature length is taken from the first column; a table without columns counts as length 0.
        new_row_feat_index._num_genes_per_row = [
            len(next(iter(feats.values()), ())) for feats in new_row_feat_index._feature_arr
        ]

        new_row_feat_index._cumulative_sum_index = np.load(Path(datapath) / "cumulative_sum_index.npy")
        # NOTE(review): allow_pickle=True unpickles the labels file; only load indexes from trusted locations.
        # Convert back to a list so that append_features/concat can keep appending labels.
        new_row_feat_index._labels = np.load(Path(datapath) / "labels.npy", allow_pickle=True).tolist()
        new_row_feat_index._version = np.load(Path(datapath) / "version.npy").item()
        return new_row_feat_index

`__init__()`

实例化索引。

源代码位于 bionemo/scdl/index/row_feature_index.py

def __init__(self) -> None:
    """Instantiates an empty index.

    Raises:
        importlib.metadata.PackageNotFoundError: If no installed
            distribution named "bionemo.scdl" can be found.
    """
    # Starts as a one-element array holding -1 (no spans recorded yet).
    self._cumulative_sum_index: np.ndarray = np.array([-1])
    # One feature dictionary per recorded span.
    self._feature_arr: list[dict[str, np.ndarray]] = []
    # Feature length recorded alongside each feature dictionary.
    self._num_genes_per_row: list[int] = []
    # Version string read from the installed package metadata.
    self._version = importlib.metadata.version("bionemo.scdl")
    # One (possibly None) label per feature dictionary.
    self._labels: list[Optional[str]] = []

`__len__()`

长度是行数或 RowFeatureIndex 长度。

源代码位于 bionemo/scdl/index/row_feature_index.py

def __len__(self) -> int:
    """Returns the number of feature dictionaries stored in the index.

    This is the length of _feature_arr, which is not necessarily the number
    of rows covered by the index.
    """
    return len(self._feature_arr)

`append_features(n_obs, features, num_genes, label=None)`

使用给定的特征更新索引。

通过向行查找索引添加新跨度，将字典插入特征数组。此外，我们更新新添加行的基因数量。

参数

名称	类型	描述	默认值
`n_obs`	`int`	这些特征在其中出现的次数	必需
`features`	`dict`	对应的特征。	必需
`num_genes`	`int`	特征中每个特征键的特征长度（即，基因数量）	必需
`label`	`str`	特征的标签。	`None`

源代码位于 bionemo/scdl/index/row_feature_index.py

def append_features(
    self, n_obs: int, features: dict[str, np.ndarray], num_genes: int, label: Optional[str] = None
) -> None:
    """Record a block of ``n_obs`` rows that all share ``features``.

    When ``features`` equals the most recently stored dictionary, the last
    span is simply extended; otherwise a new span, dictionary, gene count
    and label are stored.

    Args:
        n_obs (int): The number of rows these features apply to.
        features (dict): The feature dictionary for those rows.
        num_genes (int): The length of each feature array in ``features``
            (i.e., the number of genes).
        label (str): Optional label stored with a newly added dictionary.

    Raises:
        TypeError: If ``features`` is a pandas DataFrame instead of a dict.
    """
    if isinstance(features, pd.DataFrame):
        raise TypeError("Expected a dictionary, but received a Pandas DataFrame.")

    # The -1 held by an empty index is clamped to zero rows.
    rows_so_far = max(self._cumulative_sum_index[-1], 0)
    new_total = rows_so_far + n_obs

    same_as_last = len(self._feature_arr) > 0 and are_dicts_equal(self._feature_arr[-1], features)
    if same_as_last:
        # Extend the existing span instead of storing a duplicate dictionary.
        self._cumulative_sum_index[-1] = new_total
        return

    self._cumulative_sum_index = np.append(self._cumulative_sum_index, new_total)
    self._feature_arr.append(features)
    self._num_genes_per_row.append(num_genes)
    self._labels.append(label)

`column_dims()`

返回所有行中的列数。

返回

类型	描述
`list[int]`	一个列表，其中包含每行中特征的长度

源代码位于 bionemo/scdl/index/row_feature_index.py

def column_dims(self) -> list[int]:
    """Return the feature length recorded for every feature dictionary.

    Returns:
        The internal _num_genes_per_row list itself (not a copy).
    """
    return self._num_genes_per_row

`concat(other_row_index, fail_on_empty_index=True)`

将其他 FeatureIndex 连接到此索引。

返回新的、更新的索引。警告：就地修改此索引。

参数

名称	类型	描述	默认值
`other_row_index`	`RowFeatureIndex`	另一个 RowFeatureIndex	必需
`fail_on_empty_index`	`bool`	一个布尔标志，用于设置在传入空的行索引时是否引发错误。	`True`

返回

类型	描述
`RowFeatureIndex`	self，连接后的 RowIndexFeature。

源代码位于 bionemo/scdl/index/row_feature_index.py

def concat(self, other_row_index: RowFeatureIndex, fail_on_empty_index: bool = True) -> RowFeatureIndex:
    """Concatenates the other FeatureIndex to this one.

    Returns the new, updated index. Warning: modifies this index in-place.

    Args:
        other_row_index: another RowFeatureIndex
        fail_on_empty_index: A boolean flag that sets whether to raise an
            error if an empty row index is passed in.

    Returns:
        self, the RowFeatureIndex after the concatenations.

    Raises:
        TypeError: if other_row_index is not a RowFeatureIndex
        ValueError: if an empty RowFeatureIndex is passed and the function
            is set to fail in this case.
    """
    if not isinstance(other_row_index, self.__class__):
        raise TypeError("Error: trying to concatenate something that's not a RowFeatureIndex.")

    if fail_on_empty_index and not len(other_row_index._feature_arr) > 0:
        raise ValueError("Error: Cannot append empty FeatureIndex.")

    # append_features expects the number of rows in each block, whereas the
    # other index stores cumulative end offsets: take consecutive differences
    # (with the leading -1 clamped to 0) to recover the per-block sizes.
    block_sizes = np.diff(np.maximum(other_row_index._cumulative_sum_index, 0))
    # Snapshot everything before mutating self, so that concatenating an
    # index with itself iterates over its original contents.
    blocks = list(
        zip(
            block_sizes,
            other_row_index._feature_arr,
            other_row_index._num_genes_per_row,
            other_row_index._labels,
        )
    )
    for n_obs, feats, num_genes, label in blocks:
        self.append_features(int(n_obs), feats, num_genes, label)

    return self

`load(datapath)` `staticmethod`

从数据路径加载数据。

参数

名称	类型	描述	默认值
`datapath`	`str`	要加载的路径	必需

返回：RowFeatureIndex 的实例

源代码位于 bionemo/scdl/index/row_feature_index.py

@staticmethod
def load(datapath: str) -> RowFeatureIndex:
    """Loads the data from datapath.

    Args:
        datapath: the path to load from

    Returns:
        An instance of RowFeatureIndex
    """
    new_row_feat_index = RowFeatureIndex()
    # Parquet files are read in sorted file-name order.
    parquet_data_paths = sorted(Path(datapath).rglob("*.parquet"))
    data_tables = [pq.read_table(parquet_path) for parquet_path in parquet_data_paths]
    new_row_feat_index._feature_arr = [
        {column: table[column].to_numpy() for column in table.column_names} for table in data_tables
    ]
    # The feature length is taken from the first column; a table without columns counts as length 0.
    new_row_feat_index._num_genes_per_row = [
        len(next(iter(feats.values()), ())) for feats in new_row_feat_index._feature_arr
    ]

    new_row_feat_index._cumulative_sum_index = np.load(Path(datapath) / "cumulative_sum_index.npy")
    # NOTE(review): allow_pickle=True unpickles the labels file; only load indexes from trusted locations.
    # Convert back to a list so that labels can still be appended to the loaded index.
    new_row_feat_index._labels = np.load(Path(datapath) / "labels.npy", allow_pickle=True).tolist()
    new_row_feat_index._version = np.load(Path(datapath) / "version.npy").item()
    return new_row_feat_index

`lookup(row, select_features=None)`

查找给定行的特征。

假定行号为非负数。`_cumulative_sum_index` 包含指向哪些行对应于给定字典的指针。为了获得特定行，我们先确定它在 `_cumulative_sum_index` 中的位置，然后在 `_feature_arr` 中查找对应的字典。

参数：`row` (int)：特征索引中的行。`select_features` (list[str])：要选择的特征列表。

返回：list[np.ndarray]：np 数组列表，其中包含该行中指定特征的特征值；str：该行的可选标签。

引发：IndexError：输入行为负数，或超出索引中的最大行号时引发。如果索引中还没有任何条目，也会引发此错误。

源代码位于 bionemo/scdl/index/row_feature_index.py

def lookup(self, row: int, select_features: Optional[list[str]] = None) -> Tuple[list[np.ndarray], str]:
    """Find the features at a given row.

    The row must be non-negative. _cumulative_sum_index contains pointers to
    which rows correspond to given dictionaries. To obtain a specific row, we
    determine where it is located in _cumulative_sum_index and then look up
    that dictionary in _feature_arr.

    Args:
        row (int): The row in the feature index.
        select_features (list[str]): a list of features to select

    Returns:
        list[np.ndarray]: list of np arrays with the feature values in that row of the specified features
        str: optional label for the row

    Raises:
        IndexError: The input row is negative, or it is not smaller than the
            number of rows in the index. It is also raised if there are no
            entries in the index yet.
        ValueError: A name in select_features is not a key of the feature
            dictionary for the row.
    """
    if row < 0:
        raise IndexError(f"Row index {row} is not valid. It must be non-negative.")
    if len(self._cumulative_sum_index) < 2:
        raise IndexError("There are no features to lookup.")

    # Valid rows are 0 .. total - 1, so a row equal to the total is already out of range.
    if row >= self._cumulative_sum_index[-1]:
        raise IndexError(
            f"Row index {row} is larger than number of rows in FeatureIndex ({self._cumulative_sum_index[-1]})."
        )
    d_id = self._get_dataset_id(row)

    # Retrieve the features for the identified value.
    features_dict = self._feature_arr[d_id]

    # If specific features are to be selected, filter the features.
    if select_features is not None:
        features = []
        for feature in select_features:
            if feature not in features_dict:
                raise ValueError(f"Provided feature column {feature} in select_features not present in dataset.")
            features.append(features_dict[feature])
    else:
        features = list(features_dict.values())

    # Return the features for the identified range.
    return features, self._labels[d_id]

`number_of_rows()`

索引中的行数。

返回

类型	描述
`int`	与索引中的行数相对应的整数

源代码位于 bionemo/scdl/index/row_feature_index.py

def number_of_rows(self) -> int:
    """Return how many rows the index covers.

    Returns:
        The last entry of the cumulative sum index as an int, or 0 when that
        entry is not positive (as in an index that holds only -1).
    """
    last_offset = self._cumulative_sum_index[-1]
    if last_offset > 0:
        return int(last_offset)
    return 0

`number_of_values()`

获取数组中值的总数。

对于每一行，都计算基因数量。

返回

类型	描述
`list[int]`	一个列表，其中包含每行块中特征的长度

源代码位于 bionemo/scdl/index/row_feature_index.py

def number_of_values(self) -> list[int]:
    """Get the number of values held by each block of rows.

    For every block, the number of rows in the block is multiplied by the
    number of genes recorded for it.

    Returns:
        A list with one entry per block of rows, or [0] when the index has
        no feature dictionaries.
    """
    if len(self._feature_arr) == 0:
        return [0]

    offsets = self._cumulative_sum_index
    # Clamp the leading -1 so that the first block starts at row 0.
    block_starts = np.maximum(offsets[:-1], 0)
    rows_per_block = offsets[1:] - block_starts
    return [rows_per_block[i] * self._num_genes_per_row[i] for i in range(len(rows_per_block))]

`number_vars_at_row(row)`

返回给定行中的变量数。

参数

名称	类型	描述	默认值
`row`	`int`	特征索引中的行。	必需

返回

类型	描述
`int`	该行特征的长度

源代码位于 bionemo/scdl/index/row_feature_index.py

def number_vars_at_row(self, row: int) -> int:
    """Return the number of variables recorded for the block containing a row.

    Args:
        row (int): The row in the feature index.

    Returns:
        The feature length stored for the block that the row belongs to.
    """
    block_id = self._get_dataset_id(row)
    return self._num_genes_per_row[block_id]

`save(datapath)`

将 RowFeatureIndex 保存到给定路径。

参数

名称	类型	描述	默认值
`datapath`	`str`	保存索引的路径	必需

源代码位于 bionemo/scdl/index/row_feature_index.py

def save(self, datapath: str) -> None:
    """Write the index to a directory, creating it if necessary.

    Each feature dictionary goes to its own parquet file; the cumulative sum
    index, labels and version are written as .npy files.

    Args:
        datapath: path to save the index
    """
    out_dir = Path(datapath)
    out_dir.mkdir(parents=True, exist_ok=True)

    # Zero-pad the position so that sorting the file names keeps the block order.
    width = len(str(len(self._feature_arr)))
    for position, block in enumerate(self._feature_arr):
        columns = {name: pa.array(arr) for name, arr in block.items()}
        padded_position = f"{position:0{width}d}"
        pq.write_table(pa.table(columns), f"{datapath}/dataframe_{padded_position}.parquet")

    np.save(out_dir / "cumulative_sum_index.npy", self._cumulative_sum_index)
    np.save(out_dir / "labels.npy", self._labels)
    np.save(out_dir / "version.npy", np.array(self._version))

`version()`

返回版本号。

（遵循 `<major>.<minor>.<point>` 惯例）。

源代码位于 bionemo/scdl/index/row_feature_index.py

def version(self) -> str:
    """Returns the version stored on this index.

    (following <major>.<minor>.<point> convention).

    Returns:
        The value of the _version attribute.
    """
    return self._version

`are_dicts_equal(dict1, dict2)`

比较两个具有字符串键和 numpy.ndarray 值的字典。

参数

名称	类型	描述	默认值
`dict1`	`dict[str, ndarray]`	要比较的第一个字典。	必需
`dict2`	`dict[str, ndarray]`	要比较的第二个字典。	必需

返回

名称	类型	描述
`bool`	`bool`	如果字典具有相同的键并且所有对应的 numpy 数组都相等，则为 True；否则为 False。

源代码位于 bionemo/scdl/index/row_feature_index.py

def are_dicts_equal(dict1: dict[str, np.ndarray], dict2: dict[str, np.ndarray]) -> bool:
    """Check whether two dictionaries of numpy arrays hold the same content.

    Args:
        dict1 (dict[str, np.ndarray]): The first dictionary to compare.
        dict2 (dict[str, np.ndarray]): The second dictionary to compare.

    Returns:
        bool: True if both dictionaries have the same keys and every pair of
              arrays under the same key compares equal with np.array_equal;
              False otherwise.
    """
    if dict1.keys() != dict2.keys():
        return False
    for key, left in dict1.items():
        if not np.array_equal(left, dict2[key]):
            return False
    return True

行特征索引

RowFeatureIndex

__init__()

__len__()

append_features(n_obs, features, num_genes, label=None)

column_dims()

concat(other_row_index, fail_on_empty_index=True)

load(datapath) staticmethod

lookup(row, select_features=None)

number_of_rows()

number_of_values()

number_vars_at_row(row)

save(datapath)

version()

are_dicts_equal(dict1, dict2)

`RowFeatureIndex`

`__init__()`

`__len__()`

`append_features(n_obs, features, num_genes, label=None)`

`column_dims()`

`concat(other_row_index, fail_on_empty_index=True)`

`load(datapath)` `staticmethod`

`lookup(row, select_features=None)`

`number_of_rows()`

`number_of_values()`

`number_vars_at_row(row)`

`save(datapath)`

`version()`

`are_dicts_equal(dict1, dict2)`