跳到内容

行特征索引

RowFeatureIndex

维护行与其特征之间的映射。

这是一个不规则数据集,其中每行的特征数量和维度可能不同。

属性

名称 类型 描述
_cumulative_sum_index 数组

指示哪些条目对应于给定行的指针。例如,如果数组为 [-1, 200, 201],则第 0 到 199 行对应 _feature_arr[0],第 200 行对应 _feature_arr[1]。

_feature_arr list[dict[str, ndarray]]

每个数据集的特征字典列表

_num_genes_per_row list[int]

跟踪每个数据集的特征长度(基因数量)的列表。

_labels list[str]

标签列表

_version

数据集的版本

源代码位于 bionemo/scdl/index/row_feature_index.py
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
class RowFeatureIndex:
    """Maintains a mapping between a row and its features.

    This is a ragged dataset, where the number and dimension of features
    can be different at every row.

    Attributes:
        _cumulative_sum_index: Array of cumulative row counts that delineates
            which rows correspond to which feature dictionary. It always
            starts with the sentinel -1. For example, if the array is
            [-1, 200, 201], rows 0 to 199 correspond to _feature_arr[0] and
            row 200 corresponds to _feature_arr[1].
        _feature_arr: list of feature dictionaries, one per block of rows.
        _num_genes_per_row: list that tracks the feature length (number of
            genes) for each block. Extracting this information repeatedly
            from self._feature_arr would be cumbersome, which is why it is
            stored separately.
        _labels: list of labels, one per block (an entry may be None).
        _version: The version of the dataset.
    """

    def __init__(self) -> None:
        """Instantiates an empty index."""
        # The leading -1 is a sentinel; real cumulative counts are appended after it.
        self._cumulative_sum_index: np.ndarray = np.array([-1])
        self._feature_arr: list[dict[str, np.ndarray]] = []
        self._num_genes_per_row: list[int] = []
        self._version = importlib.metadata.version("bionemo.scdl")
        self._labels: list[Optional[str]] = []

    def _get_dataset_id(self, row) -> int:
        """Gets the dataset id for a specified row index.

        Args:
            row (int): The index of the row.

        Returns:
            An int representing the dataset id the row belongs to.
        """
        # Every cumulative count <= row closes a block that ends before `row`.
        # The -1 sentinel always qualifies, so subtracting one yields the
        # position of the block containing `row`.
        return int(np.count_nonzero(self._cumulative_sum_index <= row)) - 1

    def version(self) -> str:
        """Returns a version number.

        (following <major>.<minor>.<point> convention).
        """
        return self._version

    def __len__(self) -> int:
        """Returns the number of feature dictionaries stored in the index.

        This counts entries of _feature_arr (one per block of rows), not
        individual rows; use number_of_rows() for the latter.
        """
        return len(self._feature_arr)

    def append_features(
        self, n_obs: int, features: dict[str, np.ndarray], num_genes: int, label: Optional[str] = None
    ) -> None:
        """Updates the index with the given features.

        The dict is inserted into the feature array by adding a new span to
        the row lookup index. Additionally, the number of genes for the newly
        added block is recorded.

        Args:
            n_obs (int): The number of rows these features apply to.
            features (dict): Corresponding features.
            num_genes (int): The length of the features for each feature key in features (i.e., number of genes).
            label (str): Label for the features.

        Raises:
            TypeError: If features is a Pandas DataFrame instead of a dictionary.
        """
        if isinstance(features, pd.DataFrame):
            raise TypeError("Expected a dictionary, but received a Pandas DataFrame.")
        # max(..., 0) maps the -1 sentinel of an empty index to a running total of 0.
        csum = max(self._cumulative_sum_index[-1], 0)

        # If the new feature dict is identical to the last one, it is not appended. Instead, the last
        # block is extended to account for the additional n_obs rows (the existing label is kept).
        if len(self._feature_arr) > 0 and are_dicts_equal(self._feature_arr[-1], features):
            self._cumulative_sum_index[-1] = csum + n_obs
        else:
            self._cumulative_sum_index = np.append(self._cumulative_sum_index, csum + n_obs)
            self._feature_arr.append(features)
            self._num_genes_per_row.append(num_genes)
            self._labels.append(label)

    def lookup(self, row: int, select_features: Optional[list[str]] = None) -> Tuple[list[np.ndarray], str]:
        """Find the features at a given row.

        The row must be non-negative. _cumulative_sum_index contains pointers
        to which rows correspond to given dictionaries. To obtain a specific
        row, we determine where it is located in _cumulative_sum_index and
        then look up that dictionary in _feature_arr.

        Args:
            row (int): The row in the feature index.
            select_features (list[str]): a list of features to select

        Returns:
            list[np.ndarray]: list of np arrays with the feature values in that row of the specified features
            str: optional label for the row

        Raises:
            IndexError: If the row is negative, is not smaller than the number
                of rows in the index, or if the index has no entries yet.
            ValueError: If a name in select_features is not a feature of the
                block containing the row.
        """
        if row < 0:
            raise IndexError(f"Row index {row} is not valid. It must be non-negative.")
        if len(self._cumulative_sum_index) < 2:
            raise IndexError("There are no features to lookup.")

        # Valid rows are 0 .. n_rows - 1, so row == n_rows is already out of range.
        n_rows = self._cumulative_sum_index[-1]
        if row >= n_rows:
            raise IndexError(f"Row index {row} is larger than number of rows in FeatureIndex ({n_rows}).")
        d_id = self._get_dataset_id(row)

        # Retrieve the features for the identified block.
        features_dict = self._feature_arr[d_id]

        # If specific features are to be selected, filter the features.
        if select_features is not None:
            features = []
            for feature in select_features:
                if feature not in features_dict:
                    raise ValueError(f"Provided feature column {feature} in select_features not present in dataset.")
                features.append(features_dict[feature])
        else:
            features = list(features_dict.values())

        return features, self._labels[d_id]

    def number_vars_at_row(self, row: int) -> int:
        """Return number of variables in a given row.

        Args:
            row (int): The row in the feature index.

        Returns:
            The length of the features at the row
        """
        return self._num_genes_per_row[self._get_dataset_id(row)]

    def column_dims(self) -> list[int]:
        """Return the feature length of every block of rows.

        Returns:
            The internal list holding the feature length (number of genes)
            of each block; it is returned without copying.
        """
        return self._num_genes_per_row

    def number_of_values(self) -> list[int]:
        """Get the number of values in each block of rows.

        For each block, the number of rows is multiplied by the number of
        genes of that block.

        Returns:
            A list with one entry per block, or [0] if the index is empty.
        """
        if len(self._feature_arr) == 0:
            return [0]
        # max(..., 0) turns the -1 sentinel into 0 when computing the first block's size.
        rows = [
            self._cumulative_sum_index[i] - max(self._cumulative_sum_index[i - 1], 0)
            for i in range(1, len(self._cumulative_sum_index))
        ]
        return [n_rows * self._num_genes_per_row[i] for i, n_rows in enumerate(rows)]

    def number_of_rows(self) -> int:
        """The number of rows in the index.

        Returns:
            An integer corresponding to the number of rows in the index
        """
        return int(max(self._cumulative_sum_index[-1], 0))

    def concat(self, other_row_index: RowFeatureIndex, fail_on_empty_index: bool = True) -> RowFeatureIndex:
        """Concatenates the other FeatureIndex to this one.

        Returns the new, updated index. Warning: modifies this index in-place.

        Args:
            other_row_index: another RowFeatureIndex
            fail_on_empty_index: A boolean flag that sets whether to raise an
                error if an empty row index is passed in.

        Returns:
            self, the RowFeatureIndex after the concatenation.

        Raises:
            TypeError: If other_row_index is not a RowFeatureIndex.
            ValueError: If an empty RowFeatureIndex is passed and the function
                is set to fail in this case.
        """
        if not isinstance(other_row_index, self.__class__):
            raise TypeError("Error: trying to concatenate something that's not a RowFeatureIndex.")

        if fail_on_empty_index and len(other_row_index._feature_arr) == 0:
            raise ValueError("Error: Cannot append empty FeatureIndex.")

        # append_features expects the number of rows in each block, whereas
        # _cumulative_sum_index stores running totals, so take the differences
        # (clamping the -1 sentinel to 0). Everything is snapshotted up front
        # so that concatenating an index with itself reads consistent data.
        rows_per_block = np.diff(np.maximum(other_row_index._cumulative_sum_index, 0))
        other_features = list(other_row_index._feature_arr)
        other_labels = list(other_row_index._labels)
        other_num_genes = list(other_row_index._num_genes_per_row)
        for i, feats in enumerate(other_features):
            self.append_features(rows_per_block[i], feats, other_num_genes[i], other_labels[i])

        return self

    def save(self, datapath: str) -> None:
        """Saves the RowFeatureIndex to a given path.

        Each feature dictionary is written to its own parquet file; the
        cumulative index, labels and version are written as .npy files.

        Args:
            datapath: path to save the index
        """
        Path(datapath).mkdir(parents=True, exist_ok=True)
        # Zero-pad the file index so lexicographic order matches block order on load.
        num_digits = len(str(len(self._feature_arr)))
        for index, feature_dict in enumerate(self._feature_arr):
            table = pa.table({column: pa.array(values) for column, values in feature_dict.items()})
            dataframe_str_index = f"{index:0{num_digits}d}"
            pq.write_table(table, f"{datapath}/dataframe_{dataframe_str_index}.parquet")

        np.save(Path(datapath) / "cumulative_sum_index.npy", self._cumulative_sum_index)
        np.save(Path(datapath) / "labels.npy", self._labels)
        np.save(Path(datapath) / "version.npy", np.array(self._version))

    @staticmethod
    def load(datapath: str) -> RowFeatureIndex:
        """Loads the data from datapath.

        Args:
            datapath: the path to load from

        Returns:
            An instance of RowFeatureIndex
        """
        index_dir = Path(datapath)
        new_row_feat_index = RowFeatureIndex()
        # File names are zero-padded on save, so sorting restores block order.
        parquet_data_paths = sorted(index_dir.rglob("*.parquet"))
        data_tables = [pq.read_table(parquet_path) for parquet_path in parquet_data_paths]
        new_row_feat_index._feature_arr = [
            {column: table[column].to_numpy() for column in table.column_names} for table in data_tables
        ]
        # The feature length is taken from the first column; a block without columns has length 0.
        new_row_feat_index._num_genes_per_row = [
            len(next(iter(feats.values()), ())) for feats in new_row_feat_index._feature_arr
        ]

        new_row_feat_index._cumulative_sum_index = np.load(index_dir / "cumulative_sum_index.npy")
        # NOTE(review): allow_pickle=True unpickles the labels file; only load indexes from trusted locations.
        # Convert back to a list so that append_features can keep appending labels after a load.
        new_row_feat_index._labels = np.load(index_dir / "labels.npy", allow_pickle=True).tolist()
        new_row_feat_index._version = np.load(index_dir / "version.npy").item()
        return new_row_feat_index

__init__()

实例化索引。

源代码位于 bionemo/scdl/index/row_feature_index.py
64
65
66
67
68
69
70
def __init__(self) -> None:
    """Instantiates the index.

    All per-block containers start out empty; the version is read from the
    installed "bionemo.scdl" package metadata.
    """
    # The leading -1 is a sentinel entry; cumulative row counts are appended after it.
    self._cumulative_sum_index: np.ndarray = np.array([-1])
    # One feature dictionary per block of rows.
    self._feature_arr: list[dict[str, np.ndarray]] = []
    # Feature length (number of genes) of each block.
    self._num_genes_per_row: list[int] = []
    self._version = importlib.metadata.version("bionemo.scdl")
    # One label per block.
    self._labels: list[str] = []

__len__()

长度是行数或 RowFeatureIndex 长度。

源代码位于 bionemo/scdl/index/row_feature_index.py
95
96
97
def __len__(self) -> int:
    """Return the number of feature dictionaries held by the index.

    This is the length of ``_feature_arr`` (one entry per block of rows),
    not the number of individual rows.
    """
    feature_blocks = self._feature_arr
    return len(feature_blocks)

append_features(n_obs, features, num_genes, label=None)

使用给定的特征更新索引。

通过向行查找索引添加新跨度,将字典插入特征数组。 此外,我们更新新添加行的基因数量。

参数

名称 类型 描述 默认值
n_obs int

这些特征在类中出现的次数。

必需
features dict

对应的特征。

必需
num_genes int

特征中每个特征键的特征长度(即,基因数量)

必需
label str

特征的标签。

源代码位于 bionemo/scdl/index/row_feature_index.py
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
def append_features(
    self, n_obs: int, features: dict[str, np.ndarray], num_genes: int, label: Optional[str] = None
) -> None:
    """Register a block of rows that share one feature dictionary.

    When the dictionary equals the most recently stored one, the last block
    is simply extended by ``n_obs`` rows. Otherwise a new block is opened and
    its features, gene count and label are recorded.

    Args:
        n_obs (int): The number of rows the features apply to.
        features (dict): Corresponding features.
        num_genes (int): The length of the features for each feature key in features (i.e., number of genes).
        label (str): Label for the features.

    Raises:
        TypeError: If a Pandas DataFrame is passed instead of a dictionary.
    """
    if isinstance(features, pd.DataFrame):
        raise TypeError("Expected a dictionary, but received a Pandas DataFrame.")

    # The -1 sentinel of an empty index counts as zero rows so far.
    rows_so_far = max(self._cumulative_sum_index[-1], 0)

    repeats_last_block = len(self._feature_arr) > 0 and are_dicts_equal(self._feature_arr[-1], features)
    if repeats_last_block:
        # Same features as the previous block: grow that block in place.
        self._cumulative_sum_index[-1] = rows_so_far + n_obs
        return

    self._cumulative_sum_index = np.append(self._cumulative_sum_index, rows_so_far + n_obs)
    self._feature_arr.append(features)
    self._num_genes_per_row.append(num_genes)
    self._labels.append(label)

column_dims()

返回所有行中的列数。

返回

类型 描述
list[int]

一个列表,其中包含每行中特征的长度

源代码位于 bionemo/scdl/index/row_feature_index.py
185
186
187
188
189
190
191
192
193
194
def column_dims(self) -> list[int]:
    """Return the feature length recorded for every block of rows.

    Returns:
        The internal ``_num_genes_per_row`` list (not a copy), holding the
        number of genes of each block.
    """
    genes_per_block = self._num_genes_per_row
    return genes_per_block

concat(other_row_index, fail_on_empty_index=True)

将其他 FeatureIndex 连接到此索引。

返回新的、更新的索引。 警告:就地修改此索引。

参数

名称 类型 描述 默认值
other_row_index RowFeatureIndex

另一个 RowFeatureIndex

必需
fail_on_empty_index bool

一个布尔标志,用于设置在传入空的行索引时是否引发错误。

True

返回

类型 描述
RowFeatureIndex

self,连接后的 RowIndexFeature。

源代码位于 bionemo/scdl/index/row_feature_index.py
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
def concat(self, other_row_index: RowFeatureIndex, fail_on_empty_index: bool = True) -> RowFeatureIndex:
    """Concatenates the other FeatureIndex to this one.

    Returns the new, updated index. Warning: modifies this index in-place.

    Args:
        other_row_index: another RowFeatureIndex
        fail_on_empty_index: A boolean flag that sets whether to raise an
            error if an empty row index is passed in.

    Returns:
        self, the RowFeatureIndex after the concatenation.

    Raises:
        TypeError: If other_row_index is not a RowFeatureIndex.
        ValueError: If an empty RowFeatureIndex is passed and the function is
            set to fail in this case.
    """
    if not isinstance(other_row_index, self.__class__):
        raise TypeError("Error: trying to concatenate something that's not a RowFeatureIndex.")

    if fail_on_empty_index and len(other_row_index._feature_arr) == 0:
        raise ValueError("Error: Cannot append empty FeatureIndex.")

    # append_features expects the number of rows in each block, whereas
    # _cumulative_sum_index stores running totals. Passing the totals directly
    # over-counts every block after the first, so take the differences
    # (clamping the leading -1 sentinel to 0). Everything is snapshotted up
    # front so that concatenating an index with itself reads consistent data.
    rows_per_block = np.diff(np.maximum(other_row_index._cumulative_sum_index, 0))
    other_features = list(other_row_index._feature_arr)
    other_labels = list(other_row_index._labels)
    other_num_genes = list(other_row_index._num_genes_per_row)
    for i, feats in enumerate(other_features):
        self.append_features(rows_per_block[i], feats, other_num_genes[i], other_labels[i])

    return self

load(datapath) staticmethod

从数据路径加载数据。

参数

名称 类型 描述 默认值
datapath str

要加载的路径

必需

返回:RowFeatureIndex 的实例

源代码位于 bionemo/scdl/index/row_feature_index.py
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
@staticmethod
def load(datapath: str) -> RowFeatureIndex:
    """Loads the data from datapath.

    Reads every parquet file below datapath as one feature dictionary (in
    sorted file-name order) and restores the cumulative index, labels and
    version from their .npy files.

    Args:
        datapath: the path to load from

    Returns:
        An instance of RowFeatureIndex
    """
    index_dir = Path(datapath)
    new_row_feat_index = RowFeatureIndex()
    parquet_data_paths = sorted(index_dir.rglob("*.parquet"))
    data_tables = [pq.read_table(parquet_path) for parquet_path in parquet_data_paths]
    new_row_feat_index._feature_arr = [
        {column: table[column].to_numpy() for column in table.column_names} for table in data_tables
    ]
    # The feature length is taken from the first column; a table without
    # columns yields length 0 instead of raising StopIteration.
    new_row_feat_index._num_genes_per_row = [
        len(next(iter(feats.values()), ())) for feats in new_row_feat_index._feature_arr
    ]

    new_row_feat_index._cumulative_sum_index = np.load(index_dir / "cumulative_sum_index.npy")
    # NOTE(review): allow_pickle=True unpickles the labels file; only load indexes from trusted locations.
    # Convert the array back to a list so that labels can still be appended after loading.
    new_row_feat_index._labels = np.load(index_dir / "labels.npy", allow_pickle=True).tolist()
    new_row_feat_index._version = np.load(index_dir / "version.npy").item()
    return new_row_feat_index

lookup(row, select_features=None)

查找给定行的特征。

假定行索引为非负数。_cumulative_sum_index 包含指针,指示哪些行对应于给定的字典。为了获得特定行,我们先确定它在 _cumulative_sum_index 中的位置,然后在 _feature_arr 中查找对应的字典。

参数:
row (int):特征索引中的行。
select_features (list[str]):要选择的特征列表。

返回:
list[np.ndarray]:np 数组列表,其中包含该行中指定特征的特征值。
str:行的可选标签。

引发:
IndexError:输入行为负数,或超出索引中的最大行号时引发。如果索引中还没有条目,也会引发此错误。

源代码位于 bionemo/scdl/index/row_feature_index.py
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
def lookup(self, row: int, select_features: Optional[list[str]] = None) -> Tuple[list[np.ndarray], str]:
    """Find the features at a given row.

    The row must be non-negative. _cumulative_sum_index contains pointers to
    which rows correspond to given dictionaries. To obtain a specific row, we
    determine where it is located in _cumulative_sum_index and then look up
    that dictionary in _feature_arr.

    Args:
        row (int): The row in the feature index.
        select_features (list[str]): a list of features to select

    Returns:
        list[np.ndarray]: list of np arrays with the feature values in that row of the specified features
        str: optional label for the row

    Raises:
        IndexError: If the row is negative, is not smaller than the number of
            rows in the index, or if there are no entries in the index yet.
        ValueError: If a name in select_features is not a feature of the
            block containing the row.
    """
    if row < 0:
        raise IndexError(f"Row index {row} is not valid. It must be non-negative.")
    if len(self._cumulative_sum_index) < 2:
        raise IndexError("There are no features to lookup.")

    # The last cumulative entry is the total number of rows, so valid rows are
    # 0 .. n_rows - 1 and row == n_rows must be rejected here rather than
    # surfacing later as an unexplained list-index error.
    n_rows = self._cumulative_sum_index[-1]
    if row >= n_rows:
        raise IndexError(f"Row index {row} is larger than number of rows in FeatureIndex ({n_rows}).")
    d_id = self._get_dataset_id(row)

    # Retrieve the features for the identified block.
    features_dict = self._feature_arr[d_id]

    # If specific features are to be selected, filter the features.
    if select_features is not None:
        features = []
        for feature in select_features:
            if feature not in features_dict:
                raise ValueError(f"Provided feature column {feature} in select_features not present in dataset.")
            features.append(features_dict[feature])
    else:
        features = list(features_dict.values())

    return features, self._labels[d_id]

number_of_rows()

索引中的行数。

返回

类型 描述
int

与索引中的行数相对应的整数

源代码位于 bionemo/scdl/index/row_feature_index.py
214
215
216
217
218
219
220
def number_of_rows(self) -> int:
    """Return the total number of rows tracked by the index.

    Returns:
        The last cumulative row count as a plain int; 0 when only the -1
        sentinel is present.
    """
    last_total = self._cumulative_sum_index[-1]
    if last_total > 0:
        return int(last_total)
    return 0

number_of_values()

获取数组中值的总数。

对于每一行,都计算基因数量。

返回

类型 描述
list[int]

一个列表,其中包含每行块中特征的长度

源代码位于 bionemo/scdl/index/row_feature_index.py
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
def number_of_values(self) -> list[int]:
    """Count the values contributed by each block of rows.

    A block's value count is its number of rows multiplied by its number of
    genes.

    Returns:
        A list with one count per block, or [0] when no features are stored.
    """
    if len(self._feature_arr) == 0:
        return [0]

    boundaries = self._cumulative_sum_index
    totals = []
    for block_id in range(len(boundaries) - 1):
        # Clamp the leading -1 sentinel to 0 when sizing the first block.
        block_rows = boundaries[block_id + 1] - max(boundaries[block_id], 0)
        totals.append(block_rows * self._num_genes_per_row[block_id])
    return totals

number_vars_at_row(row)

返回给定行中的变量数。

参数

名称 类型 描述 默认值
row int

特征索引中的行。

必需

返回

类型 描述
int

该行特征的长度

源代码位于 bionemo/scdl/index/row_feature_index.py
174
175
176
177
178
179
180
181
182
183
def number_vars_at_row(self, row: int) -> int:
    """Return the feature length (number of genes) that applies to a row.

    Args:
        row (int): The row in the feature index.

    Returns:
        The entry of ``_num_genes_per_row`` for the block containing the row.
    """
    block_id = self._get_dataset_id(row)
    return self._num_genes_per_row[block_id]

save(datapath)

将 RowFeatureIndex 保存到给定路径。

参数

名称 类型 描述 默认值
datapath str

保存索引的路径

必需
源代码位于 bionemo/scdl/index/row_feature_index.py
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
def save(self, datapath: str) -> None:
    """Write the index to a directory, creating it if necessary.

    Every feature dictionary becomes its own parquet file, named with a
    zero-padded block number; the cumulative index, labels and version are
    stored next to them as .npy files.

    Args:
        datapath: path to save the index
    """
    out_dir = Path(datapath)
    out_dir.mkdir(parents=True, exist_ok=True)

    # Pad the block number so that file names sort in block order.
    pad_width = len(str(len(self._feature_arr)))
    for position, block_features in enumerate(self._feature_arr):
        columns = {}
        for name, values in block_features.items():
            columns[name] = pa.array(values)
        dataframe_str_index = f"{position:0{pad_width}d}"
        pq.write_table(pa.table(columns), f"{datapath}/dataframe_{dataframe_str_index}.parquet")

    np.save(out_dir / "cumulative_sum_index.npy", self._cumulative_sum_index)
    np.save(out_dir / "labels.npy", self._labels)
    np.save(out_dir / "version.npy", np.array(self._version))

version()

返回版本号。

(遵循 <major>.<minor>.<point> 约定)。

源代码位于 bionemo/scdl/index/row_feature_index.py
88
89
90
91
92
93
def version(self) -> str:
    """Return the version string stored on the index.

    The value follows the <major>.<minor>.<point> convention.
    """
    stored_version = self._version
    return stored_version

are_dicts_equal(dict1, dict2)

比较两个具有字符串键和 numpy.ndarray 值的字典。

参数

名称 类型 描述 默认值
dict1 dict[str, ndarray]

要比较的第一个字典。

必需
dict2 dict[str, ndarray]

要比较的第二个字典。

必需

返回

名称 类型 描述
bool bool

如果字典具有相同的键并且所有对应的 numpy 数组都相等,则为 True;否则为 False。

源代码位于 bionemo/scdl/index/row_feature_index.py
31
32
33
34
35
36
37
38
39
40
41
42
def are_dicts_equal(dict1: dict[str, np.ndarray], dict2: dict[str, np.ndarray]) -> bool:
    """Check whether two dictionaries of numpy arrays hold the same data.

    Args:
        dict1 (dict[str, np.ndarray]): The first dictionary to compare.
        dict2 (dict[str, np.ndarray]): The second dictionary to compare.

    Returns:
        bool: True when both dictionaries have the same set of keys and the
              arrays stored under every key are equal; False otherwise.
    """
    # Different key sets can never match, so skip the array comparisons.
    if dict1.keys() != dict2.keys():
        return False
    for key, left_values in dict1.items():
        if not np.array_equal(left_values, dict2[key]):
            return False
    return True