Single Cell Collection

FileNames

Bases: str, Enum

Names of files that are generated in SingleCellCollection.

Source code in bionemo/scdl/io/single_cell_collection.py
class FileNames(str, Enum):
    """Names of files that are generated in SingleCellCollection."""

    VERSION = "version.json"
    METADATA = "metadata.json"
    FEATURES = "features"

SingleCellCollection

Bases: SingleCellRowDatasetCore

A collection of one or more SingleCellMemMapDatasets.

SingleCellCollection supports most of the functionality of the SingleCellDataSet API. A SingleCellCollection can be converted to a single SingleCellMemMapDataset, and it enables the use of heterogeneous datasets, such as those composed of many AnnData files.
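A minimal end-to-end sketch (the directory names are hypothetical; the import path follows the source location shown on this page):

from bionemo.scdl.io.single_cell_collection import SingleCellCollection

# Build a collection backed by memory-mapped copies of several AnnData files.
coll = SingleCellCollection("scdl_collection")
coll.load_h5ad_multi("h5ads/", max_workers=4)

# Inspect the combined dimensions, then merge everything into one dataset on disk.
num_rows, num_vars = coll.shape()
coll.flatten("scdl_flat")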

Attributes

Name            Type                                Description
_version        str                                 The version of the dataset.
data_path       str                                 The directory where the collection of datasets is stored.
_feature_index  RowFeatureIndex                     The corresponding RowFeatureIndex where the features are stored.
fname_to_mmap   Dict[str, SingleCellMemMapDataset]  A dictionary mapping each dataset path to its SingleCellMemMapDataset object.

A ragged collection is one where the datasets have different column dimensions: False means not ragged (all SingleCellMemMapDatasets share the same column dimension); True means ragged (the column dimensions vary).

Source code in bionemo/scdl/io/single_cell_collection.py
class SingleCellCollection(SingleCellRowDatasetCore):
    """A collection of one or more SingleCellMemMapDatasets.

    SingleCellCollection supports most of the functionality of the
    SingleCellDataSet API. A SingleCellCollection can be converted
    to a single SingleCellMemMapDataset. A SingleCellCollection
    enables the use of heterogeneous datasets, such as those composed of many
    AnnData files.

    Attributes:
        _version: The version of the dataset
        data_path: The directory where the collection of datasets is stored.
        _feature_index: The corresponding RowFeatureIndex where features are
        stored.
        fname_to_mmap: A dictionary to hold each SingleCellMemMapDataset object.
        This maps from the path to the dataset.
        A ragged dataset is a dataset of arrays where the arrays have different
        lengths.
        False: not ragged; all SingleCellMemMapDatasets have the same column dimension
        True: ragged; the scmmap column dimensions vary
    """

    def __init__(self, data_path: str) -> None:
        """Instantiate the class.

        Args:
            data_path: Where the class will be stored.
        """
        self.data_path: str = data_path
        self._version: str = importlib.metadata.version("bionemo.scdl")
        self.metadata: Dict[str, int] = {}
        self._feature_index: RowFeatureIndex = RowFeatureIndex()
        self.fname_to_mmap: Dict[str, SingleCellMemMapDataset] = {}

        Path(self.data_path).mkdir(parents=True, exist_ok=True)

        # Write the version
        if not os.path.exists(f"{self.data_path}/{FileNames.VERSION.value}"):
            with open(f"{self.data_path}/{FileNames.VERSION.value}", "w") as vfi:
                json.dump(self.version(), vfi)

    def version(self) -> str:
        """Returns a version number.

        (following <major>.<minor>.<point> convention).
        """
        return self._version

    def load_h5ad(self, h5ad_path: str) -> None:
        """Loads data from an existing AnnData archive.

        This creates and saves a new backing data structure.
        The dataset and its location are then recorded in the collection.

        Args:
            h5ad_path: the path to AnnData archive
        """
        mmap_path = Path(self.data_path) / Path(h5ad_path).stem
        self.fname_to_mmap[mmap_path] = _create_single_cell_memmap_dataset_from_h5ad(
            h5ad_path=h5ad_path, base_directory_path=self.data_path
        )
        self._feature_index.concat(self.fname_to_mmap[mmap_path]._feature_index)

    def load_h5ad_multi(self, directory_path: str, max_workers: int = 5, use_processes: bool = False) -> None:
        """Loads one or more AnnData files and adds them to the collection.

        Args:
            directory_path: The path to the directory with the AnnData files
            max_workers: the maximal number of workers to use
            use_processes: If True, use ProcessPoolExecutor; otherwise, use
                ThreadPoolExecutor
        Raises:
            FileNotFoundError: If no h5ad files are found in the directory.
            RuntimeError: If an error occurs in the loading of any of the h5ad files.
        """
        directory_path = Path(directory_path)
        ann_data_paths = sorted(directory_path.rglob("*.h5ad"))
        if len(ann_data_paths) == 0:
            raise FileNotFoundError(f"There are no h5ad files in {directory_path}.")
        mmap_paths = [Path(self.data_path) / Path(ann_datapath).stem for ann_datapath in ann_data_paths]
        queue = AsyncWorkQueue(max_workers=max_workers, use_processes=use_processes)
        for ann in ann_data_paths:
            queue.submit_task(_create_single_cell_memmap_dataset_from_h5ad, ann, base_directory_path=self.data_path)
        queue.wait()
        mmaps = queue.get_task_results()

        for result in mmaps:
            if isinstance(result, Exception):
                raise RuntimeError(f"Error in processing an h5ad file: {result}") from result

        for mmap_path, mmap in zip(mmap_paths, mmaps):
            if isinstance(mmap, Exception):
                raise RuntimeError(f"Error in processing file {mmap_path}: {mmap}") from mmap

            self.fname_to_mmap[mmap_path] = mmap
            self._feature_index.concat(self.fname_to_mmap[mmap_path]._feature_index)

    def number_nonzero_values(self) -> int:
        """Sum of the number of non zero entries in each dataset."""
        return sum([self.fname_to_mmap[mmap_path].number_nonzero_values() for mmap_path in self.fname_to_mmap])

    def number_of_values(self) -> int:
        """Sum of the number of values in each dataset."""
        return sum([self.fname_to_mmap[mmap_path].number_of_values() for mmap_path in self.fname_to_mmap])

    def number_of_rows(self) -> int:
        """The number of rows in the dataset.

        Returns:
            The number of rows in the dataset
        Raises:
            ValueError if the length of the number of rows in the feature
            index does not correspond to the number of stored rows.
        """
        row_sum_from_datasets = sum(
            [self.fname_to_mmap[mmap_path].number_of_rows() for mmap_path in self.fname_to_mmap]
        )
        if len(self._feature_index) > 0 and self._feature_index.number_of_rows() != row_sum_from_datasets:
            raise ValueError(
                f"""The nuber of rows in the feature index {self._feature_index.number_of_rows()}
                             does not correspond to the number of rows in the datasets {row_sum_from_datasets}"""
            )

        return row_sum_from_datasets

    def number_of_variables(self) -> List[int]:
        """If ragged, returns a list of variable lengths.

        If not ragged, returns a list with one entry. A ragged
        collection is one where the datasets have different lengths.
        """
        if len(self._feature_index) == 0:
            return [0]
        else:
            num_vars = self._feature_index.column_dims()
            return num_vars

    def shape(self) -> Tuple[int, List[int]]:
        """Get the shape of the dataset.

        This is the number of entries by the length of the feature index
        corresponding to that variable.

        Returns:
            The total number of elements across the datasets
            A list containing the number of variables for each entry in the
                RowFeatureIndex.
        """
        return self.number_of_rows(), self.number_of_variables()

    def flatten(
        self,
        output_path: str,
        destroy_on_copy: bool = False,
    ) -> None:
        """Flattens the collection into a single SingleCellMemMapDataset.

        Args:
            output_path: location to store new dataset
            destroy_on_copy: Whether to remove the current data_path
        """
        output = SingleCellMemMapDataset(
            output_path,
            num_elements=self.number_of_rows(),
            num_rows=self.number_nonzero_values(),
            mode=Mode.CREATE_APPEND,
        )

        output.concat(list(self.fname_to_mmap.values()))

        # Hit save!
        output.save()

        if destroy_on_copy:
            shutil.rmtree(self.data_path)

__init__(data_path)

Instantiate the class.

Parameters

Name        Type    Description                        Default
data_path   str     Where the class will be stored.    required
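A brief sketch; "scdl_collection" is a hypothetical directory name:

from bionemo.scdl.io.single_cell_collection import SingleCellCollection

# The constructor creates data_path if needed and writes version.json into it.
coll = SingleCellCollection("scdl_collection")
print(coll.version())  # the installed bionemo.scdl package version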
Source code in bionemo/scdl/io/single_cell_collection.py
def __init__(self, data_path: str) -> None:
    """Instantiate the class.

    Args:
        data_path: Where the class will be stored.
    """
    self.data_path: str = data_path
    self._version: str = importlib.metadata.version("bionemo.scdl")
    self.metadata: Dict[str, int] = {}
    self._feature_index: RowFeatureIndex = RowFeatureIndex()
    self.fname_to_mmap: Dict[str, SingleCellMemMapDataset] = {}

    Path(self.data_path).mkdir(parents=True, exist_ok=True)

    # Write the version
    if not os.path.exists(f"{self.data_path}/{FileNames.VERSION.value}"):
        with open(f"{self.data_path}/{FileNames.VERSION.value}", "w") as vfi:
            json.dump(self.version(), vfi)

flatten(output_path, destroy_on_copy=False)

Flattens the collection into a single SingleCellMemMapDataset.

Parameters

Name             Type    Description                                 Default
output_path      str     Location to store the new dataset.          required
destroy_on_copy  bool    Whether to remove the current data_path.    False
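A brief sketch (the paths are hypothetical): load one AnnData file into a collection, then write out a single merged dataset.

from bionemo.scdl.io.single_cell_collection import SingleCellCollection

coll = SingleCellCollection("scdl_collection")
coll.load_h5ad("data/sample.h5ad")  # hypothetical AnnData file

# Merge all loaded datasets into a single SingleCellMemMapDataset at scdl_flat/.
# With destroy_on_copy=True, the collection's own data_path is removed afterwards.
coll.flatten("scdl_flat", destroy_on_copy=True)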
Source code in bionemo/scdl/io/single_cell_collection.py
def flatten(
    self,
    output_path: str,
    destroy_on_copy: bool = False,
) -> None:
    """Flattens the collection into a single SingleCellMemMapDataset.

    Args:
        output_path: location to store new dataset
        destroy_on_copy: Whether to remove the current data_path
    """
    output = SingleCellMemMapDataset(
        output_path,
        num_elements=self.number_of_rows(),
        num_rows=self.number_nonzero_values(),
        mode=Mode.CREATE_APPEND,
    )

    output.concat(list(self.fname_to_mmap.values()))

    # Hit save!
    output.save()

    if destroy_on_copy:
        shutil.rmtree(self.data_path)

load_h5ad(h5ad_path)

Loads data from an existing AnnData archive.

This creates and saves a new backing data structure; the dataset and its location are then recorded in the collection.

Parameters

Name        Type    Description                        Default
h5ad_path   str     The path to the AnnData archive.   required
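A brief sketch; "data/sample.h5ad" is a hypothetical AnnData file:

from bionemo.scdl.io.single_cell_collection import SingleCellCollection

coll = SingleCellCollection("scdl_collection")

# Converts the archive into a memory-mapped dataset under coll.data_path and
# registers its features in the collection's RowFeatureIndex.
coll.load_h5ad("data/sample.h5ad")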
Source code in bionemo/scdl/io/single_cell_collection.py
def load_h5ad(self, h5ad_path: str) -> None:
    """Loads data from an existing AnnData archive.

    This creates and saves a new backing data structure.
    The dataset and its location are then recorded in the collection.

    Args:
        h5ad_path: the path to AnnData archive
    """
    mmap_path = Path(self.data_path) / Path(h5ad_path).stem
    self.fname_to_mmap[mmap_path] = _create_single_cell_memmap_dataset_from_h5ad(
        h5ad_path=h5ad_path, base_directory_path=self.data_path
    )
    self._feature_index.concat(self.fname_to_mmap[mmap_path]._feature_index)

load_h5ad_multi(directory_path, max_workers=5, use_processes=False)

Loads one or more AnnData files and adds them to the collection.

Parameters

Name            Type    Description                                                            Default
directory_path  str     The path to the directory with the AnnData files.                      required
max_workers     int     The maximum number of workers to use.                                  5
use_processes   bool    If True, use ProcessPoolExecutor; otherwise, use ThreadPoolExecutor.   False

Raises: FileNotFoundError if no h5ad files are found in the directory. RuntimeError if an error occurs while loading any of the h5ad files.
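A brief sketch; "h5ads/" is a hypothetical directory containing *.h5ad files:

from bionemo.scdl.io.single_cell_collection import SingleCellCollection

coll = SingleCellCollection("scdl_collection")

# Every *.h5ad file found under h5ads/ (searched recursively) is converted in
# parallel; pass use_processes=True to use a process pool instead of threads.
coll.load_h5ad_multi("h5ads/", max_workers=4, use_processes=False)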

Source code in bionemo/scdl/io/single_cell_collection.py
def load_h5ad_multi(self, directory_path: str, max_workers: int = 5, use_processes: bool = False) -> None:
    """Loads one or more AnnData files and adds them to the collection.

    Args:
        directory_path: The path to the directory with the AnnData files
        max_workers: the maximal number of workers to use
        use_processes: If True, use ProcessPoolExecutor; otherwise, use
            ThreadPoolExecutor
    Raises:
        FileNotFoundError: If no h5ad files are found in the directory.
        RuntimeError: If an error occurs in the loading of any of the h5ad files.
    """
    directory_path = Path(directory_path)
    ann_data_paths = sorted(directory_path.rglob("*.h5ad"))
    if len(ann_data_paths) == 0:
        raise FileNotFoundError(f"There are no h5ad files in {directory_path}.")
    mmap_paths = [Path(self.data_path) / Path(ann_datapath).stem for ann_datapath in ann_data_paths]
    queue = AsyncWorkQueue(max_workers=max_workers, use_processes=use_processes)
    for ann in ann_data_paths:
        queue.submit_task(_create_single_cell_memmap_dataset_from_h5ad, ann, base_directory_path=self.data_path)
    queue.wait()
    mmaps = queue.get_task_results()

    for result in mmaps:
        if isinstance(result, Exception):
            raise RuntimeError(f"Error in processing an h5ad file: {result}") from result

    for mmap_path, mmap in zip(mmap_paths, mmaps):
        if isinstance(mmap, Exception):
            raise RuntimeError(f"Error in processing file {mmap_path}: {mmap}") from mmap

        self.fname_to_mmap[mmap_path] = mmap
        self._feature_index.concat(self.fname_to_mmap[mmap_path]._feature_index)

number_nonzero_values()

Sum of the number of nonzero entries in each dataset.
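A brief sketch, assuming coll is a SingleCellCollection that has already loaded its datasets; number_of_values() is documented further below.

nnz = coll.number_nonzero_values()  # non-zero entries summed over all datasets
total = coll.number_of_values()     # total number of values summed over all datasets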

Source code in bionemo/scdl/io/single_cell_collection.py
def number_nonzero_values(self) -> int:
    """Sum of the number of non zero entries in each dataset."""
    return sum([self.fname_to_mmap[mmap_path].number_nonzero_values() for mmap_path in self.fname_to_mmap])

number_of_rows()

The number of rows in the dataset.

Returns

Type    Description
int     The number of rows in the dataset.

Raises: ValueError if the number of rows in the feature index does not match the number of stored rows.

Source code in bionemo/scdl/io/single_cell_collection.py
def number_of_rows(self) -> int:
    """The number of rows in the dataset.

    Returns:
        The number of rows in the dataset
    Raises:
        ValueError if the length of the number of rows in the feature
        index does not correspond to the number of stored rows.
    """
    row_sum_from_datasets = sum(
        [self.fname_to_mmap[mmap_path].number_of_rows() for mmap_path in self.fname_to_mmap]
    )
    if len(self._feature_index) > 0 and self._feature_index.number_of_rows() != row_sum_from_datasets:
        raise ValueError(
            f"""The nuber of rows in the feature index {self._feature_index.number_of_rows()}
                         does not correspond to the number of rows in the datasets {row_sum_from_datasets}"""
        )

    return row_sum_from_datasets

number_of_values()

Sum of the number of values in each dataset.

Source code in bionemo/scdl/io/single_cell_collection.py
def number_of_values(self) -> int:
    """Sum of the number of values in each dataset."""
    return sum([self.fname_to_mmap[mmap_path].number_of_values() for mmap_path in self.fname_to_mmap])

number_of_variables()

If ragged, returns a list of variable lengths.

If not ragged, returns a list with one entry. A ragged collection is one where the datasets have different lengths.

Source code in bionemo/scdl/io/single_cell_collection.py
def number_of_variables(self) -> List[int]:
    """If ragged, returns a list of variable lengths.

    If not ragged, returns a list with one entry. A ragged
    collection is one where the datasets have different lengths.
    """
    if len(self._feature_index) == 0:
        return [0]
    else:
        num_vars = self._feature_index.column_dims()
        return num_vars

shape()

Get the shape of the dataset.

This is the number of entries by the length of the feature index corresponding to each variable.

Returns

Type        Description
int         The total number of elements across the datasets.
List[int]   A list containing the number of variables for each entry in the RowFeatureIndex.
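A brief sketch, assuming coll is a SingleCellCollection that has already loaded its datasets (the numbers in the comments are purely illustrative):

num_rows, num_vars = coll.shape()
# num_rows: total number of rows across all loaded datasets, e.g. 1500
# num_vars: a single-entry list when the collection is not ragged, e.g. [20000];
#           when it is ragged, one column dimension per entry in the RowFeatureIndex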

Source code in bionemo/scdl/io/single_cell_collection.py
def shape(self) -> Tuple[int, List[int]]:
    """Get the shape of the dataset.

    This is the number of entries by the length of the feature index
    corresponding to that variable.

    Returns:
        The total number of elements across the datasets
        A list containing the number of variables for each entry in the
            RowFeatureIndex.
    """
    return self.number_of_rows(), self.number_of_variables()

version()

Returns a version number.

(following the <major>.<minor>.<point> convention).

Source code in bionemo/scdl/io/single_cell_collection.py
def version(self) -> str:
    """Returns a version number.

    (following <major>.<minor>.<point> convention).
    """
    return self._version