Single Cell Collection

FileNames

Bases: str, Enum

Names of files that are generated in SingleCellCollection.

Source code in bionemo/scdl/io/single_cell_collection.py
class FileNames(str, Enum):
    """Names of files that are generated in SingleCellCollection."""

    VERSION = "version.json"
    METADATA = "metadata.json"
    FEATURES = "features"

SingleCellCollection

Bases: SingleCellRowDatasetCore

A collection of one or more SingleCellMemMapDatasets.

SingleCellCollection supports most of the functionality of the SingleCellDataSet API. A SingleCellCollection can be converted to a single SingleCellMemMapDataset, and it enables the use of heterogeneous datasets, such as those composed of many AnnData files.
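A minimal end-to-end sketch (the directory names are hypothetical; the import path follows the source location shown on this page):

from bionemo.scdl.io.single_cell_collection import SingleCellCollection

# Build a collection backed by memory-mapped copies of several AnnData files.
coll = SingleCellCollection("scdl_collection")
coll.load_h5ad_multi("h5ads/", max_workers=4)

# Inspect the combined dimensions, then merge everything into one dataset on disk.
num_rows, num_vars = coll.shape()
coll.flatten("scdl_flat")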

Attributes

Name            Type                                Description
_version        str                                 The version of the dataset.
data_path       str                                 The directory where the collection of datasets is stored.
_feature_index  RowFeatureIndex                     The corresponding RowFeatureIndex where the features are stored.
fname_to_mmap   Dict[str, SingleCellMemMapDataset]  A dictionary mapping each dataset path to its SingleCellMemMapDataset object.

A ragged collection is one where the datasets have different column dimensions: False means not ragged (all SingleCellMemMapDatasets share the same column dimension); True means ragged (the column dimensions vary).

Source code in bionemo/scdl/io/single_cell_collection.py
class SingleCellCollection(SingleCellRowDatasetCore):
    """A collection of one or more SingleCellMemMapDatasets.

    SingleCellCollection supports most of the functionality of the
    SingleCellDataSet API. A SingleCellCollection can be converted
    to a single SingleCellMemMapDataset. A SingleCellCollection
    enables the use of heterogeneous datasets, such as those composed of many
    AnnData files.

    Attributes:
        _version: The version of the dataset
        data_path: The directory where the collection of datasets is stored.
        _feature_index: The corresponding RowFeatureIndex where features are
        stored.
        fname_to_mmap: A dictionary to hold each SingleCellMemMapDataset object.
        This maps from the path to the dataset.
        A ragged dataset is a dataset of arrays where the arrays have different
        lengths.
        False: not ragged; all SingleCellMemMapDatasets have the same column dimension
        True: ragged; the scmmap column dimensions vary
    """

    def __init__(self, data_path: str) -> None:
        """Instantiate the class.

        Args:
            data_path: Where the class will be stored.
        """
        self.data_path: str = data_path
        self._version: str = importlib.metadata.version("bionemo.scdl")
        self.metadata: Dict[str, int] = {}
        self._feature_index: RowFeatureIndex = RowFeatureIndex()
        self.fname_to_mmap: Dict[str, SingleCellMemMapDataset] = {}

        Path(self.data_path).mkdir(parents=True, exist_ok=True)

        # Write the version
        if not os.path.exists(f"{self.data_path}/{FileNames.VERSION.value}"):
            with open(f"{self.data_path}/{FileNames.VERSION.value}", "w") as vfi:
                json.dump(self.version(), vfi)

    def version(self) -> str:
        """Returns a version number.

        (following <major>.<minor>.<point> convention).
        """
        return self._version

    def load_h5ad(self, h5ad_path: str) -> None:
        """Loads data from an existing AnnData archive.

        This creates and saves a new backing data structure.
        The dataset and its location are then recorded in the collection.

        Args:
            h5ad_path: the path to AnnData archive
        """
        mmap_path = Path(self.data_path) / Path(h5ad_path).stem
        self.fname_to_mmap[mmap_path] = _create_single_cell_memmap_dataset_from_h5ad(
            h5ad_path=h5ad_path, base_directory_path=self.data_path
        )
        self._feature_index.concat(self.fname_to_mmap[mmap_path]._feature_index)

    def load_h5ad_multi(self, directory_path: str, max_workers: int = 5, use_processes: bool = False) -> None:
        """Loads one or more AnnData files and adds them to the collection.

        Args:
            directory_path: The path to the directory with the AnnData files
            max_workers: the maximal number of workers to use
            use_processes: If True, use ProcessPoolExecutor; otherwise, use
                ThreadPoolExecutor
        Raises:
            FileNotFoundError: If no h5ad files are found in the directory.
            RuntimeError: If an error occurs in the loading of any of the h5ad files.
        """
        directory_path = Path(directory_path)
        ann_data_paths = sorted(directory_path.rglob("*.h5ad"))
        if len(ann_data_paths) == 0:
            raise FileNotFoundError(f"There are no h5ad files in {directory_path}.")
        mmap_paths = [Path(self.data_path) / Path(ann_datapath).stem for ann_datapath in ann_data_paths]
        queue = AsyncWorkQueue(max_workers=max_workers, use_processes=use_processes)
        for ann in ann_data_paths:
            queue.submit_task(_create_single_cell_memmap_dataset_from_h5ad, ann, base_directory_path=self.data_path)
        queue.wait()
        mmaps = queue.get_task_results()

        for result in mmaps:
            if isinstance(result, Exception):
                raise RuntimeError(f"Error in processing an h5ad file: {result}") from result

        for mmap_path, mmap in zip(mmap_paths, mmaps):
            if isinstance(mmap, Exception):
                raise RuntimeError(f"Error in processing file {mmap_path}: {mmap}") from mmap

            self.fname_to_mmap[mmap_path] = mmap
            self._feature_index.concat(self.fname_to_mmap[mmap_path]._feature_index)

    def number_nonzero_values(self) -> int:
        """Sum of the number of non zero entries in each dataset."""
        return sum([self.fname_to_mmap[mmap_path].number_nonzero_values() for mmap_path in self.fname_to_mmap])

    def number_of_values(self) -> int:
        """Sum of the number of values in each dataset."""
        return sum([self.fname_to_mmap[mmap_path].number_of_values() for mmap_path in self.fname_to_mmap])

    def number_of_rows(self) -> int:
        """The number of rows in the dataset.

        Returns:
            The number of rows in the dataset
        Raises:
            ValueError if the length of the number of rows in the feature
            index does not correspond to the number of stored rows.
        """
        row_sum_from_datasets = sum(
            [self.fname_to_mmap[mmap_path].number_of_rows() for mmap_path in self.fname_to_mmap]
        )
        if len(self._feature_index) > 0 and self._feature_index.number_of_rows() != row_sum_from_datasets:
            raise ValueError(
                f"""The nuber of rows in the feature index {self._feature_index.number_of_rows()}
                             does not correspond to the number of rows in the datasets {row_sum_from_datasets}"""
            )

        return row_sum_from_datasets

    def number_of_variables(self) -> List[int]:
        """If ragged, returns a list of variable lengths.

        If not ragged, returns a list with one entry. A ragged
        collection is one where the datasets have different lengths.
        """
        if len(self._feature_index) == 0:
            return [0]
        else:
            num_vars = self._feature_index.column_dims()
            return num_vars

    def shape(self) -> Tuple[int, List[int]]:
        """Get the shape of the dataset.

        This is the number of entries by the length of the feature index
        corresponding to that variable.

        Returns:
            The total number of elements across the datasets
            A list containing the number of variables for each entry in the
                RowFeatureIndex.
        """
        return self.number_of_rows(), self.number_of_variables()

    def flatten(
        self,
        output_path: str,
        destroy_on_copy: bool = False,
    ) -> None:
        """Flattens the collection into a single SingleCellMemMapDataset.

        Args:
            output_path: location to store new dataset
            destroy_on_copy: Whether to remove the current data_path
        """
        output = SingleCellMemMapDataset(
            output_path,
            num_elements=self.number_of_rows(),
            num_rows=self.number_nonzero_values(),
            mode=Mode.CREATE_APPEND,
        )

        output.concat(list(self.fname_to_mmap.values()))

        # Hit save!
        output.save()

        if destroy_on_copy:
            shutil.rmtree(self.data_path)

__init__(data_path)

Instantiate the class.

Parameters

Name        Type    Description                        Default
data_path   str     Where the class will be stored.    required
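A brief sketch; "scdl_collection" is a hypothetical directory name:

from bionemo.scdl.io.single_cell_collection import SingleCellCollection

# The constructor creates data_path if needed and writes version.json into it.
coll = SingleCellCollection("scdl_collection")
print(coll.version())  # the installed bionemo.scdl package version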
Source code in bionemo/scdl/io/single_cell_collection.py
def __init__(self, data_path: str) -> None:
    """Instantiate the class.

    Args:
        data_path: Where the class will be stored.
    """
    self.data_path: str = data_path
    self._version: str = importlib.metadata.version("bionemo.scdl")
    self.metadata: Dict[str, int] = {}
    self._feature_index: RowFeatureIndex = RowFeatureIndex()
    self.fname_to_mmap: Dict[str, SingleCellMemMapDataset] = {}

    Path(self.data_path).mkdir(parents=True, exist_ok=True)

    # Write the version
    if not os.path.exists(f"{self.data_path}/{FileNames.VERSION.value}"):
        with open(f"{self.data_path}/{FileNames.VERSION.value}", "w") as vfi:
            json.dump(self.version(), vfi)

flatten(output_path, destroy_on_copy=False)

Flattens the collection into a single SingleCellMemMapDataset.

Parameters

Name             Type    Description                                 Default
output_path      str     Location to store the new dataset.          required
destroy_on_copy  bool    Whether to remove the current data_path.    False
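A brief sketch (the paths are hypothetical): load one AnnData file into a collection, then write out a single merged dataset.

from bionemo.scdl.io.single_cell_collection import SingleCellCollection

coll = SingleCellCollection("scdl_collection")
coll.load_h5ad("data/sample.h5ad")  # hypothetical AnnData file

# Merge all loaded datasets into a single SingleCellMemMapDataset at scdl_flat/.
# With destroy_on_copy=True, the collection's own data_path is removed afterwards.
coll.flatten("scdl_flat", destroy_on_copy=True)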
Source code in bionemo/scdl/io/single_cell_collection.py
def flatten(
    self,
    output_path: str,
    destroy_on_copy: bool = False,
) -> None:
    """Flattens the collection into a single SingleCellMemMapDataset.

    Args:
        output_path: location to store new dataset
        destroy_on_copy: Whether to remove the current data_path
    """
    output = SingleCellMemMapDataset(
        output_path,
        num_elements=self.number_of_rows(),
        num_rows=self.number_nonzero_values(),
        mode=Mode.CREATE_APPEND,
    )

    output.concat(list(self.fname_to_mmap.values()))

    # Hit save!
    output.save()

    if destroy_on_copy:
        shutil.rmtree(self.data_path)

load_h5ad(h5ad_path)

Loads data from an existing AnnData archive.

This creates and saves a new backing data structure; the dataset and its location are then recorded in the collection.

Parameters

Name        Type    Description                        Default
h5ad_path   str     The path to the AnnData archive.   required
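A brief sketch; "data/sample.h5ad" is a hypothetical AnnData file:

from bionemo.scdl.io.single_cell_collection import SingleCellCollection

coll = SingleCellCollection("scdl_collection")

# Converts the archive into a memory-mapped dataset under coll.data_path and
# registers its features in the collection's RowFeatureIndex.
coll.load_h5ad("data/sample.h5ad")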
Source code in bionemo/scdl/io/single_cell_collection.py
def load_h5ad(self, h5ad_path: str) -> None:
    """Loads data from an existing AnnData archive.

    This creates and saves a new backing data structure.
    The dataset and its location are then recorded in the collection.

    Args:
        h5ad_path: the path to AnnData archive
    """
    mmap_path = Path(self.data_path) / Path(h5ad_path).stem
    self.fname_to_mmap[mmap_path] = _create_single_cell_memmap_dataset_from_h5ad(
        h5ad_path=h5ad_path, base_directory_path=self.data_path
    )
    self._feature_index.concat(self.fname_to_mmap[mmap_path]._feature_index)

load_h5ad_multi(directory_path, max_workers=5, use_processes=False)

Loads one or more AnnData files and adds them to the collection.

Parameters

Name            Type    Description                                                            Default
directory_path  str     The path to the directory with the AnnData files.                      required
max_workers     int     The maximum number of workers to use.                                  5
use_processes   bool    If True, use ProcessPoolExecutor; otherwise, use ThreadPoolExecutor.   False

Raises: FileNotFoundError if no h5ad files are found in the directory. RuntimeError if an error occurs while loading any of the h5ad files.
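A brief sketch; "h5ads/" is a hypothetical directory containing *.h5ad files:

from bionemo.scdl.io.single_cell_collection import SingleCellCollection

coll = SingleCellCollection("scdl_collection")

# Every *.h5ad file found under h5ads/ (searched recursively) is converted in
# parallel; pass use_processes=True to use a process pool instead of threads.
coll.load_h5ad_multi("h5ads/", max_workers=4, use_processes=False)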

Source code in bionemo/scdl/io/single_cell_collection.py
def load_h5ad_multi(self, directory_path: str, max_workers: int = 5, use_processes: bool = False) -> None:
    """Loads one or more AnnData files and adds them to the collection.

    Args:
        directory_path: The path to the directory with the AnnData files
        max_workers: the maximal number of workers to use
        use_processes: If True, use ProcessPoolExecutor; otherwise, use
            ThreadPoolExecutor
    Raises:
        FileNotFoundError: If no h5ad files are found in the directory.
        RuntimeError: If an error occurs in the loading of any of the h5ad files.
    """
    directory_path = Path(directory_path)
    ann_data_paths = sorted(directory_path.rglob("*.h5ad"))
    if len(ann_data_paths) == 0:
        raise FileNotFoundError(f"There are no h5ad files in {directory_path}.")
    mmap_paths = [Path(self.data_path) / Path(ann_datapath).stem for ann_datapath in ann_data_paths]
    queue = AsyncWorkQueue(max_workers=max_workers, use_processes=use_processes)
    for ann in ann_data_paths:
        queue.submit_task(_create_single_cell_memmap_dataset_from_h5ad, ann, base_directory_path=self.data_path)
    queue.wait()
    mmaps = queue.get_task_results()

    for result in mmaps:
        if isinstance(result, Exception):
            raise RuntimeError(f"Error in processing an h5ad file: {result}") from result

    for mmap_path, mmap in zip(mmap_paths, mmaps):
        if isinstance(mmap, Exception):
            raise RuntimeError(f"Error in processing file {mmap_path}: {mmap}") from mmap

        self.fname_to_mmap[mmap_path] = mmap
        self._feature_index.concat(self.fname_to_mmap[mmap_path]._feature_index)

number_nonzero_values()

Sum of the number of nonzero entries in each dataset.
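A brief sketch, assuming coll is a SingleCellCollection that has already loaded its datasets; number_of_values() is documented further below.

nnz = coll.number_nonzero_values()  # non-zero entries summed over all datasets
total = coll.number_of_values()     # total number of values summed over all datasets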

Source code in bionemo/scdl/io/single_cell_collection.py
def number_nonzero_values(self) -> int:
    """Sum of the number of non zero entries in each dataset."""
    return sum([self.fname_to_mmap[mmap_path].number_nonzero_values() for mmap_path in self.fname_to_mmap])

number_of_rows()

The number of rows in the dataset.

Returns

Type    Description
int     The number of rows in the dataset.

Raises: ValueError if the number of rows in the feature index does not match the number of stored rows.

Source code in bionemo/scdl/io/single_cell_collection.py
def number_of_rows(self) -> int:
    """The number of rows in the dataset.

    Returns:
        The number of rows in the dataset
    Raises:
        ValueError if the length of the number of rows in the feature
        index does not correspond to the number of stored rows.
    """
    row_sum_from_datasets = sum(
        [self.fname_to_mmap[mmap_path].number_of_rows() for mmap_path in self.fname_to_mmap]
    )
    if len(self._feature_index) > 0 and self._feature_index.number_of_rows() != row_sum_from_datasets:
        raise ValueError(
            f"""The nuber of rows in the feature index {self._feature_index.number_of_rows()}
                         does not correspond to the number of rows in the datasets {row_sum_from_datasets}"""
        )

    return row_sum_from_datasets

number_of_values()

Sum of the number of values in each dataset.

Source code in bionemo/scdl/io/single_cell_collection.py
def number_of_values(self) -> int:
    """Sum of the number of values in each dataset."""
    return sum([self.fname_to_mmap[mmap_path].number_of_values() for mmap_path in self.fname_to_mmap])

number_of_variables()

If ragged, returns a list of variable lengths.

If not ragged, returns a list with one entry. A ragged collection is one where the datasets have different lengths.

Source code in bionemo/scdl/io/single_cell_collection.py
def number_of_variables(self) -> List[int]:
    """If ragged, returns a list of variable lengths.

    If not ragged, returns a list with one entry. A ragged
    collection is one where the datasets have different lengths.
    """
    if len(self._feature_index) == 0:
        return [0]
    else:
        num_vars = self._feature_index.column_dims()
        return num_vars

shape()

Get the shape of the dataset.

This is the number of entries by the length of the feature index corresponding to each variable.

Returns

Type        Description
int         The total number of elements across the datasets.
List[int]   A list containing the number of variables for each entry in the RowFeatureIndex.
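A brief sketch, assuming coll is a SingleCellCollection that has already loaded its datasets (the numbers in the comments are purely illustrative):

num_rows, num_vars = coll.shape()
# num_rows: total number of rows across all loaded datasets, e.g. 1500
# num_vars: a single-entry list when the collection is not ragged, e.g. [20000];
#           when it is ragged, one column dimension per entry in the RowFeatureIndex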

Source code in bionemo/scdl/io/single_cell_collection.py
def shape(self) -> Tuple[int, List[int]]:
    """Get the shape of the dataset.

    This is the number of entries by the length of the feature index
    corresponding to that variable.

    Returns:
        The total number of elements across the datasets
        A list containing the number of variables for each entry in the
            RowFeatureIndex.
    """
    return self.number_of_rows(), self.number_of_variables()

version()

Returns a version number.

(following the <major>.<minor>.<point> convention).

Source code in bionemo/scdl/io/single_cell_collection.py
def version(self) -> str:
    """Returns a version number.

    (following <major>.<minor>.<point> convention).
    """
    return self._version