Example Notebook
In [ ]
import os
import tempfile
import pooch
from torch.utils.data import DataLoader
from bionemo.core import BIONEMO_CACHE_DIR
from bionemo.scdl.io.single_cell_memmap_dataset import SingleCellMemMapDataset
from bionemo.scdl.util.torch_dataloader_utils import collate_sparse_matrix_batch
First, fetch the input data. This can be done by copying https://datasets.cellxgene.cziscience.com/97e96fb1-8caf-4f08-9174-27308eabd4ea.h5ad into a directory named hdf5s.
In [ ]
input_data = pooch.retrieve(
    'https://datasets.cellxgene.cziscience.com/97e96fb1-8caf-4f08-9174-27308eabd4ea.h5ad',
    path=BIONEMO_CACHE_DIR / "hdf5s",
    known_hash='a0728e13a421bbcd6b2718e1d32f88d0d5c7cb92289331e3f14a59b7c513b3bc')
In [ ]
# Create a SingleCellMemMapDataset from the downloaded AnnData file
dataset_temp_dir = tempfile.TemporaryDirectory()
dataset_dir = os.path.join(dataset_temp_dir.name, "97e_scmm")
data = SingleCellMemMapDataset(dataset_dir, input_data)
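As a quick sanity check, the new dataset can be inspected like any map-style torch dataset. This minimal sketch assumes only standard len() and integer indexing, which its use with DataLoader later in this notebook implies:

# Sketch: basic inspection of the memory-mapped dataset
# (assumes standard torch Dataset semantics: len() and integer indexing).
print(len(data))  # number of observations (cells)
print(data[0])    # sparse representation of the first observation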
In [ ]
# Save the dataset to disk.
data.save()
In [ ]
# Reload the dataset from disk
reloaded_data = SingleCellMemMapDataset(dataset_dir)
Each observation has a different number of columns, so with a batch size of 1 the data does not need to be collated. It is then output as a torch tensor of shape (1, 2, num_obs): the first row, of length num_obs, contains the column pointers, and the second row contains the corresponding values.
In [ ]
# A stand-in model; any callable that consumes a batch works here
model = lambda x: x
dataloader = DataLoader(data, batch_size=1, shuffle=True, collate_fn=collate_sparse_matrix_batch)
n_epochs = 1
for e in range(n_epochs):
    for batch in dataloader:
        model(batch)
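To make the (1, 2, num_obs) layout concrete, here is a minimal sketch that pulls a single uncollated batch and splits it into the pointer and value rows. It assumes the default DataLoader collation preserves that shape at batch size 1, as described above:

# Sketch: inspect the raw (1, 2, num_obs) layout described above.
raw_loader = DataLoader(data, batch_size=1, shuffle=False)
batch = next(iter(raw_loader))
columns, values = batch[0][0], batch[0][1]  # column pointers and matching values
print(columns.shape, values.shape)  # both rows have length num_obs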
The data can be collated with a batch size of 1, and it must be collated for larger batch sizes. This collates several sparse matrices into the CSR (compressed sparse row) torch tensor format.
In [ ]
model = lambda x: x
dataloader = DataLoader(data, batch_size=8, shuffle=True, collate_fn=collate_sparse_matrix_batch)
n_epochs = 1
for e in range(n_epochs):
    for batch in dataloader:
        model(batch)
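Since collate_sparse_matrix_batch produces a torch sparse CSR tensor (as described above), a single collated batch can be inspected with torch's CSR accessors. This is a sketch under that assumption:

# Sketch: inspect one collated CSR batch.
csr_loader = DataLoader(data, batch_size=8, shuffle=False, collate_fn=collate_sparse_matrix_batch)
csr_batch = next(iter(csr_loader))
print(csr_batch.crow_indices())  # row-pointer array, length batch_size + 1
print(csr_batch.col_indices())   # column indices of the stored entries
print(csr_batch.values())        # the stored non-zero values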
Alternatively, if there are multiple AnnData files, they can be converted into a single SingleCellMemMapDataset. If the hdf5s directory holds one or more AnnData files, the SingleCellCollection class crawls the filesystem to recursively find all AnnData files (those with the h5ad extension). The following code lives in scripts/convert_h5ad_to_scdl.py; it will create a new dataset at example_dataset. It can also be invoked with the convert_h5ad_to_scdl command.
In [ ]
# Path to the directory holding the h5ad input data
hdf5s = BIONEMO_CACHE_DIR / "hdf5s"
# Path to the output directory where the SCDataset will be stored
output_dir = "scdataset_output"
In [ ]
from bionemo.scdl.io.single_cell_collection import SingleCellCollection

with tempfile.TemporaryDirectory() as temp_dir:
    # Crawl the hdf5s directory for AnnData files and load them in parallel
    coll = SingleCellCollection(temp_dir)
    coll.load_h5ad_multi(hdf5s, max_workers=4, use_processes=True)
    # Merge everything into a single SingleCellMemMapDataset at output_dir
    coll.flatten(output_dir, destroy_on_copy=True)
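The flattened output is itself an SCDL dataset on disk, so it can be reloaded and batched exactly like the single-file dataset above. This sketch simply mirrors that earlier reload pattern:

# Sketch: reload the merged dataset and wrap it in a DataLoader,
# following the same pattern used earlier in this notebook.
merged_data = SingleCellMemMapDataset(output_dir)
dataloader = DataLoader(merged_data, batch_size=8, shuffle=True, collate_fn=collate_sparse_matrix_batch)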
In [ ]
# Remove the temporary directory backing the first dataset
dataset_temp_dir.cleanup()