跳到内容

预处理

GeneformerPreprocess

源代码位于 bionemo/geneformer/data/singlecell/preprocess.py
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
class GeneformerPreprocess:
    """Prepare the gene-median dictionary and tokenizer vocab used by Geneformer.

    The reference pickles are obtained through ``GeneformerResourcePreprocessor``,
    the median dictionary is re-saved as JSON, and (optionally) a
    ``GeneTokenizer`` is built and its vocab written to disk.
    """

    def __init__(self, download_directory: Path, medians_file_path: Path, tokenizer_vocab_path: Path):
        """Store the configured locations and warn if the vocab file already exists.

        Args:
            download_directory: Passed as ``dest_directory`` to
                ``GeneformerResourcePreprocessor`` when the reference files are prepared.
            medians_file_path: File the median dictionary is written to as JSON.
            tokenizer_vocab_path: File the tokenizer vocab is saved to. ``None`` is
                accepted; in that case ``preprocess`` does not build a tokenizer.
        """
        self.download_directory = download_directory
        self.medians_file_path = medians_file_path
        self.tokenizer_vocab_path = tokenizer_vocab_path
        self._validate_tokenizer_args(
            self.tokenizer_vocab_path,
        )

    def build_and_save_tokenizer(self, median_dict, gene_to_ens, vocab_output_name):
        """Build a ``GeneTokenizer`` from the two dictionaries and save its vocab.

        Args:
            median_dict: Median dictionary forwarded to
                ``GeneTokenizer.from_medians_and_genes_dicts``.
            gene_to_ens: Gene dictionary forwarded to the same constructor.
            vocab_output_name: Destination handed to ``tokenizer.save_vocab``.

        Returns:
            The tokenizer that was built.
        """
        tokenizer = GeneTokenizer.from_medians_and_genes_dicts(median_dict, gene_to_ens)
        tokenizer.save_vocab(vocab_output_name)
        return tokenizer

    def _validate_tokenizer_args(self, vocab_output_name):
        """Warn when the vocab output file already exists (it will be overwritten)."""
        # ``preprocess`` treats a ``None`` vocab path as "do not build a tokenizer",
        # so there is nothing to check; ``os.path.exists(None)`` would raise TypeError.
        if vocab_output_name is None:
            return
        if os.path.exists(vocab_output_name):
            logging.warning(f"Tokenizer vocab file: {vocab_output_name} already exists. Overwriting...")

    def preprocess(self) -> dict[Literal["tokenizer", "median_dict"], Any]:
        """Fetch the reference files, write the medians JSON and build the tokenizer.

        Returns:
            A dict with the key ``"tokenizer"`` (the built tokenizer, or ``None``
            when ``tokenizer_vocab_path`` is ``None``) and the key ``"median_dict"``
            (the unpickled median dictionary).
        """
        gene_name_dict_fn, gene_median_dict_fn = GeneformerResourcePreprocessor(
            dest_directory=self.download_directory,
        ).prepare()

        # Load artifacts
        # NOTE(security): ``pickle.load`` can execute arbitrary code embedded in the
        # file. These files come from ``GeneformerResourcePreprocessor.prepare()``;
        # make sure their origin is trusted.
        with open(gene_name_dict_fn, "rb") as fd:
            gene_ens = pickle.load(fd)

        with open(gene_median_dict_fn, "rb") as fd:
            median_dict = pickle.load(fd)

        # Save converted artifacts to JSON to prevent pickle issues.
        medians_dir = os.path.dirname(self.medians_file_path)
        # A bare filename has an empty dirname (the current directory); calling
        # ``os.makedirs("")`` would raise FileNotFoundError, so skip it then.
        if medians_dir:
            os.makedirs(medians_dir, exist_ok=True)  # ensure the dir exists but be ok with race conditions.
        with open(self.medians_file_path, "w") as fp:
            json.dump(median_dict, fp)

        if self.tokenizer_vocab_path is not None:
            tokenizer = self.build_and_save_tokenizer(
                median_dict,
                gene_ens,
                self.tokenizer_vocab_path,
            )
        else:
            tokenizer = None

        return {"tokenizer": tokenizer, "median_dict": median_dict}

__init__(download_directory, medians_file_path, tokenizer_vocab_path)

下载 HGNC 符号

download_directory (Path): 用于存放下载的参考资源文件的目录；medians_file_path (Path): 以 JSON 格式保存基因中位数字典的文件路径；tokenizer_vocab_path (Path): 用于保存分词器词汇表的文件路径。

源代码位于 bionemo/geneformer/data/singlecell/preprocess.py
75
76
77
78
79
80
81
82
83
84
85
86
87
88
def __init__(self, download_directory: Path, medians_file_path: Path, tokenizer_vocab_path: Path):
    """Record the configured paths, then run the tokenizer-argument check.

    Args:
        download_directory: Directory handed to the resource preprocessor as its
            destination directory.
        medians_file_path: File the median dictionary is written to as JSON.
        tokenizer_vocab_path: File the tokenizer vocab is saved to.
    """
    self.tokenizer_vocab_path = tokenizer_vocab_path
    self.medians_file_path = medians_file_path
    self.download_directory = download_directory
    # Emits a warning when the vocab file is already present on disk.
    self._validate_tokenizer_args(self.tokenizer_vocab_path)

build_and_save_tokenizer(median_dict, gene_to_ens, vocab_output_name)

使用中位数词典构建 GeneTokenizer,然后序列化并将词典保存到磁盘。

源代码位于 bionemo/geneformer/data/singlecell/preprocess.py
90
91
92
93
94
95
96
def build_and_save_tokenizer(self, median_dict, gene_to_ens, vocab_output_name):
    """Create a ``GeneTokenizer`` from the given dictionaries and persist its vocab.

    Args:
        median_dict: Median dictionary forwarded to
            ``GeneTokenizer.from_medians_and_genes_dicts``.
        gene_to_ens: Gene dictionary forwarded to the same constructor.
        vocab_output_name: Destination handed to ``save_vocab``.

    Returns:
        The tokenizer that was built and saved.
    """
    built_tokenizer = GeneTokenizer.from_medians_and_genes_dicts(median_dict, gene_to_ens)
    built_tokenizer.save_vocab(vocab_output_name)
    return built_tokenizer

preprocess()

为 Geneformer 模型进行预处理

源代码位于 bionemo/geneformer/data/singlecell/preprocess.py
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
def preprocess(self) -> dict[Literal["tokenizer", "median_dict"], Any]:
    """Fetch the reference files, write the medians JSON and build the tokenizer.

    Returns:
        A dict with the key ``"tokenizer"`` (the built tokenizer, or ``None``
        when ``self.tokenizer_vocab_path`` is ``None``) and the key
        ``"median_dict"`` (the unpickled median dictionary).
    """
    gene_name_dict_fn, gene_median_dict_fn = GeneformerResourcePreprocessor(
        dest_directory=self.download_directory,
    ).prepare()

    # Load artifacts
    # NOTE(security): ``pickle.load`` can execute arbitrary code embedded in the
    # file. These files come from ``GeneformerResourcePreprocessor.prepare()``;
    # make sure their origin is trusted.
    with open(gene_name_dict_fn, "rb") as fd:
        gene_ens = pickle.load(fd)

    with open(gene_median_dict_fn, "rb") as fd:
        median_dict = pickle.load(fd)

    # Save converted artifacts to JSON to prevent pickle issues.
    medians_dir = os.path.dirname(self.medians_file_path)
    # A bare filename has an empty dirname (the current directory); calling
    # ``os.makedirs("")`` would raise FileNotFoundError, so skip it then.
    if medians_dir:
        os.makedirs(medians_dir, exist_ok=True)  # ensure the dir exists but be ok with race conditions.
    with open(self.medians_file_path, "w") as fp:
        json.dump(median_dict, fp)

    if self.tokenizer_vocab_path is not None:
        tokenizer = self.build_and_save_tokenizer(
            median_dict,
            gene_ens,
            self.tokenizer_vocab_path,
        )
    else:
        tokenizer = None

    return {"tokenizer": tokenizer, "median_dict": median_dict}

GeneformerResourcePreprocessor dataclass

基类:ResourcePreprocessor

Geneformer 模型的 ResourcePreprocessor。下载 gene_name_id_dict.pkl 和 gene_median_dictionary.pkl 文件。

源代码位于 bionemo/geneformer/data/singlecell/preprocess.py
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
@dataclass
class GeneformerResourcePreprocessor(ResourcePreprocessor):
    """ResourcePreprocessor for the Geneformer model. Downloads the gene_name_id_dict.pkl and gene_median_dictionary.pkl files."""

    dest_directory: str = "geneformer"

    def get_remote_resources(self) -> List[RemoteResource]:
        """Describe the two gene-dictionary pickles to download.

        Returns:
            One ``RemoteResource`` per file, in the order gene-name dictionary,
            then gene-median dictionary. No checksum is supplied for either.
        """
        # Files are served from the Hugging Face Hub (huggingface.co) "resolve" endpoint.
        base_url = "https://huggingface.co/ctheodoris/Geneformer/resolve/main/geneformer/gene_dictionaries_30m"
        url_fn = {
            f"{base_url}/gene_name_id_dict_gc30M.pkl?download=true": "gene_name_id_dict.pkl",
            f"{base_url}/gene_median_dictionary_gc30M.pkl?download=true": "gene_median_dictionary.pkl",
        }

        # ``root_directory`` is not declared in this class; presumably it is
        # provided by ``ResourcePreprocessor`` (confirm against the base class).
        return [
            RemoteResource(
                dest_directory=self.dest_directory,
                dest_filename=filename,
                root_directory=self.root_directory,
                checksum=None,
                url=url,
            )
            for url, filename in url_fn.items()
        ]

    def prepare_resource(self, resource: RemoteResource) -> str:
        """Download the passed resource.

        Args:
            resource: Resource to be prepared; its ``download_resource`` method is called.

        Returns:
            The value returned by ``resource.download_resource()`` (described
            upstream as the absolute destination path of the downloaded file).
        """
        return resource.download_resource()

    def prepare(self):
        """Download every resource from ``get_remote_resources``.

        Returns:
            A list with one ``prepare_resource`` result per resource, in the same
            order as ``get_remote_resources`` (name dictionary first, then median
            dictionary).
        """
        return [self.prepare_resource(resource) for resource in self.get_remote_resources()]

prepare_resource(resource)

记录并下载传递的资源。

resource: RemoteResource - 要准备的资源。

返回 - 下载资源的绝对目标路径

源代码位于 bionemo/geneformer/data/singlecell/preprocess.py
61
62
63
64
65
66
67
68
def prepare_resource(self, resource: RemoteResource) -> str:
    """Download a single remote resource.

    Args:
        resource: The resource to fetch; its ``download_resource`` method is called.

    Returns:
        The value returned by ``resource.download_resource()`` (presumably the
        destination path of the downloaded file; confirm against ``RemoteResource``).
    """
    downloaded_path = resource.download_resource()
    return downloaded_path