跳到内容

基因分词器

GeneTokenizer

基类:Label2IDTokenizer、io.IOMixin

初始化 GeneTokenizer 对象。

源代码在 bionemo/geneformer/tokenizer/gene_tokenizer.py
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
class GeneTokenizer(Label2IDTokenizer, io.IOMixin):
    """Tokenizer that maps gene tokens and Ensembl IDs to integer vocabulary IDs."""

    cls_token: str = "[CLS]"
    mask_token: str = "[MASK]"
    pad_token: str = "[PAD]"
    sep_token: str = "[SEP]"
    ukw_token: str = "[UKW]"
    special_tokens: Tuple[str, str, str, str, str] = (cls_token, mask_token, pad_token, sep_token, ukw_token)

    def __init__(self, vocab: Dict[str, int], gene_to_ens: Dict[str, str]):  # noqa: D107
        # Sets up vocab/decode_vocab dictionaries, parent class is stateful.
        super().__init__()
        assert set(self.special_tokens).issubset(
            set(vocab.keys())
        ), f"Vocab must contain all of {self.special_tokens}, missing {set(self.special_tokens) - set(vocab.keys())}"
        # Deep copies guard tokenizer state against later mutation of the caller's dicts.
        self.gene_to_ens = deepcopy(gene_to_ens)
        self.ens_to_gene = {v: k for k, v in self.gene_to_ens.items()}
        self.vocab = deepcopy(vocab)
        self.decode_vocab = {v: k for k, v in self.vocab.items()}

    @classmethod
    def from_medians_and_genes_dicts(cls, median_dict: Dict[str, float], gene_to_ens: Dict[str, str]) -> T:
        """Creates a tokenizer from a median dictionary.

        The vocabulary is the special tokens followed by every gene key of
        ``median_dict`` in insertion order.
        """
        tokens = list(cls.special_tokens) + list(median_dict.keys())
        vocab = cls._build_vocab(tokens)
        return cls(vocab, gene_to_ens)

    @staticmethod
    def _build_vocab(strings: Union[List[str], str]) -> Dict[str, int]:
        """We override the parent because complete strings are tokens. Otherwise, has the same behavior."""
        vocab: Dict[str, int] = {}
        if isinstance(strings, str):
            strings = [strings]

        for token in strings:
            if token not in vocab:
                vocab[token] = len(vocab)  # first occurrence wins; IDs are dense and ordered
        return vocab

    def token_to_id(self, token: str) -> Union[int, None]:
        """Converts a token to its corresponding ID.

        Args:
            token: The token to be converted.

        Returns:
            The ID corresponding to the token, or None if the token is not in the vocabulary.
        """
        return self.vocab.get(token)

    @property
    def pad_id(self) -> int:
        """ID of the padding token."""
        return self.token_to_id(self.pad_token)

    @property
    def mask_token_id(self) -> int:
        """ID of the mask token."""
        return self.token_to_id(self.mask_token)

    @property
    def all_special_ids(self) -> list[int]:
        """IDs of all special tokens, in ``special_tokens`` order."""
        return [self.token_to_id(tok) for tok in self.special_tokens]

    @property
    def class_id(self) -> int:
        """ID of the [CLS] token."""
        return self.token_to_id(self.cls_token)

    def tokens_to_ids(self, tokens: List[str]) -> List[int]:
        """Converts a list of tokens to IDs via the parent implementation."""
        return super().tokens_to_ids(tokens)

    def save_vocab(self, vocab_file: str) -> None:
        """Serializes the vocab and gene_to_ens mappings to ``vocab_file`` as a single JSON object."""
        vocab_dir = os.path.dirname(vocab_file)
        # A bare filename has an empty dirname; os.makedirs("") would raise, so only
        # create directories when a directory component is present.
        if vocab_dir:
            os.makedirs(vocab_dir, exist_ok=True)  # ensure the dir exists but be ok with race conditions.

        to_serialize = {"vocab": self.vocab, "gene_to_ens": self.gene_to_ens}

        with open(vocab_file, "w") as f:
            json.dump(to_serialize, f)

    @classmethod
    def from_vocab_file(cls, vocab_file: str) -> "GeneTokenizer":
        """This method adds a layer on the constructor in the case we are working from a filename instead of a dictionary.

        Raises:
            FileNotFoundError: If ``vocab_file`` does not exist.
        """
        if not os.path.exists(vocab_file):
            raise FileNotFoundError(f"Vocab file {vocab_file} not found, run preprocessing to create it.")

        with open(vocab_file) as f:
            to_deserialize = json.load(f)
            vocab = to_deserialize["vocab"]
            gene_to_ens = to_deserialize["gene_to_ens"]

        # Use ``cls`` rather than a hard-coded class so subclasses construct themselves.
        return cls(vocab, gene_to_ens)

    def gene_tok_to_ens(self, gene: str) -> str:
        """Converts a gene token to its corresponding Ensembl ID.

        Args:
            gene (str): The gene token to be converted.

        Returns:
            str: The Ensembl ID corresponding to the gene token.
        """
        return self.gene_to_ens[gene]

    def ens_tok_to_gene(self, ens: str) -> str:
        """Converts an Ensembl token to a gene name.

        Args:
            ens (str): The Ensembl token to be converted.

        Returns:
            str: The corresponding gene name.
        """
        return self.ens_to_gene[ens]

    def genes_to_enss(self, genes: List[str]) -> List[str]:
        """Converts a list of gene names to Ensembl IDs.

        Args:
            genes (List[str]): A list of gene names.

        Returns:
            List[str]: A list of corresponding Ensembl IDs.

        Raises:
            ValueError: If a gene name is not found in the gene_to_ens dictionary.
        """
        ens_ids = []
        for gene in genes:
            if gene in self.gene_to_ens:
                ens_ids.append(self.gene_to_ens[gene])
            else:
                raise ValueError(f"{gene} not found")
        return ens_ids

    def enss_to_genes(self, ensemble_ids: List[str]) -> List[str]:
        """Converts a list of ensemble IDs to gene names.

        Args:
            ensemble_ids (List[str]): A list of ensemble IDs.

        Returns:
            List[str]: A list of gene names corresponding to the ensemble IDs.

        Raises:
            ValueError: If an ensemble ID is not found in the mapping.
        """
        genes = []
        for ens_id in ensemble_ids:
            if ens_id in self.ens_to_gene:
                genes.append(self.ens_to_gene[ens_id])
            else:
                raise ValueError(f"{ens_id} not found")
        return genes

ens_tok_to_gene(ens)

将 Ensembl 令牌转换为基因名称。

参数

名称 类型 描述 默认值
ens str

要转换的 Ensembl 令牌。

必需

返回

名称 类型 描述
str str

相应的基因名称。

源代码在 bionemo/geneformer/tokenizer/gene_tokenizer.py
140
141
142
143
144
145
146
147
148
149
def ens_tok_to_gene(self, ens: str) -> str:
    """Look up the gene name mapped to an Ensembl token.

    Args:
        ens (str): Ensembl token to resolve.

    Returns:
        str: Gene name associated with ``ens``.
    """
    mapping = self.ens_to_gene
    return mapping[ens]

enss_to_genes(ensemble_ids)

将 Ensembl ID 列表转换为基因名称。

参数

名称 类型 描述 默认值
ensemble_ids List[str]

Ensembl ID 的列表。

必需

返回

类型 描述
List[str]

List[str]:与 Ensembl ID 对应的基因名称列表。

引发

类型 描述
ValueError

如果在映射中找不到 Ensembl ID。

源代码在 bionemo/geneformer/tokenizer/gene_tokenizer.py
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
def enss_to_genes(self, ensemble_ids: List[str]) -> List[str]:
    """Resolve each ensemble ID in the input list to its gene name.

    Args:
        ensemble_ids (List[str]): Ensemble IDs to resolve.

    Returns:
        List[str]: Gene names in the same order as the input IDs.

    Raises:
        ValueError: If any ensemble ID has no entry in the mapping.
    """
    mapping = self.ens_to_gene
    resolved: List[str] = []
    for ens_id in ensemble_ids:
        # Guard clause: fail fast on the first unknown ID.
        if ens_id not in mapping:
            raise ValueError(f"{ens_id} not found")
        resolved.append(mapping[ens_id])
    return resolved

from_medians_and_genes_dicts(median_dict, gene_to_ens) classmethod

从中间值字典创建分词器。

源代码在 bionemo/geneformer/tokenizer/gene_tokenizer.py
53
54
55
56
57
58
@classmethod
def from_medians_and_genes_dicts(cls, median_dict: Dict[str, float], gene_to_ens: Dict[str, str]) -> T:
    """Creates a tokenizer from a median dictionary."""
    # Vocabulary order: special tokens first, then gene keys in dict insertion order.
    all_tokens = [*cls.special_tokens, *median_dict.keys()]
    return cls(cls._build_vocab(all_tokens), gene_to_ens)

from_vocab_file(vocab_file) classmethod

如果我们从文件名而不是字典工作,此方法会在构造函数上添加一个层。

源代码在 bionemo/geneformer/tokenizer/gene_tokenizer.py
115
116
117
118
119
120
121
122
123
124
125
126
127
@classmethod
def from_vocab_file(cls, vocab_file: str) -> None:
    """Construct a tokenizer from a serialized vocab file rather than in-memory dictionaries.

    Raises:
        FileNotFoundError: If ``vocab_file`` does not exist on disk.
    """
    if not os.path.exists(vocab_file):
        raise FileNotFoundError(f"Vocab file {vocab_file} not found, run preprocessing to create it.")

    with open(vocab_file) as handle:
        payload = json.load(handle)

    return GeneTokenizer(payload["vocab"], payload["gene_to_ens"])

gene_tok_to_ens(gene)

将基因令牌转换为其对应的 Ensembl ID。

参数

名称 类型 描述 默认值
gene str

要转换的基因令牌。

必需

返回

名称 类型 描述
str str

与基因令牌对应的 Ensembl ID。

源代码在 bionemo/geneformer/tokenizer/gene_tokenizer.py
129
130
131
132
133
134
135
136
137
138
def gene_tok_to_ens(self, gene: str) -> str:
    """Look up the Ensembl ID mapped to a gene token.

    Args:
        gene (str): Gene token to resolve.

    Returns:
        str: Ensembl ID associated with ``gene``.
    """
    mapping = self.gene_to_ens
    return mapping[gene]

genes_to_enss(genes)

将基因名称列表转换为 Ensembl ID。

参数

名称 类型 描述 默认值
genes List[str]

基因名称列表。

必需

返回

类型 描述
List[str]

List[str]:对应的 Ensembl ID 列表。

引发

类型 描述
ValueError

如果在 gene_to_ens 字典中找不到基因名称。

源代码在 bionemo/geneformer/tokenizer/gene_tokenizer.py
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
def genes_to_enss(self, genes: List[str]) -> List[str]:
    """Resolve each gene name in the input list to its Ensembl ID.

    Args:
        genes (List[str]): Gene names to resolve.

    Returns:
        List[str]: Ensembl IDs in the same order as the input names.

    Raises:
        ValueError: If any gene name has no entry in the gene_to_ens dictionary.
    """
    mapping = self.gene_to_ens
    resolved: List[str] = []
    for gene in genes:
        # Guard clause: fail fast on the first unknown gene.
        if gene not in mapping:
            raise ValueError(f"{gene} not found")
        resolved.append(mapping[gene])
    return resolved

save_vocab(vocab_file)

将词汇表(vocab)与 gene_to_ens 映射序列化为一个 JSON 文件。

源代码在 bionemo/geneformer/tokenizer/gene_tokenizer.py
102
103
104
105
106
107
108
109
110
111
112
113
def save_vocab(self, vocab_file: str) -> None:
    """Serializes the vocab and gene_to_ens mappings to ``vocab_file`` as a single JSON object.

    Args:
        vocab_file: Destination path; missing parent directories are created.
    """
    vocab_dir = os.path.dirname(vocab_file)
    # A bare filename has an empty dirname; os.makedirs("") would raise, so only
    # create directories when a directory component is present.
    if vocab_dir:
        os.makedirs(vocab_dir, exist_ok=True)  # ensure the dir exists but be ok with race conditions.

    to_serialize = {"vocab": self.vocab, "gene_to_ens": self.gene_to_ens}

    with open(vocab_file, "w") as f:
        json.dump(to_serialize, f)

token_to_id(token)

将令牌转换为其对应的 ID。

参数

名称 类型 描述 默认值
token str

要转换的令牌。

必需

返回

类型 描述
int

与令牌对应的 ID。

源代码在 bionemo/geneformer/tokenizer/gene_tokenizer.py
72
73
74
75
76
77
78
79
80
81
def token_to_id(self, token: str) -> int:
    """Map a token string to its vocabulary ID.

    Args:
        token: The token to look up.

    Returns:
        The ID for the token (None when the token is absent from the vocab).
    """
    vocab = self.vocab
    return vocab.get(token)