Bases: TokenizerSpec

Initializes a simple character tokenizer.

Intended to be used for extracting class labels for classification models, such as secondary structure prediction models, where each class is encoded with a character (e.g. "C", "H", "E").

Examples:

>>> tokenizer = Label2IDTokenizer()
>>> seqs = ['CHE', 'CCC', 'EHH']
>>> tokenizer = tokenizer.build_vocab(seqs)

Source code in bionemo/llm/data/label2id_tokenizer.py
class Label2IDTokenizer(TokenizerSpec):
    """Initializes simple Char Tokenizer.

    Intended to be used for extracting class labels
    for classification models such as secondary
    structure prediction model, where each class is
    encoded with a character (ex. "C", "H", "E")

    Examples:
        >>> tokenizer = Label2IDTokenizer()
        >>> seqs = ['CHE', 'CCC', 'EHH']
        >>> tokenizer = tokenizer.build_vocab(seqs)
    """

    def __init__(self) -> None:  # noqa: D107
        super().__init__()
        self.vocab: Dict[str, int] = {}
        self.decode_vocab: Dict[int, str] = {id_: token for token, id_ in self.vocab.items()}

    @property
    def vocab_size(self) -> int:
        """Return the size of the vocab being used."""
        return len(self.vocab)

    def text_to_tokens(self, text: str) -> List[str]:  # noqa: D102
        return list(text)

    def tokens_to_text(self, tokens: List[str]) -> str:  # noqa: D102
        return "".join(tokens)

    def tokens_to_ids(self, tokens: List[str]) -> List[int]:
        """Convert tokens to indexes/ids.

        Args:
            tokens: Containing tokens

        Returns:
            Containing IDs for each token
        """
        ids = []
        for token in tokens:
            id_ = self.vocab.get(token)
            if id_ is None:
                raise ValueError(f"Do not recognize token: {token}")
            else:
                ids.append(id_)
        return ids

    def ids_to_tokens(self, ids: List[int]) -> List[str]:
        """Convert Ids to tokens.

        Args:
            ids: Containing ids for each token

        Returns:
            Containing tokens
        """
        tokens = []
        for id_ in ids:
            token = self.decode_vocab.get(id_)
            if token is None:
                raise ValueError(f"Do not recognize ID: {id_}")
            tokens.append(token)
        return tokens

    def text_to_ids(self, text: str) -> List[int]:
        """Converts text to ids.

        Args:
            text (str): String containing text to convert

        Returns:
            (List[int]): Ids corresponding to the tokenization
            of the text
        """
        tokens = self.text_to_tokens(text)
        return self.tokens_to_ids(tokens)

    def ids_to_text(self, ids: List[int]) -> str:  # noqa: D102
        tokens = self.ids_to_tokens(ids)
        return self.tokens_to_text(tokens)

    def build_vocab(self, strings: Union[str, Iterable[str]]) -> "Label2IDTokenizer":
        """Builds the vocabulary of the tokenizer from strings.

        Args:
            strings: (Union[str, Iterable[str]]): Strings to
                build the vocabulary with. If a string is supplied,
                then the vocabulary is built from the single string.
                Otherwise, the vocabulary is progressively built
                from all the strings in `strings`.
        """
        if isinstance(strings, str):
            strings = [strings]
        for string in strings:
            for token in string:
                if token not in self.vocab:
                    self.vocab[token] = len(self.vocab)
                    self.decode_vocab[self.vocab[token]] = token
        return self
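A minimal end-to-end sketch of the workflow above, using only the methods shown in this class (the import path is assumed from the source location given; the secondary-structure labels are illustrative):

from bionemo.llm.data.label2id_tokenizer import Label2IDTokenizer  # assumed import path

# Build the vocabulary from the label strings themselves.
tokenizer = Label2IDTokenizer().build_vocab(["CHE", "CCC", "EHH"])
print(tokenizer.vocab_size)  # 3 -- the distinct characters C, H, E

# Round-trip a label string through ids and back.
ids = tokenizer.text_to_ids("HEC")  # [1, 2, 0], since ids follow first-seen order
print(tokenizer.ids_to_text(ids))   # "HEC"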
build_vocab(strings)
Builds the vocabulary of the tokenizer from strings.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| strings | Union[str, Iterable[str]] | Strings to build the vocabulary with. If a string is supplied, then the vocabulary is built from the single string. Otherwise, the vocabulary is progressively built from all the strings in `strings`. | required |

Source code in bionemo/llm/data/label2id_tokenizer.py
def build_vocab(self, strings: Union[str, Iterable[str]]) -> "Label2IDTokenizer":
    """Builds the vocabulary of the tokenizer from strings.

    Args:
        strings: (Union[str, Iterable[str]]): Strings to
            build the vocabulary with. If a string is supplied,
            then the vocabulary is built from the single string.
            Otherwise, the vocabulary is progressively built
            from all the strings in `strings`.
    """
    if isinstance(strings, str):
        strings = [strings]
    for string in strings:
        for token in string:
            if token not in self.vocab:
                self.vocab[token] = len(self.vocab)
                self.decode_vocab[self.vocab[token]] = token
    return self
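A short sketch of the two input modes described above (label strings are illustrative):

tokenizer = Label2IDTokenizer()

# A single string: the vocabulary is built from its characters.
tokenizer.build_vocab("CHE")
assert tokenizer.vocab == {"C": 0, "H": 1, "E": 2}

# An iterable of strings: the vocabulary grows progressively,
# and already-seen characters keep their original ids.
tokenizer.build_vocab(["CCC", "EHX"])
assert tokenizer.vocab == {"C": 0, "H": 1, "E": 2, "X": 3}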
ids_to_tokens(ids)
Convert ids to tokens.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| ids | List[int] | Containing ids for each token | required |

Returns: The tokens corresponding to the given ids.

Source code in bionemo/llm/data/label2id_tokenizer.py
def ids_to_tokens(self, ids: List[int]) -> List[str]:
    """Convert Ids to tokens.

    Args:
        ids: Containing ids for each token

    Returns:
        Containing tokens
    """
    tokens = []
    for id_ in ids:
        token = self.decode_vocab.get(id_)
        if token is None:
            raise ValueError(f"Do not recognize ID: {id_}")
        tokens.append(token)
    return tokens
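A quick sketch of the lookup and its error path (vocabulary and ids are illustrative):

tokenizer = Label2IDTokenizer().build_vocab("CHE")  # C -> 0, H -> 1, E -> 2

print(tokenizer.ids_to_tokens([2, 0, 1]))  # ['E', 'C', 'H']

# Ids missing from the decode vocabulary raise a ValueError.
try:
    tokenizer.ids_to_tokens([7])
except ValueError as err:
    print(err)  # Do not recognize ID: 7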
text_to_ids(text)
Converts text to ids.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| text | str | String containing text to convert | required |

Returns: List[int]: Ids corresponding to the tokenization of the text.

Source code in bionemo/llm/data/label2id_tokenizer.py
def text_to_ids(self, text: str) -> List[int]:
    """Converts text to ids.

    Args:
        text (str): String containing text to convert

    Returns:
        (List[int]): Ids corresponding to the tokenization
        of the text
    """
    tokens = self.text_to_tokens(text)
    return self.tokens_to_ids(tokens)
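As the body shows, text_to_ids is simply text_to_tokens (a character split) followed by tokens_to_ids (a strict vocabulary lookup). A sketch (labels are illustrative):

tokenizer = Label2IDTokenizer().build_vocab("CHE")

print(tokenizer.text_to_tokens("EEC"))  # ['E', 'E', 'C']
print(tokenizer.text_to_ids("EEC"))     # [2, 2, 0]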
tokens_to_ids(tokens)
Convert tokens to indexes/ids.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| tokens | List[str] | Containing tokens | required |

Returns: The IDs for each token.

Source code in bionemo/llm/data/label2id_tokenizer.py
def tokens_to_ids(self, tokens: List[str]) -> List[int]:
    """Convert tokens to indexes/ids.

    Args:
        tokens: Containing tokens

    Returns:
        Containing IDs for each token
    """
    ids = []
    for token in tokens:
        id_ = self.vocab.get(token)
        if id_ is None:
            raise ValueError(f"Do not recognize token: {token}")
        else:
            ids.append(id_)
    return ids
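The lookup is strict: a token missing from the vocabulary raises a ValueError instead of mapping to an unknown id. A sketch (vocabulary is illustrative):

tokenizer = Label2IDTokenizer().build_vocab("CHE")

print(tokenizer.tokens_to_ids(["H", "H", "E"]))  # [1, 1, 2]

try:
    tokenizer.tokens_to_ids(["Z"])
except ValueError as err:
    print(err)  # Do not recognize token: Z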