跳到内容

Label2id 标记器

Label2IDTokenizer

基类:TokenizerSpec

初始化简单的字符标记器。

旨在用于提取分类模型的类别标签,例如二级结构预测模型,其中每个类别都用字符编码(例如“C”、“H”、“E”)

示例

>>> tokenizer = Label2IDTokenizer()
>>> seqs = ['CHE', 'CCC', 'EHH']
>>> tokenizer = tokenizer.build_vocab(seqs)
源代码位于 bionemo/llm/data/label2id_tokenizer.py
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
class Label2IDTokenizer(TokenizerSpec):
    """Simple character tokenizer mapping each label character to an integer id.

    Intended to be used for extracting class labels
    for classification models such as secondary
    structure prediction model, where each class is
    encoded with a character (ex. "C", "H", "E")

    Examples:
            >>> tokenizer = Label2IDTokenizer()
            >>> seqs = ['CHE', 'CCC', 'EHH']
            >>> tokenizer = tokenizer.build_vocab(seqs)

    """

    def __init__(self) -> None:
        """Initialize with empty vocabularies; populate them via `build_vocab`."""
        super().__init__()
        # token -> id mapping, filled incrementally by build_vocab.
        self.vocab: Dict[str, int] = {}
        # id -> token reverse mapping, kept in sync with `vocab`.
        # (The original built this by iterating the just-created empty dict;
        # a literal empty dict is equivalent and clearer.)
        self.decode_vocab: Dict[int, str] = {}

    @property
    def vocab_size(self) -> int:
        """Return the number of distinct tokens currently in the vocabulary."""
        return len(self.vocab)

    def text_to_tokens(self, text: str) -> List[str]:
        """Split ``text`` into single-character tokens."""
        return list(text)

    def tokens_to_text(self, tokens: List[str]) -> str:
        """Concatenate ``tokens`` back into a single string."""
        return "".join(tokens)

    def tokens_to_ids(self, tokens: List[str]) -> List[int]:
        """Convert tokens to indexes/ids.

        Args:
            tokens: Containing tokens

        Returns:
            Containing ID's for each token

        Raises:
            ValueError: If a token is not present in the vocabulary.
        """
        ids = []
        for token in tokens:
            id_ = self.vocab.get(token)
            if id_ is None:
                raise ValueError(f"Do not recognize token: {token}")
            ids.append(id_)
        return ids

    def ids_to_tokens(self, ids: List[int]) -> List[str]:
        """Convert Ids to tokens.

        Args:
            ids: Containing ids for each token

        Returns:
            Containing tokens

        Raises:
            ValueError: If an id is not present in the decode vocabulary.
        """
        tokens = []
        for id_ in ids:
            token = self.decode_vocab.get(id_)
            if token is None:
                raise ValueError(f"Do not recognize ID: {id_}")
            tokens.append(token)
        return tokens

    def text_to_ids(self, text: str) -> List[int]:
        """Converts text to ids.

        Args:
            text (str): String containing text to convert

        Returns:
            (List[int]): Id's corresponding to the tokenization
            of the text
        """
        tokens = self.text_to_tokens(text)
        return self.tokens_to_ids(tokens)

    def ids_to_text(self, ids: List[int]) -> str:
        """Convert ids back into the text they tokenize."""
        tokens = self.ids_to_tokens(ids)
        return self.tokens_to_text(tokens)

    def build_vocab(self, strings: Union[str, Iterable[str]]) -> "Label2IDTokenizer":
        """Builds the vocabulary of the tokenizer from strings.

        Args:
            strings: (Union[str, Iterable[str]]): Strings to
                build the vocabulary with. If a string is supplied,
                then the vocabulary is built from the single string.
                Otherwise, the vocabulary is progressively built
                from all the strings in `strings`.

        Returns:
            This tokenizer, to allow call chaining.
        """
        if isinstance(strings, str):
            strings = [strings]

        for string in strings:
            for token in string:
                if token not in self.vocab:
                    # Assign the next free id and keep the reverse map in sync.
                    self.vocab[token] = len(self.vocab)
                    self.decode_vocab[self.vocab[token]] = token

        return self

vocab_size: int property

返回正在使用的词汇表的大小。

build_vocab(strings)

从字符串构建标记器的词汇表 参数: strings: (Union[str, Iterable[str]]): 用于构建词汇表的字符串。如果提供一个字符串,则从单个字符串构建词汇表。否则,词汇表将从 strings 中的所有字符串逐步构建。

源代码位于 bionemo/llm/data/label2id_tokenizer.py
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
def build_vocab(self, strings: Union[str, Iterable[str]]) -> "Label2IDTokenizer":
    """Build the tokenizer's vocabulary from one or more strings.

    Args:
        strings: Either a single string or an iterable of strings.
            Each previously-unseen character is assigned the next free
            integer id, and the reverse (id -> token) map is kept in sync.

    Returns:
        This tokenizer, enabling call chaining.
    """
    corpus = [strings] if isinstance(strings, str) else strings

    for text in corpus:
        for character in text:
            if character in self.vocab:
                continue
            next_id = len(self.vocab)
            self.vocab[character] = next_id
            self.decode_vocab[next_id] = character

    return self

ids_to_tokens(ids)

将 ID 转换为令牌。

参数

名称 类型 描述 默认值
ids List[int]

包含每个令牌的 ID

必需

返回:包含令牌

源代码位于 bionemo/llm/data/label2id_tokenizer.py
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
def ids_to_tokens(self, ids: List[int]) -> List[str]:
    """Convert ids back into their corresponding tokens.

    Args:
        ids: Ids, one per token.

    Returns:
        The token corresponding to each id.

    Raises:
        ValueError: If an id is missing from the decode vocabulary.
    """
    decoded: List[str] = []
    for index in ids:
        match = self.decode_vocab.get(index)
        if match is None:
            raise ValueError(f"Do not recognize ID: {index}")
        decoded.append(match)
    return decoded

text_to_ids(text)

将文本转换为 ID。

参数

名称 类型 描述 默认值
text str

包含要转换的文本的字符串

必需

返回: (List[int]): 对应于文本令牌化的 ID

源代码位于 bionemo/llm/data/label2id_tokenizer.py
89
90
91
92
93
94
95
96
97
98
99
def text_to_ids(self, text: str) -> List[int]:
    """Convert text to its id sequence.

    Args:
        text (str): String containing text to convert.

    Returns:
        (List[int]): Ids corresponding to the tokenization of the text.
    """
    return self.tokens_to_ids(self.text_to_tokens(text))

tokens_to_ids(tokens)

将令牌转换为索引/ID。

参数

名称 类型 描述 默认值
tokens List[str]

包含令牌

必需

返回: 包含每个令牌的 ID

源代码位于 bionemo/llm/data/label2id_tokenizer.py
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
def tokens_to_ids(self, tokens: List[str]) -> List[int]:
    """Convert tokens to indexes/ids.

    Args:
        tokens: Tokens to look up.

    Returns:
        The id for each token.

    Raises:
        ValueError: If a token is not present in the vocabulary.
    """
    ids = []
    for token in tokens:
        id_ = self.vocab.get(token)
        if id_ is None:
            raise ValueError(f"Do not recognize token: {token}")
        # No `else` needed: the raise above already exits the loop body.
        ids.append(id_)
    return ids