
Finetune Regressor

ESM2FineTuneSeqConfig dataclass

Bases: ESM2GenericConfig[ESM2FineTuneSeqModel, RegressorLossReduction], IOMixinWithGettersSetters

ESM2FineTuneSeqConfig is a dataclass that is used to configure the model.

Timers from ModelParallelConfig are required for Megatron forward compatibility.

Source code in bionemo/esm2/model/finetune/finetune_regressor.py
@dataclass
class ESM2FineTuneSeqConfig(
    ESM2GenericConfig[ESM2FineTuneSeqModel, RegressorLossReduction], iom.IOMixinWithGettersSetters
):
    """ExampleConfig is a dataclass that is used to configure the model.

    Timers from ModelParallelConfig are required for megatron forward compatibility.
    """

    model_cls: Type[ESM2FineTuneSeqModel] = ESM2FineTuneSeqModel
    # The typical case is fine-tuning the base BioBERT model, which does not have this head. If you are instead loading
    # a checkpoint that already has this head and want to keep using its weights, drop the next line or set it to [].
    initial_ckpt_skip_keys_with_these_prefixes: List[str] = field(default_factory=lambda: ["regression_head"])

    encoder_frozen: bool = True  # freeze encoder parameters
    ft_dropout: float = 0.25  # MLP layer dropout

    def get_loss_reduction_class(self) -> Type[RegressorLossReduction]:
        """Returns RegressorLossReduction class."""
        return RegressorLossReduction
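
As a quick illustration (a sketch, not part of the module source; the import path is assumed from the file location above), the config can be constructed directly and its fields overridden:

from bionemo.esm2.model.finetune.finetune_regressor import ESM2FineTuneSeqConfig  # assumed import path

# Default behavior: freeze the ESM2 encoder and train only the regression head.
config = ESM2FineTuneSeqConfig(
    encoder_frozen=True,  # freeze encoder parameters
    ft_dropout=0.25,      # dropout used inside MegatronMLPHead
)

# When resuming from a checkpoint that already contains a trained regression head,
# clear the skip list so the head weights are loaded instead of re-initialized:
config.initial_ckpt_skip_keys_with_these_prefixes = []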

get_loss_reduction_class()

Returns the RegressorLossReduction class.

Source code in bionemo/esm2/model/finetune/finetune_regressor.py
def get_loss_reduction_class(self) -> Type[RegressorLossReduction]:
    """Returns RegressorLossReduction class."""
    return RegressorLossReduction

ESM2FineTuneSeqModel

Bases: ESM2Model

An ESM2 model that is suitable for fine-tuning on downstream tasks.

Source code in bionemo/esm2/model/finetune/finetune_regressor.py
class ESM2FineTuneSeqModel(ESM2Model):
    """ESM2 model that is suitable for fine-tuning on downstream tasks."""

    def __init__(self, config, *args, post_process: bool = True, include_embeddings: bool = False, **kwargs):
        """Constructs an instance of the ESM2 model suitable for fine-tuning."""
        super().__init__(config, *args, post_process=post_process, include_embeddings=True, **kwargs)

        # freeze encoder parameters
        if config.encoder_frozen:
            for _, param in self.named_parameters():
                param.requires_grad = False

        self.include_embeddings_finetuning = (
            include_embeddings  # this include_embeddings is for the final output of fine-tuning
        )
        # If post_process is True that means that we are at the last megatron parallelism stage and we can
        #   apply the head.
        if post_process:
            # if we are doing post process (eg pipeline last stage) then we need to add the output layers
            self.regression_head = MegatronMLPHead(config)

    def forward(self, *args, **kwargs) -> BioBertOutput | Tensor:
        """Inference."""
        output = super().forward(*args, **kwargs)
        # Stop early if we are not in post_process mode (for example if we are in the middle of model parallelism)
        if not self.post_process:
            return output  # we are not at the last pipeline stage so just return what the parent has
        # Double check that the output from the parent has everything we need to do prediction in this head.
        if not isinstance(output, dict) or "embeddings" not in output:
            raise ValueError(
                f"Expected to find 'embeddings' in the output, and output to be dictionary-like, found {output},\n"
                "Make sure include_embeddings=True in the call to super().__init__"
            )
        # Get the embeddings from the parent output, and pull out the [CLS] token for this task
        embeddings: Tensor = output["embeddings"]
        # Predict our 1d regression target
        regression_output = self.regression_head(embeddings)
        if not self.include_embeddings_finetuning:
            del output["embeddings"]
        output["regression_output"] = regression_output
        return output

__init__(config, *args, post_process=True, include_embeddings=False, **kwargs)

Constructs an instance of the ESM2 model suitable for fine-tuning.

Source code in bionemo/esm2/model/finetune/finetune_regressor.py
def __init__(self, config, *args, post_process: bool = True, include_embeddings: bool = False, **kwargs):
    """Constructs an instance of the ESM2 model suitable for fine-tuning."""
    super().__init__(config, *args, post_process=post_process, include_embeddings=True, **kwargs)

    # freeze encoder parameters
    if config.encoder_frozen:
        for _, param in self.named_parameters():
            param.requires_grad = False

    self.include_embeddings_finetuning = (
        include_embeddings  # this include_embeddings is for the final output of fine-tuning
    )
    # If post_process is True that means that we are at the last megatron parallelism stage and we can
    #   apply the head.
    if post_process:
        # if we are doing post process (eg pipeline last stage) then we need to add the output layers
        self.regression_head = MegatronMLPHead(config)

forward(*args, **kwargs)

Inference.

Source code in bionemo/esm2/model/finetune/finetune_regressor.py
def forward(self, *args, **kwargs) -> BioBertOutput | Tensor:
    """Inference."""
    output = super().forward(*args, **kwargs)
    # Stop early if we are not in post_process mode (for example if we are in the middle of model parallelism)
    if not self.post_process:
        return output  # we are not at the last pipeline stage so just return what the parent has
    # Double check that the output from the parent has everything we need to do prediction in this head.
    if not isinstance(output, dict) or "embeddings" not in output:
        raise ValueError(
            f"Expected to find 'embeddings' in the output, and output to be dictionary-like, found {output},\n"
            "Make sure include_embeddings=True in the call to super().__init__"
        )
    # Get the embeddings from the parent output, and pull out the [CLS] token for this task
    embeddings: Tensor = output["embeddings"]
    # Predict our 1d regression target
    regression_output = self.regression_head(embeddings)
    if not self.include_embeddings_finetuning:
        del output["embeddings"]
    output["regression_output"] = regression_output
    return output
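
To make the output contract concrete, here is a self-contained sketch of the post-processing above, using dummy tensors and a stand-in linear head (none of these values come from the source):

import torch

output = {"embeddings": torch.randn(4, 320)}  # parent output: [batch, hidden] pooled embeddings
head = torch.nn.Linear(320, 1)                # stand-in for MegatronMLPHead
output["regression_output"] = head(output["embeddings"])  # [batch, 1]
include_embeddings_finetuning = False
if not include_embeddings_finetuning:
    del output["embeddings"]  # drop embeddings unless the caller asked to keep them
print(sorted(output))  # ['regression_output']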

InMemorySingleValueDataset

Bases: Dataset

An in-memory dataset that tokenizes strings into BertSample instances.

Source code in bionemo/esm2/model/finetune/finetune_regressor.py
class InMemorySingleValueDataset(Dataset):
    """An in-memory dataset that tokenizes strings into BertSample instances."""

    def __init__(
        self,
        data: Sequence[Tuple[str, float]],
        tokenizer: tokenizer.BioNeMoESMTokenizer = tokenizer.get_tokenizer(),
        seed: int = np.random.SeedSequence().entropy,  # type: ignore
    ):
        """Initializes a dataset for single-value regression fine-tuning.

        This is an in-memory dataset that does not apply masking to the sequence.

        Args:
            data (Sequence[Tuple[str, float]]): A sequence of tuples containing the sequence and target data.
            tokenizer (tokenizer.BioNeMoESMTokenizer, optional): The tokenizer to use. Defaults to tokenizer.get_tokenizer().
            seed: Random seed for reproducibility. This seed is mixed with the index of the sample to retrieve to ensure
                that __getitem__ is deterministic, but can be random across different runs. If None, a random seed is
                generated.
        """
        self.data = data
        self.seed = seed
        self._len = len(self.data)
        self.tokenizer = tokenizer

    def __len__(self) -> int:
        """The size of the dataset."""
        return self._len

    def __getitem__(self, index: int) -> BertSample:
        """Obtains the BertSample at the given index."""
        sequence, target = self.data[index]
        tokenized_sequence = self._tokenize(sequence)
        # Loss mask that is True for every non-special token; no MLM-style masking is applied in this dataset
        loss_mask = ~torch.isin(tokenized_sequence, Tensor(self.tokenizer.all_special_ids))

        return {
            "text": tokenized_sequence,
            "types": torch.zeros_like(tokenized_sequence, dtype=torch.int64),
            "attention_mask": torch.ones_like(tokenized_sequence, dtype=torch.int64),
            "labels": torch.tensor([target], dtype=torch.float),
            "loss_mask": loss_mask,
            "is_random": torch.zeros_like(tokenized_sequence, dtype=torch.int64),
        }

    def _tokenize(self, sequence: str) -> Tensor:
        """Tokenize a protein sequence.

        Args:
            sequence: The protein sequence.

        Returns:
            The tokenized sequence.
        """
        tensor = self.tokenizer.encode(sequence, add_special_tokens=True, return_tensors="pt")
        return tensor.flatten()  # type: ignore
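
A minimal usage sketch (the sequences and targets below are made-up values; the import path is assumed from the file location above):

from bionemo.esm2.model.finetune.finetune_regressor import InMemorySingleValueDataset  # assumed import path

# Each item is a (protein sequence, scalar regression target) pair.
data = [
    ("MKTAYIAKQRQISFVKSHFSRQLEERLGLIEVQ", 0.75),
    ("MSATTSVDGKQLAEKLN", -1.20),
]
dataset = InMemorySingleValueDataset(data)

sample = dataset[0]
sample["text"]    # 1-D LongTensor of token ids, special tokens included
sample["labels"]  # tensor([0.7500]), the regression target
len(dataset)      # 2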

__getitem__(index)

Obtains the BertSample at the given index.

Source code in bionemo/esm2/model/finetune/finetune_regressor.py
def __getitem__(self, index: int) -> BertSample:
    """Obtains the BertSample at the given index."""
    sequence, target = self.data[index]
    tokenized_sequence = self._tokenize(sequence)
    # Loss mask that is True for every non-special token; no MLM-style masking is applied in this dataset
    loss_mask = ~torch.isin(tokenized_sequence, Tensor(self.tokenizer.all_special_ids))

    return {
        "text": tokenized_sequence,
        "types": torch.zeros_like(tokenized_sequence, dtype=torch.int64),
        "attention_mask": torch.ones_like(tokenized_sequence, dtype=torch.int64),
        "labels": torch.tensor([target], dtype=torch.float),
        "loss_mask": loss_mask,
        "is_random": torch.zeros_like(tokenized_sequence, dtype=torch.int64),
    }

__init__(data, tokenizer=tokenizer.get_tokenizer(), seed=np.random.SeedSequence().entropy)

Initializes a dataset for single-value regression fine-tuning.

This is an in-memory dataset that does not apply masking to the sequence.

Parameters:

data (Sequence[Tuple[str, float]]): A sequence of tuples containing the sequence and target data. Required.
tokenizer (BioNeMoESMTokenizer, optional): The tokenizer to use. Defaults to tokenizer.get_tokenizer().
seed (int, optional): Random seed for reproducibility. This seed is mixed with the index of the sample to retrieve to ensure that __getitem__ is deterministic, but can be random across different runs. If None, a random seed is generated. Defaults to np.random.SeedSequence().entropy.
Source code in bionemo/esm2/model/finetune/finetune_regressor.py
def __init__(
    self,
    data: Sequence[Tuple[str, float]],
    tokenizer: tokenizer.BioNeMoESMTokenizer = tokenizer.get_tokenizer(),
    seed: int = np.random.SeedSequence().entropy,  # type: ignore
):
    """Initializes a dataset for single-value regression fine-tuning.

    This is an in-memory dataset that does not apply masking to the sequence.

    Args:
        data (Sequence[Tuple[str, float]]): A sequence of tuples containing the sequence and target data.
        tokenizer (tokenizer.BioNeMoESMTokenizer, optional): The tokenizer to use. Defaults to tokenizer.get_tokenizer().
        seed: Random seed for reproducibility. This seed is mixed with the index of the sample to retrieve to ensure
            that __getitem__ is deterministic, but can be random across different runs. If None, a random seed is
            generated.
    """
    self.data = data
    self.seed = seed
    self._len = len(self.data)
    self.tokenizer = tokenizer

__len__()

The size of the dataset.

Source code in bionemo/esm2/model/finetune/finetune_regressor.py
def __len__(self) -> int:
    """The size of the dataset."""
    return self._len

MegatronMLPHead

Bases: MegatronModule

An MLP class for sequence-level regression.

Source code in bionemo/esm2/model/finetune/finetune_regressor.py
class MegatronMLPHead(MegatronModule):
    """An MLP class for sequence-level regression."""

    def __init__(self, config: TransformerConfig):
        """Constructor."""
        super().__init__(config)

        layer_sizes = [config.hidden_size, 256, 1]
        self.linear_layers = torch.nn.ModuleList(
            [torch.nn.Linear(i, o) for i, o in zip(layer_sizes[:-1], layer_sizes[1:])]  # noqa: RUF007
        )
        self.act = torch.nn.ReLU()
        self.dropout = torch.nn.Dropout(p=config.ft_dropout)

    def forward(self, hidden_states: Tensor) -> Tensor:
        """Inference."""
        # [b, s, h]
        for layer in self.linear_layers[:-1]:
            hidden_states = self.dropout(self.act(layer(hidden_states)))

        output = self.linear_layers[-1](hidden_states)
        return output
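
For shape intuition only, a plain-PyTorch equivalent of this head (a sketch, not the Megatron module itself; hidden_size=320 is an assumed example value):

import torch

hidden_size, ft_dropout = 320, 0.25
# Mirrors layer_sizes = [hidden_size, 256, 1]: Linear -> ReLU -> Dropout -> Linear
head = torch.nn.Sequential(
    torch.nn.Linear(hidden_size, 256),
    torch.nn.ReLU(),
    torch.nn.Dropout(p=ft_dropout),
    torch.nn.Linear(256, 1),
)
pooled = torch.randn(4, hidden_size)  # one pooled embedding per sequence in a batch of 4
print(head(pooled).shape)             # torch.Size([4, 1])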

__init__(config)

Constructor.

Source code in bionemo/esm2/model/finetune/finetune_regressor.py
def __init__(self, config: TransformerConfig):
    """Constructor."""
    super().__init__(config)

    layer_sizes = [config.hidden_size, 256, 1]
    self.linear_layers = torch.nn.ModuleList(
        [torch.nn.Linear(i, o) for i, o in zip(layer_sizes[:-1], layer_sizes[1:])]  # noqa: RUF007
    )
    self.act = torch.nn.ReLU()
    self.dropout = torch.nn.Dropout(p=config.ft_dropout)

forward(hidden_states)

Inference.

Source code in bionemo/esm2/model/finetune/finetune_regressor.py
def forward(self, hidden_states: Tensor) -> Tensor:
    """Inference."""
    # [b, s, h]
    for layer in self.linear_layers[:-1]:
        hidden_states = self.dropout(self.act(layer(hidden_states)))

    output = self.linear_layers[-1](hidden_states)
    return output

RegressorLossReduction

Bases: BERTMLMLossWithReduction

A class for calculating the MSE loss of regression output.

This class is used for calculating the loss, and for logging the reduced loss across micro-batches.

Source code in bionemo/esm2/model/finetune/finetune_regressor.py
class RegressorLossReduction(BERTMLMLossWithReduction):
    """A class for calculating the MSE loss of regression output.

    This class is used for calculating the loss, and for logging the reduced loss across micro-batches.
    """

    def forward(
        self, batch: Dict[str, Tensor], forward_out: Dict[str, Tensor]
    ) -> Tuple[Tensor, PerTokenLossDict | SameSizeLossDict]:
        """Calculates the loss within a micro-batch. A micro-batch is a batch of data on a single GPU.

        Args:
            batch: A batch of data that gets passed to the original forward inside LitAutoEncoder.
            forward_out: the output of the forward method inside classification head.

        Returns:
            A tuple containing [<loss_tensor>, ReductionT] where the loss tensor will be used for
                backpropagation and the ReductionT will be passed to the reduce method
                (which currently only works for logging.).
        """
        regression_output = forward_out["regression_output"]
        targets = batch["labels"].to(dtype=regression_output.dtype)  # [b, 1]

        cp_size = parallel_state.get_context_parallel_world_size()
        if cp_size == 1:
            loss = torch.nn.functional.mse_loss(regression_output, targets)
        else:  # TODO: support CP with masked_token_loss_context_parallel
            raise NotImplementedError("Context Parallel support is not implemented for this loss")

        return loss, {"avg": loss}

    def reduce(self, losses_reduced_per_micro_batch: Sequence[SameSizeLossDict]) -> Tensor:
        """Works across micro-batches. (data on single gpu).

        Note: This currently only works for logging and this loss will not be used for backpropagation.

        Args:
            losses_reduced_per_micro_batch: a list of the outputs of forward

        Returns:
            A tensor that is the mean of the losses. (used for logging).
        """
        losses = torch.stack([loss["avg"] for loss in losses_reduced_per_micro_batch])
        return losses.mean()
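
The per-micro-batch loss is a plain mean-squared error between the head output and the targets; a self-contained numeric sketch with dummy values:

import torch

regression_output = torch.tensor([[0.8], [1.1]])  # [b, 1] head predictions
targets = torch.tensor([[1.0], [1.0]])            # [b, 1] labels, cast to the prediction dtype
loss = torch.nn.functional.mse_loss(regression_output, targets)
print(loss)  # tensor(0.0250) == (0.2**2 + 0.1**2) / 2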

forward(batch, forward_out)

Calculates the loss within a micro-batch. A micro-batch is a batch of data on a single GPU.

Parameters:

batch (Dict[str, Tensor]): A batch of data that gets passed to the original forward inside LitAutoEncoder. Required.
forward_out (Dict[str, Tensor]): The output of the forward method inside the classification head. Required.

Returns:

Tuple[Tensor, PerTokenLossDict | SameSizeLossDict]: A tuple containing [<loss_tensor>, ReductionT], where the loss tensor will be used for backpropagation and the ReductionT will be passed to the reduce method (which currently only works for logging).

Source code in bionemo/esm2/model/finetune/finetune_regressor.py
def forward(
    self, batch: Dict[str, Tensor], forward_out: Dict[str, Tensor]
) -> Tuple[Tensor, PerTokenLossDict | SameSizeLossDict]:
    """Calculates the loss within a micro-batch. A micro-batch is a batch of data on a single GPU.

    Args:
        batch: A batch of data that gets passed to the original forward inside LitAutoEncoder.
        forward_out: the output of the forward method inside classification head.

    Returns:
        A tuple containing [<loss_tensor>, ReductionT] where the loss tensor will be used for
            backpropagation and the ReductionT will be passed to the reduce method
            (which currently only works for logging.).
    """
    regression_output = forward_out["regression_output"]
    targets = batch["labels"].to(dtype=regression_output.dtype)  # [b, 1]

    cp_size = parallel_state.get_context_parallel_world_size()
    if cp_size == 1:
        loss = torch.nn.functional.mse_loss(regression_output, targets)
    else:  # TODO: support CP with masked_token_loss_context_parallel
        raise NotImplementedError("Context Parallel support is not implemented for this loss")

    return loss, {"avg": loss}

reduce(losses_reduced_per_micro_batch)

Works across micro-batches (data on a single GPU).

Note: this currently only works for logging, and this loss will not be used for backpropagation.

Parameters:

losses_reduced_per_micro_batch (Sequence[SameSizeLossDict]): A list of the outputs of forward. Required.

Returns:

Tensor: A tensor that is the mean of the losses (used for logging).

Source code in bionemo/esm2/model/finetune/finetune_regressor.py
def reduce(self, losses_reduced_per_micro_batch: Sequence[SameSizeLossDict]) -> Tensor:
    """Works across micro-batches. (data on single gpu).

    Note: This currently only works for logging and this loss will not be used for backpropagation.

    Args:
        losses_reduced_per_micro_batch: a list of the outputs of forward

    Returns:
        A tensor that is the mean of the losses. (used for logging).
    """
    losses = torch.stack([loss["avg"] for loss in losses_reduced_per_micro_batch])
    return losses.mean()
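
reduce() simply averages the per-micro-batch "avg" entries; a dummy-value sketch:

import torch

losses_reduced_per_micro_batch = [{"avg": torch.tensor(0.40)}, {"avg": torch.tensor(0.60)}]
mean_loss = torch.stack([d["avg"] for d in losses_reduced_per_micro_batch]).mean()
print(mean_loss)  # tensor(0.5000)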