
Lr Scheduler

SchedulerOutput

Bases: TypedDict

Output of the scheduler method.

Source code in bionemo/llm/model/lr_scheduler.py
class SchedulerOutput(TypedDict):
    """Output of the scheduler method."""

    optimizer: MegatronOptimizerModule
    lr_scheduler: dict
    monitor: str
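
A rough sketch of the dictionary shape this TypedDict describes, using a throwaway torch optimizer and scheduler purely for illustration (in BioNeMo the optimizer is a MegatronOptimizerModule and the scheduler is typically a WarmupAnnealDecayHold, documented below):

import torch
from torch.optim.lr_scheduler import LambdaLR

# Illustration only: placeholder optimizer and scheduler to show the expected keys.
opt = torch.optim.AdamW([torch.nn.Parameter(torch.zeros(1))], lr=4e-4)
sched = LambdaLR(opt, lambda step: 1.0)

output = {
    "optimizer": opt,
    "lr_scheduler": {"scheduler": sched, "interval": "step", "frequency": 1},
    "monitor": "val_loss",
}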

WarmupAnnealDecayHold

Bases: _LRScheduler

Warmup Anneal Decay Hold learning rate scheduler.

Source code in bionemo/llm/model/lr_scheduler.py
class WarmupAnnealDecayHold(_LRScheduler):
    """Warmup Anneal Decay Hold learning rate scheduler."""

    def __init__(
        self,
        optimizer: MegatronOptimizerModule,
        *,
        warmup_steps: Optional[int] = None,
        max_steps: Optional[int] = None,
        max_lr: Optional[float] = None,
        min_lr: float = 4e-5,
        anneal_percentage: float = 0.10,
        last_epoch: int = -1,
    ) -> None:
        """Initializes the WarmupAnnealDecayHold learning rate scheduler.

        Args:
            optimizer: Optimizer to apply the learning rate scheduler.
            warmup_steps (int): Number of steps for the linear warm-up.
            max_steps (int): Total number of training steps.
            max_lr (float): Peak learning rate to be achieved after warm-up.
            min_lr (float): Minimum learning rate.
            anneal_percentage (float): Percentage of the max_lr to hold after decay.
            last_epoch (int): The index of the last epoch.
        """
        self.warmup_steps = warmup_steps
        self.max_steps = max_steps
        self.max_lr = max_lr
        self.min_lr = min_lr
        self.anneal_percentage = anneal_percentage
        self.last_epoch = last_epoch

        for group in optimizer.param_groups:
            group.setdefault("initial_lr", max_lr)

        super(WarmupAnnealDecayHold, self).__init__(optimizer, last_epoch)

    def get_lr(self) -> List[float]:
        """Get the learning rate at the current step."""
        step_num = self.last_epoch
        if step_num < self.warmup_steps:
            lr = self.min_lr + (self.max_lr - self.min_lr) * step_num / self.warmup_steps
        else:
            decay_steps = self.max_steps - self.warmup_steps
            lr = self.max_lr * (1 - (step_num - self.warmup_steps) / decay_steps)
            lr = max(lr, self.max_lr * self.anneal_percentage)

        return [lr for _ in self.optimizer.param_groups]
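
A minimal usage sketch. It assumes a plain torch.optim optimizer can stand in for the MegatronOptimizerModule annotation, since the scheduler only relies on the standard param_groups / _LRScheduler interface, and that the class is importable from bionemo.llm.model.lr_scheduler as the source path above suggests:

import torch
from bionemo.llm.model.lr_scheduler import WarmupAnnealDecayHold

# Toy parameter and optimizer purely for illustration.
param = torch.nn.Parameter(torch.zeros(8))
optimizer = torch.optim.AdamW([param], lr=4e-4)

scheduler = WarmupAnnealDecayHold(
    optimizer,
    warmup_steps=100,
    max_steps=1_000,
    max_lr=4e-4,
    min_lr=4e-5,
    anneal_percentage=0.10,
)

for step in range(1_000):
    optimizer.step()
    scheduler.step()  # linear warmup to max_lr, then linear decay, floored at max_lr * anneal_percentage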

__init__(optimizer, *, warmup_steps=None, max_steps=None, max_lr=None, min_lr=4e-05, anneal_percentage=0.1, last_epoch=-1)

Initializes the WarmupAnnealDecayHold learning rate scheduler.

Parameters

Name               Type                     Description                                        Default
optimizer          MegatronOptimizerModule  Optimizer to apply the learning rate scheduler.    required
warmup_steps       int                      Number of steps for the linear warm-up.            None
max_steps          int                      Total number of training steps.                    None
max_lr             float                    Peak learning rate to be achieved after warm-up.   None
min_lr             float                    Minimum learning rate.                             4e-05
anneal_percentage  float                    Percentage of the max_lr to hold after decay.      0.1
last_epoch         int                      The index of the last epoch.                       -1
Source code in bionemo/llm/model/lr_scheduler.py
def __init__(
    self,
    optimizer: MegatronOptimizerModule,
    *,
    warmup_steps: Optional[int] = None,
    max_steps: Optional[int] = None,
    max_lr: Optional[float] = None,
    min_lr: float = 4e-5,
    anneal_percentage: float = 0.10,
    last_epoch: int = -1,
) -> None:
    """Initializes the WarmupAnnealDecayHold learning rate scheduler.

    Args:
        optimizer: Optimizer to apply the learning rate scheduler.
        warmup_steps (int): Number of steps for the linear warm-up.
        max_steps (int): Total number of training steps.
        max_lr (float): Peak learning rate to be achieved after warm-up.
        min_lr (float): Minimum learning rate.
        anneal_percentage (float): Percentage of the max_lr to hold after decay.
        last_epoch (int): The index of the last epoch.
    """
    self.warmup_steps = warmup_steps
    self.max_steps = max_steps
    self.max_lr = max_lr
    self.min_lr = min_lr
    self.anneal_percentage = anneal_percentage
    self.last_epoch = last_epoch

    for group in optimizer.param_groups:
        group.setdefault("initial_lr", max_lr)

    super(WarmupAnnealDecayHold, self).__init__(optimizer, last_epoch)
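
One construction-time detail worth noting from the loop above: every param group is seeded with initial_lr = max_lr before the base class initializer runs, while the actual lr starts from the warmup value. A quick check, again with a throwaway torch optimizer standing in for a MegatronOptimizerModule:

import torch
from bionemo.llm.model.lr_scheduler import WarmupAnnealDecayHold

opt = torch.optim.AdamW([torch.nn.Parameter(torch.zeros(1))], lr=1e-3)
sched = WarmupAnnealDecayHold(opt, warmup_steps=10, max_steps=100, max_lr=4e-4, min_lr=4e-5)

print(opt.param_groups[0]["initial_lr"])  # 0.0004 -- seeded from max_lr by the setdefault loop
print(opt.param_groups[0]["lr"])          # 4e-05  -- warmup starts at min_lr (step 0)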

get_lr()

Get the learning rate at the current step.

Source code in bionemo/llm/model/lr_scheduler.py
def get_lr(self) -> List[float]:
    """Get the learning rate at the current step."""
    step_num = self.last_epoch
    if step_num < self.warmup_steps:
        lr = self.min_lr + (self.max_lr - self.min_lr) * step_num / self.warmup_steps
    else:
        decay_steps = self.max_steps - self.warmup_steps
        lr = self.max_lr * (1 - (step_num - self.warmup_steps) / decay_steps)
        lr = max(lr, self.max_lr * self.anneal_percentage)

    return [lr for _ in self.optimizer.param_groups]
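
To make the piecewise formula above concrete, here is the same arithmetic written out standalone with illustrative values (warmup_steps=100, max_steps=1100, max_lr=4e-4, min_lr=4e-5, anneal_percentage=0.10): the rate ramps linearly from min_lr to max_lr over the warmup, then decays linearly and is clamped at max_lr * anneal_percentage for the rest of training:

warmup_steps, max_steps = 100, 1_100
max_lr, min_lr, anneal_percentage = 4e-4, 4e-5, 0.10

def lr_at(step: int) -> float:
    # Same arithmetic as WarmupAnnealDecayHold.get_lr, for a single param group.
    if step < warmup_steps:
        return min_lr + (max_lr - min_lr) * step / warmup_steps
    decay_steps = max_steps - warmup_steps
    lr = max_lr * (1 - (step - warmup_steps) / decay_steps)
    return max(lr, max_lr * anneal_percentage)

print(lr_at(0))      # ~4.0e-05  start of warmup
print(lr_at(50))     # ~2.2e-04  halfway through warmup
print(lr_at(100))    # ~4.0e-04  peak, warmup finished
print(lr_at(600))    # ~2.0e-04  halfway through the decay window
print(lr_at(1_100))  # ~4.0e-05  held at max_lr * anneal_percentage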

WarmupAnnealDecayHoldScheduler

Bases: LRSchedulerModule

Warmup Policy Learning Rate Scheduler.

Source code in bionemo/llm/model/lr_scheduler.py
class WarmupAnnealDecayHoldScheduler(LRSchedulerModule):
    """Warmup Policy Learning Rate Scheduler."""

    def __init__(
        self,
        warmup_steps: int = 2000,
        max_steps: int = 500_000,
        max_lr: float = 4e-4,
        min_lr: float = 4e-5,
        anneal_percentage: float = 0.10,
        interval: str = "step",
        frequency: int = 1,
        monitor: str = "val_loss",
    ) -> None:
        """Initializes the WarmupAnnealDecayHoldScheduler."""
        super().__init__()
        self.warmup_steps = warmup_steps
        self.max_steps = max_steps
        self.max_lr = max_lr
        self.min_lr = min_lr
        self.anneal_percentage = anneal_percentage
        self.interval = interval
        self.frequency = frequency
        self.monitor = monitor

    def scheduler(self, model: MegatronBioBertModel, optimizer: MegatronOptimizerModule) -> SchedulerOutput:
        """Returns the scheduler output."""
        lr_scheduler = WarmupAnnealDecayHold(
            optimizer,
            warmup_steps=self.warmup_steps,
            max_steps=self.max_steps,
            max_lr=self.max_lr,
            min_lr=self.min_lr,
            anneal_percentage=self.anneal_percentage,
        )
        return {
            "optimizer": optimizer,
            # REQUIRED: The scheduler instance
            "lr_scheduler": {
                "scheduler": lr_scheduler,
                # `interval` is the unit of the scheduler's step size, could also be 'step'.
                # 'epoch' updates the scheduler on epoch end whereas 'step'
                # updates it after an optimizer update.
                "interval": self.interval,
                # How many epochs/steps should pass between calls to
                # `scheduler.step()`. 1 corresponds to updating the learning
                # rate after every epoch/step.
                "frequency": self.frequency,
            },
            # Metric to monitor for schedulers like `ReduceLROnPlateau`
            "monitor": self.monitor,
        }
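
A minimal sketch of calling this module directly, assuming LRSchedulerModule can be constructed standalone and that a plain torch optimizer can stand in for the MegatronOptimizerModule annotation; model is passed as None purely for illustration, since the scheduler() body above never touches it:

import torch
from bionemo.llm.model.lr_scheduler import WarmupAnnealDecayHoldScheduler

optimizer = torch.optim.AdamW([torch.nn.Parameter(torch.zeros(8))], lr=4e-4)

lr_module = WarmupAnnealDecayHoldScheduler(
    warmup_steps=2_000,
    max_steps=500_000,
    max_lr=4e-4,
    min_lr=4e-5,
    anneal_percentage=0.10,
    interval="step",
    frequency=1,
    monitor="val_loss",
)

output = lr_module.scheduler(model=None, optimizer=optimizer)
print(output["lr_scheduler"]["interval"], output["monitor"])  # step val_loss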

__init__(warmup_steps=2000, max_steps=500000, max_lr=0.0004, min_lr=4e-05, anneal_percentage=0.1, interval='step', frequency=1, monitor='val_loss')

Initializes the WarmupAnnealDecayHoldScheduler.

Source code in bionemo/llm/model/lr_scheduler.py
def __init__(
    self,
    warmup_steps: int = 2000,
    max_steps: int = 500_000,
    max_lr: float = 4e-4,
    min_lr: float = 4e-5,
    anneal_percentage: float = 0.10,
    interval: str = "step",
    frequency: int = 1,
    monitor: str = "val_loss",
) -> None:
    """Initializes the WarmupAnnealDecayHoldScheduler."""
    super().__init__()
    self.warmup_steps = warmup_steps
    self.max_steps = max_steps
    self.max_lr = max_lr
    self.min_lr = min_lr
    self.anneal_percentage = anneal_percentage
    self.interval = interval
    self.frequency = frequency
    self.monitor = monitor

scheduler(model, optimizer)

Returns the scheduler output.

Source code in bionemo/llm/model/lr_scheduler.py
def scheduler(self, model: MegatronBioBertModel, optimizer: MegatronOptimizerModule) -> SchedulerOutput:
    """Returns the scheduler output."""
    lr_scheduler = WarmupAnnealDecayHold(
        optimizer,
        warmup_steps=self.warmup_steps,
        max_steps=self.max_steps,
        max_lr=self.max_lr,
        min_lr=self.min_lr,
        anneal_percentage=self.anneal_percentage,
    )
    return {
        "optimizer": optimizer,
        # REQUIRED: The scheduler instance
        "lr_scheduler": {
            "scheduler": lr_scheduler,
            # `interval` is the unit of the scheduler's step size, could also be 'step'.
            # 'epoch' updates the scheduler on epoch end whereas 'step'
            # updates it after an optimizer update.
            "interval": self.interval,
            # How many epochs/steps should pass between calls to
            # `scheduler.step()`. 1 corresponds to updating the learning
            # rate after every epoch/step.
            "frequency": self.frequency,
        },
        # Metric to monitor for schedulers like `ReduceLROnPlateau`
        "monitor": self.monitor,
    }
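
The interval and frequency fields are consumed by the training framework rather than by this module. As a schematic only (not actual NeMo or Lightning code), the inline comments above amount to a loop like this for interval="step":

def run_steps(optimizer, lr_scheduler_cfg: dict, num_steps: int) -> None:
    # Schematic restatement of the comments above: step the scheduler every
    # `frequency` optimizer updates when interval == "step".
    scheduler = lr_scheduler_cfg["scheduler"]
    interval = lr_scheduler_cfg["interval"]
    frequency = lr_scheduler_cfg["frequency"]

    for step in range(1, num_steps + 1):
        optimizer.step()
        if interval == "step" and step % frequency == 0:
            scheduler.step()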