End-to-End Speaker Diarization Configuration Files#
Hydra Configurations for Sortformer Diarizer Training#
Sortformer diarizer is an end-to-end speaker diarization model that is solely based on a Transformer-encoder type of architecture. Model name convention for the Sortformer diarizer: sortformer_diarizer_<loss_type>_<speaker count limit>-<version>.yaml
Example: <NeMo_root>/examples/speaker_tasks/diarization/neural_diarizer/conf/sortformer_diarizer_hybrid_loss_4spk-v1.yaml
name: "SortFormerDiarizer"
num_workers: 18
batch_size: 8
model:
sample_rate: 16000
pil_weight: 0.5 # Weight for Permutation Invariant Loss (PIL) used in training the Sortformer diarizer model
ats_weight: 0.5 # Weight for Arrival Time Sort (ATS) loss in training the Sortformer diarizer model
max_num_of_spks: 4 # Maximum number of speakers per model; currently set to 4
model_defaults:
fc_d_model: 512 # Hidden dimension size of the Fast-conformer Encoder
tf_d_model: 192 # Hidden dimension size of the Transformer Encoder
train_ds:
manifest_filepath: ???
sample_rate: ${model.sample_rate}
num_spks: ${model.max_num_of_spks}
session_len_sec: 90 # Maximum session length in seconds
soft_label_thres: 0.5 # Threshold for binarizing target values; higher values make the model more conservative in predicting speaker activity.
soft_targets: False # If True, use continuous values as target values when calculating cross-entropy loss
labels: null
batch_size: ${batch_size}
shuffle: True
num_workers: ${num_workers}
validation_mode: False
# lhotse config
use_lhotse: False
use_bucketing: True
num_buckets: 10
bucket_duration_bins: [10, 20, 30, 40, 50, 60, 70, 80, 90]
pin_memory: True
min_duration: 10
max_duration: 90
batch_duration: 400
quadratic_duration: 1200
bucket_buffer_size: 20000
shuffle_buffer_size: 10000
window_stride: ${model.preprocessor.window_stride}
subsampling_factor: ${model.encoder.subsampling_factor}
validation_ds:
manifest_filepath: ???
is_tarred: False
tarred_audio_filepaths: null
sample_rate: ${model.sample_rate}
num_spks: ${model.max_num_of_spks}
session_len_sec: 90 # Maximum session length in seconds
soft_label_thres: 0.5 # A threshold value for setting up the binarized labels. The higher the more conservative the model becomes.
soft_targets: False
labels: null
batch_size: ${batch_size}
shuffle: False
num_workers: ${num_workers}
validation_mode: True
# lhotse config
use_lhotse: False
use_bucketing: False
drop_last: False
pin_memory: True
window_stride: ${model.preprocessor.window_stride}
subsampling_factor: ${model.encoder.subsampling_factor}
test_ds:
manifest_filepath: null
is_tarred: False
tarred_audio_filepaths: null
sample_rate: 16000
num_spks: ${model.max_num_of_spks}
session_len_sec: 90 # Maximum session length in seconds
soft_label_thres: 0.5
soft_targets: False
labels: null
batch_size: ${batch_size}
shuffle: False
seq_eval_mode: True
num_workers: ${num_workers}
validation_mode: True
# lhotse config
use_lhotse: False
use_bucketing: False
drop_last: False
pin_memory: True
window_stride: ${model.preprocessor.window_stride}
subsampling_factor: ${model.encoder.subsampling_factor}
preprocessor:
_target_: nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor
normalize: "per_feature"
window_size: 0.025
sample_rate: ${model.sample_rate}
window_stride: 0.01
window: "hann"
features: 80
n_fft: 512
frame_splicing: 1
dither: 0.00001
sortformer_modules:
_target_: nemo.collections.asr.modules.sortformer_modules.SortformerModules
num_spks: ${model.max_num_of_spks} # Number of speakers per model. This is currently fixed at 4.
dropout_rate: 0.5 # Dropout rate
fc_d_model: ${model.model_defaults.fc_d_model}
tf_d_model: ${model.model_defaults.tf_d_model} # Hidden layer size for linear layers in Sortformer Diarizer module
encoder:
_target_: nemo.collections.asr.modules.ConformerEncoder
feat_in: ${model.preprocessor.features}
feat_out: -1
n_layers: 18
d_model: ${model.model_defaults.fc_d_model}
# Sub-sampling parameters
subsampling: dw_striding # vggnet, striding, stacking or stacking_norm, dw_striding
subsampling_factor: 8 # must be power of 2 for striding and vggnet
subsampling_conv_channels: 256 # set to -1 to make it equal to the d_model
causal_downsampling: false
# Feed forward module's params
ff_expansion_factor: 4
# Multi-headed Attention Module's params
self_attention_model: rel_pos # rel_pos or abs_pos
n_heads: 8 # may need to be lower for smaller d_models
# [left, right] specifies the number of steps to be seen from left and right of each step in self-attention
att_context_size: [-1, -1] # -1 means unlimited context
att_context_style: regular # regular or chunked_limited
xscaling: true # scales up the input embeddings by sqrt(d_model)
untie_biases: true # unties the biases of the TransformerXL layers
pos_emb_max_len: 5000
# Convolution module's params
conv_kernel_size: 9
conv_norm_type: 'batch_norm' # batch_norm or layer_norm or groupnormN (N specifies the number of groups)
conv_context_size: null
# Regularization
dropout: 0.1 # The dropout used in most of the Conformer Modules
dropout_pre_encoder: 0.1 # The dropout used before the encoder
dropout_emb: 0.0 # The dropout used for embeddings
dropout_att: 0.1 # The dropout for multi-headed attention modules
# Set to non-zero to enable stochastic depth
stochastic_depth_drop_prob: 0.0
stochastic_depth_mode: linear # linear or uniform
stochastic_depth_start_layer: 1
transformer_encoder:
_target_: nemo.collections.asr.modules.transformer.transformer_encoders.TransformerEncoder
num_layers: 18
hidden_size: ${model.model_defaults.tf_d_model} # Needs to be multiple of num_attention_heads
inner_size: 768
num_attention_heads: 8
attn_score_dropout: 0.5
attn_layer_dropout: 0.5
ffn_dropout: 0.5
hidden_act: relu
pre_ln: False
pre_ln_final_layer_norm: True
loss:
_target_: nemo.collections.asr.losses.bce_loss.BCELoss
weight: null # Weight for binary cross-entropy loss. Either `null` or list type input. (e.g. [0.5,0.5])
reduction: mean
lr: 0.0001
optim:
name: adamw
lr: ${model.lr}
# optimizer arguments
betas: [0.9, 0.98]
weight_decay: 1e-3
sched:
name: InverseSquareRootAnnealing
warmup_steps: 2500
warmup_ratio: null
min_lr: 1e-06
trainer:
devices: 1 # number of gpus (devices)
accelerator: gpu
max_epochs: 800
max_steps: -1 # computed at runtime if not set
num_nodes: 1
strategy: ddp_find_unused_parameters_true # Could be "ddp"
accumulate_grad_batches: 1
deterministic: True
enable_checkpointing: False
logger: False
log_every_n_steps: 1 # Interval of logging.
val_check_interval: 1.0 # Set to 0.25 to check 4 times per epoch, or an int for number of iterations
exp_manager:
use_datetime_version: False
exp_dir: null
name: ${name}
resume_if_exists: True
resume_from_checkpoint: null # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc.
resume_ignore_no_checkpoint: True
create_tensorboard_logger: True
create_checkpoint_callback: True
create_wandb_logger: False
checkpoint_callback_params:
monitor: "val_f1_acc"
mode: "max"
save_top_k: 9
every_n_epochs: 1
wandb_logger_kwargs:
resume: True
name: null
project: null
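The config above makes heavy use of OmegaConf interpolation (the ${...} references) and of mandatory fields marked ???. Below is a minimal sketch, assuming the example YAML above is saved locally under its example filename, of how these resolve when loaded with OmegaConf (the configuration library that Hydra and NeMo configs are built on):
from omegaconf import OmegaConf

# Hypothetical local copy of the example config shown above.
cfg = OmegaConf.load("sortformer_diarizer_hybrid_loss_4spk-v1.yaml")

# Interpolations such as ${model.max_num_of_spks} resolve on access:
print(cfg.model.train_ds.num_spks)   # 4
print(cfg.model.encoder.d_model)     # 512, via ${model.model_defaults.fc_d_model}

# Fields marked "???" are mandatory and raise an error if accessed before being overridden:
cfg.model.train_ds.manifest_filepath = "/path/to/train_manifest.json"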
Hydra Configurations for Sortformer Diarization Postprocessing#
Postprocessing converts the floating-point tensor outputs into timestamp outputs. When speaker-homogeneous segments are generated, onset and offset thresholds as well as onset and offset paddings can be taken into account to render the timestamps that achieve the lowest diarization error rate (DER).
By default, postprocessing is bypassed and only binarization is performed. If you want to reproduce the DER scores reported on the NeMo model cards, you need to apply the postprocessing steps. Use batch_size = 1 to have the longest inference window and the highest possible accuracy.
parameters:
onset: 0.64 # Onset threshold for detecting the beginning of a speech segment
offset: 0.74 # Offset threshold for detecting the end of a speech segment
pad_onset: 0.06 # Adds the specified duration at the beginning of each speech segment
pad_offset: 0.0 # Adds the specified duration at the end of each speech segment
min_duration_on: 0.1 # Removes short silences if the duration is less than the specified minimum duration
min_duration_off: 0.15 # Removes short speech segments if the duration is less than the specified minimum duration
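As a rough, illustrative approximation of what these parameters do (not the exact NeMo implementation), the sketch below turns one speaker's frame-level probabilities into padded speech segments, handling short silences and short segments according to the comments above; the function name and the frame_dur argument are assumptions made for this example.
import numpy as np

def binarize_speaker_track(probs, frame_dur, onset=0.64, offset=0.74,
                           pad_onset=0.06, pad_offset=0.0,
                           min_duration_on=0.1, min_duration_off=0.15):
    """Illustrative hysteresis thresholding for one speaker's frame probabilities."""
    segments, start, active = [], 0.0, False
    for i, p in enumerate(probs):
        if not active and p >= onset:          # a segment opens above the onset threshold
            active, start = True, i * frame_dur
        elif active and p < offset:            # and closes below the offset threshold
            active = False
            segments.append([start - pad_onset, i * frame_dur + pad_offset])
    if active:
        segments.append([start - pad_onset, len(probs) * frame_dur + pad_offset])
    merged = []
    for seg in segments:                       # remove short silences (min_duration_on)
        if merged and seg[0] - merged[-1][1] < min_duration_on:
            merged[-1][1] = seg[1]
        else:
            merged.append(seg)
    # remove short speech segments (min_duration_off) and clamp to t >= 0
    return [(max(0.0, s), e) for s, e in merged if e - s >= min_duration_off]

probs = np.array([0.1, 0.7, 0.9, 0.8, 0.3, 0.2, 0.9, 0.95, 0.1])
print(binarize_speaker_track(probs, frame_dur=0.08))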
Cascaded Speaker Diarization Configuration Files#
Both training and inference of cascaded speaker diarization are configured by .yaml files. The diarizer section will generally require information about the dataset(s) being used, the models used in this pipeline, and inference-related parameters such as postprocessing of each model. The sections on this page cover each of these in more detail.
Note
For model details and a deep understanding of configurations, training, fine-tuning, and evaluation, please refer to <NeMo_root>/tutorials/speaker_tasks/Speaker_Diarization_Inference.ipynb and <NeMo_root>/tutorials/speaker_tasks/Speaker_Diarization_Training.ipynb; for other applications such as possible integration with ASR, have a look at <NeMo_root>/tutorials/speaker_tasks/ASR_with_SpeakerDiarization.ipynb.
Hydra Configurations for Diarizer Training#
Currently, NeMo supports the Multi-scale Diarization Decoder (MSDD) as its neural diarizer model. MSDD is a speaker diarization model based on initializing clustering and multi-scale segmentation input. Example configuration files for MSDD model training can be found in <NeMo_root>/examples/speaker_tasks/diarization/conf/neural_diarizer/.
Model name convention for MSDD: msdd_<number of scales>scl_<longest scale in deciseconds (ds)>_<shortest scale in deciseconds (ds)>_<overlap percentage of window shift>Povl_<hidden layer size>x<number of LSTM layers>x<number of CNN output channels>x<repetition count of conv layer>
Example: msdd_5scl_15_05_50Povl_256x3x32x2.yaml has 5 scales, a longest scale of 1.5 sec, a shortest scale of 0.5 sec, 50% overlap, a hidden layer size of 256, 3 LSTM layers, 32 CNN output channels, and 2 repeated conv layers.
An MSDD model checkpoint (.ckpt) or NeMo file (.nemo) contains the speaker embedding model (TitaNet), and the speaker model is loaded along with the standalone MSDD module. Note that MSDD models require multiple scales; therefore, the parameters in diarizer.speaker_embeddings.parameters should specify multiple scales to be used as an MSDD model.
General Diarizer Configurations#
The items (OmegaConf keys) directly under the model key determine the segmentation- and clustering-related parameters. The multi-scale parameters (window_length_in_sec, shift_length_in_sec, and multiscale_weights) are specified here. max_num_of_spks, scale_n, soft_label_thres, and emb_batch_size are also set here and then assigned to the dataset configurations.
diarizer:
out_dir: null
oracle_vad: True # If True, uses RTTM files provided in manifest file to get speech activity (VAD) timestamps
speaker_embeddings:
model_path: ??? # .nemo local model path or pretrained model name (titanet_large is recommended)
parameters:
window_length_in_sec: [1.5,1.25,1.0,0.75,0.5] # Window length(s) in sec (floating-point number). either a number or a list. ex) 1.5 or [1.5,1.0,0.5]
shift_length_in_sec: [0.75,0.625,0.5,0.375,0.25] # Shift length(s) in sec (floating-point number). either a number or a list. ex) 0.75 or [0.75,0.5,0.25]
multiscale_weights: [1,1,1,1,1] # Weight for each scale. should be null (for single scale) or a list matched with window/shift scale count. ex) [0.33,0.33,0.33]
save_embeddings: True # Save embeddings as pickle file for each audio input.
num_workers: ${num_workers} # Number of workers used for data-loading.
max_num_of_spks: 2 # Number of speakers per model. This is currently fixed at 2.
scale_n: 5 # Number of scales for MSDD model and initializing clustering.
soft_label_thres: 0.5 # Threshold for creating discretized speaker label from continuous speaker label in RTTM files.
emb_batch_size: 0 # If this value is bigger than 0, corresponding number of embedding vectors are attached to torch graph and trained.
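To make the multiscale parameters above more concrete, the following is a small sketch (an illustration only, not a NeMo API) that enumerates the segment boundaries each (window, shift) pair produces for a recording; multiscale_weights then assigns a weight to each of these scales.
def multiscale_segments(duration_sec, window_lengths, shift_lengths):
    """Enumerate (start, end) segment boundaries in seconds for every scale."""
    all_scales = []
    for win, shift in zip(window_lengths, shift_lengths):
        segs, t = [], 0.0
        while t < duration_sec:
            segs.append((round(t, 3), round(min(t + win, duration_sec), 3)))
            t += shift
        all_scales.append(segs)
    return all_scales

# Scales from the config above: 1.5 s down to 0.5 s windows, each with 50% overlap.
scales = multiscale_segments(10.0,
                             [1.5, 1.25, 1.0, 0.75, 0.5],
                             [0.75, 0.625, 0.5, 0.375, 0.25])
for i, segs in enumerate(scales):
    print(f"scale {i}: {len(segs)} segments, first={segs[0]}, last={segs[-1]}")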
Dataset Configuration#
The training, validation, and test parameters are specified using the train_ds, validation_ds, and test_ds sections of the configuration YAML file, respectively. Items such as num_spks, soft_label_thres, and emb_batch_size follow the settings under the model key. You can also leave fields such as manifest_filepath or emb_dir blank and specify them via the command-line interface. Note that test_ds is not used during training and is only used for speaker diarization inference.
train_ds:
manifest_filepath: ???
emb_dir: ???
sample_rate: ${sample_rate}
num_spks: ${model.max_num_of_spks}
soft_label_thres: ${model.soft_label_thres}
labels: null
batch_size: ${batch_size}
emb_batch_size: ${model.emb_batch_size}
shuffle: True
validation_ds:
manifest_filepath: ???
emb_dir: ???
sample_rate: ${sample_rate}
num_spks: ${model.max_num_of_spks}
soft_label_thres: ${model.soft_label_thres}
labels: null
batch_size: 2
emb_batch_size: ${model.emb_batch_size}
shuffle: False
test_ds:
manifest_filepath: null
emb_dir: null
sample_rate: 16000
num_spks: ${model.max_num_of_spks}
soft_label_thres: ${model.soft_label_thres}
labels: null
batch_size: 2
shuffle: False
seq_eval_mode: False
Preprocessor Configuration#
In the MSDD configuration, the preprocessor configuration follows the preprocessor of the embedding extractor model.
preprocessor:
_target_: nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor
normalize: "per_feature"
window_size: 0.025
sample_rate: ${sample_rate}
window_stride: 0.01
window: "hann"
features: 80
n_fft: 512
frame_splicing: 1
dither: 0.00001
Model Architecture Configurations#
The hyperparameters of the MSDD model live under the msdd_module key. The model architecture can be changed by setting weighting_scheme and context_vector_type. See the Models page for a detailed explanation of the architecture.
msdd_module:
_target_: nemo.collections.asr.modules.msdd_diarizer.MSDD_module
num_spks: ${model.max_num_of_spks} # Number of speakers per model. This is currently fixed at 2.
hidden_size: 256 # Hidden layer size for linear layers in MSDD module
num_lstm_layers: 3 # Number of stacked LSTM layers
dropout_rate: 0.5 # Dropout rate
cnn_output_ch: 32 # Number of filters in a conv-net layer.
conv_repeat: 2 # Determines the number of conv-net layers. Should be greater or equal to 1.
emb_dim: 192 # Dimension of the speaker embedding vectors
scale_n: ${model.scale_n} # Number of scales for multiscale segmentation input
weighting_scheme: 'conv_scale_weight' # Type of weighting algorithm. Options: ('conv_scale_weight', 'attn_scale_weight')
context_vector_type: 'cos_sim' # Type of context vector: options. Options: ('cos_sim', 'elem_prod')
Loss Function Configurations#
The neural diarizer uses a binary cross-entropy (BCE) loss. A set of weights for the negative class (absence of the speaker's speech) and the positive class (presence of the speaker's speech) can be provided to the loss function.
loss:
_target_: nemo.collections.asr.losses.bce_loss.BCELoss
weight: null # Weight for binary cross-entropy loss. Either `null` or list type input. (e.g. [0.5,0.5])
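The weight entry corresponds to per-class weights on the binary targets. The following is a minimal PyTorch illustration of the idea, assuming frame-level speaker-activity predictions and labels of shape (batch, frames, num_spks); it shows weighted BCE in general, not the exact behavior of nemo.collections.asr.losses.bce_loss.BCELoss.
import torch
import torch.nn.functional as F

preds = torch.rand(2, 50, 2)                     # sigmoid outputs per frame and speaker
targets = (torch.rand(2, 50, 2) > 0.5).float()   # binarized speaker-activity labels

# weight: [w_negative, w_positive], e.g. [0.5, 0.5] as in the comment above
w_neg, w_pos = 0.5, 0.5
elementwise_w = targets * w_pos + (1.0 - targets) * w_neg
loss = F.binary_cross_entropy(preds, targets, weight=elementwise_w, reduction="mean")
print(loss.item())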
Hydra Configurations for Diarization Inference#
Example configuration files for speaker diarization inference can be found in <NeMo_root>/examples/speaker_tasks/diarization/conf/inference/. Choose the yaml file that fits your target domain; for example, if you want to diarize audio recordings of telephonic speech, choose diar_infer_telephonic.yaml.
The configurations for all components of diarization inference are included in a single file named diar_infer_<domain>.yaml. Each .yaml file contains a few different sections for the following modules: VAD, speaker embedding, clustering, and ASR.
In speaker diarization inference, the dataset provided in manifest format denotes the data on which you want to perform speaker diarization.
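Each line of the manifest file is a JSON object describing one recording. Below is a hedged sketch of writing such an entry in Python; the field set follows the typical NeMo diarization manifest, but check the dataset documentation for your NeMo version for the authoritative schema, and treat all paths as placeholders.
import json

entry = {
    "audio_filepath": "/path/to/session1.wav",  # placeholder path
    "offset": 0,
    "duration": None,      # null -> use the full recording
    "label": "infer",
    "text": "-",
    "num_speakers": None,  # set an integer to enable oracle_num_speakers
    "rttm_filepath": None, # reference RTTM, needed for scoring or oracle VAD
    "uem_filepath": None,
}

with open("input_manifest.json", "w") as f:
    f.write(json.dumps(entry) + "\n")  # one JSON object per line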
Diarizer Configurations#
An example diarizer Hydra configuration could look like the following:
diarizer:
manifest_filepath: ???
out_dir: ???
oracle_vad: False # If True, uses RTTM files provided in manifest file to get speech activity (VAD) timestamps
collar: 0.25 # Collar value for scoring
ignore_overlap: True # Consider or ignore overlap segments while scoring
Under the diarizer key, there are vad, speaker_embeddings, clustering, and asr keys that contain the configurations for inference of the corresponding modules.
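For reference, here is a minimal sketch of running cascaded diarization inference from such a config with the ClusteringDiarizer class; the config filename and override values are placeholders, and NeuralDiarizer can be used in the same way for MSDD-based inference.
from omegaconf import OmegaConf
from nemo.collections.asr.models import ClusteringDiarizer

cfg = OmegaConf.load("diar_infer_telephonic.yaml")   # placeholder: pick the file for your domain
cfg.diarizer.manifest_filepath = "input_manifest.json"
cfg.diarizer.out_dir = "./diar_outputs"

# Runs VAD, speaker embedding extraction, and clustering; writes RTTM files to out_dir.
ClusteringDiarizer(cfg=cfg).diarize()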
Voice Activity Detector Configurations#
Parameters for the VAD model are provided in the following Hydra config example.
vad:
model_path: null # .nemo local model path or pretrained model name or none
external_vad_manifest: null # This option is provided to use external vad and provide its speech activity labels for speaker embeddings extraction. Only one of model_path or external_vad_manifest should be set
parameters: # Tuned parameters for CH109 (using the 11 multi-speaker sessions as dev set)
window_length_in_sec: 0.15 # Window length in sec for VAD context input
shift_length_in_sec: 0.01 # Shift length in sec for generating frame-level VAD predictions
smoothing: "median" # False or type of smoothing method (eg: median)
overlap: 0.875 # Overlap ratio for overlapped mean/median smoothing filter
onset: 0.4 # Onset threshold for detecting the beginning and end of a speech
offset: 0.7 # Offset threshold for detecting the end of a speech
pad_onset: 0.05 # Adding durations before each speech segment
pad_offset: -0.1 # Adding durations after each speech segment
min_duration_on: 0.2 # Threshold for small non_speech deletion
min_duration_off: 0.2 # Threshold for short speech segment deletion
filter_speech_first: True
Speaker Embedding Configurations in Diarization#
Parameters for the speaker embedding model are provided in the following Hydra config example. Note that the multiscale parameters accept either a list or a single floating-point number.
speaker_embeddings:
model_path: ??? # .nemo local model path or pretrained model name (titanet_large, ecapa_tdnn or speakerverification_speakernet)
parameters:
window_length_in_sec: 1.5 # Window length(s) in sec (floating-point number). Either a number or a list. Ex) 1.5 or [1.5,1.25,1.0,0.75,0.5]
shift_length_in_sec: 0.75 # Shift length(s) in sec (floating-point number). Either a number or a list. Ex) 0.75 or [0.75,0.625,0.5,0.375,0.25]
multiscale_weights: null # Weight for each scale. should be null (for single scale) or a list matched with window/shift scale count. Ex) [1,1,1,1,1]
save_embeddings: False # Save embeddings as pickle file for each audio input.
Clustering Configurations in Diarization#
Parameters for the clustering algorithm are provided in the following Hydra config example.
clustering:
parameters:
oracle_num_speakers: False # If True, use num of speakers value provided in the manifest file.
max_num_speakers: 20 # Max number of speakers for each recording. If oracle_num_speakers is passed, this value is ignored.
enhanced_count_thres: 80 # If the number of segments is lower than this number, enhanced speaker counting is activated.
max_rp_threshold: 0.25 # Determines the range of p-value search: 0 < p <= max_rp_threshold.
sparse_search_volume: 30 # The higher the number, the more values will be examined with more time.
Diarization with ASR Configurations#
The following configurations need to be appended under the diarizer key to run ASR together with diarization and obtain a transcription with speaker labels.
asr:
model_path: ??? # Provide NGC cloud ASR model name. stt_en_conformer_ctc_* models are recommended for diarization purposes.
parameters:
asr_based_vad: False # if True, speech segmentation for diarization is based on word-timestamps from ASR inference.
asr_based_vad_threshold: 50 # threshold (multiple of 10ms) for ignoring the gap between two words when generating VAD timestamps using ASR based VAD.
asr_batch_size: null # Batch size can be dependent on each ASR model. Default batch sizes are applied if set to null.
lenient_overlap_WDER: True # If true, when a word falls into speaker-overlapped regions, consider the word as a correctly diarized word.
decoder_delay_in_sec: null # Native decoder delay. null is recommended to use the default values for each ASR model.
word_ts_anchor_offset: null # Offset to set a reference point from the start of the word. Recommended range of values is [-0.05 0.2].
word_ts_anchor_pos: "start" # Select which part of the word timestamp we want to use. The options are: 'start', 'end', 'mid'.
fix_word_ts_with_VAD: False # Fix the word timestamp using VAD output. You must provide a VAD model to use this feature.
colored_text: False # If True, use colored text to distinguish speakers in the output transcript.
print_time: True # If True, the start and end time of each speaker turn is printed in the output transcript.
break_lines: False # If True, the output transcript breaks the line to fix the line width (default is 90 chars)
ctc_decoder_parameters: # Optional beam search decoder (pyctcdecode)
pretrained_language_model: null # KenLM model file: .arpa model file or .bin binary file.
beam_width: 32
alpha: 0.5
beta: 2.5
realigning_lm_parameters: # Experimental feature
arpa_language_model: null # Provide a KenLM language model in .arpa format.
min_number_of_words: 3 # Min number of words for the left context.
max_number_of_words: 10 # Max number of words for the right context.
logprob_diff_threshold: 1.2 # The threshold for the difference between two log probability values from two hypotheses.