重要提示

您正在查看 NeMo 2.0 文档。此版本对 API 进行了重大更改，并引入了一个新库 NeMo Run。我们目前正在将 NeMo 1.0 的所有功能移植到 2.0。有关先前版本或 2.0 中尚不可用的功能的文档，请参阅 NeMo 24.07 文档。

端到端说话人日志配置文档#

Sortformer 日志器训练的 Hydra 配置#

Sortformer 日志器是一个端到端说话人日志模型，它完全基于 Transformer 编码器类型的架构。Sortformer 日志器的模型名称约定：sortformer_diarizer_<loss_type>_<speaker count limit>-<version>.yaml

示例：<NeMo_root>/examples/speaker_tasks/diarization/neural_diarizer/conf/sortformer_diarizer_hybrid_loss_4spk-v1.yaml。

# Top-level values reused further down through the ${name}, ${num_workers} and ${batch_size} interpolations.
name: "SortFormerDiarizer"
num_workers: 18
batch_size: 8

model:
  # sample_rate and max_num_of_spks are interpolated by the dataset, preprocessor and sortformer_modules sections below.
  sample_rate: 16000
  # The two loss weights below sum to 1.0 with these defaults.
  pil_weight: 0.5 # Weight for Permutation Invariant Loss (PIL) used in training the Sortformer diarizer model
  ats_weight: 0.5 # Weight for Arrival Time Sort (ATS) loss in training the Sortformer diarizer model
  max_num_of_spks: 4 # Maximum number of speakers per model; currently set to 4

  # Hidden sizes shared with sortformer_modules, encoder and transformer_encoder via ${model.model_defaults.*}.
  model_defaults:
    fc_d_model: 512 # Hidden dimension size of the Fast-conformer Encoder
    tf_d_model: 192 # Hidden dimension size of the Transformer Encoder

  # Training dataset settings. Values written as ${...} are interpolated from other keys of this config.
  train_ds:
    manifest_filepath: ??? # '???' marks a mandatory value that has to be supplied (e.g. on the command line)
    sample_rate: ${model.sample_rate}
    num_spks: ${model.max_num_of_spks}
    session_len_sec: 90 # Maximum session length in seconds
    soft_label_thres: 0.5 # Threshold for binarizing target values; higher values make the model more conservative in predicting speaker activity.
    soft_targets: False # If True, use continuous values as target values when calculating cross-entropy loss
    labels: null
    batch_size: ${batch_size}
    shuffle: True
    num_workers: ${num_workers}
    validation_mode: False
    # lhotse config
    # NOTE(review): use_lhotse is False here, so the bucketing/duration keys below presumably only take effect once it is enabled — confirm.
    use_lhotse: False
    use_bucketing: True
    num_buckets: 10
    bucket_duration_bins: [10, 20, 30, 40, 50, 60, 70, 80, 90]
    pin_memory: True
    min_duration: 10
    max_duration: 90 # Same value as session_len_sec above
    batch_duration: 400
    quadratic_duration: 1200
    bucket_buffer_size: 20000
    shuffle_buffer_size: 10000
    # Both values below are taken from the preprocessor and encoder sections of this config.
    window_stride: ${model.preprocessor.window_stride}
    subsampling_factor: ${model.encoder.subsampling_factor}

  # Validation dataset settings; unlike train_ds, shuffling and bucketing are off and validation_mode is True.
  validation_ds:
    manifest_filepath: ??? # '???' marks a mandatory value that has to be supplied (e.g. on the command line)
    is_tarred: False
    tarred_audio_filepaths: null
    sample_rate: ${model.sample_rate}
    num_spks: ${model.max_num_of_spks}
    session_len_sec: 90 # Maximum session length in seconds
    soft_label_thres: 0.5 # Threshold for binarizing target values; higher values make the model more conservative in predicting speaker activity.
    soft_targets: False
    labels: null
    batch_size: ${batch_size}
    shuffle: False
    num_workers: ${num_workers}
    validation_mode: True
    # lhotse config
    use_lhotse: False
    use_bucketing: False
    drop_last: False
    pin_memory: True
    # Both values below are taken from the preprocessor and encoder sections of this config.
    window_stride: ${model.preprocessor.window_stride}
    subsampling_factor: ${model.encoder.subsampling_factor}

  # Test dataset settings; manifest_filepath defaults to null instead of the mandatory '???' used by train_ds/validation_ds.
  test_ds:
    manifest_filepath: null
    is_tarred: False
    tarred_audio_filepaths: null
    # NOTE(review): hard-coded here, whereas train_ds/validation_ds interpolate ${model.sample_rate}; keep in sync if model.sample_rate changes.
    sample_rate: 16000
    num_spks: ${model.max_num_of_spks}
    session_len_sec: 90 # Maximum session length in seconds
    soft_label_thres: 0.5
    soft_targets: False
    labels: null
    batch_size: ${batch_size}
    shuffle: False
    seq_eval_mode: True
    num_workers: ${num_workers}
    validation_mode: True
    # lhotse config
    use_lhotse: False
    use_bucketing: False
    drop_last: False
    pin_memory: True
    # Both values below are taken from the preprocessor and encoder sections of this config.
    window_stride: ${model.preprocessor.window_stride}
    subsampling_factor: ${model.encoder.subsampling_factor}

  # Audio front-end instantiated from the class named by _target_.
  # window_stride is reused by the dataset sections and features by encoder.feat_in.
  preprocessor:
    _target_: nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor
    normalize: "per_feature"
    window_size: 0.025 # presumably in seconds (25 ms) — TODO confirm
    sample_rate: ${model.sample_rate}
    window_stride: 0.01 # presumably in seconds (10 ms) — TODO confirm
    window: "hann"
    features: 80
    n_fft: 512
    frame_splicing: 1
    dither: 0.00001

  # Sortformer-specific module; speaker count and both hidden sizes are interpolated from the model-level settings.
  sortformer_modules:
    _target_: nemo.collections.asr.modules.sortformer_modules.SortformerModules
    num_spks: ${model.max_num_of_spks} # Number of speakers per model. This is currently fixed at 4.
    dropout_rate: 0.5 # Dropout rate
    fc_d_model: ${model.model_defaults.fc_d_model}
    tf_d_model: ${model.model_defaults.tf_d_model} # Hidden layer size for linear layers in Sortformer Diarizer module

  # Conformer encoder. feat_in follows preprocessor.features and d_model follows model_defaults.fc_d_model;
  # subsampling_factor is also read by the dataset sections.
  encoder:
    _target_: nemo.collections.asr.modules.ConformerEncoder
    feat_in: ${model.preprocessor.features}
    feat_out: -1
    n_layers: 18
    d_model: ${model.model_defaults.fc_d_model}

    # Sub-sampling parameters
    subsampling: dw_striding # Options: vggnet, striding, stacking, stacking_norm or dw_striding
    subsampling_factor: 8 # must be power of 2 for striding and vggnet
    subsampling_conv_channels: 256 # set to -1 to make it equal to the d_model
    causal_downsampling: false
    # Feed forward module's params
    ff_expansion_factor: 4
    # Multi-headed Attention Module's params
    self_attention_model: rel_pos # rel_pos or abs_pos
    n_heads: 8 # may need to be lower for smaller d_models
    # [left, right] specifies the number of steps to be seen from left and right of each step in self-attention
    att_context_size: [-1, -1] # -1 means unlimited context
    att_context_style: regular # regular or chunked_limited
    xscaling: true # scales up the input embeddings by sqrt(d_model)
    untie_biases: true # unties the biases of the TransformerXL layers
    pos_emb_max_len: 5000
    # Convolution module's params
    conv_kernel_size: 9
    conv_norm_type: 'batch_norm' # batch_norm or layer_norm or groupnormN (N specifies the number of groups)
    conv_context_size: null
    # Regularization
    dropout: 0.1 # The dropout used in most of the Conformer Modules
    dropout_pre_encoder: 0.1 # The dropout used before the encoder
    dropout_emb: 0.0 # The dropout used for embeddings
    dropout_att: 0.1 # The dropout for multi-headed attention modules
    # Set to non-zero to enable stochastic depth
    stochastic_depth_drop_prob: 0.0
    stochastic_depth_mode: linear  # linear or uniform
    stochastic_depth_start_layer: 1

  # Transformer encoder; hidden_size follows model_defaults.tf_d_model.
  transformer_encoder:
    _target_: nemo.collections.asr.modules.transformer.transformer_encoders.TransformerEncoder
    num_layers: 18
    # With the defaults, 192 / 8 attention heads = 24, so the constraint below holds.
    hidden_size: ${model.model_defaults.tf_d_model} # Needs to be multiple of num_attention_heads
    inner_size: 768 # 4 x the default tf_d_model (192)
    num_attention_heads: 8
    attn_score_dropout: 0.5
    attn_layer_dropout: 0.5
    ffn_dropout: 0.5
    hidden_act: relu
    pre_ln: False
    pre_ln_final_layer_norm: True

  # Binary cross-entropy loss instantiated from the class named by _target_.
  loss:
    _target_: nemo.collections.asr.losses.bce_loss.BCELoss
    weight: null # Weight for binary cross-entropy loss. Either `null` or list type input. (e.g. [0.5,0.5])
    reduction: mean

  # Base learning rate; consumed by optim.lr through ${model.lr}.
  lr: 0.0001
  optim:
    name: adamw
    lr: ${model.lr}
    # optimizer arguments
    betas: [0.9, 0.98]
    weight_decay: 1e-3

    # Learning-rate schedule.
    sched:
      name: InverseSquareRootAnnealing
      # NOTE(review): warmup_steps is set and warmup_ratio is null — presumably only one of the two may be given; confirm.
      warmup_steps: 2500
      warmup_ratio: null
      min_lr: 1e-06

# Trainer settings.
trainer:
  devices: 1 # number of gpus (devices)
  accelerator: gpu
  max_epochs: 800
  max_steps: -1 # computed at runtime if not set
  num_nodes: 1
  strategy: ddp_find_unused_parameters_true # Could be "ddp"
  accumulate_grad_batches: 1
  deterministic: True
  # NOTE(review): checkpointing and logger are disabled here while exp_manager enables
  # create_checkpoint_callback / create_tensorboard_logger — presumably exp_manager owns both; confirm.
  enable_checkpointing: False
  logger: False
  log_every_n_steps: 1  # Interval of logging.
  val_check_interval: 1.0  # Set to 0.25 to check 4 times per epoch, or an int for number of iterations

# Experiment manager settings; the experiment name is taken from the top-level `name` key.
exp_manager:
  use_datetime_version: False
  exp_dir: null
  name: ${name}
  resume_if_exists: True
  resume_from_checkpoint: null # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc.
  resume_ignore_no_checkpoint: True
  create_tensorboard_logger: True
  create_checkpoint_callback: True
  create_wandb_logger: False
  # Checkpoint selection: monitors "val_f1_acc" with mode "max" and save_top_k of 9.
  checkpoint_callback_params:
    monitor: "val_f1_acc"
    mode: "max"
    save_top_k: 9
    every_n_epochs: 1
  # create_wandb_logger is False above, so name/project are left null here.
  wandb_logger_kwargs:
    resume: True
    name: null
    project: null

Sortformer 日志后处理的 Hydra 配置#

后处理将基于浮点数的张量输出转换为时间戳输出。在生成说话人同质片段时，可以考虑使用起始和结束阈值以及填充，以呈现可实现最低日志错误率 (DER) 的时间戳。

默认情况下，后处理被绕过，仅执行二值化。如果您想重现 NeMo 模型卡上报告的 DER 分数，则需要应用后处理步骤。使用 batch_size = 1 以获得最长的推理窗口和尽可能高的精度。

# Post-processing parameters that turn frame-level outputs into segment timestamps.
parameters:
  onset: 0.64  # Onset threshold for detecting the beginning of a speech segment
  offset: 0.74  # Offset threshold for detecting the end of a speech segment
  pad_onset: 0.06  # Adds the specified duration at the beginning of each speech segment
  pad_offset: 0.0  # Adds the specified duration at the end of each speech segment
  # NOTE(review): under the usual on/off convention (pyannote-style), min_duration_on drops short speech
  # segments and min_duration_off fills short silences; the two descriptions below look swapped — confirm
  # against the post-processing implementation before relying on them.
  min_duration_on: 0.1  # Removes short silences if the duration is less than the specified minimum duration
  min_duration_off: 0.15  # Removes short speech segments if the duration is less than the specified minimum duration

级联说话人日志配置文档#

级联说话人日志的训练和推理均由 .yaml 文件配置。日志器部分通常需要有关所用数据集、此管道中使用的模型以及推理相关参数（例如每个模型的后处理）的信息。此页面上的部分更详细地介绍了这些内容。

注意

有关模型详细信息和关于配置、训练、微调和评估的深入理解，请参阅 <NeMo_root>/tutorials/speaker_tasks/Speaker_Diarization_Inference.ipynb 和 <NeMo_root>/tutorials/speaker_tasks/Speaker_Diarization_Training.ipynb；对于其他应用（例如与 ASR 的可能集成），请查看 <NeMo_root>/tutorials/speaker_tasks/ASR_with_SpeakerDiarization.ipynb。

日志器训练的 Hydra 配置#

目前，NeMo 支持多尺度日志解码器 (MSDD) 作为神经日志器模型。MSDD 是一种基于初始化聚类和多尺度分割输入的说话人日志模型。MSDD 模型训练的示例配置文件可以在 <NeMo_root>/examples/speaker_tasks/diarization/conf/neural_diarizer/ 中找到。

MSDD 的模型名称约定：msdd_<尺度数量>scl_<最长尺度，单位为十分之一秒 (ds)>_<最短尺度，单位为十分之一秒 (ds)>_<窗口偏移的重叠百分比>Povl_<隐藏层大小>x<LSTM 层数>x<CNN 输出通道数>x<卷积层重复次数>
示例：msdd_5scl_15_05_50Povl_256x3x32x2.yaml 具有 5 个尺度，最长尺度为 1.5 秒，最短尺度为 0.5 秒，重叠 50%，隐藏层大小为 256，3 个 LSTM 层，32 个 CNN 通道，2 个重复的 Conv 层

MSDD 模型检查点 (.ckpt) 和 NeMo 文件 (.nemo) 包含说话人嵌入模型 (TitaNet)，并且说话人模型与独立的 MSDD 模块一起加载。请注意，MSDD 模型需要多个尺度。因此，diarizer.speaker_embeddings.parameters 中的参数应具有多个尺度才能用作 MSDD 模型。

通用日志器配置#

直接位于 model 下的项（OmegaConf 键）决定了分割和聚类相关参数。其中指定了多尺度参数（window_length_in_sec、shift_length_in_sec 和 multiscale_weights）。max_num_of_spks、scale_n、soft_label_thres 和 emb_batch_size 在此处设置，然后分配给数据集配置。

# Diarizer settings used for MSDD training; five scales are configured for the speaker embeddings.
diarizer:
  out_dir: null
  oracle_vad: True # If True, uses RTTM files provided in manifest file to get speech activity (VAD) timestamps
  speaker_embeddings:
    model_path: ??? # .nemo local model path or pretrained model name (titanet_large is recommended)
    parameters:
      # Each shift length below is half of the window length at the same position, and all three lists have 5 entries.
      window_length_in_sec: [1.5,1.25,1.0,0.75,0.5] # Window length(s) in sec (floating-point number). either a number or a list. ex) 1.5 or [1.5,1.0,0.5]
      shift_length_in_sec: [0.75,0.625,0.5,0.375,0.25] # Shift length(s) in sec (floating-point number). either a number or a list. ex) 0.75 or [0.75,0.5,0.25]
      multiscale_weights: [1,1,1,1,1] # Weight for each scale. should be null (for single scale) or a list matched with window/shift scale count. ex) [0.33,0.33,0.33]
      save_embeddings: True # Save embeddings as pickle file for each audio input.


# NOTE(review): shown at column 0, but the dataset sections read these keys as ${model.*}
# (e.g. ${model.soft_label_thres}), so they presumably sit under `model` in the real file; at the top
# level `num_workers: ${num_workers}` would refer to itself — confirm the nesting.
num_workers: ${num_workers} # Number of workers used for data-loading.
max_num_of_spks: 2 # Number of speakers per model. This is currently fixed at 2.
scale_n: 5 # Number of scales for MSDD model and initializing clustering.
soft_label_thres: 0.5 # Threshold for creating discretized speaker label from continuous speaker label in RTTM files.
emb_batch_size: 0 # If this value is bigger than 0, corresponding number of embedding vectors are attached to torch graph and trained.

数据集配置#

训练、验证和测试参数分别使用配置 YAML 文件中的 train_ds、validation_ds 和 test_ds 部分指定。诸如 num_spks、soft_label_thres 和 emb_batch_size 之类的项遵循 model 键中的设置。您也可以将 manifest_filepath 或 emb_dir 等字段留空，然后通过命令行界面指定。请注意，test_ds 不在训练期间使用，仅用于说话人日志推理。

# MSDD training dataset; speaker count, label threshold and embedding batch size follow the `model` settings.
train_ds:
  manifest_filepath: ??? # '???' marks a mandatory value that has to be supplied (e.g. on the command line)
  emb_dir: ??? # Mandatory as well
  sample_rate: ${sample_rate}
  num_spks: ${model.max_num_of_spks}
  soft_label_thres: ${model.soft_label_thres}
  labels: null
  batch_size: ${batch_size}
  emb_batch_size: ${model.emb_batch_size}
  shuffle: True

# MSDD validation dataset; same interpolated settings as train_ds, without shuffling.
validation_ds:
  manifest_filepath: ??? # '???' marks a mandatory value that has to be supplied (e.g. on the command line)
  emb_dir: ??? # Mandatory as well
  sample_rate: ${sample_rate}
  num_spks: ${model.max_num_of_spks}
  soft_label_thres: ${model.soft_label_thres}
  labels: null
  batch_size: 2 # Hard-coded, unlike train_ds which interpolates ${batch_size}
  emb_batch_size: ${model.emb_batch_size}
  shuffle: False

# MSDD test dataset; manifest_filepath and emb_dir default to null rather than the mandatory '???'.
test_ds:
  manifest_filepath: null
  emb_dir: null
  sample_rate: 16000 # Hard-coded, unlike train_ds/validation_ds which interpolate ${sample_rate}
  num_spks: ${model.max_num_of_spks}
  soft_label_thres: ${model.soft_label_thres}
  labels: null
  batch_size: 2
  shuffle: False
  seq_eval_mode: False

预处理器配置#

在 MSDD 配置中，预处理器配置遵循嵌入提取器模型的预处理器。

# Audio front-end instantiated from the class named by _target_; sample_rate is interpolated from ${sample_rate}.
preprocessor:
  _target_: nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor
  normalize: "per_feature"
  window_size: 0.025 # presumably in seconds (25 ms) — TODO confirm
  sample_rate: ${sample_rate}
  window_stride: 0.01 # presumably in seconds (10 ms) — TODO confirm
  window: "hann"
  features: 80
  n_fft: 512
  frame_splicing: 1
  dither: 0.00001

模型架构配置#

MSDD 模型的超参数位于 msdd_module 键下。可以通过设置 weighting_scheme 和 context_vector_type 来更改模型架构。有关架构的详细说明，请参见模型页面。

# MSDD module hyper-parameters; num_spks and scale_n follow the `model` settings.
msdd_module:
  _target_: nemo.collections.asr.modules.msdd_diarizer.MSDD_module
  num_spks: ${model.max_num_of_spks} # Number of speakers per model. This is currently fixed at 2.
  hidden_size: 256 # Hidden layer size for linear layers in MSDD module
  num_lstm_layers: 3 # Number of stacked LSTM layers
  dropout_rate: 0.5 # Dropout rate
  cnn_output_ch: 32 # Number of filters in a conv-net layer.
  conv_repeat: 2 # Determines the number of conv-net layers. Should be greater or equal to 1.
  emb_dim: 192 # Dimension of the speaker embedding vectors
  scale_n: ${model.scale_n} # Number of scales for multiscale segmentation input
  weighting_scheme: 'conv_scale_weight' # Type of weighting algorithm. Options: ('conv_scale_weight', 'attn_scale_weight')
  context_vector_type: 'cos_sim' # Type of context vector. Options: ('cos_sim', 'elem_prod')

损失函数配置#

神经日志器使用二元交叉熵 (BCE) 损失。可以为损失函数提供一组权重，分别对应负类（说话人语音不存在）和正类（说话人语音存在）。

# Binary cross-entropy loss instantiated from the class named by _target_.
loss:
  _target_: nemo.collections.asr.losses.bce_loss.BCELoss
  weight: null # Weight for binary cross-entropy loss. Either `null` or list type input. (e.g. [0.5,0.5])

日志器推理的 Hydra 配置#

说话人日志推理的示例配置文件可以在 <NeMo_root>/examples/speaker_tasks/diarization/conf/inference/ 中找到。选择适合您目标领域的 yaml 文件。例如，如果您想对电话语音的录音执行说话人日志，请选择 diar_infer_telephonic.yaml。

日志推理的所有组件的配置都包含在一个名为 diar_infer_<domain>.yaml 的文件中。每个 .yaml 文件都包含以下模块的几个不同部分：VAD、说话人嵌入、聚类和 ASR。

在说话人日志推理中，清单格式提供的数据集表示您要对其执行说话人日志的数据。

日志器配置#

一个 diarizer Hydra 配置示例可能如下所示

# Top-level diarizer settings for inference; both paths are mandatory ('???').
diarizer:
  manifest_filepath: ???
  out_dir: ???
  oracle_vad: False # If True, uses RTTM files provided in manifest file to get speech activity (VAD) timestamps
  collar: 0.25 # Collar value for scoring
  ignore_overlap: True # Consider or ignore overlap segments while scoring

在 diarizer 键下，有 vad、speaker_embeddings、clustering 和 asr 键，其中包含相应模块推理的配置。

语音活动检测器配置#

VAD 模型的参数在以下 Hydra 配置示例中提供。

# Voice activity detection settings.
vad:
  model_path: null # .nemo local model path or pretrained model name or none
  external_vad_manifest: null # This option is provided to use external vad and provide its speech activity labels for speaker embeddings extraction. Only one of model_path or external_vad_manifest should be set

  parameters: # Tuned parameters for CH109 (using the 11 multi-speaker sessions as dev set)
    window_length_in_sec: 0.15  # Window length in sec for VAD context input
    shift_length_in_sec: 0.01 # Shift length in sec for generating frame-level VAD predictions
    smoothing: "median" # False or type of smoothing method (eg: median)
    overlap: 0.875 # Overlap ratio for overlapped mean/median smoothing filter
    onset: 0.4 # Onset threshold for detecting the beginning of a speech segment
    offset: 0.7 # Offset threshold for detecting the end of a speech segment
    pad_onset: 0.05 # Adding durations before each speech segment
    pad_offset: -0.1 # Adding durations after each speech segment (negative here)
    # NOTE(review): under the usual on/off convention (pyannote-style), min_duration_on deletes short speech
    # segments and min_duration_off deletes short non-speech gaps; the two descriptions below look swapped —
    # confirm against the VAD post-processing implementation.
    min_duration_on: 0.2 # Threshold for small non_speech deletion
    min_duration_off: 0.2 # Threshold for short speech segment deletion
    filter_speech_first: True

日志中说话人嵌入的配置#

说话人嵌入模型的参数在以下 Hydra 配置示例中提供。请注意，多尺度参数可以接受列表或单个浮点数。

# Speaker-embedding extractor settings; this example is single-scale (scalar window/shift, null weights).
speaker_embeddings:
  model_path: ??? # .nemo local model path or pretrained model name (titanet_large, ecapa_tdnn or speakerverification_speakernet)
  parameters:
    window_length_in_sec: 1.5 # Window length(s) in sec (floating-point number). Either a number or a list. Ex) 1.5 or [1.5,1.25,1.0,0.75,0.5]
    shift_length_in_sec: 0.75 # Shift length(s) in sec (floating-point number). Either a number or a list. Ex) 0.75 or [0.75,0.625,0.5,0.375,0.25]
    multiscale_weights: null # Weight for each scale. Should be null (for single scale) or a list matched with window/shift scale count. Ex) [1,1,1,1,1]
    save_embeddings: False # Save embeddings as pickle file for each audio input.

日志中聚类的配置#

聚类算法的参数在以下 Hydra 配置示例中提供。

# Clustering settings used for speaker counting and assignment.
clustering:
  parameters:
    oracle_num_speakers: False # If True, use num of speakers value provided in the manifest file.
    max_num_speakers: 20 # Max number of speakers for each recording. If oracle_num_speakers is True, this value is ignored.
    enhanced_count_thres: 80 # If the number of segments is lower than this number, enhanced speaker counting is activated.
    max_rp_threshold: 0.25 # Determines the range of p-value search: 0 < p <= max_rp_threshold.
    sparse_search_volume: 30 # The higher the number, the more values will be examined with more time.

带有 ASR 的日志配置#

以下配置需要附加在 diarizer 下，以运行带有日志的 ASR 以获得带有说话人标签的转录。

# ASR settings for producing speaker-labelled transcripts; this section is nested under `diarizer`.
asr:
  model_path: ??? # Provide NGC cloud ASR model name. stt_en_conformer_ctc_* models are recommended for diarization purposes.
  parameters:
    asr_based_vad: False # if True, speech segmentation for diarization is based on word-timestamps from ASR inference.
    asr_based_vad_threshold: 50 # threshold (multiple of 10ms) for ignoring the gap between two words when generating VAD timestamps using ASR based VAD.
    asr_batch_size: null # Batch size can be dependent on each ASR model. Default batch sizes are applied if set to null.
    lenient_overlap_WDER: True # If true, when a word falls into speaker-overlapped regions, consider the word as a correctly diarized word.
    decoder_delay_in_sec: null # Native decoder delay. null is recommended to use the default values for each ASR model.
    word_ts_anchor_offset: null # Offset to set a reference point from the start of the word. Recommended range of values is [-0.05, 0.2].
    word_ts_anchor_pos: "start" # Select which part of the word timestamp we want to use. The options are: 'start', 'end', 'mid'.
    fix_word_ts_with_VAD: False # Fix the word timestamp using VAD output. You must provide a VAD model to use this feature.
    colored_text: False # If True, use colored text to distinguish speakers in the output transcript.
    print_time: True # If True, the start and the end time of each speaker turn is printed in the output transcript.
    break_lines: False # If True, the output transcript breaks the line to fix the line width (default is 90 chars)

  ctc_decoder_parameters: # Optional beam search decoder (pyctcdecode)
    pretrained_language_model: null # KenLM model file: .arpa model file or .bin binary file.
    beam_width: 32
    alpha: 0.5
    beta: 2.5

  realigning_lm_parameters: # Experimental feature
    arpa_language_model: null # Provide a KenLM language model in .arpa format.
    min_number_of_words: 3 # Min number of words for the left context.
    max_number_of_words: 10 # Max number of words for the right context.
    logprob_diff_threshold: 1.2  # The threshold for the difference between two log probability values from two hypotheses.