gRPC and Audio2Face-3D#

Audio2Face-3D NIM exposes the following gRPC endpoints:

  • A bidirectional streaming gRPC endpoint, or two unidirectional streaming endpoints, for processing audio data and receiving animation data.

  • A unary gRPC endpoint for fetching the current configuration of the microservice.

If you have used Audio2Face-3D in the past, refer to the Migrating from 1.0 to 1.2 page.

Bidirectional Streaming gRPC#

Note

In previous Audio2Face releases, a separate service called the Audio2Face Controller was required to implement this bidirectional endpoint. We keep the a2f_controller service and nvidia_ace.controller package names for backward compatibility, but you do not need to run another microservice.

This section describes how to interact with the Audio2Face-3D bidirectional endpoint.

Service Definition#

The Audio2Face-3D bidirectional streaming service is described in the proto below. The ProcessAudioStream rpc is the only call you need to make to generate animation data from audio input; a client sketch follows the proto.

nvidia_ace.services.a2f_controller.v1.proto
syntax = "proto3";

package nvidia_ace.services.a2f_controller.v1;

import "nvidia_ace.controller.v1.proto";
import "nvidia_ace.animation_id.v1.proto";
import "google/protobuf/empty.proto";

service A2FControllerService {
  // Processes a single audio clip and returns animation data
  // in a burst.
  rpc ProcessAudioStream(stream nvidia_ace.controller.v1.AudioStream)
      returns (stream nvidia_ace.controller.v1.AnimationDataStream) {}
}
//nvidia_ace.services.a2f_controller.v1
//v1.0.0
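To make the call flow concrete, the following Python sketch opens a ProcessAudioStream call, sends the header first, then the audio buffers, and finally the EndOfAudio marker, and iterates over the returned messages. The generated module names, the address, and the audio file are assumptions for illustration; adapt them to the stubs compiled from the protos on this page.

# Minimal client sketch for ProcessAudioStream (bidirectional streaming).
# The module names below are assumptions about how the protos were compiled
# with grpcio-tools; adapt them to your generated stubs.
import grpc

import nvidia_ace_services_a2f_controller_v1_pb2_grpc as controller_grpc  # assumed name
import nvidia_ace_controller_v1_pb2 as controller_pb2                     # assumed name
import nvidia_ace_a2f_v1_pb2 as a2f_pb2                                   # assumed name
import nvidia_ace_audio_v1_pb2 as audio_pb2                               # assumed name


def request_stream(pcm_bytes, chunk_size=16000 * 2):
    # 1) The header must be sent as the first message (16 kHz, mono, 16-bit PCM).
    yield controller_pb2.AudioStream(
        audio_stream_header=controller_pb2.AudioStreamHeader(
            audio_header=audio_pb2.AudioHeader(
                audio_format=audio_pb2.AudioHeader.AUDIO_FORMAT_PCM,
                channel_count=1,
                samples_per_second=16000,
                bits_per_sample=16,
            )
        )
    )
    # 2) Then one or more audio buffers.
    for offset in range(0, len(pcm_bytes), chunk_size):
        yield controller_pb2.AudioStream(
            audio_with_emotion=a2f_pb2.AudioWithEmotion(
                audio_buffer=pcm_bytes[offset:offset + chunk_size]
            )
        )
    # 3) EndOfAudio last, so the service knows all audio data has been sent.
    yield controller_pb2.AudioStream(
        end_of_audio=controller_pb2.AudioStream.EndOfAudio()
    )


with grpc.insecure_channel("localhost:52000") as channel:  # address is an example
    stub = controller_grpc.A2FControllerServiceStub(channel)
    pcm = open("speech.raw", "rb").read()  # raw 16 kHz, mono, 16-bit PCM bytes
    responses = stub.ProcessAudioStream(request_stream(pcm))
    for msg in responses:
        print(msg.WhichOneof("stream_part"))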

Service protobuf objects#

Warning

After streaming all of the audio data, the client must send an empty AudioStream.EndOfAudio message to signal to the service that all audio data has been sent. Only then does the service return a gRPC status. If any error occurs during input streaming, the service returns a gRPC error status immediately.

nvidia_ace.controller.v1.proto
syntax = "proto3";

package nvidia_ace.controller.v1;

import "nvidia_ace.a2f.v1.proto";
import "nvidia_ace.animation_data.v1.proto";
import "nvidia_ace.audio.v1.proto";
import "nvidia_ace.status.v1.proto";
import "google/protobuf/any.proto";

message AudioStream {
  // This is a marker for the end of an audio clip.
  message EndOfAudio {}

  oneof stream_part {
    // The header must be sent as the first message.
    AudioStreamHeader audio_stream_header = 1;
    // At least one AudioWithEmotion message must be sent thereafter.
    nvidia_ace.a2f.v1.AudioWithEmotion audio_with_emotion = 2;
    // The EndOfAudio must be sent last.
    EndOfAudio end_of_audio = 3;
  }
}

// IMPORTANT NOTE: this is an AudioStreamHeader WITHOUT ID
// A similar AudioStreamHeader exists in nvidia_ace.a2f.v1.proto
// but that one does contain IDs.
message AudioStreamHeader {
  // Metadata about the audio being sent to the service.
  nvidia_ace.audio.v1.AudioHeader audio_header = 1;
  // Parameters for updating the facial characteristics of an avatar.
  // See the documentation for more information.
  nvidia_ace.a2f.v1.FaceParameters face_params = 2;
  // Parameters relative to the emotion blending and processing
  // before using it to generate blendshapes.
  // See the documentation for more information.
  nvidia_ace.a2f.v1.EmotionPostProcessingParameters emotion_post_processing_params = 3;
  // Multipliers and offsets to apply to the generated blendshape values.
  nvidia_ace.a2f.v1.BlendShapeParameters blendshape_params = 4;
  // Emotion parameters (live transition time, beginning emotion)
  nvidia_ace.a2f.v1.EmotionParameters emotion_params = 5;
}

enum EventType {
  // This event type means that the A2F Microservice is done processing audio.
  // However, it doesn't mean that you have finished receiving all of the audio
  // data; you will receive a Status message once you are done receiving it.
  // Events are independent of that.
  END_OF_A2F_AUDIO_PROCESSING = 0;
}

message Event {
  // Type of the event.
  EventType event_type = 1;
  // Data attached to the event if any.
  optional google.protobuf.Any metadata = 2;
}

// IMPORTANT NOTE: this is an AnimationDataStreamHeader WITHOUT ID
// A similar AnimationDataStreamHeader exists in nvidia_ace.animation_data.v1.proto
// but that one does contain IDs.
message AnimationDataStreamHeader {
  // Metadata of the audio buffers. This defines the audio clip properties
  // at the beginning of the streaming process.
  optional nvidia_ace.audio.v1.AudioHeader audio_header = 1;
  // Metadata containing the blendshape and joints names.
  // This defines the names of the blendshapes and joints flowing through a stream.
  optional nvidia_ace.animation_data.v1.SkelAnimationHeader
      skel_animation_header = 2;

  // Time codes indicate the relative progression of animation data, audio
  // clips, etc. The unit is seconds. In addition, we also need an absolute time
  // reference shared across services. The start time is stored in time codes
  // elapsed since the Unix time epoch. start_time_code_since_epoch = `Unix
  // timestamp in seconds`. NTP should be good enough to synchronize clocks
  // across nodes. From Wikipedia: NTP can usually maintain time to within tens
  // of milliseconds over the public Internet, and can achieve better than one
  // millisecond accuracy in local area networks under ideal conditions.
  // Alternatively, there is PTP.
  double start_time_code_since_epoch = 3;

  // A generic metadata field to attach use case specific data (e.g. session id,
  // or user id?) map<string, string> metadata = 4; map<string,
  // google.protobuf.Any> metadata = 4;
}

message AnimationDataStream {
  // The header must be sent as the first message.
  // One or more animation data messages must be sent.
  // The status must be sent last and may be sent in between.
  oneof stream_part {
    // The header must be sent as the first message.
    AnimationDataStreamHeader animation_data_stream_header = 1;
    // Then one or more animation data messages must be sent.
    nvidia_ace.animation_data.v1.AnimationData animation_data = 2;
    // The event may be sent in between.
    Event event = 3;
    // The status must be sent last and may be sent in between.
    nvidia_ace.status.v1.Status status = 4;
  }
}
//nvidia_ace.controller.v1
//v1.0.0
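The AnimationDataStream messages coming back carry a oneof; the hedged continuation below shows how a client might dispatch on it, in place of the simple print loop in the sketch above. The responses iterator is the one returned by stub.ProcessAudioStream, and the field names follow the proto above.

# Continuation of the client sketch above: dispatch on the stream_part oneof
# of each AnimationDataStream message returned by ProcessAudioStream.
blendshape_names = []
for msg in responses:
    part = msg.WhichOneof("stream_part")
    if part == "animation_data_stream_header":
        header = msg.animation_data_stream_header
        blendshape_names = list(header.skel_animation_header.blend_shapes)
    elif part == "animation_data":
        for frame in msg.animation_data.skel_animation.blend_shape_weights:
            # frame.values pairs index-by-index with blendshape_names.
            weights = dict(zip(blendshape_names, frame.values))
    elif part == "event":
        # END_OF_A2F_AUDIO_PROCESSING: A2F is done processing the audio input;
        # keep reading until the final Status arrives.
        print("event:", msg.event.event_type)
    elif part == "status":
        print("status:", msg.status.code, msg.status.message)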

Dependencies#

nvidia_ace.a2f.v1.proto
syntax = "proto3";

package nvidia_ace.a2f.v1;


import "nvidia_ace.animation_id.v1.proto";
import "nvidia_ace.status.v1.proto";
import "nvidia_ace.audio.v1.proto";
import "nvidia_ace.emotion_with_timecode.v1.proto";


message AudioStream {
  // The header must be sent as the first message.
  // One or more audio with emotion messages must be sent thereafter.
  // The end of audio will happen when the client closes the connection
  oneof stream_part {
    AudioStreamHeader audio_stream_header = 1;
    nvidia_ace.a2f.v1.AudioWithEmotion audio_with_emotion = 2;
  }
}

// IMPORTANT NOTE: this is an AudioStreamHeader WITH ID
// A similar AudioStreamHeader exists in nvidia_ace.controller.v1.proto
// but that one does NOT contain IDs
message AudioStreamHeader {
  // IDs of the current stream
  nvidia_ace.animation_id.v1.AnimationIds animation_ids = 1;
  
  nvidia_ace.audio.v1.AudioHeader audio_header = 2;

  // Parameters for updating the facial characteristics of an avatar
  // See the documentation for more information
  FaceParameters face_params = 3;

  // Parameters relative to the emotion blending and processing
  // before using it to generate blendshapes
  // See the documentation for more information
  EmotionPostProcessingParameters emotion_post_processing_params = 4;

  // Multipliers and offsets to apply to the generated blendshape values
  BlendShapeParameters blendshape_params = 5;

  // Emotion parameters (live transition time, beginning emotion)
  EmotionParameters emotion_params = 6;
}

message FloatArray { repeated float values = 1; }


message FaceParameters {
  // The following float parameters are available:
  // "lowerFaceSmoothing", "upperFaceSmoothing", "lowerFaceStrength", "upperFaceStrength",
  // "faceMaskLevel", "faceMaskSoftness", "skinStrength", "blinkStrength", "eyelidOpenOffset",
  // "lipOpenOffset", "blinkOffset", "tongueStrength", "tongueHeightOffset", "tongueDepthOffset"
  map<string, float> float_params = 1;
  // With the current Audio2Face Service no integer parameters are available
  map<string, int32> integer_params = 2;
  // With the current Audio2Face Service no FloatArray parameters are available
  map<string, FloatArray> float_array_params = 3;
}

// The following blendshapes can be used as keys here:
//  "EyeBlinkLeft", "EyeLookDownLeft", "EyeLookInLeft", "EyeLookOutLeft", "EyeLookUpLeft",
//  "EyeSquintLeft", "EyeWideLeft", "EyeBlinkRight", "EyeLookDownRight", "EyeLookInRight",
//  "EyeLookOutRight", "EyeLookUpRight", "EyeSquintRight", "EyeWideRight", "JawForward",
//  "JawLeft", "JawRight", "JawOpen",  "MouthClose", "MouthFunnel", "MouthPucker", "MouthLeft",
//  "MouthRight", "MouthSmileLeft", "MouthSmileRight", "MouthFrownLeft", "MouthFrownRight",
//  "MouthDimpleLeft", "MouthDimpleRight", "MouthStretchLeft", "MouthStretchRight", "MouthRollLower",
//  "MouthRollUpper", "MouthShrugLower", "MouthShrugUpper", "MouthPressLeft", "MouthPressRight",
//  "MouthLowerDownLeft", "MouthLowerDownRight", "MouthUpperUpLeft", "MouthUpperUpRight", "BrowDownLeft",
//  "BrowDownRight", "BrowInnerUp", "BrowOuterUpLeft", "BrowOuterUpRight", "CheekPuff",
//  "CheekSquintLeft", "CheekSquintRight", "NoseSneerLeft", "NoseSneerRight", "TongueOut"
// Note1: the visual impact of some multipliers and offsets is lighter than others.
// "JawOpen", "MouthSmileLeft" and "MouthSmileRight" have a stronger visual impact.
// Note2: after applying multipliers and offsets, blendshape values are clamped between 0 and 1.
// E.g.:
// * inferenced_weight is 0.9
// * multiplier_value is set to 3
// * offset_value is set to -1
// Then the result will be:
// 0.9 * 3 - 1 = 1.7 ===> clamp between 0, 1 ===> adjusted weight is 1
message BlendShapeParameters {
  // When a key is not specified the default value is 1
  map<string, float> bs_weight_multipliers = 1;
  // When a key is not specified the default value is 0
  map<string, float> bs_weight_offsets = 2;
  // Default output bs weight is unclamped. When clamped, range is [0, 1].
  optional bool enable_clamping_bs_weight = 3;
}

message EmotionParameters {
  // Transition time value used for temporal smoothing by A2E SDK
  // Expected value range: 0 < val < inf
  optional float live_transition_time = 1;

  // Beginning emotion used for temporal emotion smoothing
  // This maps the emotion names to the corresponding emotion strength
  // Missing emotion values will be set to 0.0
  // The following emotions can be set:
  // "amazement", "anger", "cheekiness", "disgust", "fear",
  // "grief", "joy", "outofbreath", "pain", "sadness"
  // Emotion values must be set between 0.0 and 1.0
  map<string, float> beginning_emotion = 2;
}

// For more information refer to the documentation
message EmotionPostProcessingParameters {
  // Increases the spread between emotion values by pushing them higher or lower.
  // Default value: 1
  // Min: 0.3
  // Max: 3
  optional float emotion_contrast = 1;

  // Coefficient for smoothing emotions over time
  //  0 means no smoothing at all (can be jittery)
  //  1 means extreme smoothing (emotion values not updated over time)
  // Default value: 0.7
  // Min: 0
  // Max: 1
  optional float live_blend_coef = 2;

  // Activate blending between the preferred emotions (passed as input) and the emotions detected by A2E.
  // Default: True
  optional bool enable_preferred_emotion = 3;

  // Sets the strength of the preferred emotions (passed as input) relative to emotions detected by A2E.
  // 0 means only A2E output will be used for emotion rendering.
  // 1 means only the preferred emotions will be used for emotion rendering.
  // Default value: 0.5
  // Min: 0
  // Max: 1
  optional float preferred_emotion_strength = 4;

  // Sets the strength of generated emotions relative to neutral emotion.
  // This multiplier is applied globally after the mix of emotion is done.
  // If set to 0, emotion will be neutral.
  // If set to 1, the blend of emotion will be fully used. (can be too intense)
  // Default value: 0.6
  // Min: 0
  // Max: 1
  optional float emotion_strength = 5;

  // Sets a firm limit on the number of emotion sliders engaged by A2E.
  // Emotions with the highest weight will be prioritized.
  // Default value: 3
  // Min: 1
  // Max: 6
  optional int32 max_emotions = 6;
}

message AudioWithEmotion {
  // Audio buffer in bytes; interpret it according to the audio header.
  bytes audio_buffer = 1;

  // The time codes are relative to the beginning of the audio clip.
  repeated nvidia_ace.emotion_with_timecode.v1.EmotionWithTimeCode emotions = 2;
}
//nvidia_ace.a2f.v1
//v1.1.0
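To show how the parameter messages above are typically filled in, here is a hedged Python sketch. a2f_pb2 is an assumed name for the module generated from nvidia_ace.a2f.v1.proto, and the values are illustrative, not recommended defaults.

# Hedged sketch: building the tuning messages defined in nvidia_ace.a2f.v1.proto.
# The module name is assumed; the values are examples only.
import nvidia_ace_a2f_v1_pb2 as a2f_pb2  # assumed module name

face_params = a2f_pb2.FaceParameters(
    float_params={"skinStrength": 1.0, "blinkStrength": 1.0}
)

# Worked example from the comment above: weight 0.9, multiplier 3, offset -1
# gives 0.9 * 3 - 1 = 1.7, clamped to 1 when clamping is enabled.
blendshape_params = a2f_pb2.BlendShapeParameters(
    bs_weight_multipliers={"JawOpen": 3.0},
    bs_weight_offsets={"JawOpen": -1.0},
    enable_clamping_bs_weight=True,
)

emotion_pp = a2f_pb2.EmotionPostProcessingParameters(
    emotion_contrast=1.0,
    live_blend_coef=0.7,
    enable_preferred_emotion=True,
    preferred_emotion_strength=0.5,
    emotion_strength=0.6,
    max_emotions=3,
)

emotion_params = a2f_pb2.EmotionParameters(
    live_transition_time=0.5,
    beginning_emotion={"joy": 0.5},
)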
nvidia_ace.emotion_with_timecode.v1.proto
syntax = "proto3";

package nvidia_ace.emotion_with_timecode.v1;

// Emotions with time code allow clients to control when emotions are
// being applied to an audio clip
// Example 1:
// time_code = 0.0
// emotion = { "joy" : 1.0 }
// At the start of the audio clip, the joy emotion will be applied
// at its maximum intensity.
// Example 2:
// time_code = 3.0
// emotion = { "outofbreath" : 0.5 }
// At the 3-second mark in the audio clip, the outofbreath emotion
// will be applied at half intensity.
message EmotionWithTimeCode {
  // Time when to apply the selected emotion
  // This time is relative to the beginning of the audio clip
  double time_code = 1;
  // This maps the emotion names to the corresponding emotion strength
  // Missing emotion values will be set to 0.0
  // The following emotions can be set:
  // "amazement", "anger", "cheekiness", "disgust", "fear",
  // "grief", "joy", "outofbreath", "pain", "sadness"
  // Emotion values must be set between 0.0 and 1.0
  map<string, float> emotion = 2;
}
//nvidia_ace.emotion_with_timecode.v1
//v1.0.0
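The two examples in the comments above translate directly into messages attached to an AudioWithEmotion. The module names below are assumptions.

# Hedged sketch: the two examples from the comments above as protobuf messages.
import nvidia_ace_emotion_with_timecode_v1_pb2 as emotion_pb2  # assumed name
import nvidia_ace_a2f_v1_pb2 as a2f_pb2                        # assumed name

emotions = [
    # Example 1: joy at maximum intensity at the start of the clip.
    emotion_pb2.EmotionWithTimeCode(time_code=0.0, emotion={"joy": 1.0}),
    # Example 2: outofbreath at half intensity at the 3-second mark.
    emotion_pb2.EmotionWithTimeCode(time_code=3.0, emotion={"outofbreath": 0.5}),
]

pcm_chunk = b""  # placeholder for real PCM bytes
audio_msg = a2f_pb2.AudioWithEmotion(audio_buffer=pcm_chunk, emotions=emotions)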
nvidia_ace.audio.v1.proto
syntax = "proto3";

package nvidia_ace.audio.v1;

message AudioHeader {
  enum AudioFormat { AUDIO_FORMAT_PCM = 0; }

  // Example value: AUDIO_FORMAT_PCM
  AudioFormat audio_format = 1;

  // Currently, only mono audio is supported.
  // Example value: 1
  uint32 channel_count = 2;

  // Defines the sample rate of the provided audio data
  // Example value: 16000
  uint32 samples_per_second = 3;

  // Currently, only 16 bits per sample are supported.
  // Example value: 16
  uint32 bits_per_sample = 4;
}
//nvidia_ace.audio.v1
//v1.0.0
nvidia_ace.animation_data.v1.proto
syntax = "proto3";

package nvidia_ace.animation_data.v1;

import "nvidia_ace.animation_id.v1.proto";
import "nvidia_ace.audio.v1.proto";
import "nvidia_ace.status.v1.proto";

import "google/protobuf/any.proto";

// IMPORTANT NOTE: this is an AnimationDataStreamHeader WITH ID
// A similar AnimationDataStreamHeader exists in nvidia_ace.controller.v1.proto
// but that one does NOT contain IDs
message AnimationDataStreamHeader {
  nvidia_ace.animation_id.v1.AnimationIds animation_ids = 1;

  // This is required to identify from which animation source (e.g. A2F) the
  // request originates. This allows us to map the incoming animation data
  // stream to the correct pose provider animation graph node. The animation
  // source MSs (e.g. A2F MS) should populate this with their name. (e.g. A2F).
  // Example Value: "A2F MS"
  optional string source_service_id = 2;

  // Metadata of the audio buffers. This defines the audio clip properties
  // at the beginning of the streaming process.
  optional nvidia_ace.audio.v1.AudioHeader audio_header = 3;

  // Metadata containing the blendshape and joints names.
  // This defines the names of the blendshapes and joints flowing through a stream.
  optional nvidia_ace.animation_data.v1.SkelAnimationHeader
      skel_animation_header = 4;

  // Animation data streams use time codes (`time_code`) to define the temporal
  // position of audio (e.g. `AudioWithTimeCode`), animation key frames (e.g.
  // `SkelAnimation`), etc. relative to the beginning of the stream. The unit of
  // `time_code` is seconds. In addition, the `AnimationDataStreamHeader` also
  // provides the `start_time_code_since_epoch` field, which defines the
  // absolute start time of the animation data stream. This start time is stored
  // in seconds elapsed since the Unix time epoch.
  double start_time_code_since_epoch = 5;

  // A generic metadata field to attach use case specific data (e.g. session id,
  // or user id?) map<string, string> metadata = 6; map<string,
  // google.protobuf.Any> metadata = 6;
}

// This message represents each message of an animation data stream.
message AnimationDataStream {
  oneof stream_part {
    // The header must be sent as the first message.
    AnimationDataStreamHeader animation_data_stream_header = 1;
    // Then one or more animation data messages must be sent.
    nvidia_ace.animation_data.v1.AnimationData animation_data = 2;
    // The status must be sent last and may be sent in between.
    nvidia_ace.status.v1.Status status = 3;
  }
}

message AnimationData {
  optional SkelAnimation skel_animation = 1;
  optional AudioWithTimeCode audio = 2;
  optional Camera camera = 3;

  // Metadata such as emotion aggregates, etc...
  map<string, google.protobuf.Any> metadata = 4;
}

message AudioWithTimeCode {
  // The time code is relative to the `start_time_code_since_epoch`.
  // Example Value: 0.0 (for the very first audio buffer flowing out of a service)
  double time_code = 1;
  // Audio data in bytes; refer to the audio header for how to interpret
  // these bytes.
  bytes audio_buffer = 2;
}

message SkelAnimationHeader {
  // Names of the blendshapes, sent only once in the header.
  // The position of these names matches the position of the values
  // in the blendshape messages.
  // As an example if the blendshape names are ["Eye Left", "Eye Right", "Jaw"]
  // Then when receiving blendshape data over the streaming process
  // E.g.: [0.1, 0.5, 0.2] & timecode = 0.0
  // The pairing will be for timecode=0.0, "Eye Left"=0.1,  "Eye Right"=0.5, "Jaw"=0.2
  repeated string blend_shapes = 1;
  // Names of the joints, sent only once in the header.
  repeated string joints = 2;
}

message SkelAnimation {
  // Time codes must be strictly monotonically increasing.
  // Two successive SkelAnimation messages must not have overlapping time code
  // ranges.
  repeated FloatArrayWithTimeCode blend_shape_weights = 1;
  repeated Float3ArrayWithTimeCode translations = 2;
  repeated QuatFArrayWithTimeCode rotations = 3;
  repeated Float3ArrayWithTimeCode scales = 4;
}

message Camera {
  repeated Float3WithTimeCode position = 1;
  repeated QuatFWithTimeCode rotation = 2;

  repeated FloatWithTimeCode focal_length = 3;
  repeated FloatWithTimeCode focus_distance = 4;
}

message FloatArrayWithTimeCode {
  double time_code = 1;
  repeated float values = 2;
}

message Float3ArrayWithTimeCode {
  double time_code = 1;
  repeated Float3 values = 2;
}

message QuatFArrayWithTimeCode {
  double time_code = 1;
  repeated QuatF values = 2;
}

message Float3WithTimeCode {
  double time_code = 1;
  Float3 value = 2;
}

message QuatFWithTimeCode {
  double time_code = 1;
  QuatF value = 2;
}

message FloatWithTimeCode {
  double time_code = 1;
  float value = 2;
}

message QuatF {
  float real = 1;
  float i = 2;
  float j = 3;
  float k = 4;
}

message Float3 {
  float x = 1;
  float y = 2;
  float z = 3;
}
//nvidia_ace.animation_data.v1
//v1.0.0
nvidia_ace.status.v1.proto
syntax = "proto3";

package nvidia_ace.status.v1;

// This status message indicates the result of an operation.
// Refer to the rpc using it for more information.
message Status {
  enum Code {
    SUCCESS = 0;
    INFO = 1;
    WARNING = 2;
    ERROR = 3;
  }
  // Type of message returned by the service
  // Example value: SUCCESS
  Code code = 1;
  // Message returned by the service
  // Example value: "Audio processing completed successfully!"
  string message = 2;
}
//nvidia_ace.status.v1
//v1.0.0
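The Status message above reports application-level results inside the stream; transport-level failures surface as gRPC errors on the call itself, as noted in the warning earlier. A hedged sketch follows, reusing stub, request_stream, and pcm from the client sketch above; handle() is a hypothetical callback, e.g. the dispatch loop shown earlier.

# Hedged sketch: transport-level gRPC errors versus the in-stream Status above.
# stub, request_stream, and pcm come from the earlier client sketch; handle()
# is a hypothetical callback.
import grpc

try:
    for msg in stub.ProcessAudioStream(request_stream(pcm)):
        handle(msg)
except grpc.RpcError as err:
    print("gRPC error:", err.code(), err.details())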

Configuration Fetching gRPC#

Service Definition#

This service is in alpha.

The GetConfigs rpc in the proto below is the only call you need to make to fetch the current configuration of the Audio2Face-3D microservice. For more information, refer to the A2F-3D NIM Manual Container Deployment and Configuration page.

nvidia_ace.services.a2x_export_config.v1.proto
syntax = "proto3";

// This proto is in alpha version and might be subject to future changes
package nvidia_ace.services.a2x_export_config.v1;

service A2XExportConfigService {
  rpc GetConfigs(ConfigsTypeRequest) returns (stream A2XConfig) {}
}

message ConfigsTypeRequest {
  enum ConfigType {
    YAML = 0; // YAML should be chosen for updating the A2F MS
    JSON = 1;
  }
  ConfigType config_type = 1;
}

message A2XConfig {
  // E.g.:
  // contains claire_inference.yaml
  string name = 1;
  // File Content
  string content = 2;
}

// nvidia_ace.services.a2f_export_config.v1
// v0.1.0
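Here is a hedged sketch of a GetConfigs client that saves each returned configuration file to disk. The generated module names and the address are assumptions.

# Hedged sketch: fetch the current configuration files and save them locally.
# Module names and the address are assumptions; adapt them to your generated
# stubs and deployment.
import grpc
import nvidia_ace_services_a2x_export_config_v1_pb2 as config_pb2        # assumed name
import nvidia_ace_services_a2x_export_config_v1_pb2_grpc as config_grpc  # assumed name

with grpc.insecure_channel("localhost:52000") as channel:  # address is an example
    stub = config_grpc.A2XExportConfigServiceStub(channel)
    request = config_pb2.ConfigsTypeRequest(
        config_type=config_pb2.ConfigsTypeRequest.YAML  # YAML for updating the A2F MS
    )
    for config in stub.GetConfigs(request):
        with open(config.name, "w") as f:  # e.g. an inference yaml file
            f.write(config.content)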

Unidirectional Streaming gRPC#


This section describes communication with the two unidirectional streaming endpoints when the Audio2Face-3D microservice runs in legacy mode. To interact with Audio2Face-3D, you create a client to send data and implement a server to receive data.

Client - Service

This is the proto of the gRPC server you need to send data to.

nvidia_ace.services.a2f.v1.proto
syntax = "proto3";

package nvidia_ace.services.a2f.v1;

import "nvidia_ace.a2f.v1.proto";
import "nvidia_ace.status.v1.proto";

service A2FService {
  // RPC to implement to send audio data to Audio2Face Microservice
  // An example use for this RPC is a client pushing audio buffers to
  // Audio2Face Microservice (server)
  rpc PushAudioStream(stream nvidia_ace.a2f.v1.AudioStream)
      returns (nvidia_ace.status.v1.Status) {}
}
//nvidia_ace.services.a2f.v1
//v1.0.0
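In legacy mode the client pushes the same kind of AudioStream messages to PushAudioStream and receives a single Status once it closes the request stream. A hedged sketch follows; the module names are assumptions, and the fields of AnimationIds come from nvidia_ace.animation_id.v1.proto, which is not reproduced on this page.

# Hedged sketch: push audio to the A2FService in legacy mode.
# Module names are assumptions; AnimationIds fields are defined in
# nvidia_ace.animation_id.v1.proto, which is not shown on this page.
import grpc
import nvidia_ace_services_a2f_v1_pb2_grpc as a2f_service_grpc  # assumed name
import nvidia_ace_a2f_v1_pb2 as a2f_pb2                         # assumed name
import nvidia_ace_animation_id_v1_pb2 as animation_id_pb2       # assumed name
import nvidia_ace_audio_v1_pb2 as audio_pb2                     # assumed name


def audio_stream(pcm_bytes, ids, chunk_size=16000 * 2):
    # Header first: note that this AudioStreamHeader carries AnimationIds.
    yield a2f_pb2.AudioStream(
        audio_stream_header=a2f_pb2.AudioStreamHeader(
            animation_ids=ids,
            audio_header=audio_pb2.AudioHeader(
                audio_format=audio_pb2.AudioHeader.AUDIO_FORMAT_PCM,
                channel_count=1,
                samples_per_second=16000,
                bits_per_sample=16,
            ),
        )
    )
    # Then the audio buffers; end of audio is signaled by closing the stream.
    for offset in range(0, len(pcm_bytes), chunk_size):
        yield a2f_pb2.AudioStream(
            audio_with_emotion=a2f_pb2.AudioWithEmotion(
                audio_buffer=pcm_bytes[offset:offset + chunk_size]
            )
        )


with grpc.insecure_channel("localhost:50000") as channel:  # address is an example
    stub = a2f_service_grpc.A2FServiceStub(channel)
    ids = animation_id_pb2.AnimationIds()  # fill in the ID fields for your setup
    pcm = open("speech.raw", "rb").read()  # raw 16 kHz, mono, 16-bit PCM bytes
    status = stub.PushAudioStream(audio_stream(pcm, ids))
    print(status.code, status.message)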

Client - Protobuf Data

nvidia_ace.a2f.v1.proto
syntax = "proto3";

package nvidia_ace.a2f.v1;


import "nvidia_ace.animation_id.v1.proto";
import "nvidia_ace.status.v1.proto";
import "nvidia_ace.audio.v1.proto";
import "nvidia_ace.emotion_with_timecode.v1.proto";


message AudioStream {
  // The header must be sent as the first message.
  // One or more audio with emotion messages must be sent thereafter.
  // The end of audio will happen when the client closes the connection
  oneof stream_part {
    AudioStreamHeader audio_stream_header = 1;
    nvidia_ace.a2f.v1.AudioWithEmotion audio_with_emotion = 2;
  }
}

// IMPORTANT NOTE: this is an AudioStreamHeader WITH ID
// A similar AudioStreamHeader exists in nvidia_ace.controller.v1.proto
// but that one does NOT contain IDs
message AudioStreamHeader {
  // IDs of the current stream
  nvidia_ace.animation_id.v1.AnimationIds animation_ids = 1;
  
  nvidia_ace.audio.v1.AudioHeader audio_header = 2;

  // Parameters for updating the facial characteristics of an avatar
  // See the documentation for more information
  FaceParameters face_params = 3;

  // Parameters relative to the emotion blending and processing
  // before using it to generate blendshapes
  // See the documentation for more information
  EmotionPostProcessingParameters emotion_post_processing_params = 4;

  // Multipliers and offsets to apply to the generated blendshape values
  BlendShapeParameters blendshape_params = 5;

  // Emotion parameters (live transition time, beginning emotion)
  EmotionParameters emotion_params = 6;
}

message FloatArray { repeated float values = 1; }


message FaceParameters {
  // The following float parameters are available:
  // "lowerFaceSmoothing", "upperFaceSmoothing", "lowerFaceStrength", "upperFaceStrength",
  // "faceMaskLevel", "faceMaskSoftness", "skinStrength", "blinkStrength", "eyelidOpenOffset",
  // "lipOpenOffset", "blinkOffset", "tongueStrength", "tongueHeightOffset", "tongueDepthOffset"
  map<string, float> float_params = 1;
  // With the current Audio2Face Service no integer parameters are available
  map<string, int32> integer_params = 2;
  // With the current Audio2Face Service no FloatArray parameters are available
  map<string, FloatArray> float_array_params = 3;
}

// The following blendshapes can be used as keys here:
//  "EyeBlinkLeft", "EyeLookDownLeft", "EyeLookInLeft", "EyeLookOutLeft", "EyeLookUpLeft",
//  "EyeSquintLeft", "EyeWideLeft", "EyeBlinkRight", "EyeLookDownRight", "EyeLookInRight",
//  "EyeLookOutRight", "EyeLookUpRight", "EyeSquintRight", "EyeWideRight", "JawForward",
//  "JawLeft", "JawRight", "JawOpen",  "MouthClose", "MouthFunnel", "MouthPucker", "MouthLeft",
//  "MouthRight", "MouthSmileLeft", "MouthSmileRight", "MouthFrownLeft", "MouthFrownRight",
//  "MouthDimpleLeft", "MouthDimpleRight", "MouthStretchLeft", "MouthStretchRight", "MouthRollLower",
//  "MouthRollUpper", "MouthShrugLower", "MouthShrugUpper", "MouthPressLeft", "MouthPressRight",
//  "MouthLowerDownLeft", "MouthLowerDownRight", "MouthUpperUpLeft", "MouthUpperUpRight", "BrowDownLeft",
//  "BrowDownRight", "BrowInnerUp", "BrowOuterUpLeft", "BrowOuterUpRight", "CheekPuff",
//  "CheekSquintLeft", "CheekSquintRight", "NoseSneerLeft", "NoseSneerRight", "TongueOut"
// Note1: the visual impact of some multipliers and offsets is lighter than others.
// "JawOpen", "MouthSmileLeft" and "MouthSmileRight" have a stronger visual impact.
// Note2: after applying multipliers and offsets, blendshape values are clamped between 0 and 1.
// E.g.:
// * inferenced_weight is 0.9
// * multiplier_value is set to 3
// * offset_value is set to -1
// Then the result will be:
// 0.9 * 3 - 1 = 1.7 ===> clamp between 0, 1 ===> adjusted weight is 1
message BlendShapeParameters {
  // When a key is not specified the default value is 1
  map<string, float> bs_weight_multipliers = 1;
  // When a key is not specified the default value is 0
  map<string, float> bs_weight_offsets = 2;
  // Default output bs weight is unclamped. When clamped, range is [0, 1].
  optional bool enable_clamping_bs_weight = 3;
}

message EmotionParameters {
  // Transition time value used for temporal smoothing by A2E SDK
  // Expected value range: 0 < val < inf
  optional float live_transition_time = 1;

  // Beginning emotion used for temporal emotion smoothing
  // This maps the emotion names to the corresponding emotion strength
  // Missing emotion values will be set to 0.0
  // The following emotions can be set:
  // "amazement", "anger", "cheekiness", "disgust", "fear",
  // "grief", "joy", "outofbreath", "pain", "sadness"
  // Emotion values must be set between 0.0 and 1.0
  map<string, float> beginning_emotion = 2;
}

// For more information refer to the documentation
message EmotionPostProcessingParameters {
  // Increases the spread between emotion values by pushing them higher or lower.
  // Default value: 1
  // Min: 0.3
  // Max: 3
  optional float emotion_contrast = 1;

  // Coefficient for smoothing emotions over time
  //  0 means no smoothing at all (can be jittery)
  //  1 means extreme smoothing (emotion values not updated over time)
  // Default value: 0.7
  // Min: 0
  // Max: 1
  optional float live_blend_coef = 2;

  // Activate blending between the preferred emotions (passed as input) and the emotions detected by A2E.
  // Default: True
  optional bool enable_preferred_emotion = 3;

  // Sets the strength of the preferred emotions (passed as input) relative to emotions detected by A2E.
  // 0 means only A2E output will be used for emotion rendering.
  // 1 means only the preferred emotions will be used for emotion rendering.
  // Default value: 0.5
  // Min: 0
  // Max: 1
  optional float preferred_emotion_strength = 4;

  // Sets the strength of generated emotions relative to neutral emotion.
  // This multiplier is applied globally after the mix of emotion is done.
  // If set to 0, emotion will be neutral.
  // If set to 1, the blend of emotion will be fully used. (can be too intense)
  // Default value: 0.6
  // Min: 0
  // Max: 1
  optional float emotion_strength = 5;

  // Sets a firm limit on the number of emotion sliders engaged by A2E.
  // Emotions with the highest weight will be prioritized.
  // Default value: 3
  // Min: 1
  // Max: 6
  optional int32 max_emotions = 6;
}

message AudioWithEmotion {
  // Audio buffer in bytes; interpret it according to the audio header.
  bytes audio_buffer = 1;

  // The time codes are relative to the beginning of the audio clip.
  repeated nvidia_ace.emotion_with_timecode.v1.EmotionWithTimeCode emotions = 2;
}
//nvidia_ace.a2f.v1
//v1.1.0

Server - Service

Implementing the PushAnimationDataStream rpc as a server allows you to receive data from A2F-3D.

nvidia_ace.services.animation_data.v1.proto
syntax = "proto3";

package nvidia_ace.services.animation_data.v1;

import "nvidia_ace.animation_data.v1.proto";
import "nvidia_ace.animation_id.v1.proto";
import "nvidia_ace.status.v1.proto";

// Two RPCs exist to provide a stream of animation data.
// Which RPC to implement depends on whether that part of the service
// is a client or a server.
// E.g.: in the case of the Animation Graph Microservice, we implement both RPCs,
// one to receive and one to send.
service AnimationDataService {
  // When the service creating the animation data is a client of the service
  // receiving it, this push RPC must be used.
  // An example of this is the Audio2Face Microservice creating animation data
  // and sending it to the Animation Graph Microservice.
  rpc PushAnimationDataStream(stream nvidia_ace.animation_data.v1.AnimationDataStream)
      returns (nvidia_ace.status.v1.Status) {}
  // When the service creating the animation data is a server to the service
  // receiving it, this pull RPC must be used.
  // An example of this is the Omniverse Renderer Microservice requesting
  // animation data from the Animation Graph Microservice.
  rpc PullAnimationDataStream(nvidia_ace.animation_id.v1.AnimationIds)
      returns (stream nvidia_ace.animation_data.v1.AnimationDataStream) {}
}
//nvidia_ace.services.animation_data.v1
//v1.0.0
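On the receiving side you implement the server half of AnimationDataService. A hedged sketch with grpcio follows; the module names are assumptions about how the protos were compiled.

# Hedged sketch: a minimal server receiving animation data from A2F-3D.
# Module names are assumptions about how the protos were compiled.
from concurrent import futures
import grpc
import nvidia_ace_services_animation_data_v1_pb2_grpc as anim_service_grpc  # assumed name
import nvidia_ace_status_v1_pb2 as status_pb2                               # assumed name


class AnimationDataReceiver(anim_service_grpc.AnimationDataServiceServicer):
    def PushAnimationDataStream(self, request_iterator, context):
        for msg in request_iterator:
            part = msg.WhichOneof("stream_part")
            if part == "animation_data_stream_header":
                print("stream started:", msg.animation_data_stream_header.animation_ids)
            elif part == "animation_data":
                pass  # consume blendshapes / audio here
            elif part == "status":
                print("sender status:", msg.status.message)
        return status_pb2.Status(code=status_pb2.Status.SUCCESS, message="received")


server = grpc.server(futures.ThreadPoolExecutor(max_workers=4))
anim_service_grpc.add_AnimationDataServiceServicer_to_server(AnimationDataReceiver(), server)
server.add_insecure_port("0.0.0.0:51000")  # port is an example
server.start()
server.wait_for_termination()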

Server - Protobuf Data

nvidia_ace.a2f.v1.proto
syntax = "proto3";

package nvidia_ace.a2f.v1;


import "nvidia_ace.animation_id.v1.proto";
import "nvidia_ace.status.v1.proto";
import "nvidia_ace.audio.v1.proto";
import "nvidia_ace.emotion_with_timecode.v1.proto";


message AudioStream {
  // The header must be sent as the first message.
  // One or more audio with emotion messages must be sent thereafter.
  // The end of audio will happen when the client closes the connection
  oneof stream_part {
    AudioStreamHeader audio_stream_header = 1;
    nvidia_ace.a2f.v1.AudioWithEmotion audio_with_emotion = 2;
  }
}

// IMPORTANT NOTE: this is an AudioStreamHeader WITH ID
// A similar AudioStreamHeader exists in nvidia_ace.controller.v1.proto
// but that one does NOT contain IDs
message AudioStreamHeader {
  // IDs of the current stream
  nvidia_ace.animation_id.v1.AnimationIds animation_ids = 1;
  
  nvidia_ace.audio.v1.AudioHeader audio_header = 2;

  // Parameters for updating the facial characteristics of an avatar
  // See the documentation for more information
  FaceParameters face_params = 3;

  // Parameters relative to the emotion blending and processing
  // before using it to generate blendshapes
  // See the documentation for more information
  EmotionPostProcessingParameters emotion_post_processing_params = 4;

  // Multipliers and offsets to apply to the generated blendshape values
  BlendShapeParameters blendshape_params = 5;

  // Emotion parameters (live transition time, beginning emotion)
  EmotionParameters emotion_params = 6;
}

message FloatArray { repeated float values = 1; }


message FaceParameters {
  // The following float parameters are available:
  // "lowerFaceSmoothing", "upperFaceSmoothing", "lowerFaceStrength", "upperFaceStrength",
  // "faceMaskLevel", "faceMaskSoftness", "skinStrength", "blinkStrength", "eyelidOpenOffset",
  // "lipOpenOffset", "blinkOffset", "tongueStrength", "tongueHeightOffset", "tongueDepthOffset"
  map<string, float> float_params = 1;
  // With the current Audio2Face Service no integer parameters are available
  map<string, int32> integer_params = 2;
  // With the current Audio2Face Service no FloatArray parameters are available
  map<string, FloatArray> float_array_params = 3;
}

// The following blendshapes can be used as keys here:
//  "EyeBlinkLeft", "EyeLookDownLeft", "EyeLookInLeft", "EyeLookOutLeft", "EyeLookUpLeft",
//  "EyeSquintLeft", "EyeWideLeft", "EyeBlinkRight", "EyeLookDownRight", "EyeLookInRight",
//  "EyeLookOutRight", "EyeLookUpRight", "EyeSquintRight", "EyeWideRight", "JawForward",
//  "JawLeft", "JawRight", "JawOpen",  "MouthClose", "MouthFunnel", "MouthPucker", "MouthLeft",
//  "MouthRight", "MouthSmileLeft", "MouthSmileRight", "MouthFrownLeft", "MouthFrownRight",
//  "MouthDimpleLeft", "MouthDimpleRight", "MouthStretchLeft", "MouthStretchRight", "MouthRollLower",
//  "MouthRollUpper", "MouthShrugLower", "MouthShrugUpper", "MouthPressLeft", "MouthPressRight",
//  "MouthLowerDownLeft", "MouthLowerDownRight", "MouthUpperUpLeft", "MouthUpperUpRight", "BrowDownLeft",
//  "BrowDownRight", "BrowInnerUp", "BrowOuterUpLeft", "BrowOuterUpRight", "CheekPuff",
//  "CheekSquintLeft", "CheekSquintRight", "NoseSneerLeft", "NoseSneerRight", "TongueOut"
// Note1: the visual impact of some multipliers and offsets is lighter than others.
// "JawOpen", "MouthSmileLeft" and "MouthSmileRight" have a stronger visual impact.
// Note2: after applying multipliers and offsets, blendshape values are clamped between 0 and 1.
// E.g.:
// * inferenced_weight is 0.9
// * multiplier_value is set to 3
// * offset_value is set to -1
// Then the result will be:
// 0.9 * 3 - 1 = 1.7 ===> clamp between 0, 1 ===> adjusted weight is 1
message BlendShapeParameters {
  // When a key is not specified the default value is 1
  map<string, float> bs_weight_multipliers = 1;
  // When a key is not specified the default value is 0
  map<string, float> bs_weight_offsets = 2;
  // Default output bs weight is unclamped. When clamped, range is [0, 1].
  optional bool enable_clamping_bs_weight = 3;
}

message EmotionParameters {
  // Transition time value used for temporal smoothing by A2E SDK
  // Expected value range: 0 < val < inf
  optional float live_transition_time = 1;

  // Beginning emotion used for temporal emotion smoothing
  // This maps the emotion names to the corresponding emotion strength
  // Missing emotion values will be set to 0.0
  // The following emotions can be set:
  // "amazement", "anger", "cheekiness", "disgust", "fear",
  // "grief", "joy", "outofbreath", "pain", "sadness"
  // Emotion values must be set between 0.0 and 1.0
  map<string, float> beginning_emotion = 2;
}

// For more information refer to the documentation
message EmotionPostProcessingParameters {
  // Increases the spread between emotion values by pushing them higher or lower.
  // Default value: 1
  // Min: 0.3
  // Max: 3
  optional float emotion_contrast = 1;

  // Coefficient for smoothing emotions over time
  //  0 means no smoothing at all (can be jittery)
  //  1 means extreme smoothing (emotion values not updated over time)
  // Default value: 0.7
  // Min: 0
  // Max: 1
  optional float live_blend_coef = 2;

  // Activate blending between the preferred emotions (passed as input) and the emotions detected by A2E.
  // Default: True
  optional bool enable_preferred_emotion = 3;

  // Sets the strength of the preferred emotions (passed as input) relative to emotions detected by A2E.
  // 0 means only A2E output will be used for emotion rendering.
  // 1 means only the preferred emotions will be used for emotion rendering.
  // Default value: 0.5
  // Min: 0
  // Max: 1
  optional float preferred_emotion_strength = 4;

  // Sets the strength of generated emotions relative to neutral emotion.
  // This multiplier is applied globally after the mix of emotion is done.
  // If set to 0, emotion will be neutral.
  // If set to 1, the blend of emotion will be fully used. (can be too intense)
  // Default value: 0.6
  // Min: 0
  // Max: 1
  optional float emotion_strength = 5;

  // Sets a firm limit on the number of emotion sliders engaged by A2E.
  // Emotions with the highest weight will be prioritized.
  // Default value: 3
  // Min: 1
  // Max: 6
  optional int32 max_emotions = 6;
}

message AudioWithEmotion {
  // Audio buffer in bytes; interpret it according to the audio header.
  bytes audio_buffer = 1;

  // The time codes are relative to the beginning of the audio clip.
  repeated nvidia_ace.emotion_with_timecode.v1.EmotionWithTimeCode emotions = 2;
}
//nvidia_ace.a2f.v1
//v1.1.0