/*
 *  Copyright (c) 2018 The WebRTC project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "modules/audio_processing/agc2/rnn_vad/rnn.h"

#include <algorithm>
#include <array>
#include <cmath>

#include "rtc_base/checks.h"
#include "third_party/rnnoise/src/rnn_activations.h"
#include "third_party/rnnoise/src/rnn_vad_weights.h"

namespace webrtc {
namespace rnn_vad {

using rnnoise::kWeightsScale;
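
// The weights and biases imported below are stored as quantized int8_t values
// and are brought back to floating point on the fly: every weighted sum
// computed in this file is scaled by kWeightsScale before the activation
// function is applied.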

using rnnoise::kInputLayerInputSize;
static_assert(kFeatureVectorSize == kInputLayerInputSize, "");
using rnnoise::kInputDenseWeights;
using rnnoise::kInputDenseBias;
using rnnoise::kInputLayerOutputSize;
static_assert(kInputLayerOutputSize <= kFullyConnectedLayersMaxUnits,
              "Increase kFullyConnectedLayersMaxUnits.");

using rnnoise::kHiddenGruRecurrentWeights;
using rnnoise::kHiddenGruWeights;
using rnnoise::kHiddenGruBias;
using rnnoise::kHiddenLayerOutputSize;
static_assert(kHiddenLayerOutputSize <= kRecurrentLayersMaxUnits,
              "Increase kRecurrentLayersMaxUnits.");

using rnnoise::kOutputDenseWeights;
using rnnoise::kOutputDenseBias;
using rnnoise::kOutputLayerOutputSize;
static_assert(kOutputLayerOutputSize <= kFullyConnectedLayersMaxUnits,
              "Increase kFullyConnectedLayersMaxUnits.");

using rnnoise::RectifiedLinearUnit;
using rnnoise::SigmoidApproximated;
using rnnoise::TansigApproximated;
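
// The static_asserts above, together with the RTC_DCHECKs in the layer
// constructors, guarantee that the fixed-size output and state buffers
// (statically over-allocated to kFullyConnectedLayersMaxUnits and
// kRecurrentLayersMaxUnits elements) are large enough for the RNNoise
// weights, so the layers can run without dynamic allocation.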

FullyConnectedLayer::FullyConnectedLayer(
    const size_t input_size,
    const size_t output_size,
    const rtc::ArrayView<const int8_t> bias,
    const rtc::ArrayView<const int8_t> weights,
    float (*const activation_function)(float))
    : input_size_(input_size),
      output_size_(output_size),
      bias_(bias),
      weights_(weights),
      activation_function_(activation_function) {
  RTC_DCHECK_LE(output_size_, kFullyConnectedLayersMaxUnits)
      << "Static over-allocation of fully-connected layers output vectors is "
         "not sufficient.";
  RTC_DCHECK_EQ(output_size_, bias_.size())
      << "Mismatching output size and bias terms array size.";
  RTC_DCHECK_EQ(input_size_ * output_size_, weights_.size())
      << "Mismatching input-output size and weight coefficients array size.";
}

FullyConnectedLayer::~FullyConnectedLayer() = default;

rtc::ArrayView<const float> FullyConnectedLayer::GetOutput() const {
  return rtc::ArrayView<const float>(output_.data(), output_size_);
}
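
// Computes, for every output unit o,
//   output[o] = act(kWeightsScale * (bias[o] + sum_i input[i] * w[i][o]))
// where |weights_| stores w row by row, so that the coefficient connecting
// input element i to output unit o is weights_[i * output_size_ + o].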
void FullyConnectedLayer::ComputeOutput(rtc::ArrayView<const float> input) {
  // TODO(bugs.chromium.org/9076): Optimize using SSE/AVX fused multiply-add
  // operations.
  for (size_t o = 0; o < output_size_; ++o) {
    output_[o] = bias_[o];
    // TODO(bugs.chromium.org/9076): Benchmark how different layouts for
    // |weights_| change the performance across different platforms.
    for (size_t i = 0; i < input_size_; ++i) {
      output_[o] += input[i] * weights_[i * output_size_ + o];
    }
    output_[o] = (*activation_function_)(kWeightsScale * output_[o]);
  }
}

GatedRecurrentLayer::GatedRecurrentLayer(
    const size_t input_size,
    const size_t output_size,
    const rtc::ArrayView<const int8_t> bias,
    const rtc::ArrayView<const int8_t> weights,
    const rtc::ArrayView<const int8_t> recurrent_weights,
    float (*const activation_function)(float))
    : input_size_(input_size),
      output_size_(output_size),
      bias_(bias),
      weights_(weights),
      recurrent_weights_(recurrent_weights),
      activation_function_(activation_function) {
  RTC_DCHECK_LE(output_size_, kRecurrentLayersMaxUnits)
      << "Static over-allocation of recurrent layers state vectors is not "
      << "sufficient.";
  RTC_DCHECK_EQ(3 * output_size_, bias_.size())
      << "Mismatching output size and bias terms array size.";
  RTC_DCHECK_EQ(3 * input_size_ * output_size_, weights_.size())
      << "Mismatching input-output size and weight coefficients array size.";
  // The recurrent weights connect the state (size |output_size_|) to the
  // three gates, hence the expected size does not depend on |input_size_|.
  RTC_DCHECK_EQ(3 * output_size_ * output_size_, recurrent_weights_.size())
      << "Mismatching output size and recurrent weight coefficients array"
      << " size.";
  Reset();
}

GatedRecurrentLayer::~GatedRecurrentLayer() = default;

rtc::ArrayView<const float> GatedRecurrentLayer::GetOutput() const {
  return rtc::ArrayView<const float>(state_.data(), output_size_);
}

void GatedRecurrentLayer::Reset() {
  state_.fill(0.f);
}
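
// GRU step computed below, for input vector x and previous state h; s is
// kWeightsScale, ".*" denotes the element-wise product and act is the
// activation function passed to the constructor:
//   u  = sigmoid(s * (Wu x + Ru h + bu))     // update gate
//   r  = sigmoid(s * (Wr x + Rr h + br))     // reset gate
//   c  = act(s * (Wc x + Rc (r .* h) + bc))  // candidate state
//   h' = u .* h + (1 - u) .* c               // new state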
void GatedRecurrentLayer::ComputeOutput(rtc::ArrayView<const float> input) {
  // TODO(bugs.chromium.org/9076): Optimize using SSE/AVX fused multiply-add
  // operations.
  // Stride and offset used to read parameter arrays.
  const size_t stride = 3 * output_size_;
  size_t offset = 0;
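
  // Parameter layout: |bias_|, |weights_| and |recurrent_weights_| pack the
  // update, reset and candidate ("output") gate parameters side by side, so
  // the per-gate blocks start at offsets 0, output_size_ and
  // 2 * output_size_, and each input/state element spans |stride| entries.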

  // Compute update gates.
  std::array<float, kRecurrentLayersMaxUnits> update;
  for (size_t o = 0; o < output_size_; ++o) {
    update[o] = bias_[o];
    // TODO(bugs.chromium.org/9076): Benchmark how different layouts for
    // |weights_| and |recurrent_weights_| change the performance across
    // different platforms.
    for (size_t i = 0; i < input_size_; ++i) {  // Add input.
      update[o] += input[i] * weights_[i * stride + o];
    }
    for (size_t s = 0; s < output_size_; ++s) {  // Add state.
      update[o] += state_[s] * recurrent_weights_[s * stride + o];
    }
    update[o] = SigmoidApproximated(kWeightsScale * update[o]);
  }

  // Compute reset gates.
  offset += output_size_;
  std::array<float, kRecurrentLayersMaxUnits> reset;
  for (size_t o = 0; o < output_size_; ++o) {
    reset[o] = bias_[offset + o];
    for (size_t i = 0; i < input_size_; ++i) {  // Add input.
      reset[o] += input[i] * weights_[offset + i * stride + o];
    }
    for (size_t s = 0; s < output_size_; ++s) {  // Add state.
      reset[o] += state_[s] * recurrent_weights_[offset + s * stride + o];
    }
    reset[o] = SigmoidApproximated(kWeightsScale * reset[o]);
  }

  // Compute output.
  offset += output_size_;
  std::array<float, kRecurrentLayersMaxUnits> output;
  for (size_t o = 0; o < output_size_; ++o) {
    output[o] = bias_[offset + o];
    for (size_t i = 0; i < input_size_; ++i) {  // Add input.
      output[o] += input[i] * weights_[offset + i * stride + o];
    }
    for (size_t s = 0; s < output_size_;
         ++s) {  // Add state through reset gates.
      output[o] +=
          state_[s] * recurrent_weights_[offset + s * stride + o] * reset[s];
    }
    output[o] = (*activation_function_)(kWeightsScale * output[o]);
    // Update output through the update gates.
    output[o] = update[o] * state_[o] + (1.f - update[o]) * output[o];
  }

  // Update the state. Not done in the previous loop since that would pollute
  // the current state and lead to incorrect output values.
  std::copy(output.begin(), output.end(), state_.begin());
}
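
// The VAD network: a fully connected input layer with (approximated) tanh
// activation, a GRU hidden layer with ReLU activation and a fully connected
// output layer with (approximated) sigmoid activation, whose first element
// is read as the speech probability in ComputeVadProbability().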
RnnBasedVad::RnnBasedVad()
    : input_layer_(kInputLayerInputSize,
                   kInputLayerOutputSize,
                   kInputDenseBias,
                   kInputDenseWeights,
                   TansigApproximated),
      hidden_layer_(kInputLayerOutputSize,
                    kHiddenLayerOutputSize,
                    kHiddenGruBias,
                    kHiddenGruWeights,
                    kHiddenGruRecurrentWeights,
                    RectifiedLinearUnit),
      output_layer_(kHiddenLayerOutputSize,
                    kOutputLayerOutputSize,
                    kOutputDenseBias,
                    kOutputDenseWeights,
                    SigmoidApproximated) {
  // Input-output chaining size checks.
  RTC_DCHECK_EQ(input_layer_.output_size(), hidden_layer_.input_size())
      << "The input and the hidden layers sizes do not match.";
  RTC_DCHECK_EQ(hidden_layer_.output_size(), output_layer_.input_size())
      << "The hidden and the output layers sizes do not match.";
}

RnnBasedVad::~RnnBasedVad() = default;

void RnnBasedVad::Reset() {
  hidden_layer_.Reset();
}
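
// Returns the probability of speech for the given feature vector. When the
// caller flags the frame as silence, the GRU state is reset so that stale
// temporal context cannot leak into later frames, and 0 is returned without
// running the network.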
float RnnBasedVad::ComputeVadProbability(
    rtc::ArrayView<const float, kFeatureVectorSize> feature_vector,
    bool is_silence) {
  if (is_silence) {
    Reset();
    return 0.f;
  }
  input_layer_.ComputeOutput(feature_vector);
  hidden_layer_.ComputeOutput(input_layer_.GetOutput());
  output_layer_.ComputeOutput(hidden_layer_.GetOutput());
  const auto vad_output = output_layer_.GetOutput();
  return vad_output[0];
}
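
// Usage sketch (illustrative only: feature extraction lives outside this
// file, and |GetNextFrame| and |ExtractFeatures| are hypothetical
// placeholders for the caller's frame source and feature extractor):
//   RnnBasedVad vad;
//   std::array<float, kFeatureVectorSize> features;
//   while (GetNextFrame(&frame)) {
//     const bool is_silence = ExtractFeatures(frame, features);
//     const float p = vad.ComputeVadProbability(features, is_silence);
//   }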

}  // namespace rnn_vad
}  // namespace webrtc