793 lines
31 KiB
C
793 lines
31 KiB
C
|
/* ----------------------------------------------------------------- */
|
||
|
/* The HMM-Based Speech Synthesis Engine "hts_engine API" */
|
||
|
/* developed by HTS Working Group */
|
||
|
/* http://hts-engine.sourceforge.net/ */
|
||
|
/* ----------------------------------------------------------------- */
|
||
|
/* */
|
||
|
/* Copyright (c) 2001-2015 Nagoya Institute of Technology */
|
||
|
/* Department of Computer Science */
|
||
|
/* */
|
||
|
/* 2001-2008 Tokyo Institute of Technology */
|
||
|
/* Interdisciplinary Graduate School of */
|
||
|
/* Science and Engineering */
|
||
|
/* */
|
||
|
/* All rights reserved. */
|
||
|
/* */
|
||
|
/* Redistribution and use in source and binary forms, with or */
|
||
|
/* without modification, are permitted provided that the following */
|
||
|
/* conditions are met: */
|
||
|
/* */
|
||
|
/* - Redistributions of source code must retain the above copyright */
|
||
|
/* notice, this list of conditions and the following disclaimer. */
|
||
|
/* - Redistributions in binary form must reproduce the above */
|
||
|
/* copyright notice, this list of conditions and the following */
|
||
|
/* disclaimer in the documentation and/or other materials provided */
|
||
|
/* with the distribution. */
|
||
|
/* - Neither the name of the HTS working group nor the names of its */
|
||
|
/* contributors may be used to endorse or promote products derived */
|
||
|
/* from this software without specific prior written permission. */
|
||
|
/* */
|
||
|
/* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND */
|
||
|
/* CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
||
|
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
||
|
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
||
|
/* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS */
|
||
|
/* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, */
|
||
|
/* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED */
|
||
|
/* TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, */
|
||
|
/* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON */
|
||
|
/* ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, */
|
||
|
/* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY */
|
||
|
/* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
||
|
/* POSSIBILITY OF SUCH DAMAGE. */
|
||
|
/* ----------------------------------------------------------------- */
|
||
|
|
||
|
#ifndef HTS_ENGINE_C
|
||
|
#define HTS_ENGINE_C
|
||
|
|
||
|
/* Linkage wrappers: expand to an extern "C" block when this file is
   compiled as C++, and to nothing when it is compiled as C. */
#ifdef __cplusplus
#define HTS_ENGINE_C_START extern "C" {
#define HTS_ENGINE_C_END   }
#else
#define HTS_ENGINE_C_START
#define HTS_ENGINE_C_END
#endif                          /* __cplusplus */
|
||
|
|
||
|
HTS_ENGINE_C_START;
|
||
|
|
||
|
#include <stdlib.h> /* for atof() */
|
||
|
#include <string.h> /* for strcpy() */
|
||
|
#include <math.h> /* for pow() */
|
||
|
|
||
|
/* hts_engine libraries */
|
||
|
#include "HTS_hidden.h"
|
||
|
|
||
|
/* HTS_Engine_initialize: initialize engine */
|
||
|
void HTS_Engine_initialize(HTS_Engine * engine)
|
||
|
{
|
||
|
/* global */
|
||
|
engine->condition.sampling_frequency = 0;
|
||
|
engine->condition.fperiod = 0;
|
||
|
engine->condition.audio_buff_size = 0;
|
||
|
engine->condition.stop = FALSE;
|
||
|
engine->condition.volume = 1.0;
|
||
|
engine->condition.msd_threshold = NULL;
|
||
|
engine->condition.gv_weight = NULL;
|
||
|
|
||
|
/* duration */
|
||
|
engine->condition.speed = 1.0;
|
||
|
engine->condition.phoneme_alignment_flag = FALSE;
|
||
|
|
||
|
/* spectrum */
|
||
|
engine->condition.stage = 0;
|
||
|
engine->condition.use_log_gain = FALSE;
|
||
|
engine->condition.alpha = 0.0;
|
||
|
engine->condition.beta = 0.0;
|
||
|
|
||
|
/* log F0 */
|
||
|
engine->condition.additional_half_tone = 0.0;
|
||
|
|
||
|
/* interpolation weights */
|
||
|
engine->condition.duration_iw = NULL;
|
||
|
engine->condition.parameter_iw = NULL;
|
||
|
engine->condition.gv_iw = NULL;
|
||
|
|
||
|
/* initialize audio */
|
||
|
HTS_Audio_initialize(&engine->audio);
|
||
|
/* initialize model set */
|
||
|
HTS_ModelSet_initialize(&engine->ms);
|
||
|
/* initialize label list */
|
||
|
HTS_Label_initialize(&engine->label);
|
||
|
/* initialize state sequence set */
|
||
|
HTS_SStreamSet_initialize(&engine->sss);
|
||
|
/* initialize pstream set */
|
||
|
HTS_PStreamSet_initialize(&engine->pss);
|
||
|
/* initialize gstream set */
|
||
|
HTS_GStreamSet_initialize(&engine->gss);
|
||
|
}
|
||
|
|
||
|
/* HTS_Engine_load: load HTS voices */
|
||
|
HTS_Boolean HTS_Engine_load(HTS_Engine * engine, char **voices, size_t num_voices)
|
||
|
{
|
||
|
size_t i, j;
|
||
|
size_t nstream;
|
||
|
double average_weight;
|
||
|
const char *option, *find;
|
||
|
|
||
|
/* reset engine */
|
||
|
HTS_Engine_clear(engine);
|
||
|
|
||
|
/* load voices */
|
||
|
if (HTS_ModelSet_load(&engine->ms, voices, num_voices) != TRUE) {
|
||
|
HTS_Engine_clear(engine);
|
||
|
return FALSE;
|
||
|
}
|
||
|
nstream = HTS_ModelSet_get_nstream(&engine->ms);
|
||
|
average_weight = 1.0 / num_voices;
|
||
|
|
||
|
/* global */
|
||
|
engine->condition.sampling_frequency = HTS_ModelSet_get_sampling_frequency(&engine->ms);
|
||
|
engine->condition.fperiod = HTS_ModelSet_get_fperiod(&engine->ms);
|
||
|
engine->condition.msd_threshold = (double *) HTS_calloc(nstream, sizeof(double));
|
||
|
for (i = 0; i < nstream; i++)
|
||
|
engine->condition.msd_threshold[i] = 0.5;
|
||
|
engine->condition.gv_weight = (double *) HTS_calloc(nstream, sizeof(double));
|
||
|
for (i = 0; i < nstream; i++)
|
||
|
engine->condition.gv_weight[i] = 1.0;
|
||
|
|
||
|
/* spectrum */
|
||
|
option = HTS_ModelSet_get_option(&engine->ms, 0);
|
||
|
find = strstr(option, "GAMMA=");
|
||
|
if (find != NULL)
|
||
|
engine->condition.stage = (size_t) atoi(&find[strlen("GAMMA=")]);
|
||
|
find = strstr(option, "LN_GAIN=");
|
||
|
if (find != NULL)
|
||
|
engine->condition.use_log_gain = atoi(&find[strlen("LN_GAIN=")]) == 1 ? TRUE : FALSE;
|
||
|
find = strstr(option, "ALPHA=");
|
||
|
if (find != NULL)
|
||
|
engine->condition.alpha = atof(&find[strlen("ALPHA=")]);
|
||
|
|
||
|
/* interpolation weights */
|
||
|
engine->condition.duration_iw = (double *) HTS_calloc(num_voices, sizeof(double));
|
||
|
for (i = 0; i < num_voices; i++)
|
||
|
engine->condition.duration_iw[i] = average_weight;
|
||
|
engine->condition.parameter_iw = (double **) HTS_calloc(num_voices, sizeof(double *));
|
||
|
for (i = 0; i < num_voices; i++) {
|
||
|
engine->condition.parameter_iw[i] = (double *) HTS_calloc(nstream, sizeof(double));
|
||
|
for (j = 0; j < nstream; j++)
|
||
|
engine->condition.parameter_iw[i][j] = average_weight;
|
||
|
}
|
||
|
engine->condition.gv_iw = (double **) HTS_calloc(num_voices, sizeof(double *));
|
||
|
for (i = 0; i < num_voices; i++) {
|
||
|
engine->condition.gv_iw[i] = (double *) HTS_calloc(nstream, sizeof(double));
|
||
|
for (j = 0; j < nstream; j++)
|
||
|
engine->condition.gv_iw[i][j] = average_weight;
|
||
|
}
|
||
|
|
||
|
return TRUE;
|
||
|
}
|
||
|
|
||
|
/* HTS_Engine_set_sampling_frequency: set sampling frequency */
|
||
|
void HTS_Engine_set_sampling_frequency(HTS_Engine * engine, size_t i)
|
||
|
{
|
||
|
if (i < 1)
|
||
|
i = 1;
|
||
|
engine->condition.sampling_frequency = i;
|
||
|
HTS_Audio_set_parameter(&engine->audio, engine->condition.sampling_frequency, engine->condition.audio_buff_size);
|
||
|
}
|
||
|
|
||
|
/* HTS_Engine_get_sampling_frequency: get sampling frequency */
|
||
|
size_t HTS_Engine_get_sampling_frequency(HTS_Engine * engine)
|
||
|
{
|
||
|
return engine->condition.sampling_frequency;
|
||
|
}
|
||
|
|
||
|
/* HTS_Engine_set_fperiod: set frame period */
|
||
|
void HTS_Engine_set_fperiod(HTS_Engine * engine, size_t i)
|
||
|
{
|
||
|
if (i < 1)
|
||
|
i = 1;
|
||
|
engine->condition.fperiod = i;
|
||
|
}
|
||
|
|
||
|
/* HTS_Engine_get_fperiod: get frame period */
|
||
|
size_t HTS_Engine_get_fperiod(HTS_Engine * engine)
|
||
|
{
|
||
|
return engine->condition.fperiod;
|
||
|
}
|
||
|
|
||
|
/* HTS_Engine_set_audio_buff_size: set audio buffer size */
|
||
|
void HTS_Engine_set_audio_buff_size(HTS_Engine * engine, size_t i)
|
||
|
{
|
||
|
engine->condition.audio_buff_size = i;
|
||
|
HTS_Audio_set_parameter(&engine->audio, engine->condition.sampling_frequency, engine->condition.audio_buff_size);
|
||
|
}
|
||
|
|
||
|
/* HTS_Engine_get_audio_buff_size: get audio buffer size */
|
||
|
size_t HTS_Engine_get_audio_buff_size(HTS_Engine * engine)
|
||
|
{
|
||
|
return engine->condition.audio_buff_size;
|
||
|
}
|
||
|
|
||
|
/* HTS_Engine_set_stop_flag: set stop flag */
|
||
|
void HTS_Engine_set_stop_flag(HTS_Engine * engine, HTS_Boolean b)
|
||
|
{
|
||
|
engine->condition.stop = b;
|
||
|
}
|
||
|
|
||
|
/* HTS_Engine_get_stop_flag: get stop flag */
|
||
|
HTS_Boolean HTS_Engine_get_stop_flag(HTS_Engine * engine)
|
||
|
{
|
||
|
return engine->condition.stop;
|
||
|
}
|
||
|
|
||
|
/* HTS_Engine_set_volume: set volume in db */
|
||
|
void HTS_Engine_set_volume(HTS_Engine * engine, double f)
|
||
|
{
|
||
|
engine->condition.volume = exp(f * DB);
|
||
|
}
|
||
|
|
||
|
/* HTS_Engine_get_volume: get volume in db */
|
||
|
double HTS_Engine_get_volume(HTS_Engine * engine)
|
||
|
{
|
||
|
return log(engine->condition.volume) / DB;
|
||
|
}
|
||
|
|
||
|
/* HTS_Egnine_set_msd_threshold: set MSD threshold */
|
||
|
void HTS_Engine_set_msd_threshold(HTS_Engine * engine, size_t stream_index, double f)
|
||
|
{
|
||
|
if (f < 0.0)
|
||
|
f = 0.0;
|
||
|
if (f > 1.0)
|
||
|
f = 1.0;
|
||
|
engine->condition.msd_threshold[stream_index] = f;
|
||
|
}
|
||
|
|
||
|
/* HTS_Engine_get_msd_threshold: get MSD threshold */
|
||
|
double HTS_Engine_get_msd_threshold(HTS_Engine * engine, size_t stream_index)
|
||
|
{
|
||
|
return engine->condition.msd_threshold[stream_index];
|
||
|
}
|
||
|
|
||
|
/* HTS_Engine_set_gv_weight: set GV weight */
|
||
|
void HTS_Engine_set_gv_weight(HTS_Engine * engine, size_t stream_index, double f)
|
||
|
{
|
||
|
if (f < 0.0)
|
||
|
f = 0.0;
|
||
|
engine->condition.gv_weight[stream_index] = f;
|
||
|
}
|
||
|
|
||
|
/* HTS_Engine_get_gv_weight: get GV weight */
|
||
|
double HTS_Engine_get_gv_weight(HTS_Engine * engine, size_t stream_index)
|
||
|
{
|
||
|
return engine->condition.gv_weight[stream_index];
|
||
|
}
|
||
|
|
||
|
/* HTS_Engine_set_speed: set speech speed */
|
||
|
void HTS_Engine_set_speed(HTS_Engine * engine, double f)
|
||
|
{
|
||
|
if (f < 1.0E-06)
|
||
|
f = 1.0E-06;
|
||
|
engine->condition.speed = f;
|
||
|
}
|
||
|
|
||
|
/* HTS_Engine_set_phoneme_alignment_flag: set flag for using phoneme alignment in label */
|
||
|
void HTS_Engine_set_phoneme_alignment_flag(HTS_Engine * engine, HTS_Boolean b)
|
||
|
{
|
||
|
engine->condition.phoneme_alignment_flag = b;
|
||
|
}
|
||
|
|
||
|
/* HTS_Engine_set_alpha: set alpha */
|
||
|
void HTS_Engine_set_alpha(HTS_Engine * engine, double f)
|
||
|
{
|
||
|
if (f < 0.0)
|
||
|
f = 0.0;
|
||
|
if (f > 1.0)
|
||
|
f = 1.0;
|
||
|
engine->condition.alpha = f;
|
||
|
}
|
||
|
|
||
|
/* HTS_Engine_get_alpha: get alpha */
|
||
|
double HTS_Engine_get_alpha(HTS_Engine * engine)
|
||
|
{
|
||
|
return engine->condition.alpha;
|
||
|
}
|
||
|
|
||
|
/* HTS_Engine_set_beta: set beta */
|
||
|
void HTS_Engine_set_beta(HTS_Engine * engine, double f)
|
||
|
{
|
||
|
if (f < 0.0)
|
||
|
f = 0.0;
|
||
|
if (f > 1.0)
|
||
|
f = 1.0;
|
||
|
engine->condition.beta = f;
|
||
|
}
|
||
|
|
||
|
/* HTS_Engine_get_beta: get beta */
|
||
|
double HTS_Engine_get_beta(HTS_Engine * engine)
|
||
|
{
|
||
|
return engine->condition.beta;
|
||
|
}
|
||
|
|
||
|
/* HTS_Engine_add_half_tone: add half tone */
|
||
|
void HTS_Engine_add_half_tone(HTS_Engine * engine, double f)
|
||
|
{
|
||
|
engine->condition.additional_half_tone = f;
|
||
|
}
|
||
|
|
||
|
/* HTS_Engine_set_duration_interpolation_weight: set interpolation weight for duration */
|
||
|
void HTS_Engine_set_duration_interpolation_weight(HTS_Engine * engine, size_t voice_index, double f)
|
||
|
{
|
||
|
engine->condition.duration_iw[voice_index] = f;
|
||
|
}
|
||
|
|
||
|
/* HTS_Engine_get_duration_interpolation_weight: get interpolation weight for duration */
|
||
|
double HTS_Engine_get_duration_interpolation_weight(HTS_Engine * engine, size_t voice_index)
|
||
|
{
|
||
|
return engine->condition.duration_iw[voice_index];
|
||
|
}
|
||
|
|
||
|
/* HTS_Engine_set_parameter_interpolation_weight: set interpolation weight for parameter */
|
||
|
void HTS_Engine_set_parameter_interpolation_weight(HTS_Engine * engine, size_t voice_index, size_t stream_index, double f)
|
||
|
{
|
||
|
engine->condition.parameter_iw[voice_index][stream_index] = f;
|
||
|
}
|
||
|
|
||
|
/* HTS_Engine_get_parameter_interpolation_weight: get interpolation weight for parameter */
|
||
|
double HTS_Engine_get_parameter_interpolation_weight(HTS_Engine * engine, size_t voice_index, size_t stream_index)
|
||
|
{
|
||
|
return engine->condition.parameter_iw[voice_index][stream_index];
|
||
|
}
|
||
|
|
||
|
/* HTS_Engine_set_gv_interpolation_weight: set interpolation weight for GV */
|
||
|
void HTS_Engine_set_gv_interpolation_weight(HTS_Engine * engine, size_t voice_index, size_t stream_index, double f)
|
||
|
{
|
||
|
engine->condition.gv_iw[voice_index][stream_index] = f;
|
||
|
}
|
||
|
|
||
|
/* HTS_Engine_get_gv_interpolation_weight: get interpolation weight for GV */
|
||
|
double HTS_Engine_get_gv_interpolation_weight(HTS_Engine * engine, size_t voice_index, size_t stream_index)
|
||
|
{
|
||
|
return engine->condition.gv_iw[voice_index][stream_index];
|
||
|
}
|
||
|
|
||
|
/* HTS_Engine_get_total_state: get total number of state */
|
||
|
size_t HTS_Engine_get_total_state(HTS_Engine * engine)
|
||
|
{
|
||
|
return HTS_SStreamSet_get_total_state(&engine->sss);
|
||
|
}
|
||
|
|
||
|
/* HTS_Engine_set_state_mean: set mean value of state */
|
||
|
void HTS_Engine_set_state_mean(HTS_Engine * engine, size_t stream_index, size_t state_index, size_t vector_index, double f)
|
||
|
{
|
||
|
HTS_SStreamSet_set_mean(&engine->sss, stream_index, state_index, vector_index, f);
|
||
|
}
|
||
|
|
||
|
/* HTS_Engine_get_state_mean: get mean value of state */
|
||
|
double HTS_Engine_get_state_mean(HTS_Engine * engine, size_t stream_index, size_t state_index, size_t vector_index)
|
||
|
{
|
||
|
return HTS_SStreamSet_get_mean(&engine->sss, stream_index, state_index, vector_index);
|
||
|
}
|
||
|
|
||
|
/* HTS_Engine_get_state_duration: get state duration */
|
||
|
size_t HTS_Engine_get_state_duration(HTS_Engine * engine, size_t state_index)
|
||
|
{
|
||
|
return HTS_SStreamSet_get_duration(&engine->sss, state_index);
|
||
|
}
|
||
|
|
||
|
/* HTS_Engine_get_nvoices: get number of voices */
|
||
|
size_t HTS_Engine_get_nvoices(HTS_Engine * engine)
|
||
|
{
|
||
|
return HTS_ModelSet_get_nvoices(&engine->ms);
|
||
|
}
|
||
|
|
||
|
/* HTS_Engine_get_nstream: get number of stream */
|
||
|
size_t HTS_Engine_get_nstream(HTS_Engine * engine)
|
||
|
{
|
||
|
return HTS_ModelSet_get_nstream(&engine->ms);
|
||
|
}
|
||
|
|
||
|
/* HTS_Engine_get_nstate: get number of state */
|
||
|
size_t HTS_Engine_get_nstate(HTS_Engine * engine)
|
||
|
{
|
||
|
return HTS_ModelSet_get_nstate(&engine->ms);
|
||
|
}
|
||
|
|
||
|
/* HTS_Engine_get_fullcontext_label_format: get full context label format */
|
||
|
const char *HTS_Engine_get_fullcontext_label_format(HTS_Engine * engine)
|
||
|
{
|
||
|
return HTS_ModelSet_get_fullcontext_label_format(&engine->ms);
|
||
|
}
|
||
|
|
||
|
/* HTS_Engine_get_fullcontext_label_version: get full context label version */
|
||
|
const char *HTS_Engine_get_fullcontext_label_version(HTS_Engine * engine)
|
||
|
{
|
||
|
return HTS_ModelSet_get_fullcontext_label_version(&engine->ms);
|
||
|
}
|
||
|
|
||
|
/* HTS_Engine_get_total_frame: get total number of frame */
|
||
|
size_t HTS_Engine_get_total_frame(HTS_Engine * engine)
|
||
|
{
|
||
|
return HTS_GStreamSet_get_total_frame(&engine->gss);
|
||
|
}
|
||
|
|
||
|
/* HTS_Engine_get_nsamples: get number of samples */
|
||
|
size_t HTS_Engine_get_nsamples(HTS_Engine * engine)
|
||
|
{
|
||
|
return HTS_GStreamSet_get_total_nsamples(&engine->gss);
|
||
|
}
|
||
|
|
||
|
/* HTS_Engine_get_generated_parameter: output generated parameter */
|
||
|
double HTS_Engine_get_generated_parameter(HTS_Engine * engine, size_t stream_index, size_t frame_index, size_t vector_index)
|
||
|
{
|
||
|
return HTS_GStreamSet_get_parameter(&engine->gss, stream_index, frame_index, vector_index);
|
||
|
}
|
||
|
|
||
|
/* HTS_Engine_get_generated_speech: output generated speech */
|
||
|
double HTS_Engine_get_generated_speech(HTS_Engine * engine, size_t index)
|
||
|
{
|
||
|
return HTS_GStreamSet_get_speech(&engine->gss, index);
|
||
|
}
|
||
|
|
||
|
/* HTS_Engine_generate_state_sequence: genereate state sequence (1st synthesis step) */
|
||
|
static HTS_Boolean HTS_Engine_generate_state_sequence(HTS_Engine * engine)
|
||
|
{
|
||
|
size_t i, state_index, model_index;
|
||
|
double f;
|
||
|
|
||
|
if (HTS_SStreamSet_create(&engine->sss, &engine->ms, &engine->label, engine->condition.phoneme_alignment_flag, engine->condition.speed, engine->condition.duration_iw, engine->condition.parameter_iw, engine->condition.gv_iw) != TRUE) {
|
||
|
HTS_Engine_refresh(engine);
|
||
|
return FALSE;
|
||
|
}
|
||
|
if (engine->condition.additional_half_tone != 0.0) {
|
||
|
state_index = 0;
|
||
|
model_index = 0;
|
||
|
for (i = 0; i < HTS_Engine_get_total_state(engine); i++) {
|
||
|
f = HTS_Engine_get_state_mean(engine, 1, i, 0);
|
||
|
f += engine->condition.additional_half_tone * HALF_TONE;
|
||
|
if (f < MIN_LF0)
|
||
|
f = MIN_LF0;
|
||
|
else if (f > MAX_LF0)
|
||
|
f = MAX_LF0;
|
||
|
HTS_Engine_set_state_mean(engine, 1, i, 0, f);
|
||
|
state_index++;
|
||
|
if (state_index >= HTS_Engine_get_nstate(engine)) {
|
||
|
state_index = 0;
|
||
|
model_index++;
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
return TRUE;
|
||
|
}
|
||
|
|
||
|
/* HTS_Engine_generate_state_sequence_from_fn: genereate state sequence from file name (1st synthesis step) */
|
||
|
HTS_Boolean HTS_Engine_generate_state_sequence_from_fn(HTS_Engine * engine, const char *fn)
|
||
|
{
|
||
|
HTS_Engine_refresh(engine);
|
||
|
HTS_Label_load_from_fn(&engine->label, engine->condition.sampling_frequency, engine->condition.fperiod, fn);
|
||
|
return HTS_Engine_generate_state_sequence(engine);
|
||
|
}
|
||
|
|
||
|
/* HTS_Engine_generate_state_sequence_from_strings: generate state sequence from strings (1st synthesis step) */
|
||
|
HTS_Boolean HTS_Engine_generate_state_sequence_from_strings(HTS_Engine * engine, char **lines, size_t num_lines)
|
||
|
{
|
||
|
HTS_Engine_refresh(engine);
|
||
|
HTS_Label_load_from_strings(&engine->label, engine->condition.sampling_frequency, engine->condition.fperiod, lines, num_lines);
|
||
|
return HTS_Engine_generate_state_sequence(engine);
|
||
|
}
|
||
|
|
||
|
/* HTS_Engine_generate_parameter_sequence: generate parameter sequence (2nd synthesis step) */
|
||
|
HTS_Boolean HTS_Engine_generate_parameter_sequence(HTS_Engine * engine)
|
||
|
{
|
||
|
return HTS_PStreamSet_create(&engine->pss, &engine->sss, engine->condition.msd_threshold, engine->condition.gv_weight);
|
||
|
}
|
||
|
|
||
|
/* HTS_Engine_generate_sample_sequence: generate sample sequence (3rd synthesis step) */
|
||
|
HTS_Boolean HTS_Engine_generate_sample_sequence(HTS_Engine * engine)
|
||
|
{
|
||
|
return HTS_GStreamSet_create(&engine->gss, &engine->pss, engine->condition.stage, engine->condition.use_log_gain, engine->condition.sampling_frequency, engine->condition.fperiod, engine->condition.alpha, engine->condition.beta, &engine->condition.stop, engine->condition.volume, engine->condition.audio_buff_size > 0 ? &engine->audio : NULL);
|
||
|
}
|
||
|
|
||
|
/* HTS_Engine_synthesize: synthesize speech */
|
||
|
static HTS_Boolean HTS_Engine_synthesize(HTS_Engine * engine)
|
||
|
{
|
||
|
if (HTS_Engine_generate_state_sequence(engine) != TRUE) {
|
||
|
HTS_Engine_refresh(engine);
|
||
|
return FALSE;
|
||
|
}
|
||
|
if (HTS_Engine_generate_parameter_sequence(engine) != TRUE) {
|
||
|
HTS_Engine_refresh(engine);
|
||
|
return FALSE;
|
||
|
}
|
||
|
if (HTS_Engine_generate_sample_sequence(engine) != TRUE) {
|
||
|
HTS_Engine_refresh(engine);
|
||
|
return FALSE;
|
||
|
}
|
||
|
return TRUE;
|
||
|
}
|
||
|
|
||
|
/* HTS_Engine_synthesize_from_fn: synthesize speech from file name */
|
||
|
HTS_Boolean HTS_Engine_synthesize_from_fn(HTS_Engine * engine, const char *fn)
|
||
|
{
|
||
|
HTS_Engine_refresh(engine);
|
||
|
HTS_Label_load_from_fn(&engine->label, engine->condition.sampling_frequency, engine->condition.fperiod, fn);
|
||
|
return HTS_Engine_synthesize(engine);
|
||
|
}
|
||
|
|
||
|
/* HTS_Engine_synthesize_from_strings: synthesize speech from strings */
|
||
|
HTS_Boolean HTS_Engine_synthesize_from_strings(HTS_Engine * engine, char **lines, size_t num_lines)
|
||
|
{
|
||
|
HTS_Engine_refresh(engine);
|
||
|
HTS_Label_load_from_strings(&engine->label, engine->condition.sampling_frequency, engine->condition.fperiod, lines, num_lines);
|
||
|
return HTS_Engine_synthesize(engine);
|
||
|
}
|
||
|
|
||
|
/* HTS_Engine_save_information: save trace information */
|
||
|
void HTS_Engine_save_information(HTS_Engine * engine, FILE * fp)
|
||
|
{
|
||
|
size_t i, j, k, l, m, n;
|
||
|
double temp;
|
||
|
HTS_Condition *condition = &engine->condition;
|
||
|
HTS_ModelSet *ms = &engine->ms;
|
||
|
HTS_Label *label = &engine->label;
|
||
|
HTS_SStreamSet *sss = &engine->sss;
|
||
|
HTS_PStreamSet *pss = &engine->pss;
|
||
|
|
||
|
/* global parameter */
|
||
|
fprintf(fp, "[Global parameter]\n");
|
||
|
fprintf(fp, "Sampring frequency -> %8lu(Hz)\n", (unsigned long) condition->sampling_frequency);
|
||
|
fprintf(fp, "Frame period -> %8lu(point)\n", (unsigned long) condition->fperiod);
|
||
|
fprintf(fp, " %8.5f(msec)\n", 1e+3 * condition->fperiod / condition->sampling_frequency);
|
||
|
fprintf(fp, "All-pass constant -> %8.5f\n", (float) condition->alpha);
|
||
|
fprintf(fp, "Gamma -> %8.5f\n", (float) (condition->stage == 0 ? 0.0 : -1.0 / condition->stage));
|
||
|
if (condition->stage != 0) {
|
||
|
if (condition->use_log_gain == TRUE)
|
||
|
fprintf(fp, "Log gain flag -> TRUE\n");
|
||
|
else
|
||
|
fprintf(fp, "Log gain flag -> FALSE\n");
|
||
|
}
|
||
|
fprintf(fp, "Postfiltering coefficient -> %8.5f\n", (float) condition->beta);
|
||
|
fprintf(fp, "Audio buffer size -> %8lu(sample)\n", (unsigned long) condition->audio_buff_size);
|
||
|
fprintf(fp, "\n");
|
||
|
|
||
|
/* duration parameter */
|
||
|
fprintf(fp, "[Duration parameter]\n");
|
||
|
fprintf(fp, "Number of states -> %8lu\n", (unsigned long) HTS_ModelSet_get_nstate(ms));
|
||
|
fprintf(fp, " Interpolation size -> %8lu\n", (unsigned long) HTS_ModelSet_get_nvoices(ms));
|
||
|
/* check interpolation */
|
||
|
for (i = 0, temp = 0.0; i < HTS_ModelSet_get_nvoices(ms); i++)
|
||
|
temp += condition->duration_iw[i];
|
||
|
for (i = 0; i < HTS_ModelSet_get_nvoices(ms); i++)
|
||
|
if (condition->duration_iw[i] != 0.0)
|
||
|
condition->duration_iw[i] /= temp;
|
||
|
for (i = 0; i < HTS_ModelSet_get_nvoices(ms); i++)
|
||
|
fprintf(fp, " Interpolation weight[%2lu] -> %8.0f(%%)\n", (unsigned long) i, (float) (100 * condition->duration_iw[i]));
|
||
|
fprintf(fp, "\n");
|
||
|
|
||
|
fprintf(fp, "[Stream parameter]\n");
|
||
|
for (i = 0; i < HTS_ModelSet_get_nstream(ms); i++) {
|
||
|
/* stream parameter */
|
||
|
fprintf(fp, "Stream[%2lu] vector length -> %8lu\n", (unsigned long) i, (unsigned long) HTS_ModelSet_get_vector_length(ms, i));
|
||
|
fprintf(fp, " Dynamic window size -> %8lu\n", (unsigned long) HTS_ModelSet_get_window_size(ms, i));
|
||
|
/* interpolation */
|
||
|
fprintf(fp, " Interpolation size -> %8lu\n", (unsigned long) HTS_ModelSet_get_nvoices(ms));
|
||
|
for (j = 0, temp = 0.0; j < HTS_ModelSet_get_nvoices(ms); j++)
|
||
|
temp += condition->parameter_iw[j][i];
|
||
|
for (j = 0; j < HTS_ModelSet_get_nvoices(ms); j++)
|
||
|
if (condition->parameter_iw[j][i] != 0.0)
|
||
|
condition->parameter_iw[j][i] /= temp;
|
||
|
for (j = 0; j < HTS_ModelSet_get_nvoices(ms); j++)
|
||
|
fprintf(fp, " Interpolation weight[%2lu] -> %8.0f(%%)\n", (unsigned long) j, (float) (100 * condition->parameter_iw[j][i]));
|
||
|
/* MSD */
|
||
|
if (HTS_ModelSet_is_msd(ms, i)) { /* for MSD */
|
||
|
fprintf(fp, " MSD flag -> TRUE\n");
|
||
|
fprintf(fp, " MSD threshold -> %8.5f\n", condition->msd_threshold[i]);
|
||
|
} else { /* for non MSD */
|
||
|
fprintf(fp, " MSD flag -> FALSE\n");
|
||
|
}
|
||
|
/* GV */
|
||
|
if (HTS_ModelSet_use_gv(ms, i)) {
|
||
|
fprintf(fp, " GV flag -> TRUE\n");
|
||
|
fprintf(fp, " GV weight -> %8.0f(%%)\n", (float) (100 * condition->gv_weight[i]));
|
||
|
fprintf(fp, " GV interpolation size -> %8lu\n", (unsigned long) HTS_ModelSet_get_nvoices(ms));
|
||
|
/* interpolation */
|
||
|
for (j = 0, temp = 0.0; j < HTS_ModelSet_get_nvoices(ms); j++)
|
||
|
temp += condition->gv_iw[j][i];
|
||
|
for (j = 0; j < HTS_ModelSet_get_nvoices(ms); j++)
|
||
|
if (condition->gv_iw[j][i] != 0.0)
|
||
|
condition->gv_iw[j][i] /= temp;
|
||
|
for (j = 0; j < HTS_ModelSet_get_nvoices(ms); j++)
|
||
|
fprintf(fp, " GV interpolation weight[%2lu] -> %8.0f(%%)\n", (unsigned long) j, (float) (100 * condition->gv_iw[j][i]));
|
||
|
} else {
|
||
|
fprintf(fp, " GV flag -> FALSE\n");
|
||
|
}
|
||
|
}
|
||
|
fprintf(fp, "\n");
|
||
|
|
||
|
/* generated sequence */
|
||
|
fprintf(fp, "[Generated sequence]\n");
|
||
|
fprintf(fp, "Number of HMMs -> %8lu\n", (unsigned long) HTS_Label_get_size(label));
|
||
|
fprintf(fp, "Number of stats -> %8lu\n", (unsigned long) HTS_Label_get_size(label) * HTS_ModelSet_get_nstate(ms));
|
||
|
fprintf(fp, "Length of this speech -> %8.3f(sec)\n", (float) ((double) HTS_PStreamSet_get_total_frame(pss) * condition->fperiod / condition->sampling_frequency));
|
||
|
fprintf(fp, " -> %8lu(frames)\n", (unsigned long) HTS_PStreamSet_get_total_frame(pss) * condition->fperiod);
|
||
|
|
||
|
for (i = 0; i < HTS_Label_get_size(label); i++) {
|
||
|
fprintf(fp, "HMM[%2lu]\n", (unsigned long) i);
|
||
|
fprintf(fp, " Name -> %s\n", HTS_Label_get_string(label, i));
|
||
|
fprintf(fp, " Duration\n");
|
||
|
for (j = 0; j < HTS_ModelSet_get_nvoices(ms); j++) {
|
||
|
fprintf(fp, " Interpolation[%2lu]\n", (unsigned long) j);
|
||
|
HTS_ModelSet_get_duration_index(ms, j, HTS_Label_get_string(label, i), &k, &l);
|
||
|
fprintf(fp, " Tree index -> %8lu\n", (unsigned long) k);
|
||
|
fprintf(fp, " PDF index -> %8lu\n", (unsigned long) l);
|
||
|
}
|
||
|
for (j = 0; j < HTS_ModelSet_get_nstate(ms); j++) {
|
||
|
fprintf(fp, " State[%2lu]\n", (unsigned long) j + 2);
|
||
|
fprintf(fp, " Length -> %8lu(frames)\n", (unsigned long) HTS_SStreamSet_get_duration(sss, i * HTS_ModelSet_get_nstate(ms) + j));
|
||
|
for (k = 0; k < HTS_ModelSet_get_nstream(ms); k++) {
|
||
|
fprintf(fp, " Stream[%2lu]\n", (unsigned long) k);
|
||
|
if (HTS_ModelSet_is_msd(ms, k)) {
|
||
|
if (HTS_SStreamSet_get_msd(sss, k, i * HTS_ModelSet_get_nstate(ms) + j) > condition->msd_threshold[k])
|
||
|
fprintf(fp, " MSD flag -> TRUE\n");
|
||
|
else
|
||
|
fprintf(fp, " MSD flag -> FALSE\n");
|
||
|
}
|
||
|
for (l = 0; l < HTS_ModelSet_get_nvoices(ms); l++) {
|
||
|
fprintf(fp, " Interpolation[%2lu]\n", (unsigned long) l);
|
||
|
HTS_ModelSet_get_parameter_index(ms, l, k, j + 2, HTS_Label_get_string(label, i), &m, &n);
|
||
|
fprintf(fp, " Tree index -> %8lu\n", (unsigned long) m);
|
||
|
fprintf(fp, " PDF index -> %8lu\n", (unsigned long) n);
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
|
||
|
/* HTS_Engine_save_label: save label with time */
|
||
|
void HTS_Engine_save_label(HTS_Engine * engine, FILE * fp)
|
||
|
{
|
||
|
size_t i, j;
|
||
|
size_t frame, state, duration;
|
||
|
|
||
|
HTS_Label *label = &engine->label;
|
||
|
HTS_SStreamSet *sss = &engine->sss;
|
||
|
size_t nstate = HTS_ModelSet_get_nstate(&engine->ms);
|
||
|
double rate = engine->condition.fperiod * 1.0e+07 / engine->condition.sampling_frequency;
|
||
|
|
||
|
for (i = 0, state = 0, frame = 0; i < HTS_Label_get_size(label); i++) {
|
||
|
for (j = 0, duration = 0; j < nstate; j++)
|
||
|
duration += HTS_SStreamSet_get_duration(sss, state++);
|
||
|
fprintf(fp, "%lu %lu %s\n", (unsigned long) (frame * rate), (unsigned long) ((frame + duration) * rate), HTS_Label_get_string(label, i));
|
||
|
frame += duration;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
/* HTS_Engine_save_generated_parameter: save generated parameter */
|
||
|
void HTS_Engine_save_generated_parameter(HTS_Engine * engine, size_t stream_index, FILE * fp)
|
||
|
{
|
||
|
size_t i, j;
|
||
|
float temp;
|
||
|
HTS_GStreamSet *gss = &engine->gss;
|
||
|
|
||
|
for (i = 0; i < HTS_GStreamSet_get_total_frame(gss); i++)
|
||
|
for (j = 0; j < HTS_GStreamSet_get_vector_length(gss, stream_index); j++) {
|
||
|
temp = (float) HTS_GStreamSet_get_parameter(gss, stream_index, i, j);
|
||
|
fwrite(&temp, sizeof(float), 1, fp);
|
||
|
}
|
||
|
}
|
||
|
|
||
|
/* HTS_Engine_save_generated_speech: save generated speech */
|
||
|
void HTS_Engine_save_generated_speech(HTS_Engine * engine, FILE * fp)
|
||
|
{
|
||
|
size_t i;
|
||
|
double x;
|
||
|
short temp;
|
||
|
HTS_GStreamSet *gss = &engine->gss;
|
||
|
|
||
|
for (i = 0; i < HTS_GStreamSet_get_total_nsamples(gss); i++) {
|
||
|
x = HTS_GStreamSet_get_speech(gss, i);
|
||
|
if (x > 32767.0)
|
||
|
temp = 32767;
|
||
|
else if (x < -32768.0)
|
||
|
temp = -32768;
|
||
|
else
|
||
|
temp = (short) x;
|
||
|
fwrite(&temp, sizeof(short), 1, fp);
|
||
|
}
|
||
|
}
|
||
|
|
||
|
/* HTS_Engine_save_riff: save RIFF format file */
|
||
|
void HTS_Engine_save_riff(HTS_Engine * engine, FILE * fp)
|
||
|
{
|
||
|
size_t i;
|
||
|
double x;
|
||
|
short temp;
|
||
|
|
||
|
HTS_GStreamSet *gss = &engine->gss;
|
||
|
char data_01_04[] = { 'R', 'I', 'F', 'F' };
|
||
|
int data_05_08 = HTS_GStreamSet_get_total_nsamples(gss) * sizeof(short) + 36;
|
||
|
char data_09_12[] = { 'W', 'A', 'V', 'E' };
|
||
|
char data_13_16[] = { 'f', 'm', 't', ' ' };
|
||
|
int data_17_20 = 16;
|
||
|
short data_21_22 = 1; /* PCM */
|
||
|
short data_23_24 = 1; /* monoral */
|
||
|
int data_25_28 = engine->condition.sampling_frequency;
|
||
|
int data_29_32 = engine->condition.sampling_frequency * sizeof(short);
|
||
|
short data_33_34 = sizeof(short);
|
||
|
short data_35_36 = (short) (sizeof(short) * 8);
|
||
|
char data_37_40[] = { 'd', 'a', 't', 'a' };
|
||
|
int data_41_44 = HTS_GStreamSet_get_total_nsamples(gss) * sizeof(short);
|
||
|
|
||
|
/* write header */
|
||
|
HTS_fwrite_little_endian(data_01_04, sizeof(char), 4, fp);
|
||
|
HTS_fwrite_little_endian(&data_05_08, sizeof(int), 1, fp);
|
||
|
HTS_fwrite_little_endian(data_09_12, sizeof(char), 4, fp);
|
||
|
HTS_fwrite_little_endian(data_13_16, sizeof(char), 4, fp);
|
||
|
HTS_fwrite_little_endian(&data_17_20, sizeof(int), 1, fp);
|
||
|
HTS_fwrite_little_endian(&data_21_22, sizeof(short), 1, fp);
|
||
|
HTS_fwrite_little_endian(&data_23_24, sizeof(short), 1, fp);
|
||
|
HTS_fwrite_little_endian(&data_25_28, sizeof(int), 1, fp);
|
||
|
HTS_fwrite_little_endian(&data_29_32, sizeof(int), 1, fp);
|
||
|
HTS_fwrite_little_endian(&data_33_34, sizeof(short), 1, fp);
|
||
|
HTS_fwrite_little_endian(&data_35_36, sizeof(short), 1, fp);
|
||
|
HTS_fwrite_little_endian(data_37_40, sizeof(char), 4, fp);
|
||
|
HTS_fwrite_little_endian(&data_41_44, sizeof(int), 1, fp);
|
||
|
/* write data */
|
||
|
for (i = 0; i < HTS_GStreamSet_get_total_nsamples(gss); i++) {
|
||
|
x = HTS_GStreamSet_get_speech(gss, i);
|
||
|
if (x > 32767.0)
|
||
|
temp = 32767;
|
||
|
else if (x < -32768.0)
|
||
|
temp = -32768;
|
||
|
else
|
||
|
temp = (short) x;
|
||
|
HTS_fwrite_little_endian(&temp, sizeof(short), 1, fp);
|
||
|
}
|
||
|
}
|
||
|
|
||
|
/* HTS_Engine_refresh: free model per one time synthesis */
|
||
|
void HTS_Engine_refresh(HTS_Engine * engine)
|
||
|
{
|
||
|
/* free generated parameter stream set */
|
||
|
HTS_GStreamSet_clear(&engine->gss);
|
||
|
/* free parameter stream set */
|
||
|
HTS_PStreamSet_clear(&engine->pss);
|
||
|
/* free state stream set */
|
||
|
HTS_SStreamSet_clear(&engine->sss);
|
||
|
/* free label list */
|
||
|
HTS_Label_clear(&engine->label);
|
||
|
/* stop flag */
|
||
|
engine->condition.stop = FALSE;
|
||
|
}
|
||
|
|
||
|
/* HTS_Engine_clear: free engine */
|
||
|
void HTS_Engine_clear(HTS_Engine * engine)
|
||
|
{
|
||
|
size_t i;
|
||
|
|
||
|
if (engine->condition.msd_threshold != NULL)
|
||
|
HTS_free(engine->condition.msd_threshold);
|
||
|
if (engine->condition.duration_iw != NULL)
|
||
|
HTS_free(engine->condition.duration_iw);
|
||
|
if (engine->condition.gv_weight != NULL)
|
||
|
HTS_free(engine->condition.gv_weight);
|
||
|
if (engine->condition.parameter_iw != NULL) {
|
||
|
for (i = 0; i < HTS_ModelSet_get_nvoices(&engine->ms); i++)
|
||
|
HTS_free(engine->condition.parameter_iw[i]);
|
||
|
HTS_free(engine->condition.parameter_iw);
|
||
|
}
|
||
|
if (engine->condition.gv_iw != NULL) {
|
||
|
for (i = 0; i < HTS_ModelSet_get_nvoices(&engine->ms); i++)
|
||
|
HTS_free(engine->condition.gv_iw[i]);
|
||
|
HTS_free(engine->condition.gv_iw);
|
||
|
}
|
||
|
|
||
|
HTS_ModelSet_clear(&engine->ms);
|
||
|
HTS_Audio_clear(&engine->audio);
|
||
|
HTS_Engine_initialize(engine);
|
||
|
}
|
||
|
|
||
|
HTS_ENGINE_C_END;
|
||
|
|
||
|
#endif /* !HTS_ENGINE_C */
|